Back to home page

LXR

 
 

    


0001 /*
0002  *
0003  * Copyright IBM Corporation, 2012
0004  * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
0005  *
0006  * This program is free software; you can redistribute it and/or modify it
0007  * under the terms of version 2.1 of the GNU Lesser General Public License
0008  * as published by the Free Software Foundation.
0009  *
0010  * This program is distributed in the hope that it would be useful, but
0011  * WITHOUT ANY WARRANTY; without even the implied warranty of
0012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
0013  *
0014  */
0015 
0016 #include <linux/cgroup.h>
0017 #include <linux/page_counter.h>
0018 #include <linux/slab.h>
0019 #include <linux/hugetlb.h>
0020 #include <linux/hugetlb_cgroup.h>
0021 
/*
 * Per-cgroup state of the hugetlb controller: the generic css plus one
 * page_counter per supported hstate (huge page size).
 */
struct hugetlb_cgroup {
    struct cgroup_subsys_state css;
    /*
     * the counter to account for hugepages from hugetlb.
     */
    struct page_counter hugepage[HUGE_MAX_HSTATE];
};
0029 
/*
 * A cftype->private value packs the hstate index (upper 16 bits) and a
 * resource attribute such as RES_LIMIT (lower 16 bits).
 */
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)    (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)   ((val) & 0xffff)

/* The root of the hugetlb cgroup hierarchy; set once in css_alloc. */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
0035 
0036 static inline
0037 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
0038 {
0039     return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
0040 }
0041 
/* Resolve the hugetlb cgroup @task currently belongs to (may be NULL). */
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
    return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}
0047 
0048 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
0049 {
0050     return (h_cg == root_h_cgroup);
0051 }
0052 
/* Parent hugetlb cgroup of @h_cg, or NULL for the root. */
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
    return hugetlb_cgroup_from_css(h_cg->css.parent);
}
0058 
0059 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
0060 {
0061     int idx;
0062 
0063     for (idx = 0; idx < hugetlb_max_hstate; idx++) {
0064         if (page_counter_read(&h_cg->hugepage[idx]))
0065             return true;
0066     }
0067     return false;
0068 }
0069 
/*
 * Set up the page counters of a freshly allocated hugetlb cgroup.
 * Each per-hstate counter is parented to the matching counter of
 * @parent_h_cgroup (no parent for the root cgroup) and starts with an
 * effectively unlimited cap: PAGE_COUNTER_MAX rounded down to a whole
 * number of huge pages of that size.
 */
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
                struct hugetlb_cgroup *parent_h_cgroup)
{
    int idx;

    for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
        struct page_counter *counter = &h_cgroup->hugepage[idx];
        struct page_counter *parent = NULL;
        unsigned long limit;
        int ret;

        if (parent_h_cgroup)
            parent = &parent_h_cgroup->hugepage[idx];
        page_counter_init(counter, parent);

        /* Keep the limit a multiple of this hstate's huge page size. */
        limit = round_down(PAGE_COUNTER_MAX,
                   1 << huge_page_order(&hstates[idx]));
        /* Not expected to fail on a fresh, zero-usage counter. */
        ret = page_counter_limit(counter, limit);
        VM_BUG_ON(ret);
    }
}
0091 
/*
 * Allocate and initialize the state for a new hugetlb cgroup.  The
 * first cgroup created (the one with no parent) becomes the root and
 * is remembered in root_h_cgroup.  Returns ERR_PTR(-ENOMEM) if the
 * allocation fails.
 */
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
    struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
    struct hugetlb_cgroup *h_cgroup;

    h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
    if (!h_cgroup)
        return ERR_PTR(-ENOMEM);

    if (!parent_h_cgroup)
        root_h_cgroup = h_cgroup;

    hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
    return &h_cgroup->css;
}
0108 
/* Release the hugetlb cgroup allocated by hugetlb_cgroup_css_alloc(). */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
    kfree(hugetlb_cgroup_from_css(css));
}
0116 
0117 
/*
 * Reparent the charge of one active huge page from @h_cg to its parent
 * cgroup (used when @h_cg goes offline).
 *
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved from the
 * active list or uncharged from the cgroup, so there is no need to get
 * a page reference and test for page active here.  This function
 * cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                       struct page *page)
{
    unsigned int nr_pages;
    struct page_counter *counter;
    struct hugetlb_cgroup *page_hcg;
    struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

    page_hcg = hugetlb_cgroup_from_page(page);
    /*
     * We can have pages on the active list without any cgroup,
     * i.e. hugepages with less than 3 pages.  We can safely
     * ignore those pages, as well as pages owned by another cgroup.
     */
    if (!page_hcg || page_hcg != h_cg)
        goto out;

    nr_pages = 1 << compound_order(page);
    if (!parent) {
        /* @h_cg is a first-level cgroup: move the charge to root. */
        parent = root_h_cgroup;
        /* root has no limit */
        page_counter_charge(&parent->hugepage[idx], nr_pages);
    }
    counter = &h_cg->hugepage[idx];
    /* Take the pages off the local counter */
    page_counter_cancel(counter, nr_pages);

    /* Record the new owner on the page itself. */
    set_hugetlb_cgroup(page, parent);
out:
    return;
}
0156 
0157 /*
0158  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
0159  * the parent cgroup.
0160  */
0161 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
0162 {
0163     struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
0164     struct hstate *h;
0165     struct page *page;
0166     int idx = 0;
0167 
0168     do {
0169         for_each_hstate(h) {
0170             spin_lock(&hugetlb_lock);
0171             list_for_each_entry(page, &h->hugepage_activelist, lru)
0172                 hugetlb_cgroup_move_parent(idx, h_cg, page);
0173 
0174             spin_unlock(&hugetlb_lock);
0175             idx++;
0176         }
0177         cond_resched();
0178     } while (hugetlb_cgroup_have_usage(h_cg));
0179 }
0180 
/*
 * Charge @nr_pages huge pages of hstate @idx to the current task's
 * hugetlb cgroup.  On return, *ptr is set to that cgroup (NULL when the
 * controller is disabled or the page size is below
 * HUGETLB_CGROUP_MIN_ORDER).  Returns 0 on success or -ENOMEM when the
 * charge would exceed the cgroup's limit.  A successful charge is
 * completed later with hugetlb_cgroup_commit_charge().
 */
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                 struct hugetlb_cgroup **ptr)
{
    int ret = 0;
    struct page_counter *counter;
    struct hugetlb_cgroup *h_cg = NULL;

    if (hugetlb_cgroup_disabled())
        goto done;
    /*
     * We don't charge any cgroup if the compound page has less
     * than 3 pages.
     */
    if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
        goto done;
again:
    rcu_read_lock();
    h_cg = hugetlb_cgroup_from_task(current);
    /*
     * Retry the lookup if the cgroup went offline between
     * hugetlb_cgroup_from_task() and the tryget.
     */
    if (!css_tryget_online(&h_cg->css)) {
        rcu_read_unlock();
        goto again;
    }
    rcu_read_unlock();

    if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
        ret = -ENOMEM;
    css_put(&h_cg->css);
done:
    *ptr = h_cg;
    return ret;
}
0212 
/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                  struct hugetlb_cgroup *h_cg,
                  struct page *page)
{
    if (hugetlb_cgroup_disabled())
        return;
    if (!h_cg)
        return;

    /* Record the owning cgroup on the page itself. */
    set_hugetlb_cgroup(page, h_cg);
}
0224 
/*
 * Should be called with hugetlb_lock held (asserted below).
 * Uncharge @nr_pages from the cgroup recorded on @page and clear the
 * page's cgroup pointer.  Pages that never got a cgroup attached
 * (h_cg == NULL, e.g. because the charge path skipped pages below
 * HUGETLB_CGROUP_MIN_ORDER) are silently ignored.
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
                  struct page *page)
{
    struct hugetlb_cgroup *h_cg;

    if (hugetlb_cgroup_disabled())
        return;
    lockdep_assert_held(&hugetlb_lock);
    h_cg = hugetlb_cgroup_from_page(page);
    if (unlikely(!h_cg))
        return;
    set_hugetlb_cgroup(page, NULL);
    page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
    return;
}
0243 
0244 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
0245                     struct hugetlb_cgroup *h_cg)
0246 {
0247     if (hugetlb_cgroup_disabled() || !h_cg)
0248         return;
0249 
0250     if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
0251         return;
0252 
0253     page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
0254     return;
0255 }
0256 
/*
 * Resource attributes encoded into the low 16 bits of cftype->private
 * by MEMFILE_PRIVATE() and decoded with MEMFILE_ATTR().
 */
enum {
    RES_USAGE,      /* current page_counter reading */
    RES_LIMIT,      /* counter->limit */
    RES_MAX_USAGE,  /* counter->watermark (high-water mark) */
    RES_FAILCNT,    /* counter->failcnt */
};
0263 
/*
 * read_u64 handler for the hugetlb.<size>.* control files.  The hstate
 * index and resource attribute are unpacked from cft->private.  Page
 * counts are scaled to bytes with PAGE_SIZE; failcnt is returned as a
 * raw count.
 */
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                   struct cftype *cft)
{
    struct page_counter *counter;
    struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

    counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

    switch (MEMFILE_ATTR(cft->private)) {
    case RES_USAGE:
        return (u64)page_counter_read(counter) * PAGE_SIZE;
    case RES_LIMIT:
        return (u64)counter->limit * PAGE_SIZE;
    case RES_MAX_USAGE:
        return (u64)counter->watermark * PAGE_SIZE;
    case RES_FAILCNT:
        return counter->failcnt;
    default:
        /* Only the four attributes above are ever wired up. */
        BUG();
    }
}
0285 
/* Serializes concurrent limit updates across hugetlb cgroups. */
static DEFINE_MUTEX(hugetlb_limit_mutex);

/*
 * write handler for hugetlb.<size>.limit_in_bytes.  The root cgroup's
 * limit cannot be changed.  "-1" is accepted as "no limit" (see
 * page_counter_memparse); the parsed value is rounded down to a whole
 * number of huge pages of this hstate before being applied.  Returns
 * @nbytes on success or a negative errno.
 */
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                    char *buf, size_t nbytes, loff_t off)
{
    int ret, idx;
    unsigned long nr_pages;
    struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

    if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
        return -EINVAL;

    buf = strstrip(buf);
    ret = page_counter_memparse(buf, "-1", &nr_pages);
    if (ret)
        return ret;

    idx = MEMFILE_IDX(of_cft(of)->private);
    nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));

    switch (MEMFILE_ATTR(of_cft(of)->private)) {
    case RES_LIMIT:
        mutex_lock(&hugetlb_limit_mutex);
        ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
        mutex_unlock(&hugetlb_limit_mutex);
        break;
    default:
        ret = -EINVAL;
        break;
    }
    return ret ?: nbytes;
}
0318 
0319 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
0320                     char *buf, size_t nbytes, loff_t off)
0321 {
0322     int ret = 0;
0323     struct page_counter *counter;
0324     struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
0325 
0326     counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
0327 
0328     switch (MEMFILE_ATTR(of_cft(of)->private)) {
0329     case RES_MAX_USAGE:
0330         page_counter_reset_watermark(counter);
0331         break;
0332     case RES_FAILCNT:
0333         counter->failcnt = 0;
0334         break;
0335     default:
0336         ret = -EINVAL;
0337         break;
0338     }
0339     return ret ?: nbytes;
0340 }
0341 
/*
 * Format a huge page size in bytes as a human-readable string
 * ("1GB", "2MB", "64KB") into @buf; returns @buf for inline use.
 */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
    const char *unit;
    unsigned int shift;

    if (hsize >= (1UL << 30)) {
        shift = 30;
        unit = "GB";
    } else if (hsize >= (1UL << 20)) {
        shift = 20;
        unit = "MB";
    } else {
        shift = 10;
        unit = "KB";
    }
    snprintf(buf, size, "%lu%s", hsize >> shift, unit);
    return buf;
}
0352 
/*
 * Register the four per-size legacy control files (limit, usage,
 * max_usage, failcnt) for hstate @idx, named "hugetlb.<size>.*".
 * NOTE(review): assumes h->cgroup_files has room for at least 5
 * entries (4 files + NULL terminator) — declared in struct hstate,
 * not visible here; confirm against hugetlb.h.
 */
static void __init __hugetlb_cgroup_file_init(int idx)
{
    char buf[32];
    struct cftype *cft;
    struct hstate *h = &hstates[idx];

    /* format the size */
    mem_fmt(buf, 32, huge_page_size(h));

    /* Add the limit file */
    cft = &h->cgroup_files[0];
    snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
    cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
    cft->read_u64 = hugetlb_cgroup_read_u64;
    cft->write = hugetlb_cgroup_write;

    /* Add the usage file */
    cft = &h->cgroup_files[1];
    snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
    cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
    cft->read_u64 = hugetlb_cgroup_read_u64;

    /* Add the MAX usage file */
    cft = &h->cgroup_files[2];
    snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
    cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
    cft->write = hugetlb_cgroup_reset;
    cft->read_u64 = hugetlb_cgroup_read_u64;

    /* Add the failcnt file */
    cft = &h->cgroup_files[3];
    snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
    cft->private  = MEMFILE_PRIVATE(idx, RES_FAILCNT);
    cft->write = hugetlb_cgroup_reset;
    cft->read_u64 = hugetlb_cgroup_read_u64;

    /* NULL terminate the last cft */
    cft = &h->cgroup_files[4];
    memset(cft, 0, sizeof(*cft));

    WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
                      h->cgroup_files));
}
0396 
/*
 * Boot-time hook: create the cgroup control files for every registered
 * hstate large enough to be accounted.
 */
void __init hugetlb_cgroup_file_init(void)
{
    struct hstate *h;

    for_each_hstate(h) {
        /*
         * Add cgroup control files only if the huge page consists
         * of more than two normal pages. This is because we use
         * page[2].private for storing cgroup details.
         */
        if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
            __hugetlb_cgroup_file_init(hstate_index(h));
    }
}
0411 
/*
 * Transfer cgroup ownership from @oldhpage to @newhpage during huge
 * page migration, and move the new page onto the hstate's active list.
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages.
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
    struct hugetlb_cgroup *h_cg;
    struct hstate *h = page_hstate(oldhpage);

    if (hugetlb_cgroup_disabled())
        return;

    VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
    spin_lock(&hugetlb_lock);
    h_cg = hugetlb_cgroup_from_page(oldhpage);
    set_hugetlb_cgroup(oldhpage, NULL);

    /* move the h_cg details to new cgroup */
    set_hugetlb_cgroup(newhpage, h_cg);
    list_move(&newhpage->lru, &h->hugepage_activelist);
    spin_unlock(&hugetlb_lock);
    return;
}
0435 
/*
 * Controller registration: the cgroup core invokes these callbacks on
 * cgroup creation (css_alloc), rmdir (css_offline) and final release
 * (css_free).
 */
struct cgroup_subsys hugetlb_cgrp_subsys = {
    .css_alloc  = hugetlb_cgroup_css_alloc,
    .css_offline    = hugetlb_cgroup_css_offline,
    .css_free   = hugetlb_cgroup_css_free,
};