// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA resource limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop processes from consuming
 * additional RDMA resources after a certain limit is reached.
 *
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
#include <linux/parser.h>
#include <linux/cgroup_rdma.h>

#define RDMACG_MAX_STR "max"

/*
 * Protects the list of resource pools maintained on a per-cgroup basis
 * and the rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
static LIST_HEAD(rdmacg_devices);

enum rdmacg_file_type {
    RDMACG_RESOURCE_TYPE_MAX,
    RDMACG_RESOURCE_TYPE_STAT,
};

/*
 * Resource table as seen by the user. Entries must be added here
 * whenever new resources are defined at the IB verbs/core layer.
 */
static char const *rdmacg_resource_names[] = {
    [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
    [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
};

/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
    int max;
    int usage;
};

/*
 * Resource pool object which represents per-cgroup, per-device resources.
 * There can be multiple instances of this object per cgroup, so it cannot
 * be embedded in the rdma_cgroup structure; it is maintained as a list.
 */
struct rdmacg_resource_pool {
    struct rdmacg_device    *device;
    struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];

    struct list_head    cg_node;
    struct list_head    dev_node;

    /* sum of usage counts across all resources of this pool */
    u64         usage_sum;
    /* number of resources whose limit is set to max (unlimited) */
    int         num_max_cnt;
};

static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
{
    return container_of(css, struct rdma_cgroup, css);
}

static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
{
    return css_rdmacg(cg->css.parent);
}

static inline struct rdma_cgroup *get_current_rdmacg(void)
{
    return css_rdmacg(task_get_css(current, rdma_cgrp_id));
}

static void set_resource_limit(struct rdmacg_resource_pool *rpool,
                   int index, int new_max)
{
    if (new_max == S32_MAX) {
        if (rpool->resources[index].max != S32_MAX)
            rpool->num_max_cnt++;
    } else {
        if (rpool->resources[index].max == S32_MAX)
            rpool->num_max_cnt--;
    }
    rpool->resources[index].max = new_max;
}

static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
{
    int i;

    for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
        set_resource_limit(rpool, i, S32_MAX);
}

static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
{
    lockdep_assert_held(&rdmacg_mutex);

    list_del(&rpool->cg_node);
    list_del(&rpool->dev_node);
    kfree(rpool);
}

static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
             struct rdmacg_device *device)
{
    struct rdmacg_resource_pool *pool;

    lockdep_assert_held(&rdmacg_mutex);

    list_for_each_entry(pool, &cg->rpools, cg_node)
        if (pool->device == device)
            return pool;

    return NULL;
}

static struct rdmacg_resource_pool *
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
{
    struct rdmacg_resource_pool *rpool;

    rpool = find_cg_rpool_locked(cg, device);
    if (rpool)
        return rpool;

    rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
    if (!rpool)
        return ERR_PTR(-ENOMEM);

    rpool->device = device;
    set_all_resource_max_limit(rpool);

    INIT_LIST_HEAD(&rpool->cg_node);
    INIT_LIST_HEAD(&rpool->dev_node);
    list_add_tail(&rpool->cg_node, &cg->rpools);
    list_add_tail(&rpool->dev_node, &device->rpools);
    return rpool;
}

/**
 * uncharge_cg_locked - uncharge resource for rdma cgroup
 * @cg: pointer to the rdma cgroup to uncharge
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cg (resource pool)
 *
 * It also frees the resource pool, which was created as part of the
 * charging operation, once no resources remain charged against it.
 */
static void
uncharge_cg_locked(struct rdma_cgroup *cg,
           struct rdmacg_device *device,
           enum rdmacg_resource_type index)
{
    struct rdmacg_resource_pool *rpool;

    rpool = find_cg_rpool_locked(cg, device);

    /*
     * rpool cannot be NULL at this stage. If it is, there is a bug in
     * the IB stack or the rdma controller; warn and let the kernel
     * continue instead of crashing the system.
     */
    if (unlikely(!rpool)) {
        pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
        return;
    }

    rpool->resources[index].usage--;

    /*
     * A negative count (or overflow) is invalid;
     * it indicates a bug in the rdma controller.
     */
    WARN_ON_ONCE(rpool->resources[index].usage < 0);
    rpool->usage_sum--;
    if (rpool->usage_sum == 0 &&
        rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
        /*
         * No user of the rpool and all entries are set to max, so
         * it is safe to delete this rpool.
         */
        free_cg_rpool_locked(rpool);
    }
}

/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to start uncharging from
 * @device: pointer to rdmacg device
 * @stop_cg: cgroup at which to stop uncharging while walking up the
 *           hierarchy; stop_cg itself is not uncharged
 * @index: index of the resource to uncharge in cg in given resource pool
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
                     struct rdmacg_device *device,
                     struct rdma_cgroup *stop_cg,
                     enum rdmacg_resource_type index)
{
    struct rdma_cgroup *p;

    mutex_lock(&rdmacg_mutex);

    for (p = cg; p != stop_cg; p = parent_rdmacg(p))
        uncharge_cg_locked(p, device, index);

    mutex_unlock(&rdmacg_mutex);

    css_put(&cg->css);
}

/**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge, along with all its parents in the hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
void rdmacg_uncharge(struct rdma_cgroup *cg,
             struct rdmacg_device *device,
             enum rdmacg_resource_type index)
{
    if (index >= RDMACG_RESOURCE_MAX)
        return;

    rdmacg_uncharge_hierarchy(cg, device, NULL, index);
}
EXPORT_SYMBOL(rdmacg_uncharge);

/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: pointer to rdma cgroup which will own this resource
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function charges the resource hierarchically. It fails if the
 * charge would cause the new usage to exceed the limit at any level of
 * the hierarchy.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 * On success, *@rdmacg points to the rdma cgroup that owns the charge.
 *
 * The charger needs to account resources on two criteria:
 * (a) per cgroup and (b) per device resource usage.
 * Per-cgroup accounting ensures that the tasks of a cgroup do not exceed
 * the configured limits. Per-device accounting provides granular
 * configuration in multi-device setups. On the first charge, a resource
 * pool is allocated for every cgroup encountered while walking up the
 * hierarchy; subsequent charge/uncharge operations reuse those pools and
 * are therefore much faster.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
              struct rdmacg_device *device,
              enum rdmacg_resource_type index)
{
    struct rdma_cgroup *cg, *p;
    struct rdmacg_resource_pool *rpool;
    s64 new;
    int ret = 0;

    if (index >= RDMACG_RESOURCE_MAX)
        return -EINVAL;

    /*
     * Hold on to the css, as the cgroup can be removed but resource
     * accounting happens on the css.
     */
    cg = get_current_rdmacg();

    mutex_lock(&rdmacg_mutex);
    for (p = cg; p; p = parent_rdmacg(p)) {
        rpool = get_cg_rpool_locked(p, device);
        if (IS_ERR(rpool)) {
            ret = PTR_ERR(rpool);
            goto err;
        } else {
            new = rpool->resources[index].usage + 1;
            if (new > rpool->resources[index].max) {
                ret = -EAGAIN;
                goto err;
            } else {
                rpool->resources[index].usage = new;
                rpool->usage_sum++;
            }
        }
    }
    mutex_unlock(&rdmacg_mutex);

    *rdmacg = cg;
    return 0;

err:
    mutex_unlock(&rdmacg_mutex);
    rdmacg_uncharge_hierarchy(cg, device, p, index);
    return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);
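
/*
 * Illustrative sketch (not part of this file): how a verbs/core-layer caller
 * might pair rdmacg_try_charge() with rdmacg_uncharge() around object
 * creation. The function name and the create_object() callback below are
 * hypothetical placeholders, not kernel APIs.
 */
static int example_create_hca_object(struct rdmacg_device *device,
                     int (*create_object)(void))
{
    struct rdma_cgroup *cg;
    int ret;

    /* Charge one hca_object against the calling task's cgroup hierarchy. */
    ret = rdmacg_try_charge(&cg, device, RDMACG_RESOURCE_HCA_OBJECT);
    if (ret)
        return ret;

    ret = create_object();
    if (ret)
        /* Creation failed; return the charge to the same cgroup. */
        rdmacg_uncharge(cg, device, RDMACG_RESOURCE_HCA_OBJECT);

    return ret;
}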

/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If the IB stack wants a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register the device with the rdma
 * cgroup controller before any user space application can start using the
 * RDMA resources.
 */
void rdmacg_register_device(struct rdmacg_device *device)
{
    INIT_LIST_HEAD(&device->dev_node);
    INIT_LIST_HEAD(&device->rpools);

    mutex_lock(&rdmacg_mutex);
    list_add_tail(&device->dev_node, &rdmacg_devices);
    mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_register_device);

/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with rdma
 *          controller using rdmacg_register_device().
 *
 * The IB stack must invoke this API after all resources of the IB device
 * have been destroyed, and it must ensure that no new resources are created
 * once this API has been invoked.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
    struct rdmacg_resource_pool *rpool, *tmp;

    /*
     * Synchronize with any concurrent limit setting or usage query
     * happening via the cgroup filesystem.
     */
    mutex_lock(&rdmacg_mutex);
    list_del_init(&device->dev_node);

    /*
     * Now that this device is off the cgroup list, it's safe to free
     * all the rpool resources.
     */
    list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
        free_cg_rpool_locked(rpool);

    mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);
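
/*
 * Illustrative sketch (not part of this file): the registration lifecycle
 * described above, as a driver might follow it. The static device instance,
 * its name string and the function name are hypothetical placeholders.
 */
static struct rdmacg_device example_rdmacg_dev;

static void example_device_lifecycle(void)
{
    /*
     * Make the device known to the rdma controller before user space can
     * allocate RDMA resources on it; this puts it on rdmacg_devices.
     */
    example_rdmacg_dev.name = "example_hca0";
    rdmacg_register_device(&example_rdmacg_dev);

    /* ... device in use: rdmacg_try_charge()/rdmacg_uncharge() calls ... */

    /*
     * Unregister only after every resource of the device has been
     * destroyed; this also frees any remaining per-cgroup resource pools.
     */
    rdmacg_unregister_device(&example_rdmacg_dev);
}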

static int parse_resource(char *c, int *intval)
{
    substring_t argstr;
    char *name, *value = c;
    size_t len;
    int ret, i;

    name = strsep(&value, "=");
    if (!name || !value)
        return -EINVAL;

    i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
    if (i < 0)
        return i;

    len = strlen(value);

    argstr.from = value;
    argstr.to = value + len;

    ret = match_int(&argstr, intval);
    if (ret >= 0) {
        if (*intval < 0)
            return -EINVAL;
        return i;
    }
    if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
        *intval = S32_MAX;
        return i;
    }
    return -EINVAL;
}

static int rdmacg_parse_limits(char *options,
                   int *new_limits, unsigned long *enables)
{
    char *c;
    int err = -EINVAL;

    /* parse resource options */
    while ((c = strsep(&options, " ")) != NULL) {
        int index, intval;

        index = parse_resource(c, &intval);
        if (index < 0)
            goto err;

        new_limits[index] = intval;
        *enables |= BIT(index);
    }
    return 0;

err:
    return err;
}

static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
    struct rdmacg_device *device;

    lockdep_assert_held(&rdmacg_mutex);

    list_for_each_entry(device, &rdmacg_devices, dev_node)
        if (!strcmp(name, device->name))
            return device;

    return NULL;
}

static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
                       char *buf, size_t nbytes, loff_t off)
{
    struct rdma_cgroup *cg = css_rdmacg(of_css(of));
    const char *dev_name;
    struct rdmacg_resource_pool *rpool;
    struct rdmacg_device *device;
    char *options = strstrip(buf);
    int *new_limits;
    unsigned long enables = 0;
    int i = 0, ret = 0;

    /* extract the device name first */
    dev_name = strsep(&options, " ");
    if (!dev_name) {
        ret = -EINVAL;
        goto err;
    }

    new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
    if (!new_limits) {
        ret = -ENOMEM;
        goto err;
    }

    ret = rdmacg_parse_limits(options, new_limits, &enables);
    if (ret)
        goto parse_err;

    /* acquire lock to synchronize with hot-plug devices */
    mutex_lock(&rdmacg_mutex);

    device = rdmacg_get_device_locked(dev_name);
    if (!device) {
        ret = -ENODEV;
        goto dev_err;
    }

    rpool = get_cg_rpool_locked(cg, device);
    if (IS_ERR(rpool)) {
        ret = PTR_ERR(rpool);
        goto dev_err;
    }

    /* now set the new limits of the rpool */
    for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
        set_resource_limit(rpool, i, new_limits[i]);

    if (rpool->usage_sum == 0 &&
        rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
        /*
         * No user of the rpool and all entries are set to max, so
         * it is safe to delete this rpool.
         */
        free_cg_rpool_locked(rpool);
    }

dev_err:
    mutex_unlock(&rdmacg_mutex);

parse_err:
    kfree(new_limits);

err:
    return ret ?: nbytes;
}

static void print_rpool_values(struct seq_file *sf,
                   struct rdmacg_resource_pool *rpool)
{
    enum rdmacg_file_type sf_type;
    int i;
    u32 value;

    sf_type = seq_cft(sf)->private;

    for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
        seq_puts(sf, rdmacg_resource_names[i]);
        seq_putc(sf, '=');
        if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
            if (rpool)
                value = rpool->resources[i].max;
            else
                value = S32_MAX;
        } else {
            if (rpool)
                value = rpool->resources[i].usage;
            else
                value = 0;
        }

        if (value == S32_MAX)
            seq_puts(sf, RDMACG_MAX_STR);
        else
            seq_printf(sf, "%d", value);
        seq_putc(sf, ' ');
    }
}

static int rdmacg_resource_read(struct seq_file *sf, void *v)
{
    struct rdmacg_device *device;
    struct rdmacg_resource_pool *rpool;
    struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));

    mutex_lock(&rdmacg_mutex);

    list_for_each_entry(device, &rdmacg_devices, dev_node) {
        seq_printf(sf, "%s ", device->name);

        rpool = find_cg_rpool_locked(cg, device);
        print_rpool_values(sf, rpool);

        seq_putc(sf, '\n');
    }

    mutex_unlock(&rdmacg_mutex);
    return 0;
}

static struct cftype rdmacg_files[] = {
    {
        .name = "max",
        .write = rdmacg_resource_set_max,
        .seq_show = rdmacg_resource_read,
        .private = RDMACG_RESOURCE_TYPE_MAX,
        .flags = CFTYPE_NOT_ON_ROOT,
    },
    {
        .name = "current",
        .seq_show = rdmacg_resource_read,
        .private = RDMACG_RESOURCE_TYPE_STAT,
        .flags = CFTYPE_NOT_ON_ROOT,
    },
    { } /* terminate */
};
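
/*
 * Illustrative usage note (not part of this file): the two cftype entries
 * above are exposed per cgroup as "rdma.max" and "rdma.current". A write to
 * rdma.max takes a device name followed by resource=value pairs, matching
 * rdmacg_resource_set_max() and parse_resource(); a read prints one line per
 * registered device, matching rdmacg_resource_read(). The device name
 * "example_hca0" below is a hypothetical placeholder:
 *
 *   echo "example_hca0 hca_handle=2 hca_object=2000" > rdma.max
 *   cat rdma.max
 *     example_hca0 hca_handle=2 hca_object=2000
 *   cat rdma.current
 *     example_hca0 hca_handle=1 hca_object=23
 */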

static struct cgroup_subsys_state *
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
    struct rdma_cgroup *cg;

    cg = kzalloc(sizeof(*cg), GFP_KERNEL);
    if (!cg)
        return ERR_PTR(-ENOMEM);

    INIT_LIST_HEAD(&cg->rpools);
    return &cg->css;
}

static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
    struct rdma_cgroup *cg = css_rdmacg(css);

    kfree(cg);
}

/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and is responsible
 * for shutting down all rdma accounting associated with @css. As part of
 * that, it sets every resource pool limit to max so that, once the
 * remaining resources are uncharged, the associated resource pools can be
 * freed as well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
    struct rdma_cgroup *cg = css_rdmacg(css);
    struct rdmacg_resource_pool *rpool;

    mutex_lock(&rdmacg_mutex);

    list_for_each_entry(rpool, &cg->rpools, cg_node)
        set_all_resource_max_limit(rpool);

    mutex_unlock(&rdmacg_mutex);
}

struct cgroup_subsys rdma_cgrp_subsys = {
    .css_alloc  = rdmacg_css_alloc,
    .css_free   = rdmacg_css_free,
    .css_offline    = rdmacg_css_offline,
    .legacy_cftypes = rdmacg_files,
    .dfl_cftypes    = rdmacg_files,
};