/*
 * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;
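
/*
 * Worked packing example: on a 64-bit machine with 4 paths, each region
 * table entry needs 2 bits (1 << 2 >= 4), so one region_table_slot_t
 * packs 64 / 2 = 32 consecutive region entries.
 */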

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
    struct dm_dev *dmdev;
    sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
    struct dm_target *ti;

    unsigned nr_paths;      /* Number of paths in path_list. */

    unsigned region_size;       /* Region size in 512-byte sectors */
    unsigned long nr_regions;   /* Number of regions making up the device */
    signed char region_size_bits;   /* log2 of region_size or -1 */

    unsigned char region_table_entry_bits;  /* Number of bits in one region table entry */
    unsigned char region_entries_per_slot;  /* Number of entries in one region table slot */
    signed char region_entries_per_slot_bits;   /* log2 of region_entries_per_slot or -1 */

    region_table_slot_t *region_table;  /* Region table */

    /*
     * Array of dm devices to switch between.
     */
    struct switch_path path_list[];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
                       unsigned region_size)
{
    struct switch_ctx *sctx;

    sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
    if (!sctx)
        return NULL;

    sctx->ti = ti;
    sctx->region_size = region_size;

    ti->private = sctx;

    return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
    struct switch_ctx *sctx = ti->private;
    sector_t nr_regions = ti->len;
    sector_t nr_slots;

    if (!(sctx->region_size & (sctx->region_size - 1)))
        sctx->region_size_bits = __ffs(sctx->region_size);
    else
        sctx->region_size_bits = -1;

    sctx->region_table_entry_bits = 1;
    while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
           (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
        sctx->region_table_entry_bits++;

    sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
    if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
        sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
    else
        sctx->region_entries_per_slot_bits = -1;

    if (sector_div(nr_regions, sctx->region_size))
        nr_regions++;

    if (nr_regions >= ULONG_MAX) {
        ti->error = "Region table too large";
        return -EINVAL;
    }
    sctx->nr_regions = nr_regions;

    nr_slots = nr_regions;
    if (sector_div(nr_slots, sctx->region_entries_per_slot))
        nr_slots++;

    if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
        ti->error = "Region table too large";
        return -EINVAL;
    }

    sctx->region_table = vmalloc(array_size(nr_slots,
                        sizeof(region_table_slot_t)));
    if (!sctx->region_table) {
        ti->error = "Cannot allocate region table";
        return -ENOMEM;
    }

    return 0;
}
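
/*
 * Sizing sketch (illustrative figures): a 1 TiB target (2^31 sectors)
 * with region_size 256 sectors (128 KiB) yields 2^23 regions; with
 * 4 paths (2-bit entries, 32 per 64-bit slot) that is 2^18 slots,
 * i.e. a 2 MiB vmalloc'ed region table.
 */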

static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
                unsigned long *region_index, unsigned *bit)
{
    if (sctx->region_entries_per_slot_bits >= 0) {
        *region_index = region_nr >> sctx->region_entries_per_slot_bits;
        *bit = region_nr & (sctx->region_entries_per_slot - 1);
    } else {
        *region_index = region_nr / sctx->region_entries_per_slot;
        *bit = region_nr % sctx->region_entries_per_slot;
    }

    *bit *= sctx->region_table_entry_bits;
}
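
/*
 * Worked example, assuming 2-bit entries and 32 entries per slot:
 * region_nr 100 gives *region_index = 100 >> 5 = 3 and
 * *bit = (100 & 31) * 2 = 8, i.e. the entry occupies bits 8-9 of slot 3.
 */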

static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
    unsigned long region_index;
    unsigned bit;

    switch_get_position(sctx, region_nr, &region_index, &bit);

    return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
        ((1 << sctx->region_table_entry_bits) - 1);
}

/*
 * Find which path to use at given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
    unsigned path_nr;
    sector_t p;

    p = offset;
    if (sctx->region_size_bits >= 0)
        p >>= sctx->region_size_bits;
    else
        sector_div(p, sctx->region_size);

    path_nr = switch_region_table_read(sctx, p);

    /* This can only happen if the processor uses non-atomic stores. */
    if (unlikely(path_nr >= sctx->nr_paths))
        path_nr = 0;

    return path_nr;
}

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
                      unsigned value)
{
    unsigned long region_index;
    unsigned bit;
    region_table_slot_t pte;

    switch_get_position(sctx, region_nr, &region_index, &bit);

    pte = sctx->region_table[region_index];
    pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
    pte |= (region_table_slot_t)value << bit;
    sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
    unsigned path_nr = 0;
    unsigned long region_nr;

    for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
        switch_region_table_write(sctx, region_nr, path_nr);
        if (++path_nr >= sctx->nr_paths)
            path_nr = 0;
    }
}
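
/*
 * E.g. with 3 paths the initial table reads 0, 1, 2, 0, 1, 2, ... across
 * the regions, spreading IO evenly until userspace uploads a real mapping
 * via the set_region_mappings message below.
 */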

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
    struct switch_ctx *sctx = ti->private;
    unsigned long long start;
    int r;

    r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
              &sctx->path_list[sctx->nr_paths].dmdev);
    if (r) {
        ti->error = "Device lookup failed";
        return r;
    }

    if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
        ti->error = "Invalid device starting offset";
        dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
        return -EINVAL;
    }

    sctx->path_list[sctx->nr_paths].start = start;

    sctx->nr_paths++;

    return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
    struct switch_ctx *sctx = ti->private;

    while (sctx->nr_paths--)
        dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

    vfree(sctx->region_table);
    kfree(sctx);
}

/*
 * Constructor arguments:
 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *   [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
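/*
 * Example table line (device names and sizes are illustrative):
 *
 *   0 67108864 switch 2 65536 0 /dev/sda 0 /dev/sdb 0
 *
 * maps a 32 GiB target across two paths using 65536-sector (32 MiB)
 * regions and no optional args.
 */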
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
    static const struct dm_arg _args[] = {
        {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
        {1, UINT_MAX, "Invalid region size"},
        {0, 0, "Invalid number of optional args"},
    };

    struct switch_ctx *sctx;
    struct dm_arg_set as;
    unsigned nr_paths, region_size, nr_optional_args;
    int r;

    as.argc = argc;
    as.argv = argv;

    r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
    if (r)
        return -EINVAL;

    r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
    if (r)
        return r;

    r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
    if (r)
        return r;
    /* parse optional arguments here, if we add any */

    if (as.argc != nr_paths * 2) {
        ti->error = "Incorrect number of path arguments";
        return -EINVAL;
    }

    sctx = alloc_switch_ctx(ti, nr_paths, region_size);
    if (!sctx) {
        ti->error = "Cannot allocate redirection context";
        return -ENOMEM;
    }

    r = dm_set_target_max_io_len(ti, region_size);
    if (r)
        goto error;

    while (as.argc) {
        r = parse_path(&as, ti);
        if (r)
            goto error;
    }

    r = alloc_region_table(ti, nr_paths);
    if (r)
        goto error;

    initialise_region_table(sctx);

    /* For UNMAP, sending the request down any path is sufficient */
    ti->num_discard_bios = 1;

    return 0;

error:
    switch_dtr(ti);

    return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
    struct switch_ctx *sctx = ti->private;
    sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
    unsigned path_nr = switch_get_path_nr(sctx, offset);

    bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
    bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

    return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance: it reduces the time
 * needed to load 1000000 entries compared to a condition-based parser.
 *
 *          table-based parser  condition-based parser
 * PA-RISC  0.29s               0.31s
 * Opteron  0.0495s             0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
    unsigned char d;
    unsigned long r = 0;

    while ((d = hex_table[(unsigned char)**string]) < 16) {
        r = (r << 4) | d;
        (*string)++;
    }

    return r;
}
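
/*
 * E.g. for string = "3a:7", parse_hex() returns 0x3a and leaves *string
 * pointing at the ':' terminator; callers then validate that character.
 */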

static int process_set_region_mappings(struct switch_ctx *sctx,
                       unsigned argc, char **argv)
{
    unsigned i;
    unsigned long region_index = 0;

    for (i = 1; i < argc; i++) {
        unsigned long path_nr;
        const char *string = argv[i];

        if ((*string & 0xdf) == 'R') {
            unsigned long cycle_length, num_write;

            string++;
            if (unlikely(*string == ',')) {
                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                return -EINVAL;
            }
            cycle_length = parse_hex(&string);
            if (unlikely(*string != ',')) {
                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                return -EINVAL;
            }
            string++;
            if (unlikely(!*string)) {
                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                return -EINVAL;
            }
            num_write = parse_hex(&string);
            if (unlikely(*string)) {
                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                return -EINVAL;
            }

            if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
                DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
                       cycle_length - 1, region_index);
                return -EINVAL;
            }
            if (unlikely(region_index + num_write < region_index) ||
                unlikely(region_index + num_write >= sctx->nr_regions)) {
                DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
                       region_index, num_write, sctx->nr_regions);
                return -EINVAL;
            }

            while (num_write--) {
                region_index++;
                path_nr = switch_region_table_read(sctx, region_index - cycle_length);
                switch_region_table_write(sctx, region_index, path_nr);
            }

            continue;
        }

        if (*string == ':')
            region_index++;
        else {
            region_index = parse_hex(&string);
            if (unlikely(*string != ':')) {
                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                return -EINVAL;
            }
        }

        string++;
        if (unlikely(!*string)) {
            DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
            return -EINVAL;
        }

        path_nr = parse_hex(&string);
        if (unlikely(*string)) {
            DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
            return -EINVAL;
        }
        if (unlikely(region_index >= sctx->nr_regions)) {
            DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
            return -EINVAL;
        }
        if (unlikely(path_nr >= sctx->nr_paths)) {
            DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
            return -EINVAL;
        }

        switch_region_table_write(sctx, region_index, path_nr);
    }

    return 0;
}
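
/*
 * Example message (region indexes and path numbers are hexadecimal;
 * the values are illustrative):
 *
 *   dmsetup message switch 0 set_region_mappings 100:1 :2 200:3 R2,10
 *
 * sets region 0x100 to path 1, region 0x101 to path 2 (a bare :n means
 * "previous index + 1"), region 0x200 to path 3, then R2,10 replays the
 * preceding 2-entry cycle over the next 0x10 regions.
 */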

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
              char *result, unsigned maxlen)
{
    static DEFINE_MUTEX(message_mutex);

    struct switch_ctx *sctx = ti->private;
    int r = -EINVAL;

    mutex_lock(&message_mutex);

    if (!strcasecmp(argv[0], "set_region_mappings"))
        r = process_set_region_mappings(sctx, argc, argv);
    else
        DMWARN("Unrecognised message received.");

    mutex_unlock(&message_mutex);

    return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
              unsigned status_flags, char *result, unsigned maxlen)
{
    struct switch_ctx *sctx = ti->private;
    unsigned sz = 0;
    int path_nr;

    switch (type) {
    case STATUSTYPE_INFO:
        result[0] = '\0';
        break;

    case STATUSTYPE_TABLE:
        DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
        for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
            DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
                   (unsigned long long)sctx->path_list[path_nr].start);
        break;

    case STATUSTYPE_IMA:
        result[0] = '\0';
        break;
    }
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
    struct switch_ctx *sctx = ti->private;
    unsigned path_nr;

    path_nr = switch_get_path_nr(sctx, 0);

    *bdev = sctx->path_list[path_nr].dmdev->bdev;

    /*
     * Only pass ioctls through if the device sizes match exactly.
     */
    if (ti->len + sctx->path_list[path_nr].start !=
        bdev_nr_sectors((*bdev)))
        return 1;
    return 0;
}

static int switch_iterate_devices(struct dm_target *ti,
                  iterate_devices_callout_fn fn, void *data)
{
    struct switch_ctx *sctx = ti->private;
    int path_nr;
    int r;

    for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
        r = fn(ti, sctx->path_list[path_nr].dmdev,
             sctx->path_list[path_nr].start, ti->len, data);
        if (r)
            return r;
    }

    return 0;
}

static struct target_type switch_target = {
    .name = "switch",
    .version = {1, 1, 0},
    .features = DM_TARGET_NOWAIT,
    .module = THIS_MODULE,
    .ctr = switch_ctr,
    .dtr = switch_dtr,
    .map = switch_map,
    .message = switch_message,
    .status = switch_status,
    .prepare_ioctl = switch_prepare_ioctl,
    .iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
    int r;

    r = dm_register_target(&switch_target);
    if (r < 0)
        DMERR("dm_register_target() failed %d", r);

    return r;
}

static void __exit dm_switch_exit(void)
{
    dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");