// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#include "dm-core.h"

#define DM_MSG_PREFIX "zone"

#define DM_ZONE_INVALID_WP_OFST     UINT_MAX

/*
 * For internal zone reports bypassing the top BIO submission path.
 */
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
                  sector_t sector, unsigned int nr_zones,
                  report_zones_cb cb, void *data)
{
    struct gendisk *disk = md->disk;
    int ret;
    struct dm_report_zones_args args = {
        .next_sector = sector,
        .orig_data = data,
        .orig_cb = cb,
    };

    do {
        struct dm_target *tgt;

        tgt = dm_table_find_target(t, args.next_sector);
        if (WARN_ON_ONCE(!tgt->type->report_zones))
            return -EIO;

        args.tgt = tgt;
        ret = tgt->type->report_zones(tgt, &args,
                          nr_zones - args.zone_idx);
        if (ret < 0)
            return ret;
    } while (args.zone_idx < nr_zones &&
         args.next_sector < get_capacity(disk));

    return args.zone_idx;
}

/*
 * User-facing block device report zones operation for a DM device. This
 * calls the report_zones operation of each target in the device table.
 * Targets generally implement that operation using dm_report_zones().
 */
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
            unsigned int nr_zones, report_zones_cb cb, void *data)
{
    struct mapped_device *md = disk->private_data;
    struct dm_table *map;
    int srcu_idx, ret;

    if (dm_suspended_md(md))
        return -EAGAIN;

    map = dm_get_live_table(md, &srcu_idx);
    if (!map)
        return -EIO;

    ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);

    dm_put_live_table(md, srcu_idx);

    return ret;
}

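/*
 * Zone report callback: remap the zone information reported by the
 * underlying device into the zone range of the target and forward it to
 * the original report callback.
 */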
static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
                  void *data)
{
    struct dm_report_zones_args *args = data;
    sector_t sector_diff = args->tgt->begin - args->start;

    /*
     * Ignore zones beyond the target range.
     */
    if (zone->start >= args->start + args->tgt->len)
        return 0;

    /*
     * Remap the start sector and write pointer position of the zone
     * to match its position in the target range.
     */
    zone->start += sector_diff;
    if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
        if (zone->cond == BLK_ZONE_COND_FULL)
            zone->wp = zone->start + zone->len;
        else if (zone->cond == BLK_ZONE_COND_EMPTY)
            zone->wp = zone->start;
        else
            zone->wp += sector_diff;
    }

    args->next_sector = zone->start + zone->len;
    return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}

/*
 * Helper for zoned targets to implement the struct target_type
 * report_zones operation.
 */
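/*
 * A target report_zones method is typically a thin wrapper around this
 * helper. As a sketch (modeled on dm-linear, where struct linear_c and
 * linear_map_sector() are dm-linear internals):
 *
 *    static int linear_report_zones(struct dm_target *ti,
 *            struct dm_report_zones_args *args, unsigned int nr_zones)
 *    {
 *        struct linear_c *lc = ti->private;
 *
 *        return dm_report_zones(lc->dev->bdev, lc->start,
 *                       linear_map_sector(ti, args->next_sector),
 *                       args, nr_zones);
 *    }
 */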
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
            struct dm_report_zones_args *args, unsigned int nr_zones)
{
    /*
     * Set the target mapping start sector first so that
     * dm_report_zones_cb() can correctly remap zone information.
     */
    args->start = start;

    return blkdev_report_zones(bdev, sector, nr_zones,
                   dm_report_zones_cb, args);
}
EXPORT_SYMBOL_GPL(dm_report_zones);

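/*
 * Check if a BIO to a zoned mapped device issues a regular write or write
 * zeroes operation that transfers data (empty flush BIOs are excluded).
 */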
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
    struct request_queue *q = md->queue;

    if (!blk_queue_is_zoned(q))
        return false;

    switch (bio_op(bio)) {
    case REQ_OP_WRITE_ZEROES:
    case REQ_OP_WRITE:
        return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
    default:
        return false;
    }
}

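/*
 * Free the zone append emulation resources of a mapped device: the
 * conventional zone bitmap, the sequential zone write lock bitmap and the
 * zone write pointer offset array.
 */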
void dm_cleanup_zoned_dev(struct mapped_device *md)
{
    if (md->disk) {
        kfree(md->disk->conv_zones_bitmap);
        md->disk->conv_zones_bitmap = NULL;
        kfree(md->disk->seq_zones_wlock);
        md->disk->seq_zones_wlock = NULL;
    }

    kvfree(md->zwp_offset);
    md->zwp_offset = NULL;
    md->nr_zones = 0;
}

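/*
 * Return the write pointer offset of a zone, relative to the zone start
 * sector.
 */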
static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
{
    switch (zone->cond) {
    case BLK_ZONE_COND_IMP_OPEN:
    case BLK_ZONE_COND_EXP_OPEN:
    case BLK_ZONE_COND_CLOSED:
        return zone->wp - zone->start;
    case BLK_ZONE_COND_FULL:
        return zone->len;
    case BLK_ZONE_COND_EMPTY:
    case BLK_ZONE_COND_NOT_WP:
    case BLK_ZONE_COND_OFFLINE:
    case BLK_ZONE_COND_READONLY:
    default:
        /*
         * Conventional, offline and read-only zones do not have a valid
         * write pointer. Use 0, as for an empty zone.
         */
        return 0;
    }
}

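/*
 * Zone report callback used by dm_revalidate_zones(): allocate on demand
 * and initialize the conventional zone bitmap, the sequential zone write
 * lock bitmap and the zone write pointer offset array.
 */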
static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
                 void *data)
{
    struct mapped_device *md = data;
    struct gendisk *disk = md->disk;

    switch (zone->type) {
    case BLK_ZONE_TYPE_CONVENTIONAL:
        if (!disk->conv_zones_bitmap) {
            disk->conv_zones_bitmap =
                kcalloc(BITS_TO_LONGS(disk->nr_zones),
                    sizeof(unsigned long), GFP_NOIO);
            if (!disk->conv_zones_bitmap)
                return -ENOMEM;
        }
        set_bit(idx, disk->conv_zones_bitmap);
        break;
    case BLK_ZONE_TYPE_SEQWRITE_REQ:
    case BLK_ZONE_TYPE_SEQWRITE_PREF:
        if (!disk->seq_zones_wlock) {
            disk->seq_zones_wlock =
                kcalloc(BITS_TO_LONGS(disk->nr_zones),
                    sizeof(unsigned long), GFP_NOIO);
            if (!disk->seq_zones_wlock)
                return -ENOMEM;
        }
        if (!md->zwp_offset) {
            md->zwp_offset =
                kvcalloc(disk->nr_zones, sizeof(unsigned int),
                     GFP_KERNEL);
            if (!md->zwp_offset)
                return -ENOMEM;
        }
        md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);

        break;
    default:
        DMERR("Invalid zone type 0x%x at sectors %llu",
              (int)zone->type, zone->start);
        return -ENODEV;
    }

    return 0;
}

/*
 * Revalidate the zones of a mapped device to initialize the resources
 * necessary for zone append emulation. Note that we cannot simply use the
 * block layer blk_revalidate_disk_zones() function here as the mapped device
 * is suspended (this is called from __bind() context).
 */
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
    struct gendisk *disk = md->disk;
    unsigned int noio_flag;
    int ret;

    /*
     * Check if something changed. If yes, cleanup the current resources
     * and reallocate everything.
     */
    if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
        dm_cleanup_zoned_dev(md);
    if (md->nr_zones)
        return 0;

    /*
     * Scan all zones to initialize everything. Ensure that all vmalloc
     * operations in this context are done as if GFP_NOIO was specified.
     */
    noio_flag = memalloc_noio_save();
    ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
                     dm_zone_revalidate_cb, md);
    memalloc_noio_restore(noio_flag);
    if (ret < 0)
        goto err;
    if (ret != disk->nr_zones) {
        ret = -EIO;
        goto err;
    }

    md->nr_zones = disk->nr_zones;

    return 0;

err:
    DMERR("Revalidate zones failed %d", ret);
    dm_cleanup_zoned_dev(md);
    return ret;
}

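/*
 * iterate_devices callback: return true (non-zero) if the device cannot
 * natively execute zone append operations, that is, if it is not zoned.
 */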
static int device_not_zone_append_capable(struct dm_target *ti,
                      struct dm_dev *dev, sector_t start,
                      sector_t len, void *data)
{
    return !bdev_is_zoned(dev->bdev);
}

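/*
 * Check if all the targets of a table natively support zone append
 * operations: no target may request emulation and every underlying device
 * must be zoned.
 */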
static bool dm_table_supports_zone_append(struct dm_table *t)
{
    for (unsigned int i = 0; i < t->num_targets; i++) {
        struct dm_target *ti = dm_table_get_target(t, i);

        if (ti->emulate_zone_append)
            return false;

        if (!ti->type->iterate_devices ||
            ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
            return false;
    }

    return true;
}

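/*
 * Set up the zone restrictions of a mapped device: update the number of
 * zones of the device and, if zone append is not natively supported,
 * enable its emulation.
 */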
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
{
    struct mapped_device *md = t->md;

    /*
     * For a zoned target, the number of zones should be updated for the
     * correct value to be exposed in sysfs queue/nr_zones.
     */
    WARN_ON_ONCE(queue_is_mq(q));
    md->disk->nr_zones = bdev_nr_zones(md->disk->part0);

    /* Check if zone append is natively supported */
    if (dm_table_supports_zone_append(t)) {
        clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
        dm_cleanup_zoned_dev(md);
        return 0;
    }

    /*
     * Mark the mapped device as needing zone append emulation and
     * initialize the emulation resources once the capacity is set.
     */
    set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
    if (!get_capacity(md->disk))
        return 0;

    return dm_revalidate_zones(md, t);
}

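/*
 * Zone report callback used by dm_update_zone_wp_offset(): save the write
 * pointer offset of the reported zone.
 */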
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
                       void *data)
{
    unsigned int *wp_offset = data;

    *wp_offset = dm_get_zone_wp_offset(zone);

    return 0;
}

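/*
 * Recover the write pointer offset of a zone by reporting that single
 * zone.
 */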
static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
                    unsigned int *wp_ofst)
{
    sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
    unsigned int noio_flag;
    struct dm_table *t;
    int srcu_idx, ret;

    t = dm_get_live_table(md, &srcu_idx);
    if (!t)
        return -EIO;

    /*
     * Ensure that all memory allocations in this context are done as if
     * GFP_NOIO was specified.
     */
    noio_flag = memalloc_noio_save();
    ret = dm_blk_do_report_zones(md, t, sector, 1,
                     dm_update_zone_wp_offset_cb, wp_ofst);
    memalloc_noio_restore(noio_flag);

    dm_put_live_table(md, srcu_idx);

    if (ret != 1)
        return -EIO;

    return 0;
}

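/*
 * Operation and size of an original BIO, saved before the BIO is mapped
 * so that the zone write pointer offset can be updated after the target
 * processed the BIO.
 */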
struct orig_bio_details {
    enum req_op op;
    unsigned int nr_sectors;
};

/*
 * First phase of BIO mapping for targets with zone append emulation:
 * check all BIOs that change a zone write pointer and turn zone append
 * operations into regular write operations.
 */
static bool dm_zone_map_bio_begin(struct mapped_device *md,
                  unsigned int zno, struct bio *clone)
{
    sector_t zsectors = bdev_zone_sectors(md->disk->part0);
    unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

    /*
     * If the target zone is in an error state, recover by inspecting the
     * zone to get its current write pointer position. Note that since the
     * target zone is already write-locked, a BIO issuing context can
     * never observe a partially updated write pointer offset.
     */
    if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
        if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
            return false;
        WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
    }

    switch (bio_op(clone)) {
    case REQ_OP_ZONE_RESET:
    case REQ_OP_ZONE_FINISH:
        return true;
    case REQ_OP_WRITE_ZEROES:
    case REQ_OP_WRITE:
        /* Writes must be aligned to the zone write pointer */
        if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
            return false;
        break;
    case REQ_OP_ZONE_APPEND:
        /*
         * Change zone append operations into non-mergeable regular
         * writes directed at the current write pointer position of the
         * target zone.
         */
        clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
            (clone->bi_opf & (~REQ_OP_MASK));
        clone->bi_iter.bi_sector += zwp_offset;
        break;
    default:
        DMWARN_LIMIT("Invalid BIO operation");
        return false;
    }

    /* Cannot write to a full zone */
    if (zwp_offset >= zsectors)
        return false;

    return true;
}

/*
 * Second phase of BIO mapping for targets with zone append emulation:
 * update the zone write pointer offset array to account for the additional
 * data written to a zone. Note that at this point, the remapped clone BIO
 * may already have completed, so we do not touch it.
 */
static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
                    struct orig_bio_details *orig_bio_details,
                    unsigned int nr_sectors)
{
    unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

    /* The clone BIO may already have been completed and failed */
    if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
        return BLK_STS_IOERR;

    /* Update the zone wp offset */
    switch (orig_bio_details->op) {
    case REQ_OP_ZONE_RESET:
        WRITE_ONCE(md->zwp_offset[zno], 0);
        return BLK_STS_OK;
    case REQ_OP_ZONE_FINISH:
        WRITE_ONCE(md->zwp_offset[zno],
               bdev_zone_sectors(md->disk->part0));
        return BLK_STS_OK;
    case REQ_OP_WRITE_ZEROES:
    case REQ_OP_WRITE:
        WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
        return BLK_STS_OK;
    case REQ_OP_ZONE_APPEND:
        /*
         * Check that the target did not truncate the write operation
         * emulating a zone append.
         */
        if (nr_sectors != orig_bio_details->nr_sectors) {
            DMWARN_LIMIT("Truncated write for zone append");
            return BLK_STS_IOERR;
        }
        WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
        return BLK_STS_OK;
    default:
        DMWARN_LIMIT("Invalid BIO operation");
        return BLK_STS_IOERR;
    }
}

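/*
 * Take the write lock of the target zone of a BIO and flag the BIO as
 * owning that lock.
 */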
static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
                struct bio *clone)
{
    if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
        return;

    wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
    bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

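/*
 * Release the write lock of the target zone of a BIO, if the BIO owns it.
 */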
static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
                  struct bio *clone)
{
    if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
        return;

    WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
    clear_bit_unlock(zno, disk->seq_zones_wlock);
    smp_mb__after_atomic();
    wake_up_bit(disk->seq_zones_wlock, zno);

    bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

static bool dm_need_zone_wp_tracking(struct bio *bio)
{
    /*
     * Special processing is not needed for operations that do not need the
     * zone write lock, that is, all operations that target conventional
     * zones and all operations that do not directly modify a sequential
     * zone write pointer.
     */
    if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
        return false;
    switch (bio_op(bio)) {
    case REQ_OP_WRITE_ZEROES:
    case REQ_OP_WRITE:
    case REQ_OP_ZONE_RESET:
    case REQ_OP_ZONE_FINISH:
    case REQ_OP_ZONE_APPEND:
        return bio_zone_is_seq(bio);
    default:
        return false;
    }
}

/*
 * Special IO mapping for targets needing zone append emulation.
 */
int dm_zone_map_bio(struct dm_target_io *tio)
{
    struct dm_io *io = tio->io;
    struct dm_target *ti = tio->ti;
    struct mapped_device *md = io->md;
    struct bio *clone = &tio->clone;
    struct orig_bio_details orig_bio_details;
    unsigned int zno;
    blk_status_t sts;
    int r;

    /*
     * IOs that do not change a zone write pointer do not need
     * any additional special processing.
     */
    if (!dm_need_zone_wp_tracking(clone))
        return ti->type->map(ti, clone);

    /* Lock the target zone */
    zno = bio_zone_no(clone);
    dm_zone_lock(md->disk, zno, clone);

    orig_bio_details.nr_sectors = bio_sectors(clone);
    orig_bio_details.op = bio_op(clone);

    /*
     * Check that the bio and the target zone write pointer offset are
     * both valid, and if the bio is a zone append, remap it to a write.
     */
    if (!dm_zone_map_bio_begin(md, zno, clone)) {
        dm_zone_unlock(md->disk, zno, clone);
        return DM_MAPIO_KILL;
    }

    /* Let the target do its work */
    r = ti->type->map(ti, clone);
    switch (r) {
    case DM_MAPIO_SUBMITTED:
        /*
         * The target submitted the clone BIO. The target zone will
         * be unlocked on completion of the clone.
         */
        sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
                      *tio->len_ptr);
        break;
    case DM_MAPIO_REMAPPED:
        /*
         * The target only remapped the clone BIO. In case of error,
         * unlock the target zone here as the clone will not be
         * submitted.
         */
        sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
                      *tio->len_ptr);
        if (sts != BLK_STS_OK)
            dm_zone_unlock(md->disk, zno, clone);
        break;
    case DM_MAPIO_REQUEUE:
    case DM_MAPIO_KILL:
    default:
        dm_zone_unlock(md->disk, zno, clone);
        sts = BLK_STS_IOERR;
        break;
    }

    if (sts != BLK_STS_OK)
        return DM_MAPIO_KILL;

    return r;
}

/*
 * IO completion callback called from clone_endio().
 */
void dm_zone_endio(struct dm_io *io, struct bio *clone)
{
    struct mapped_device *md = io->md;
    struct gendisk *disk = md->disk;
    struct bio *orig_bio = io->orig_bio;
    unsigned int zwp_offset;
    unsigned int zno;

    /*
     * For targets that do not emulate zone append, we only need to
     * handle native zone-append bios.
     */
    if (!dm_emulate_zone_append(md)) {
        /*
         * Get the offset within the zone of the written sector
         * and add that to the original bio sector position.
         */
        if (clone->bi_status == BLK_STS_OK &&
            bio_op(clone) == REQ_OP_ZONE_APPEND) {
            sector_t mask =
                (sector_t)bdev_zone_sectors(disk->part0) - 1;

            orig_bio->bi_iter.bi_sector +=
                clone->bi_iter.bi_sector & mask;
        }

        return;
    }

    /*
     * For targets that do emulate zone append, if the clone BIO does not
     * own the target zone write lock, we have nothing to do.
     */
    if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
        return;

    zno = bio_zone_no(orig_bio);

    if (clone->bi_status != BLK_STS_OK) {
        /*
         * BIOs that modify a zone write pointer may leave the zone
         * in an unknown state in case of failure (e.g. the write
         * pointer was only partially advanced). In this case, mark
         * the target zone write pointer offset as invalid.
         */
        WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
    } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
        /*
         * Get the written sector for zone append operations that were
         * emulated using regular write operations.
         */
        zwp_offset = READ_ONCE(md->zwp_offset[zno]);
        if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
            WRITE_ONCE(md->zwp_offset[zno],
                   DM_ZONE_INVALID_WP_OFST);
        else
            orig_bio->bi_iter.bi_sector +=
                zwp_offset - bio_sectors(orig_bio);
    }

    dm_zone_unlock(disk, zno, clone);
}