// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define DM_MSG_PREFIX       "zoned"

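/*
 * Minimum number of BIOs reserved in the clone BIO set
 * (see bioset_init() in dmz_ctr()).
 */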
#define DMZ_MIN_BIOS        8192

/*
 * Zone BIO context: per-BIO data attached to each target BIO. The
 * reference count is set when the BIO is mapped, incremented for each
 * clone submitted to a zone, and the target BIO completes once all
 * references have been dropped.
 */
struct dmz_bioctx {
    struct dmz_dev      *dev;
    struct dm_zone      *zone;
    struct bio      *bio;
    refcount_t      ref;
};

/*
 * Chunk work descriptor: BIOs targeting the same chunk are queued on a
 * single work item and processed in order by dmz_chunk_work().
 */
struct dm_chunk_work {
    struct work_struct  work;
    refcount_t      refcount;
    struct dmz_target   *target;
    unsigned int        chunk;
    struct bio_list     bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
    struct dm_dev       **ddev;
    unsigned int        nr_ddevs;

    unsigned int        flags;

    /* Zoned block device information */
    struct dmz_dev      *dev;

    /* For metadata handling */
    struct dmz_metadata     *metadata;

    /* For chunk work */
    struct radix_tree_root  chunk_rxtree;
    struct workqueue_struct *chunk_wq;
    struct mutex        chunk_lock;

    /* For cloned BIOs to zones */
    struct bio_set      bio_set;

    /* For flush */
    spinlock_t      flush_lock;
    struct bio_list     flush_list;
    struct delayed_work flush_work;
    struct workqueue_struct *flush_wq;
};

/*
 * Flush period (10 seconds, expressed in jiffies).
 */
#define DMZ_FLUSH_PERIOD    (10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
    struct dmz_bioctx *bioctx =
        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

    if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
        bio->bi_status = status;
    if (bioctx->dev && bio->bi_status != BLK_STS_OK)
        bioctx->dev->flags |= DMZ_CHECK_BDEV;

    if (refcount_dec_and_test(&bioctx->ref)) {
        struct dm_zone *zone = bioctx->zone;

        if (zone) {
            if (bio->bi_status != BLK_STS_OK &&
                bio_op(bio) == REQ_OP_WRITE &&
                dmz_is_seq(zone))
                set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
            dmz_deactivate_zone(zone);
        }
        bio_endio(bio);
    }
}

/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
    struct dmz_bioctx *bioctx = clone->bi_private;
    blk_status_t status = clone->bi_status;

    bio_put(clone);
    dmz_bio_endio(bioctx->bio, status);
}

/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
              struct bio *bio, sector_t chunk_block,
              unsigned int nr_blocks)
{
    struct dmz_bioctx *bioctx =
        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
    struct dmz_dev *dev = zone->dev;
    struct bio *clone;

    if (dev->flags & DMZ_BDEV_DYING)
        return -EIO;

    clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set);
    if (!clone)
        return -ENOMEM;

    bioctx->dev = dev;
    clone->bi_iter.bi_sector =
        dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
    clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
    clone->bi_end_io = dmz_clone_endio;
    clone->bi_private = bioctx;

    bio_advance(bio, clone->bi_iter.bi_size);

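    /*
     * Each submitted clone holds a reference on the BIO context,
     * dropped by dmz_clone_endio().
     */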
    refcount_inc(&bioctx->ref);
    submit_bio_noacct(clone);

    if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
        zone->wp_block += nr_blocks;

    return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
                 sector_t chunk_block, unsigned int nr_blocks)
{
    unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

    /*
     * Temporarily restrict the BIO size to the range to clear so that
     * zero_fill_bio() only zeroes these blocks, then restore the size
     * and advance past the zeroed range.
     */
    swap(bio->bi_iter.bi_size, size);
    zero_fill_bio(bio);
    swap(bio->bi_iter.bi_size, size);

    bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
               struct bio *bio)
{
    struct dmz_metadata *zmd = dmz->metadata;
    sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
    unsigned int nr_blocks = dmz_bio_blocks(bio);
    sector_t end_block = chunk_block + nr_blocks;
    struct dm_zone *rzone, *bzone;
    int ret;

    /* Reads to unmapped chunks only need to zero the BIO buffer */
    if (!zone) {
        zero_fill_bio(bio);
        return 0;
    }

    DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
        dmz_metadata_label(zmd),
        (unsigned long long)dmz_bio_chunk(zmd, bio),
        (dmz_is_rnd(zone) ? "RND" :
         (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
        zone->id,
        (unsigned long long)chunk_block, nr_blocks);

    /* Check block validity to determine the read location */
    bzone = zone->bzone;
    while (chunk_block < end_block) {
        nr_blocks = 0;
        if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
            chunk_block < zone->wp_block) {
            /* Test block validity in the data zone */
            ret = dmz_block_valid(zmd, zone, chunk_block);
            if (ret < 0)
                return ret;
            if (ret > 0) {
                /* Read data zone blocks */
                nr_blocks = ret;
                rzone = zone;
            }
        }

        /*
         * No valid blocks found in the data zone.
         * Check the buffer zone, if there is one.
         */
        if (!nr_blocks && bzone) {
            ret = dmz_block_valid(zmd, bzone, chunk_block);
            if (ret < 0)
                return ret;
            if (ret > 0) {
                /* Read buffer zone blocks */
                nr_blocks = ret;
                rzone = bzone;
            }
        }

        if (nr_blocks) {
            /* Valid blocks found: read them */
            nr_blocks = min_t(unsigned int, nr_blocks,
                      end_block - chunk_block);
            ret = dmz_submit_bio(dmz, rzone, bio,
                         chunk_block, nr_blocks);
            if (ret)
                return ret;
            chunk_block += nr_blocks;
        } else {
            /* No valid block: zero out the current BIO block */
            dmz_handle_read_zero(dmz, bio, chunk_block, 1);
            chunk_block++;
        }
    }

    return 0;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the blocks written
 * in place.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
                   struct dm_zone *zone, struct bio *bio,
                   sector_t chunk_block,
                   unsigned int nr_blocks)
{
    struct dmz_metadata *zmd = dmz->metadata;
    struct dm_zone *bzone = zone->bzone;
    int ret;

    if (dmz_is_readonly(zone))
        return -EROFS;

    /* Submit write */
    ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
    if (ret)
        return ret;

    /*
     * Validate the blocks in the data zone and invalidate
     * in the buffer zone, if there is one.
     */
    ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
    if (ret == 0 && bzone)
        ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

    return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
                     struct dm_zone *zone, struct bio *bio,
                     sector_t chunk_block,
                     unsigned int nr_blocks)
{
    struct dmz_metadata *zmd = dmz->metadata;
    struct dm_zone *bzone;
    int ret;

    /* Get the buffer zone. One will be allocated if needed */
    bzone = dmz_get_chunk_buffer(zmd, zone);
    if (IS_ERR(bzone))
        return PTR_ERR(bzone);

    if (dmz_is_readonly(bzone))
        return -EROFS;

    /* Submit write */
    ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
    if (ret)
        return ret;

    /*
     * Validate the blocks in the buffer zone
     * and invalidate in the data zone.
     */
    ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
    if (ret == 0 && chunk_block < zone->wp_block)
        ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

    return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
                struct bio *bio)
{
    struct dmz_metadata *zmd = dmz->metadata;
    sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
    unsigned int nr_blocks = dmz_bio_blocks(bio);

    if (!zone)
        return -ENOSPC;

    DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
        dmz_metadata_label(zmd),
        (unsigned long long)dmz_bio_chunk(zmd, bio),
        (dmz_is_rnd(zone) ? "RND" :
         (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
        zone->id,
        (unsigned long long)chunk_block, nr_blocks);

    if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
        chunk_block == zone->wp_block) {
        /*
         * The zone is a random (or cache) zone, or it is a sequential
         * zone and the BIO is aligned to the zone write pointer:
         * write directly to the zone.
         */
        return dmz_handle_direct_write(dmz, zone, bio,
                           chunk_block, nr_blocks);
    }

    /*
     * This is an unaligned write in a sequential zone:
     * use buffered write.
     */
    return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
                  struct bio *bio)
{
    struct dmz_metadata *zmd = dmz->metadata;
    sector_t block = dmz_bio_block(bio);
    unsigned int nr_blocks = dmz_bio_blocks(bio);
    sector_t chunk_block = dmz_chunk_block(zmd, block);
    int ret = 0;

    /* For unmapped chunks, there is nothing to do */
    if (!zone)
        return 0;

    if (dmz_is_readonly(zone))
        return -EROFS;

    DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
        dmz_metadata_label(dmz->metadata),
        (unsigned long long)dmz_bio_chunk(zmd, bio),
        zone->id,
        (unsigned long long)chunk_block, nr_blocks);

    /*
     * Invalidate blocks in the data zone and its
     * buffer zone if one is mapped.
     */
    if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
        chunk_block < zone->wp_block)
        ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
    if (ret == 0 && zone->bzone)
        ret = dmz_invalidate_blocks(zmd, zone->bzone,
                        chunk_block, nr_blocks);
    return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
               struct bio *bio)
{
    struct dmz_bioctx *bioctx =
        dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
    struct dmz_metadata *zmd = dmz->metadata;
    struct dm_zone *zone;
    int ret;

    dmz_lock_metadata(zmd);

    /*
     * Get the data zone mapping the chunk. There may be no
     * mapping for read and discard. If a mapping is obtained,
     * the zone returned will be set to active state.
     */
    zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
                     bio_op(bio));
    if (IS_ERR(zone)) {
        ret = PTR_ERR(zone);
        goto out;
    }

    /* Process the BIO */
    if (zone) {
        dmz_activate_zone(zone);
        bioctx->zone = zone;
        dmz_reclaim_bio_acc(zone->dev->reclaim);
    }

    switch (bio_op(bio)) {
    case REQ_OP_READ:
        ret = dmz_handle_read(dmz, zone, bio);
        break;
    case REQ_OP_WRITE:
        ret = dmz_handle_write(dmz, zone, bio);
        break;
    case REQ_OP_DISCARD:
    case REQ_OP_WRITE_ZEROES:
        ret = dmz_handle_discard(dmz, zone, bio);
        break;
    default:
        DMERR("(%s): Unsupported BIO operation 0x%x",
              dmz_metadata_label(dmz->metadata), bio_op(bio));
        ret = -EIO;
    }

    /*
     * Release the chunk mapping. This will check that the mapping
     * is still valid, that is, that the zone used still has valid blocks.
     */
    if (zone)
        dmz_put_chunk_mapping(zmd, zone);
out:
    dmz_bio_endio(bio, errno_to_blk_status(ret));

    dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk work reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
    refcount_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
    if (refcount_dec_and_test(&cw->refcount)) {
        WARN_ON(!bio_list_empty(&cw->bio_list));
        radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
        kfree(cw);
    }
}

/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
    struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
    struct dmz_target *dmz = cw->target;
    struct bio *bio;

    mutex_lock(&dmz->chunk_lock);

    /* Process the chunk BIOs */
    while ((bio = bio_list_pop(&cw->bio_list))) {
        mutex_unlock(&dmz->chunk_lock);
        dmz_handle_bio(dmz, cw, bio);
        mutex_lock(&dmz->chunk_lock);
        dmz_put_chunk_work(cw);
    }

    /* Queueing the work incremented the work refcount */
    dmz_put_chunk_work(cw);

    mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work: periodically flush dirty metadata blocks and complete
 * any queued flush BIOs.
 */
static void dmz_flush_work(struct work_struct *work)
{
    struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
    struct bio *bio;
    int ret;

    /* Flush dirty metadata blocks */
    ret = dmz_flush_metadata(dmz->metadata);
    if (ret)
        DMDEBUG("(%s): Metadata flush failed, rc=%d",
            dmz_metadata_label(dmz->metadata), ret);

    /* Process queued flush requests */
    while (1) {
        spin_lock(&dmz->flush_lock);
        bio = bio_list_pop(&dmz->flush_list);
        spin_unlock(&dmz->flush_lock);

        if (!bio)
            break;

        dmz_bio_endio(bio, errno_to_blk_status(ret));
    }

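    /* Rearm the periodic metadata flush */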
    queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
    unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
    struct dm_chunk_work *cw;
    int ret = 0;

    mutex_lock(&dmz->chunk_lock);

    /* Get the BIO chunk work. If one is not active yet, create one */
    cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
    if (cw) {
        dmz_get_chunk_work(cw);
    } else {
        /* Create a new chunk work */
        cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
        if (unlikely(!cw)) {
            ret = -ENOMEM;
            goto out;
        }

        INIT_WORK(&cw->work, dmz_chunk_work);
        refcount_set(&cw->refcount, 1);
        cw->target = dmz;
        cw->chunk = chunk;
        bio_list_init(&cw->bio_list);

        ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
        if (unlikely(ret)) {
            kfree(cw);
            goto out;
        }
    }

    bio_list_add(&cw->bio_list, bio);

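    /*
     * queue_work() returns false if the work is already queued, in
     * which case no extra chunk work reference is taken.
     */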
    if (queue_work(dmz->chunk_wq, &cw->work))
        dmz_get_chunk_work(cw);
out:
    mutex_unlock(&dmz->chunk_lock);
    return ret;
}

/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
    if (dmz_dev->flags & DMZ_BDEV_DYING)
        return true;

    if (dmz_dev->flags & DMZ_CHECK_BDEV)
        return !dmz_check_bdev(dmz_dev);

    if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
        dmz_dev_warn(dmz_dev, "Backing device queue dying");
        dmz_dev->flags |= DMZ_BDEV_DYING;
    }

    return dmz_dev->flags & DMZ_BDEV_DYING;
}

/*
 * Check the backing device availability. This detects such events as
 * backing device going offline due to errors, media removals, etc.
 * This check is less efficient than dmz_bdev_is_dying() and should
 * only be performed as a part of error handling.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
    struct gendisk *disk;

    dmz_dev->flags &= ~DMZ_CHECK_BDEV;

    if (dmz_bdev_is_dying(dmz_dev))
        return false;

    disk = dmz_dev->bdev->bd_disk;
    if (disk->fops->check_events &&
        disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
        dmz_dev_warn(dmz_dev, "Backing device offline");
        dmz_dev->flags |= DMZ_BDEV_DYING;
    }

    return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
    struct dmz_target *dmz = ti->private;
    struct dmz_metadata *zmd = dmz->metadata;
    struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
    sector_t sector = bio->bi_iter.bi_sector;
    unsigned int nr_sectors = bio_sectors(bio);
    sector_t chunk_sector;
    int ret;

    if (dmz_dev_is_dying(zmd))
        return DM_MAPIO_KILL;

    DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
        dmz_metadata_label(zmd),
        bio_op(bio), (unsigned long long)sector, nr_sectors,
        (unsigned long long)dmz_bio_chunk(zmd, bio),
        (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
        (unsigned int)dmz_bio_blocks(bio));

    if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
        return DM_MAPIO_REMAPPED;

    /* The BIO should be block aligned */
    if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
        return DM_MAPIO_KILL;

    /* Initialize the BIO context */
    bioctx->dev = NULL;
    bioctx->zone = NULL;
    bioctx->bio = bio;
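    /* The initial reference is dropped in dmz_bio_endio() */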
    refcount_set(&bioctx->ref, 1);

    /* Queue empty flush BIOs on the flush list for the flush work */
    if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
        spin_lock(&dmz->flush_lock);
        bio_list_add(&dmz->flush_list, bio);
        spin_unlock(&dmz->flush_lock);
        mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
        return DM_MAPIO_SUBMITTED;
    }

    /* Split zone BIOs to fit entirely into a zone; DM core resubmits the rest */
    chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
    if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
        dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

    /* Now ready to handle this BIO */
    ret = dmz_queue_chunk_work(dmz, bio);
    if (ret) {
        DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
            dmz_metadata_label(zmd),
            bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
            ret);
        return DM_MAPIO_REQUEUE;
    }

    return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
                int idx, int nr_devs)
{
    struct dmz_target *dmz = ti->private;
    struct dm_dev *ddev;
    struct dmz_dev *dev;
    int ret;
    struct block_device *bdev;

    /* Get the target device */
    ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
    if (ret) {
        ti->error = "Get target device failed";
        return ret;
    }

    bdev = ddev->bdev;
    if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
        if (nr_devs == 1) {
            ti->error = "Invalid regular device";
            goto err;
        }
        if (idx != 0) {
            ti->error = "First device must be a regular device";
            goto err;
        }
        if (dmz->ddev[0]) {
            ti->error = "Too many regular devices";
            goto err;
        }
        dev = &dmz->dev[idx];
        dev->flags = DMZ_BDEV_REGULAR;
    } else {
        if (dmz->ddev[idx]) {
            ti->error = "Too many zoned devices";
            goto err;
        }
        if (nr_devs > 1 && idx == 0) {
            ti->error = "First device must be a regular device";
            goto err;
        }
        dev = &dmz->dev[idx];
    }
    dev->bdev = bdev;
    dev->dev_idx = idx;

    dev->capacity = bdev_nr_sectors(bdev);
    if (ti->begin) {
        ti->error = "Partial mapping is not supported";
        goto err;
    }

    dmz->ddev[idx] = ddev;

    return 0;
err:
    dm_put_device(ti, ddev);
    return -EINVAL;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
    struct dmz_target *dmz = ti->private;
    int i;

    for (i = 0; i < dmz->nr_ddevs; i++) {
        if (dmz->ddev[i]) {
            dm_put_device(ti, dmz->ddev[i]);
            dmz->ddev[i] = NULL;
        }
    }
}

static int dmz_fixup_devices(struct dm_target *ti)
{
    struct dmz_target *dmz = ti->private;
    struct dmz_dev *reg_dev = NULL;
    sector_t zone_nr_sectors = 0;
    int i;

    /*
     * When we have more than one device, the first one must be a
     * regular block device and the others zoned block devices.
     */
    if (dmz->nr_ddevs > 1) {
        reg_dev = &dmz->dev[0];
        if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
            ti->error = "Primary disk is not a regular device";
            return -EINVAL;
        }
        for (i = 1; i < dmz->nr_ddevs; i++) {
            struct dmz_dev *zoned_dev = &dmz->dev[i];
            struct block_device *bdev = zoned_dev->bdev;

            if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
                ti->error = "Secondary disk is not a zoned device";
                return -EINVAL;
            }
            if (zone_nr_sectors &&
                zone_nr_sectors != bdev_zone_sectors(bdev)) {
                ti->error = "Zone nr sectors mismatch";
                return -EINVAL;
            }
            zone_nr_sectors = bdev_zone_sectors(bdev);
            zoned_dev->zone_nr_sectors = zone_nr_sectors;
            zoned_dev->nr_zones = bdev_nr_zones(bdev);
        }
    } else {
        struct dmz_dev *zoned_dev = &dmz->dev[0];
        struct block_device *bdev = zoned_dev->bdev;

        if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
            ti->error = "Disk is not a zoned device";
            return -EINVAL;
        }
        zoned_dev->zone_nr_sectors = bdev_zone_sectors(bdev);
        zoned_dev->nr_zones = bdev_nr_zones(bdev);
    }

    if (reg_dev) {
        sector_t zone_offset;

        reg_dev->zone_nr_sectors = zone_nr_sectors;
        reg_dev->nr_zones =
            DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
                          reg_dev->zone_nr_sectors);
        reg_dev->zone_offset = 0;
        zone_offset = reg_dev->nr_zones;
        for (i = 1; i < dmz->nr_ddevs; i++) {
            dmz->dev[i].zone_offset = zone_offset;
            zone_offset += dmz->dev[i].nr_zones;
        }
    }
    return 0;
}

/*
 * Setup target. The table arguments are the backing device paths:
 * either a single zoned block device, or a regular block device
 * followed by one or more zoned block devices.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
    struct dmz_target *dmz;
    int ret, i;

    /* Check arguments */
    if (argc < 1) {
        ti->error = "Invalid argument count";
        return -EINVAL;
    }

    /* Allocate and initialize the target descriptor */
    dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
    if (!dmz) {
        ti->error = "Unable to allocate the zoned target descriptor";
        return -ENOMEM;
    }
    dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
    if (!dmz->dev) {
        ti->error = "Unable to allocate the zoned device descriptors";
        kfree(dmz);
        return -ENOMEM;
    }
    dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
    if (!dmz->ddev) {
        ti->error = "Unable to allocate the dm device descriptors";
        ret = -ENOMEM;
        goto err;
    }
    dmz->nr_ddevs = argc;

    ti->private = dmz;

    /* Get the target zoned block device */
    for (i = 0; i < argc; i++) {
        ret = dmz_get_zoned_device(ti, argv[i], i, argc);
        if (ret)
            goto err_dev;
    }
    ret = dmz_fixup_devices(ti);
    if (ret)
        goto err_dev;

    /* Initialize metadata */
    ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
                   dm_table_device_name(ti->table));
    if (ret) {
        ti->error = "Metadata initialization failed";
        goto err_dev;
    }

    /* Set target (no write same support) */
    ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
    ti->num_flush_bios = 1;
    ti->num_discard_bios = 1;
    ti->num_write_zeroes_bios = 1;
    ti->per_io_data_size = sizeof(struct dmz_bioctx);
    ti->flush_supported = true;
    ti->discards_supported = true;

    /* The exposed capacity is the number of chunks that can be mapped */
    ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
        dmz_zone_nr_sectors_shift(dmz->metadata);

    /* Zone BIO */
    ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
    if (ret) {
        ti->error = "Create BIO set failed";
        goto err_meta;
    }

    /* Chunk BIO work */
    mutex_init(&dmz->chunk_lock);
    INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
    dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
                    WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
                    dmz_metadata_label(dmz->metadata));
    if (!dmz->chunk_wq) {
        ti->error = "Create chunk workqueue failed";
        ret = -ENOMEM;
        goto err_bio;
    }

    /* Flush work */
    spin_lock_init(&dmz->flush_lock);
    bio_list_init(&dmz->flush_list);
    INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
    dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
                        dmz_metadata_label(dmz->metadata));
    if (!dmz->flush_wq) {
        ti->error = "Create flush workqueue failed";
        ret = -ENOMEM;
        goto err_cwq;
    }
    mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);

    /* Initialize reclaim */
    for (i = 0; i < dmz->nr_ddevs; i++) {
        ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
        if (ret) {
            ti->error = "Zone reclaim initialization failed";
            goto err_fwq;
        }
    }

    DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
           dmz_metadata_label(dmz->metadata),
           (unsigned long long)ti->len,
           (unsigned long long)dmz_sect2blk(ti->len));

    return 0;
err_fwq:
    destroy_workqueue(dmz->flush_wq);
err_cwq:
    destroy_workqueue(dmz->chunk_wq);
err_bio:
    mutex_destroy(&dmz->chunk_lock);
    bioset_exit(&dmz->bio_set);
err_meta:
    dmz_dtr_metadata(dmz->metadata);
err_dev:
    dmz_put_zoned_device(ti);
err:
    kfree(dmz->dev);
    kfree(dmz);

    return ret;
}

/*
 * Cleanup target.
 */
static void dmz_dtr(struct dm_target *ti)
{
    struct dmz_target *dmz = ti->private;
    int i;

    destroy_workqueue(dmz->chunk_wq);

    for (i = 0; i < dmz->nr_ddevs; i++)
        dmz_dtr_reclaim(dmz->dev[i].reclaim);

    cancel_delayed_work_sync(&dmz->flush_work);
    destroy_workqueue(dmz->flush_wq);

    (void) dmz_flush_metadata(dmz->metadata);

    dmz_dtr_metadata(dmz->metadata);

    bioset_exit(&dmz->bio_set);

    dmz_put_zoned_device(ti);

    mutex_destroy(&dmz->chunk_lock);

    kfree(dmz->dev);
    kfree(dmz);
}

/*
 * Setup target request queue limits.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
    struct dmz_target *dmz = ti->private;
    unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);

    limits->logical_block_size = DMZ_BLOCK_SIZE;
    limits->physical_block_size = DMZ_BLOCK_SIZE;

    blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
    blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

    limits->discard_alignment = 0;
    limits->discard_granularity = DMZ_BLOCK_SIZE;
    limits->max_discard_sectors = chunk_sectors;
    limits->max_hw_discard_sectors = chunk_sectors;
    limits->max_write_zeroes_sectors = chunk_sectors;

    /* FS hint to try to align to the device zone size */
    limits->chunk_sectors = chunk_sectors;
    limits->max_sectors = chunk_sectors;

    /* We are exposing a drive-managed zoned block device */
    limits->zoned = BLK_ZONED_NONE;
}

/*
 * Pass on ioctl to the backend device.
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
    struct dmz_target *dmz = ti->private;
    struct dmz_dev *dev = &dmz->dev[0];

    if (!dmz_check_bdev(dev))
        return -EIO;

    *bdev = dev->bdev;

    return 0;
}

/*
 * Stop works on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
    struct dmz_target *dmz = ti->private;
    int i;

    flush_workqueue(dmz->chunk_wq);
    for (i = 0; i < dmz->nr_ddevs; i++)
        dmz_suspend_reclaim(dmz->dev[i].reclaim);
    cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart works on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
    struct dmz_target *dmz = ti->private;
    int i;

    queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
    for (i = 0; i < dmz->nr_ddevs; i++)
        dmz_resume_reclaim(dmz->dev[i].reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
                   iterate_devices_callout_fn fn, void *data)
{
    struct dmz_target *dmz = ti->private;
    unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
    sector_t capacity;
    int i, r;

    for (i = 0; i < dmz->nr_ddevs; i++) {
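        /* Align the reported device capacity down to a zone boundary */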
        capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
        r = fn(ti, dmz->ddev[i], 0, capacity, data);
        if (r)
            break;
    }
    return r;
}

static void dmz_status(struct dm_target *ti, status_type_t type,
               unsigned int status_flags, char *result,
               unsigned int maxlen)
{
    struct dmz_target *dmz = ti->private;
    ssize_t sz = 0;
    char buf[BDEVNAME_SIZE];
    struct dmz_dev *dev;
    int i;

    switch (type) {
    case STATUSTYPE_INFO:
        DMEMIT("%u zones %u/%u cache",
               dmz_nr_zones(dmz->metadata),
               dmz_nr_unmap_cache_zones(dmz->metadata),
               dmz_nr_cache_zones(dmz->metadata));
        for (i = 0; i < dmz->nr_ddevs; i++) {
            /*
             * For a multi-device setup the first device
             * contains only cache zones.
             */
            if ((i == 0) &&
                (dmz_nr_cache_zones(dmz->metadata) > 0))
                continue;
            DMEMIT(" %u/%u random %u/%u sequential",
                   dmz_nr_unmap_rnd_zones(dmz->metadata, i),
                   dmz_nr_rnd_zones(dmz->metadata, i),
                   dmz_nr_unmap_seq_zones(dmz->metadata, i),
                   dmz_nr_seq_zones(dmz->metadata, i));
        }
        break;
    case STATUSTYPE_TABLE:
        dev = &dmz->dev[0];
        format_dev_t(buf, dev->bdev->bd_dev);
        DMEMIT("%s", buf);
        for (i = 1; i < dmz->nr_ddevs; i++) {
            dev = &dmz->dev[i];
            format_dev_t(buf, dev->bdev->bd_dev);
            DMEMIT(" %s", buf);
        }
        break;
    case STATUSTYPE_IMA:
        *result = '\0';
        break;
    }
    return;
}

static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
               char *result, unsigned int maxlen)
{
    struct dmz_target *dmz = ti->private;
    int r = -EINVAL;

    if (!strcasecmp(argv[0], "reclaim")) {
        int i;

        for (i = 0; i < dmz->nr_ddevs; i++)
            dmz_schedule_reclaim(dmz->dev[i].reclaim);
        r = 0;
    } else
        DMERR("unrecognized message %s", argv[0]);
    return r;
}

static struct target_type dmz_type = {
    .name        = "zoned",
    .version     = {2, 0, 0},
    .features    = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
    .module      = THIS_MODULE,
    .ctr         = dmz_ctr,
    .dtr         = dmz_dtr,
    .map         = dmz_map,
    .io_hints    = dmz_io_hints,
    .prepare_ioctl   = dmz_prepare_ioctl,
    .postsuspend     = dmz_suspend,
    .resume      = dmz_resume,
    .iterate_devices = dmz_iterate_devices,
    .status      = dmz_status,
    .message     = dmz_message,
};

static int __init dmz_init(void)
{
    return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
    dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");