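// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 */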
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

#include "blk.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
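
/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Returns the string for BLK_ZONE_COND_XXX. It is useful in the
 * production of the trace record for zoned devices.
 */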
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
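
/*
 * Return true if a request is a write request that needs zone write locking.
 */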
bool blk_req_needs_zone_write_lock(struct request *rq)
{
	if (blk_rq_is_passthrough(rq))
		return false;

	if (!rq->q->disk->seq_zones_wlock)
		return false;

	switch (req_op(rq)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		return blk_rq_zone_is_seq(rq);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

bool blk_req_zone_write_trylock(struct request *rq)
{
	unsigned int zno = blk_rq_zone_no(rq);

	if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
		return false;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;

	return true;
}
EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);

void __blk_req_zone_write_lock(struct request *rq)
{
	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
					  rq->q->disk->seq_zones_wlock)))
		return;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
	if (rq->q->disk->seq_zones_wlock)
		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
						 rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
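
/**
 * bdev_nr_zones - Get number of zones
 * @bdev:	Target device
 *
 * Return the total number of zones of a zoned block device.  For a block
 * device without zone capabilities, the number of zones is always 0.
 */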
unsigned int bdev_nr_zones(struct block_device *bdev)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);

	if (!bdev_is_zoned(bdev))
		return 0;
	return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
		ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(bdev_nr_zones);
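
/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones zones. The callback @cb is executed with @data for each
 *    reported zone. Returns the number of zones reported on success, or a
 *    negative errno value on failure.
 */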
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
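	/*
	 * For an all-zones reset, ignore conventional, empty, read-only
	 * and offline zones.
	 */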
	switch (zone->cond) {
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_READONLY:
	case BLK_ZONE_COND_OFFLINE:
		return 0;
	default:
		set_bit(idx, (unsigned long *)data);
		return 0;
	}
}

static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
					  gfp_t gfp_mask)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned long *need_reset;
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;

	need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
	if (!need_reset)
		return -ENOMEM;

	ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
				       blk_zone_need_reset_cb, need_reset);
	if (ret < 0)
		goto out_free_need_reset;

	ret = 0;
	while (sector < capacity) {
		if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
			sector += zone_sectors;
			continue;
		}

		bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
				   gfp_mask);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;
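
		/* This may take a while, so be nice to others */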
		cond_resched();
	}

	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out_free_need_reset:
	kfree(need_reset);
	return ret;
}

static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}
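
/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */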
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
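		/* Out of range */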
		return -EINVAL;
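
	/* Check alignment (handle eventual smaller last zone) */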
	if (sector & (zone_sectors - 1))
		return -EINVAL;

	if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
		return -EINVAL;
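
	/*
	 * In the case of a zone reset operation over all zones,
	 * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
	 * command. For other devices, we emulate this command behavior by
	 * identifying the zones needing a reset.
	 */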
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
		if (!blk_queue_zone_resetall(q))
			return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
		return blkdev_zone_reset_all(bdev, gfp_mask);
	}

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;
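
		/* This may take a while, so be nice to others */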
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}
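
/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */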
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct request_queue *q;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode,
				      const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
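		/* Out of range */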
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}
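
/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */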
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct request_queue *q;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & FMODE_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;
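
		/* Invalidate the page cache, including dirty pages. */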
		filemap_invalidate_lock(bdev->bd_inode->i_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
			       GFP_KERNEL);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_inode->i_mapping);

	return ret;
}

void disk_free_zone_bitmaps(struct gendisk *disk)
{
	kfree(disk->conv_zones_bitmap);
	disk->conv_zones_bitmap = NULL;
	kfree(disk->seq_zones_wlock);
	disk->seq_zones_wlock = NULL;
}

struct blk_revalidate_zone_args {
	struct gendisk *disk;
	unsigned long *conv_zones_bitmap;
	unsigned long *seq_zones_wlock;
	unsigned int nr_zones;
	sector_t zone_sectors;
	sector_t sector;
};
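
/*
 * Helper function to check the validity of zones of a zoned block device.
 */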
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	struct request_queue *q = disk->queue;
	sector_t capacity = get_capacity(disk);
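
	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone. For that last zone, just check that its size is
	 * smaller than other zones.
	 */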
	if (zone->start == 0) {
		if (zone->len == 0 || !is_power_of_2(zone->len)) {
			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
				disk->disk_name, zone->len);
			return -ENODEV;
		}

		args->zone_sectors = zone->len;
		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
	} else if (zone->start + args->zone_sectors < capacity) {
		if (zone->len != args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else {
		if (zone->len > args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with larger last zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	}
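
	/* Check for holes in the zone report */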
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}
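
	/* Check zone type */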
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		if (!args->conv_zones_bitmap) {
			args->conv_zones_bitmap =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, args->conv_zones_bitmap);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (!args->seq_zones_wlock) {
			args->seq_zones_wlock =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->seq_zones_wlock)
				return -ENOMEM;
		}
		break;
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		return -ENODEV;
	}

	args->sector += zone->len;
	return 0;
}
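
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk:	Target disk
 * @update_driver_data:	Callback to update driver data on the frozen disk
 *
 * Helper function for low-level device drivers to check and (re)allocate and
 * initialize a disk request queue zone bitmaps. This function should normally
 * be called within the disk ->revalidate method for blk-mq based drivers.
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * If the @update_driver_data callback function is not NULL, the callback is
 * executed with the device request queue frozen after all zones have been
 * checked.
 */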
int blk_revalidate_disk_zones(struct gendisk *disk,
			      void (*update_driver_data)(struct gendisk *disk))
{
	struct request_queue *q = disk->queue;
	struct blk_revalidate_zone_args args = {
		.disk = disk,
	};
	unsigned int noio_flag;
	int ret;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;
	if (WARN_ON_ONCE(!queue_is_mq(q)))
		return -EIO;

	if (!get_capacity(disk))
		return -EIO;
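
	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */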
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);
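
	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked using the same zone size.
	 */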
	if (ret > 0 && args.sector != get_capacity(disk)) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}
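
	/*
	 * Install the new bitmaps and update nr_zones only once the queue is
	 * stopped and all I/Os are completed (i.e. a scheduler is not
	 * referencing the bitmaps).
	 */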
	blk_mq_freeze_queue(q);
	if (ret > 0) {
		blk_queue_chunk_sectors(q, args.zone_sectors);
		disk->nr_zones = args.nr_zones;
		swap(disk->seq_zones_wlock, args.seq_zones_wlock);
		swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
		if (update_driver_data)
			update_driver_data(disk);
		ret = 0;
	} else {
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
		disk_free_zone_bitmaps(disk);
	}
	blk_mq_unfreeze_queue(q);

	kfree(args.seq_zones_wlock);
	kfree(args.conv_zones_bitmap);
	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);

void disk_clear_zone_settings(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	blk_mq_freeze_queue(q);

	disk_free_zone_bitmaps(disk);
	blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
	q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
	disk->nr_zones = 0;
	disk->max_open_zones = 0;
	disk->max_active_zones = 0;
	q->limits.chunk_sectors = 0;
	q->limits.zone_write_granularity = 0;
	q->limits.max_zone_append_sectors = 0;

	blk_mq_unfreeze_queue(q);
}