0001
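/*
 * xen-blkfront: Xen virtual block device frontend driver.
 *
 * Presents a virtual disk to the guest via the blk-mq layer and talks to
 * a blkback backend through one or more shared rings, using grant
 * references (and optionally persistent grants) for data transfer.
 */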
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038 #include <linux/interrupt.h>
0039 #include <linux/blkdev.h>
0040 #include <linux/blk-mq.h>
0041 #include <linux/hdreg.h>
0042 #include <linux/cdrom.h>
0043 #include <linux/module.h>
0044 #include <linux/slab.h>
0045 #include <linux/major.h>
0046 #include <linux/mutex.h>
0047 #include <linux/scatterlist.h>
0048 #include <linux/bitmap.h>
0049 #include <linux/list.h>
0050 #include <linux/workqueue.h>
0051 #include <linux/sched/mm.h>
0052
0053 #include <xen/xen.h>
0054 #include <xen/xenbus.h>
0055 #include <xen/grant_table.h>
0056 #include <xen/events.h>
0057 #include <xen/page.h>
0058 #include <xen/platform_pci.h>
0059
0060 #include <xen/interface/grant_table.h>
0061 #include <xen/interface/io/blkif.h>
0062 #include <xen/interface/io/protocols.h>
0063
0064 #include <asm/xen/hypervisor.h>
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
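/*
 * HAS_EXTRA_REQ is true when a single Linux page spans more Xen-page-sized
 * grants than one blkif request can describe; in that case an I/O may be
 * split across a second, "extra" request on the ring.
 */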
0078 #define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
0079
0080 enum blkif_state {
0081 BLKIF_STATE_DISCONNECTED,
0082 BLKIF_STATE_CONNECTED,
0083 BLKIF_STATE_SUSPENDED,
0084 BLKIF_STATE_ERROR,
0085 };
0086
0087 struct grant {
0088 grant_ref_t gref;
0089 struct page *page;
0090 struct list_head node;
0091 };
0092
0093 enum blk_req_status {
0094 REQ_PROCESSING,
0095 REQ_WAITING,
0096 REQ_DONE,
0097 REQ_ERROR,
0098 REQ_EOPNOTSUPP,
0099 };
0100
0101 struct blk_shadow {
0102 struct blkif_request req;
0103 struct request *request;
0104 struct grant **grants_used;
0105 struct grant **indirect_grants;
0106 struct scatterlist *sg;
0107 unsigned int num_sg;
0108 enum blk_req_status status;
0109
0110 #define NO_ASSOCIATED_ID ~0UL
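/*
 * NO_ASSOCIATED_ID means the request was not split. Otherwise,
 * associated_id holds the shadow id of the sibling request created when
 * an I/O needed an extra ring slot (see HAS_EXTRA_REQ).
 */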
0111
0112
0113
0114
0115 unsigned long associated_id;
0116 };
0117
0118 struct blkif_req {
0119 blk_status_t error;
0120 };
0121
0122 static inline struct blkif_req *blkif_req(struct request *rq)
0123 {
0124 return blk_mq_rq_to_pdu(rq);
0125 }
0126
0127 static DEFINE_MUTEX(blkfront_mutex);
0128 static const struct block_device_operations xlvbd_block_fops;
0129 static struct delayed_work blkfront_work;
0130 static LIST_HEAD(info_list);
0131
0132
0133
0134
0135
0136
0137
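/*
 * Maximum number of indirect segments advertised by this frontend; the
 * value actually used is the minimum of this and what the backend reports
 * in "feature-max-indirect-segments".
 */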
0138 static unsigned int xen_blkif_max_segments = 32;
0139 module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
0140 MODULE_PARM_DESC(max_indirect_segments,
0141 "Maximum number of segments in indirect requests (default is 32)");
0142
0143 static unsigned int xen_blkif_max_queues = 4;
0144 module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
0145 MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
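/*
 * Example (illustrative values): the limits above can be raised at load
 * time, e.g. "modprobe xen-blkfront max_queues=8 max_indirect_segments=64",
 * subject to what the backend actually supports.
 */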
0146
0147
0148
0149
0150
0151 static unsigned int xen_blkif_max_ring_order;
0152 module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
0153 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
0154
0155 static bool __read_mostly xen_blkif_trusted = true;
0156 module_param_named(trusted, xen_blkif_trusted, bool, 0644);
0157 MODULE_PARM_DESC(trusted, "Is the backend trusted");
0158
0159 #define BLK_RING_SIZE(info) \
0160 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
0161
0162
0163
0164
0165
0166 #define RINGREF_NAME_LEN (20)
0167
0168
0169
0170 #define QUEUE_NAME_LEN (17)
0171
0172
0173
0174
0175
0176
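/*
 * Per-ring (hardware queue) state: one of these per ring shared with the
 * backend. ring_lock protects the ring itself and the shadow/grant state
 * below.
 */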
0177 struct blkfront_ring_info {
0178
0179 spinlock_t ring_lock;
0180 struct blkif_front_ring ring;
0181 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
0182 unsigned int evtchn, irq;
0183 struct work_struct work;
0184 struct gnttab_free_callback callback;
0185 struct list_head indirect_pages;
0186 struct list_head grants;
0187 unsigned int persistent_gnts_c;
0188 unsigned long shadow_free;
0189 struct blkfront_info *dev_info;
0190 struct blk_shadow shadow[];
0191 };
0192
0193
0194
0195
0196
0197
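/*
 * Per-device state: one of these per vbd, hanging off the gendisk's
 * private_data. It holds the negotiated features and the array of
 * per-ring structures.
 */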
0198 struct blkfront_info
0199 {
0200 struct mutex mutex;
0201 struct xenbus_device *xbdev;
0202 struct gendisk *gd;
0203 u16 sector_size;
0204 unsigned int physical_sector_size;
0205 unsigned long vdisk_info;
0206 int vdevice;
0207 blkif_vdev_t handle;
0208 enum blkif_state connected;
0209
0210 unsigned int nr_ring_pages;
0211 struct request_queue *rq;
0212 unsigned int feature_flush:1;
0213 unsigned int feature_fua:1;
0214 unsigned int feature_discard:1;
0215 unsigned int feature_secdiscard:1;
0216
0217 unsigned int feature_persistent_parm:1;
0218
0219 unsigned int feature_persistent:1;
0220 unsigned int bounce:1;
0221 unsigned int discard_granularity;
0222 unsigned int discard_alignment;
0223
0224 unsigned int max_indirect_segments;
0225 int is_ready;
0226 struct blk_mq_tag_set tag_set;
0227 struct blkfront_ring_info *rinfo;
0228 unsigned int nr_rings;
0229 unsigned int rinfo_size;
0230
0231 struct list_head requests;
0232 struct bio_list bio_list;
0233 struct list_head info_list;
0234 };
0235
0236 static unsigned int nr_minors;
0237 static unsigned long *minors;
0238 static DEFINE_SPINLOCK(minor_lock);
0239
0240 #define PARTS_PER_DISK 16
0241 #define PARTS_PER_EXT_DISK 256
0242
0243 #define BLKIF_MAJOR(dev) ((dev)>>8)
0244 #define BLKIF_MINOR(dev) ((dev) & 0xff)
0245
0246 #define EXT_SHIFT 28
0247 #define EXTENDED (1<<EXT_SHIFT)
0248 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
0249 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
0250 #define EMULATED_HD_DISK_MINOR_OFFSET (0)
0251 #define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
0252 #define EMULATED_SD_DISK_MINOR_OFFSET (0)
0253 #define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
0254
0255 #define DEV_NAME "xvd"
0256
0257
0258
0259
0260
0261
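/*
 * Grant accounting: a Linux page may be backed by several Xen pages, so a
 * single physical segment consumes GRANTS_PER_PSEG grant references, and
 * each indirect descriptor page holds GRANTS_PER_INDIRECT_FRAME entries.
 */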
0262 #define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE)
0263
0264 #define GRANTS_PER_INDIRECT_FRAME \
0265 (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
0266
0267 #define INDIRECT_GREFS(_grants) \
0268 DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
0269
0270 static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
0271 static void blkfront_gather_backend_features(struct blkfront_info *info);
0272 static int negotiate_mq(struct blkfront_info *info);
0273
0274 #define for_each_rinfo(info, ptr, idx) \
0275 for ((ptr) = (info)->rinfo, (idx) = 0; \
0276 (idx) < (info)->nr_rings; \
0277 (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size)
0278
0279 static inline struct blkfront_ring_info *
0280 get_rinfo(const struct blkfront_info *info, unsigned int i)
0281 {
0282 BUG_ON(i >= info->nr_rings);
0283 return (void *)info->rinfo + i * info->rinfo_size;
0284 }
0285
0286 static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
0287 {
0288 unsigned long free = rinfo->shadow_free;
0289
0290 BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
0291 rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
0292 rinfo->shadow[free].req.u.rw.id = 0x0fffffee;
0293 return free;
0294 }
0295
0296 static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
0297 unsigned long id)
0298 {
0299 if (rinfo->shadow[id].req.u.rw.id != id)
0300 return -EINVAL;
0301 if (rinfo->shadow[id].request == NULL)
0302 return -EINVAL;
0303 rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
0304 rinfo->shadow[id].request = NULL;
0305 rinfo->shadow_free = id;
0306 return 0;
0307 }
0308
0309 static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
0310 {
0311 struct blkfront_info *info = rinfo->dev_info;
0312 struct page *granted_page;
0313 struct grant *gnt_list_entry, *n;
0314 int i = 0;
0315
0316 while (i < num) {
0317 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
0318 if (!gnt_list_entry)
0319 goto out_of_memory;
0320
0321 if (info->bounce) {
0322 granted_page = alloc_page(GFP_NOIO | __GFP_ZERO);
0323 if (!granted_page) {
0324 kfree(gnt_list_entry);
0325 goto out_of_memory;
0326 }
0327 gnt_list_entry->page = granted_page;
0328 }
0329
0330 gnt_list_entry->gref = INVALID_GRANT_REF;
0331 list_add(&gnt_list_entry->node, &rinfo->grants);
0332 i++;
0333 }
0334
0335 return 0;
0336
0337 out_of_memory:
0338 list_for_each_entry_safe(gnt_list_entry, n,
0339 &rinfo->grants, node) {
0340 list_del(&gnt_list_entry->node);
0341 if (info->bounce)
0342 __free_page(gnt_list_entry->page);
0343 kfree(gnt_list_entry);
0344 i--;
0345 }
0346 BUG_ON(i != 0);
0347 return -ENOMEM;
0348 }
0349
0350 static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
0351 {
0352 struct grant *gnt_list_entry;
0353
0354 BUG_ON(list_empty(&rinfo->grants));
0355 gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
0356 node);
0357 list_del(&gnt_list_entry->node);
0358
0359 if (gnt_list_entry->gref != INVALID_GRANT_REF)
0360 rinfo->persistent_gnts_c--;
0361
0362 return gnt_list_entry;
0363 }
0364
0365 static inline void grant_foreign_access(const struct grant *gnt_list_entry,
0366 const struct blkfront_info *info)
0367 {
0368 gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
0369 info->xbdev->otherend_id,
0370 gnt_list_entry->page,
0371 0);
0372 }
0373
0374 static struct grant *get_grant(grant_ref_t *gref_head,
0375 unsigned long gfn,
0376 struct blkfront_ring_info *rinfo)
0377 {
0378 struct grant *gnt_list_entry = get_free_grant(rinfo);
0379 struct blkfront_info *info = rinfo->dev_info;
0380
0381 if (gnt_list_entry->gref != INVALID_GRANT_REF)
0382 return gnt_list_entry;
0383
0384
0385 gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
0386 BUG_ON(gnt_list_entry->gref == -ENOSPC);
0387 if (info->bounce)
0388 grant_foreign_access(gnt_list_entry, info);
0389 else {
0390
0391 gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
0392 info->xbdev->otherend_id,
0393 gfn, 0);
0394 }
0395
0396 return gnt_list_entry;
0397 }
0398
0399 static struct grant *get_indirect_grant(grant_ref_t *gref_head,
0400 struct blkfront_ring_info *rinfo)
0401 {
0402 struct grant *gnt_list_entry = get_free_grant(rinfo);
0403 struct blkfront_info *info = rinfo->dev_info;
0404
0405 if (gnt_list_entry->gref != INVALID_GRANT_REF)
0406 return gnt_list_entry;
0407
0408
0409 gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
0410 BUG_ON(gnt_list_entry->gref == -ENOSPC);
0411 if (!info->bounce) {
0412 struct page *indirect_page;
0413
0414
0415 BUG_ON(list_empty(&rinfo->indirect_pages));
0416 indirect_page = list_first_entry(&rinfo->indirect_pages,
0417 struct page, lru);
0418 list_del(&indirect_page->lru);
0419 gnt_list_entry->page = indirect_page;
0420 }
0421 grant_foreign_access(gnt_list_entry, info);
0422
0423 return gnt_list_entry;
0424 }
0425
0426 static const char *op_name(int op)
0427 {
0428 static const char *const names[] = {
0429 [BLKIF_OP_READ] = "read",
0430 [BLKIF_OP_WRITE] = "write",
0431 [BLKIF_OP_WRITE_BARRIER] = "barrier",
0432 [BLKIF_OP_FLUSH_DISKCACHE] = "flush",
0433 [BLKIF_OP_DISCARD] = "discard" };
0434
0435 if (op < 0 || op >= ARRAY_SIZE(names))
0436 return "unknown";
0437
0438 if (!names[op])
0439 return "reserved";
0440
0441 return names[op];
0442 }
0443 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
0444 {
0445 unsigned int end = minor + nr;
0446 int rc;
0447
0448 if (end > nr_minors) {
0449 unsigned long *bitmap, *old;
0450
0451 bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
0452 GFP_KERNEL);
0453 if (bitmap == NULL)
0454 return -ENOMEM;
0455
0456 spin_lock(&minor_lock);
0457 if (end > nr_minors) {
0458 old = minors;
0459 memcpy(bitmap, minors,
0460 BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
0461 minors = bitmap;
0462 nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
0463 } else
0464 old = bitmap;
0465 spin_unlock(&minor_lock);
0466 kfree(old);
0467 }
0468
0469 spin_lock(&minor_lock);
0470 if (find_next_bit(minors, end, minor) >= end) {
0471 bitmap_set(minors, minor, nr);
0472 rc = 0;
0473 } else
0474 rc = -EBUSY;
0475 spin_unlock(&minor_lock);
0476
0477 return rc;
0478 }
0479
0480 static void xlbd_release_minors(unsigned int minor, unsigned int nr)
0481 {
0482 unsigned int end = minor + nr;
0483
0484 BUG_ON(end > nr_minors);
0485 spin_lock(&minor_lock);
0486 bitmap_clear(minors, minor, nr);
0487 spin_unlock(&minor_lock);
0488 }
0489
0490 static void blkif_restart_queue_callback(void *arg)
0491 {
0492 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
0493 schedule_work(&rinfo->work);
0494 }
0495
0496 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
0497 {
0498
0499
0500 sector_t nsect = get_capacity(bd->bd_disk);
0501 sector_t cylinders = nsect;
0502
0503 hg->heads = 0xff;
0504 hg->sectors = 0x3f;
0505 sector_div(cylinders, hg->heads * hg->sectors);
0506 hg->cylinders = cylinders;
0507 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
0508 hg->cylinders = 0xffff;
0509 return 0;
0510 }
0511
0512 static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
0513 unsigned command, unsigned long argument)
0514 {
0515 struct blkfront_info *info = bdev->bd_disk->private_data;
0516 int i;
0517
0518 switch (command) {
0519 case CDROMMULTISESSION:
0520 for (i = 0; i < sizeof(struct cdrom_multisession); i++)
0521 if (put_user(0, (char __user *)(argument + i)))
0522 return -EFAULT;
0523 return 0;
0524 case CDROM_GET_CAPABILITY:
0525 if (!(info->vdisk_info & VDISK_CDROM))
0526 return -EINVAL;
0527 return 0;
0528 default:
0529 return -EINVAL;
0530 }
0531 }
0532
0533 static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
0534 struct request *req,
0535 struct blkif_request **ring_req)
0536 {
0537 unsigned long id;
0538
0539 *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
0540 rinfo->ring.req_prod_pvt++;
0541
0542 id = get_id_from_freelist(rinfo);
0543 rinfo->shadow[id].request = req;
0544 rinfo->shadow[id].status = REQ_PROCESSING;
0545 rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
0546
0547 rinfo->shadow[id].req.u.rw.id = id;
0548
0549 return id;
0550 }
0551
0552 static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
0553 {
0554 struct blkfront_info *info = rinfo->dev_info;
0555 struct blkif_request *ring_req, *final_ring_req;
0556 unsigned long id;
0557
0558
0559 id = blkif_ring_get_request(rinfo, req, &final_ring_req);
0560 ring_req = &rinfo->shadow[id].req;
0561
0562 ring_req->operation = BLKIF_OP_DISCARD;
0563 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
0564 ring_req->u.discard.id = id;
0565 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
0566 if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
0567 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
0568 else
0569 ring_req->u.discard.flag = 0;
0570
0571
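/* Copy the fully built request into the shared ring only now, so the
 * backend never sees a partially constructed request. */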
0572 *final_ring_req = *ring_req;
0573 rinfo->shadow[id].status = REQ_WAITING;
0574
0575 return 0;
0576 }
0577
0578 struct setup_rw_req {
0579 unsigned int grant_idx;
0580 struct blkif_request_segment *segments;
0581 struct blkfront_ring_info *rinfo;
0582 struct blkif_request *ring_req;
0583 grant_ref_t gref_head;
0584 unsigned int id;
0585
0586 bool need_copy;
0587 unsigned int bvec_off;
0588 char *bvec_data;
0589
0590 bool require_extra_req;
0591 struct blkif_request *extra_ring_req;
0592 };
0593
0594 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
0595 unsigned int len, void *data)
0596 {
0597 struct setup_rw_req *setup = data;
0598 int n, ref;
0599 struct grant *gnt_list_entry;
0600 unsigned int fsect, lsect;
0601
0602 unsigned int grant_idx = setup->grant_idx;
0603 struct blkif_request *ring_req = setup->ring_req;
0604 struct blkfront_ring_info *rinfo = setup->rinfo;
0605
0606
0607
0608
0609
0610
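/*
 * Grants are always recorded in the shadow of the first request, even if
 * the I/O is split across an extra request; this keeps completion handling
 * simple.
 */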
0611 struct blk_shadow *shadow = &rinfo->shadow[setup->id];
0612
0613 if (unlikely(setup->require_extra_req &&
0614 grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
0615
0616
0617
0618
0619 grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
0620 ring_req = setup->extra_ring_req;
0621 }
0622
0623 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
0624 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
0625 if (setup->segments)
0626 kunmap_atomic(setup->segments);
0627
0628 n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
0629 gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
0630 shadow->indirect_grants[n] = gnt_list_entry;
0631 setup->segments = kmap_atomic(gnt_list_entry->page);
0632 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
0633 }
0634
0635 gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
0636 ref = gnt_list_entry->gref;
0637
0638
0639
0640
0641 shadow->grants_used[setup->grant_idx] = gnt_list_entry;
0642
0643 if (setup->need_copy) {
0644 void *shared_data;
0645
0646 shared_data = kmap_atomic(gnt_list_entry->page);
0647
0648
0649
0650
0651
0652
0653
0654
0655
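/* Bounce-buffered write: copy the bio data into the page that is actually
 * granted to the backend. */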
0656 memcpy(shared_data + offset,
0657 setup->bvec_data + setup->bvec_off,
0658 len);
0659
0660 kunmap_atomic(shared_data);
0661 setup->bvec_off += len;
0662 }
0663
0664 fsect = offset >> 9;
0665 lsect = fsect + (len >> 9) - 1;
0666 if (ring_req->operation != BLKIF_OP_INDIRECT) {
0667 ring_req->u.rw.seg[grant_idx] =
0668 (struct blkif_request_segment) {
0669 .gref = ref,
0670 .first_sect = fsect,
0671 .last_sect = lsect };
0672 } else {
0673 setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
0674 (struct blkif_request_segment) {
0675 .gref = ref,
0676 .first_sect = fsect,
0677 .last_sect = lsect };
0678 }
0679
0680 (setup->grant_idx)++;
0681 }
0682
0683 static void blkif_setup_extra_req(struct blkif_request *first,
0684 struct blkif_request *second)
0685 {
0686 uint16_t nr_segments = first->u.rw.nr_segments;
0687
0688
0689
0690
0691
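/* The extra request is a plain continuation of the first: the first takes
 * a full BLKIF_MAX_SEGMENTS_PER_REQUEST segments, the second takes the
 * remainder starting at the following sector. */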
0692 first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
0693
0694 second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
0695 second->u.rw.sector_number = first->u.rw.sector_number +
0696 (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
0697
0698 second->u.rw.handle = first->u.rw.handle;
0699 second->operation = first->operation;
0700 }
0701
0702 static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
0703 {
0704 struct blkfront_info *info = rinfo->dev_info;
0705 struct blkif_request *ring_req, *extra_ring_req = NULL;
0706 struct blkif_request *final_ring_req, *final_extra_ring_req = NULL;
0707 unsigned long id, extra_id = NO_ASSOCIATED_ID;
0708 bool require_extra_req = false;
0709 int i;
0710 struct setup_rw_req setup = {
0711 .grant_idx = 0,
0712 .segments = NULL,
0713 .rinfo = rinfo,
0714 .need_copy = rq_data_dir(req) && info->bounce,
0715 };
0716
0717
0718
0719
0720
0721
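/*
 * Track whether new grant references had to be claimed because the pool of
 * persistent grants was not large enough for this request.
 */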
0722 bool new_persistent_gnts = false;
0723 struct scatterlist *sg;
0724 int num_sg, max_grefs, num_grant;
0725
0726 max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
0727 if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
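/* Indirect requests also consume grants for the indirect descriptor
 * pages themselves. */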
0728
0729
0730
0731
0732 max_grefs += INDIRECT_GREFS(max_grefs);
0733
0734
0735 if (rinfo->persistent_gnts_c < max_grefs) {
0736 new_persistent_gnts = true;
0737
0738 if (gnttab_alloc_grant_references(
0739 max_grefs - rinfo->persistent_gnts_c,
0740 &setup.gref_head) < 0) {
0741 gnttab_request_free_callback(
0742 &rinfo->callback,
0743 blkif_restart_queue_callback,
0744 rinfo,
0745 max_grefs - rinfo->persistent_gnts_c);
0746 return 1;
0747 }
0748 }
0749
0750
0751 id = blkif_ring_get_request(rinfo, req, &final_ring_req);
0752 ring_req = &rinfo->shadow[id].req;
0753
0754 num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
0755 num_grant = 0;
0756
0757 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
0758 num_grant += gnttab_count_grant(sg->offset, sg->length);
0759
0760 require_extra_req = info->max_indirect_segments == 0 &&
0761 num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
0762 BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
0763
0764 rinfo->shadow[id].num_sg = num_sg;
0765 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
0766 likely(!require_extra_req)) {
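/* Indirect descriptors can only carry plain reads and writes, never a
 * flush or FUA request. */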
0767
0768
0769
0770
0771 BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
0772 ring_req->operation = BLKIF_OP_INDIRECT;
0773 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
0774 BLKIF_OP_WRITE : BLKIF_OP_READ;
0775 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
0776 ring_req->u.indirect.handle = info->handle;
0777 ring_req->u.indirect.nr_segments = num_grant;
0778 } else {
0779 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
0780 ring_req->u.rw.handle = info->handle;
0781 ring_req->operation = rq_data_dir(req) ?
0782 BLKIF_OP_WRITE : BLKIF_OP_READ;
0783 if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) {
0784
0785
0786
0787
0788
0789
0790
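/*
 * Map the flush/FUA request onto what the backend offers: a write
 * barrier implies flush + FUA semantics; otherwise fall back to a plain
 * cache flush if available.
 */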
0791 if (info->feature_flush && info->feature_fua)
0792 ring_req->operation =
0793 BLKIF_OP_WRITE_BARRIER;
0794 else if (info->feature_flush)
0795 ring_req->operation =
0796 BLKIF_OP_FLUSH_DISKCACHE;
0797 else
0798 ring_req->operation = 0;
0799 }
0800 ring_req->u.rw.nr_segments = num_grant;
0801 if (unlikely(require_extra_req)) {
0802 extra_id = blkif_ring_get_request(rinfo, req,
0803 &final_extra_ring_req);
0804 extra_ring_req = &rinfo->shadow[extra_id].req;
0805
0806
0807
0808
0809
0810 rinfo->shadow[extra_id].num_sg = 0;
0811
0812 blkif_setup_extra_req(ring_req, extra_ring_req);
0813
0814
0815 rinfo->shadow[extra_id].associated_id = id;
0816 rinfo->shadow[id].associated_id = extra_id;
0817 }
0818 }
0819
0820 setup.ring_req = ring_req;
0821 setup.id = id;
0822
0823 setup.require_extra_req = require_extra_req;
0824 if (unlikely(require_extra_req))
0825 setup.extra_ring_req = extra_ring_req;
0826
0827 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
0828 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
0829
0830 if (setup.need_copy) {
0831 setup.bvec_off = sg->offset;
0832 setup.bvec_data = kmap_atomic(sg_page(sg));
0833 }
0834
0835 gnttab_foreach_grant_in_range(sg_page(sg),
0836 sg->offset,
0837 sg->length,
0838 blkif_setup_rw_req_grant,
0839 &setup);
0840
0841 if (setup.need_copy)
0842 kunmap_atomic(setup.bvec_data);
0843 }
0844 if (setup.segments)
0845 kunmap_atomic(setup.segments);
0846
0847
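/* Only now copy the constructed request(s) into the shared ring and mark
 * the shadow entries as waiting. */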
0848 *final_ring_req = *ring_req;
0849 rinfo->shadow[id].status = REQ_WAITING;
0850 if (unlikely(require_extra_req)) {
0851 *final_extra_ring_req = *extra_ring_req;
0852 rinfo->shadow[extra_id].status = REQ_WAITING;
0853 }
0854
0855 if (new_persistent_gnts)
0856 gnttab_free_grant_references(setup.gref_head);
0857
0858 return 0;
0859 }
0860
0861
0862
0863
0864
0865
0866
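/*
 * Queue a blkif request for @req on the given ring. Returns 0 on success,
 * or 1 if the ring is not connected or resources are exhausted (the caller
 * stops the hardware queue and retries later).
 */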
0867 static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
0868 {
0869 if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
0870 return 1;
0871
0872 if (unlikely(req_op(req) == REQ_OP_DISCARD ||
0873 req_op(req) == REQ_OP_SECURE_ERASE))
0874 return blkif_queue_discard_req(req, rinfo);
0875 else
0876 return blkif_queue_rw_req(req, rinfo);
0877 }
0878
0879 static inline void flush_requests(struct blkfront_ring_info *rinfo)
0880 {
0881 int notify;
0882
0883 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
0884
0885 if (notify)
0886 notify_remote_via_irq(rinfo->irq);
0887 }
0888
0889 static inline bool blkif_request_flush_invalid(struct request *req,
0890 struct blkfront_info *info)
0891 {
0892 return (blk_rq_is_passthrough(req) ||
0893 ((req_op(req) == REQ_OP_FLUSH) &&
0894 !info->feature_flush) ||
0895 ((req->cmd_flags & REQ_FUA) &&
0896 !info->feature_fua));
0897 }
0898
0899 static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
0900 const struct blk_mq_queue_data *qd)
0901 {
0902 unsigned long flags;
0903 int qid = hctx->queue_num;
0904 struct blkfront_info *info = hctx->queue->queuedata;
0905 struct blkfront_ring_info *rinfo = NULL;
0906
0907 rinfo = get_rinfo(info, qid);
0908 blk_mq_start_request(qd->rq);
0909 spin_lock_irqsave(&rinfo->ring_lock, flags);
0910 if (RING_FULL(&rinfo->ring))
0911 goto out_busy;
0912
0913 if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
0914 goto out_err;
0915
0916 if (blkif_queue_request(qd->rq, rinfo))
0917 goto out_busy;
0918
0919 flush_requests(rinfo);
0920 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
0921 return BLK_STS_OK;
0922
0923 out_err:
0924 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
0925 return BLK_STS_IOERR;
0926
0927 out_busy:
0928 blk_mq_stop_hw_queue(hctx);
0929 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
0930 return BLK_STS_DEV_RESOURCE;
0931 }
0932
0933 static void blkif_complete_rq(struct request *rq)
0934 {
0935 blk_mq_end_request(rq, blkif_req(rq)->error);
0936 }
0937
0938 static const struct blk_mq_ops blkfront_mq_ops = {
0939 .queue_rq = blkif_queue_rq,
0940 .complete = blkif_complete_rq,
0941 };
0942
0943 static void blkif_set_queue_limits(struct blkfront_info *info)
0944 {
0945 struct request_queue *rq = info->rq;
0946 struct gendisk *gd = info->gd;
0947 unsigned int segments = info->max_indirect_segments ? :
0948 BLKIF_MAX_SEGMENTS_PER_REQUEST;
0949
0950 blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
0951
0952 if (info->feature_discard) {
0953 blk_queue_max_discard_sectors(rq, get_capacity(gd));
0954 rq->limits.discard_granularity = info->discard_granularity ?:
0955 info->physical_sector_size;
0956 rq->limits.discard_alignment = info->discard_alignment;
0957 if (info->feature_secdiscard)
0958 blk_queue_max_secure_erase_sectors(rq,
0959 get_capacity(gd));
0960 }
0961
0962
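/* Mirror the backend's advertised sector sizes and cap each request to what
 * one (possibly indirect) blkif request can carry. */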
0963 blk_queue_logical_block_size(rq, info->sector_size);
0964 blk_queue_physical_block_size(rq, info->physical_sector_size);
0965 blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
0966
0967
0968 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
0969 blk_queue_max_segment_size(rq, PAGE_SIZE);
0970
0971
0972 blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
0973
0974
0975 blk_queue_dma_alignment(rq, 511);
0976 }
0977
0978 static const char *flush_info(struct blkfront_info *info)
0979 {
0980 if (info->feature_flush && info->feature_fua)
0981 return "barrier: enabled;";
0982 else if (info->feature_flush)
0983 return "flush diskcache: enabled;";
0984 else
0985 return "barrier or flush: disabled;";
0986 }
0987
0988 static void xlvbd_flush(struct blkfront_info *info)
0989 {
0990 blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
0991 info->feature_fua ? true : false);
0992 pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
0993 info->gd->disk_name, flush_info(info),
0994 "persistent grants:", info->feature_persistent ?
0995 "enabled;" : "disabled;", "indirect descriptors:",
0996 info->max_indirect_segments ? "enabled;" : "disabled;",
0997 "bounce buffer:", info->bounce ? "enabled" : "disabled");
0998 }
0999
1000 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
1001 {
1002 int major;
1003 major = BLKIF_MAJOR(vdevice);
1004 *minor = BLKIF_MINOR(vdevice);
1005 switch (major) {
1006 case XEN_IDE0_MAJOR:
1007 *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
1008 *minor = ((*minor / 64) * PARTS_PER_DISK) +
1009 EMULATED_HD_DISK_MINOR_OFFSET;
1010 break;
1011 case XEN_IDE1_MAJOR:
1012 *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
1013 *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
1014 EMULATED_HD_DISK_MINOR_OFFSET;
1015 break;
1016 case XEN_SCSI_DISK0_MAJOR:
1017 *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
1018 *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
1019 break;
1020 case XEN_SCSI_DISK1_MAJOR:
1021 case XEN_SCSI_DISK2_MAJOR:
1022 case XEN_SCSI_DISK3_MAJOR:
1023 case XEN_SCSI_DISK4_MAJOR:
1024 case XEN_SCSI_DISK5_MAJOR:
1025 case XEN_SCSI_DISK6_MAJOR:
1026 case XEN_SCSI_DISK7_MAJOR:
1027 *offset = (*minor / PARTS_PER_DISK) +
1028 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
1029 EMULATED_SD_DISK_NAME_OFFSET;
1030 *minor = *minor +
1031 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
1032 EMULATED_SD_DISK_MINOR_OFFSET;
1033 break;
1034 case XEN_SCSI_DISK8_MAJOR:
1035 case XEN_SCSI_DISK9_MAJOR:
1036 case XEN_SCSI_DISK10_MAJOR:
1037 case XEN_SCSI_DISK11_MAJOR:
1038 case XEN_SCSI_DISK12_MAJOR:
1039 case XEN_SCSI_DISK13_MAJOR:
1040 case XEN_SCSI_DISK14_MAJOR:
1041 case XEN_SCSI_DISK15_MAJOR:
1042 *offset = (*minor / PARTS_PER_DISK) +
1043 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
1044 EMULATED_SD_DISK_NAME_OFFSET;
1045 *minor = *minor +
1046 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
1047 EMULATED_SD_DISK_MINOR_OFFSET;
1048 break;
1049 case XENVBD_MAJOR:
1050 *offset = *minor / PARTS_PER_DISK;
1051 break;
1052 default:
1053 printk(KERN_WARNING "blkfront: your disk configuration is "
1054 "incorrect, please use an xvd device instead\n");
1055 return -ENODEV;
1056 }
1057 return 0;
1058 }
1059
1060 static char *encode_disk_name(char *ptr, unsigned int n)
1061 {
1062 if (n >= 26)
1063 ptr = encode_disk_name(ptr, n / 26 - 1);
1064 *ptr = 'a' + n % 26;
1065 return ptr + 1;
1066 }
1067
1068 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1069 struct blkfront_info *info, u16 sector_size,
1070 unsigned int physical_sector_size)
1071 {
1072 struct gendisk *gd;
1073 int nr_minors = 1;
1074 int err;
1075 unsigned int offset;
1076 int minor;
1077 int nr_parts;
1078 char *ptr;
1079
1080 BUG_ON(info->gd != NULL);
1081 BUG_ON(info->rq != NULL);
1082
1083 if ((info->vdevice>>EXT_SHIFT) > 1) {
1084
1085 printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
1086 return -ENODEV;
1087 }
1088
1089 if (!VDEV_IS_EXTENDED(info->vdevice)) {
1090 err = xen_translate_vdev(info->vdevice, &minor, &offset);
1091 if (err)
1092 return err;
1093 nr_parts = PARTS_PER_DISK;
1094 } else {
1095 minor = BLKIF_MINOR_EXT(info->vdevice);
1096 nr_parts = PARTS_PER_EXT_DISK;
1097 offset = minor / nr_parts;
1098 if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
1099 printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
1100 "emulated IDE disks,\n\t choose an xvd device name "
1101 "from xvde on\n", info->vdevice);
1102 }
1103 if (minor >> MINORBITS) {
1104 pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
1105 info->vdevice, minor);
1106 return -ENODEV;
1107 }
1108
1109 if ((minor % nr_parts) == 0)
1110 nr_minors = nr_parts;
1111
1112 err = xlbd_reserve_minors(minor, nr_minors);
1113 if (err)
1114 return err;
1115
1116 memset(&info->tag_set, 0, sizeof(info->tag_set));
1117 info->tag_set.ops = &blkfront_mq_ops;
1118 info->tag_set.nr_hw_queues = info->nr_rings;
1119 if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
1120
1121
1122
1123
1124
1125
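/*
 * Without indirect descriptors a large I/O may need two ring slots (see
 * HAS_EXTRA_REQ), so halve the queue depth to guarantee every dispatched
 * request fits on the ring.
 */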
1126 info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
1127 } else
1128 info->tag_set.queue_depth = BLK_RING_SIZE(info);
1129 info->tag_set.numa_node = NUMA_NO_NODE;
1130 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1131 info->tag_set.cmd_size = sizeof(struct blkif_req);
1132 info->tag_set.driver_data = info;
1133
1134 err = blk_mq_alloc_tag_set(&info->tag_set);
1135 if (err)
1136 goto out_release_minors;
1137
1138 gd = blk_mq_alloc_disk(&info->tag_set, info);
1139 if (IS_ERR(gd)) {
1140 err = PTR_ERR(gd);
1141 goto out_free_tag_set;
1142 }
1143
1144 strcpy(gd->disk_name, DEV_NAME);
1145 ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
1146 BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
1147 if (nr_minors > 1)
1148 *ptr = 0;
1149 else
1150 snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
1151 "%d", minor & (nr_parts - 1));
1152
1153 gd->major = XENVBD_MAJOR;
1154 gd->first_minor = minor;
1155 gd->minors = nr_minors;
1156 gd->fops = &xlvbd_block_fops;
1157 gd->private_data = info;
1158 set_capacity(gd, capacity);
1159
1160 info->rq = gd->queue;
1161 info->gd = gd;
1162 info->sector_size = sector_size;
1163 info->physical_sector_size = physical_sector_size;
1164 blkif_set_queue_limits(info);
1165
1166 xlvbd_flush(info);
1167
1168 if (info->vdisk_info & VDISK_READONLY)
1169 set_disk_ro(gd, 1);
1170 if (info->vdisk_info & VDISK_REMOVABLE)
1171 gd->flags |= GENHD_FL_REMOVABLE;
1172
1173 return 0;
1174
1175 out_free_tag_set:
1176 blk_mq_free_tag_set(&info->tag_set);
1177 out_release_minors:
1178 xlbd_release_minors(minor, nr_minors);
1179 return err;
1180 }
1181
1182
1183 static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1184 {
1185 if (!RING_FULL(&rinfo->ring))
1186 blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
1187 }
1188
1189 static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1190 {
1191 unsigned long flags;
1192
1193 spin_lock_irqsave(&rinfo->ring_lock, flags);
1194 kick_pending_request_queues_locked(rinfo);
1195 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1196 }
1197
1198 static void blkif_restart_queue(struct work_struct *work)
1199 {
1200 struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
1201
1202 if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1203 kick_pending_request_queues(rinfo);
1204 }
1205
1206 static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1207 {
1208 struct grant *persistent_gnt, *n;
1209 struct blkfront_info *info = rinfo->dev_info;
1210 int i, j, segs;
1211
1212
1213
1214
1215
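/* Free the preallocated indirect descriptor pages; these only exist when
 * indirect descriptors are used without bounce pages. */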
1216 if (!list_empty(&rinfo->indirect_pages)) {
1217 struct page *indirect_page, *n;
1218
1219 BUG_ON(info->bounce);
1220 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1221 list_del(&indirect_page->lru);
1222 __free_page(indirect_page);
1223 }
1224 }
1225
1226
1227 if (!list_empty(&rinfo->grants)) {
1228 list_for_each_entry_safe(persistent_gnt, n,
1229 &rinfo->grants, node) {
1230 list_del(&persistent_gnt->node);
1231 if (persistent_gnt->gref != INVALID_GRANT_REF) {
1232 gnttab_end_foreign_access(persistent_gnt->gref,
1233 NULL);
1234 rinfo->persistent_gnts_c--;
1235 }
1236 if (info->bounce)
1237 __free_page(persistent_gnt->page);
1238 kfree(persistent_gnt);
1239 }
1240 }
1241 BUG_ON(rinfo->persistent_gnts_c != 0);
1242
1243 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1244
1245
1246
1247
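/* Release grants still held by shadow entries of in-flight requests. */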
1248 if (!rinfo->shadow[i].request)
1249 goto free_shadow;
1250
1251 segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1252 rinfo->shadow[i].req.u.indirect.nr_segments :
1253 rinfo->shadow[i].req.u.rw.nr_segments;
1254 for (j = 0; j < segs; j++) {
1255 persistent_gnt = rinfo->shadow[i].grants_used[j];
1256 gnttab_end_foreign_access(persistent_gnt->gref, NULL);
1257 if (info->bounce)
1258 __free_page(persistent_gnt->page);
1259 kfree(persistent_gnt);
1260 }
1261
1262 if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1263
1264
1265
1266
1267 goto free_shadow;
1268
1269 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
1270 persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1271 gnttab_end_foreign_access(persistent_gnt->gref, NULL);
1272 __free_page(persistent_gnt->page);
1273 kfree(persistent_gnt);
1274 }
1275
1276 free_shadow:
1277 kvfree(rinfo->shadow[i].grants_used);
1278 rinfo->shadow[i].grants_used = NULL;
1279 kvfree(rinfo->shadow[i].indirect_grants);
1280 rinfo->shadow[i].indirect_grants = NULL;
1281 kvfree(rinfo->shadow[i].sg);
1282 rinfo->shadow[i].sg = NULL;
1283 }
1284
1285
1286 gnttab_cancel_free_callback(&rinfo->callback);
1287
1288
1289 flush_work(&rinfo->work);
1290
1291
1292 xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages,
1293 rinfo->ring_ref);
1294
1295 if (rinfo->irq)
1296 unbind_from_irqhandler(rinfo->irq, rinfo);
1297 rinfo->evtchn = rinfo->irq = 0;
1298 }
1299
1300 static void blkif_free(struct blkfront_info *info, int suspend)
1301 {
1302 unsigned int i;
1303 struct blkfront_ring_info *rinfo;
1304
1305
1306 info->connected = suspend ?
1307 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1308
1309 if (info->rq)
1310 blk_mq_stop_hw_queues(info->rq);
1311
1312 for_each_rinfo(info, rinfo, i)
1313 blkif_free_ring(rinfo);
1314
1315 kvfree(info->rinfo);
1316 info->rinfo = NULL;
1317 info->nr_rings = 0;
1318 }
1319
1320 struct copy_from_grant {
1321 const struct blk_shadow *s;
1322 unsigned int grant_idx;
1323 unsigned int bvec_offset;
1324 char *bvec_data;
1325 };
1326
1327 static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1328 unsigned int len, void *data)
1329 {
1330 struct copy_from_grant *info = data;
1331 char *shared_data;
1332
1333 const struct blk_shadow *s = info->s;
1334
1335 shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);
1336
1337 memcpy(info->bvec_data + info->bvec_offset,
1338 shared_data + offset, len);
1339
1340 info->bvec_offset += len;
1341 info->grant_idx++;
1342
1343 kunmap_atomic(shared_data);
1344 }
1345
1346 static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1347 {
1348 switch (rsp)
1349 {
1350 case BLKIF_RSP_OKAY:
1351 return REQ_DONE;
1352 case BLKIF_RSP_EOPNOTSUPP:
1353 return REQ_EOPNOTSUPP;
1354 case BLKIF_RSP_ERROR:
1355 default:
1356 return REQ_ERROR;
1357 }
1358 }
1359
1360
1361
1362
1363 static int blkif_get_final_status(enum blk_req_status s1,
1364 enum blk_req_status s2)
1365 {
1366 BUG_ON(s1 < REQ_DONE);
1367 BUG_ON(s2 < REQ_DONE);
1368
1369 if (s1 == REQ_ERROR || s2 == REQ_ERROR)
1370 return BLKIF_RSP_ERROR;
1371 else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
1372 return BLKIF_RSP_EOPNOTSUPP;
1373 return BLKIF_RSP_OKAY;
1374 }
1375
1376
1377
1378
1379
1380
1381
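/*
 * Return values:
 *  1: the response has been fully processed.
 *  0: waiting for the other half of a split request.
 * -1: error, the device must be considered unusable.
 */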
1382 static int blkif_completion(unsigned long *id,
1383 struct blkfront_ring_info *rinfo,
1384 struct blkif_response *bret)
1385 {
1386 int i = 0;
1387 struct scatterlist *sg;
1388 int num_sg, num_grant;
1389 struct blkfront_info *info = rinfo->dev_info;
1390 struct blk_shadow *s = &rinfo->shadow[*id];
1391 struct copy_from_grant data = {
1392 .grant_idx = 0,
1393 };
1394
1395 num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1396 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1397
1398
1399 if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1400 struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1401
1402
1403 s->status = blkif_rsp_to_req_status(bret->status);
1404
1405
1406 if (s2->status < REQ_DONE)
1407 return 0;
1408
1409 bret->status = blkif_get_final_status(s->status,
1410 s2->status);
1411
1412
1413
1414
1415
1416 num_grant += s2->req.u.rw.nr_segments;
1417
1418
1419
1420
1421
1422 if (s2->num_sg != 0) {
1423
1424 *id = s->associated_id;
1425 s = s2;
1426 }
1427
1428
1429
1430
1431
1432 if (add_id_to_freelist(rinfo, s->associated_id))
1433 WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
1434 info->gd->disk_name, s->associated_id);
1435 }
1436
1437 data.s = s;
1438 num_sg = s->num_sg;
1439
1440 if (bret->operation == BLKIF_OP_READ && info->bounce) {
1441 for_each_sg(s->sg, sg, num_sg, i) {
1442 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
1443
1444 data.bvec_offset = sg->offset;
1445 data.bvec_data = kmap_atomic(sg_page(sg));
1446
1447 gnttab_foreach_grant_in_range(sg_page(sg),
1448 sg->offset,
1449 sg->length,
1450 blkif_copy_from_grant,
1451 &data);
1452
1453 kunmap_atomic(data.bvec_data);
1454 }
1455 }
1456
1457 for (i = 0; i < num_grant; i++) {
1458 if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) {
1459
1460
1461
1462
1463
1464
1465 if (!info->feature_persistent) {
1466 pr_alert("backend has not unmapped grant: %u\n",
1467 s->grants_used[i]->gref);
1468 return -1;
1469 }
1470 list_add(&s->grants_used[i]->node, &rinfo->grants);
1471 rinfo->persistent_gnts_c++;
1472 } else {
1473
1474
1475
1476
1477
1478 s->grants_used[i]->gref = INVALID_GRANT_REF;
1479 list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
1480 }
1481 }
1482 if (s->req.operation == BLKIF_OP_INDIRECT) {
1483 for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
1484 if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) {
1485 if (!info->feature_persistent) {
1486 pr_alert("backend has not unmapped grant: %u\n",
1487 s->indirect_grants[i]->gref);
1488 return -1;
1489 }
1490 list_add(&s->indirect_grants[i]->node, &rinfo->grants);
1491 rinfo->persistent_gnts_c++;
1492 } else {
1493 struct page *indirect_page;
1494
1495
1496
1497
1498
1499 if (!info->bounce) {
1500 indirect_page = s->indirect_grants[i]->page;
1501 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1502 }
1503 s->indirect_grants[i]->gref = INVALID_GRANT_REF;
1504 list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
1505 }
1506 }
1507 }
1508
1509 return 1;
1510 }
1511
1512 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1513 {
1514 struct request *req;
1515 struct blkif_response bret;
1516 RING_IDX i, rp;
1517 unsigned long flags;
1518 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
1519 struct blkfront_info *info = rinfo->dev_info;
1520 unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS;
1521
1522 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1523 xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS);
1524 return IRQ_HANDLED;
1525 }
1526
1527 spin_lock_irqsave(&rinfo->ring_lock, flags);
1528 again:
1529 rp = READ_ONCE(rinfo->ring.sring->rsp_prod);
1530 virt_rmb();
1531 if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) {
1532 pr_alert("%s: illegal number of responses %u\n",
1533 info->gd->disk_name, rp - rinfo->ring.rsp_cons);
1534 goto err;
1535 }
1536
1537 for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1538 unsigned long id;
1539 unsigned int op;
1540
1541 eoiflag = 0;
1542
1543 RING_COPY_RESPONSE(&rinfo->ring, i, &bret);
1544 id = bret.id;
1545
1546
1547
1548
1549
1550
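/* The backend is only ever handed ids below BLK_RING_SIZE; anything else
 * means the response cannot be trusted. */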
1551 if (id >= BLK_RING_SIZE(info)) {
1552 pr_alert("%s: response has incorrect id (%ld)\n",
1553 info->gd->disk_name, id);
1554 goto err;
1555 }
1556 if (rinfo->shadow[id].status != REQ_WAITING) {
1557 pr_alert("%s: response references no pending request\n",
1558 info->gd->disk_name);
1559 goto err;
1560 }
1561
1562 rinfo->shadow[id].status = REQ_PROCESSING;
1563 req = rinfo->shadow[id].request;
1564
1565 op = rinfo->shadow[id].req.operation;
1566 if (op == BLKIF_OP_INDIRECT)
1567 op = rinfo->shadow[id].req.u.indirect.indirect_op;
1568 if (bret.operation != op) {
1569 pr_alert("%s: response has wrong operation (%u instead of %u)\n",
1570 info->gd->disk_name, bret.operation, op);
1571 goto err;
1572 }
1573
1574 if (bret.operation != BLKIF_OP_DISCARD) {
1575 int ret;
1576
1577
1578
1579
1580
1581 ret = blkif_completion(&id, rinfo, &bret);
1582 if (!ret)
1583 continue;
1584 if (unlikely(ret < 0))
1585 goto err;
1586 }
1587
1588 if (add_id_to_freelist(rinfo, id)) {
1589 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1590 info->gd->disk_name, op_name(bret.operation), id);
1591 continue;
1592 }
1593
1594 if (bret.status == BLKIF_RSP_OKAY)
1595 blkif_req(req)->error = BLK_STS_OK;
1596 else
1597 blkif_req(req)->error = BLK_STS_IOERR;
1598
1599 switch (bret.operation) {
1600 case BLKIF_OP_DISCARD:
1601 if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1602 struct request_queue *rq = info->rq;
1603
1604 pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1605 info->gd->disk_name, op_name(bret.operation));
1606 blkif_req(req)->error = BLK_STS_NOTSUPP;
1607 info->feature_discard = 0;
1608 info->feature_secdiscard = 0;
1609 blk_queue_max_discard_sectors(rq, 0);
1610 blk_queue_max_secure_erase_sectors(rq, 0);
1611 }
1612 break;
1613 case BLKIF_OP_FLUSH_DISKCACHE:
1614 case BLKIF_OP_WRITE_BARRIER:
1615 if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1616 pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1617 info->gd->disk_name, op_name(bret.operation));
1618 blkif_req(req)->error = BLK_STS_NOTSUPP;
1619 }
1620 if (unlikely(bret.status == BLKIF_RSP_ERROR &&
1621 rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
1622 pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
1623 info->gd->disk_name, op_name(bret.operation));
1624 blkif_req(req)->error = BLK_STS_NOTSUPP;
1625 }
1626 if (unlikely(blkif_req(req)->error)) {
1627 if (blkif_req(req)->error == BLK_STS_NOTSUPP)
1628 blkif_req(req)->error = BLK_STS_OK;
1629 info->feature_fua = 0;
1630 info->feature_flush = 0;
1631 xlvbd_flush(info);
1632 }
1633 fallthrough;
1634 case BLKIF_OP_READ:
1635 case BLKIF_OP_WRITE:
1636 if (unlikely(bret.status != BLKIF_RSP_OKAY))
1637 dev_dbg_ratelimited(&info->xbdev->dev,
1638 "Bad return from blkdev data request: %#x\n",
1639 bret.status);
1640
1641 break;
1642 default:
1643 BUG();
1644 }
1645
1646 if (likely(!blk_should_fake_timeout(req->q)))
1647 blk_mq_complete_request(req);
1648 }
1649
1650 rinfo->ring.rsp_cons = i;
1651
1652 if (i != rinfo->ring.req_prod_pvt) {
1653 int more_to_do;
1654 RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1655 if (more_to_do)
1656 goto again;
1657 } else
1658 rinfo->ring.sring->rsp_event = i + 1;
1659
1660 kick_pending_request_queues_locked(rinfo);
1661
1662 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1663
1664 xen_irq_lateeoi(irq, eoiflag);
1665
1666 return IRQ_HANDLED;
1667
1668 err:
1669 info->connected = BLKIF_STATE_ERROR;
1670
1671 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1672
1673
1674
1675 pr_alert("%s disabled for further use\n", info->gd->disk_name);
1676 return IRQ_HANDLED;
1677 }
1678
1679
1680 static int setup_blkring(struct xenbus_device *dev,
1681 struct blkfront_ring_info *rinfo)
1682 {
1683 struct blkif_sring *sring;
1684 int err;
1685 struct blkfront_info *info = rinfo->dev_info;
1686 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1687
1688 err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring,
1689 info->nr_ring_pages, rinfo->ring_ref);
1690 if (err)
1691 goto fail;
1692
1693 XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1694
1695 err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1696 if (err)
1697 goto fail;
1698
1699 err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt,
1700 0, "blkif", rinfo);
1701 if (err <= 0) {
1702 xenbus_dev_fatal(dev, err,
1703 "bind_evtchn_to_irqhandler failed");
1704 goto fail;
1705 }
1706 rinfo->irq = err;
1707
1708 return 0;
1709 fail:
1710 blkif_free(info, 0);
1711 return err;
1712 }
1713
1714
1715
1716
1717
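/* Write the ring-ref(s) and event-channel xenstore nodes for one ring under
 * the directory @dir, aborting the transaction on failure. */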
1718 static int write_per_ring_nodes(struct xenbus_transaction xbt,
1719 struct blkfront_ring_info *rinfo, const char *dir)
1720 {
1721 int err;
1722 unsigned int i;
1723 const char *message = NULL;
1724 struct blkfront_info *info = rinfo->dev_info;
1725
1726 if (info->nr_ring_pages == 1) {
1727 err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
1728 if (err) {
1729 message = "writing ring-ref";
1730 goto abort_transaction;
1731 }
1732 } else {
1733 for (i = 0; i < info->nr_ring_pages; i++) {
1734 char ring_ref_name[RINGREF_NAME_LEN];
1735
1736 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1737 err = xenbus_printf(xbt, dir, ring_ref_name,
1738 "%u", rinfo->ring_ref[i]);
1739 if (err) {
1740 message = "writing ring-ref";
1741 goto abort_transaction;
1742 }
1743 }
1744 }
1745
1746 err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
1747 if (err) {
1748 message = "writing event-channel";
1749 goto abort_transaction;
1750 }
1751
1752 return 0;
1753
1754 abort_transaction:
1755 xenbus_transaction_end(xbt, 1);
1756 if (message)
1757 xenbus_dev_fatal(info->xbdev, err, "%s", message);
1758
1759 return err;
1760 }
1761
1762
1763 static bool feature_persistent = true;
1764 module_param(feature_persistent, bool, 0644);
1765 MODULE_PARM_DESC(feature_persistent,
1766 "Enables the persistent grants feature");
1767
1768
1769 static int talk_to_blkback(struct xenbus_device *dev,
1770 struct blkfront_info *info)
1771 {
1772 const char *message = NULL;
1773 struct xenbus_transaction xbt;
1774 int err;
1775 unsigned int i, max_page_order;
1776 unsigned int ring_page_order;
1777 struct blkfront_ring_info *rinfo;
1778
1779 if (!info)
1780 return -ENODEV;
1781
1782
1783 info->bounce = !xen_blkif_trusted ||
1784 !xenbus_read_unsigned(dev->nodename, "trusted", 1);
1785
1786 max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
1787 "max-ring-page-order", 0);
1788 ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
1789 info->nr_ring_pages = 1 << ring_page_order;
1790
1791 err = negotiate_mq(info);
1792 if (err)
1793 goto destroy_blkring;
1794
1795 for_each_rinfo(info, rinfo, i) {
1796
1797 err = setup_blkring(dev, rinfo);
1798 if (err)
1799 goto destroy_blkring;
1800 }
1801
1802 again:
1803 err = xenbus_transaction_start(&xbt);
1804 if (err) {
1805 xenbus_dev_fatal(dev, err, "starting transaction");
1806 goto destroy_blkring;
1807 }
1808
1809 if (info->nr_ring_pages > 1) {
1810 err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1811 ring_page_order);
1812 if (err) {
1813 message = "writing ring-page-order";
1814 goto abort_transaction;
1815 }
1816 }
1817
1818
1819 if (info->nr_rings == 1) {
1820 err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename);
1821 if (err)
1822 goto destroy_blkring;
1823 } else {
1824 char *path;
1825 size_t pathsize;
1826
1827 err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
1828 info->nr_rings);
1829 if (err) {
1830 message = "writing multi-queue-num-queues";
1831 goto abort_transaction;
1832 }
1833
1834 pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1835 path = kmalloc(pathsize, GFP_KERNEL);
1836 if (!path) {
1837 err = -ENOMEM;
1838 message = "ENOMEM while writing ring references";
1839 goto abort_transaction;
1840 }
1841
1842 for_each_rinfo(info, rinfo, i) {
1843 memset(path, 0, pathsize);
1844 snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1845 err = write_per_ring_nodes(xbt, rinfo, path);
1846 if (err) {
1847 kfree(path);
1848 goto destroy_blkring;
1849 }
1850 }
1851 kfree(path);
1852 }
1853 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1854 XEN_IO_PROTO_ABI_NATIVE);
1855 if (err) {
1856 message = "writing protocol";
1857 goto abort_transaction;
1858 }
1859 info->feature_persistent_parm = feature_persistent;
1860 err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
1861 info->feature_persistent_parm);
1862 if (err)
1863 dev_warn(&dev->dev,
1864 "Error writing persistent grants feature to xenbus");
1865
1866 err = xenbus_transaction_end(xbt, 0);
1867 if (err) {
1868 if (err == -EAGAIN)
1869 goto again;
1870 xenbus_dev_fatal(dev, err, "completing transaction");
1871 goto destroy_blkring;
1872 }
1873
1874 for_each_rinfo(info, rinfo, i) {
1875 unsigned int j;
1876
1877 for (j = 0; j < BLK_RING_SIZE(info); j++)
1878 rinfo->shadow[j].req.u.rw.id = j + 1;
1879 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1880 }
1881 xenbus_switch_state(dev, XenbusStateInitialised);
1882
1883 return 0;
1884
1885 abort_transaction:
1886 xenbus_transaction_end(xbt, 1);
1887 if (message)
1888 xenbus_dev_fatal(dev, err, "%s", message);
1889 destroy_blkring:
1890 blkif_free(info, 0);
1891 return err;
1892 }
1893
1894 static int negotiate_mq(struct blkfront_info *info)
1895 {
1896 unsigned int backend_max_queues;
1897 unsigned int i;
1898 struct blkfront_ring_info *rinfo;
1899
1900 BUG_ON(info->nr_rings);
1901
1902
1903 backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend,
1904 "multi-queue-max-queues", 1);
1905 info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
1906
1907 if (!info->nr_rings)
1908 info->nr_rings = 1;
1909
1910 info->rinfo_size = struct_size(info->rinfo, shadow,
1911 BLK_RING_SIZE(info));
1912 info->rinfo = kvcalloc(info->nr_rings, info->rinfo_size, GFP_KERNEL);
1913 if (!info->rinfo) {
1914 xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure");
1915 info->nr_rings = 0;
1916 return -ENOMEM;
1917 }
1918
1919 for_each_rinfo(info, rinfo, i) {
1920 INIT_LIST_HEAD(&rinfo->indirect_pages);
1921 INIT_LIST_HEAD(&rinfo->grants);
1922 rinfo->dev_info = info;
1923 INIT_WORK(&rinfo->work, blkif_restart_queue);
1924 spin_lock_init(&rinfo->ring_lock);
1925 }
1926 return 0;
1927 }
1928
1929
1930
1931
1932
1933
1934
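/*
 * Entry point when a new device is created: allocate and initialise the
 * basic blkfront_info structure. Ring and xenstore setup happen later,
 * driven by backend state changes.
 */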
1935 static int blkfront_probe(struct xenbus_device *dev,
1936 const struct xenbus_device_id *id)
1937 {
1938 int err, vdevice;
1939 struct blkfront_info *info;
1940
1941
1942 err = xenbus_scanf(XBT_NIL, dev->nodename,
1943 "virtual-device", "%i", &vdevice);
1944 if (err != 1) {
1945
1946 err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
1947 "%i", &vdevice);
1948 if (err != 1) {
1949 xenbus_dev_fatal(dev, err, "reading virtual-device");
1950 return err;
1951 }
1952 }
1953
1954 if (xen_hvm_domain()) {
1955 char *type;
1956 int len;
1957
1958 if (xen_has_pv_and_legacy_disk_devices()) {
1959 int major;
1960
1961 if (!VDEV_IS_EXTENDED(vdevice))
1962 major = BLKIF_MAJOR(vdevice);
1963 else
1964 major = XENVBD_MAJOR;
1965
1966 if (major != XENVBD_MAJOR) {
1967 printk(KERN_INFO
1968 "%s: HVM does not support vbd %d as xen block device\n",
1969 __func__, vdevice);
1970 return -ENODEV;
1971 }
1972 }
1973
1974 type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1975 if (IS_ERR(type))
1976 return -ENODEV;
1977 if (strncmp(type, "cdrom", 5) == 0) {
1978 kfree(type);
1979 return -ENODEV;
1980 }
1981 kfree(type);
1982 }
1983 info = kzalloc(sizeof(*info), GFP_KERNEL);
1984 if (!info) {
1985 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1986 return -ENOMEM;
1987 }
1988
1989 info->xbdev = dev;
1990
1991 mutex_init(&info->mutex);
1992 info->vdevice = vdevice;
1993 info->connected = BLKIF_STATE_DISCONNECTED;
1994
1995
1996 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
1997 dev_set_drvdata(&dev->dev, info);
1998
1999 mutex_lock(&blkfront_mutex);
2000 list_add(&info->info_list, &info_list);
2001 mutex_unlock(&blkfront_mutex);
2002
2003 return 0;
2004 }
2005
2006 static int blkif_recover(struct blkfront_info *info)
2007 {
2008 unsigned int r_index;
2009 struct request *req, *n;
2010 int rc;
2011 struct bio *bio;
2012 unsigned int segs;
2013 struct blkfront_ring_info *rinfo;
2014
2015 blkfront_gather_backend_features(info);
2016
2017 blkif_set_queue_limits(info);
2018 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
2019 blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
2020
2021 for_each_rinfo(info, rinfo, r_index) {
2022 rc = blkfront_setup_indirect(rinfo);
2023 if (rc)
2024 return rc;
2025 }
2026 xenbus_switch_state(info->xbdev, XenbusStateConnected);
2027
2028
2029 info->connected = BLKIF_STATE_CONNECTED;
2030
2031 for_each_rinfo(info, rinfo, r_index) {
2032
2033 kick_pending_request_queues(rinfo);
2034 }
2035
2036 list_for_each_entry_safe(req, n, &info->requests, queuelist) {
2037
2038 list_del_init(&req->queuelist);
2039 BUG_ON(req->nr_phys_segments > segs);
2040 blk_mq_requeue_request(req, false);
2041 }
2042 blk_mq_start_stopped_hw_queues(info->rq, true);
2043 blk_mq_kick_requeue_list(info->rq);
2044
2045 while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
2046
2047 submit_bio(bio);
2048 }
2049
2050 return 0;
2051 }
2052
2053
2054
2055
2056
2057
2058
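/*
 * We are reconnecting to the backend after a suspend/resume or migration:
 * collect the in-flight requests and bios so they can be reissued once the
 * new rings are connected.
 */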
2059 static int blkfront_resume(struct xenbus_device *dev)
2060 {
2061 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2062 int err = 0;
2063 unsigned int i, j;
2064 struct blkfront_ring_info *rinfo;
2065
2066 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
2067
2068 bio_list_init(&info->bio_list);
2069 INIT_LIST_HEAD(&info->requests);
2070 for_each_rinfo(info, rinfo, i) {
2071 struct bio_list merge_bio;
2072 struct blk_shadow *shadow = rinfo->shadow;
2073
2074 for (j = 0; j < BLK_RING_SIZE(info); j++) {
2075
2076 if (!shadow[j].request)
2077 continue;
2078
2079
2080
2081
2082 if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
2083 req_op(shadow[j].request) == REQ_OP_DISCARD ||
2084 req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
2085 shadow[j].request->cmd_flags & REQ_FUA) {
2086
2087
2088
2089
2090
2091
2092
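/* Flush and discard operations carry no bios to resubmit, so requeue these
 * (and FUA writes) as whole requests. */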
2093 list_add(&shadow[j].request->queuelist, &info->requests);
2094 continue;
2095 }
2096 merge_bio.head = shadow[j].request->bio;
2097 merge_bio.tail = shadow[j].request->biotail;
2098 bio_list_merge(&info->bio_list, &merge_bio);
2099 shadow[j].request->bio = NULL;
2100 blk_mq_end_request(shadow[j].request, BLK_STS_OK);
2101 }
2102 }
2103
2104 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
2105
2106 err = talk_to_blkback(dev, info);
2107 if (!err)
2108 blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
2109
2110
2111
2112
2113
2114
2115
2116 return err;
2117 }
2118
2119 static void blkfront_closing(struct blkfront_info *info)
2120 {
2121 struct xenbus_device *xbdev = info->xbdev;
2122 struct blkfront_ring_info *rinfo;
2123 unsigned int i;
2124
2125 if (xbdev->state == XenbusStateClosing)
2126 return;
2127
2128
2129 if (info->rq && info->gd) {
2130 blk_mq_stop_hw_queues(info->rq);
2131 blk_mark_disk_dead(info->gd);
2132 set_capacity(info->gd, 0);
2133 }
2134
2135 for_each_rinfo(info, rinfo, i) {
2136
2137 gnttab_cancel_free_callback(&rinfo->callback);
2138
2139
2140 flush_work(&rinfo->work);
2141 }
2142
2143 xenbus_frontend_closed(xbdev);
2144 }
2145
2146 static void blkfront_setup_discard(struct blkfront_info *info)
2147 {
2148 info->feature_discard = 1;
2149 info->discard_granularity = xenbus_read_unsigned(info->xbdev->otherend,
2150 "discard-granularity",
2151 0);
2152 info->discard_alignment = xenbus_read_unsigned(info->xbdev->otherend,
2153 "discard-alignment", 0);
2154 info->feature_secdiscard =
2155 !!xenbus_read_unsigned(info->xbdev->otherend, "discard-secure",
2156 0);
2157 }
2158
2159 static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
2160 {
2161 unsigned int psegs, grants, memflags;
2162 int err, i;
2163 struct blkfront_info *info = rinfo->dev_info;
2164
2165 memflags = memalloc_noio_save();
2166
2167 if (info->max_indirect_segments == 0) {
2168 if (!HAS_EXTRA_REQ)
2169 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2170 else {
2171 /*
2172 * When an extra req is required, the maximum
2173 * grants supported is related to the size of the
2174 * Linux block segment.
2175 */
2176 grants = GRANTS_PER_PSEG;
2177 }
2178 }
2179 else
2180 grants = info->max_indirect_segments;
2181 psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG);
2182
2183 err = fill_grant_buffer(rinfo,
2184 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
2185 if (err)
2186 goto out_of_memory;
2187
2188 if (!info->bounce && info->max_indirect_segments) {
2189 /*
2190 * We are using indirect descriptors but don't have a bounce
2191 * buffer, so we need to allocate a set of pages that can be
2192 * used for mapping indirect grefs.
2193 */
2194 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
2195
2196 BUG_ON(!list_empty(&rinfo->indirect_pages));
2197 for (i = 0; i < num; i++) {
2198 struct page *indirect_page = alloc_page(GFP_KERNEL |
2199 __GFP_ZERO);
2200 if (!indirect_page)
2201 goto out_of_memory;
2202 list_add(&indirect_page->lru, &rinfo->indirect_pages);
2203 }
2204 }
2205
2206 for (i = 0; i < BLK_RING_SIZE(info); i++) {
2207 rinfo->shadow[i].grants_used =
2208 kvcalloc(grants,
2209 sizeof(rinfo->shadow[i].grants_used[0]),
2210 GFP_KERNEL);
2211 rinfo->shadow[i].sg = kvcalloc(psegs,
2212 sizeof(rinfo->shadow[i].sg[0]),
2213 GFP_KERNEL);
2214 if (info->max_indirect_segments)
2215 rinfo->shadow[i].indirect_grants =
2216 kvcalloc(INDIRECT_GREFS(grants),
2217 sizeof(rinfo->shadow[i].indirect_grants[0]),
2218 GFP_KERNEL);
2219 if ((rinfo->shadow[i].grants_used == NULL) ||
2220 (rinfo->shadow[i].sg == NULL) ||
2221 (info->max_indirect_segments &&
2222 (rinfo->shadow[i].indirect_grants == NULL)))
2223 goto out_of_memory;
2224 sg_init_table(rinfo->shadow[i].sg, psegs);
2225 }
2226
2227 memalloc_noio_restore(memflags);
2228
2229 return 0;
2230
2231 out_of_memory:
2232 for (i = 0; i < BLK_RING_SIZE(info); i++) {
2233 kvfree(rinfo->shadow[i].grants_used);
2234 rinfo->shadow[i].grants_used = NULL;
2235 kvfree(rinfo->shadow[i].sg);
2236 rinfo->shadow[i].sg = NULL;
2237 kvfree(rinfo->shadow[i].indirect_grants);
2238 rinfo->shadow[i].indirect_grants = NULL;
2239 }
2240 if (!list_empty(&rinfo->indirect_pages)) {
2241 struct page *indirect_page, *n;
2242 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
2243 list_del(&indirect_page->lru);
2244 __free_page(indirect_page);
2245 }
2246 }
2247
2248 memalloc_noio_restore(memflags);
2249
2250 return -ENOMEM;
2251 }
2252
2253 /*
2254 * Gather all backend feature-* nodes advertised in xenstore.
2255 */
2256 static void blkfront_gather_backend_features(struct blkfront_info *info)
2257 {
2258 unsigned int indirect_segments;
2259
2260 info->feature_flush = 0;
2261 info->feature_fua = 0;
2262 
2263 /*
2264 * If there's no "feature-barrier" defined, then it means
2265 * we're dealing with a very old backend which writes
2266 * synchronously; nothing to do.
2267 *
2268 * If there are barriers, then we use flush.
2269 */
2270 if (xenbus_read_unsigned(info->xbdev->otherend, "feature-barrier", 0)) {
2271 info->feature_flush = 1;
2272 info->feature_fua = 1;
2273 }
2274
2275 /*
2276 * And if there is "feature-flush-cache" use that above
2277 * barriers.
2278 */
2279 if (xenbus_read_unsigned(info->xbdev->otherend, "feature-flush-cache",
2280 0)) {
2281 info->feature_flush = 1;
2282 info->feature_fua = 0;
2283 }
2284
2285 if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
2286 blkfront_setup_discard(info);
2287
2288 if (info->feature_persistent_parm)
2289 info->feature_persistent =
2290 !!xenbus_read_unsigned(info->xbdev->otherend,
2291 "feature-persistent", 0);
2292 if (info->feature_persistent)
2293 info->bounce = true;
2294
2295 indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
2296 "feature-max-indirect-segments", 0);
2297 if (indirect_segments > xen_blkif_max_segments)
2298 indirect_segments = xen_blkif_max_segments;
2299 if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
2300 indirect_segments = 0;
2301 info->max_indirect_segments = indirect_segments;
2302
2303 if (info->feature_persistent) {
2304 mutex_lock(&blkfront_mutex);
2305 schedule_delayed_work(&blkfront_work, HZ * 10);
2306 mutex_unlock(&blkfront_mutex);
2307 }
2308 }
2309
2310 /*
2311 * Invoked when the backend is finally 'ready' and has announced the
2312 * details of the physical device - #sectors, sector size, etc.
2313 */
2314 static void blkfront_connect(struct blkfront_info *info)
2315 {
2316 unsigned long long sectors;
2317 unsigned long sector_size;
2318 unsigned int physical_sector_size;
2319 int err, i;
2320 struct blkfront_ring_info *rinfo;
2321
2322 switch (info->connected) {
2323 case BLKIF_STATE_CONNECTED:
2324 /*
2325 * Potentially, the back-end may be signalling
2326 * a capacity change; update the capacity.
2327 */
2328 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
2329 "sectors", "%Lu", §ors);
2330 if (XENBUS_EXIST_ERR(err))
2331 return;
2332 printk(KERN_INFO "Setting capacity to %Lu\n",
2333 sectors);
2334 set_capacity_and_notify(info->gd, sectors);
2335
2336 return;
2337 case BLKIF_STATE_SUSPENDED:
2338 /*
2339 * If we are recovering from suspension, we need to wait
2340 * for the backend to announce its features before
2341 * reconnecting; at least we need to know if the backend
2342 * supports indirect descriptors, and how many.
2343 */
2344 blkif_recover(info);
2345 return;
2346
2347 default:
2348 break;
2349 }
2350
2351 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
2352 __func__, info->xbdev->otherend);
2353
2354 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
2355 "sectors", "%llu", §ors,
2356 "info", "%u", &info->vdisk_info,
2357 "sector-size", "%lu", §or_size,
2358 NULL);
2359 if (err) {
2360 xenbus_dev_fatal(info->xbdev, err,
2361 "reading backend fields at %s",
2362 info->xbdev->otherend);
2363 return;
2364 }
2365
2366 /*
2367 * physical-sector-size is a newer field, so old backends may not
2368 * announce it. Assume physical sector size to be the same as
2369 * sector_size in that case.
2370 */
2371 physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend,
2372 "physical-sector-size",
2373 sector_size);
2374 blkfront_gather_backend_features(info);
2375 for_each_rinfo(info, rinfo, i) {
2376 err = blkfront_setup_indirect(rinfo);
2377 if (err) {
2378 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2379 info->xbdev->otherend);
2380 blkif_free(info, 0);
2381 break;
2382 }
2383 }
2384
2385 err = xlvbd_alloc_gendisk(sectors, info, sector_size,
2386 physical_sector_size);
2387 if (err) {
2388 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
2389 info->xbdev->otherend);
2390 goto fail;
2391 }
2392
2393 xenbus_switch_state(info->xbdev, XenbusStateConnected);
2394
2395 /* Kick pending requests. */
2396 info->connected = BLKIF_STATE_CONNECTED;
2397 for_each_rinfo(info, rinfo, i)
2398 kick_pending_request_queues(rinfo);
2399
2400 err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
2401 if (err) {
2402 put_disk(info->gd);
2403 blk_mq_free_tag_set(&info->tag_set);
2404 info->rq = NULL;
2405 goto fail;
2406 }
2407
2408 info->is_ready = 1;
2409 return;
2410
2411 fail:
2412 blkif_free(info, 0);
2413 return;
2414 }
2415
2416 /*
2417 * Callback received when the backend's state changes.
2418 */
2419 static void blkback_changed(struct xenbus_device *dev,
2420 enum xenbus_state backend_state)
2421 {
2422 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2423
2424 dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
2425
2426 switch (backend_state) {
2427 case XenbusStateInitWait:
2428 if (dev->state != XenbusStateInitialising)
2429 break;
2430 if (talk_to_blkback(dev, info))
2431 break;
2432 break;
2433 case XenbusStateInitialising:
2434 case XenbusStateInitialised:
2435 case XenbusStateReconfiguring:
2436 case XenbusStateReconfigured:
2437 case XenbusStateUnknown:
2438 break;
2439
2440 case XenbusStateConnected:
2441 /*
2442 * talk_to_blkback sets state to XenbusStateInitialised
2443 * and blkfront_connect sets it to XenbusStateConnected
2444 * (if connection went OK).
2445 *
2446 * If the backend (or toolstack) decides to poke at backend
2447 * state (and re-switches it to XenbusStateConnected) we need
2448 * to deal with this. This is allowed as this is used to
2449 * communicate to the guest that the size of the disk has
2450 * changed!
2451 */
2452 if ((dev->state != XenbusStateInitialised) &&
2453 (dev->state != XenbusStateConnected)) {
2454 if (talk_to_blkback(dev, info))
2455 break;
2456 }
2457
2458 blkfront_connect(info);
2459 break;
2460
2461 case XenbusStateClosed:
2462 if (dev->state == XenbusStateClosed)
2463 break;
2464 fallthrough;
2465 case XenbusStateClosing:
2466 blkfront_closing(info);
2467 break;
2468 }
2469 }
2470
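/*
 * Tear down the frontend when the xenbus device goes away: delete the
 * gendisk, drop the instance from the global info list and release all
 * ring, grant and block-layer resources.
 */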
2471 static int blkfront_remove(struct xenbus_device *xbdev)
2472 {
2473 struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
2474
2475 dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
2476
2477 if (info->gd)
2478 del_gendisk(info->gd);
2479
2480 mutex_lock(&blkfront_mutex);
2481 list_del(&info->info_list);
2482 mutex_unlock(&blkfront_mutex);
2483
2484 blkif_free(info, 0);
2485 if (info->gd) {
2486 xlbd_release_minors(info->gd->first_minor, info->gd->minors);
2487 put_disk(info->gd);
2488 blk_mq_free_tag_set(&info->tag_set);
2489 }
2490
2491 kfree(info);
2492 return 0;
2493 }
2494
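/* Tell xenbus whether this frontend has finished initialising. */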
2495 static int blkfront_is_ready(struct xenbus_device *dev)
2496 {
2497 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2498
2499 return info->is_ready && info->xbdev;
2500 }
2501
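/* Block-layer operations exposed by the virtual block device. */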
2502 static const struct block_device_operations xlvbd_block_fops =
2503 {
2504 .owner = THIS_MODULE,
2505 .getgeo = blkif_getgeo,
2506 .ioctl = blkif_ioctl,
2507 .compat_ioctl = blkdev_compat_ptr_ioctl,
2508 };
2509
2510
2511 static const struct xenbus_device_id blkfront_ids[] = {
2512 { "vbd" },
2513 { "" }
2514 };
2515
2516 static struct xenbus_driver blkfront_driver = {
2517 .ids = blkfront_ids,
2518 .probe = blkfront_probe,
2519 .remove = blkfront_remove,
2520 .resume = blkfront_resume,
2521 .otherend_changed = blkback_changed,
2522 .is_ready = blkfront_is_ready,
2523 };
2524
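/*
 * Revoke unused cached persistent grants on every ring so their
 * grant-table references can be released; grants that the backend
 * still has mapped are left untouched and stay on the list.
 */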
2525 static void purge_persistent_grants(struct blkfront_info *info)
2526 {
2527 unsigned int i;
2528 unsigned long flags;
2529 struct blkfront_ring_info *rinfo;
2530
2531 for_each_rinfo(info, rinfo, i) {
2532 struct grant *gnt_list_entry, *tmp;
2533 LIST_HEAD(grants);
2534
2535 spin_lock_irqsave(&rinfo->ring_lock, flags);
2536
2537 if (rinfo->persistent_gnts_c == 0) {
2538 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2539 continue;
2540 }
2541
2542 list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants,
2543 node) {
2544 if (gnt_list_entry->gref == INVALID_GRANT_REF ||
2545 !gnttab_try_end_foreign_access(gnt_list_entry->gref))
2546 continue;
2547
2548 list_del(&gnt_list_entry->node);
2549 rinfo->persistent_gnts_c--;
2550 gnt_list_entry->gref = INVALID_GRANT_REF;
2551 list_add_tail(&gnt_list_entry->node, &grants);
2552 }
2553
2554 list_splice_tail(&grants, &rinfo->grants);
2555
2556 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2557 }
2558 }
2559
2560 static void blkfront_delay_work(struct work_struct *work)
2561 {
2562 struct blkfront_info *info;
2563 bool need_schedule_work = false;
2564
2565 /*
2566 * Periodically return unused cached persistent grants to the
2567 * backend so their grant-table references can be reclaimed.
2568 * Keep the work scheduled for as long as at least one device
2569 * has negotiated persistent grants with its backend.
2570 */
2571 
2572 mutex_lock(&blkfront_mutex);
2573
2574 list_for_each_entry(info, &info_list, info_list) {
2575 if (info->feature_persistent) {
2576 need_schedule_work = true;
2577 mutex_lock(&info->mutex);
2578 purge_persistent_grants(info);
2579 mutex_unlock(&info->mutex);
2580 }
2581 }
2582
2583 if (need_schedule_work)
2584 schedule_delayed_work(&blkfront_work, HZ * 10);
2585
2586 mutex_unlock(&blkfront_mutex);
2587 }
2588
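/*
 * Module init: register the XENVBD block major, clamp the module
 * parameters (max_indirect_segments, max_ring_page_order, max_queues)
 * to the supported ranges, and register the xenbus frontend driver.
 *
 * Illustrative (hypothetical) invocation with tuned parameters:
 *   modprobe xen-blkfront max_indirect_segments=64 max_queues=4
 */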
2589 static int __init xlblk_init(void)
2590 {
2591 int ret;
2592 int nr_cpus = num_online_cpus();
2593
2594 if (!xen_domain())
2595 return -ENODEV;
2596
2597 if (!xen_has_pv_disk_devices())
2598 return -ENODEV;
2599
2600 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
2601 pr_warn("xen_blk: can't get major %d with name %s\n",
2602 XENVBD_MAJOR, DEV_NAME);
2603 return -ENODEV;
2604 }
2605
2606 if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
2607 xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2608
2609 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2610 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2611 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2612 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2613 }
2614
2615 if (xen_blkif_max_queues > nr_cpus) {
2616 pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2617 xen_blkif_max_queues, nr_cpus);
2618 xen_blkif_max_queues = nr_cpus;
2619 }
2620
2621 INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
2622
2623 ret = xenbus_register_frontend(&blkfront_driver);
2624 if (ret) {
2625 unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2626 return ret;
2627 }
2628
2629 return 0;
2630 }
2631 module_init(xlblk_init);
2632
2633
2634 static void __exit xlblk_exit(void)
2635 {
2636 cancel_delayed_work_sync(&blkfront_work);
2637
2638 xenbus_unregister_driver(&blkfront_driver);
2639 unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2640 kfree(minors);
2641 }
2642 module_exit(xlblk_exit);
2643
2644 MODULE_DESCRIPTION("Xen virtual block device frontend");
2645 MODULE_LICENSE("GPL");
2646 MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2647 MODULE_ALIAS("xen:vbd");
2648 MODULE_ALIAS("xenblk");