0001 /******************************************************************************
0002  *
0003  * Back-end of the driver for virtual block devices. This portion of the
0004  * driver exports a 'unified' block-device interface that can be accessed
0005  * by any operating system that implements a compatible front end. A
0006  * reference front-end implementation can be found in:
0007  *  drivers/block/xen-blkfront.c
0008  *
0009  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
0010  * Copyright (c) 2005, Christopher Clark
0011  *
0012  * This program is free software; you can redistribute it and/or
0013  * modify it under the terms of the GNU General Public License version 2
0014  * as published by the Free Software Foundation; or, when distributed
0015  * separately from the Linux kernel or incorporated into other
0016  * software packages, subject to the following license:
0017  *
0018  * Permission is hereby granted, free of charge, to any person obtaining a copy
0019  * of this source file (the "Software"), to deal in the Software without
0020  * restriction, including without limitation the rights to use, copy, modify,
0021  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
0022  * and to permit persons to whom the Software is furnished to do so, subject to
0023  * the following conditions:
0024  *
0025  * The above copyright notice and this permission notice shall be included in
0026  * all copies or substantial portions of the Software.
0027  *
0028  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0029  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0030  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
0031  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
0032  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
0033  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
0034  * IN THE SOFTWARE.
0035  */
0036 
0037 #define pr_fmt(fmt) "xen-blkback: " fmt
0038 
0039 #include <linux/spinlock.h>
0040 #include <linux/kthread.h>
0041 #include <linux/list.h>
0042 #include <linux/delay.h>
0043 #include <linux/freezer.h>
0044 #include <linux/bitmap.h>
0045 
0046 #include <xen/events.h>
0047 #include <xen/page.h>
0048 #include <xen/xen.h>
0049 #include <asm/xen/hypervisor.h>
0050 #include <asm/xen/hypercall.h>
0051 #include <xen/balloon.h>
0052 #include <xen/grant_table.h>
0053 #include "common.h"
0054 
0055 /*
0056  * Maximum number of unused free pages to keep in the internal buffer.
0057  * Setting this too low reduces the memory used by each backend, but can
0058  * incur a performance penalty.
0059  *
0060  * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
0061  * be set to a lower value that might degrade performance on some intensive
0062  * IO workloads.
0063  */
0064 
0065 static int max_buffer_pages = 1024;
0066 module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
0067 MODULE_PARM_DESC(max_buffer_pages,
0068 "Maximum number of free pages to keep in each block backend buffer");
0069 
0070 /*
0071  * Maximum number of grants to map persistently in blkback. For maximum
0072  * performance this should be the total number of grants that can be used
0073  * to fill the ring, but since this might become too high, especially with
0074  * the use of indirect descriptors, we set it to a value that provides good
0075  * performance without using too much memory.
0076  *
0077  * When the list of persistent grants is full we clean it up using an LRU
0078  * algorithm.
0079  */
0080 
0081 static int max_pgrants = 1056;
0082 module_param_named(max_persistent_grants, max_pgrants, int, 0644);
0083 MODULE_PARM_DESC(max_persistent_grants,
0084                  "Maximum number of grants to map persistently");
0085 
0086 /*
0087  * How long a persistent grant is allowed to remain allocated without being in
0088  * use. The time is in seconds; 0 means no timeout.
0089  */
0090 
0091 static unsigned int pgrant_timeout = 60;
0092 module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
0093            uint, 0644);
0094 MODULE_PARM_DESC(persistent_grant_unused_seconds,
0095          "Time in seconds an unused persistent grant is allowed to "
0096          "remain allocated. Default is 60, 0 means unlimited.");
0097 
0098 /*
0099  * Maximum number of rings/queues blkback supports. Allow as many queues as
0100  * there are CPUs if the user has not specified a value.
0101  */
0102 unsigned int xenblk_max_queues;
0103 module_param_named(max_queues, xenblk_max_queues, uint, 0644);
0104 MODULE_PARM_DESC(max_queues,
0105          "Maximum number of hardware queues per virtual disk." \
0106          "By default it is the number of online CPUs.");
0107 
0108 /*
0109  * Maximum order of pages to be used for the shared ring between front and
0110  * backend; 4KB page granularity is used.
0111  */
0112 unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
0113 module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
0114 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
0115 /*
0116  * The LRU mechanism to clean the lists of persistent grants needs to
0117  * be executed periodically. The time interval between consecutive executions
0118  * of the purge mechanism is set in ms.
0119  */
0120 #define LRU_INTERVAL 100
0121 
0122 /*
0123  * When the persistent grants list is full we will remove unused grants
0124  * from the list. This is the percentage of grants to be removed at each
0125  * LRU execution.
0126  */
0127 #define LRU_PERCENT_CLEAN 5
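     /*
      * Worked example with the defaults above: once the tree holds
      * max_pgrants (1056) entries and a frontend has tried to exceed that,
      * purge_persistent_gnt() below asks the LRU pass to free at least
      * (1056 / 100) * 5 = 50 unused grants, plus any excess over max_pgrants.
      */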
0128 
0129 /* Run-time switchable: /sys/module/xen_blkback/parameters/ */
0130 static unsigned int log_stats;
0131 module_param(log_stats, int, 0644);
0132 
0133 #define BLKBACK_INVALID_HANDLE (~0)
0134 
0135 static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
0136 {
0137     return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
0138             HZ * pgrant_timeout);
0139 }
0140 
0141 #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
0142 
0143 static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
0144 static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
0145                 struct blkif_request *req,
0146                 struct pending_req *pending_req);
0147 static void make_response(struct xen_blkif_ring *ring, u64 id,
0148               unsigned short op, int st);
0149 
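     /*
      * foreach_grant_safe() below is an in-order walk of an rbtree of
      * persistent grants that is safe against rb_erase() of the current
      * node: the next node is looked up before the body runs, and the walk
      * stops once container_of() has been applied to a NULL rb_node.
      */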
0150 #define foreach_grant_safe(pos, n, rbtree, node) \
0151     for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
0152          (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
0153          &(pos)->node != NULL; \
0154          (pos) = container_of(n, typeof(*(pos)), node), \
0155          (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
0156 
0157 
0158 /*
0159  * We don't need locking around the persistent grant helpers
0160  * because blkback uses a single thread for each backend, so we
0161  * can be sure that these functions will never be called recursively.
0162  *
0163  * The only exception to that is put_persistent_gnt, which can be called
0164  * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
0165  * bit operations to modify the flags of a persistent grant and to count
0166  * the number of used grants.
0167  */
0168 static int add_persistent_gnt(struct xen_blkif_ring *ring,
0169                    struct persistent_gnt *persistent_gnt)
0170 {
0171     struct rb_node **new = NULL, *parent = NULL;
0172     struct persistent_gnt *this;
0173     struct xen_blkif *blkif = ring->blkif;
0174 
0175     if (ring->persistent_gnt_c >= max_pgrants) {
0176         if (!blkif->vbd.overflow_max_grants)
0177             blkif->vbd.overflow_max_grants = 1;
0178         return -EBUSY;
0179     }
0180     /* Figure out where to put new node */
0181     new = &ring->persistent_gnts.rb_node;
0182     while (*new) {
0183         this = container_of(*new, struct persistent_gnt, node);
0184 
0185         parent = *new;
0186         if (persistent_gnt->gnt < this->gnt)
0187             new = &((*new)->rb_left);
0188         else if (persistent_gnt->gnt > this->gnt)
0189             new = &((*new)->rb_right);
0190         else {
0191             pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
0192             return -EINVAL;
0193         }
0194     }
0195 
0196     persistent_gnt->active = true;
0197     /* Add new node and rebalance tree. */
0198     rb_link_node(&(persistent_gnt->node), parent, new);
0199     rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
0200     ring->persistent_gnt_c++;
0201     atomic_inc(&ring->persistent_gnt_in_use);
0202     return 0;
0203 }
0204 
0205 static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
0206                          grant_ref_t gref)
0207 {
0208     struct persistent_gnt *data;
0209     struct rb_node *node = NULL;
0210 
0211     node = ring->persistent_gnts.rb_node;
0212     while (node) {
0213         data = container_of(node, struct persistent_gnt, node);
0214 
0215         if (gref < data->gnt)
0216             node = node->rb_left;
0217         else if (gref > data->gnt)
0218             node = node->rb_right;
0219         else {
0220             if (data->active) {
0221                 pr_alert_ratelimited("requesting a grant already in use\n");
0222                 return NULL;
0223             }
0224             data->active = true;
0225             atomic_inc(&ring->persistent_gnt_in_use);
0226             return data;
0227         }
0228     }
0229     return NULL;
0230 }
0231 
0232 static void put_persistent_gnt(struct xen_blkif_ring *ring,
0233                                struct persistent_gnt *persistent_gnt)
0234 {
0235     if (!persistent_gnt->active)
0236         pr_alert_ratelimited("freeing a grant already unused\n");
0237     persistent_gnt->last_used = jiffies;
0238     persistent_gnt->active = false;
0239     atomic_dec(&ring->persistent_gnt_in_use);
0240 }
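     /*
      * Lifecycle of a persistent grant: add_persistent_gnt() inserts a newly
      * mapped grant into the per-ring rbtree and marks it active,
      * get_persistent_gnt() looks it up by gref for reuse, and
      * put_persistent_gnt() stamps the last-used time and clears the active
      * flag so the LRU purge below may eventually unmap and free it.
      */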
0241 
0242 static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
0243                                  unsigned int num)
0244 {
0245     struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0246     struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0247     struct persistent_gnt *persistent_gnt;
0248     struct rb_node *n;
0249     int segs_to_unmap = 0;
0250     struct gntab_unmap_queue_data unmap_data;
0251 
0252     unmap_data.pages = pages;
0253     unmap_data.unmap_ops = unmap;
0254     unmap_data.kunmap_ops = NULL;
0255 
0256     foreach_grant_safe(persistent_gnt, n, root, node) {
0257         BUG_ON(persistent_gnt->handle ==
0258             BLKBACK_INVALID_HANDLE);
0259         gnttab_set_unmap_op(&unmap[segs_to_unmap],
0260             (unsigned long) pfn_to_kaddr(page_to_pfn(
0261                 persistent_gnt->page)),
0262             GNTMAP_host_map,
0263             persistent_gnt->handle);
0264 
0265         pages[segs_to_unmap] = persistent_gnt->page;
0266 
0267         if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
0268             !rb_next(&persistent_gnt->node)) {
0269 
0270             unmap_data.count = segs_to_unmap;
0271             BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
0272 
0273             gnttab_page_cache_put(&ring->free_pages, pages,
0274                           segs_to_unmap);
0275             segs_to_unmap = 0;
0276         }
0277 
0278         rb_erase(&persistent_gnt->node, root);
0279         kfree(persistent_gnt);
0280         num--;
0281     }
0282     BUG_ON(num != 0);
0283 }
0284 
0285 void xen_blkbk_unmap_purged_grants(struct work_struct *work)
0286 {
0287     struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0288     struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0289     struct persistent_gnt *persistent_gnt;
0290     int segs_to_unmap = 0;
0291     struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
0292     struct gntab_unmap_queue_data unmap_data;
0293 
0294     unmap_data.pages = pages;
0295     unmap_data.unmap_ops = unmap;
0296     unmap_data.kunmap_ops = NULL;
0297 
0298     while (!list_empty(&ring->persistent_purge_list)) {
0299         persistent_gnt = list_first_entry(&ring->persistent_purge_list,
0300                                           struct persistent_gnt,
0301                                           remove_node);
0302         list_del(&persistent_gnt->remove_node);
0303 
0304         gnttab_set_unmap_op(&unmap[segs_to_unmap],
0305             vaddr(persistent_gnt->page),
0306             GNTMAP_host_map,
0307             persistent_gnt->handle);
0308 
0309         pages[segs_to_unmap] = persistent_gnt->page;
0310 
0311         if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
0312             unmap_data.count = segs_to_unmap;
0313             BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
0314             gnttab_page_cache_put(&ring->free_pages, pages,
0315                           segs_to_unmap);
0316             segs_to_unmap = 0;
0317         }
0318         kfree(persistent_gnt);
0319     }
0320     if (segs_to_unmap > 0) {
0321         unmap_data.count = segs_to_unmap;
0322         BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
0323         gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
0324     }
0325 }
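     /*
      * Both free_persistent_gnts() and xen_blkbk_unmap_purged_grants() above
      * batch their grant-table unmaps in groups of
      * BLKIF_MAX_SEGMENTS_PER_REQUEST and hand the freed pages back to the
      * ring's page cache via gnttab_page_cache_put().
      */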
0326 
0327 static void purge_persistent_gnt(struct xen_blkif_ring *ring)
0328 {
0329     struct persistent_gnt *persistent_gnt;
0330     struct rb_node *n;
0331     unsigned int num_clean, total;
0332     bool scan_used = false;
0333     struct rb_root *root;
0334 
0335     if (work_busy(&ring->persistent_purge_work)) {
0336         pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
0337         goto out;
0338     }
0339 
0340     if (ring->persistent_gnt_c < max_pgrants ||
0341         (ring->persistent_gnt_c == max_pgrants &&
0342         !ring->blkif->vbd.overflow_max_grants)) {
0343         num_clean = 0;
0344     } else {
0345         num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
0346         num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
0347         num_clean = min(ring->persistent_gnt_c, num_clean);
0348         pr_debug("Going to purge at least %u persistent grants\n",
0349              num_clean);
0350     }
0351 
0352     /*
0353      * At this point, we can be sure that there will be no calls
0354      * to get_persistent_gnt (because we are executing this code from
0355      * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
0356      * which means that the number of currently used grants will go down,
0357      * but never up, so we will always be able to remove the requested
0358      * number of grants.
0359      */
0360 
0361     total = 0;
0362 
0363     BUG_ON(!list_empty(&ring->persistent_purge_list));
0364     root = &ring->persistent_gnts;
0365 purge_list:
0366     foreach_grant_safe(persistent_gnt, n, root, node) {
0367         BUG_ON(persistent_gnt->handle ==
0368             BLKBACK_INVALID_HANDLE);
0369 
0370         if (persistent_gnt->active)
0371             continue;
0372         if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
0373             continue;
0374         if (scan_used && total >= num_clean)
0375             continue;
0376 
0377         rb_erase(&persistent_gnt->node, root);
0378         list_add(&persistent_gnt->remove_node,
0379              &ring->persistent_purge_list);
0380         total++;
0381     }
0382     /*
0383      * Check whether we also need to start cleaning
0384      * grants that were used since the last purge in order to cope
0385      * with the requested number.
0386      */
0387     if (!scan_used && total < num_clean) {
0388         pr_debug("Still missing %u purged frames\n", num_clean - total);
0389         scan_used = true;
0390         goto purge_list;
0391     }
0392 
0393     if (total) {
0394         ring->persistent_gnt_c -= total;
0395         ring->blkif->vbd.overflow_max_grants = 0;
0396 
0397         /* We can defer this work */
0398         schedule_work(&ring->persistent_purge_work);
0399         pr_debug("Purged %u/%u\n", num_clean, total);
0400     }
0401 
0402 out:
0403     return;
0404 }
0405 
0406 /*
0407  * Retrieve a free pending_req structure from the 'pending_reqs' to be used.
0408  */
0409 static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
0410 {
0411     struct pending_req *req = NULL;
0412     unsigned long flags;
0413 
0414     spin_lock_irqsave(&ring->pending_free_lock, flags);
0415     if (!list_empty(&ring->pending_free)) {
0416         req = list_entry(ring->pending_free.next, struct pending_req,
0417                  free_list);
0418         list_del(&req->free_list);
0419     }
0420     spin_unlock_irqrestore(&ring->pending_free_lock, flags);
0421     return req;
0422 }
0423 
0424 /*
0425  * Return the 'pending_req' structure back to the free pool. We also
0426  * wake up the thread if it was waiting for a free request.
0427  */
0428 static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
0429 {
0430     unsigned long flags;
0431     int was_empty;
0432 
0433     spin_lock_irqsave(&ring->pending_free_lock, flags);
0434     was_empty = list_empty(&ring->pending_free);
0435     list_add(&req->free_list, &ring->pending_free);
0436     spin_unlock_irqrestore(&ring->pending_free_lock, flags);
0437     if (was_empty)
0438         wake_up(&ring->pending_free_wq);
0439 }
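     /*
      * alloc_req()/free_req() above manage ring->pending_free as a simple
      * free list of preallocated pending_req structures; free_req() only
      * wakes pending_free_wq on the empty -> non-empty transition, which is
      * when a waiter in xen_blkif_schedule() can make progress again.
      */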
0440 
0441 /*
0442  * Routines for managing virtual block devices (vbds).
0443  */
0444 static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
0445                  enum req_op operation)
0446 {
0447     struct xen_vbd *vbd = &blkif->vbd;
0448     int rc = -EACCES;
0449 
0450     if ((operation != REQ_OP_READ) && vbd->readonly)
0451         goto out;
0452 
0453     if (likely(req->nr_sects)) {
0454         blkif_sector_t end = req->sector_number + req->nr_sects;
0455 
0456         if (unlikely(end < req->sector_number))
0457             goto out;
0458         if (unlikely(end > vbd_sz(vbd)))
0459             goto out;
0460     }
0461 
0462     req->dev  = vbd->pdevice;
0463     req->bdev = vbd->bdev;
0464     rc = 0;
0465 
0466  out:
0467     return rc;
0468 }
0469 
0470 static void xen_vbd_resize(struct xen_blkif *blkif)
0471 {
0472     struct xen_vbd *vbd = &blkif->vbd;
0473     struct xenbus_transaction xbt;
0474     int err;
0475     struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
0476     unsigned long long new_size = vbd_sz(vbd);
0477 
0478     pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
0479         blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
0480     pr_info("VBD Resize: new size %llu\n", new_size);
0481     vbd->size = new_size;
0482 again:
0483     err = xenbus_transaction_start(&xbt);
0484     if (err) {
0485         pr_warn("Error starting transaction\n");
0486         return;
0487     }
0488     err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
0489                 (unsigned long long)vbd_sz(vbd));
0490     if (err) {
0491         pr_warn("Error writing new size\n");
0492         goto abort;
0493     }
0494     /*
0495      * Write the current state; we will use this to synchronize
0496      * the front-end. If the current state is "connected" the
0497      * front-end will get the new size information online.
0498      */
0499     err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
0500     if (err) {
0501         pr_warn("Error writing the state\n");
0502         goto abort;
0503     }
0504 
0505     err = xenbus_transaction_end(xbt, 0);
0506     if (err == -EAGAIN)
0507         goto again;
0508     if (err)
0509         pr_warn("Error ending transaction\n");
0510     return;
0511 abort:
0512     xenbus_transaction_end(xbt, 1);
0513 }
0514 
0515 /*
0516  * Notification from the guest OS.
0517  */
0518 static void blkif_notify_work(struct xen_blkif_ring *ring)
0519 {
0520     ring->waiting_reqs = 1;
0521     wake_up(&ring->wq);
0522 }
0523 
0524 irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
0525 {
0526     blkif_notify_work(dev_id);
0527     return IRQ_HANDLED;
0528 }
0529 
0530 /*
0531  * SCHEDULER FUNCTIONS
0532  */
0533 
0534 static void print_stats(struct xen_blkif_ring *ring)
0535 {
0536     pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
0537          "  |  ds %4llu | pg: %4u/%4d\n",
0538          current->comm, ring->st_oo_req,
0539          ring->st_rd_req, ring->st_wr_req,
0540          ring->st_f_req, ring->st_ds_req,
0541          ring->persistent_gnt_c, max_pgrants);
0542     ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
0543     ring->st_rd_req = 0;
0544     ring->st_wr_req = 0;
0545     ring->st_oo_req = 0;
0546     ring->st_ds_req = 0;
0547 }
0548 
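     /*
      * Per-ring kthread: waits for frontend notifications (or the
      * LRU_INTERVAL timeout), consumes requests via do_block_io_op(), issues
      * a lateeoi once the ring is idle again, and periodically purges
      * persistent grants and shrinks the free page cache.
      */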
0549 int xen_blkif_schedule(void *arg)
0550 {
0551     struct xen_blkif_ring *ring = arg;
0552     struct xen_blkif *blkif = ring->blkif;
0553     struct xen_vbd *vbd = &blkif->vbd;
0554     unsigned long timeout;
0555     int ret;
0556     bool do_eoi;
0557     unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
0558 
0559     set_freezable();
0560     while (!kthread_should_stop()) {
0561         if (try_to_freeze())
0562             continue;
0563         if (unlikely(vbd->size != vbd_sz(vbd)))
0564             xen_vbd_resize(blkif);
0565 
0566         timeout = msecs_to_jiffies(LRU_INTERVAL);
0567 
0568         timeout = wait_event_interruptible_timeout(
0569             ring->wq,
0570             ring->waiting_reqs || kthread_should_stop(),
0571             timeout);
0572         if (timeout == 0)
0573             goto purge_gnt_list;
0574         timeout = wait_event_interruptible_timeout(
0575             ring->pending_free_wq,
0576             !list_empty(&ring->pending_free) ||
0577             kthread_should_stop(),
0578             timeout);
0579         if (timeout == 0)
0580             goto purge_gnt_list;
0581 
0582         do_eoi = ring->waiting_reqs;
0583 
0584         ring->waiting_reqs = 0;
0585         smp_mb(); /* clear flag *before* checking for work */
0586 
0587         ret = do_block_io_op(ring, &eoi_flags);
0588         if (ret > 0)
0589             ring->waiting_reqs = 1;
0590         if (ret == -EACCES)
0591             wait_event_interruptible(ring->shutdown_wq,
0592                          kthread_should_stop());
0593 
0594         if (do_eoi && !ring->waiting_reqs) {
0595             xen_irq_lateeoi(ring->irq, eoi_flags);
0596             eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
0597         }
0598 
0599 purge_gnt_list:
0600         if (blkif->vbd.feature_gnt_persistent &&
0601             time_after(jiffies, ring->next_lru)) {
0602             purge_persistent_gnt(ring);
0603             ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
0604         }
0605 
0606         /* Shrink the free pages pool if it is too large. */
0607         if (time_before(jiffies, blkif->buffer_squeeze_end))
0608             gnttab_page_cache_shrink(&ring->free_pages, 0);
0609         else
0610             gnttab_page_cache_shrink(&ring->free_pages,
0611                          max_buffer_pages);
0612 
0613         if (log_stats && time_after(jiffies, ring->st_print))
0614             print_stats(ring);
0615     }
0616 
0617     /* Drain pending purge work */
0618     flush_work(&ring->persistent_purge_work);
0619 
0620     if (log_stats)
0621         print_stats(ring);
0622 
0623     ring->xenblkd = NULL;
0624 
0625     return 0;
0626 }
0627 
0628 /*
0629  * Remove persistent grants and empty the pool of free pages
0630  */
0631 void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
0632 {
0633     /* Free all persistent grant pages */
0634     if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
0635         free_persistent_gnts(ring, &ring->persistent_gnts,
0636             ring->persistent_gnt_c);
0637 
0638     BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
0639     ring->persistent_gnt_c = 0;
0640 
0641     /* Since we are shutting down remove all pages from the buffer */
0642     gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
0643 }
0644 
0645 static unsigned int xen_blkbk_unmap_prepare(
0646     struct xen_blkif_ring *ring,
0647     struct grant_page **pages,
0648     unsigned int num,
0649     struct gnttab_unmap_grant_ref *unmap_ops,
0650     struct page **unmap_pages)
0651 {
0652     unsigned int i, invcount = 0;
0653 
0654     for (i = 0; i < num; i++) {
0655         if (pages[i]->persistent_gnt != NULL) {
0656             put_persistent_gnt(ring, pages[i]->persistent_gnt);
0657             continue;
0658         }
0659         if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
0660             continue;
0661         unmap_pages[invcount] = pages[i]->page;
0662         gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
0663                     GNTMAP_host_map, pages[i]->handle);
0664         pages[i]->handle = BLKBACK_INVALID_HANDLE;
0665         invcount++;
0666     }
0667 
0668     return invcount;
0669 }
0670 
0671 static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
0672 {
0673     struct pending_req *pending_req = (struct pending_req *)(data->data);
0674     struct xen_blkif_ring *ring = pending_req->ring;
0675     struct xen_blkif *blkif = ring->blkif;
0676 
0677     /* BUG_ON used to reproduce existing behaviour,
0678        but is this the best way to deal with this? */
0679     BUG_ON(result);
0680 
0681     gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
0682     make_response(ring, pending_req->id,
0683               pending_req->operation, pending_req->status);
0684     free_req(ring, pending_req);
0685     /*
0686      * Make sure the request is freed before releasing blkif,
0687      * or there could be a race between free_req and the
0688      * cleanup done in xen_blkif_free during shutdown.
0689      *
0690      * NB: The fact that we might try to wake up pending_free_wq
0691      * before drain_complete (in case there's a drain going on)
0692      * is not a problem with our current implementation
0693      * because we can be sure there's no thread waiting on
0694      * pending_free_wq if there's a drain going on, but it has
0695      * to be taken into account if the current model is changed.
0696      */
0697     if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
0698         complete(&blkif->drain_complete);
0699     }
0700     xen_blkif_put(blkif);
0701 }
0702 
0703 static void xen_blkbk_unmap_and_respond(struct pending_req *req)
0704 {
0705     struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
0706     struct xen_blkif_ring *ring = req->ring;
0707     struct grant_page **pages = req->segments;
0708     unsigned int invcount;
0709 
0710     invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
0711                        req->unmap, req->unmap_pages);
0712 
0713     work->data = req;
0714     work->done = xen_blkbk_unmap_and_respond_callback;
0715     work->unmap_ops = req->unmap;
0716     work->kunmap_ops = NULL;
0717     work->pages = req->unmap_pages;
0718     work->count = invcount;
0719 
0720     gnttab_unmap_refs_async(&req->gnttab_unmap_data);
0721 }
0722 
0723 
0724 /*
0725  * Unmap the grant references.
0726  *
0727  * This could accumulate ops up to the batch size to reduce the number
0728  * of hypercalls, but since this is only used in error paths there's
0729  * no real need.
0730  */
0731 static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
0732                             struct grant_page *pages[],
0733                             int num)
0734 {
0735     struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0736     struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0737     unsigned int invcount = 0;
0738     int ret;
0739 
0740     while (num) {
0741         unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
0742 
0743         invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
0744                            unmap, unmap_pages);
0745         if (invcount) {
0746             ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
0747             BUG_ON(ret);
0748             gnttab_page_cache_put(&ring->free_pages, unmap_pages,
0749                           invcount);
0750         }
0751         pages += batch;
0752         num -= batch;
0753     }
0754 }
0755 
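     /*
      * Map the grants referenced by a request. Already-persistent grants are
      * reused directly; the rest are mapped in batches of up to
      * BLKIF_MAX_SEGMENTS_PER_REQUEST with one gnttab_map_refs() call per
      * batch, and newly mapped grants are promoted to persistent ones while
      * there is room under max_pgrants.
      */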
0756 static int xen_blkbk_map(struct xen_blkif_ring *ring,
0757              struct grant_page *pages[],
0758              int num, bool ro)
0759 {
0760     struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0761     struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
0762     struct persistent_gnt *persistent_gnt = NULL;
0763     phys_addr_t addr = 0;
0764     int i, seg_idx, new_map_idx;
0765     int segs_to_map = 0;
0766     int ret = 0;
0767     int last_map = 0, map_until = 0;
0768     int use_persistent_gnts;
0769     struct xen_blkif *blkif = ring->blkif;
0770 
0771     use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
0772 
0773     /*
0774      * Fill out preq.nr_sects with the proper number of sectors, and set
0775      * up map[..] with the PFN of the page in our domain and the
0776      * corresponding grant reference for each page.
0777      */
0778 again:
0779     for (i = map_until; i < num; i++) {
0780         uint32_t flags;
0781 
0782         if (use_persistent_gnts) {
0783             persistent_gnt = get_persistent_gnt(
0784                 ring,
0785                 pages[i]->gref);
0786         }
0787 
0788         if (persistent_gnt) {
0789             /*
0790              * We are using persistent grants and
0791              * the grant is already mapped
0792              */
0793             pages[i]->page = persistent_gnt->page;
0794             pages[i]->persistent_gnt = persistent_gnt;
0795         } else {
0796             if (gnttab_page_cache_get(&ring->free_pages,
0797                           &pages[i]->page)) {
0798                 gnttab_page_cache_put(&ring->free_pages,
0799                               pages_to_gnt,
0800                               segs_to_map);
0801                 ret = -ENOMEM;
0802                 goto out;
0803             }
0804             addr = vaddr(pages[i]->page);
0805             pages_to_gnt[segs_to_map] = pages[i]->page;
0806             pages[i]->persistent_gnt = NULL;
0807             flags = GNTMAP_host_map;
0808             if (!use_persistent_gnts && ro)
0809                 flags |= GNTMAP_readonly;
0810             gnttab_set_map_op(&map[segs_to_map++], addr,
0811                       flags, pages[i]->gref,
0812                       blkif->domid);
0813         }
0814         map_until = i + 1;
0815         if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
0816             break;
0817     }
0818 
0819     if (segs_to_map)
0820         ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
0821 
0822     /*
0823      * Now swizzle the MFN in our domain with the MFN from the other domain
0824      * so that when we access vaddr(pending_req,i) it has the contents of
0825      * the page from the other domain.
0826      */
0827     for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
0828         if (!pages[seg_idx]->persistent_gnt) {
0829             /* This is a newly mapped grant */
0830             BUG_ON(new_map_idx >= segs_to_map);
0831             if (unlikely(map[new_map_idx].status != 0)) {
0832                 pr_debug("invalid buffer -- could not remap it\n");
0833                 gnttab_page_cache_put(&ring->free_pages,
0834                               &pages[seg_idx]->page, 1);
0835                 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
0836                 ret |= !ret;
0837                 goto next;
0838             }
0839             pages[seg_idx]->handle = map[new_map_idx].handle;
0840         } else {
0841             continue;
0842         }
0843         if (use_persistent_gnts &&
0844             ring->persistent_gnt_c < max_pgrants) {
0845             /*
0846              * We are using persistent grants, the grant is
0847              * not mapped but we might have room for it.
0848              */
0849             persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
0850                                  GFP_KERNEL);
0851             if (!persistent_gnt) {
0852                 /*
0853                  * If we don't have enough memory to
0854                  * allocate the persistent_gnt struct,
0855                  * map this grant non-persistently.
0856                  */
0857                 goto next;
0858             }
0859             persistent_gnt->gnt = map[new_map_idx].ref;
0860             persistent_gnt->handle = map[new_map_idx].handle;
0861             persistent_gnt->page = pages[seg_idx]->page;
0862             if (add_persistent_gnt(ring,
0863                                    persistent_gnt)) {
0864                 kfree(persistent_gnt);
0865                 persistent_gnt = NULL;
0866                 goto next;
0867             }
0868             pages[seg_idx]->persistent_gnt = persistent_gnt;
0869             pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
0870                  persistent_gnt->gnt, ring->persistent_gnt_c,
0871                  max_pgrants);
0872             goto next;
0873         }
0874         if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
0875             blkif->vbd.overflow_max_grants = 1;
0876             pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
0877                      blkif->domid, blkif->vbd.handle);
0878         }
0879         /*
0880          * We could not map this grant persistently, so use it as
0881          * a non-persistent grant.
0882          */
0883 next:
0884         new_map_idx++;
0885     }
0886     segs_to_map = 0;
0887     last_map = map_until;
0888     if (!ret && map_until != num)
0889         goto again;
0890 
0891 out:
0892     for (i = last_map; i < num; i++) {
0893         /* Don't zap current batch's valid persistent grants. */
0894         if(i >= map_until)
0895             pages[i]->persistent_gnt = NULL;
0896         pages[i]->handle = BLKBACK_INVALID_HANDLE;
0897     }
0898 
0899     return ret;
0900 }
0901 
0902 static int xen_blkbk_map_seg(struct pending_req *pending_req)
0903 {
0904     int rc;
0905 
0906     rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
0907                pending_req->nr_segs,
0908                        (pending_req->operation != BLKIF_OP_READ));
0909 
0910     return rc;
0911 }
0912 
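     /*
      * Indirect requests keep their segment descriptors in separately
      * granted pages; map those pages, copy each segment's gref, first_sect
      * and last_sect into the pending request while validating the sector
      * range, then unmap the descriptor pages again.
      */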
0913 static int xen_blkbk_parse_indirect(struct blkif_request *req,
0914                     struct pending_req *pending_req,
0915                     struct seg_buf seg[],
0916                     struct phys_req *preq)
0917 {
0918     struct grant_page **pages = pending_req->indirect_pages;
0919     struct xen_blkif_ring *ring = pending_req->ring;
0920     int indirect_grefs, rc, n, nseg, i;
0921     struct blkif_request_segment *segments = NULL;
0922 
0923     nseg = pending_req->nr_segs;
0924     indirect_grefs = INDIRECT_PAGES(nseg);
0925     BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
0926 
0927     for (i = 0; i < indirect_grefs; i++)
0928         pages[i]->gref = req->u.indirect.indirect_grefs[i];
0929 
0930     rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
0931     if (rc)
0932         goto unmap;
0933 
0934     for (n = 0; n < nseg; n++) {
0935         uint8_t first_sect, last_sect;
0936 
0937         if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
0938             /* Map indirect segments */
0939             if (segments)
0940                 kunmap_atomic(segments);
0941             segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
0942         }
0943         i = n % SEGS_PER_INDIRECT_FRAME;
0944 
0945         pending_req->segments[n]->gref = segments[i].gref;
0946 
0947         first_sect = READ_ONCE(segments[i].first_sect);
0948         last_sect = READ_ONCE(segments[i].last_sect);
0949         if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
0950             rc = -EINVAL;
0951             goto unmap;
0952         }
0953 
0954         seg[n].nsec = last_sect - first_sect + 1;
0955         seg[n].offset = first_sect << 9;
0956         preq->nr_sects += seg[n].nsec;
0957     }
0958 
0959 unmap:
0960     if (segments)
0961         kunmap_atomic(segments);
0962     xen_blkbk_unmap(ring, pages, indirect_grefs);
0963     return rc;
0964 }
0965 
0966 static int dispatch_discard_io(struct xen_blkif_ring *ring,
0967                 struct blkif_request *req)
0968 {
0969     int err = 0;
0970     int status = BLKIF_RSP_OKAY;
0971     struct xen_blkif *blkif = ring->blkif;
0972     struct block_device *bdev = blkif->vbd.bdev;
0973     struct phys_req preq;
0974 
0975     xen_blkif_get(blkif);
0976 
0977     preq.sector_number = req->u.discard.sector_number;
0978     preq.nr_sects      = req->u.discard.nr_sectors;
0979 
0980     err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
0981     if (err) {
0982         pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
0983             preq.sector_number,
0984             preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
0985         goto fail_response;
0986     }
0987     ring->st_ds_req++;
0988 
0989     if (blkif->vbd.discard_secure &&
0990         (req->u.discard.flag & BLKIF_DISCARD_SECURE))
0991         err = blkdev_issue_secure_erase(bdev,
0992                 req->u.discard.sector_number,
0993                 req->u.discard.nr_sectors, GFP_KERNEL);
0994     else
0995         err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
0996                 req->u.discard.nr_sectors, GFP_KERNEL);
0997 
0998 fail_response:
0999     if (err == -EOPNOTSUPP) {
1000         pr_debug("discard op failed, not supported\n");
1001         status = BLKIF_RSP_EOPNOTSUPP;
1002     } else if (err)
1003         status = BLKIF_RSP_ERROR;
1004 
1005     make_response(ring, req->u.discard.id, req->operation, status);
1006     xen_blkif_put(blkif);
1007     return err;
1008 }
1009 
1010 static int dispatch_other_io(struct xen_blkif_ring *ring,
1011                  struct blkif_request *req,
1012                  struct pending_req *pending_req)
1013 {
1014     free_req(ring, pending_req);
1015     make_response(ring, req->u.other.id, req->operation,
1016               BLKIF_RSP_EOPNOTSUPP);
1017     return -EIO;
1018 }
1019 
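     /*
      * Used by BLKIF_OP_WRITE_BARRIER: wait for the ring's in-flight counter
      * to drop to zero before the barrier write itself is issued.
      */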
1020 static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1021 {
1022     struct xen_blkif *blkif = ring->blkif;
1023 
1024     atomic_set(&blkif->drain, 1);
1025     do {
1026         if (atomic_read(&ring->inflight) == 0)
1027             break;
1028         wait_for_completion_interruptible_timeout(
1029                 &blkif->drain_complete, HZ);
1030 
1031         if (!atomic_read(&blkif->drain))
1032             break;
1033     } while (!kthread_should_stop());
1034     atomic_set(&blkif->drain, 0);
1035 }
1036 
1037 static void __end_block_io_op(struct pending_req *pending_req,
1038         blk_status_t error)
1039 {
1040     /* An error fails the entire request. */
1041     if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
1042         error == BLK_STS_NOTSUPP) {
1043         pr_debug("flush diskcache op failed, not supported\n");
1044         xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1045         pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1046     } else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
1047            error == BLK_STS_NOTSUPP) {
1048         pr_debug("write barrier op failed, not supported\n");
1049         xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1050         pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1051     } else if (error) {
1052         pr_debug("Buffer not up-to-date at end of operation,"
1053              " error=%d\n", error);
1054         pending_req->status = BLKIF_RSP_ERROR;
1055     }
1056 
1057     /*
1058      * If all of the bio's have completed it is time to unmap
1059      * the grant references associated with 'request' and provide
1060      * the proper response on the ring.
1061      */
1062     if (atomic_dec_and_test(&pending_req->pendcnt))
1063         xen_blkbk_unmap_and_respond(pending_req);
1064 }
1065 
1066 /*
1067  * bio callback.
1068  */
1069 static void end_block_io_op(struct bio *bio)
1070 {
1071     __end_block_io_op(bio->bi_private, bio->bi_status);
1072     bio_put(bio);
1073 }
1074 
1075 
1076 
1077 /*
1078  * Function to copy the 'struct blkif_request' from the ring buffer
1079  * (which has the sectors we want, the number of them, grant references, etc.),
1080  * and transmute it to the block API to hand it over to the proper block disk.
1081  */
1082 static int
1083 __do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1084 {
1085     union blkif_back_rings *blk_rings = &ring->blk_rings;
1086     struct blkif_request req;
1087     struct pending_req *pending_req;
1088     RING_IDX rc, rp;
1089     int more_to_do = 0;
1090 
1091     rc = blk_rings->common.req_cons;
1092     rp = blk_rings->common.sring->req_prod;
1093     rmb(); /* Ensure we see queued requests up to 'rp'. */
1094 
1095     if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1096         rc = blk_rings->common.rsp_prod_pvt;
1097         pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1098             rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1099         return -EACCES;
1100     }
1101     while (rc != rp) {
1102 
1103         if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
1104             break;
1105 
1106         /* We've seen a request, so clear spurious eoi flag. */
1107         *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
1108 
1109         if (kthread_should_stop()) {
1110             more_to_do = 1;
1111             break;
1112         }
1113 
1114         pending_req = alloc_req(ring);
1115         if (NULL == pending_req) {
1116             ring->st_oo_req++;
1117             more_to_do = 1;
1118             break;
1119         }
1120 
1121         switch (ring->blkif->blk_protocol) {
1122         case BLKIF_PROTOCOL_NATIVE:
1123             memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1124             break;
1125         case BLKIF_PROTOCOL_X86_32:
1126             blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1127             break;
1128         case BLKIF_PROTOCOL_X86_64:
1129             blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1130             break;
1131         default:
1132             BUG();
1133         }
1134         blk_rings->common.req_cons = ++rc; /* before make_response() */
1135 
1136         /* Apply all sanity checks to /private copy/ of request. */
1137         barrier();
1138 
1139         switch (req.operation) {
1140         case BLKIF_OP_READ:
1141         case BLKIF_OP_WRITE:
1142         case BLKIF_OP_WRITE_BARRIER:
1143         case BLKIF_OP_FLUSH_DISKCACHE:
1144         case BLKIF_OP_INDIRECT:
1145             if (dispatch_rw_block_io(ring, &req, pending_req))
1146                 goto done;
1147             break;
1148         case BLKIF_OP_DISCARD:
1149             free_req(ring, pending_req);
1150             if (dispatch_discard_io(ring, &req))
1151                 goto done;
1152             break;
1153         default:
1154             if (dispatch_other_io(ring, &req, pending_req))
1155                 goto done;
1156             break;
1157         }
1158 
1159         /* Yield point for this unbounded loop. */
1160         cond_resched();
1161     }
1162 done:
1163     return more_to_do;
1164 }
1165 
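     /*
      * Drain the ring via __do_block_io_op(), then re-check it with
      * RING_FINAL_CHECK_FOR_REQUESTS to close the window where the frontend
      * queues a request after the backend saw an empty ring but before it
      * goes back to sleep.
      */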
1166 static int
1167 do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1168 {
1169     union blkif_back_rings *blk_rings = &ring->blk_rings;
1170     int more_to_do;
1171 
1172     do {
1173         more_to_do = __do_block_io_op(ring, eoi_flags);
1174         if (more_to_do)
1175             break;
1176 
1177         RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1178     } while (more_to_do);
1179 
1180     return more_to_do;
1181 }
1182 /*
1183  * Transmute the 'struct blkif_request' into a proper 'struct bio'
1184  * and call 'submit_bio' to pass it to the underlying storage.
1185  */
1186 static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1187                 struct blkif_request *req,
1188                 struct pending_req *pending_req)
1189 {
1190     struct phys_req preq;
1191     struct seg_buf *seg = pending_req->seg;
1192     unsigned int nseg;
1193     struct bio *bio = NULL;
1194     struct bio **biolist = pending_req->biolist;
1195     int i, nbio = 0;
1196     enum req_op operation;
1197     blk_opf_t operation_flags = 0;
1198     struct blk_plug plug;
1199     bool drain = false;
1200     struct grant_page **pages = pending_req->segments;
1201     unsigned short req_operation;
1202 
1203     req_operation = req->operation == BLKIF_OP_INDIRECT ?
1204             req->u.indirect.indirect_op : req->operation;
1205 
1206     if ((req->operation == BLKIF_OP_INDIRECT) &&
1207         (req_operation != BLKIF_OP_READ) &&
1208         (req_operation != BLKIF_OP_WRITE)) {
1209         pr_debug("Invalid indirect operation (%u)\n", req_operation);
1210         goto fail_response;
1211     }
1212 
1213     switch (req_operation) {
1214     case BLKIF_OP_READ:
1215         ring->st_rd_req++;
1216         operation = REQ_OP_READ;
1217         break;
1218     case BLKIF_OP_WRITE:
1219         ring->st_wr_req++;
1220         operation = REQ_OP_WRITE;
1221         operation_flags = REQ_SYNC | REQ_IDLE;
1222         break;
1223     case BLKIF_OP_WRITE_BARRIER:
1224         drain = true;
1225         fallthrough;
1226     case BLKIF_OP_FLUSH_DISKCACHE:
1227         ring->st_f_req++;
1228         operation = REQ_OP_WRITE;
1229         operation_flags = REQ_PREFLUSH;
1230         break;
1231     default:
1232         operation = 0; /* make gcc happy */
1233         goto fail_response;
1234         break;
1235     }
1236 
1237     /* Check that the number of segments is sane. */
1238     nseg = req->operation == BLKIF_OP_INDIRECT ?
1239            req->u.indirect.nr_segments : req->u.rw.nr_segments;
1240 
1241     if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
1242         unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1243              (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1244         unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1245              (nseg > MAX_INDIRECT_SEGMENTS))) {
1246         pr_debug("Bad number of segments in request (%d)\n", nseg);
1247         /* Haven't submitted any bio's yet. */
1248         goto fail_response;
1249     }
1250 
1251     preq.nr_sects      = 0;
1252 
1253     pending_req->ring      = ring;
1254     pending_req->id        = req->u.rw.id;
1255     pending_req->operation = req_operation;
1256     pending_req->status    = BLKIF_RSP_OKAY;
1257     pending_req->nr_segs   = nseg;
1258 
1259     if (req->operation != BLKIF_OP_INDIRECT) {
1260         preq.dev               = req->u.rw.handle;
1261         preq.sector_number     = req->u.rw.sector_number;
1262         for (i = 0; i < nseg; i++) {
1263             pages[i]->gref = req->u.rw.seg[i].gref;
1264             seg[i].nsec = req->u.rw.seg[i].last_sect -
1265                 req->u.rw.seg[i].first_sect + 1;
1266             seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1267             if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
1268                 (req->u.rw.seg[i].last_sect <
1269                  req->u.rw.seg[i].first_sect))
1270                 goto fail_response;
1271             preq.nr_sects += seg[i].nsec;
1272         }
1273     } else {
1274         preq.dev               = req->u.indirect.handle;
1275         preq.sector_number     = req->u.indirect.sector_number;
1276         if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
1277             goto fail_response;
1278     }
1279 
1280     if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1281         pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1282              operation == REQ_OP_READ ? "read" : "write",
1283              preq.sector_number,
1284              preq.sector_number + preq.nr_sects,
1285              ring->blkif->vbd.pdevice);
1286         goto fail_response;
1287     }
1288 
1289     /*
1290      * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
1291      * is set there.
1292      */
1293     for (i = 0; i < nseg; i++) {
1294         if (((int)preq.sector_number|(int)seg[i].nsec) &
1295             ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1296             pr_debug("Misaligned I/O request from domain %d\n",
1297                  ring->blkif->domid);
1298             goto fail_response;
1299         }
1300     }
1301 
1302     /* Wait on all outstanding I/Os and, once that has been completed,
1303      * issue the flush.
1304      */
1305     if (drain)
1306         xen_blk_drain_io(pending_req->ring);
1307 
1308     /*
1309      * If we have failed at this point, we need to undo the M2P override,
1310      * set gnttab_set_unmap_op on all of the grant references and perform
1311      * the hypercall to unmap the grants - that is all done in
1312      * xen_blkbk_unmap.
1313      */
1314     if (xen_blkbk_map_seg(pending_req))
1315         goto fail_flush;
1316 
1317     /*
1318      * The corresponding xen_blkif_put is done in __end_block_io_op, or
1319      * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
1320      */
1321     xen_blkif_get(ring->blkif);
1322     atomic_inc(&ring->inflight);
1323 
1324     for (i = 0; i < nseg; i++) {
1325         while ((bio == NULL) ||
1326                (bio_add_page(bio,
1327                      pages[i]->page,
1328                      seg[i].nsec << 9,
1329                      seg[i].offset) == 0)) {
1330             bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
1331                     operation | operation_flags,
1332                     GFP_KERNEL);
1333             biolist[nbio++] = bio;
1334             bio->bi_private = pending_req;
1335             bio->bi_end_io  = end_block_io_op;
1336             bio->bi_iter.bi_sector  = preq.sector_number;
1337         }
1338 
1339         preq.sector_number += seg[i].nsec;
1340     }
1341 
1342     /* This will be hit if the operation was a flush or discard. */
1343     if (!bio) {
1344         BUG_ON(operation_flags != REQ_PREFLUSH);
1345 
1346         bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
1347                 GFP_KERNEL);
1348         biolist[nbio++] = bio;
1349         bio->bi_private = pending_req;
1350         bio->bi_end_io  = end_block_io_op;
1351     }
1352 
1353     atomic_set(&pending_req->pendcnt, nbio);
1354     blk_start_plug(&plug);
1355 
1356     for (i = 0; i < nbio; i++)
1357         submit_bio(biolist[i]);
1358 
1359     /* Let the I/Os go... */
1360     blk_finish_plug(&plug);
1361 
1362     if (operation == REQ_OP_READ)
1363         ring->st_rd_sect += preq.nr_sects;
1364     else if (operation == REQ_OP_WRITE)
1365         ring->st_wr_sect += preq.nr_sects;
1366 
1367     return 0;
1368 
1369  fail_flush:
1370     xen_blkbk_unmap(ring, pending_req->segments,
1371                     pending_req->nr_segs);
1372  fail_response:
1373     /* Haven't submitted any bio's yet. */
1374     make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1375     free_req(ring, pending_req);
1376     msleep(1); /* back off a bit */
1377     return -EIO;
1378 }
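     /*
      * dispatch_rw_block_io() above packs as many segments as possible into
      * each bio via bio_add_page(), starting a new bio whenever the current
      * one refuses a page, and submits the whole list under one blk_plug so
      * the bios reach the underlying queue as a single burst.
      */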
1379 
1380 
1381 
1382 /*
1383  * Put a response on the ring reporting how the operation fared.
1384  */
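     /*
      * Responses can be generated both from the scheduler thread and from
      * the grant unmap completion callback, hence the irqsave locking on
      * blk_ring_lock below.
      */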
1385 static void make_response(struct xen_blkif_ring *ring, u64 id,
1386               unsigned short op, int st)
1387 {
1388     struct blkif_response *resp;
1389     unsigned long     flags;
1390     union blkif_back_rings *blk_rings;
1391     int notify;
1392 
1393     spin_lock_irqsave(&ring->blk_ring_lock, flags);
1394     blk_rings = &ring->blk_rings;
1395     /* Place on the response ring for the relevant domain. */
1396     switch (ring->blkif->blk_protocol) {
1397     case BLKIF_PROTOCOL_NATIVE:
1398         resp = RING_GET_RESPONSE(&blk_rings->native,
1399                      blk_rings->native.rsp_prod_pvt);
1400         break;
1401     case BLKIF_PROTOCOL_X86_32:
1402         resp = RING_GET_RESPONSE(&blk_rings->x86_32,
1403                      blk_rings->x86_32.rsp_prod_pvt);
1404         break;
1405     case BLKIF_PROTOCOL_X86_64:
1406         resp = RING_GET_RESPONSE(&blk_rings->x86_64,
1407                      blk_rings->x86_64.rsp_prod_pvt);
1408         break;
1409     default:
1410         BUG();
1411     }
1412 
1413     resp->id        = id;
1414     resp->operation = op;
1415     resp->status    = st;
1416 
1417     blk_rings->common.rsp_prod_pvt++;
1418     RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1419     spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1420     if (notify)
1421         notify_remote_via_irq(ring->irq);
1422 }
1423 
1424 static int __init xen_blkif_init(void)
1425 {
1426     int rc = 0;
1427 
1428     if (!xen_domain())
1429         return -ENODEV;
1430 
1431     if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
1432         pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
1433             xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
1434         xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1435     }
1436 
1437     if (xenblk_max_queues == 0)
1438         xenblk_max_queues = num_online_cpus();
1439 
1440     rc = xen_blkif_interface_init();
1441     if (rc)
1442         goto failed_init;
1443 
1444     rc = xen_blkif_xenbus_init();
1445     if (rc)
1446         goto failed_init;
1447 
1448  failed_init:
1449     return rc;
1450 }
1451 
1452 module_init(xen_blkif_init);
1453 
1454 static void __exit xen_blkif_fini(void)
1455 {
1456     xen_blkif_xenbus_fini();
1457     xen_blkif_interface_fini();
1458 }
1459 
1460 module_exit(xen_blkif_fini);
1461 
1462 MODULE_LICENSE("Dual BSD/GPL");
1463 MODULE_ALIAS("xen-backend:vbd");