0001 /*
0002  * Copyright (C) 2003 Sistina Software Limited.
0003  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
0004  *
0005  * This file is released under the GPL.
0006  */
0007 
0008 #include <linux/device-mapper.h>
0009 
0010 #include "dm-rq.h"
0011 #include "dm-bio-record.h"
0012 #include "dm-path-selector.h"
0013 #include "dm-uevent.h"
0014 
0015 #include <linux/blkdev.h>
0016 #include <linux/ctype.h>
0017 #include <linux/init.h>
0018 #include <linux/mempool.h>
0019 #include <linux/module.h>
0020 #include <linux/pagemap.h>
0021 #include <linux/slab.h>
0022 #include <linux/time.h>
0023 #include <linux/timer.h>
0024 #include <linux/workqueue.h>
0025 #include <linux/delay.h>
0026 #include <scsi/scsi_dh.h>
0027 #include <linux/atomic.h>
0028 #include <linux/blk-mq.h>
0029 
0030 #define DM_MSG_PREFIX "multipath"
0031 #define DM_PG_INIT_DELAY_MSECS 2000
0032 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
0033 #define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
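/*
 * Note: DM_PG_INIT_DELAY_DEFAULT is an "unset" sentinel.  While
 * pg_init_delay_msecs is left at this value, the fixed
 * DM_PG_INIT_DELAY_MSECS (2000 ms) is used instead (see
 * __pg_init_all_paths() below).
 */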
0034 
0035 static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
0036 
0037 /* Path properties */
0038 struct pgpath {
0039     struct list_head list;
0040 
0041     struct priority_group *pg;  /* Owning PG */
0042     unsigned fail_count;        /* Cumulative failure count */
0043 
0044     struct dm_path path;
0045     struct delayed_work activate_path;
0046 
0047     bool is_active:1;       /* Path status */
0048 };
0049 
0050 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
0051 
0052 /*
0053  * Paths are grouped into Priority Groups and numbered from 1 upwards.
0054  * Each has a path selector which controls which path gets used.
0055  */
0056 struct priority_group {
0057     struct list_head list;
0058 
0059     struct multipath *m;        /* Owning multipath instance */
0060     struct path_selector ps;
0061 
0062     unsigned pg_num;        /* Reference number */
0063     unsigned nr_pgpaths;        /* Number of paths in PG */
0064     struct list_head pgpaths;
0065 
0066     bool bypassed:1;        /* Temporarily bypass this PG? */
0067 };
0068 
0069 /* Multipath context */
0070 struct multipath {
0071     unsigned long flags;        /* Multipath state flags */
0072 
0073     spinlock_t lock;
0074     enum dm_queue_mode queue_mode;
0075 
0076     struct pgpath *current_pgpath;
0077     struct priority_group *current_pg;
0078     struct priority_group *next_pg; /* Switch to this PG if set */
0079 
0080     atomic_t nr_valid_paths;    /* Total number of usable paths */
0081     unsigned nr_priority_groups;
0082     struct list_head priority_groups;
0083 
0084     const char *hw_handler_name;
0085     char *hw_handler_params;
0086     wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
0087     unsigned pg_init_retries;   /* Number of times to retry pg_init */
0088     unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
0089     atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
0090     atomic_t pg_init_count;     /* Number of times pg_init called */
0091 
0092     struct mutex work_mutex;
0093     struct work_struct trigger_event;
0094     struct dm_target *ti;
0095 
0096     struct work_struct process_queued_bios;
0097     struct bio_list queued_bios;
0098 
0099     struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
0100 };
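/*
 * Ownership hierarchy implied by the structures above (sketch):
 *
 *   struct multipath
 *     `-- priority_groups: list of struct priority_group (pg->m points back)
 *           `-- pgpaths: list of struct pgpath (pgpath->pg points back)
 *                 `-- path: struct dm_path wrapping the underlying dm_dev
 */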
0101 
0102 /*
0103  * Context information attached to each io we process.
0104  */
0105 struct dm_mpath_io {
0106     struct pgpath *pgpath;
0107     size_t nr_bytes;
0108     u64 start_time_ns;
0109 };
0110 
0111 typedef int (*action_fn) (struct pgpath *pgpath);
0112 
0113 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
0114 static void trigger_event(struct work_struct *work);
0115 static void activate_or_offline_path(struct pgpath *pgpath);
0116 static void activate_path_work(struct work_struct *work);
0117 static void process_queued_bios(struct work_struct *work);
0118 static void queue_if_no_path_timeout_work(struct timer_list *t);
0119 
0120 /*-----------------------------------------------
0121  * Multipath state flags.
0122  *-----------------------------------------------*/
0123 
0124 #define MPATHF_QUEUE_IO 0           /* Must we queue all I/O? */
0125 #define MPATHF_QUEUE_IF_NO_PATH 1       /* Queue I/O if last path fails? */
0126 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2     /* Saved state during suspension */
0127 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */
0128 #define MPATHF_PG_INIT_DISABLED 4       /* pg_init is not currently allowed */
0129 #define MPATHF_PG_INIT_REQUIRED 5       /* pg_init needs calling? */
0130 #define MPATHF_PG_INIT_DELAY_RETRY 6        /* Delay pg_init retry? */
0131 
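/*
 * Lockless fast path for flag checks: test the bit without m->lock and
 * only take the lock to re-read it when the unlocked read saw it set,
 * so a clear bit is reported without any locking overhead.
 */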
0132 static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
0133 {
0134     bool r = test_bit(MPATHF_bit, &m->flags);
0135 
0136     if (r) {
0137         unsigned long flags;
0138         spin_lock_irqsave(&m->lock, flags);
0139         r = test_bit(MPATHF_bit, &m->flags);
0140         spin_unlock_irqrestore(&m->lock, flags);
0141     }
0142 
0143     return r;
0144 }
0145 
0146 /*-----------------------------------------------
0147  * Allocation routines
0148  *-----------------------------------------------*/
0149 
0150 static struct pgpath *alloc_pgpath(void)
0151 {
0152     struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
0153 
0154     if (!pgpath)
0155         return NULL;
0156 
0157     pgpath->is_active = true;
0158 
0159     return pgpath;
0160 }
0161 
0162 static void free_pgpath(struct pgpath *pgpath)
0163 {
0164     kfree(pgpath);
0165 }
0166 
0167 static struct priority_group *alloc_priority_group(void)
0168 {
0169     struct priority_group *pg;
0170 
0171     pg = kzalloc(sizeof(*pg), GFP_KERNEL);
0172 
0173     if (pg)
0174         INIT_LIST_HEAD(&pg->pgpaths);
0175 
0176     return pg;
0177 }
0178 
0179 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
0180 {
0181     struct pgpath *pgpath, *tmp;
0182 
0183     list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
0184         list_del(&pgpath->list);
0185         dm_put_device(ti, pgpath->path.dev);
0186         free_pgpath(pgpath);
0187     }
0188 }
0189 
0190 static void free_priority_group(struct priority_group *pg,
0191                 struct dm_target *ti)
0192 {
0193     struct path_selector *ps = &pg->ps;
0194 
0195     if (ps->type) {
0196         ps->type->destroy(ps);
0197         dm_put_path_selector(ps->type);
0198     }
0199 
0200     free_pgpaths(&pg->pgpaths, ti);
0201     kfree(pg);
0202 }
0203 
0204 static struct multipath *alloc_multipath(struct dm_target *ti)
0205 {
0206     struct multipath *m;
0207 
0208     m = kzalloc(sizeof(*m), GFP_KERNEL);
0209     if (m) {
0210         INIT_LIST_HEAD(&m->priority_groups);
0211         spin_lock_init(&m->lock);
0212         atomic_set(&m->nr_valid_paths, 0);
0213         INIT_WORK(&m->trigger_event, trigger_event);
0214         mutex_init(&m->work_mutex);
0215 
0216         m->queue_mode = DM_TYPE_NONE;
0217 
0218         m->ti = ti;
0219         ti->private = m;
0220 
0221         timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
0222     }
0223 
0224     return m;
0225 }
0226 
0227 static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
0228 {
0229     if (m->queue_mode == DM_TYPE_NONE) {
0230         m->queue_mode = DM_TYPE_REQUEST_BASED;
0231     } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
0232         INIT_WORK(&m->process_queued_bios, process_queued_bios);
0233         /*
0234          * bio-based doesn't support any direct scsi_dh management;
0235          * it just discovers if a scsi_dh is attached.
0236          */
0237         set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
0238     }
0239 
0240     dm_table_set_type(ti->table, m->queue_mode);
0241 
0242     /*
0243      * Init fields that are only used when a scsi_dh is attached
0244      * - must do this unconditionally (really doesn't hurt non-SCSI uses)
0245      */
0246     set_bit(MPATHF_QUEUE_IO, &m->flags);
0247     atomic_set(&m->pg_init_in_progress, 0);
0248     atomic_set(&m->pg_init_count, 0);
0249     m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
0250     init_waitqueue_head(&m->pg_init_wait);
0251 
0252     return 0;
0253 }
0254 
0255 static void free_multipath(struct multipath *m)
0256 {
0257     struct priority_group *pg, *tmp;
0258 
0259     list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
0260         list_del(&pg->list);
0261         free_priority_group(pg, m->ti);
0262     }
0263 
0264     kfree(m->hw_handler_name);
0265     kfree(m->hw_handler_params);
0266     mutex_destroy(&m->work_mutex);
0267     kfree(m);
0268 }
0269 
0270 static struct dm_mpath_io *get_mpio(union map_info *info)
0271 {
0272     return info->ptr;
0273 }
0274 
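/*
 * Per-bio-data layout used by the helpers below: the bio's per-bio-data
 * area holds a struct dm_mpath_io immediately followed by a struct
 * dm_bio_details, hence the combined size returned by
 * multipath_per_bio_data_size() and the "mpio + 1" arithmetic in
 * get_bio_details_from_mpio().
 */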
0275 static size_t multipath_per_bio_data_size(void)
0276 {
0277     return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
0278 }
0279 
0280 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
0281 {
0282     return dm_per_bio_data(bio, multipath_per_bio_data_size());
0283 }
0284 
0285 static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
0286 {
0287     /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
0288     void *bio_details = mpio + 1;
0289     return bio_details;
0290 }
0291 
0292 static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
0293 {
0294     struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
0295     struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
0296 
0297     mpio->nr_bytes = bio->bi_iter.bi_size;
0298     mpio->pgpath = NULL;
0299     mpio->start_time_ns = 0;
0300     *mpio_p = mpio;
0301 
0302     dm_bio_record(bio_details, bio);
0303 }
0304 
0305 /*-----------------------------------------------
0306  * Path selection
0307  *-----------------------------------------------*/
0308 
0309 static int __pg_init_all_paths(struct multipath *m)
0310 {
0311     struct pgpath *pgpath;
0312     unsigned long pg_init_delay = 0;
0313 
0314     lockdep_assert_held(&m->lock);
0315 
0316     if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
0317         return 0;
0318 
0319     atomic_inc(&m->pg_init_count);
0320     clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
0321 
0322     /* Check here to reset pg_init_required */
0323     if (!m->current_pg)
0324         return 0;
0325 
0326     if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
0327         pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
0328                          m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
0329     list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
0330         /* Skip failed paths */
0331         if (!pgpath->is_active)
0332             continue;
0333         if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
0334                        pg_init_delay))
0335             atomic_inc(&m->pg_init_in_progress);
0336     }
0337     return atomic_read(&m->pg_init_in_progress);
0338 }
0339 
0340 static int pg_init_all_paths(struct multipath *m)
0341 {
0342     int ret;
0343     unsigned long flags;
0344 
0345     spin_lock_irqsave(&m->lock, flags);
0346     ret = __pg_init_all_paths(m);
0347     spin_unlock_irqrestore(&m->lock, flags);
0348 
0349     return ret;
0350 }
0351 
0352 static void __switch_pg(struct multipath *m, struct priority_group *pg)
0353 {
0354     lockdep_assert_held(&m->lock);
0355 
0356     m->current_pg = pg;
0357 
0358     /* Must we initialise the PG first, and queue I/O till it's ready? */
0359     if (m->hw_handler_name) {
0360         set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
0361         set_bit(MPATHF_QUEUE_IO, &m->flags);
0362     } else {
0363         clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
0364         clear_bit(MPATHF_QUEUE_IO, &m->flags);
0365     }
0366 
0367     atomic_set(&m->pg_init_count, 0);
0368 }
0369 
0370 static struct pgpath *choose_path_in_pg(struct multipath *m,
0371                     struct priority_group *pg,
0372                     size_t nr_bytes)
0373 {
0374     unsigned long flags;
0375     struct dm_path *path;
0376     struct pgpath *pgpath;
0377 
0378     path = pg->ps.type->select_path(&pg->ps, nr_bytes);
0379     if (!path)
0380         return ERR_PTR(-ENXIO);
0381 
0382     pgpath = path_to_pgpath(path);
0383 
0384     if (unlikely(READ_ONCE(m->current_pg) != pg)) {
0385         /* Only update current_pgpath if pg changed */
0386         spin_lock_irqsave(&m->lock, flags);
0387         m->current_pgpath = pgpath;
0388         __switch_pg(m, pg);
0389         spin_unlock_irqrestore(&m->lock, flags);
0390     }
0391 
0392     return pgpath;
0393 }
0394 
0395 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
0396 {
0397     unsigned long flags;
0398     struct priority_group *pg;
0399     struct pgpath *pgpath;
0400     unsigned bypassed = 1;
0401 
0402     if (!atomic_read(&m->nr_valid_paths)) {
0403         spin_lock_irqsave(&m->lock, flags);
0404         clear_bit(MPATHF_QUEUE_IO, &m->flags);
0405         spin_unlock_irqrestore(&m->lock, flags);
0406         goto failed;
0407     }
0408 
0409     /* Were we instructed to switch PG? */
0410     if (READ_ONCE(m->next_pg)) {
0411         spin_lock_irqsave(&m->lock, flags);
0412         pg = m->next_pg;
0413         if (!pg) {
0414             spin_unlock_irqrestore(&m->lock, flags);
0415             goto check_current_pg;
0416         }
0417         m->next_pg = NULL;
0418         spin_unlock_irqrestore(&m->lock, flags);
0419         pgpath = choose_path_in_pg(m, pg, nr_bytes);
0420         if (!IS_ERR_OR_NULL(pgpath))
0421             return pgpath;
0422     }
0423 
0424     /* Don't change PG until it has no remaining paths */
0425 check_current_pg:
0426     pg = READ_ONCE(m->current_pg);
0427     if (pg) {
0428         pgpath = choose_path_in_pg(m, pg, nr_bytes);
0429         if (!IS_ERR_OR_NULL(pgpath))
0430             return pgpath;
0431     }
0432 
0433     /*
0434      * Loop through priority groups until we find a valid path.
0435      * First time we skip PGs marked 'bypassed'.
0436      * Second time we only try the ones we skipped, but set
0437      * pg_init_delay_retry so we do not hammer controllers.
0438      */
0439     do {
0440         list_for_each_entry(pg, &m->priority_groups, list) {
0441             if (pg->bypassed == !!bypassed)
0442                 continue;
0443             pgpath = choose_path_in_pg(m, pg, nr_bytes);
0444             if (!IS_ERR_OR_NULL(pgpath)) {
0445                 if (!bypassed) {
0446                     spin_lock_irqsave(&m->lock, flags);
0447                     set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
0448                     spin_unlock_irqrestore(&m->lock, flags);
0449                 }
0450                 return pgpath;
0451             }
0452         }
0453     } while (bypassed--);
0454 
0455 failed:
0456     spin_lock_irqsave(&m->lock, flags);
0457     m->current_pgpath = NULL;
0458     m->current_pg = NULL;
0459     spin_unlock_irqrestore(&m->lock, flags);
0460 
0461     return NULL;
0462 }
0463 
0464 /*
0465  * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
0466  * report the function name and line number of the function from which
0467  * it has been invoked.
0468  */
0469 #define dm_report_EIO(m)                        \
0470 do {                                    \
0471     DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
0472               dm_table_device_name((m)->ti->table),     \
0473               test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),   \
0474               test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
0475               dm_noflush_suspending((m)->ti));          \
0476 } while (0)
0477 
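/*
 * For illustration only (device name is hypothetical), a rate-limited
 * debug line emitted by dm_report_EIO() looks roughly like:
 *
 *   mpatha: returning EIO; QIFNP = 0; SQIFNP = 0; DNFS = 0
 */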
0478 /*
0479  * Check whether bios must be queued in the device-mapper core rather
0480  * than here in the target.
0481  */
0482 static bool __must_push_back(struct multipath *m)
0483 {
0484     return dm_noflush_suspending(m->ti);
0485 }
0486 
0487 static bool must_push_back_rq(struct multipath *m)
0488 {
0489     unsigned long flags;
0490     bool ret;
0491 
0492     spin_lock_irqsave(&m->lock, flags);
0493     ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m));
0494     spin_unlock_irqrestore(&m->lock, flags);
0495 
0496     return ret;
0497 }
0498 
0499 /*
0500  * Map cloned requests (request-based multipath)
0501  */
0502 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
0503                    union map_info *map_context,
0504                    struct request **__clone)
0505 {
0506     struct multipath *m = ti->private;
0507     size_t nr_bytes = blk_rq_bytes(rq);
0508     struct pgpath *pgpath;
0509     struct block_device *bdev;
0510     struct dm_mpath_io *mpio = get_mpio(map_context);
0511     struct request_queue *q;
0512     struct request *clone;
0513 
0514     /* Do we need to select a new pgpath? */
0515     pgpath = READ_ONCE(m->current_pgpath);
0516     if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
0517         pgpath = choose_pgpath(m, nr_bytes);
0518 
0519     if (!pgpath) {
0520         if (must_push_back_rq(m))
0521             return DM_MAPIO_DELAY_REQUEUE;
0522         dm_report_EIO(m);   /* Failed */
0523         return DM_MAPIO_KILL;
0524     } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
0525            mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
0526         pg_init_all_paths(m);
0527         return DM_MAPIO_DELAY_REQUEUE;
0528     }
0529 
0530     mpio->pgpath = pgpath;
0531     mpio->nr_bytes = nr_bytes;
0532 
0533     bdev = pgpath->path.dev->bdev;
0534     q = bdev_get_queue(bdev);
0535     clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
0536             BLK_MQ_REQ_NOWAIT);
0537     if (IS_ERR(clone)) {
0538         /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
0539         if (blk_queue_dying(q)) {
0540             atomic_inc(&m->pg_init_in_progress);
0541             activate_or_offline_path(pgpath);
0542             return DM_MAPIO_DELAY_REQUEUE;
0543         }
0544 
0545         /*
0546          * blk-mq's SCHED_RESTART can cover this requeue, so we
0547          * needn't deal with it by DELAY_REQUEUE. More importantly,
0548          * we have to return DM_MAPIO_REQUEUE so that blk-mq can
0549          * get the queue busy feedback (via BLK_STS_RESOURCE),
0550          * otherwise I/O merging can suffer.
0551          */
0552         return DM_MAPIO_REQUEUE;
0553     }
0554     clone->bio = clone->biotail = NULL;
0555     clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
0556     *__clone = clone;
0557 
0558     if (pgpath->pg->ps.type->start_io)
0559         pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
0560                           &pgpath->path,
0561                           nr_bytes);
0562     return DM_MAPIO_REMAPPED;
0563 }
0564 
0565 static void multipath_release_clone(struct request *clone,
0566                     union map_info *map_context)
0567 {
0568     if (unlikely(map_context)) {
0569         /*
0570          * A non-NULL map_context means the caller is still in the map
0571          * method, so we must undo multipath_clone_and_map().
0572          */
0573         struct dm_mpath_io *mpio = get_mpio(map_context);
0574         struct pgpath *pgpath = mpio->pgpath;
0575 
0576         if (pgpath && pgpath->pg->ps.type->end_io)
0577             pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
0578                             &pgpath->path,
0579                             mpio->nr_bytes,
0580                             clone->io_start_time_ns);
0581     }
0582 
0583     blk_mq_free_request(clone);
0584 }
0585 
0586 /*
0587  * Map cloned bios (bio-based multipath)
0588  */
0589 
0590 static void __multipath_queue_bio(struct multipath *m, struct bio *bio)
0591 {
0592     /* Queue for the daemon to resubmit */
0593     bio_list_add(&m->queued_bios, bio);
0594     if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
0595         queue_work(kmultipathd, &m->process_queued_bios);
0596 }
0597 
0598 static void multipath_queue_bio(struct multipath *m, struct bio *bio)
0599 {
0600     unsigned long flags;
0601 
0602     spin_lock_irqsave(&m->lock, flags);
0603     __multipath_queue_bio(m, bio);
0604     spin_unlock_irqrestore(&m->lock, flags);
0605 }
0606 
0607 static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
0608 {
0609     struct pgpath *pgpath;
0610     unsigned long flags;
0611 
0612     /* Do we need to select a new pgpath? */
0613     pgpath = READ_ONCE(m->current_pgpath);
0614     if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
0615         pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
0616 
0617     if (!pgpath) {
0618         spin_lock_irqsave(&m->lock, flags);
0619         if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
0620             __multipath_queue_bio(m, bio);
0621             pgpath = ERR_PTR(-EAGAIN);
0622         }
0623         spin_unlock_irqrestore(&m->lock, flags);
0624 
0625     } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
0626            mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
0627         multipath_queue_bio(m, bio);
0628         pg_init_all_paths(m);
0629         return ERR_PTR(-EAGAIN);
0630     }
0631 
0632     return pgpath;
0633 }
0634 
0635 static int __multipath_map_bio(struct multipath *m, struct bio *bio,
0636                    struct dm_mpath_io *mpio)
0637 {
0638     struct pgpath *pgpath = __map_bio(m, bio);
0639 
0640     if (IS_ERR(pgpath))
0641         return DM_MAPIO_SUBMITTED;
0642 
0643     if (!pgpath) {
0644         if (__must_push_back(m))
0645             return DM_MAPIO_REQUEUE;
0646         dm_report_EIO(m);
0647         return DM_MAPIO_KILL;
0648     }
0649 
0650     mpio->pgpath = pgpath;
0651 
0652     if (dm_ps_use_hr_timer(pgpath->pg->ps.type))
0653         mpio->start_time_ns = ktime_get_ns();
0654 
0655     bio->bi_status = 0;
0656     bio_set_dev(bio, pgpath->path.dev->bdev);
0657     bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
0658 
0659     if (pgpath->pg->ps.type->start_io)
0660         pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
0661                           &pgpath->path,
0662                           mpio->nr_bytes);
0663     return DM_MAPIO_REMAPPED;
0664 }
0665 
0666 static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
0667 {
0668     struct multipath *m = ti->private;
0669     struct dm_mpath_io *mpio = NULL;
0670 
0671     multipath_init_per_bio_data(bio, &mpio);
0672     return __multipath_map_bio(m, bio, mpio);
0673 }
0674 
0675 static void process_queued_io_list(struct multipath *m)
0676 {
0677     if (m->queue_mode == DM_TYPE_REQUEST_BASED)
0678         dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
0679     else if (m->queue_mode == DM_TYPE_BIO_BASED)
0680         queue_work(kmultipathd, &m->process_queued_bios);
0681 }
0682 
0683 static void process_queued_bios(struct work_struct *work)
0684 {
0685     int r;
0686     unsigned long flags;
0687     struct bio *bio;
0688     struct bio_list bios;
0689     struct blk_plug plug;
0690     struct multipath *m =
0691         container_of(work, struct multipath, process_queued_bios);
0692 
0693     bio_list_init(&bios);
0694 
0695     spin_lock_irqsave(&m->lock, flags);
0696 
0697     if (bio_list_empty(&m->queued_bios)) {
0698         spin_unlock_irqrestore(&m->lock, flags);
0699         return;
0700     }
0701 
0702     bio_list_merge(&bios, &m->queued_bios);
0703     bio_list_init(&m->queued_bios);
0704 
0705     spin_unlock_irqrestore(&m->lock, flags);
0706 
0707     blk_start_plug(&plug);
0708     while ((bio = bio_list_pop(&bios))) {
0709         struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
0710         dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
0711         r = __multipath_map_bio(m, bio, mpio);
0712         switch (r) {
0713         case DM_MAPIO_KILL:
0714             bio->bi_status = BLK_STS_IOERR;
0715             bio_endio(bio);
0716             break;
0717         case DM_MAPIO_REQUEUE:
0718             bio->bi_status = BLK_STS_DM_REQUEUE;
0719             bio_endio(bio);
0720             break;
0721         case DM_MAPIO_REMAPPED:
0722             submit_bio_noacct(bio);
0723             break;
0724         case DM_MAPIO_SUBMITTED:
0725             break;
0726         default:
0727             WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
0728         }
0729     }
0730     blk_finish_plug(&plug);
0731 }
0732 
0733 /*
0734  * If we run out of usable paths, should we queue I/O or error it?
0735  */
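/*
 * Illustrative usage, not part of this file: userspace normally toggles
 * this behaviour at runtime via target messages, e.g.
 *
 *   dmsetup message <mpath-dev> 0 queue_if_no_path
 *   dmsetup message <mpath-dev> 0 fail_if_no_path
 *
 * The "fail_if_no_path" case is what the saved-bit handling below refers to.
 */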
0736 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
0737                 bool save_old_value, const char *caller)
0738 {
0739     unsigned long flags;
0740     bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
0741     const char *dm_dev_name = dm_table_device_name(m->ti->table);
0742 
0743     DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d",
0744         dm_dev_name, __func__, caller, queue_if_no_path, save_old_value);
0745 
0746     spin_lock_irqsave(&m->lock, flags);
0747 
0748     queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
0749     saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
0750 
0751     if (save_old_value) {
0752         if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
0753             DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
0754                   dm_dev_name);
0755         } else
0756             assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
0757     } else if (!queue_if_no_path && saved_queue_if_no_path_bit) {
0758         /* A "fail_if_no_path" message disabled queueing; honor it by clearing the saved bit too. */
0759         clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
0760     }
0761     assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
0762 
0763     DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
0764         dm_dev_name, __func__,
0765         test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
0766         test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
0767         dm_noflush_suspending(m->ti));
0768 
0769     spin_unlock_irqrestore(&m->lock, flags);
0770 
0771     if (!queue_if_no_path) {
0772         dm_table_run_md_queue_async(m->ti->table);
0773         process_queued_io_list(m);
0774     }
0775 
0776     return 0;
0777 }
0778 
0779 /*
0780  * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
0781  * process any queued I/O.
0782  */
0783 static void queue_if_no_path_timeout_work(struct timer_list *t)
0784 {
0785     struct multipath *m = from_timer(m, t, nopath_timer);
0786 
0787     DMWARN("queue_if_no_path timeout on %s, failing queued IO",
0788            dm_table_device_name(m->ti->table));
0789     queue_if_no_path(m, false, false, __func__);
0790 }
0791 
0792 /*
0793  * Enable the queue_if_no_path timeout if necessary.
0794  * Called with m->lock held.
0795  */
0796 static void enable_nopath_timeout(struct multipath *m)
0797 {
0798     unsigned long queue_if_no_path_timeout =
0799         READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
0800 
0801     lockdep_assert_held(&m->lock);
0802 
0803     if (queue_if_no_path_timeout > 0 &&
0804         atomic_read(&m->nr_valid_paths) == 0 &&
0805         test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
0806         mod_timer(&m->nopath_timer,
0807               jiffies + queue_if_no_path_timeout);
0808     }
0809 }
0810 
0811 static void disable_nopath_timeout(struct multipath *m)
0812 {
0813     del_timer_sync(&m->nopath_timer);
0814 }
0815 
0816 /*
0817  * An event is triggered whenever a path is taken out of use.
0818  * Includes path failure and PG bypass.
0819  */
0820 static void trigger_event(struct work_struct *work)
0821 {
0822     struct multipath *m =
0823         container_of(work, struct multipath, trigger_event);
0824 
0825     dm_table_event(m->ti->table);
0826 }
0827 
0828 /*-----------------------------------------------------------------
0829  * Constructor/argument parsing:
0830  * <#multipath feature args> [<arg>]*
0831  * <#hw_handler args> [hw_handler [<arg>]*]
0832  * <#priority groups>
0833  * <initial priority group>
0834  *     [<selector> <#selector args> [<arg>]*
0835  *      <#paths> <#per-path selector args>
0836  *         [<path> [<arg>]* ]+ ]+
0837  *---------------------------------------------------------------*/
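/*
 * A minimal example of a table line in the format described above
 * (start/length, device numbers and repeat counts are invented for
 * illustration):
 *
 *   0 71014400 multipath 0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *                                round-robin 0 2 1 8:48 1000 8:64 1000
 *
 * i.e. no feature args, no hw_handler args, two priority groups using the
 * round-robin selector, each with two paths and one per-path selector arg
 * (the repeat count).
 */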
0838 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
0839                    struct dm_target *ti)
0840 {
0841     int r;
0842     struct path_selector_type *pst;
0843     unsigned ps_argc;
0844 
0845     static const struct dm_arg _args[] = {
0846         {0, 1024, "invalid number of path selector args"},
0847     };
0848 
0849     pst = dm_get_path_selector(dm_shift_arg(as));
0850     if (!pst) {
0851         ti->error = "unknown path selector type";
0852         return -EINVAL;
0853     }
0854 
0855     r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
0856     if (r) {
0857         dm_put_path_selector(pst);
0858         return -EINVAL;
0859     }
0860 
0861     r = pst->create(&pg->ps, ps_argc, as->argv);
0862     if (r) {
0863         dm_put_path_selector(pst);
0864         ti->error = "path selector constructor failed";
0865         return r;
0866     }
0867 
0868     pg->ps.type = pst;
0869     dm_consume_args(as, ps_argc);
0870 
0871     return 0;
0872 }
0873 
0874 static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
0875              const char **attached_handler_name, char **error)
0876 {
0877     struct request_queue *q = bdev_get_queue(bdev);
0878     int r;
0879 
0880     if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) {
0881 retain:
0882         if (*attached_handler_name) {
0883             /*
0884              * Clear any hw_handler_params associated with a
0885              * handler that isn't already attached.
0886              */
0887             if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
0888                 kfree(m->hw_handler_params);
0889                 m->hw_handler_params = NULL;
0890             }
0891 
0892             /*
0893              * Reset hw_handler_name to match the attached handler
0894              *
0895              * NB. This modifies the table line to show the actual
0896              * handler instead of the original table passed in.
0897              */
0898             kfree(m->hw_handler_name);
0899             m->hw_handler_name = *attached_handler_name;
0900             *attached_handler_name = NULL;
0901         }
0902     }
0903 
0904     if (m->hw_handler_name) {
0905         r = scsi_dh_attach(q, m->hw_handler_name);
0906         if (r == -EBUSY) {
0907             DMINFO("retaining handler on device %pg", bdev);
0908             goto retain;
0909         }
0910         if (r < 0) {
0911             *error = "error attaching hardware handler";
0912             return r;
0913         }
0914 
0915         if (m->hw_handler_params) {
0916             r = scsi_dh_set_params(q, m->hw_handler_params);
0917             if (r < 0) {
0918                 *error = "unable to set hardware handler parameters";
0919                 return r;
0920             }
0921         }
0922     }
0923 
0924     return 0;
0925 }
0926 
0927 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
0928                  struct dm_target *ti)
0929 {
0930     int r;
0931     struct pgpath *p;
0932     struct multipath *m = ti->private;
0933     struct request_queue *q;
0934     const char *attached_handler_name = NULL;
0935 
0936     /* we need at least a path arg */
0937     if (as->argc < 1) {
0938         ti->error = "no device given";
0939         return ERR_PTR(-EINVAL);
0940     }
0941 
0942     p = alloc_pgpath();
0943     if (!p)
0944         return ERR_PTR(-ENOMEM);
0945 
0946     r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
0947               &p->path.dev);
0948     if (r) {
0949         ti->error = "error getting device";
0950         goto bad;
0951     }
0952 
0953     q = bdev_get_queue(p->path.dev->bdev);
0954     attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
0955     if (attached_handler_name || m->hw_handler_name) {
0956         INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
0957         r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
0958         kfree(attached_handler_name);
0959         if (r) {
0960             dm_put_device(ti, p->path.dev);
0961             goto bad;
0962         }
0963     }
0964 
0965     r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
0966     if (r) {
0967         dm_put_device(ti, p->path.dev);
0968         goto bad;
0969     }
0970 
0971     return p;
0972  bad:
0973     free_pgpath(p);
0974     return ERR_PTR(r);
0975 }
0976 
0977 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
0978                            struct multipath *m)
0979 {
0980     static const struct dm_arg _args[] = {
0981         {1, 1024, "invalid number of paths"},
0982         {0, 1024, "invalid number of selector args"}
0983     };
0984 
0985     int r;
0986     unsigned i, nr_selector_args, nr_args;
0987     struct priority_group *pg;
0988     struct dm_target *ti = m->ti;
0989 
0990     if (as->argc < 2) {
0991         as->argc = 0;
0992         ti->error = "not enough priority group arguments";
0993         return ERR_PTR(-EINVAL);
0994     }
0995 
0996     pg = alloc_priority_group();
0997     if (!pg) {
0998         ti->error = "couldn't allocate priority group";
0999         return ERR_PTR(-ENOMEM);
1000     }
1001     pg->m = m;
1002 
1003     r = parse_path_selector(as, pg, ti);
1004     if (r)
1005         goto bad;
1006 
1007     /*
1008      * read the paths
1009      */
1010     r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
1011     if (r)
1012         goto bad;
1013 
1014     r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
1015     if (r)
1016         goto bad;
1017 
1018     nr_args = 1 + nr_selector_args;
1019     for (i = 0; i < pg->nr_pgpaths; i++) {
1020         struct pgpath *pgpath;
1021         struct dm_arg_set path_args;
1022 
1023         if (as->argc < nr_args) {
1024             ti->error = "not enough path parameters";
1025             r = -EINVAL;
1026             goto bad;
1027         }
1028 
1029         path_args.argc = nr_args;
1030         path_args.argv = as->argv;
1031 
1032         pgpath = parse_path(&path_args, &pg->ps, ti);
1033         if (IS_ERR(pgpath)) {
1034             r = PTR_ERR(pgpath);
1035             goto bad;
1036         }
1037 
1038         pgpath->pg = pg;
1039         list_add_tail(&pgpath->list, &pg->pgpaths);
1040         dm_consume_args(as, nr_args);
1041     }
1042 
1043     return pg;
1044 
1045  bad:
1046     free_priority_group(pg, ti);
1047     return ERR_PTR(r);
1048 }
1049 
1050 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
1051 {
1052     unsigned hw_argc;
1053     int ret;
1054     struct dm_target *ti = m->ti;
1055 
1056     static const struct dm_arg _args[] = {
1057         {0, 1024, "invalid number of hardware handler args"},
1058     };
1059 
1060     if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
1061         return -EINVAL;
1062 
1063     if (!hw_argc)
1064         return 0;
1065 
1066     if (m->queue_mode == DM_TYPE_BIO_BASED) {
1067         dm_consume_args(as, hw_argc);
1068         DMERR("bio-based multipath doesn't allow hardware handler args");
1069         return 0;
1070     }
1071 
1072     m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
1073     if (!m->hw_handler_name)
1074         return -EINVAL;
1075 
1076     if (hw_argc > 1) {
1077         char *p;
1078         int i, j, len = 4;
1079 
1080         for (i = 0; i <= hw_argc - 2; i++)
1081             len += strlen(as->argv[i]) + 1;
1082         p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
1083         if (!p) {
1084             ti->error = "memory allocation failed";
1085             ret = -ENOMEM;
1086             goto fail;
1087         }
1088         j = sprintf(p, "%d", hw_argc - 1);
1089         for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
1090             j = sprintf(p, "%s", as->argv[i]);
1091     }
1092     dm_consume_args(as, hw_argc - 1);
1093 
1094     return 0;
1095 fail:
1096     kfree(m->hw_handler_name);
1097     m->hw_handler_name = NULL;
1098     return ret;
1099 }
1100 
1101 static int parse_features(struct dm_arg_set *as, struct multipath *m)
1102 {
1103     int r;
1104     unsigned argc;
1105     struct dm_target *ti = m->ti;
1106     const char *arg_name;
1107 
1108     static const struct dm_arg _args[] = {
1109         {0, 8, "invalid number of feature args"},
1110         {1, 50, "pg_init_retries must be between 1 and 50"},
1111         {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
1112     };
1113 
1114     r = dm_read_arg_group(_args, as, &argc, &ti->error);
1115     if (r)
1116         return -EINVAL;
1117 
1118     if (!argc)
1119         return 0;
1120 
1121     do {
1122         arg_name = dm_shift_arg(as);
1123         argc--;
1124 
1125         if (!strcasecmp(arg_name, "queue_if_no_path")) {
1126             r = queue_if_no_path(m, true, false, __func__);
1127             continue;
1128         }
1129 
1130         if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
1131             set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
1132             continue;
1133         }
1134 
1135         if (!strcasecmp(arg_name, "pg_init_retries") &&
1136             (argc >= 1)) {
1137             r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
1138             argc--;
1139             continue;
1140         }
1141 
1142         if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
1143             (argc >= 1)) {
1144             r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
1145             argc--;
1146             continue;
1147         }
1148 
1149         if (!strcasecmp(arg_name, "queue_mode") &&
1150             (argc >= 1)) {
1151             const char *queue_mode_name = dm_shift_arg(as);
1152 
1153             if (!strcasecmp(queue_mode_name, "bio"))
1154                 m->queue_mode = DM_TYPE_BIO_BASED;
1155             else if (!strcasecmp(queue_mode_name, "rq") ||
1156                  !strcasecmp(queue_mode_name, "mq"))
1157                 m->queue_mode = DM_TYPE_REQUEST_BASED;
1158             else {
1159                 ti->error = "Unknown 'queue_mode' requested";
1160                 r = -EINVAL;
1161             }
1162             argc--;
1163             continue;
1164         }
1165 
1166         ti->error = "Unrecognised multipath feature request";
1167         r = -EINVAL;
1168     } while (argc && !r);
1169 
1170     return r;
1171 }
1172 
1173 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
1174 {
1175     /* target arguments */
1176     static const struct dm_arg _args[] = {
1177         {0, 1024, "invalid number of priority groups"},
1178         {0, 1024, "invalid initial priority group number"},
1179     };
1180 
1181     int r;
1182     struct multipath *m;
1183     struct dm_arg_set as;
1184     unsigned pg_count = 0;
1185     unsigned next_pg_num;
1186     unsigned long flags;
1187 
1188     as.argc = argc;
1189     as.argv = argv;
1190 
1191     m = alloc_multipath(ti);
1192     if (!m) {
1193         ti->error = "can't allocate multipath";
1194         return -EINVAL;
1195     }
1196 
1197     r = parse_features(&as, m);
1198     if (r)
1199         goto bad;
1200 
1201     r = alloc_multipath_stage2(ti, m);
1202     if (r)
1203         goto bad;
1204 
1205     r = parse_hw_handler(&as, m);
1206     if (r)
1207         goto bad;
1208 
1209     r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
1210     if (r)
1211         goto bad;
1212 
1213     r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
1214     if (r)
1215         goto bad;
1216 
1217     if ((!m->nr_priority_groups && next_pg_num) ||
1218         (m->nr_priority_groups && !next_pg_num)) {
1219         ti->error = "invalid initial priority group";
1220         r = -EINVAL;
1221         goto bad;
1222     }
1223 
1224     /* parse the priority groups */
1225     while (as.argc) {
1226         struct priority_group *pg;
1227         unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
1228 
1229         pg = parse_priority_group(&as, m);
1230         if (IS_ERR(pg)) {
1231             r = PTR_ERR(pg);
1232             goto bad;
1233         }
1234 
1235         nr_valid_paths += pg->nr_pgpaths;
1236         atomic_set(&m->nr_valid_paths, nr_valid_paths);
1237 
1238         list_add_tail(&pg->list, &m->priority_groups);
1239         pg_count++;
1240         pg->pg_num = pg_count;
1241         if (!--next_pg_num)
1242             m->next_pg = pg;
1243     }
1244 
1245     if (pg_count != m->nr_priority_groups) {
1246         ti->error = "priority group count mismatch";
1247         r = -EINVAL;
1248         goto bad;
1249     }
1250 
1251     spin_lock_irqsave(&m->lock, flags);
1252     enable_nopath_timeout(m);
1253     spin_unlock_irqrestore(&m->lock, flags);
1254 
1255     ti->num_flush_bios = 1;
1256     ti->num_discard_bios = 1;
1257     ti->num_write_zeroes_bios = 1;
1258     if (m->queue_mode == DM_TYPE_BIO_BASED)
1259         ti->per_io_data_size = multipath_per_bio_data_size();
1260     else
1261         ti->per_io_data_size = sizeof(struct dm_mpath_io);
1262 
1263     return 0;
1264 
1265  bad:
1266     free_multipath(m);
1267     return r;
1268 }
1269 
1270 static void multipath_wait_for_pg_init_completion(struct multipath *m)
1271 {
1272     DEFINE_WAIT(wait);
1273 
1274     while (1) {
1275         prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
1276 
1277         if (!atomic_read(&m->pg_init_in_progress))
1278             break;
1279 
1280         io_schedule();
1281     }
1282     finish_wait(&m->pg_init_wait, &wait);
1283 }
1284 
1285 static void flush_multipath_work(struct multipath *m)
1286 {
1287     if (m->hw_handler_name) {
1288         unsigned long flags;
1289 
1290         if (!atomic_read(&m->pg_init_in_progress))
1291             goto skip;
1292 
1293         spin_lock_irqsave(&m->lock, flags);
1294         if (atomic_read(&m->pg_init_in_progress) &&
1295             !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
1296             spin_unlock_irqrestore(&m->lock, flags);
1297 
1298             flush_workqueue(kmpath_handlerd);
1299             multipath_wait_for_pg_init_completion(m);
1300 
1301             spin_lock_irqsave(&m->lock, flags);
1302             clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1303         }
1304         spin_unlock_irqrestore(&m->lock, flags);
1305     }
1306 skip:
1307     if (m->queue_mode == DM_TYPE_BIO_BASED)
1308         flush_work(&m->process_queued_bios);
1309     flush_work(&m->trigger_event);
1310 }
1311 
1312 static void multipath_dtr(struct dm_target *ti)
1313 {
1314     struct multipath *m = ti->private;
1315 
1316     disable_nopath_timeout(m);
1317     flush_multipath_work(m);
1318     free_multipath(m);
1319 }
1320 
1321 /*
1322  * Take a path out of use.
1323  */
1324 static int fail_path(struct pgpath *pgpath)
1325 {
1326     unsigned long flags;
1327     struct multipath *m = pgpath->pg->m;
1328 
1329     spin_lock_irqsave(&m->lock, flags);
1330 
1331     if (!pgpath->is_active)
1332         goto out;
1333 
1334     DMWARN("%s: Failing path %s.",
1335            dm_table_device_name(m->ti->table),
1336            pgpath->path.dev->name);
1337 
1338     pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1339     pgpath->is_active = false;
1340     pgpath->fail_count++;
1341 
1342     atomic_dec(&m->nr_valid_paths);
1343 
1344     if (pgpath == m->current_pgpath)
1345         m->current_pgpath = NULL;
1346 
1347     dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1348                pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
1349 
1350     schedule_work(&m->trigger_event);
1351 
1352     enable_nopath_timeout(m);
1353 
1354 out:
1355     spin_unlock_irqrestore(&m->lock, flags);
1356 
1357     return 0;
1358 }
1359 
1360 /*
1361  * Reinstate a previously-failed path
1362  */
1363 static int reinstate_path(struct pgpath *pgpath)
1364 {
1365     int r = 0, run_queue = 0;
1366     unsigned long flags;
1367     struct multipath *m = pgpath->pg->m;
1368     unsigned nr_valid_paths;
1369 
1370     spin_lock_irqsave(&m->lock, flags);
1371 
1372     if (pgpath->is_active)
1373         goto out;
1374 
1375     DMWARN("%s: Reinstating path %s.",
1376            dm_table_device_name(m->ti->table),
1377            pgpath->path.dev->name);
1378 
1379     r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1380     if (r)
1381         goto out;
1382 
1383     pgpath->is_active = true;
1384 
1385     nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1386     if (nr_valid_paths == 1) {
1387         m->current_pgpath = NULL;
1388         run_queue = 1;
1389     } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1390         if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1391             atomic_inc(&m->pg_init_in_progress);
1392     }
1393 
1394     dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1395                pgpath->path.dev->name, nr_valid_paths);
1396 
1397     schedule_work(&m->trigger_event);
1398 
1399 out:
1400     spin_unlock_irqrestore(&m->lock, flags);
1401     if (run_queue) {
1402         dm_table_run_md_queue_async(m->ti->table);
1403         process_queued_io_list(m);
1404     }
1405 
1406     if (pgpath->is_active)
1407         disable_nopath_timeout(m);
1408 
1409     return r;
1410 }
1411 
1412 /*
1413  * Fail or reinstate all paths that match the provided struct dm_dev.
1414  */
1415 static int action_dev(struct multipath *m, struct dm_dev *dev,
1416               action_fn action)
1417 {
1418     int r = -EINVAL;
1419     struct pgpath *pgpath;
1420     struct priority_group *pg;
1421 
1422     list_for_each_entry(pg, &m->priority_groups, list) {
1423         list_for_each_entry(pgpath, &pg->pgpaths, list) {
1424             if (pgpath->path.dev == dev)
1425                 r = action(pgpath);
1426         }
1427     }
1428 
1429     return r;
1430 }
1431 
1432 /*
1433  * Temporarily try to avoid having to use the specified PG
1434  */
1435 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1436               bool bypassed)
1437 {
1438     unsigned long flags;
1439 
1440     spin_lock_irqsave(&m->lock, flags);
1441 
1442     pg->bypassed = bypassed;
1443     m->current_pgpath = NULL;
1444     m->current_pg = NULL;
1445 
1446     spin_unlock_irqrestore(&m->lock, flags);
1447 
1448     schedule_work(&m->trigger_event);
1449 }
1450 
1451 /*
1452  * Switch to using the specified PG from the next I/O that gets mapped
1453  */
1454 static int switch_pg_num(struct multipath *m, const char *pgstr)
1455 {
1456     struct priority_group *pg;
1457     unsigned pgnum;
1458     unsigned long flags;
1459     char dummy;
1460 
1461     if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1462         !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1463         DMWARN("invalid PG number supplied to switch_pg_num");
1464         return -EINVAL;
1465     }
1466 
1467     spin_lock_irqsave(&m->lock, flags);
1468     list_for_each_entry(pg, &m->priority_groups, list) {
1469         pg->bypassed = false;
1470         if (--pgnum)
1471             continue;
1472 
1473         m->current_pgpath = NULL;
1474         m->current_pg = NULL;
1475         m->next_pg = pg;
1476     }
1477     spin_unlock_irqrestore(&m->lock, flags);
1478 
1479     schedule_work(&m->trigger_event);
1480     return 0;
1481 }
1482 
1483 /*
1484  * Set/clear bypassed status of a PG.
1485  * PGs are numbered upwards from 1 in the order they were declared.
1486  */
1487 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
1488 {
1489     struct priority_group *pg;
1490     unsigned pgnum;
1491     char dummy;
1492 
1493     if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1494         !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1495         DMWARN("invalid PG number supplied to bypass_pg");
1496         return -EINVAL;
1497     }
1498 
1499     list_for_each_entry(pg, &m->priority_groups, list) {
1500         if (!--pgnum)
1501             break;
1502     }
1503 
1504     bypass_pg(m, pg, bypassed);
1505     return 0;
1506 }
1507 
1508 /*
1509  * Should we retry pg_init immediately?
1510  */
1511 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1512 {
1513     unsigned long flags;
1514     bool limit_reached = false;
1515 
1516     spin_lock_irqsave(&m->lock, flags);
1517 
1518     if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1519         !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1520         set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1521     else
1522         limit_reached = true;
1523 
1524     spin_unlock_irqrestore(&m->lock, flags);
1525 
1526     return limit_reached;
1527 }
1528 
1529 static void pg_init_done(void *data, int errors)
1530 {
1531     struct pgpath *pgpath = data;
1532     struct priority_group *pg = pgpath->pg;
1533     struct multipath *m = pg->m;
1534     unsigned long flags;
1535     bool delay_retry = false;
1536 
1537     /* device or driver problems */
1538     switch (errors) {
1539     case SCSI_DH_OK:
1540         break;
1541     case SCSI_DH_NOSYS:
1542         if (!m->hw_handler_name) {
1543             errors = 0;
1544             break;
1545         }
1546         DMERR("Could not failover the device: Handler scsi_dh_%s "
1547               "Error %d.", m->hw_handler_name, errors);
1548         /*
1549          * Fail path for now, so we do not ping pong
1550          */
1551         fail_path(pgpath);
1552         break;
1553     case SCSI_DH_DEV_TEMP_BUSY:
1554         /*
1555          * Probably doing something like FW upgrade on the
1556          * controller so try the other pg.
1557          */
1558         bypass_pg(m, pg, true);
1559         break;
1560     case SCSI_DH_RETRY:
1561         /* Wait before retrying. */
1562         delay_retry = true;
1563         fallthrough;
1564     case SCSI_DH_IMM_RETRY:
1565     case SCSI_DH_RES_TEMP_UNAVAIL:
1566         if (pg_init_limit_reached(m, pgpath))
1567             fail_path(pgpath);
1568         errors = 0;
1569         break;
1570     case SCSI_DH_DEV_OFFLINED:
1571     default:
1572         /*
1573          * We probably do not want to fail the path for a device
1574          * error, but this is what the old dm did. In future
1575          * patches we can do more advanced handling.
1576          */
1577         fail_path(pgpath);
1578     }
1579 
1580     spin_lock_irqsave(&m->lock, flags);
1581     if (errors) {
1582         if (pgpath == m->current_pgpath) {
1583             DMERR("Could not failover device. Error %d.", errors);
1584             m->current_pgpath = NULL;
1585             m->current_pg = NULL;
1586         }
1587     } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1588         pg->bypassed = false;
1589 
1590     if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1591         /* Activations of other paths are still on going */
1592         goto out;
1593 
1594     if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1595         if (delay_retry)
1596             set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1597         else
1598             clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1599 
1600         if (__pg_init_all_paths(m))
1601             goto out;
1602     }
1603     clear_bit(MPATHF_QUEUE_IO, &m->flags);
1604 
1605     process_queued_io_list(m);
1606 
1607     /*
1608      * Wake up any thread waiting to suspend.
1609      */
1610     wake_up(&m->pg_init_wait);
1611 
1612 out:
1613     spin_unlock_irqrestore(&m->lock, flags);
1614 }
1615 
1616 static void activate_or_offline_path(struct pgpath *pgpath)
1617 {
1618     struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1619 
1620     if (pgpath->is_active && !blk_queue_dying(q))
1621         scsi_dh_activate(q, pg_init_done, pgpath);
1622     else
1623         pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1624 }
1625 
1626 static void activate_path_work(struct work_struct *work)
1627 {
1628     struct pgpath *pgpath =
1629         container_of(work, struct pgpath, activate_path.work);
1630 
1631     activate_or_offline_path(pgpath);
1632 }
1633 
1634 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1635                 blk_status_t error, union map_info *map_context)
1636 {
1637     struct dm_mpath_io *mpio = get_mpio(map_context);
1638     struct pgpath *pgpath = mpio->pgpath;
1639     int r = DM_ENDIO_DONE;
1640 
1641     /*
1642      * We don't queue any clone request inside the multipath target
1643      * during end I/O handling, since those clone requests don't have
1644      * bio clones.  If we queue them inside the multipath target,
1645      * we need to make bio clones, that requires memory allocation.
1646      * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
1647      *  don't have bio clones.)
1648      * Instead of queueing the clone request here, we queue the original
1649      * request into dm core, which will remake a clone request and
1650      * clone bios for it and resubmit it later.
1651      */
1652     if (error && blk_path_error(error)) {
1653         struct multipath *m = ti->private;
1654 
1655         if (error == BLK_STS_RESOURCE)
1656             r = DM_ENDIO_DELAY_REQUEUE;
1657         else
1658             r = DM_ENDIO_REQUEUE;
1659 
1660         if (pgpath)
1661             fail_path(pgpath);
1662 
1663         if (!atomic_read(&m->nr_valid_paths) &&
1664             !must_push_back_rq(m)) {
1665             if (error == BLK_STS_IOERR)
1666                 dm_report_EIO(m);
1667             /* complete with the original error */
1668             r = DM_ENDIO_DONE;
1669         }
1670     }
1671 
1672     if (pgpath) {
1673         struct path_selector *ps = &pgpath->pg->ps;
1674 
1675         if (ps->type->end_io)
1676             ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1677                      clone->io_start_time_ns);
1678     }
1679 
1680     return r;
1681 }
1682 
1683 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1684                 blk_status_t *error)
1685 {
1686     struct multipath *m = ti->private;
1687     struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
1688     struct pgpath *pgpath = mpio->pgpath;
1689     unsigned long flags;
1690     int r = DM_ENDIO_DONE;
1691 
1692     if (!*error || !blk_path_error(*error))
1693         goto done;
1694 
1695     if (pgpath)
1696         fail_path(pgpath);
1697 
1698     if (!atomic_read(&m->nr_valid_paths)) {
1699         spin_lock_irqsave(&m->lock, flags);
1700         if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1701             if (__must_push_back(m)) {
1702                 r = DM_ENDIO_REQUEUE;
1703             } else {
1704                 dm_report_EIO(m);
1705                 *error = BLK_STS_IOERR;
1706             }
1707             spin_unlock_irqrestore(&m->lock, flags);
1708             goto done;
1709         }
1710         spin_unlock_irqrestore(&m->lock, flags);
1711     }
1712 
1713     multipath_queue_bio(m, clone);
1714     r = DM_ENDIO_INCOMPLETE;
1715 done:
1716     if (pgpath) {
1717         struct path_selector *ps = &pgpath->pg->ps;
1718 
1719         if (ps->type->end_io)
1720             ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1721                      (mpio->start_time_ns ?:
1722                       dm_start_time_ns_from_clone(clone)));
1723     }
1724 
1725     return r;
1726 }
1727 
1728 /*
1729  * Suspend with flush can't complete until all the I/O is processed,
1730  * so if the last path fails we must error any remaining I/O.
1731  * - Note that if freeze_bdev fails while suspending, the
1732  *   queue_if_no_path state is lost - userspace should reset it.
1733  * Otherwise, during noflush suspend, queue_if_no_path will not change.
1734  */
1735 static void multipath_presuspend(struct dm_target *ti)
1736 {
1737     struct multipath *m = ti->private;
1738 
1739     /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
1740     if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
1741         queue_if_no_path(m, false, true, __func__);
1742 }
1743 
1744 static void multipath_postsuspend(struct dm_target *ti)
1745 {
1746     struct multipath *m = ti->private;
1747 
1748     mutex_lock(&m->work_mutex);
1749     flush_multipath_work(m);
1750     mutex_unlock(&m->work_mutex);
1751 }
1752 
1753 /*
1754  * Restore the queue_if_no_path setting.
1755  */
1756 static void multipath_resume(struct dm_target *ti)
1757 {
1758     struct multipath *m = ti->private;
1759     unsigned long flags;
1760 
1761     spin_lock_irqsave(&m->lock, flags);
1762     if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
1763         set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1764         clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
1765     }
1766 
1767     DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
1768         dm_table_device_name(m->ti->table), __func__,
1769         test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
1770         test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
1771 
1772     spin_unlock_irqrestore(&m->lock, flags);
1773 }
1774 
1775 /*
1776  * Info output has the following format:
1777  * num_multipath_feature_args [multipath_feature_args]*
1778  * num_handler_status_args [handler_status_args]*
1779  * num_groups init_group_number
1780  *            [A|D|E num_ps_status_args [ps_status_args]*
1781  *             num_paths num_selector_args
1782  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1783  *
1784  * Table output has the following format (identical to the constructor string):
1785  * num_feature_args [features_args]*
1786  * num_handler_args hw_handler [hw_handler_args]*
1787  * num_groups init_group_number
1788  *     [priority selector-name num_ps_args [ps_args]*
1789  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1790  */
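/*
 * Example (illustrative only): for a single priority group with two paths
 * (the device numbers "8:16"/"8:32" and the no-argument path selector
 * "example-ps" are hypothetical), the two formats above would look
 * roughly like:
 *
 *   Table: 1 queue_if_no_path 0 1 1 example-ps 0 2 0 8:16 8:32
 *   Info:  2 0 0 0 1 1 A 0 2 0 8:16 A 0 8:32 A 0
 *
 * Actual output depends on the loaded table and the path selector in use.
 */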
1791 static void multipath_status(struct dm_target *ti, status_type_t type,
1792                  unsigned status_flags, char *result, unsigned maxlen)
1793 {
1794     int sz = 0, pg_counter, pgpath_counter;
1795     unsigned long flags;
1796     struct multipath *m = ti->private;
1797     struct priority_group *pg;
1798     struct pgpath *p;
1799     unsigned pg_num;
1800     char state;
1801 
1802     spin_lock_irqsave(&m->lock, flags);
1803 
1804     /* Features */
1805     if (type == STATUSTYPE_INFO)
1806         DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1807                atomic_read(&m->pg_init_count));
1808     else {
1809         DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1810                   (m->pg_init_retries > 0) * 2 +
1811                   (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1812                   test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
1813                   (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
1814 
1815         if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1816             DMEMIT("queue_if_no_path ");
1817         if (m->pg_init_retries)
1818             DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1819         if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1820             DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1821         if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1822             DMEMIT("retain_attached_hw_handler ");
1823         if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
1824             switch (m->queue_mode) {
1825             case DM_TYPE_BIO_BASED:
1826                 DMEMIT("queue_mode bio ");
1827                 break;
1828             default:
1829                 WARN_ON_ONCE(true);
1830                 break;
1831             }
1832         }
1833     }
1834 
1835     if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1836         DMEMIT("0 ");
1837     else
1838         DMEMIT("1 %s ", m->hw_handler_name);
1839 
1840     DMEMIT("%u ", m->nr_priority_groups);
1841 
1842     if (m->next_pg)
1843         pg_num = m->next_pg->pg_num;
1844     else if (m->current_pg)
1845         pg_num = m->current_pg->pg_num;
1846     else
1847         pg_num = (m->nr_priority_groups ? 1 : 0);
1848 
1849     DMEMIT("%u ", pg_num);
1850 
1851     switch (type) {
1852     case STATUSTYPE_INFO:
1853         list_for_each_entry(pg, &m->priority_groups, list) {
1854             if (pg->bypassed)
1855                 state = 'D';    /* Disabled */
1856             else if (pg == m->current_pg)
1857                 state = 'A';    /* Currently Active */
1858             else
1859                 state = 'E';    /* Enabled */
1860 
1861             DMEMIT("%c ", state);
1862 
1863             if (pg->ps.type->status)
1864                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1865                               result + sz,
1866                               maxlen - sz);
1867             else
1868                 DMEMIT("0 ");
1869 
1870             DMEMIT("%u %u ", pg->nr_pgpaths,
1871                    pg->ps.type->info_args);
1872 
1873             list_for_each_entry(p, &pg->pgpaths, list) {
1874                 DMEMIT("%s %s %u ", p->path.dev->name,
1875                        p->is_active ? "A" : "F",
1876                        p->fail_count);
1877                 if (pg->ps.type->status)
1878                     sz += pg->ps.type->status(&pg->ps,
1879                           &p->path, type, result + sz,
1880                           maxlen - sz);
1881             }
1882         }
1883         break;
1884 
1885     case STATUSTYPE_TABLE:
1886         list_for_each_entry(pg, &m->priority_groups, list) {
1887             DMEMIT("%s ", pg->ps.type->name);
1888 
1889             if (pg->ps.type->status)
1890                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1891                               result + sz,
1892                               maxlen - sz);
1893             else
1894                 DMEMIT("0 ");
1895 
1896             DMEMIT("%u %u ", pg->nr_pgpaths,
1897                    pg->ps.type->table_args);
1898 
1899             list_for_each_entry(p, &pg->pgpaths, list) {
1900                 DMEMIT("%s ", p->path.dev->name);
1901                 if (pg->ps.type->status)
1902                     sz += pg->ps.type->status(&pg->ps,
1903                           &p->path, type, result + sz,
1904                           maxlen - sz);
1905             }
1906         }
1907         break;
1908 
1909     case STATUSTYPE_IMA:
1910         sz = 0; /* reset the result pointer */
1911 
1912         DMEMIT_TARGET_NAME_VERSION(ti->type);
1913         DMEMIT(",nr_priority_groups=%u", m->nr_priority_groups);
1914 
1915         pg_counter = 0;
1916         list_for_each_entry(pg, &m->priority_groups, list) {
1917             if (pg->bypassed)
1918                 state = 'D';    /* Disabled */
1919             else if (pg == m->current_pg)
1920                 state = 'A';    /* Currently Active */
1921             else
1922                 state = 'E';    /* Enabled */
1923             DMEMIT(",pg_state_%d=%c", pg_counter, state);
1924             DMEMIT(",nr_pgpaths_%d=%u", pg_counter, pg->nr_pgpaths);
1925             DMEMIT(",path_selector_name_%d=%s", pg_counter, pg->ps.type->name);
1926 
1927             pgpath_counter = 0;
1928             list_for_each_entry(p, &pg->pgpaths, list) {
1929                 DMEMIT(",path_name_%d_%d=%s,is_active_%d_%d=%c,fail_count_%d_%d=%u",
1930                        pg_counter, pgpath_counter, p->path.dev->name,
1931                        pg_counter, pgpath_counter, p->is_active ? 'A' : 'F',
1932                        pg_counter, pgpath_counter, p->fail_count);
1933                 if (pg->ps.type->status) {
1934                     DMEMIT(",path_selector_status_%d_%d=",
1935                            pg_counter, pgpath_counter);
1936                     sz += pg->ps.type->status(&pg->ps, &p->path,
1937                                   type, result + sz,
1938                                   maxlen - sz);
1939                 }
1940                 pgpath_counter++;
1941             }
1942             pg_counter++;
1943         }
1944         DMEMIT(";");
1945         break;
1946     }
1947 
1948     spin_unlock_irqrestore(&m->lock, flags);
1949 }
1950 
1951 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
1952                  char *result, unsigned maxlen)
1953 {
1954     int r = -EINVAL;
1955     struct dm_dev *dev;
1956     struct multipath *m = ti->private;
1957     action_fn action;
1958     unsigned long flags;
1959 
1960     mutex_lock(&m->work_mutex);
1961 
1962     if (dm_suspended(ti)) {
1963         r = -EBUSY;
1964         goto out;
1965     }
1966 
1967     if (argc == 1) {
1968         if (!strcasecmp(argv[0], "queue_if_no_path")) {
1969             r = queue_if_no_path(m, true, false, __func__);
1970             spin_lock_irqsave(&m->lock, flags);
1971             enable_nopath_timeout(m);
1972             spin_unlock_irqrestore(&m->lock, flags);
1973             goto out;
1974         } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1975             r = queue_if_no_path(m, false, false, __func__);
1976             disable_nopath_timeout(m);
1977             goto out;
1978         }
1979     }
1980 
1981     if (argc != 2) {
1982         DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1983         goto out;
1984     }
1985 
1986     if (!strcasecmp(argv[0], "disable_group")) {
1987         r = bypass_pg_num(m, argv[1], true);
1988         goto out;
1989     } else if (!strcasecmp(argv[0], "enable_group")) {
1990         r = bypass_pg_num(m, argv[1], false);
1991         goto out;
1992     } else if (!strcasecmp(argv[0], "switch_group")) {
1993         r = switch_pg_num(m, argv[1]);
1994         goto out;
1995     } else if (!strcasecmp(argv[0], "reinstate_path"))
1996         action = reinstate_path;
1997     else if (!strcasecmp(argv[0], "fail_path"))
1998         action = fail_path;
1999     else {
2000         DMWARN("Unrecognised multipath message received: %s", argv[0]);
2001         goto out;
2002     }
2003 
2004     r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
2005     if (r) {
2006         DMWARN("message: error getting device %s",
2007                argv[1]);
2008         goto out;
2009     }
2010 
2011     r = action_dev(m, dev, action);
2012 
2013     dm_put_device(ti, dev);
2014 
2015 out:
2016     mutex_unlock(&m->work_mutex);
2017     return r;
2018 }
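/*
 * Example (illustrative only; the device name "mpatha" and path "/dev/sdc"
 * are hypothetical): the messages handled above are normally sent with
 * dmsetup, e.g.
 *
 *   dmsetup message mpatha 0 queue_if_no_path
 *   dmsetup message mpatha 0 fail_path /dev/sdc
 *   dmsetup message mpatha 0 switch_group 2
 */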
2019 
2020 static int multipath_prepare_ioctl(struct dm_target *ti,
2021                    struct block_device **bdev)
2022 {
2023     struct multipath *m = ti->private;
2024     struct pgpath *pgpath;
2025     unsigned long flags;
2026     int r;
2027 
2028     pgpath = READ_ONCE(m->current_pgpath);
2029     if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
2030         pgpath = choose_pgpath(m, 0);
2031 
2032     if (pgpath) {
2033         if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) {
2034             *bdev = pgpath->path.dev->bdev;
2035             r = 0;
2036         } else {
2037             /* pg_init has not started or completed */
2038             r = -ENOTCONN;
2039         }
2040     } else {
2041         /* No path is available */
2042         r = -EIO;
2043         spin_lock_irqsave(&m->lock, flags);
2044         if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
2045             r = -ENOTCONN;
2046         spin_unlock_irqrestore(&m->lock, flags);
2047     }
2048 
2049     if (r == -ENOTCONN) {
2050         if (!READ_ONCE(m->current_pg)) {
2051             /* Path status changed, redo selection */
2052             (void) choose_pgpath(m, 0);
2053         }
2054         spin_lock_irqsave(&m->lock, flags);
2055         if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
2056             (void) __pg_init_all_paths(m);
2057         spin_unlock_irqrestore(&m->lock, flags);
2058         dm_table_run_md_queue_async(m->ti->table);
2059         process_queued_io_list(m);
2060     }
2061 
2062     /*
2063      * Only pass ioctls through if the device sizes match exactly.
2064      */
2065     if (!r && ti->len != bdev_nr_sectors((*bdev)))
2066         return 1;
2067     return r;
2068 }
2069 
2070 static int multipath_iterate_devices(struct dm_target *ti,
2071                      iterate_devices_callout_fn fn, void *data)
2072 {
2073     struct multipath *m = ti->private;
2074     struct priority_group *pg;
2075     struct pgpath *p;
2076     int ret = 0;
2077 
2078     list_for_each_entry(pg, &m->priority_groups, list) {
2079         list_for_each_entry(p, &pg->pgpaths, list) {
2080             ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
2081             if (ret)
2082                 goto out;
2083         }
2084     }
2085 
2086 out:
2087     return ret;
2088 }
2089 
2090 static int pgpath_busy(struct pgpath *pgpath)
2091 {
2092     struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
2093 
2094     return blk_lld_busy(q);
2095 }
2096 
2097 /*
2098  * We return "busy" only when we can map I/Os but the underlying devices
2099  * are busy (so even if we map I/Os now, the I/Os will wait on
2100  * the underlying queue).
2101  * In other words, if we want to error I/Os or queue them inside the
2102  * target due to map unavailability, we don't return "busy".  Otherwise,
2103  * dm core won't give us the I/Os and we can't do what we want with them.
2104  */
2105 static int multipath_busy(struct dm_target *ti)
2106 {
2107     bool busy = false, has_active = false;
2108     struct multipath *m = ti->private;
2109     struct priority_group *pg, *next_pg;
2110     struct pgpath *pgpath;
2111 
2112     /* pg_init in progress */
2113     if (atomic_read(&m->pg_init_in_progress))
2114         return true;
2115 
2116     /* no paths available, for blk-mq: rely on IO mapping to delay requeue */
2117     if (!atomic_read(&m->nr_valid_paths)) {
2118         unsigned long flags;
2119         spin_lock_irqsave(&m->lock, flags);
2120         if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
2121             spin_unlock_irqrestore(&m->lock, flags);
2122             return (m->queue_mode != DM_TYPE_REQUEST_BASED);
2123         }
2124         spin_unlock_irqrestore(&m->lock, flags);
2125     }
2126 
2127     /* Guess which priority_group will be used at next mapping time */
2128     pg = READ_ONCE(m->current_pg);
2129     next_pg = READ_ONCE(m->next_pg);
2130     if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
2131         pg = next_pg;
2132 
2133     if (!pg) {
2134         /*
2135          * We don't know which pg will be used at next mapping time.
2136          * We don't call choose_pgpath() here to avoid triggering
2137          * pg_init just by busy checking.
2138          * So we don't know whether the underlying devices we will be
2139          * using at next mapping time are busy or not. Just try mapping.
2140          */
2141         return busy;
2142     }
2143 
2144     /*
2145      * If there is at least one non-busy active path, the path selector
2146      * will be able to select it. So we consider such a pg as not busy.
2147      */
2148     busy = true;
2149     list_for_each_entry(pgpath, &pg->pgpaths, list) {
2150         if (pgpath->is_active) {
2151             has_active = true;
2152             if (!pgpath_busy(pgpath)) {
2153                 busy = false;
2154                 break;
2155             }
2156         }
2157     }
2158 
2159     if (!has_active) {
2160         /*
2161          * No active path in this pg, so this pg won't be used and
2162          * the current_pg will be changed at next mapping time.
2163          * We need to try mapping to determine it.
2164          */
2165         busy = false;
2166     }
2167 
2168     return busy;
2169 }
2170 
2171 /*-----------------------------------------------------------------
2172  * Module setup
2173  *---------------------------------------------------------------*/
2174 static struct target_type multipath_target = {
2175     .name = "multipath",
2176     .version = {1, 14, 0},
2177     .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
2178             DM_TARGET_PASSES_INTEGRITY,
2179     .module = THIS_MODULE,
2180     .ctr = multipath_ctr,
2181     .dtr = multipath_dtr,
2182     .clone_and_map_rq = multipath_clone_and_map,
2183     .release_clone_rq = multipath_release_clone,
2184     .rq_end_io = multipath_end_io,
2185     .map = multipath_map_bio,
2186     .end_io = multipath_end_io_bio,
2187     .presuspend = multipath_presuspend,
2188     .postsuspend = multipath_postsuspend,
2189     .resume = multipath_resume,
2190     .status = multipath_status,
2191     .message = multipath_message,
2192     .prepare_ioctl = multipath_prepare_ioctl,
2193     .iterate_devices = multipath_iterate_devices,
2194     .busy = multipath_busy,
2195 };
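/*
 * Example (illustrative only): once this target is registered, a device can
 * be created with a table line in the constructor format documented above
 * multipath_status().  The name, size, device numbers and selector below
 * are hypothetical:
 *
 *   dmsetup create mpatha --table \
 *     "0 2097152 multipath 1 queue_if_no_path 0 1 1 example-ps 0 2 0 8:16 8:32"
 */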
2196 
2197 static int __init dm_multipath_init(void)
2198 {
2199     int r;
2200 
2201     kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2202     if (!kmultipathd) {
2203         DMERR("failed to create workqueue kmpathd");
2204         r = -ENOMEM;
2205         goto bad_alloc_kmultipathd;
2206     }
2207 
2208     /*
2209      * A separate workqueue is used to handle the device handlers
2210      * to avoid overloading the existing workqueue. Overloading the
2211      * existing workqueue would also create a bottleneck in the
2212      * path of storage hardware device activation.
2213      */
2214     kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2215                           WQ_MEM_RECLAIM);
2216     if (!kmpath_handlerd) {
2217         DMERR("failed to create workqueue kmpath_handlerd");
2218         r = -ENOMEM;
2219         goto bad_alloc_kmpath_handlerd;
2220     }
2221 
2222     r = dm_register_target(&multipath_target);
2223     if (r < 0) {
2224         DMERR("failed to register multipath target: %d", r);
2225         r = -EINVAL;
2226         goto bad_register_target;
2227     }
2228 
2229     return 0;
2230 
2231 bad_register_target:
2232     destroy_workqueue(kmpath_handlerd);
2233 bad_alloc_kmpath_handlerd:
2234     destroy_workqueue(kmultipathd);
2235 bad_alloc_kmultipathd:
2236     return r;
2237 }
2238 
2239 static void __exit dm_multipath_exit(void)
2240 {
2241     destroy_workqueue(kmpath_handlerd);
2242     destroy_workqueue(kmultipathd);
2243 
2244     dm_unregister_target(&multipath_target);
2245 }
2246 
2247 module_init(dm_multipath_init);
2248 module_exit(dm_multipath_exit);
2249 
2250 module_param_named(queue_if_no_path_timeout_secs,
2251            queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
2252 MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
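/*
 * Example (illustrative only): the parameter can be set at module load time
 * or, since it is writable, changed at runtime via sysfs:
 *
 *   modprobe dm_multipath queue_if_no_path_timeout_secs=120
 *   echo 120 > /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs
 */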
2253 
2254 MODULE_DESCRIPTION(DM_NAME " multipath target");
2255 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2256 MODULE_LICENSE("GPL");