dm mpath: reinstate bio-based support
drivers/md/dm-mpath.c
1 /*
2  * Copyright (C) 2003 Sistina Software Limited.
3  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm-rq.h"
11 #include "dm-bio-record.h"
12 #include "dm-path-selector.h"
13 #include "dm-uevent.h"
14
15 #include <linux/blkdev.h>
16 #include <linux/ctype.h>
17 #include <linux/init.h>
18 #include <linux/mempool.h>
19 #include <linux/module.h>
20 #include <linux/pagemap.h>
21 #include <linux/slab.h>
22 #include <linux/time.h>
23 #include <linux/workqueue.h>
24 #include <linux/delay.h>
25 #include <scsi/scsi_dh.h>
26 #include <linux/atomic.h>
27 #include <linux/blk-mq.h>
28
29 #define DM_MSG_PREFIX "multipath"
30 #define DM_PG_INIT_DELAY_MSECS 2000
31 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
32
33 /* Path properties */
34 struct pgpath {
35         struct list_head list;
36
37         struct priority_group *pg;      /* Owning PG */
38         unsigned fail_count;            /* Cumulative failure count */
39
40         struct dm_path path;
41         struct delayed_work activate_path;
42
43         bool is_active:1;               /* Path status */
44 };
45
46 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
47
48 /*
49  * Paths are grouped into Priority Groups and numbered from 1 upwards.
50  * Each has a path selector which controls which path gets used.
51  */
52 struct priority_group {
53         struct list_head list;
54
55         struct multipath *m;            /* Owning multipath instance */
56         struct path_selector ps;
57
58         unsigned pg_num;                /* Reference number */
59         unsigned nr_pgpaths;            /* Number of paths in PG */
60         struct list_head pgpaths;
61
62         bool bypassed:1;                /* Temporarily bypass this PG? */
63 };
64
65 /* Multipath context */
66 struct multipath {
67         struct list_head list;
68         struct dm_target *ti;
69
70         const char *hw_handler_name;
71         char *hw_handler_params;
72
73         spinlock_t lock;
74
75         unsigned nr_priority_groups;
76         struct list_head priority_groups;
77
78         wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
79
80         struct pgpath *current_pgpath;
81         struct priority_group *current_pg;
82         struct priority_group *next_pg; /* Switch to this PG if set */
83
84         unsigned long flags;            /* Multipath state flags */
85
86         unsigned pg_init_retries;       /* Number of times to retry pg_init */
87         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
88
89         atomic_t nr_valid_paths;        /* Total number of usable paths */
90         atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
91         atomic_t pg_init_count;         /* Number of times pg_init called */
92
93         /*
94          * We must use a mempool of dm_mpath_io structs so that we
95          * can resubmit bios on error.
96          */
97         mempool_t *mpio_pool;
98
99         struct mutex work_mutex;
100         struct work_struct trigger_event;
101
102         struct work_struct process_queued_bios;
103         struct bio_list queued_bios;
104 };
105
106 /*
107  * Context information attached to each io we process.
108  */
109 struct dm_mpath_io {
110         struct pgpath *pgpath;
111         size_t nr_bytes;
112
113         /*
114          * FIXME: make request-based code _not_ include this member.
115          */
116         struct dm_bio_details bio_details;
117 };
118
119 typedef int (*action_fn) (struct pgpath *pgpath);
120
121 static struct kmem_cache *_mpio_cache;
122
123 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
124 static void trigger_event(struct work_struct *work);
125 static void activate_path(struct work_struct *work);
126 static void process_queued_bios(struct work_struct *work);
127
128 /*-----------------------------------------------
129  * Multipath state flags.
130  *-----------------------------------------------*/
131
132 #define MPATHF_QUEUE_IO 0                       /* Must we queue all I/O? */
133 #define MPATHF_QUEUE_IF_NO_PATH 1               /* Queue I/O if last path fails? */
134 #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2         /* Saved state during suspension */
135 #define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3     /* If there's already a hw_handler present, don't change it. */
136 #define MPATHF_PG_INIT_DISABLED 4               /* pg_init is not currently allowed */
137 #define MPATHF_PG_INIT_REQUIRED 5               /* pg_init needs calling? */
138 #define MPATHF_PG_INIT_DELAY_RETRY 6            /* Delay pg_init retry? */
139 #define MPATHF_BIO_BASED 7                      /* Device is bio-based? */
140
141 /*-----------------------------------------------
142  * Allocation routines
143  *-----------------------------------------------*/
144
145 static struct pgpath *alloc_pgpath(void)
146 {
147         struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
148
149         if (pgpath) {
150                 pgpath->is_active = true;
151                 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
152         }
153
154         return pgpath;
155 }
156
157 static void free_pgpath(struct pgpath *pgpath)
158 {
159         kfree(pgpath);
160 }
161
162 static struct priority_group *alloc_priority_group(void)
163 {
164         struct priority_group *pg;
165
166         pg = kzalloc(sizeof(*pg), GFP_KERNEL);
167
168         if (pg)
169                 INIT_LIST_HEAD(&pg->pgpaths);
170
171         return pg;
172 }
173
174 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
175 {
176         struct pgpath *pgpath, *tmp;
177
178         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
179                 list_del(&pgpath->list);
180                 dm_put_device(ti, pgpath->path.dev);
181                 free_pgpath(pgpath);
182         }
183 }
184
185 static void free_priority_group(struct priority_group *pg,
186                                 struct dm_target *ti)
187 {
188         struct path_selector *ps = &pg->ps;
189
190         if (ps->type) {
191                 ps->type->destroy(ps);
192                 dm_put_path_selector(ps->type);
193         }
194
195         free_pgpaths(&pg->pgpaths, ti);
196         kfree(pg);
197 }
198
199 static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
200                                          bool bio_based)
201 {
202         struct multipath *m;
203
204         m = kzalloc(sizeof(*m), GFP_KERNEL);
205         if (m) {
206                 INIT_LIST_HEAD(&m->priority_groups);
207                 spin_lock_init(&m->lock);
208                 set_bit(MPATHF_QUEUE_IO, &m->flags);
209                 atomic_set(&m->nr_valid_paths, 0);
210                 atomic_set(&m->pg_init_in_progress, 0);
211                 atomic_set(&m->pg_init_count, 0);
212                 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
213                 INIT_WORK(&m->trigger_event, trigger_event);
214                 init_waitqueue_head(&m->pg_init_wait);
215                 mutex_init(&m->work_mutex);
216
217                 m->mpio_pool = NULL;
218                 if (!use_blk_mq && !bio_based) {
219                         unsigned min_ios = dm_get_reserved_rq_based_ios();
220
221                         m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
222                         if (!m->mpio_pool) {
223                                 kfree(m);
224                                 return NULL;
225                         }
226                 }
227
228                 if (bio_based) {
229                         INIT_WORK(&m->process_queued_bios, process_queued_bios);
230                         set_bit(MPATHF_BIO_BASED, &m->flags);
231                         /*
232                          * bio-based doesn't support any direct scsi_dh management;
233                          * it just discovers if a scsi_dh is attached.
234                          */
235                         set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
236                 }
237
238                 m->ti = ti;
239                 ti->private = m;
240         }
241
242         return m;
243 }
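/*
 * A summary of the three modes alloc_multipath() caters for, as read from
 * the code above (informational only):
 *  - .request_fn request-based: per-request dm_mpath_io comes from
 *    m->mpio_pool;
 *  - blk-mq request-based:      dm_mpath_io lives in the request pdu sized
 *    via ti->per_io_data_size, so mpio_pool stays NULL;
 *  - bio-based:                 dm_mpath_io is per-bio data, and bios that
 *    cannot be dispatched yet are parked on m->queued_bios for
 *    process_queued_bios().
 */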
244
245 static void free_multipath(struct multipath *m)
246 {
247         struct priority_group *pg, *tmp;
248
249         list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
250                 list_del(&pg->list);
251                 free_priority_group(pg, m->ti);
252         }
253
254         kfree(m->hw_handler_name);
255         kfree(m->hw_handler_params);
256         mempool_destroy(m->mpio_pool);
257         kfree(m);
258 }
259
260 static struct dm_mpath_io *get_mpio(union map_info *info)
261 {
262         return info->ptr;
263 }
264
265 static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
266 {
267         struct dm_mpath_io *mpio;
268
269         if (!m->mpio_pool) {
270                 /* Use blk-mq pdu memory requested via per_io_data_size */
271                 mpio = get_mpio(info);
272                 memset(mpio, 0, sizeof(*mpio));
273                 return mpio;
274         }
275
276         mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
277         if (!mpio)
278                 return NULL;
279
280         memset(mpio, 0, sizeof(*mpio));
281         info->ptr = mpio;
282
283         return mpio;
284 }
285
286 static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
287 {
288         /* Only needed for non blk-mq (.request_fn) multipath */
289         if (m->mpio_pool) {
290                 struct dm_mpath_io *mpio = info->ptr;
291
292                 info->ptr = NULL;
293                 mempool_free(mpio, m->mpio_pool);
294         }
295 }
296
297 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
298 {
299         return dm_per_bio_data(bio, sizeof(struct dm_mpath_io));
300 }
301
302 static struct dm_mpath_io *set_mpio_bio(struct multipath *m, struct bio *bio)
303 {
304         struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
305
306         memset(mpio, 0, sizeof(*mpio));
307         dm_bio_record(&mpio->bio_details, bio);
308
309         return mpio;
310 }
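/*
 * No allocation happens in set_mpio_bio(): the dm_mpath_io is carved out of
 * the per-bio data DM core reserves when ti->per_io_data_size is set in
 * __multipath_ctr() below.
 */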
311
312 /*-----------------------------------------------
313  * Path selection
314  *-----------------------------------------------*/
315
316 static int __pg_init_all_paths(struct multipath *m)
317 {
318         struct pgpath *pgpath;
319         unsigned long pg_init_delay = 0;
320
321         if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
322                 return 0;
323
324         atomic_inc(&m->pg_init_count);
325         clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
326
327         /* Check here to reset pg_init_required */
328         if (!m->current_pg)
329                 return 0;
330
331         if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
332                 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
333                                                  m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
334         list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
335                 /* Skip failed paths */
336                 if (!pgpath->is_active)
337                         continue;
338                 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
339                                        pg_init_delay))
340                         atomic_inc(&m->pg_init_in_progress);
341         }
342         return atomic_read(&m->pg_init_in_progress);
343 }
344
345 static int pg_init_all_paths(struct multipath *m)
346 {
347         int r;
348         unsigned long flags;
349
350         spin_lock_irqsave(&m->lock, flags);
351         r = __pg_init_all_paths(m);
352         spin_unlock_irqrestore(&m->lock, flags);
353
354         return r;
355 }
356
357 static void __switch_pg(struct multipath *m, struct priority_group *pg)
358 {
359         m->current_pg = pg;
360
361         /* Must we initialise the PG first, and queue I/O till it's ready? */
362         if (m->hw_handler_name) {
363                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
364                 set_bit(MPATHF_QUEUE_IO, &m->flags);
365         } else {
366                 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
367                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
368         }
369
370         atomic_set(&m->pg_init_count, 0);
371 }
372
373 static struct pgpath *choose_path_in_pg(struct multipath *m,
374                                         struct priority_group *pg,
375                                         size_t nr_bytes)
376 {
377         unsigned long flags;
378         struct dm_path *path;
379         struct pgpath *pgpath;
380
381         path = pg->ps.type->select_path(&pg->ps, nr_bytes);
382         if (!path)
383                 return ERR_PTR(-ENXIO);
384
385         pgpath = path_to_pgpath(path);
386
387         if (unlikely(lockless_dereference(m->current_pg) != pg)) {
388                 /* Only update current_pgpath if pg changed */
389                 spin_lock_irqsave(&m->lock, flags);
390                 m->current_pgpath = pgpath;
391                 __switch_pg(m, pg);
392                 spin_unlock_irqrestore(&m->lock, flags);
393         }
394
395         return pgpath;
396 }
397
398 static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
399 {
400         unsigned long flags;
401         struct priority_group *pg;
402         struct pgpath *pgpath;
403         bool bypassed = true;
404
405         if (!atomic_read(&m->nr_valid_paths)) {
406                 clear_bit(MPATHF_QUEUE_IO, &m->flags);
407                 goto failed;
408         }
409
410         /* Were we instructed to switch PG? */
411         if (lockless_dereference(m->next_pg)) {
412                 spin_lock_irqsave(&m->lock, flags);
413                 pg = m->next_pg;
414                 if (!pg) {
415                         spin_unlock_irqrestore(&m->lock, flags);
416                         goto check_current_pg;
417                 }
418                 m->next_pg = NULL;
419                 spin_unlock_irqrestore(&m->lock, flags);
420                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
421                 if (!IS_ERR_OR_NULL(pgpath))
422                         return pgpath;
423         }
424
425         /* Don't change PG until it has no remaining paths */
426 check_current_pg:
427         pg = lockless_dereference(m->current_pg);
428         if (pg) {
429                 pgpath = choose_path_in_pg(m, pg, nr_bytes);
430                 if (!IS_ERR_OR_NULL(pgpath))
431                         return pgpath;
432         }
433
434         /*
435          * Loop through priority groups until we find a valid path.
436          * First time we skip PGs marked 'bypassed'.
437          * Second time we only try the ones we skipped, but set
438          * pg_init_delay_retry so we do not hammer controllers.
439          */
440         do {
441                 list_for_each_entry(pg, &m->priority_groups, list) {
442                         if (pg->bypassed == bypassed)
443                                 continue;
444                         pgpath = choose_path_in_pg(m, pg, nr_bytes);
445                         if (!IS_ERR_OR_NULL(pgpath)) {
446                                 if (!bypassed)
447                                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
448                                 return pgpath;
449                         }
450                 }
451         } while (bypassed--);
452
453 failed:
454         spin_lock_irqsave(&m->lock, flags);
455         m->current_pgpath = NULL;
456         m->current_pg = NULL;
457         spin_unlock_irqrestore(&m->lock, flags);
458
459         return NULL;
460 }
461
462 /*
463  * Check whether bios must be queued in the device-mapper core rather
464  * than here in the target.
465  *
466  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
467  * same value then we are not between multipath_presuspend()
468  * and multipath_resume() calls and we have no need to check
469  * for the DMF_NOFLUSH_SUSPENDING flag.
470  */
471 static bool __must_push_back(struct multipath *m)
472 {
473         return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
474                  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
475                 dm_noflush_suspending(m->ti));
476 }
477
478 static bool must_push_back_rq(struct multipath *m)
479 {
480         return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
481                 __must_push_back(m));
482 }
483
484 static bool must_push_back_bio(struct multipath *m)
485 {
486         return __must_push_back(m);
487 }
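/*
 * Note the asymmetry: request-based multipath pushes back whenever
 * queue_if_no_path is set, while bio-based only pushes back across a
 * no-flush suspend; otherwise bios are held on the target's own
 * queued_bios list (see __multipath_map_bio() below).
 */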
488
489 /*
490  * Map cloned requests (request-based multipath)
491  */
492 static int __multipath_map(struct dm_target *ti, struct request *clone,
493                            union map_info *map_context,
494                            struct request *rq, struct request **__clone)
495 {
496         struct multipath *m = ti->private;
497         int r = DM_MAPIO_REQUEUE;
498         size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
499         struct pgpath *pgpath;
500         struct block_device *bdev;
501         struct dm_mpath_io *mpio;
502
503         /* Do we need to select a new pgpath? */
504         pgpath = lockless_dereference(m->current_pgpath);
505         if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
506                 pgpath = choose_pgpath(m, nr_bytes);
507
508         if (!pgpath) {
509                 if (!must_push_back_rq(m))
510                         r = -EIO;       /* Failed */
511                 return r;
512         } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
513                    test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
514                 pg_init_all_paths(m);
515                 return r;
516         }
517
518         mpio = set_mpio(m, map_context);
519         if (!mpio)
520                 /* ENOMEM, requeue */
521                 return r;
522
523         mpio->pgpath = pgpath;
524         mpio->nr_bytes = nr_bytes;
525
526         bdev = pgpath->path.dev->bdev;
527
528         if (clone) {
529                 /*
530                  * Old request-based interface: allocated clone is passed in.
531                  * Used by: .request_fn stacked on .request_fn path(s).
532                  */
533                 clone->q = bdev_get_queue(bdev);
534                 clone->rq_disk = bdev->bd_disk;
535                 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
536         } else {
537                 /*
538                  * blk-mq request-based interface; used by both:
539                  * .request_fn stacked on blk-mq path(s) and
540                  * blk-mq stacked on blk-mq path(s).
541                  */
542                 *__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
543                                                 rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
544                 if (IS_ERR(*__clone)) {
545                         /* ENOMEM, requeue */
546                         clear_request_fn_mpio(m, map_context);
547                         return r;
548                 }
549                 (*__clone)->bio = (*__clone)->biotail = NULL;
550                 (*__clone)->rq_disk = bdev->bd_disk;
551                 (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
552         }
553
554         if (pgpath->pg->ps.type->start_io)
555                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
556                                               &pgpath->path,
557                                               nr_bytes);
558         return DM_MAPIO_REMAPPED;
559 }
560
561 static int multipath_map(struct dm_target *ti, struct request *clone,
562                          union map_info *map_context)
563 {
564         return __multipath_map(ti, clone, map_context, NULL, NULL);
565 }
566
567 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
568                                    union map_info *map_context,
569                                    struct request **clone)
570 {
571         return __multipath_map(ti, NULL, map_context, rq, clone);
572 }
573
574 static void multipath_release_clone(struct request *clone)
575 {
576         blk_mq_free_request(clone);
577 }
578
579 /*
580  * Map cloned bios (bio-based multipath)
581  */
582 static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
583 {
584         size_t nr_bytes = bio->bi_iter.bi_size;
585         struct pgpath *pgpath;
586         unsigned long flags;
587         bool queue_io;
588
589         /* Do we need to select a new pgpath? */
590         pgpath = lockless_dereference(m->current_pgpath);
591         queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
592         if (!pgpath || !queue_io)
593                 pgpath = choose_pgpath(m, nr_bytes);
594
595         if ((pgpath && queue_io) ||
596             (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
597                 /* Queue for the daemon to resubmit */
598                 spin_lock_irqsave(&m->lock, flags);
599                 bio_list_add(&m->queued_bios, bio);
600                 spin_unlock_irqrestore(&m->lock, flags);
601                 /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
602                 if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
603                         pg_init_all_paths(m);
604                 else if (!queue_io)
605                         queue_work(kmultipathd, &m->process_queued_bios);
606                 return DM_MAPIO_SUBMITTED;
607         }
608
609         if (!pgpath) {
610                 if (!must_push_back_bio(m))
611                         return -EIO;
612                 return DM_MAPIO_REQUEUE;
613         }
614
615         mpio->pgpath = pgpath;
616         mpio->nr_bytes = nr_bytes;
617
618         bio->bi_error = 0;
619         bio->bi_bdev = pgpath->path.dev->bdev;
620         bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
621
622         if (pgpath->pg->ps.type->start_io)
623                 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
624                                               &pgpath->path,
625                                               nr_bytes);
626         return DM_MAPIO_REMAPPED;
627 }
628
629 static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
630 {
631         struct multipath *m = ti->private;
632         struct dm_mpath_io *mpio = set_mpio_bio(m, bio);
633
634         return __multipath_map_bio(m, bio, mpio);
635 }
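/*
 * Dispatch outcomes for a bio, as implemented above: DM_MAPIO_REMAPPED when
 * a usable path was chosen (bi_bdev rewritten), DM_MAPIO_SUBMITTED when the
 * bio was parked on m->queued_bios pending pg_init or a path event, and
 * DM_MAPIO_REQUEUE/-EIO when no path is usable.
 */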
636
637 static void process_queued_bios_list(struct multipath *m)
638 {
639         if (test_bit(MPATHF_BIO_BASED, &m->flags))
640                 queue_work(kmultipathd, &m->process_queued_bios);
641 }
642
643 static void process_queued_bios(struct work_struct *work)
644 {
645         int r;
646         unsigned long flags;
647         struct bio *bio;
648         struct bio_list bios;
649         struct blk_plug plug;
650         struct multipath *m =
651                 container_of(work, struct multipath, process_queued_bios);
652
653         bio_list_init(&bios);
654
655         spin_lock_irqsave(&m->lock, flags);
656
657         if (bio_list_empty(&m->queued_bios)) {
658                 spin_unlock_irqrestore(&m->lock, flags);
659                 return;
660         }
661
662         bio_list_merge(&bios, &m->queued_bios);
663         bio_list_init(&m->queued_bios);
664
665         spin_unlock_irqrestore(&m->lock, flags);
666
667         blk_start_plug(&plug);
668         while ((bio = bio_list_pop(&bios))) {
669                 r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
670                 if (r < 0 || r == DM_MAPIO_REQUEUE) {
671                         bio->bi_error = r;
672                         bio_endio(bio);
673                 } else if (r == DM_MAPIO_REMAPPED)
674                         generic_make_request(bio);
675         }
676         blk_finish_plug(&plug);
677 }
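/*
 * The plug around the submission loop batches the generic_make_request()
 * calls so the block layer can merge and issue the requeued bios more
 * efficiently once blk_finish_plug() is reached.
 */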
678
679 /*
680  * If we run out of usable paths, should we queue I/O or error it?
681  */
682 static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
683                             bool save_old_value)
684 {
685         unsigned long flags;
686
687         spin_lock_irqsave(&m->lock, flags);
688
689         if (save_old_value) {
690                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
691                         set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
692                 else
693                         clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
694         } else {
695                 if (queue_if_no_path)
696                         set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
697                 else
698                         clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
699         }
700         if (queue_if_no_path)
701                 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
702         else
703                 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
704
705         spin_unlock_irqrestore(&m->lock, flags);
706
707         if (!queue_if_no_path) {
708                 dm_table_run_md_queue_async(m->ti->table);
709                 process_queued_bios_list(m);
710         }
711
712         return 0;
713 }
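/*
 * queue_if_no_path() is reached from the "queue_if_no_path" feature arg in
 * parse_features(), from multipath_presuspend(), and (further down this
 * file) from the target's "queue_if_no_path" / "fail_if_no_path" messages.
 */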
714
715 /*
716  * An event is triggered whenever a path is taken out of use.
717  * Includes path failure and PG bypass.
718  */
719 static void trigger_event(struct work_struct *work)
720 {
721         struct multipath *m =
722                 container_of(work, struct multipath, trigger_event);
723
724         dm_table_event(m->ti->table);
725 }
726
727 /*-----------------------------------------------------------------
728  * Constructor/argument parsing:
729  * <#multipath feature args> [<arg>]*
730  * <#hw_handler args> [hw_handler [<arg>]*]
731  * <#priority groups>
732  * <initial priority group>
733  *     [<selector> <#selector args> [<arg>]*
734  *      <#paths> <#per-path selector args>
735  *         [<path> [<arg>]* ]+ ]+
736  *---------------------------------------------------------------*/
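/*
 * Purely illustrative example of a table line in the format above (device
 * numbers and sizes are made up): no feature or hw handler args, a single
 * priority group using round-robin with two paths, repeat_count 1000 each:
 *
 *   dmsetup create mpath0 --table \
 *     "0 2097152 multipath 0 0 1 1 round-robin 0 2 1 8:16 1000 8:32 1000"
 */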
737 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
738                                struct dm_target *ti)
739 {
740         int r;
741         struct path_selector_type *pst;
742         unsigned ps_argc;
743
744         static struct dm_arg _args[] = {
745                 {0, 1024, "invalid number of path selector args"},
746         };
747
748         pst = dm_get_path_selector(dm_shift_arg(as));
749         if (!pst) {
750                 ti->error = "unknown path selector type";
751                 return -EINVAL;
752         }
753
754         r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
755         if (r) {
756                 dm_put_path_selector(pst);
757                 return -EINVAL;
758         }
759
760         r = pst->create(&pg->ps, ps_argc, as->argv);
761         if (r) {
762                 dm_put_path_selector(pst);
763                 ti->error = "path selector constructor failed";
764                 return r;
765         }
766
767         pg->ps.type = pst;
768         dm_consume_args(as, ps_argc);
769
770         return 0;
771 }
772
773 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
774                                struct dm_target *ti)
775 {
776         int r;
777         struct pgpath *p;
778         struct multipath *m = ti->private;
779         struct request_queue *q = NULL;
780         const char *attached_handler_name;
781
782         /* we need at least a path arg */
783         if (as->argc < 1) {
784                 ti->error = "no device given";
785                 return ERR_PTR(-EINVAL);
786         }
787
788         p = alloc_pgpath();
789         if (!p)
790                 return ERR_PTR(-ENOMEM);
791
792         r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
793                           &p->path.dev);
794         if (r) {
795                 ti->error = "error getting device";
796                 goto bad;
797         }
798
799         if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
800                 q = bdev_get_queue(p->path.dev->bdev);
801
802         if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
803 retain:
804                 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
805                 if (attached_handler_name) {
806                         /*
807                          * Reset hw_handler_name to match the attached handler
808                          * and clear any hw_handler_params associated with the
809                          * ignored handler.
810                          *
811                          * NB. This modifies the table line to show the actual
812                          * handler instead of the original table passed in.
813                          */
814                         kfree(m->hw_handler_name);
815                         m->hw_handler_name = attached_handler_name;
816
817                         kfree(m->hw_handler_params);
818                         m->hw_handler_params = NULL;
819                 }
820         }
821
822         if (m->hw_handler_name) {
823                 r = scsi_dh_attach(q, m->hw_handler_name);
824                 if (r == -EBUSY) {
825                         char b[BDEVNAME_SIZE];
826
827                         printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
828                                 bdevname(p->path.dev->bdev, b));
829                         goto retain;
830                 }
831                 if (r < 0) {
832                         ti->error = "error attaching hardware handler";
833                         dm_put_device(ti, p->path.dev);
834                         goto bad;
835                 }
836
837                 if (m->hw_handler_params) {
838                         r = scsi_dh_set_params(q, m->hw_handler_params);
839                         if (r < 0) {
840                                 ti->error = "unable to set hardware "
841                                                         "handler parameters";
842                                 dm_put_device(ti, p->path.dev);
843                                 goto bad;
844                         }
845                 }
846         }
847
848         r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
849         if (r) {
850                 dm_put_device(ti, p->path.dev);
851                 goto bad;
852         }
853
854         return p;
855
856  bad:
857         free_pgpath(p);
858         return ERR_PTR(r);
859 }
860
861 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
862                                                    struct multipath *m)
863 {
864         static struct dm_arg _args[] = {
865                 {1, 1024, "invalid number of paths"},
866                 {0, 1024, "invalid number of selector args"}
867         };
868
869         int r;
870         unsigned i, nr_selector_args, nr_args;
871         struct priority_group *pg;
872         struct dm_target *ti = m->ti;
873
874         if (as->argc < 2) {
875                 as->argc = 0;
876                 ti->error = "not enough priority group arguments";
877                 return ERR_PTR(-EINVAL);
878         }
879
880         pg = alloc_priority_group();
881         if (!pg) {
882                 ti->error = "couldn't allocate priority group";
883                 return ERR_PTR(-ENOMEM);
884         }
885         pg->m = m;
886
887         r = parse_path_selector(as, pg, ti);
888         if (r)
889                 goto bad;
890
891         /*
892          * read the paths
893          */
894         r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
895         if (r)
896                 goto bad;
897
898         r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
899         if (r)
900                 goto bad;
901
902         nr_args = 1 + nr_selector_args;
903         for (i = 0; i < pg->nr_pgpaths; i++) {
904                 struct pgpath *pgpath;
905                 struct dm_arg_set path_args;
906
907                 if (as->argc < nr_args) {
908                         ti->error = "not enough path parameters";
909                         r = -EINVAL;
910                         goto bad;
911                 }
912
913                 path_args.argc = nr_args;
914                 path_args.argv = as->argv;
915
916                 pgpath = parse_path(&path_args, &pg->ps, ti);
917                 if (IS_ERR(pgpath)) {
918                         r = PTR_ERR(pgpath);
919                         goto bad;
920                 }
921
922                 pgpath->pg = pg;
923                 list_add_tail(&pgpath->list, &pg->pgpaths);
924                 dm_consume_args(as, nr_args);
925         }
926
927         return pg;
928
929  bad:
930         free_priority_group(pg, ti);
931         return ERR_PTR(r);
932 }
933
934 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
935 {
936         unsigned hw_argc;
937         int ret;
938         struct dm_target *ti = m->ti;
939
940         static struct dm_arg _args[] = {
941                 {0, 1024, "invalid number of hardware handler args"},
942         };
943
944         if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
945                 return -EINVAL;
946
947         if (!hw_argc)
948                 return 0;
949
950         if (test_bit(MPATHF_BIO_BASED, &m->flags)) {
951                 dm_consume_args(as, hw_argc);
952                 DMERR("bio-based multipath doesn't allow hardware handler args");
953                 return 0;
954         }
955
956         m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
957
958         if (hw_argc > 1) {
959                 char *p;
960                 int i, j, len = 4;
961
962                 for (i = 0; i <= hw_argc - 2; i++)
963                         len += strlen(as->argv[i]) + 1;
964                 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
965                 if (!p) {
966                         ti->error = "memory allocation failed";
967                         ret = -ENOMEM;
968                         goto fail;
969                 }
970                 j = sprintf(p, "%d", hw_argc - 1);
971                 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
972                         j = sprintf(p, "%s", as->argv[i]);
973         }
974         dm_consume_args(as, hw_argc - 1);
975
976         return 0;
977 fail:
978         kfree(m->hw_handler_name);
979         m->hw_handler_name = NULL;
980         return ret;
981 }
982
983 static int parse_features(struct dm_arg_set *as, struct multipath *m)
984 {
985         int r;
986         unsigned argc;
987         struct dm_target *ti = m->ti;
988         const char *arg_name;
989
990         static struct dm_arg _args[] = {
991                 {0, 6, "invalid number of feature args"},
992                 {1, 50, "pg_init_retries must be between 1 and 50"},
993                 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
994         };
995
996         r = dm_read_arg_group(_args, as, &argc, &ti->error);
997         if (r)
998                 return -EINVAL;
999
1000         if (!argc)
1001                 return 0;
1002
1003         do {
1004                 arg_name = dm_shift_arg(as);
1005                 argc--;
1006
1007                 if (!strcasecmp(arg_name, "queue_if_no_path")) {
1008                         r = queue_if_no_path(m, true, false);
1009                         continue;
1010                 }
1011
1012                 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
1013                         set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
1014                         continue;
1015                 }
1016
1017                 if (!strcasecmp(arg_name, "pg_init_retries") &&
1018                     (argc >= 1)) {
1019                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
1020                         argc--;
1021                         continue;
1022                 }
1023
1024                 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
1025                     (argc >= 1)) {
1026                         r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
1027                         argc--;
1028                         continue;
1029                 }
1030
1031                 ti->error = "Unrecognised multipath feature request";
1032                 r = -EINVAL;
1033         } while (argc && !r);
1034
1035         return r;
1036 }
1037
1038 static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
1039                            char **argv, bool bio_based)
1040 {
1041         /* target arguments */
1042         static struct dm_arg _args[] = {
1043                 {0, 1024, "invalid number of priority groups"},
1044                 {0, 1024, "invalid initial priority group number"},
1045         };
1046
1047         int r;
1048         struct multipath *m;
1049         struct dm_arg_set as;
1050         unsigned pg_count = 0;
1051         unsigned next_pg_num;
1052         bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
1053
1054         as.argc = argc;
1055         as.argv = argv;
1056
1057         m = alloc_multipath(ti, use_blk_mq, bio_based);
1058         if (!m) {
1059                 ti->error = "can't allocate multipath";
1060                 return -EINVAL;
1061         }
1062
1063         r = parse_features(&as, m);
1064         if (r)
1065                 goto bad;
1066
1067         r = parse_hw_handler(&as, m);
1068         if (r)
1069                 goto bad;
1070
1071         r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
1072         if (r)
1073                 goto bad;
1074
1075         r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
1076         if (r)
1077                 goto bad;
1078
1079         if ((!m->nr_priority_groups && next_pg_num) ||
1080             (m->nr_priority_groups && !next_pg_num)) {
1081                 ti->error = "invalid initial priority group";
1082                 r = -EINVAL;
1083                 goto bad;
1084         }
1085
1086         /* parse the priority groups */
1087         while (as.argc) {
1088                 struct priority_group *pg;
1089                 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
1090
1091                 pg = parse_priority_group(&as, m);
1092                 if (IS_ERR(pg)) {
1093                         r = PTR_ERR(pg);
1094                         goto bad;
1095                 }
1096
1097                 nr_valid_paths += pg->nr_pgpaths;
1098                 atomic_set(&m->nr_valid_paths, nr_valid_paths);
1099
1100                 list_add_tail(&pg->list, &m->priority_groups);
1101                 pg_count++;
1102                 pg->pg_num = pg_count;
1103                 if (!--next_pg_num)
1104                         m->next_pg = pg;
1105         }
1106
1107         if (pg_count != m->nr_priority_groups) {
1108                 ti->error = "priority group count mismatch";
1109                 r = -EINVAL;
1110                 goto bad;
1111         }
1112
1113         ti->num_flush_bios = 1;
1114         ti->num_discard_bios = 1;
1115         ti->num_write_same_bios = 1;
1116         if (use_blk_mq || bio_based)
1117                 ti->per_io_data_size = sizeof(struct dm_mpath_io);
1118
1119         return 0;
1120
1121  bad:
1122         free_multipath(m);
1123         return r;
1124 }
1125
1126 static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
1127 {
1128         return __multipath_ctr(ti, argc, argv, false);
1129 }
1130
1131 static int multipath_bio_ctr(struct dm_target *ti, unsigned argc, char **argv)
1132 {
1133         return __multipath_ctr(ti, argc, argv, true);
1134 }
1135
1136 static void multipath_wait_for_pg_init_completion(struct multipath *m)
1137 {
1138         DECLARE_WAITQUEUE(wait, current);
1139
1140         add_wait_queue(&m->pg_init_wait, &wait);
1141
1142         while (1) {
1143                 set_current_state(TASK_UNINTERRUPTIBLE);
1144
1145                 if (!atomic_read(&m->pg_init_in_progress))
1146                         break;
1147
1148                 io_schedule();
1149         }
1150         set_current_state(TASK_RUNNING);
1151
1152         remove_wait_queue(&m->pg_init_wait, &wait);
1153 }
1154
1155 static void flush_multipath_work(struct multipath *m)
1156 {
1157         set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1158         smp_mb__after_atomic();
1159
1160         flush_workqueue(kmpath_handlerd);
1161         multipath_wait_for_pg_init_completion(m);
1162         flush_workqueue(kmultipathd);
1163         flush_work(&m->trigger_event);
1164
1165         clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1166         smp_mb__after_atomic();
1167 }
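/*
 * MPATHF_PG_INIT_DISABLED is held across the flush above so that
 * __pg_init_all_paths() and pg_init_limit_reached() refuse to schedule new
 * path activations while the outstanding ones are flushed and waited for.
 */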
1168
1169 static void multipath_dtr(struct dm_target *ti)
1170 {
1171         struct multipath *m = ti->private;
1172
1173         flush_multipath_work(m);
1174         free_multipath(m);
1175 }
1176
1177 /*
1178  * Take a path out of use.
1179  */
1180 static int fail_path(struct pgpath *pgpath)
1181 {
1182         unsigned long flags;
1183         struct multipath *m = pgpath->pg->m;
1184
1185         spin_lock_irqsave(&m->lock, flags);
1186
1187         if (!pgpath->is_active)
1188                 goto out;
1189
1190         DMWARN("Failing path %s.", pgpath->path.dev->name);
1191
1192         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1193         pgpath->is_active = false;
1194         pgpath->fail_count++;
1195
1196         atomic_dec(&m->nr_valid_paths);
1197
1198         if (pgpath == m->current_pgpath)
1199                 m->current_pgpath = NULL;
1200
1201         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1202                        pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
1203
1204         schedule_work(&m->trigger_event);
1205
1206 out:
1207         spin_unlock_irqrestore(&m->lock, flags);
1208
1209         return 0;
1210 }
1211
1212 /*
1213  * Reinstate a previously-failed path
1214  */
1215 static int reinstate_path(struct pgpath *pgpath)
1216 {
1217         int r = 0, run_queue = 0;
1218         unsigned long flags;
1219         struct multipath *m = pgpath->pg->m;
1220         unsigned nr_valid_paths;
1221
1222         spin_lock_irqsave(&m->lock, flags);
1223
1224         if (pgpath->is_active)
1225                 goto out;
1226
1227         DMWARN("Reinstating path %s.", pgpath->path.dev->name);
1228
1229         r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1230         if (r)
1231                 goto out;
1232
1233         pgpath->is_active = true;
1234
1235         nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1236         if (nr_valid_paths == 1) {
1237                 m->current_pgpath = NULL;
1238                 run_queue = 1;
1239         } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1240                 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1241                         atomic_inc(&m->pg_init_in_progress);
1242         }
1243
1244         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1245                        pgpath->path.dev->name, nr_valid_paths);
1246
1247         schedule_work(&m->trigger_event);
1248
1249 out:
1250         spin_unlock_irqrestore(&m->lock, flags);
1251         if (run_queue) {
1252                 dm_table_run_md_queue_async(m->ti->table);
1253                 process_queued_bios_list(m);
1254         }
1255
1256         return r;
1257 }
1258
1259 /*
1260  * Fail or reinstate all paths that match the provided struct dm_dev.
1261  */
1262 static int action_dev(struct multipath *m, struct dm_dev *dev,
1263                       action_fn action)
1264 {
1265         int r = -EINVAL;
1266         struct pgpath *pgpath;
1267         struct priority_group *pg;
1268
1269         list_for_each_entry(pg, &m->priority_groups, list) {
1270                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1271                         if (pgpath->path.dev == dev)
1272                                 r = action(pgpath);
1273                 }
1274         }
1275
1276         return r;
1277 }
1278
1279 /*
1280  * Temporarily try to avoid having to use the specified PG
1281  */
1282 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1283                       bool bypassed)
1284 {
1285         unsigned long flags;
1286
1287         spin_lock_irqsave(&m->lock, flags);
1288
1289         pg->bypassed = bypassed;
1290         m->current_pgpath = NULL;
1291         m->current_pg = NULL;
1292
1293         spin_unlock_irqrestore(&m->lock, flags);
1294
1295         schedule_work(&m->trigger_event);
1296 }
1297
1298 /*
1299  * Switch to using the specified PG from the next I/O that gets mapped
1300  */
1301 static int switch_pg_num(struct multipath *m, const char *pgstr)
1302 {
1303         struct priority_group *pg;
1304         unsigned pgnum;
1305         unsigned long flags;
1306         char dummy;
1307
1308         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1309             (pgnum > m->nr_priority_groups)) {
1310                 DMWARN("invalid PG number supplied to switch_pg_num");
1311                 return -EINVAL;
1312         }
1313
1314         spin_lock_irqsave(&m->lock, flags);
1315         list_for_each_entry(pg, &m->priority_groups, list) {
1316                 pg->bypassed = false;
1317                 if (--pgnum)
1318                         continue;
1319
1320                 m->current_pgpath = NULL;
1321                 m->current_pg = NULL;
1322                 m->next_pg = pg;
1323         }
1324         spin_unlock_irqrestore(&m->lock, flags);
1325
1326         schedule_work(&m->trigger_event);
1327         return 0;
1328 }
1329
1330 /*
1331  * Set/clear bypassed status of a PG.
1332  * PGs are numbered upwards from 1 in the order they were declared.
1333  */
1334 static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
1335 {
1336         struct priority_group *pg;
1337         unsigned pgnum;
1338         char dummy;
1339
1340         if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1341             (pgnum > m->nr_priority_groups)) {
1342                 DMWARN("invalid PG number supplied to bypass_pg");
1343                 return -EINVAL;
1344         }
1345
1346         list_for_each_entry(pg, &m->priority_groups, list) {
1347                 if (!--pgnum)
1348                         break;
1349         }
1350
1351         bypass_pg(m, pg, bypassed);
1352         return 0;
1353 }
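/*
 * switch_pg_num() and bypass_pg_num() are normally invoked through the
 * target's message interface further down this file ("switch_group",
 * "disable_group" and "enable_group" messages).
 */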
1354
1355 /*
1356  * Should we retry pg_init immediately?
1357  */
1358 static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1359 {
1360         unsigned long flags;
1361         bool limit_reached = false;
1362
1363         spin_lock_irqsave(&m->lock, flags);
1364
1365         if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1366             !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1367                 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1368         else
1369                 limit_reached = true;
1370
1371         spin_unlock_irqrestore(&m->lock, flags);
1372
1373         return limit_reached;
1374 }
1375
1376 static void pg_init_done(void *data, int errors)
1377 {
1378         struct pgpath *pgpath = data;
1379         struct priority_group *pg = pgpath->pg;
1380         struct multipath *m = pg->m;
1381         unsigned long flags;
1382         bool delay_retry = false;
1383
1384         /* device or driver problems */
1385         switch (errors) {
1386         case SCSI_DH_OK:
1387                 break;
1388         case SCSI_DH_NOSYS:
1389                 if (!m->hw_handler_name) {
1390                         errors = 0;
1391                         break;
1392                 }
1393                 DMERR("Could not failover the device: Handler scsi_dh_%s "
1394                       "Error %d.", m->hw_handler_name, errors);
1395                 /*
1396                  * Fail path for now, so we do not ping pong
1397                  */
1398                 fail_path(pgpath);
1399                 break;
1400         case SCSI_DH_DEV_TEMP_BUSY:
1401                 /*
1402                  * Probably doing something like FW upgrade on the
1403                  * controller so try the other pg.
1404                  */
1405                 bypass_pg(m, pg, true);
1406                 break;
1407         case SCSI_DH_RETRY:
1408                 /* Wait before retrying. */
1409                 delay_retry = true;
1410         case SCSI_DH_IMM_RETRY:
1411         case SCSI_DH_RES_TEMP_UNAVAIL:
1412                 if (pg_init_limit_reached(m, pgpath))
1413                         fail_path(pgpath);
1414                 errors = 0;
1415                 break;
1416         case SCSI_DH_DEV_OFFLINED:
1417         default:
1418                 /*
1419                  * We probably do not want to fail the path for a device
1420                  * error, but this is what the old dm did. In future
1421                  * patches we can do more advanced handling.
1422                  */
1423                 fail_path(pgpath);
1424         }
1425
1426         spin_lock_irqsave(&m->lock, flags);
1427         if (errors) {
1428                 if (pgpath == m->current_pgpath) {
1429                         DMERR("Could not failover device. Error %d.", errors);
1430                         m->current_pgpath = NULL;
1431                         m->current_pg = NULL;
1432                 }
1433         } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1434                 pg->bypassed = false;
1435
1436         if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1437                 /* Activations of other paths are still ongoing */
1438                 goto out;
1439
1440         if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1441                 if (delay_retry)
1442                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1443                 else
1444                         clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1445
1446                 if (__pg_init_all_paths(m))
1447                         goto out;
1448         }
1449         clear_bit(MPATHF_QUEUE_IO, &m->flags);
1450
1451         process_queued_bios_list(m);
1452
1453         /*
1454          * Wake up any thread waiting to suspend.
1455          */
1456         wake_up(&m->pg_init_wait);
1457
1458 out:
1459         spin_unlock_irqrestore(&m->lock, flags);
1460 }
1461
1462 static void activate_path(struct work_struct *work)
1463 {
1464         struct pgpath *pgpath =
1465                 container_of(work, struct pgpath, activate_path.work);
1466
1467         if (pgpath->is_active)
1468                 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1469                                  pg_init_done, pgpath);
1470         else
1471                 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1472 }
1473
1474 static int noretry_error(int error)
1475 {
1476         switch (error) {
1477         case -EOPNOTSUPP:
1478         case -EREMOTEIO:
1479         case -EILSEQ:
1480         case -ENODATA:
1481         case -ENOSPC:
1482                 return 1;
1483         }
1484
1485         /* Anything else could be a path failure, so should be retried */
1486         return 0;
1487 }
1488
1489 /*
1490  * end_io handling
1491  */
1492 static int do_end_io(struct multipath *m, struct request *clone,
1493                      int error, struct dm_mpath_io *mpio)
1494 {
1495         /*
1496          * We don't queue any clone request inside the multipath target
1497          * during end I/O handling, since those clone requests don't have
1498          * bio clones.  If we queue them inside the multipath target,
1499          * we need to make bio clones, that requires memory allocation.
1500          * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
1501          *  don't have bio clones.)
1502          * Instead of queueing the clone request here, we queue the original
1503          * request into dm core, which will remake a clone request and
1504          * clone bios for it and resubmit it later.
1505          */
1506         int r = DM_ENDIO_REQUEUE;
1507
1508         if (!error && !clone->errors)
1509                 return 0;       /* I/O complete */
1510
1511         if (noretry_error(error))
1512                 return error;
1513
1514         if (mpio->pgpath)
1515                 fail_path(mpio->pgpath);
1516
1517         if (!atomic_read(&m->nr_valid_paths)) {
1518                 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1519                         if (!must_push_back_rq(m))
1520                                 r = -EIO;
1521                 } else {
1522                         if (error == -EBADE)
1523                                 r = error;
1524                 }
1525         }
1526
1527         return r;
1528 }
1529
1530 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1531                             int error, union map_info *map_context)
1532 {
1533         struct multipath *m = ti->private;
1534         struct dm_mpath_io *mpio = get_mpio(map_context);
1535         struct pgpath *pgpath;
1536         struct path_selector *ps;
1537         int r;
1538
1539         BUG_ON(!mpio);
1540
1541         r = do_end_io(m, clone, error, mpio);
1542         pgpath = mpio->pgpath;
1543         if (pgpath) {
1544                 ps = &pgpath->pg->ps;
1545                 if (ps->type->end_io)
1546                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1547         }
1548         clear_request_fn_mpio(m, map_context);
1549
1550         return r;
1551 }
1552
1553 static int do_end_io_bio(struct multipath *m, struct bio *clone,
1554                          int error, struct dm_mpath_io *mpio)
1555 {
1556         unsigned long flags;
1557
1558         if (!error)
1559                 return 0;       /* I/O complete */
1560
1561         if (noretry_error(error))
1562                 return error;
1563
1564         if (mpio->pgpath)
1565                 fail_path(mpio->pgpath);
1566
1567         if (!atomic_read(&m->nr_valid_paths)) {
1568                 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1569                         if (!must_push_back_bio(m))
1570                                 return -EIO;
1571                         return DM_ENDIO_REQUEUE;
1572                 } else {
1573                         if (error == -EBADE)
1574                                 return error;
1575                 }
1576         }
1577
1578         /* Queue for the daemon to resubmit */
1579         dm_bio_restore(&mpio->bio_details, clone);
1580
1581         spin_lock_irqsave(&m->lock, flags);
1582         bio_list_add(&m->queued_bios, clone);
1583         spin_unlock_irqrestore(&m->lock, flags);
1584         if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
1585                 queue_work(kmultipathd, &m->process_queued_bios);
1586
1587         return DM_ENDIO_INCOMPLETE;
1588 }
1589
1590 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
1591 {
1592         struct multipath *m = ti->private;
1593         struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
1594         struct pgpath *pgpath;
1595         struct path_selector *ps;
1596         int r;
1597
1598         BUG_ON(!mpio);
1599
1600         r = do_end_io_bio(m, clone, error, mpio);
1601         pgpath = mpio->pgpath;
1602         if (pgpath) {
1603                 ps = &pgpath->pg->ps;
1604                 if (ps->type->end_io)
1605                         ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1606         }
1607
1608         return r;
1609 }
1610
1611 /*
1612  * Suspend can't complete until all the I/O is processed so if
1613  * the last path fails we must error any remaining I/O.
1614  * Note that if the freeze_bdev fails while suspending, the
1615  * queue_if_no_path state is lost - userspace should reset it.
1616  */
1617 static void multipath_presuspend(struct dm_target *ti)
1618 {
1619         struct multipath *m = ti->private;
1620
1621         queue_if_no_path(m, false, true);
1622 }
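
/*
 * For illustration only (assumed typical usage, not defined in this file):
 * if the saved queue_if_no_path state is lost across a suspend, an
 * administrator can reassert it with a target message that is handled by
 * multipath_message() below, e.g.:
 *
 *     dmsetup message <mpath device> 0 queue_if_no_path
 */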
1623
1624 static void multipath_postsuspend(struct dm_target *ti)
1625 {
1626         struct multipath *m = ti->private;
1627
1628         mutex_lock(&m->work_mutex);
1629         flush_multipath_work(m);
1630         mutex_unlock(&m->work_mutex);
1631 }
1632
1633 /*
1634  * Restore the queue_if_no_path setting.
1635  */
1636 static void multipath_resume(struct dm_target *ti)
1637 {
1638         struct multipath *m = ti->private;
1639
1640         if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
1641                 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1642         else
1643                 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1644         smp_mb__after_atomic();
1645 }
1646
1647 /*
1648  * Info output has the following format:
1649  * num_multipath_feature_args [multipath_feature_args]*
1650  * num_handler_status_args [handler_status_args]*
1651  * num_groups init_group_number
1652  *            [A|D|E num_ps_status_args [ps_status_args]*
1653  *             num_paths num_selector_args
1654  *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1655  *
1656  * Table output has the following format (identical to the constructor string):
1657  * num_feature_args [features_args]*
1658  * num_handler_args hw_handler [hw_handler_args]*
1659  * num_groups init_group_number
1660  *     [priority selector-name num_ps_args [ps_args]*
1661  *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1662  */
1663 static void multipath_status(struct dm_target *ti, status_type_t type,
1664                              unsigned status_flags, char *result, unsigned maxlen)
1665 {
1666         int sz = 0;
1667         unsigned long flags;
1668         struct multipath *m = ti->private;
1669         struct priority_group *pg;
1670         struct pgpath *p;
1671         unsigned pg_num;
1672         char state;
1673
1674         spin_lock_irqsave(&m->lock, flags);
1675
1676         /* Features */
1677         if (type == STATUSTYPE_INFO)
1678                 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1679                        atomic_read(&m->pg_init_count));
1680         else {
1681                 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1682                               (m->pg_init_retries > 0) * 2 +
1683                               (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1684                               test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
1685                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1686                         DMEMIT("queue_if_no_path ");
1687                 if (m->pg_init_retries)
1688                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1689                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1690                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1691                 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1692                         DMEMIT("retain_attached_hw_handler ");
1693         }
1694
1695         if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1696                 DMEMIT("0 ");
1697         else
1698                 DMEMIT("1 %s ", m->hw_handler_name);
1699
1700         DMEMIT("%u ", m->nr_priority_groups);
1701
1702         if (m->next_pg)
1703                 pg_num = m->next_pg->pg_num;
1704         else if (m->current_pg)
1705                 pg_num = m->current_pg->pg_num;
1706         else
1707                 pg_num = (m->nr_priority_groups ? 1 : 0);
1708
1709         DMEMIT("%u ", pg_num);
1710
1711         switch (type) {
1712         case STATUSTYPE_INFO:
1713                 list_for_each_entry(pg, &m->priority_groups, list) {
1714                         if (pg->bypassed)
1715                                 state = 'D';    /* Disabled */
1716                         else if (pg == m->current_pg)
1717                                 state = 'A';    /* Currently Active */
1718                         else
1719                                 state = 'E';    /* Enabled */
1720
1721                         DMEMIT("%c ", state);
1722
1723                         if (pg->ps.type->status)
1724                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1725                                                           result + sz,
1726                                                           maxlen - sz);
1727                         else
1728                                 DMEMIT("0 ");
1729
1730                         DMEMIT("%u %u ", pg->nr_pgpaths,
1731                                pg->ps.type->info_args);
1732
1733                         list_for_each_entry(p, &pg->pgpaths, list) {
1734                                 DMEMIT("%s %s %u ", p->path.dev->name,
1735                                        p->is_active ? "A" : "F",
1736                                        p->fail_count);
1737                                 if (pg->ps.type->status)
1738                                         sz += pg->ps.type->status(&pg->ps,
1739                                               &p->path, type, result + sz,
1740                                               maxlen - sz);
1741                         }
1742                 }
1743                 break;
1744
1745         case STATUSTYPE_TABLE:
1746                 list_for_each_entry(pg, &m->priority_groups, list) {
1747                         DMEMIT("%s ", pg->ps.type->name);
1748
1749                         if (pg->ps.type->status)
1750                                 sz += pg->ps.type->status(&pg->ps, NULL, type,
1751                                                           result + sz,
1752                                                           maxlen - sz);
1753                         else
1754                                 DMEMIT("0 ");
1755
1756                         DMEMIT("%u %u ", pg->nr_pgpaths,
1757                                pg->ps.type->table_args);
1758
1759                         list_for_each_entry(p, &pg->pgpaths, list) {
1760                                 DMEMIT("%s ", p->path.dev->name);
1761                                 if (pg->ps.type->status)
1762                                         sz += pg->ps.type->status(&pg->ps,
1763                                               &p->path, type, result + sz,
1764                                               maxlen - sz);
1765                         }
1766                 }
1767                 break;
1768         }
1769
1770         spin_unlock_irqrestore(&m->lock, flags);
1771 }
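
/*
 * Hypothetical example of the table format documented above (device
 * numbers and selector are illustrative only): no feature args, no
 * hw-handler args, two priority groups, initial group 1, round-robin
 * with one selector arg (repeat_count) per path:
 *
 *     0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000 round-robin 0 1 1 8:48 1000
 */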
1772
1773 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1774 {
1775         int r = -EINVAL;
1776         struct dm_dev *dev;
1777         struct multipath *m = ti->private;
1778         action_fn action;
1779
1780         mutex_lock(&m->work_mutex);
1781
1782         if (dm_suspended(ti)) {
1783                 r = -EBUSY;
1784                 goto out;
1785         }
1786
1787         if (argc == 1) {
1788                 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1789                         r = queue_if_no_path(m, true, false);
1790                         goto out;
1791                 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1792                         r = queue_if_no_path(m, false, false);
1793                         goto out;
1794                 }
1795         }
1796
1797         if (argc != 2) {
1798                 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %u.", argc);
1799                 goto out;
1800         }
1801
1802         if (!strcasecmp(argv[0], "disable_group")) {
1803                 r = bypass_pg_num(m, argv[1], true);
1804                 goto out;
1805         } else if (!strcasecmp(argv[0], "enable_group")) {
1806                 r = bypass_pg_num(m, argv[1], false);
1807                 goto out;
1808         } else if (!strcasecmp(argv[0], "switch_group")) {
1809                 r = switch_pg_num(m, argv[1]);
1810                 goto out;
1811         } else if (!strcasecmp(argv[0], "reinstate_path"))
1812                 action = reinstate_path;
1813         else if (!strcasecmp(argv[0], "fail_path"))
1814                 action = fail_path;
1815         else {
1816                 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1817                 goto out;
1818         }
1819
1820         r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1821         if (r) {
1822                 DMWARN("message: error getting device %s",
1823                        argv[1]);
1824                 goto out;
1825         }
1826
1827         r = action_dev(m, dev, action);
1828
1829         dm_put_device(ti, dev);
1830
1831 out:
1832         mutex_unlock(&m->work_mutex);
1833         return r;
1834 }
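
/*
 * Assumed typical invocations of the messages handled above (illustrative
 * only; "mpatha" and "/dev/sdb" are hypothetical names):
 *
 *     dmsetup message mpatha 0 fail_path /dev/sdb
 *     dmsetup message mpatha 0 reinstate_path /dev/sdb
 *     dmsetup message mpatha 0 switch_group 2
 *     dmsetup message mpatha 0 queue_if_no_path
 */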
1835
1836 static int multipath_prepare_ioctl(struct dm_target *ti,
1837                 struct block_device **bdev, fmode_t *mode)
1838 {
1839         struct multipath *m = ti->private;
1840         struct pgpath *current_pgpath;
1841         int r;
1842
1843         current_pgpath = lockless_dereference(m->current_pgpath);
1844         if (!current_pgpath)
1845                 current_pgpath = choose_pgpath(m, 0);
1846
1847         if (current_pgpath) {
1848                 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1849                         *bdev = current_pgpath->path.dev->bdev;
1850                         *mode = current_pgpath->path.dev->mode;
1851                         r = 0;
1852                 } else {
1853                         /* pg_init has not started or completed */
1854                         r = -ENOTCONN;
1855                 }
1856         } else {
1857                 /* No path is available */
1858                 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1859                         r = -ENOTCONN;
1860                 else
1861                         r = -EIO;
1862         }
1863
1864         if (r == -ENOTCONN) {
1865                 if (!lockless_dereference(m->current_pg)) {
1866                         /* Path status changed, redo selection */
1867                         (void) choose_pgpath(m, 0);
1868                 }
1869                 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1870                         pg_init_all_paths(m);
1871                 dm_table_run_md_queue_async(m->ti->table);
1872                 process_queued_bios_list(m);
1873         }
1874
1875         /*
1876          * Only pass ioctls through if the device sizes match exactly.
1877          */
1878         if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1879                 return 1;
1880         return r;
1881 }
1882
1883 static int multipath_iterate_devices(struct dm_target *ti,
1884                                      iterate_devices_callout_fn fn, void *data)
1885 {
1886         struct multipath *m = ti->private;
1887         struct priority_group *pg;
1888         struct pgpath *p;
1889         int ret = 0;
1890
1891         list_for_each_entry(pg, &m->priority_groups, list) {
1892                 list_for_each_entry(p, &pg->pgpaths, list) {
1893                         ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1894                         if (ret)
1895                                 goto out;
1896                 }
1897         }
1898
1899 out:
1900         return ret;
1901 }
1902
1903 static int pgpath_busy(struct pgpath *pgpath)
1904 {
1905         struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1906
1907         return blk_lld_busy(q);
1908 }
1909
1910 /*
1911  * We return "busy" only when we can map I/Os but the underlying
1912  * devices are busy (so even if we mapped I/Os now, they would just
1913  * wait on the underlying queue).
1914  * In other words, if we want to kill I/Os or queue them inside us
1915  * due to map unavailability, we don't return "busy"; otherwise,
1916  * dm core won't give us the I/Os and we can't do what we want.
1917  */
1918 static int multipath_busy(struct dm_target *ti)
1919 {
1920         bool busy = false, has_active = false;
1921         struct multipath *m = ti->private;
1922         struct priority_group *pg, *next_pg;
1923         struct pgpath *pgpath;
1924
1925         /* pg_init in progress or no paths available */
1926         if (atomic_read(&m->pg_init_in_progress) ||
1927             (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
1928                 return true;
1929
1930         /* Guess which priority_group will be used at next mapping time */
1931         pg = lockless_dereference(m->current_pg);
1932         next_pg = lockless_dereference(m->next_pg);
1933         if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
1934                 pg = next_pg;
1935
1936         if (!pg) {
1937                 /*
1938                  * We don't know which pg will be used at the next mapping time.
1939                  * We don't call choose_pgpath() here to avoid triggering
1940                  * pg_init just from a busy check.
1941                  * So we don't know whether the underlying devices we will use
1942                  * at the next mapping time are busy or not. Just try mapping.
1943                  */
1944                 return busy;
1945         }
1946
1947         /*
1948          * If there is at least one non-busy active path, the path selector
1949          * will be able to select it, so we consider such a pg as not busy.
1950          */
1951         busy = true;
1952         list_for_each_entry(pgpath, &pg->pgpaths, list) {
1953                 if (pgpath->is_active) {
1954                         has_active = true;
1955                         if (!pgpath_busy(pgpath)) {
1956                                 busy = false;
1957                                 break;
1958                         }
1959                 }
1960         }
1961
1962         if (!has_active) {
1963                 /*
1964                  * No active path in this pg, so this pg won't be used and
1965                  * the current_pg will be changed at the next mapping time.
1966                  * We need to try mapping to determine that.
1967                  */
1968                 busy = false;
1969         }
1970
1971         return busy;
1972 }
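
/*
 * A rough summary of the decision above (descriptive, not normative):
 *
 *   pg_init in progress, or no valid paths while queueing      -> busy
 *   next pg not yet known                                      -> not busy
 *   chosen pg has an active, non-busy path                     -> not busy
 *   chosen pg has active paths but all of them are busy        -> busy
 *   chosen pg has no active paths                              -> not busy
 */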
1973
1974 /*-----------------------------------------------------------------
1975  * Module setup
1976  *---------------------------------------------------------------*/
1977 static struct target_type multipath_target = {
1978         .name = "multipath",
1979         .version = {1, 11, 0},
1980         .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
1981         .module = THIS_MODULE,
1982         .ctr = multipath_ctr,
1983         .dtr = multipath_dtr,
1984         .map_rq = multipath_map,
1985         .clone_and_map_rq = multipath_clone_and_map,
1986         .release_clone_rq = multipath_release_clone,
1987         .rq_end_io = multipath_end_io,
1988         .presuspend = multipath_presuspend,
1989         .postsuspend = multipath_postsuspend,
1990         .resume = multipath_resume,
1991         .status = multipath_status,
1992         .message = multipath_message,
1993         .prepare_ioctl = multipath_prepare_ioctl,
1994         .iterate_devices = multipath_iterate_devices,
1995         .busy = multipath_busy,
1996 };
1997
1998 static struct target_type multipath_bio_target = {
1999         .name = "multipath-bio",
2000         .version = {1, 0, 0},
2001         .module = THIS_MODULE,
2002         .ctr = multipath_bio_ctr,
2003         .dtr = multipath_dtr,
2004         .map = multipath_map_bio,
2005         .end_io = multipath_end_io_bio,
2006         .presuspend = multipath_presuspend,
2007         .postsuspend = multipath_postsuspend,
2008         .resume = multipath_resume,
2009         .status = multipath_status,
2010         .message = multipath_message,
2011         .prepare_ioctl = multipath_prepare_ioctl,
2012         .iterate_devices = multipath_iterate_devices,
2013         .busy = multipath_busy,
2014 };
2015
2016 static int __init dm_multipath_init(void)
2017 {
2018         int r;
2019
2020         /* allocate a slab for the dm_mpath_ios */
2021         _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
2022         if (!_mpio_cache)
2023                 return -ENOMEM;
2024
2025         r = dm_register_target(&multipath_target);
2026         if (r < 0) {
2027                 DMERR("request-based register failed %d", r);
2028                 r = -EINVAL;
2029                 goto bad_register_target;
2030         }
2031
2032         r = dm_register_target(&multipath_bio_target);
2033         if (r < 0) {
2034                 DMERR("bio-based register failed %d", r);
2035                 r = -EINVAL;
2036                 goto bad_register_bio_based_target;
2037         }
2038
2039         kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2040         if (!kmultipathd) {
2041                 DMERR("failed to create workqueue kmpathd");
2042                 r = -ENOMEM;
2043                 goto bad_alloc_kmultipathd;
2044         }
2045
2046         /*
2047          * A separate workqueue is used to handle the device handlers
2048          * to avoid overloading the existing workqueue. Overloading the
2049          * old workqueue would also create a bottleneck in the
2050          * storage hardware's device-activation path.
2051          */
2052         kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2053                                                   WQ_MEM_RECLAIM);
2054         if (!kmpath_handlerd) {
2055                 DMERR("failed to create workqueue kmpath_handlerd");
2056                 r = -ENOMEM;
2057                 goto bad_alloc_kmpath_handlerd;
2058         }
2059
2060         return 0;
2061
2062 bad_alloc_kmpath_handlerd:
2063         destroy_workqueue(kmultipathd);
2064 bad_alloc_kmultipathd:
2065         dm_unregister_target(&multipath_bio_target);
2066 bad_register_bio_based_target:
2067         dm_unregister_target(&multipath_target);
2068 bad_register_target:
2069         kmem_cache_destroy(_mpio_cache);
2070
2071         return r;
2072 }
2073
2074 static void __exit dm_multipath_exit(void)
2075 {
2076         destroy_workqueue(kmpath_handlerd);
2077         destroy_workqueue(kmultipathd);
2078
2079         dm_unregister_target(&multipath_target);
2080         dm_unregister_target(&multipath_bio_target);
2081         kmem_cache_destroy(_mpio_cache);
2082 }
2083
2084 module_init(dm_multipath_init);
2085 module_exit(dm_multipath_exit);
2086
2087 MODULE_DESCRIPTION(DM_NAME " multipath target");
2088 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2089 MODULE_LICENSE("GPL");