drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm.h"
   9 #include "dm-uevent.h"
  10
  11 #include <linux/init.h>
  12 #include <linux/module.h>
  13 #include <linux/mutex.h>
  14 #include <linux/moduleparam.h>
  15 #include <linux/blkpg.h>
  16 #include <linux/bio.h>
  17 #include <linux/mempool.h>
  18 #include <linux/slab.h>
  19 #include <linux/idr.h>
  20 #include <linux/hdreg.h>
  21 #include <linux/delay.h>
  22 #include <linux/wait.h>
  23 #include <linux/kthread.h>
  24 #include <linux/ktime.h>
  25 #include <linux/elevator.h> /* for rq_end_sector() */
  26 #include <linux/blk-mq.h>
  27 #include <linux/pr.h>
  28
  29 #include <trace/events/block.h>
  30
  31 #define DM_MSG_PREFIX "core"
  32
  33 #ifdef CONFIG_PRINTK
  34 /*
  35  * ratelimit state to be used in DMXXX_LIMIT().
  36  */
  37 DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  38                        DEFAULT_RATELIMIT_INTERVAL,
  39                        DEFAULT_RATELIMIT_BURST);
  40 EXPORT_SYMBOL(dm_ratelimit_state);
  41 #endif
  42
  43 /*
  44  * Cookies are numeric values sent with CHANGE and REMOVE
  45  * uevents while resuming, removing or renaming the device.
  46  */
  47 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  48 #define DM_COOKIE_LENGTH 24
  49
  50 static const char *_name = DM_NAME;
  51
  52 static unsigned int major = 0;
  53 static unsigned int _major = 0;
  54
  55 static DEFINE_IDR(_minor_idr);
  56
  57 static DEFINE_SPINLOCK(_minor_lock);
  58
  59 static void do_deferred_remove(struct work_struct *w);
  60
  61 static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  62
  63 static struct workqueue_struct *deferred_remove_workqueue;
  64
  65 /*
  66  * For bio-based dm.
  67  * One of these is allocated per bio.
  68  */
  69 struct dm_io {
  70         struct mapped_device *md;
  71         int error;
  72         atomic_t io_count;
  73         struct bio *bio;
  74         unsigned long start_time;
  75         spinlock_t endio_lock;
  76         struct dm_stats_aux stats_aux;
  77 };
  78
  79 /*
  80  * For request-based dm.
  81  * One of these is allocated per request.
  82  */
  83 struct dm_rq_target_io {
  84         struct mapped_device *md;
  85         struct dm_target *ti;
  86         struct request *orig, *clone;
  87         struct kthread_work work;
  88         int error;
  89         union map_info info;
  90         struct dm_stats_aux stats_aux;
  91         unsigned long duration_jiffies;
  92         unsigned n_sectors;
  93 };
  94
  95 /*
  96  * For request-based dm - the bio clones we allocate are embedded in these
  97  * structs.
  98  *
  99  * We allocate these with bio_alloc_bioset, using the front_pad parameter when
 100  * the bioset is created - this means the bio has to come at the end of the
 101  * struct.
 102  */
 103 struct dm_rq_clone_bio_info {
 104         struct bio *orig;
 105         struct dm_rq_target_io *tio;
 106         struct bio clone;
 107 };
 108
 109 #define MINOR_ALLOCED ((void *)-1)
 110
 111 /*
 112  * Bits for the md->flags field.
 113  */
 114 #define DMF_BLOCK_IO_FOR_SUSPEND 0
 115 #define DMF_SUSPENDED 1
 116 #define DMF_FROZEN 2
 117 #define DMF_FREEING 3
 118 #define DMF_DELETING 4
 119 #define DMF_NOFLUSH_SUSPENDING 5
 120 #define DMF_DEFERRED_REMOVE 6
 121 #define DMF_SUSPENDED_INTERNALLY 7
 122
 123 /*
 124  * A dummy definition to make RCU happy.
 125  * struct dm_table should never be dereferenced in this file.
 126  */
 127 struct dm_table {
 128         int undefined__;
 129 };
 130
 131 /*
 132  * Work processed by per-device workqueue.
 133  */
 134 struct mapped_device {
 135         struct srcu_struct io_barrier;
 136         struct mutex suspend_lock;
 137         atomic_t holders;
 138         atomic_t open_count;
 139
 140         /*
 141          * The current mapping.
 142          * Use dm_get_live_table{_fast} or take suspend_lock for
 143          * dereference.
 144          */
 145         struct dm_table __rcu *map;
 146
 147         struct list_head table_devices;
 148         struct mutex table_devices_lock;
 149
 150         unsigned long flags;
 151
 152         struct request_queue *queue;
 153         unsigned type;
 154         /* Protect queue and type against concurrent access. */
 155         struct mutex type_lock;
 156
 157         struct dm_target *immutable_target;
 158         struct target_type *immutable_target_type;
 159
 160         struct gendisk *disk;
 161         char name[16];
 162
 163         void *interface_ptr;
 164
 165         /*
 166          * A list of ios that arrived while we were suspended.
 167          */
 168         atomic_t pending[2];
 169         wait_queue_head_t wait;
 170         struct work_struct work;
 171         struct bio_list deferred;
 172         spinlock_t deferred_lock;
 173
 174         /*
 175          * Processing queue (flush)
 176          */
 177         struct workqueue_struct *wq;
 178
 179         /*
 180          * io objects are allocated from here.
 181          */
 182         mempool_t *io_pool;
 183         mempool_t *rq_pool;
 184
 185         struct bio_set *bs;
 186
 187         /*
 188          * Event handling.
 189          */
 190         atomic_t event_nr;
 191         wait_queue_head_t eventq;
 192         atomic_t uevent_seq;
 193         struct list_head uevent_list;
 194         spinlock_t uevent_lock; /* Protect access to uevent_list */
 195
 196         /*
 197          * freeze/thaw support require holding onto a super block
 198          */
 199         struct super_block *frozen_sb;
 200         struct block_device *bdev;
 201
 202         /* forced geometry settings */
 203         struct hd_geometry geometry;
 204
 205         /* kobject and completion */
 206         struct dm_kobject_holder kobj_holder;
 207
 208         /* zero-length flush that will be cloned and submitted to targets */
 209         struct bio flush_bio;
 210
 211         /* the number of internal suspends */
 212         unsigned internal_suspend_count;
 213
 214         struct dm_stats stats;
 215
 216         struct kthread_worker kworker;
 217         struct task_struct *kworker_task;
 218
 219         /* for request-based merge heuristic in dm_request_fn() */
 220         unsigned seq_rq_merge_deadline_usecs;
 221         int last_rq_rw;
 222         sector_t last_rq_pos;
 223         ktime_t last_rq_start_time;
 224
 225         /* for blk-mq request-based DM support */
 226         struct blk_mq_tag_set tag_set;
 227         bool use_blk_mq;
 228 };
 229
 230 #ifdef CONFIG_DM_MQ_DEFAULT
 231 static bool use_blk_mq = true;
 232 #else
 233 static bool use_blk_mq = false;
 234 #endif
 235
 236 bool dm_use_blk_mq(struct mapped_device *md)
 237 {
 238         return md->use_blk_mq;
 239 }
 240
 241 /*
 242  * For mempools pre-allocation at the table loading time.
 243  */
 244 struct dm_md_mempools {
 245         mempool_t *io_pool;
 246         mempool_t *rq_pool;
 247         struct bio_set *bs;
 248 };
 249
 250 struct table_device {
 251         struct list_head list;
 252         atomic_t count;
 253         struct dm_dev dm_dev;
 254 };
 255
 256 #define RESERVED_BIO_BASED_IOS          16
 257 #define RESERVED_REQUEST_BASED_IOS      256
 258 #define RESERVED_MAX_IOS                1024
 259 static struct kmem_cache *_io_cache;
 260 static struct kmem_cache *_rq_tio_cache;
 261 static struct kmem_cache *_rq_cache;
 262
 263 /*
 264  * Bio-based DM's mempools' reserved IOs set by the user.
 265  */
 266 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 267
 268 /*
 269  * Request-based DM's mempools' reserved IOs set by the user.
 270  */
 271 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 272
 273 static unsigned __dm_get_module_param(unsigned *module_param,
 274                                       unsigned def, unsigned max)
 275 {
 276         unsigned param = ACCESS_ONCE(*module_param);
 277         unsigned modified_param = 0;
 278
 279         if (!param)
 280                 modified_param = def;
 281         else if (param > max)
 282                 modified_param = max;
 283
 284         if (modified_param) {
 285                 (void)cmpxchg(module_param, param, modified_param);
 286                 param = modified_param;
 287         }
 288
 289         return param;
 290 }
 291
 292 unsigned dm_get_reserved_bio_based_ios(void)
 293 {
 294         return __dm_get_module_param(&reserved_bio_based_ios,
 295                                      RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 296 }
 297 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 298
 299 unsigned dm_get_reserved_rq_based_ios(void)
 300 {
 301         return __dm_get_module_param(&reserved_rq_based_ios,
 302                                      RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 303 }
 304 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 305
 306 static int __init local_init(void)
 307 {
 308         int r = -ENOMEM;
 309
 310         /* allocate a slab for the dm_ios */
 311         _io_cache = KMEM_CACHE(dm_io, 0);
 312         if (!_io_cache)
 313                 return r;
 314
 315         _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 316         if (!_rq_tio_cache)
 317                 goto out_free_io_cache;
 318
 319         _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
 320                                       __alignof__(struct request), 0, NULL);
 321         if (!_rq_cache)
 322                 goto out_free_rq_tio_cache;
 323
 324         r = dm_uevent_init();
 325         if (r)
 326                 goto out_free_rq_cache;
 327
 328         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 329         if (!deferred_remove_workqueue) {
 330                 r = -ENOMEM;
 331                 goto out_uevent_exit;
 332         }
 333
 334         _major = major;
 335         r = register_blkdev(_major, _name);
 336         if (r < 0)
 337                 goto out_free_workqueue;
 338
 339         if (!_major)
 340                 _major = r;
 341
 342         return 0;
 343
 344 out_free_workqueue:
 345         destroy_workqueue(deferred_remove_workqueue);
 346 out_uevent_exit:
 347         dm_uevent_exit();
 348 out_free_rq_cache:
 349         kmem_cache_destroy(_rq_cache);
 350 out_free_rq_tio_cache:
 351         kmem_cache_destroy(_rq_tio_cache);
 352 out_free_io_cache:
 353         kmem_cache_destroy(_io_cache);
 354
 355         return r;
 356 }
 357
 358 static void local_exit(void)
 359 {
 360         flush_scheduled_work();
 361         destroy_workqueue(deferred_remove_workqueue);
 362
 363         kmem_cache_destroy(_rq_cache);
 364         kmem_cache_destroy(_rq_tio_cache);
 365         kmem_cache_destroy(_io_cache);
 366         unregister_blkdev(_major, _name);
 367         dm_uevent_exit();
 368
 369         _major = 0;
 370
 371         DMINFO("cleaned up");
 372 }
 373
 374 static int (*_inits[])(void) __initdata = {
 375         local_init,
 376         dm_target_init,
 377         dm_linear_init,
 378         dm_stripe_init,
 379         dm_io_init,
 380         dm_kcopyd_init,
 381         dm_interface_init,
 382         dm_statistics_init,
 383 };
 384
 385 static void (*_exits[])(void) = {
 386         local_exit,
 387         dm_target_exit,
 388         dm_linear_exit,
 389         dm_stripe_exit,
 390         dm_io_exit,
 391         dm_kcopyd_exit,
 392         dm_interface_exit,
 393         dm_statistics_exit,
 394 };
 395
 396 static int __init dm_init(void)
 397 {
 398         const int count = ARRAY_SIZE(_inits);
 399
 400         int r, i;
 401
 402         for (i = 0; i < count; i++) {
 403                 r = _inits[i]();
 404                 if (r)
 405                         goto bad;
 406         }
 407
 408         return 0;
 409
 410       bad:
 411         while (i--)
 412                 _exits[i]();
 413
 414         return r;
 415 }
 416
 417 static void __exit dm_exit(void)
 418 {
 419         int i = ARRAY_SIZE(_exits);
 420
 421         while (i--)
 422                 _exits[i]();
 423
 424         /*
 425          * Should be empty by this point.
 426          */
 427         idr_destroy(&_minor_idr);
 428 }
 429
 430 /*
 431  * Block device functions
 432  */
 433 int dm_deleting_md(struct mapped_device *md)
 434 {
 435         return test_bit(DMF_DELETING, &md->flags);
 436 }
 437
 438 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 439 {
 440         struct mapped_device *md;
 441
 442         spin_lock(&_minor_lock);
 443
 444         md = bdev->bd_disk->private_data;
 445         if (!md)
 446                 goto out;
 447
 448         if (test_bit(DMF_FREEING, &md->flags) ||
 449             dm_deleting_md(md)) {
 450                 md = NULL;
 451                 goto out;
 452         }
 453
 454         dm_get(md);
 455         atomic_inc(&md->open_count);
 456 out:
 457         spin_unlock(&_minor_lock);
 458
 459         return md ? 0 : -ENXIO;
 460 }
 461
 462 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 463 {
 464         struct mapped_device *md;
 465
 466         spin_lock(&_minor_lock);
 467
 468         md = disk->private_data;
 469         if (WARN_ON(!md))
 470                 goto out;
 471
 472         if (atomic_dec_and_test(&md->open_count) &&
 473             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 474                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 475
 476         dm_put(md);
 477 out:
 478         spin_unlock(&_minor_lock);
 479 }
 480
 481 int dm_open_count(struct mapped_device *md)
 482 {
 483         return atomic_read(&md->open_count);
 484 }
 485
 486 /*
 487  * Guarantees nothing is using the device before it's deleted.
 488  */
 489 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 490 {
 491         int r = 0;
 492
 493         spin_lock(&_minor_lock);
 494
 495         if (dm_open_count(md)) {
 496                 r = -EBUSY;
 497                 if (mark_deferred)
 498                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 499         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 500                 r = -EEXIST;
 501         else
 502                 set_bit(DMF_DELETING, &md->flags);
 503
 504         spin_unlock(&_minor_lock);
 505
 506         return r;
 507 }
 508
 509 int dm_cancel_deferred_remove(struct mapped_device *md)
 510 {
 511         int r = 0;
 512
 513         spin_lock(&_minor_lock);
 514
 515         if (test_bit(DMF_DELETING, &md->flags))
 516                 r = -EBUSY;
 517         else
 518                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 519
 520         spin_unlock(&_minor_lock);
 521
 522         return r;
 523 }
 524
 525 static void do_deferred_remove(struct work_struct *w)
 526 {
 527         dm_deferred_remove();
 528 }
 529
 530 sector_t dm_get_size(struct mapped_device *md)
 531 {
 532         return get_capacity(md->disk);
 533 }
 534
 535 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 536 {
 537         return md->queue;
 538 }
 539
 540 struct dm_stats *dm_get_stats(struct mapped_device *md)
 541 {
 542         return &md->stats;
 543 }
 544
 545 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 546 {
 547         struct mapped_device *md = bdev->bd_disk->private_data;
 548
 549         return dm_get_geometry(md, geo);
 550 }
 551
 552 static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
 553                                   struct block_device **bdev,
 554                                   fmode_t *mode)
 555 {
 556         struct dm_target *tgt;
 557         struct dm_table *map;
 558         int srcu_idx, r;
 559
 560 retry:
 561         r = -ENOTTY;
 562         map = dm_get_live_table(md, &srcu_idx);
 563         if (!map || !dm_table_get_size(map))
 564                 goto out;
 565
 566         /* We only support devices that have a single target */
 567         if (dm_table_get_num_targets(map) != 1)
 568                 goto out;
 569
 570         tgt = dm_table_get_target(map, 0);
 571         if (!tgt->type->prepare_ioctl)
 572                 goto out;
 573
 574         if (dm_suspended_md(md)) {
 575                 r = -EAGAIN;
 576                 goto out;
 577         }
 578
 579         r = tgt->type->prepare_ioctl(tgt, bdev, mode);
 580         if (r < 0)
 581                 goto out;
 582
 583         bdgrab(*bdev);
 584         dm_put_live_table(md, srcu_idx);
 585         return r;
 586
 587 out:
 588         dm_put_live_table(md, srcu_idx);
 589         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 590                 msleep(10);
 591                 goto retry;
 592         }
 593         return r;
 594 }
 595
 596 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 597                         unsigned int cmd, unsigned long arg)
 598 {
 599         struct mapped_device *md = bdev->bd_disk->private_data;
 600         int r;
 601
 602         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 603         if (r < 0)
 604                 return r;
 605
 606         if (r > 0) {
 607                 /*
 608                  * Target determined this ioctl is being issued against
 609                  * a logical partition of the parent bdev; so extra
 610                  * validation is needed.
 611                  */
 612                 r = scsi_verify_blk_ioctl(NULL, cmd);
 613                 if (r)
 614                         goto out;
 615         }
 616
 617         r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 618 out:
 619         bdput(bdev);
 620         return r;
 621 }
 622
 623 static struct dm_io *alloc_io(struct mapped_device *md)
 624 {
 625         return mempool_alloc(md->io_pool, GFP_NOIO);
 626 }
 627
 628 static void free_io(struct mapped_device *md, struct dm_io *io)
 629 {
 630         mempool_free(io, md->io_pool);
 631 }
 632
 633 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 634 {
 635         bio_put(&tio->clone);
 636 }
 637
 638 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 639                                             gfp_t gfp_mask)
 640 {
 641         return mempool_alloc(md->io_pool, gfp_mask);
 642 }
 643
 644 static void free_rq_tio(struct dm_rq_target_io *tio)
 645 {
 646         mempool_free(tio, tio->md->io_pool);
 647 }
 648
 649 static struct request *alloc_clone_request(struct mapped_device *md,
 650                                            gfp_t gfp_mask)
 651 {
 652         return mempool_alloc(md->rq_pool, gfp_mask);
 653 }
 654
 655 static void free_clone_request(struct mapped_device *md, struct request *rq)
 656 {
 657         mempool_free(rq, md->rq_pool);
 658 }
 659
 660 static int md_in_flight(struct mapped_device *md)
 661 {
 662         return atomic_read(&md->pending[READ]) +
 663                atomic_read(&md->pending[WRITE]);
 664 }
 665
 666 static void start_io_acct(struct dm_io *io)
 667 {
 668         struct mapped_device *md = io->md;
 669         struct bio *bio = io->bio;
 670         int cpu;
 671         int rw = bio_data_dir(bio);
 672
 673         io->start_time = jiffies;
 674
 675         cpu = part_stat_lock();
 676         part_round_stats(cpu, &dm_disk(md)->part0);
 677         part_stat_unlock();
 678         atomic_set(&dm_disk(md)->part0.in_flight[rw],
 679                 atomic_inc_return(&md->pending[rw]));
 680
 681         if (unlikely(dm_stats_used(&md->stats)))
 682                 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 683                                     bio_sectors(bio), false, 0, &io->stats_aux);
 684 }
 685
 686 static void end_io_acct(struct dm_io *io)
 687 {
 688         struct mapped_device *md = io->md;
 689         struct bio *bio = io->bio;
 690         unsigned long duration = jiffies - io->start_time;
 691         int pending;
 692         int rw = bio_data_dir(bio);
 693
 694         generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 695
 696         if (unlikely(dm_stats_used(&md->stats)))
 697                 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 698                                     bio_sectors(bio), true, duration, &io->stats_aux);
 699
 700         /*
 701          * After this is decremented the bio must not be touched if it is
 702          * a flush.
 703          */
 704         pending = atomic_dec_return(&md->pending[rw]);
 705         atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 706         pending += atomic_read(&md->pending[rw^0x1]);
 707
 708         /* nudge anyone waiting on suspend queue */
 709         if (!pending)
 710                 wake_up(&md->wait);
 711 }
 712
 713 /*
 714  * Add the bio to the list of deferred io.
 715  */
 716 static void queue_io(struct mapped_device *md, struct bio *bio)
 717 {
 718         unsigned long flags;
 719
 720         spin_lock_irqsave(&md->deferred_lock, flags);
 721         bio_list_add(&md->deferred, bio);
 722         spin_unlock_irqrestore(&md->deferred_lock, flags);
 723         queue_work(md->wq, &md->work);
 724 }
 725
 726 /*
 727  * Everyone (including functions in this file), should use this
 728  * function to access the md->map field, and make sure they call
 729  * dm_put_live_table() when finished.
 730  */
 731 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 732 {
 733         *srcu_idx = srcu_read_lock(&md->io_barrier);
 734
 735         return srcu_dereference(md->map, &md->io_barrier);
 736 }
 737
 738 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 739 {
 740         srcu_read_unlock(&md->io_barrier, srcu_idx);
 741 }
 742
 743 void dm_sync_table(struct mapped_device *md)
 744 {
 745         synchronize_srcu(&md->io_barrier);
 746         synchronize_rcu_expedited();
 747 }
 748
 749 /*
 750  * A fast alternative to dm_get_live_table/dm_put_live_table.
 751  * The caller must not block between these two functions.
 752  */
 753 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 754 {
 755         rcu_read_lock();
 756         return rcu_dereference(md->map);
 757 }
 758
 759 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 760 {
 761         rcu_read_unlock();
 762 }
 763
 764 /*
 765  * Open a table device so we can use it as a map destination.
 766  */
 767 static int open_table_device(struct table_device *td, dev_t dev,
 768                              struct mapped_device *md)
 769 {
 770         static char *_claim_ptr = "I belong to device-mapper";
 771         struct block_device *bdev;
 772
 773         int r;
 774
 775         BUG_ON(td->dm_dev.bdev);
 776
 777         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 778         if (IS_ERR(bdev))
 779                 return PTR_ERR(bdev);
 780
 781         r = bd_link_disk_holder(bdev, dm_disk(md));
 782         if (r) {
 783                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 784                 return r;
 785         }
 786
 787         td->dm_dev.bdev = bdev;
 788         return 0;
 789 }
 790
 791 /*
 792  * Close a table device that we've been using.
 793  */
 794 static void close_table_device(struct table_device *td, struct mapped_device *md)
 795 {
 796         if (!td->dm_dev.bdev)
 797                 return;
 798
 799         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 800         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 801         td->dm_dev.bdev = NULL;
 802 }
 803
 804 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 805                                               fmode_t mode) {
 806         struct table_device *td;
 807
 808         list_for_each_entry(td, l, list)
 809                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 810                         return td;
 811
 812         return NULL;
 813 }
 814
 815 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 816                         struct dm_dev **result) {
 817         int r;
 818         struct table_device *td;
 819
 820         mutex_lock(&md->table_devices_lock);
 821         td = find_table_device(&md->table_devices, dev, mode);
 822         if (!td) {
 823                 td = kmalloc(sizeof(*td), GFP_KERNEL);
 824                 if (!td) {
 825                         mutex_unlock(&md->table_devices_lock);
 826                         return -ENOMEM;
 827                 }
 828
 829                 td->dm_dev.mode = mode;
 830                 td->dm_dev.bdev = NULL;
 831
 832                 if ((r = open_table_device(td, dev, md))) {
 833                         mutex_unlock(&md->table_devices_lock);
 834                         kfree(td);
 835                         return r;
 836                 }
 837
 838                 format_dev_t(td->dm_dev.name, dev);
 839
 840                 atomic_set(&td->count, 0);
 841                 list_add(&td->list, &md->table_devices);
 842         }
 843         atomic_inc(&td->count);
 844         mutex_unlock(&md->table_devices_lock);
 845
 846         *result = &td->dm_dev;
 847         return 0;
 848 }
 849 EXPORT_SYMBOL_GPL(dm_get_table_device);
 850
 851 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 852 {
 853         struct table_device *td = container_of(d, struct table_device, dm_dev);
 854
 855         mutex_lock(&md->table_devices_lock);
 856         if (atomic_dec_and_test(&td->count)) {
 857                 close_table_device(td, md);
 858                 list_del(&td->list);
 859                 kfree(td);
 860         }
 861         mutex_unlock(&md->table_devices_lock);
 862 }
 863 EXPORT_SYMBOL(dm_put_table_device);
 864
 865 static void free_table_devices(struct list_head *devices)
 866 {
 867         struct list_head *tmp, *next;
 868
 869         list_for_each_safe(tmp, next, devices) {
 870                 struct table_device *td = list_entry(tmp, struct table_device, list);
 871
 872                 DMWARN("dm_destroy: %s still exists with %d references",
 873                        td->dm_dev.name, atomic_read(&td->count));
 874                 kfree(td);
 875         }
 876 }
 877
 878 /*
 879  * Get the geometry associated with a dm device
 880  */
 881 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 882 {
 883         *geo = md->geometry;
 884
 885         return 0;
 886 }
 887
 888 /*
 889  * Set the geometry of a device.
 890  */
 891 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 892 {
 893         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 894
 895         if (geo->start > sz) {
 896                 DMWARN("Start sector is beyond the geometry limits.");
 897                 return -EINVAL;
 898         }
 899
 900         md->geometry = *geo;
 901
 902         return 0;
 903 }
 904
 905 /*-----------------------------------------------------------------
 906  * CRUD START:
 907  *   A more elegant soln is in the works that uses the queue
 908  *   merge fn, unfortunately there are a couple of changes to
 909  *   the block layer that I want to make for this.  So in the
 910  *   interests of getting something for people to use I give
 911  *   you this clearly demarcated crap.
 912  *---------------------------------------------------------------*/
 913
 914 static int __noflush_suspending(struct mapped_device *md)
 915 {
 916         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 917 }
 918
 919 /*
 920  * Decrements the number of outstanding ios that a bio has been
 921  * cloned into, completing the original io if necc.
 922  */
 923 static void dec_pending(struct dm_io *io, int error)
 924 {
 925         unsigned long flags;
 926         int io_error;
 927         struct bio *bio;
 928         struct mapped_device *md = io->md;
 929
 930         /* Push-back supersedes any I/O errors */
 931         if (unlikely(error)) {
 932                 spin_lock_irqsave(&io->endio_lock, flags);
 933                 if (!(io->error > 0 && __noflush_suspending(md)))
 934                         io->error = error;
 935                 spin_unlock_irqrestore(&io->endio_lock, flags);
 936         }
 937
 938         if (atomic_dec_and_test(&io->io_count)) {
 939                 if (io->error == DM_ENDIO_REQUEUE) {
 940                         /*
 941                          * Target requested pushing back the I/O.
 942                          */
 943                         spin_lock_irqsave(&md->deferred_lock, flags);
 944                         if (__noflush_suspending(md))
 945                                 bio_list_add_head(&md->deferred, io->bio);
 946                         else
 947                                 /* noflush suspend was interrupted. */
 948                                 io->error = -EIO;
 949                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 950                 }
 951
 952                 io_error = io->error;
 953                 bio = io->bio;
 954                 end_io_acct(io);
 955                 free_io(md, io);
 956
 957                 if (io_error == DM_ENDIO_REQUEUE)
 958                         return;
 959
 960                 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
 961                         /*
 962                          * Preflush done for flush with data, reissue
 963                          * without REQ_FLUSH.
 964                          */
 965                         bio->bi_rw &= ~REQ_FLUSH;
 966                         queue_io(md, bio);
 967                 } else {
 968                         /* done with normal IO or empty flush */
 969                         trace_block_bio_complete(md->queue, bio, io_error);
 970                         bio->bi_error = io_error;
 971                         bio_endio(bio);
 972                 }
 973         }
 974 }
 975
 976 static void disable_write_same(struct mapped_device *md)
 977 {
 978         struct queue_limits *limits = dm_get_queue_limits(md);
 979
 980         /* device doesn't really support WRITE SAME, disable it */
 981         limits->max_write_same_sectors = 0;
 982 }
 983
 984 static void clone_endio(struct bio *bio)
 985 {
 986         int error = bio->bi_error;
 987         int r = error;
 988         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 989         struct dm_io *io = tio->io;
 990         struct mapped_device *md = tio->io->md;
 991         dm_endio_fn endio = tio->ti->type->end_io;
 992
 993         if (endio) {
 994                 r = endio(tio->ti, bio, error);
 995                 if (r < 0 || r == DM_ENDIO_REQUEUE)
 996                         /*
 997                          * error and requeue request are handled
 998                          * in dec_pending().
 999                          */
1000                         error = r;
1001                 else if (r == DM_ENDIO_INCOMPLETE)
1002                         /* The target will handle the io */
1003                         return;
1004                 else if (r) {
1005                         DMWARN("unimplemented target endio return value: %d", r);
1006                         BUG();
1007                 }
1008         }
1009
1010         if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
1011                      !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
1012                 disable_write_same(md);
1013
1014         free_tio(md, tio);
1015         dec_pending(io, error);
1016 }
1017
1018 /*
1019  * Partial completion handling for request-based dm
1020  */
1021 static void end_clone_bio(struct bio *clone)
1022 {
1023         struct dm_rq_clone_bio_info *info =
1024                 container_of(clone, struct dm_rq_clone_bio_info, clone);
1025         struct dm_rq_target_io *tio = info->tio;
1026         struct bio *bio = info->orig;
1027         unsigned int nr_bytes = info->orig->bi_iter.bi_size;
1028         int error = clone->bi_error;
1029
1030         bio_put(clone);
1031
1032         if (tio->error)
1033                 /*
1034                  * An error has already been detected on the request.
1035                  * Once error occurred, just let clone->end_io() handle
1036                  * the remainder.
1037                  */
1038                 return;
1039         else if (error) {
1040                 /*
1041                  * Don't notice the error to the upper layer yet.
1042                  * The error handling decision is made by the target driver,
1043                  * when the request is completed.
1044                  */
1045                 tio->error = error;
1046                 return;
1047         }
1048
1049         /*
1050          * I/O for the bio successfully completed.
1051          * Notice the data completion to the upper layer.
1052          */
1053
1054         /*
1055          * bios are processed from the head of the list.
1056          * So the completing bio should always be rq->bio.
1057          * If it's not, something wrong is happening.
1058          */
1059         if (tio->orig->bio != bio)
1060                 DMERR("bio completion is going in the middle of the request");
1061
1062         /*
1063          * Update the original request.
1064          * Do not use blk_end_request() here, because it may complete
1065          * the original request before the clone, and break the ordering.
1066          */
1067         blk_update_request(tio->orig, 0, nr_bytes);
1068 }
1069
1070 static struct dm_rq_target_io *tio_from_request(struct request *rq)
1071 {
1072         return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
1073 }
1074
1075 static void rq_end_stats(struct mapped_device *md, struct request *orig)
1076 {
1077         if (unlikely(dm_stats_used(&md->stats))) {
1078                 struct dm_rq_target_io *tio = tio_from_request(orig);
1079                 tio->duration_jiffies = jiffies - tio->duration_jiffies;
1080                 dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
1081                                     tio->n_sectors, true, tio->duration_jiffies,
1082                                     &tio->stats_aux);
1083         }
1084 }
1085
1086 /*
1087  * Don't touch any member of the md after calling this function because
1088  * the md may be freed in dm_put() at the end of this function.
1089  * Or do dm_get() before calling this function and dm_put() later.
1090  */
1091 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1092 {
1093         atomic_dec(&md->pending[rw]);
1094
1095         /* nudge anyone waiting on suspend queue */
1096         if (!md_in_flight(md))
1097                 wake_up(&md->wait);
1098
1099         /*
1100          * Run this off this callpath, as drivers could invoke end_io while
1101          * inside their request_fn (and holding the queue lock). Calling
1102          * back into ->request_fn() could deadlock attempting to grab the
1103          * queue lock again.
1104          */
1105         if (!md->queue->mq_ops && run_queue)
1106                 blk_run_queue_async(md->queue);
1107
1108         /*
1109          * dm_put() must be at the end of this function. See the comment above
1110          */
1111         dm_put(md);
1112 }
1113
1114 static void free_rq_clone(struct request *clone)
1115 {
1116         struct dm_rq_target_io *tio = clone->end_io_data;
1117         struct mapped_device *md = tio->md;
1118
1119         blk_rq_unprep_clone(clone);
1120
1121         if (md->type == DM_TYPE_MQ_REQUEST_BASED)
1122                 /* stacked on blk-mq queue(s) */
1123                 tio->ti->type->release_clone_rq(clone);
1124         else if (!md->queue->mq_ops)
1125                 /* request_fn queue stacked on request_fn queue(s) */
1126                 free_clone_request(md, clone);
1127         /*
1128          * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
1129          * no need to call free_clone_request() because we leverage blk-mq by
1130          * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
1131          */
1132
1133         if (!md->queue->mq_ops)
1134                 free_rq_tio(tio);
1135 }
1136
1137 /*
1138  * Complete the clone and the original request.
1139  * Must be called without clone's queue lock held,
1140  * see end_clone_request() for more details.
1141  */
1142 static void dm_end_request(struct request *clone, int error)
1143 {
1144         int rw = rq_data_dir(clone);
1145         struct dm_rq_target_io *tio = clone->end_io_data;
1146         struct mapped_device *md = tio->md;
1147         struct request *rq = tio->orig;
1148
1149         if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
1150                 rq->errors = clone->errors;
1151                 rq->resid_len = clone->resid_len;
1152
1153                 if (rq->sense)
1154                         /*
1155                          * We are using the sense buffer of the original
1156                          * request.
1157                          * So setting the length of the sense data is enough.
1158                          */
1159                         rq->sense_len = clone->sense_len;
1160         }
1161
1162         free_rq_clone(clone);
1163         rq_end_stats(md, rq);
1164         if (!rq->q->mq_ops)
1165                 blk_end_request_all(rq, error);
1166         else
1167                 blk_mq_end_request(rq, error);
1168         rq_completed(md, rw, true);
1169 }
1170
1171 static void dm_unprep_request(struct request *rq)
1172 {
1173         struct dm_rq_target_io *tio = tio_from_request(rq);
1174         struct request *clone = tio->clone;
1175
1176         if (!rq->q->mq_ops) {
1177                 rq->special = NULL;
1178                 rq->cmd_flags &= ~REQ_DONTPREP;
1179         }
1180
1181         if (clone)
1182                 free_rq_clone(clone);
1183         else if (!tio->md->queue->mq_ops)
1184                 free_rq_tio(tio);
1185 }
1186
1187 /*
1188  * Requeue the original request of a clone.
1189  */
1190 static void old_requeue_request(struct request *rq)
1191 {
1192         struct request_queue *q = rq->q;
1193         unsigned long flags;
1194
1195         spin_lock_irqsave(q->queue_lock, flags);
1196         blk_requeue_request(q, rq);
1197         blk_run_queue_async(q);
1198         spin_unlock_irqrestore(q->queue_lock, flags);
1199 }
1200
1201 static void dm_requeue_original_request(struct mapped_device *md,
1202                                         struct request *rq)
1203 {
1204         int rw = rq_data_dir(rq);
1205
1206         dm_unprep_request(rq);
1207
1208         rq_end_stats(md, rq);
1209         if (!rq->q->mq_ops)
1210                 old_requeue_request(rq);
1211         else {
1212                 blk_mq_requeue_request(rq);
1213                 blk_mq_kick_requeue_list(rq->q);
1214         }
1215
1216         rq_completed(md, rw, false);
1217 }
1218
1219 static void old_stop_queue(struct request_queue *q)
1220 {
1221         unsigned long flags;
1222
1223         if (blk_queue_stopped(q))
1224                 return;
1225
1226         spin_lock_irqsave(q->queue_lock, flags);
1227         blk_stop_queue(q);
1228         spin_unlock_irqrestore(q->queue_lock, flags);
1229 }
1230
1231 static void stop_queue(struct request_queue *q)
1232 {
1233         if (!q->mq_ops)
1234                 old_stop_queue(q);
1235         else
1236                 blk_mq_stop_hw_queues(q);
1237 }
1238
1239 static void old_start_queue(struct request_queue *q)
1240 {
1241         unsigned long flags;
1242
1243         spin_lock_irqsave(q->queue_lock, flags);
1244         if (blk_queue_stopped(q))
1245                 blk_start_queue(q);
1246         spin_unlock_irqrestore(q->queue_lock, flags);
1247 }
1248
1249 static void start_queue(struct request_queue *q)
1250 {
1251         if (!q->mq_ops)
1252                 old_start_queue(q);
1253         else
1254                 blk_mq_start_stopped_hw_queues(q, true);
1255 }
1256
1257 static void dm_done(struct request *clone, int error, bool mapped)
1258 {
1259         int r = error;
1260         struct dm_rq_target_io *tio = clone->end_io_data;
1261         dm_request_endio_fn rq_end_io = NULL;
1262
1263         if (tio->ti) {
1264                 rq_end_io = tio->ti->type->rq_end_io;
1265
1266                 if (mapped && rq_end_io)
1267                         r = rq_end_io(tio->ti, clone, error, &tio->info);
1268         }
1269
1270         if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
1271                      !clone->q->limits.max_write_same_sectors))
1272                 disable_write_same(tio->md);
1273
1274         if (r <= 0)
1275                 /* The target wants to complete the I/O */
1276                 dm_end_request(clone, r);
1277         else if (r == DM_ENDIO_INCOMPLETE)
1278                 /* The target will handle the I/O */
1279                 return;
1280         else if (r == DM_ENDIO_REQUEUE)
1281                 /* The target wants to requeue the I/O */
1282                 dm_requeue_original_request(tio->md, tio->orig);
1283         else {
1284                 DMWARN("unimplemented target endio return value: %d", r);
1285                 BUG();
1286         }
1287 }
1288
1289 /*
1290  * Request completion handler for request-based dm
1291  */
1292 static void dm_softirq_done(struct request *rq)
1293 {
1294         bool mapped = true;
1295         struct dm_rq_target_io *tio = tio_from_request(rq);
1296         struct request *clone = tio->clone;
1297         int rw;
1298
1299         if (!clone) {
1300                 rq_end_stats(tio->md, rq);
1301                 rw = rq_data_dir(rq);
1302                 if (!rq->q->mq_ops) {
1303                         blk_end_request_all(rq, tio->error);
1304                         rq_completed(tio->md, rw, false);
1305                         free_rq_tio(tio);
1306                 } else {
1307                         blk_mq_end_request(rq, tio->error);
1308                         rq_completed(tio->md, rw, false);
1309                 }
1310                 return;
1311         }
1312
1313         if (rq->cmd_flags & REQ_FAILED)
1314                 mapped = false;
1315
1316         dm_done(clone, tio->error, mapped);
1317 }
1318
1319 /*
1320  * Complete the clone and the original request with the error status
1321  * through softirq context.
1322  */
1323 static void dm_complete_request(struct request *rq, int error)
1324 {
1325         struct dm_rq_target_io *tio = tio_from_request(rq);
1326
1327         tio->error = error;
1328         if (!rq->q->mq_ops)
1329                 blk_complete_request(rq);
1330         else
1331                 blk_mq_complete_request(rq, error);
1332 }
1333
1334 /*
1335  * Complete the not-mapped clone and the original request with the error status
1336  * through softirq context.
1337  * Target's rq_end_io() function isn't called.
1338  * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
1339  */
1340 static void dm_kill_unmapped_request(struct request *rq, int error)
1341 {
1342         rq->cmd_flags |= REQ_FAILED;
1343         dm_complete_request(rq, error);
1344 }
1345
1346 /*
1347  * Called with the clone's queue lock held (for non-blk-mq)
1348  */
1349 static void end_clone_request(struct request *clone, int error)
1350 {
1351         struct dm_rq_target_io *tio = clone->end_io_data;
1352
1353         if (!clone->q->mq_ops) {
1354                 /*
1355                  * For just cleaning up the information of the queue in which
1356                  * the clone was dispatched.
1357                  * The clone is *NOT* freed actually here because it is alloced
1358                  * from dm own mempool (REQ_ALLOCED isn't set).
1359                  */
1360                 __blk_put_request(clone->q, clone);
1361         }
1362
1363         /*
1364          * Actual request completion is done in a softirq context which doesn't
1365          * hold the clone's queue lock.  Otherwise, deadlock could occur because:
1366          *     - another request may be submitted by the upper level driver
1367          *       of the stacking during the completion
1368          *     - the submission which requires queue lock may be done
1369          *       against this clone's queue
1370          */
1371         dm_complete_request(tio->orig, error);
1372 }
1373
1374 /*
1375  * Return maximum size of I/O possible at the supplied sector up to the current
1376  * target boundary.
1377  */
1378 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1379 {
1380         sector_t target_offset = dm_target_offset(ti, sector);
1381
1382         return ti->len - target_offset;
1383 }
1384
1385 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1386 {
1387         sector_t len = max_io_len_target_boundary(sector, ti);
1388         sector_t offset, max_len;
1389
1390         /*
1391          * Does the target need to split even further?
1392          */
1393         if (ti->max_io_len) {
1394                 offset = dm_target_offset(ti, sector);
1395                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1396                         max_len = sector_div(offset, ti->max_io_len);
1397                 else
1398                         max_len = offset & (ti->max_io_len - 1);
1399                 max_len = ti->max_io_len - max_len;
1400
1401                 if (len > max_len)
1402                         len = max_len;
1403         }
1404
1405         return len;
1406 }
1407
1408 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1409 {
1410         if (len > UINT_MAX) {
1411                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1412                       (unsigned long long)len, UINT_MAX);
1413                 ti->error = "Maximum size of target IO is too large";
1414                 return -EINVAL;
1415         }
1416
1417         ti->max_io_len = (uint32_t) len;
1418
1419         return 0;
1420 }
1421 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1422
1423 /*
1424  * A target may call dm_accept_partial_bio only from the map routine.  It is
1425  * allowed for all bio types except REQ_FLUSH.
1426  *
1427  * dm_accept_partial_bio informs the dm that the target only wants to process
1428  * additional n_sectors sectors of the bio and the rest of the data should be
1429  * sent in a next bio.
1430  *
1431  * A diagram that explains the arithmetics:
1432  * +--------------------+---------------+-------+
1433  * |         1          |       2       |   3   |
1434  * +--------------------+---------------+-------+
1435  *
1436  * <-------------- *tio->len_ptr --------------->
1437  *                      <------- bi_size ------->
1438  *                      <-- n_sectors -->
1439  *
1440  * Region 1 was already iterated over with bio_advance or similar function.
1441  *      (it may be empty if the target doesn't use bio_advance)
1442  * Region 2 is the remaining bio size that the target wants to process.
1443  *      (it may be empty if region 1 is non-empty, although there is no reason
1444  *       to make it empty)
1445  * The target requires that region 3 is to be sent in the next bio.
1446  *
1447  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1448  * the partially processed part (the sum of regions 1+2) must be the same for all
1449  * copies of the bio.
1450  */
1451 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1452 {
1453         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1454         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1455         BUG_ON(bio->bi_rw & REQ_FLUSH);
1456         BUG_ON(bi_size > *tio->len_ptr);
1457         BUG_ON(n_sectors > bi_size);
1458         *tio->len_ptr -= bi_size - n_sectors;
1459         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1460 }
1461 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1462
1463 static void __map_bio(struct dm_target_io *tio)
1464 {
1465         int r;
1466         sector_t sector;
1467         struct mapped_device *md;
1468         struct bio *clone = &tio->clone;
1469         struct dm_target *ti = tio->ti;
1470
1471         clone->bi_end_io = clone_endio;
1472
1473         /*
1474          * Map the clone.  If r == 0 we don't need to do
1475          * anything, the target has assumed ownership of
1476          * this io.
1477          */
1478         atomic_inc(&tio->io->io_count);
1479         sector = clone->bi_iter.bi_sector;
1480         r = ti->type->map(ti, clone);
1481         if (r == DM_MAPIO_REMAPPED) {
1482                 /* the bio has been remapped so dispatch it */
1483
1484                 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1485                                       tio->io->bio->bi_bdev->bd_dev, sector);
1486
1487                 generic_make_request(clone);
1488         } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1489                 /* error the io and bail out, or requeue it if needed */
1490                 md = tio->io->md;
1491                 dec_pending(tio->io, r);
1492                 free_tio(md, tio);
1493         } else if (r != DM_MAPIO_SUBMITTED) {
1494                 DMWARN("unimplemented target map return value: %d", r);
1495                 BUG();
1496         }
1497 }
1498
1499 struct clone_info {
1500         struct mapped_device *md;
1501         struct dm_table *map;
1502         struct bio *bio;
1503         struct dm_io *io;
1504         sector_t sector;
1505         unsigned sector_count;
1506 };
1507
1508 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1509 {
1510         bio->bi_iter.bi_sector = sector;
1511         bio->bi_iter.bi_size = to_bytes(len);
1512 }
1513
1514 /*
1515  * Creates a bio that consists of range of complete bvecs.
1516  */
1517 static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1518                       sector_t sector, unsigned len)
1519 {
1520         struct bio *clone = &tio->clone;
1521
1522         __bio_clone_fast(clone, bio);
1523
1524         if (bio_integrity(bio))
1525                 bio_integrity_clone(clone, bio, GFP_NOIO);
1526
1527         bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1528         clone->bi_iter.bi_size = to_bytes(len);
1529
1530         if (bio_integrity(bio))
1531                 bio_integrity_trim(clone, 0, len);
1532 }
1533
1534 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1535                                       struct dm_target *ti,
1536                                       unsigned target_bio_nr)
1537 {
1538         struct dm_target_io *tio;
1539         struct bio *clone;
1540
1541         clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1542         tio = container_of(clone, struct dm_target_io, clone);
1543
1544         tio->io = ci->io;
1545         tio->ti = ti;
1546         tio->target_bio_nr = target_bio_nr;
1547
1548         return tio;
1549 }
1550
1551 static void __clone_and_map_simple_bio(struct clone_info *ci,
1552                                        struct dm_target *ti,
1553                                        unsigned target_bio_nr, unsigned *len)
1554 {
1555         struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1556         struct bio *clone = &tio->clone;
1557
1558         tio->len_ptr = len;
1559
1560         __bio_clone_fast(clone, ci->bio);
1561         if (len)
1562                 bio_setup_sector(clone, ci->sector, *len);
1563
1564         __map_bio(tio);
1565 }
1566
1567 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1568                                   unsigned num_bios, unsigned *len)
1569 {
1570         unsigned target_bio_nr;
1571
1572         for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1573                 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1574 }
1575
1576 static int __send_empty_flush(struct clone_info *ci)
1577 {
1578         unsigned target_nr = 0;
1579         struct dm_target *ti;
1580
1581         BUG_ON(bio_has_data(ci->bio));
1582         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1583                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1584
1585         return 0;
1586 }
1587
1588 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1589                                      sector_t sector, unsigned *len)
1590 {
1591         struct bio *bio = ci->bio;
1592         struct dm_target_io *tio;
1593         unsigned target_bio_nr;
1594         unsigned num_target_bios = 1;
1595
1596         /*
1597          * Does the target want to receive duplicate copies of the bio?
1598          */
1599         if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1600                 num_target_bios = ti->num_write_bios(ti, bio);
1601
1602         for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1603                 tio = alloc_tio(ci, ti, target_bio_nr);
1604                 tio->len_ptr = len;
1605                 clone_bio(tio, bio, sector, *len);
1606                 __map_bio(tio);
1607         }
1608 }
1609
1610 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1611
1612 static unsigned get_num_discard_bios(struct dm_target *ti)
1613 {
1614         return ti->num_discard_bios;
1615 }
1616
1617 static unsigned get_num_write_same_bios(struct dm_target *ti)
1618 {
1619         return ti->num_write_same_bios;
1620 }
1621
1622 typedef bool (*is_split_required_fn)(struct dm_target *ti);
1623
1624 static bool is_split_required_for_discard(struct dm_target *ti)
1625 {
1626         return ti->split_discard_bios;
1627 }
1628
1629 static int __send_changing_extent_only(struct clone_info *ci,
1630                                        get_num_bios_fn get_num_bios,
1631                                        is_split_required_fn is_split_required)
1632 {
1633         struct dm_target *ti;
1634         unsigned len;
1635         unsigned num_bios;
1636
1637         do {
1638                 ti = dm_table_find_target(ci->map, ci->sector);
1639                 if (!dm_target_is_valid(ti))
1640                         return -EIO;
1641
1642                 /*
1643                  * Even though the device advertised support for this type of
1644                  * request, that does not mean every target supports it, and
1645                  * reconfiguration might also have changed that since the
1646                  * check was performed.
1647                  */
1648                 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1649                 if (!num_bios)
1650                         return -EOPNOTSUPP;
1651
1652                 if (is_split_required && !is_split_required(ti))
1653                         len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1654                 else
1655                         len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1656
1657                 __send_duplicate_bios(ci, ti, num_bios, &len);
1658
1659                 ci->sector += len;
1660         } while (ci->sector_count -= len);
1661
1662         return 0;
1663 }
1664
1665 static int __send_discard(struct clone_info *ci)
1666 {
1667         return __send_changing_extent_only(ci, get_num_discard_bios,
1668                                            is_split_required_for_discard);
1669 }
1670
1671 static int __send_write_same(struct clone_info *ci)
1672 {
1673         return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1674 }
1675
1676 /*
1677  * Select the correct strategy for processing a non-flush bio.
1678  */
1679 static int __split_and_process_non_flush(struct clone_info *ci)
1680 {
1681         struct bio *bio = ci->bio;
1682         struct dm_target *ti;
1683         unsigned len;
1684
1685         if (unlikely(bio->bi_rw & REQ_DISCARD))
1686                 return __send_discard(ci);
1687         else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1688                 return __send_write_same(ci);
1689
1690         ti = dm_table_find_target(ci->map, ci->sector);
1691         if (!dm_target_is_valid(ti))
1692                 return -EIO;
1693
1694         len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1695
1696         __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1697
1698         ci->sector += len;
1699         ci->sector_count -= len;
1700
1701         return 0;
1702 }
1703
1704 /*
1705  * Entry point to split a bio into clones and submit them to the targets.
1706  */
1707 static void __split_and_process_bio(struct mapped_device *md,
1708                                     struct dm_table *map, struct bio *bio)
1709 {
1710         struct clone_info ci;
1711         int error = 0;
1712
1713         if (unlikely(!map)) {
1714                 bio_io_error(bio);
1715                 return;
1716         }
1717
1718         ci.map = map;
1719         ci.md = md;
1720         ci.io = alloc_io(md);
1721         ci.io->error = 0;
1722         atomic_set(&ci.io->io_count, 1);
1723         ci.io->bio = bio;
1724         ci.io->md = md;
1725         spin_lock_init(&ci.io->endio_lock);
1726         ci.sector = bio->bi_iter.bi_sector;
1727
1728         start_io_acct(ci.io);
1729
1730         if (bio->bi_rw & REQ_FLUSH) {
1731                 ci.bio = &ci.md->flush_bio;
1732                 ci.sector_count = 0;
1733                 error = __send_empty_flush(&ci);
1734                 /* dec_pending submits any data associated with flush */
1735         } else {
1736                 ci.bio = bio;
1737                 ci.sector_count = bio_sectors(bio);
1738                 while (ci.sector_count && !error)
1739                         error = __split_and_process_non_flush(&ci);
1740         }
1741
1742         /* drop the extra reference count */
1743         dec_pending(ci.io, error);
1744 }
1745 /*-----------------------------------------------------------------
1746  * CRUD END
1747  *---------------------------------------------------------------*/
1748
1749 /*
1750  * The request function that just remaps the bio built up by
1751  * dm_merge_bvec.
1752  */
1753 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1754 {
1755         int rw = bio_data_dir(bio);
1756         struct mapped_device *md = q->queuedata;
1757         int srcu_idx;
1758         struct dm_table *map;
1759
1760         map = dm_get_live_table(md, &srcu_idx);
1761
1762         generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1763
1764         /* if we're suspended, we have to queue this io for later */
1765         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1766                 dm_put_live_table(md, srcu_idx);
1767
1768                 if (bio_rw(bio) != READA)
1769                         queue_io(md, bio);
1770                 else
1771                         bio_io_error(bio);
1772                 return BLK_QC_T_NONE;
1773         }
1774
1775         __split_and_process_bio(md, map, bio);
1776         dm_put_live_table(md, srcu_idx);
1777         return BLK_QC_T_NONE;
1778 }
1779
1780 int dm_request_based(struct mapped_device *md)
1781 {
1782         return blk_queue_stackable(md->queue);
1783 }
1784
1785 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1786 {
1787         int r;
1788
1789         if (blk_queue_io_stat(clone->q))
1790                 clone->cmd_flags |= REQ_IO_STAT;
1791
1792         clone->start_time = jiffies;
1793         r = blk_insert_cloned_request(clone->q, clone);
1794         if (r)
1795                 /* must complete clone in terms of original request */
1796                 dm_complete_request(rq, r);
1797 }
1798
1799 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1800                                  void *data)
1801 {
1802         struct dm_rq_target_io *tio = data;
1803         struct dm_rq_clone_bio_info *info =
1804                 container_of(bio, struct dm_rq_clone_bio_info, clone);
1805
1806         info->orig = bio_orig;
1807         info->tio = tio;
1808         bio->bi_end_io = end_clone_bio;
1809
1810         return 0;
1811 }
1812
1813 static int setup_clone(struct request *clone, struct request *rq,
1814                        struct dm_rq_target_io *tio, gfp_t gfp_mask)
1815 {
1816         int r;
1817
1818         r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
1819                               dm_rq_bio_constructor, tio);
1820         if (r)
1821                 return r;
1822
1823         clone->cmd = rq->cmd;
1824         clone->cmd_len = rq->cmd_len;
1825         clone->sense = rq->sense;
1826         clone->end_io = end_clone_request;
1827         clone->end_io_data = tio;
1828
1829         tio->clone = clone;
1830
1831         return 0;
1832 }
1833
1834 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1835                                 struct dm_rq_target_io *tio, gfp_t gfp_mask)
1836 {
1837         /*
1838          * Do not allocate a clone if tio->clone was already set
1839          * (see: dm_mq_queue_rq).
1840          */
1841         bool alloc_clone = !tio->clone;
1842         struct request *clone;
1843
1844         if (alloc_clone) {
1845                 clone = alloc_clone_request(md, gfp_mask);
1846                 if (!clone)
1847                         return NULL;
1848         } else
1849                 clone = tio->clone;
1850
1851         blk_rq_init(NULL, clone);
1852         if (setup_clone(clone, rq, tio, gfp_mask)) {
1853                 /* -ENOMEM */
1854                 if (alloc_clone)
1855                         free_clone_request(md, clone);
1856                 return NULL;
1857         }
1858
1859         return clone;
1860 }
1861
1862 static void map_tio_request(struct kthread_work *work);
1863
1864 static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
1865                      struct mapped_device *md)
1866 {
1867         tio->md = md;
1868         tio->ti = NULL;
1869         tio->clone = NULL;
1870         tio->orig = rq;
1871         tio->error = 0;
1872         memset(&tio->info, 0, sizeof(tio->info));
1873         if (md->kworker_task)
1874                 init_kthread_work(&tio->work, map_tio_request);
1875 }
1876
1877 static struct dm_rq_target_io *prep_tio(struct request *rq,
1878                                         struct mapped_device *md, gfp_t gfp_mask)
1879 {
1880         struct dm_rq_target_io *tio;
1881         int srcu_idx;
1882         struct dm_table *table;
1883
1884         tio = alloc_rq_tio(md, gfp_mask);
1885         if (!tio)
1886                 return NULL;
1887
1888         init_tio(tio, rq, md);
1889
1890         table = dm_get_live_table(md, &srcu_idx);
1891         if (!dm_table_mq_request_based(table)) {
1892                 if (!clone_rq(rq, md, tio, gfp_mask)) {
1893                         dm_put_live_table(md, srcu_idx);
1894                         free_rq_tio(tio);
1895                         return NULL;
1896                 }
1897         }
1898         dm_put_live_table(md, srcu_idx);
1899
1900         return tio;
1901 }
1902
1903 /*
1904  * Called with the queue lock held.
1905  */
1906 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1907 {
1908         struct mapped_device *md = q->queuedata;
1909         struct dm_rq_target_io *tio;
1910
1911         if (unlikely(rq->special)) {
1912                 DMWARN("Already has something in rq->special.");
1913                 return BLKPREP_KILL;
1914         }
1915
1916         tio = prep_tio(rq, md, GFP_ATOMIC);
1917         if (!tio)
1918                 return BLKPREP_DEFER;
1919
1920         rq->special = tio;
1921         rq->cmd_flags |= REQ_DONTPREP;
1922
1923         return BLKPREP_OK;
1924 }
1925
1926 /*
1927  * Returns:
1928  * 0                : the request has been processed
1929  * DM_MAPIO_REQUEUE : the original request needs to be requeued
1930  * < 0              : the request was completed due to failure
1931  */
1932 static int map_request(struct dm_rq_target_io *tio, struct request *rq,
1933                        struct mapped_device *md)
1934 {
1935         int r;
1936         struct dm_target *ti = tio->ti;
1937         struct request *clone = NULL;
1938
1939         if (tio->clone) {
1940                 clone = tio->clone;
1941                 r = ti->type->map_rq(ti, clone, &tio->info);
1942         } else {
1943                 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
1944                 if (r < 0) {
1945                         /* The target wants to complete the I/O */
1946                         dm_kill_unmapped_request(rq, r);
1947                         return r;
1948                 }
1949                 if (r != DM_MAPIO_REMAPPED)
1950                         return r;
1951                 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
1952                         /* -ENOMEM */
1953                         ti->type->release_clone_rq(clone);
1954                         return DM_MAPIO_REQUEUE;
1955                 }
1956         }
1957
1958         switch (r) {
1959         case DM_MAPIO_SUBMITTED:
1960                 /* The target has taken the I/O to submit by itself later */
1961                 break;
1962         case DM_MAPIO_REMAPPED:
1963                 /* The target has remapped the I/O so dispatch it */
1964                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1965                                      blk_rq_pos(rq));
1966                 dm_dispatch_clone_request(clone, rq);
1967                 break;
1968         case DM_MAPIO_REQUEUE:
1969                 /* The target wants to requeue the I/O */
1970                 dm_requeue_original_request(md, tio->orig);
1971                 break;
1972         default:
1973                 if (r > 0) {
1974                         DMWARN("unimplemented target map return value: %d", r);
1975                         BUG();
1976                 }
1977
1978                 /* The target wants to complete the I/O */
1979                 dm_kill_unmapped_request(rq, r);
1980                 return r;
1981         }
1982
1983         return 0;
1984 }
1985
1986 static void map_tio_request(struct kthread_work *work)
1987 {
1988         struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
1989         struct request *rq = tio->orig;
1990         struct mapped_device *md = tio->md;
1991
1992         if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
1993                 dm_requeue_original_request(md, rq);
1994 }
1995
1996 static void dm_start_request(struct mapped_device *md, struct request *orig)
1997 {
1998         if (!orig->q->mq_ops)
1999                 blk_start_request(orig);
2000         else
2001                 blk_mq_start_request(orig);
2002         atomic_inc(&md->pending[rq_data_dir(orig)]);
2003
2004         if (md->seq_rq_merge_deadline_usecs) {
2005                 md->last_rq_pos = rq_end_sector(orig);
2006                 md->last_rq_rw = rq_data_dir(orig);
2007                 md->last_rq_start_time = ktime_get();
2008         }
2009
2010         if (unlikely(dm_stats_used(&md->stats))) {
2011                 struct dm_rq_target_io *tio = tio_from_request(orig);
2012                 tio->duration_jiffies = jiffies;
2013                 tio->n_sectors = blk_rq_sectors(orig);
2014                 dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
2015                                     tio->n_sectors, false, 0, &tio->stats_aux);
2016         }
2017
2018         /*
2019          * Hold the md reference here for the in-flight I/O.
2020          * We can't rely on the reference count by device opener,
2021          * because the device may be closed during the request completion
2022          * when all bios are completed.
2023          * See the comment in rq_completed() too.
2024          */
2025         dm_get(md);
2026 }
2027
2028 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
2029
2030 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
2031 {
2032         return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
2033 }
2034
2035 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
2036                                                      const char *buf, size_t count)
2037 {
2038         unsigned deadline;
2039
2040         if (!dm_request_based(md) || md->use_blk_mq)
2041                 return count;
2042
2043         if (kstrtouint(buf, 10, &deadline))
2044                 return -EINVAL;
2045
2046         if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
2047                 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
2048
2049         md->seq_rq_merge_deadline_usecs = deadline;
2050
2051         return count;
2052 }
2053
2054 static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
2055 {
2056         ktime_t kt_deadline;
2057
2058         if (!md->seq_rq_merge_deadline_usecs)
2059                 return false;
2060
2061         kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
2062         kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
2063
2064         return !ktime_after(ktime_get(), kt_deadline);
2065 }
2066
2067 /*
2068  * q->request_fn for request-based dm.
2069  * Called with the queue lock held.
2070  */
2071 static void dm_request_fn(struct request_queue *q)
2072 {
2073         struct mapped_device *md = q->queuedata;
2074         struct dm_target *ti = md->immutable_target;
2075         struct request *rq;
2076         struct dm_rq_target_io *tio;
2077         sector_t pos = 0;
2078
2079         if (unlikely(!ti)) {
2080                 int srcu_idx;
2081                 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2082
2083                 ti = dm_table_find_target(map, pos);
2084                 dm_put_live_table(md, srcu_idx);
2085         }
2086
2087         /*
2088          * For suspend, check blk_queue_stopped() and increment
2089          * ->pending within a single queue_lock not to increment the
2090          * number of in-flight I/Os after the queue is stopped in
2091          * dm_suspend().
2092          */
2093         while (!blk_queue_stopped(q)) {
2094                 rq = blk_peek_request(q);
2095                 if (!rq)
2096                         return;
2097
2098                 /* always use block 0 to find the target for flushes for now */
2099                 pos = 0;
2100                 if (!(rq->cmd_flags & REQ_FLUSH))
2101                         pos = blk_rq_pos(rq);
2102
2103                 if ((dm_request_peeked_before_merge_deadline(md) &&
2104                      md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
2105                      md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
2106                     (ti->type->busy && ti->type->busy(ti))) {
2107                         blk_delay_queue(q, HZ / 100);
2108                         return;
2109                 }
2110
2111                 dm_start_request(md, rq);
2112
2113                 tio = tio_from_request(rq);
2114                 /* Establish tio->ti before queuing work (map_tio_request) */
2115                 tio->ti = ti;
2116                 queue_kthread_work(&md->kworker, &tio->work);
2117                 BUG_ON(!irqs_disabled());
2118         }
2119 }
2120
2121 static int dm_any_congested(void *congested_data, int bdi_bits)
2122 {
2123         int r = bdi_bits;
2124         struct mapped_device *md = congested_data;
2125         struct dm_table *map;
2126
2127         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2128                 if (dm_request_based(md)) {
2129                         /*
2130                          * With request-based DM we only need to check the
2131                          * top-level queue for congestion.
2132                          */
2133                         r = md->queue->backing_dev_info.wb.state & bdi_bits;
2134                 } else {
2135                         map = dm_get_live_table_fast(md);
2136                         if (map)
2137                                 r = dm_table_any_congested(map, bdi_bits);
2138                         dm_put_live_table_fast(md);
2139                 }
2140         }
2141
2142         return r;
2143 }
2144
2145 /*-----------------------------------------------------------------
2146  * An IDR is used to keep track of allocated minor numbers.
2147  *---------------------------------------------------------------*/
2148 static void free_minor(int minor)
2149 {
2150         spin_lock(&_minor_lock);
2151         idr_remove(&_minor_idr, minor);
2152         spin_unlock(&_minor_lock);
2153 }
2154
2155 /*
2156  * See if the device with a specific minor # is free.
2157  */
2158 static int specific_minor(int minor)
2159 {
2160         int r;
2161
2162         if (minor >= (1 << MINORBITS))
2163                 return -EINVAL;
2164
2165         idr_preload(GFP_KERNEL);
2166         spin_lock(&_minor_lock);
2167
2168         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2169
2170         spin_unlock(&_minor_lock);
2171         idr_preload_end();
2172         if (r < 0)
2173                 return r == -ENOSPC ? -EBUSY : r;
2174         return 0;
2175 }
2176
2177 static int next_free_minor(int *minor)
2178 {
2179         int r;
2180
2181         idr_preload(GFP_KERNEL);
2182         spin_lock(&_minor_lock);
2183
2184         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2185
2186         spin_unlock(&_minor_lock);
2187         idr_preload_end();
2188         if (r < 0)
2189                 return r;
2190         *minor = r;
2191         return 0;
2192 }
2193
2194 static const struct block_device_operations dm_blk_dops;
2195
2196 static void dm_wq_work(struct work_struct *work);
2197
2198 static void dm_init_md_queue(struct mapped_device *md)
2199 {
2200         /*
2201          * Request-based dm devices cannot be stacked on top of bio-based dm
2202          * devices.  The type of this dm device may not have been decided yet.
2203          * The type is decided at the first table loading time.
2204          * To prevent problematic device stacking, clear the queue flag
2205          * for request stacking support until then.
2206          *
2207          * This queue is new, so no concurrency on the queue_flags.
2208          */
2209         queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2210
2211         /*
2212          * Initialize data that will only be used by a non-blk-mq DM queue
2213          * - must do so here (in alloc_dev callchain) before queue is used
2214          */
2215         md->queue->queuedata = md;
2216         md->queue->backing_dev_info.congested_data = md;
2217 }
2218
2219 static void dm_init_old_md_queue(struct mapped_device *md)
2220 {
2221         md->use_blk_mq = false;
2222         dm_init_md_queue(md);
2223
2224         /*
2225          * Initialize aspects of queue that aren't relevant for blk-mq
2226          */
2227         md->queue->backing_dev_info.congested_fn = dm_any_congested;
2228         blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2229 }
2230
2231 static void cleanup_mapped_device(struct mapped_device *md)
2232 {
2233         if (md->wq)
2234                 destroy_workqueue(md->wq);
2235         if (md->kworker_task)
2236                 kthread_stop(md->kworker_task);
2237         mempool_destroy(md->io_pool);
2238         mempool_destroy(md->rq_pool);
2239         if (md->bs)
2240                 bioset_free(md->bs);
2241
2242         cleanup_srcu_struct(&md->io_barrier);
2243
2244         if (md->disk) {
2245                 spin_lock(&_minor_lock);
2246                 md->disk->private_data = NULL;
2247                 spin_unlock(&_minor_lock);
2248                 del_gendisk(md->disk);
2249                 put_disk(md->disk);
2250         }
2251
2252         if (md->queue)
2253                 blk_cleanup_queue(md->queue);
2254
2255         if (md->bdev) {
2256                 bdput(md->bdev);
2257                 md->bdev = NULL;
2258         }
2259 }
2260
2261 /*
2262  * Allocate and initialise a blank device with a given minor.
2263  */
2264 static struct mapped_device *alloc_dev(int minor)
2265 {
2266         int r;
2267         struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
2268         void *old_md;
2269
2270         if (!md) {
2271                 DMWARN("unable to allocate device, out of memory.");
2272                 return NULL;
2273         }
2274
2275         if (!try_module_get(THIS_MODULE))
2276                 goto bad_module_get;
2277
2278         /* get a minor number for the dev */
2279         if (minor == DM_ANY_MINOR)
2280                 r = next_free_minor(&minor);
2281         else
2282                 r = specific_minor(minor);
2283         if (r < 0)
2284                 goto bad_minor;
2285
2286         r = init_srcu_struct(&md->io_barrier);
2287         if (r < 0)
2288                 goto bad_io_barrier;
2289
2290         md->use_blk_mq = use_blk_mq;
2291         md->type = DM_TYPE_NONE;
2292         mutex_init(&md->suspend_lock);
2293         mutex_init(&md->type_lock);
2294         mutex_init(&md->table_devices_lock);
2295         spin_lock_init(&md->deferred_lock);
2296         atomic_set(&md->holders, 1);
2297         atomic_set(&md->open_count, 0);
2298         atomic_set(&md->event_nr, 0);
2299         atomic_set(&md->uevent_seq, 0);
2300         INIT_LIST_HEAD(&md->uevent_list);
2301         INIT_LIST_HEAD(&md->table_devices);
2302         spin_lock_init(&md->uevent_lock);
2303
2304         md->queue = blk_alloc_queue(GFP_KERNEL);
2305         if (!md->queue)
2306                 goto bad;
2307
2308         dm_init_md_queue(md);
2309
2310         md->disk = alloc_disk(1);
2311         if (!md->disk)
2312                 goto bad;
2313
2314         atomic_set(&md->pending[0], 0);
2315         atomic_set(&md->pending[1], 0);
2316         init_waitqueue_head(&md->wait);
2317         INIT_WORK(&md->work, dm_wq_work);
2318         init_waitqueue_head(&md->eventq);
2319         init_completion(&md->kobj_holder.completion);
2320         md->kworker_task = NULL;
2321
2322         md->disk->major = _major;
2323         md->disk->first_minor = minor;
2324         md->disk->fops = &dm_blk_dops;
2325         md->disk->queue = md->queue;
2326         md->disk->private_data = md;
2327         sprintf(md->disk->disk_name, "dm-%d", minor);
2328         add_disk(md->disk);
2329         format_dev_t(md->name, MKDEV(_major, minor));
2330
2331         md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2332         if (!md->wq)
2333                 goto bad;
2334
2335         md->bdev = bdget_disk(md->disk, 0);
2336         if (!md->bdev)
2337                 goto bad;
2338
2339         bio_init(&md->flush_bio);
2340         md->flush_bio.bi_bdev = md->bdev;
2341         md->flush_bio.bi_rw = WRITE_FLUSH;
2342
2343         dm_stats_init(&md->stats);
2344
2345         /* Populate the mapping, nobody knows we exist yet */
2346         spin_lock(&_minor_lock);
2347         old_md = idr_replace(&_minor_idr, md, minor);
2348         spin_unlock(&_minor_lock);
2349
2350         BUG_ON(old_md != MINOR_ALLOCED);
2351
2352         return md;
2353
2354 bad:
2355         cleanup_mapped_device(md);
2356 bad_io_barrier:
2357         free_minor(minor);
2358 bad_minor:
2359         module_put(THIS_MODULE);
2360 bad_module_get:
2361         kfree(md);
2362         return NULL;
2363 }
2364
2365 static void unlock_fs(struct mapped_device *md);
2366
2367 static void free_dev(struct mapped_device *md)
2368 {
2369         int minor = MINOR(disk_devt(md->disk));
2370
2371         unlock_fs(md);
2372
2373         cleanup_mapped_device(md);
2374         if (md->use_blk_mq)
2375                 blk_mq_free_tag_set(&md->tag_set);
2376
2377         free_table_devices(&md->table_devices);
2378         dm_stats_cleanup(&md->stats);
2379         free_minor(minor);
2380
2381         module_put(THIS_MODULE);
2382         kfree(md);
2383 }
2384
2385 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2386 {
2387         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2388
2389         if (md->bs) {
2390                 /* The md already has necessary mempools. */
2391                 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2392                         /*
2393                          * Reload bioset because front_pad may have changed
2394                          * because a different table was loaded.
2395                          */
2396                         bioset_free(md->bs);
2397                         md->bs = p->bs;
2398                         p->bs = NULL;
2399                 }
2400                 /*
2401                  * There's no need to reload with request-based dm
2402                  * because the size of front_pad doesn't change.
2403                  * Note for future: If you are to reload bioset,
2404                  * prep-ed requests in the queue may refer
2405                  * to bio from the old bioset, so you must walk
2406                  * through the queue to unprep.
2407                  */
2408                 goto out;
2409         }
2410
2411         BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
2412
2413         md->io_pool = p->io_pool;
2414         p->io_pool = NULL;
2415         md->rq_pool = p->rq_pool;
2416         p->rq_pool = NULL;
2417         md->bs = p->bs;
2418         p->bs = NULL;
2419
2420 out:
2421         /* mempool bind completed, no longer need any mempools in the table */
2422         dm_table_free_md_mempools(t);
2423 }
2424
2425 /*
2426  * Bind a table to the device.
2427  */
2428 static void event_callback(void *context)
2429 {
2430         unsigned long flags;
2431         LIST_HEAD(uevents);
2432         struct mapped_device *md = (struct mapped_device *) context;
2433
2434         spin_lock_irqsave(&md->uevent_lock, flags);
2435         list_splice_init(&md->uevent_list, &uevents);
2436         spin_unlock_irqrestore(&md->uevent_lock, flags);
2437
2438         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2439
2440         atomic_inc(&md->event_nr);
2441         wake_up(&md->eventq);
2442 }
2443
2444 /*
2445  * Protected by md->suspend_lock obtained by dm_swap_table().
2446  */
2447 static void __set_size(struct mapped_device *md, sector_t size)
2448 {
2449         set_capacity(md->disk, size);
2450
2451         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2452 }
2453
2454 /*
2455  * Returns old map, which caller must destroy.
2456  */
2457 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2458                                struct queue_limits *limits)
2459 {
2460         struct dm_table *old_map;
2461         struct request_queue *q = md->queue;
2462         sector_t size;
2463
2464         size = dm_table_get_size(t);
2465
2466         /*
2467          * Wipe any geometry if the size of the table changed.
2468          */
2469         if (size != dm_get_size(md))
2470                 memset(&md->geometry, 0, sizeof(md->geometry));
2471
2472         __set_size(md, size);
2473
2474         dm_table_event_callback(t, event_callback, md);
2475
2476         /*
2477          * The queue hasn't been stopped yet, if the old table type wasn't
2478          * for request-based during suspension.  So stop it to prevent
2479          * I/O mapping before resume.
2480          * This must be done before setting the queue restrictions,
2481          * because request-based dm may be run just after the setting.
2482          */
2483         if (dm_table_request_based(t)) {
2484                 stop_queue(q);
2485                 /*
2486                  * Leverage the fact that request-based DM targets are
2487                  * immutable singletons and establish md->immutable_target
2488                  * - used to optimize both dm_request_fn and dm_mq_queue_rq
2489                  */
2490                 md->immutable_target = dm_table_get_immutable_target(t);
2491         }
2492
2493         __bind_mempools(md, t);
2494
2495         old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2496         rcu_assign_pointer(md->map, t);
2497         md->immutable_target_type = dm_table_get_immutable_target_type(t);
2498
2499         dm_table_set_restrictions(t, q, limits);
2500         if (old_map)
2501                 dm_sync_table(md);
2502
2503         return old_map;
2504 }
2505
2506 /*
2507  * Returns unbound table for the caller to free.
2508  */
2509 static struct dm_table *__unbind(struct mapped_device *md)
2510 {
2511         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2512
2513         if (!map)
2514                 return NULL;
2515
2516         dm_table_event_callback(map, NULL, NULL);
2517         RCU_INIT_POINTER(md->map, NULL);
2518         dm_sync_table(md);
2519
2520         return map;
2521 }
2522
2523 /*
2524  * Constructor for a new device.
2525  */
2526 int dm_create(int minor, struct mapped_device **result)
2527 {
2528         struct mapped_device *md;
2529
2530         md = alloc_dev(minor);
2531         if (!md)
2532                 return -ENXIO;
2533
2534         dm_sysfs_init(md);
2535
2536         *result = md;
2537         return 0;
2538 }
2539
2540 /*
2541  * Functions to manage md->type.
2542  * All are required to hold md->type_lock.
2543  */
2544 void dm_lock_md_type(struct mapped_device *md)
2545 {
2546         mutex_lock(&md->type_lock);
2547 }
2548
2549 void dm_unlock_md_type(struct mapped_device *md)
2550 {
2551         mutex_unlock(&md->type_lock);
2552 }
2553
2554 void dm_set_md_type(struct mapped_device *md, unsigned type)
2555 {
2556         BUG_ON(!mutex_is_locked(&md->type_lock));
2557         md->type = type;
2558 }
2559
2560 unsigned dm_get_md_type(struct mapped_device *md)
2561 {
2562         return md->type;
2563 }
2564
2565 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2566 {
2567         return md->immutable_target_type;
2568 }
2569
2570 /*
2571  * The queue_limits are only valid as long as you have a reference
2572  * count on 'md'.
2573  */
2574 struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2575 {
2576         BUG_ON(!atomic_read(&md->holders));
2577         return &md->queue->limits;
2578 }
2579 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2580
2581 static void init_rq_based_worker_thread(struct mapped_device *md)
2582 {
2583         /* Initialize the request-based DM worker thread */
2584         init_kthread_worker(&md->kworker);
2585         md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2586                                        "kdmwork-%s", dm_device_name(md));
2587 }
2588
2589 /*
2590  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2591  */
2592 static int dm_init_request_based_queue(struct mapped_device *md)
2593 {
2594         struct request_queue *q = NULL;
2595
2596         /* Fully initialize the queue */
2597         q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2598         if (!q)
2599                 return -EINVAL;
2600
2601         /* disable dm_request_fn's merge heuristic by default */
2602         md->seq_rq_merge_deadline_usecs = 0;
2603
2604         md->queue = q;
2605         dm_init_old_md_queue(md);
2606         blk_queue_softirq_done(md->queue, dm_softirq_done);
2607         blk_queue_prep_rq(md->queue, dm_prep_fn);
2608
2609         init_rq_based_worker_thread(md);
2610
2611         elv_register_queue(md->queue);
2612
2613         return 0;
2614 }
2615
2616 static int dm_mq_init_request(void *data, struct request *rq,
2617                               unsigned int hctx_idx, unsigned int request_idx,
2618                               unsigned int numa_node)
2619 {
2620         struct mapped_device *md = data;
2621         struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2622
2623         /*
2624          * Must initialize md member of tio, otherwise it won't
2625          * be available in dm_mq_queue_rq.
2626          */
2627         tio->md = md;
2628
2629         return 0;
2630 }
2631
2632 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
2633                           const struct blk_mq_queue_data *bd)
2634 {
2635         struct request *rq = bd->rq;
2636         struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2637         struct mapped_device *md = tio->md;
2638         struct dm_target *ti = md->immutable_target;
2639
2640         if (unlikely(!ti)) {
2641                 int srcu_idx;
2642                 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2643
2644                 ti = dm_table_find_target(map, 0);
2645                 dm_put_live_table(md, srcu_idx);
2646         }
2647
2648         if (ti->type->busy && ti->type->busy(ti))
2649                 return BLK_MQ_RQ_QUEUE_BUSY;
2650
2651         dm_start_request(md, rq);
2652
2653         /* Init tio using md established in .init_request */
2654         init_tio(tio, rq, md);
2655
2656         /*
2657          * Establish tio->ti before queuing work (map_tio_request)
2658          * or making direct call to map_request().
2659          */
2660         tio->ti = ti;
2661
2662         /*
2663          * Both the table and md type cannot change after initial table load
2664          */
2665         if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
2666                 /* clone request is allocated at the end of the pdu */
2667                 tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
2668                 (void) clone_rq(rq, md, tio, GFP_ATOMIC);
2669                 queue_kthread_work(&md->kworker, &tio->work);
2670         } else {
2671                 /* Direct call is fine since .queue_rq allows allocations */
2672                 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
2673                         /* Undo dm_start_request() before requeuing */
2674                         rq_end_stats(md, rq);
2675                         rq_completed(md, rq_data_dir(rq), false);
2676                         return BLK_MQ_RQ_QUEUE_BUSY;
2677                 }
2678         }
2679
2680         return BLK_MQ_RQ_QUEUE_OK;
2681 }
2682
2683 static struct blk_mq_ops dm_mq_ops = {
2684         .queue_rq = dm_mq_queue_rq,
2685         .map_queue = blk_mq_map_queue,
2686         .complete = dm_softirq_done,
2687         .init_request = dm_mq_init_request,
2688 };
2689
2690 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
2691 {
2692         unsigned md_type = dm_get_md_type(md);
2693         struct request_queue *q;
2694         int err;
2695
2696         memset(&md->tag_set, 0, sizeof(md->tag_set));
2697         md->tag_set.ops = &dm_mq_ops;
2698         md->tag_set.queue_depth = BLKDEV_MAX_RQ;
2699         md->tag_set.numa_node = NUMA_NO_NODE;
2700         md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
2701         md->tag_set.nr_hw_queues = 1;
2702         if (md_type == DM_TYPE_REQUEST_BASED) {
2703                 /* make the memory for non-blk-mq clone part of the pdu */
2704                 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
2705         } else
2706                 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
2707         md->tag_set.driver_data = md;
2708
2709         err = blk_mq_alloc_tag_set(&md->tag_set);
2710         if (err)
2711                 return err;
2712
2713         q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
2714         if (IS_ERR(q)) {
2715                 err = PTR_ERR(q);
2716                 goto out_tag_set;
2717         }
2718         md->queue = q;
2719         dm_init_md_queue(md);
2720
2721         /* backfill 'mq' sysfs registration normally done in blk_register_queue */
2722         blk_mq_register_disk(md->disk);
2723
2724         if (md_type == DM_TYPE_REQUEST_BASED)
2725                 init_rq_based_worker_thread(md);
2726
2727         return 0;
2728
2729 out_tag_set:
2730         blk_mq_free_tag_set(&md->tag_set);
2731         return err;
2732 }
2733
2734 static unsigned filter_md_type(unsigned type, struct mapped_device *md)
2735 {
2736         if (type == DM_TYPE_BIO_BASED)
2737                 return type;
2738
2739         return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
2740 }
2741
2742 /*
2743  * Setup the DM device's queue based on md's type
2744  */
2745 int dm_setup_md_queue(struct mapped_device *md)
2746 {
2747         int r;
2748         unsigned md_type = filter_md_type(dm_get_md_type(md), md);
2749
2750         switch (md_type) {
2751         case DM_TYPE_REQUEST_BASED:
2752                 r = dm_init_request_based_queue(md);
2753                 if (r) {
2754                         DMWARN("Cannot initialize queue for request-based mapped device");
2755                         return r;
2756                 }
2757                 break;
2758         case DM_TYPE_MQ_REQUEST_BASED:
2759                 r = dm_init_request_based_blk_mq_queue(md);
2760                 if (r) {
2761                         DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
2762                         return r;
2763                 }
2764                 break;
2765         case DM_TYPE_BIO_BASED:
2766                 dm_init_old_md_queue(md);
2767                 blk_queue_make_request(md->queue, dm_make_request);
2768                 /*
2769                  * DM handles splitting bios as needed.  Free the bio_split bioset
2770                  * since it won't be used (saves 1 process per bio-based DM device).
2771                  */
2772                 bioset_free(md->queue->bio_split);
2773                 md->queue->bio_split = NULL;
2774                 break;
2775         }
2776
2777         return 0;
2778 }
2779
2780 struct mapped_device *dm_get_md(dev_t dev)
2781 {
2782         struct mapped_device *md;
2783         unsigned minor = MINOR(dev);
2784
2785         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2786                 return NULL;
2787
2788         spin_lock(&_minor_lock);
2789
2790         md = idr_find(&_minor_idr, minor);
2791         if (md) {
2792                 if ((md == MINOR_ALLOCED ||
2793                      (MINOR(disk_devt(dm_disk(md))) != minor) ||
2794                      dm_deleting_md(md) ||
2795                      test_bit(DMF_FREEING, &md->flags))) {
2796                         md = NULL;
2797                         goto out;
2798                 }
2799                 dm_get(md);
2800         }
2801
2802 out:
2803         spin_unlock(&_minor_lock);
2804
2805         return md;
2806 }
2807 EXPORT_SYMBOL_GPL(dm_get_md);
2808
2809 void *dm_get_mdptr(struct mapped_device *md)
2810 {
2811         return md->interface_ptr;
2812 }
2813
2814 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2815 {
2816         md->interface_ptr = ptr;
2817 }
2818
2819 void dm_get(struct mapped_device *md)
2820 {
2821         atomic_inc(&md->holders);
2822         BUG_ON(test_bit(DMF_FREEING, &md->flags));
2823 }
2824
2825 int dm_hold(struct mapped_device *md)
2826 {
2827         spin_lock(&_minor_lock);
2828         if (test_bit(DMF_FREEING, &md->flags)) {
2829                 spin_unlock(&_minor_lock);
2830                 return -EBUSY;
2831         }
2832         dm_get(md);
2833         spin_unlock(&_minor_lock);
2834         return 0;
2835 }
2836 EXPORT_SYMBOL_GPL(dm_hold);
2837
2838 const char *dm_device_name(struct mapped_device *md)
2839 {
2840         return md->name;
2841 }
2842 EXPORT_SYMBOL_GPL(dm_device_name);
2843
2844 static void __dm_destroy(struct mapped_device *md, bool wait)
2845 {
2846         struct dm_table *map;
2847         int srcu_idx;
2848
2849         might_sleep();
2850
2851         spin_lock(&_minor_lock);
2852         idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2853         set_bit(DMF_FREEING, &md->flags);
2854         spin_unlock(&_minor_lock);
2855
2856         if (dm_request_based(md) && md->kworker_task)
2857                 flush_kthread_worker(&md->kworker);
2858
2859         /*
2860          * Take suspend_lock so that presuspend and postsuspend methods
2861          * do not race with internal suspend.
2862          */
2863         mutex_lock(&md->suspend_lock);
2864         map = dm_get_live_table(md, &srcu_idx);
2865         if (!dm_suspended_md(md)) {
2866                 dm_table_presuspend_targets(map);
2867                 dm_table_postsuspend_targets(map);
2868         }
2869         /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2870         dm_put_live_table(md, srcu_idx);
2871         mutex_unlock(&md->suspend_lock);
2872
2873         /*
2874          * Rare, but there may be I/O requests still going to complete,
2875          * for example.  Wait for all references to disappear.
2876          * No one should increment the reference count of the mapped_device,
2877          * after the mapped_device state becomes DMF_FREEING.
2878          */
2879         if (wait)
2880                 while (atomic_read(&md->holders))
2881                         msleep(1);
2882         else if (atomic_read(&md->holders))
2883                 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2884                        dm_device_name(md), atomic_read(&md->holders));
2885
2886         dm_sysfs_exit(md);
2887         dm_table_destroy(__unbind(md));
2888         free_dev(md);
2889 }
2890
2891 void dm_destroy(struct mapped_device *md)
2892 {
2893         __dm_destroy(md, true);
2894 }
2895
2896 void dm_destroy_immediate(struct mapped_device *md)
2897 {
2898         __dm_destroy(md, false);
2899 }
2900
2901 void dm_put(struct mapped_device *md)
2902 {
2903         atomic_dec(&md->holders);
2904 }
2905 EXPORT_SYMBOL_GPL(dm_put);
2906
2907 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2908 {
2909         int r = 0;
2910         DECLARE_WAITQUEUE(wait, current);
2911
2912         add_wait_queue(&md->wait, &wait);
2913
2914         while (1) {
2915                 set_current_state(interruptible);
2916
2917                 if (!md_in_flight(md))
2918                         break;
2919
2920                 if (interruptible == TASK_INTERRUPTIBLE &&
2921                     signal_pending(current)) {
2922                         r = -EINTR;
2923                         break;
2924                 }
2925
2926                 io_schedule();
2927         }
2928         set_current_state(TASK_RUNNING);
2929
2930         remove_wait_queue(&md->wait, &wait);
2931
2932         return r;
2933 }
2934
2935 /*
2936  * Process the deferred bios
2937  */
2938 static void dm_wq_work(struct work_struct *work)
2939 {
2940         struct mapped_device *md = container_of(work, struct mapped_device,
2941                                                 work);
2942         struct bio *c;
2943         int srcu_idx;
2944         struct dm_table *map;
2945
2946         map = dm_get_live_table(md, &srcu_idx);
2947
2948         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2949                 spin_lock_irq(&md->deferred_lock);
2950                 c = bio_list_pop(&md->deferred);
2951                 spin_unlock_irq(&md->deferred_lock);
2952
2953                 if (!c)
2954                         break;
2955
2956                 if (dm_request_based(md))
2957                         generic_make_request(c);
2958                 else
2959                         __split_and_process_bio(md, map, c);
2960         }
2961
2962         dm_put_live_table(md, srcu_idx);
2963 }
2964
2965 static void dm_queue_flush(struct mapped_device *md)
2966 {
2967         clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2968         smp_mb__after_atomic();
2969         queue_work(md->wq, &md->work);
2970 }
2971
2972 /*
2973  * Swap in a new table, returning the old one for the caller to destroy.
2974  */
2975 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2976 {
2977         struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2978         struct queue_limits limits;
2979         int r;
2980
2981         mutex_lock(&md->suspend_lock);
2982
2983         /* device must be suspended */
2984         if (!dm_suspended_md(md))
2985                 goto out;
2986
2987         /*
2988          * If the new table has no data devices, retain the existing limits.
2989          * This helps multipath with queue_if_no_path if all paths disappear,
2990          * then new I/O is queued based on these limits, and then some paths
2991          * reappear.
2992          */
2993         if (dm_table_has_no_data_devices(table)) {
2994                 live_map = dm_get_live_table_fast(md);
2995                 if (live_map)
2996                         limits = md->queue->limits;
2997                 dm_put_live_table_fast(md);
2998         }
2999
3000         if (!live_map) {
3001                 r = dm_calculate_queue_limits(table, &limits);
3002                 if (r) {
3003                         map = ERR_PTR(r);
3004                         goto out;
3005                 }
3006         }
3007
3008         map = __bind(md, table, &limits);
3009
3010 out:
3011         mutex_unlock(&md->suspend_lock);
3012         return map;
3013 }
3014
3015 /*
3016  * Functions to lock and unlock any filesystem running on the
3017  * device.
3018  */
3019 static int lock_fs(struct mapped_device *md)
3020 {
3021         int r;
3022
3023         WARN_ON(md->frozen_sb);
3024
3025         md->frozen_sb = freeze_bdev(md->bdev);
3026         if (IS_ERR(md->frozen_sb)) {
3027                 r = PTR_ERR(md->frozen_sb);
3028                 md->frozen_sb = NULL;
3029                 return r;
3030         }
3031
3032         set_bit(DMF_FROZEN, &md->flags);
3033
3034         return 0;
3035 }
3036
3037 static void unlock_fs(struct mapped_device *md)
3038 {
3039         if (!test_bit(DMF_FROZEN, &md->flags))
3040                 return;
3041
3042         thaw_bdev(md->bdev, md->frozen_sb);
3043         md->frozen_sb = NULL;
3044         clear_bit(DMF_FROZEN, &md->flags);
3045 }
3046
3047 /*
3048  * If __dm_suspend returns 0, the device is completely quiescent
3049  * now. There is no request-processing activity. All new requests
3050  * are being added to md->deferred list.
3051  *
3052  * Caller must hold md->suspend_lock
3053  */
3054 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
3055                         unsigned suspend_flags, int interruptible)
3056 {
3057         bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
3058         bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
3059         int r;
3060
3061         /*
3062          * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
3063          * This flag is cleared before dm_suspend returns.
3064          */
3065         if (noflush)
3066                 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
3067
3068         /*
3069          * This gets reverted if there's an error later and the targets
3070          * provide the .presuspend_undo hook.
3071          */
3072         dm_table_presuspend_targets(map);
3073
3074         /*
3075          * Flush I/O to the device.
3076          * Any I/O submitted after lock_fs() may not be flushed.
3077          * noflush takes precedence over do_lockfs.
3078          * (lock_fs() flushes I/Os and waits for them to complete.)
3079          */
3080         if (!noflush && do_lockfs) {
3081                 r = lock_fs(md);
3082                 if (r) {
3083                         dm_table_presuspend_undo_targets(map);
3084                         return r;
3085                 }
3086         }
3087
3088         /*
3089          * Here we must make sure that no processes are submitting requests
3090          * to target drivers i.e. no one may be executing
3091          * __split_and_process_bio. This is called from dm_request and
3092          * dm_wq_work.
3093          *
3094          * To get all processes out of __split_and_process_bio in dm_request,
3095          * we take the write lock. To prevent any process from reentering
3096          * __split_and_process_bio from dm_request and quiesce the thread
3097          * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
3098          * flush_workqueue(md->wq).
3099          */
3100         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3101         if (map)
3102                 synchronize_srcu(&md->io_barrier);
3103
3104         /*
3105          * Stop md->queue before flushing md->wq in case request-based
3106          * dm defers requests to md->wq from md->queue.
3107          */
3108         if (dm_request_based(md)) {
3109                 stop_queue(md->queue);
3110                 if (md->kworker_task)
3111                         flush_kthread_worker(&md->kworker);
3112         }
3113
3114         flush_workqueue(md->wq);
3115
3116         /*
3117          * At this point no more requests are entering target request routines.
3118          * We call dm_wait_for_completion to wait for all existing requests
3119          * to finish.
3120          */
3121         r = dm_wait_for_completion(md, interruptible);
3122
3123         if (noflush)
3124                 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
3125         if (map)
3126                 synchronize_srcu(&md->io_barrier);
3127
3128         /* were we interrupted ? */
3129         if (r < 0) {
3130                 dm_queue_flush(md);
3131
3132                 if (dm_request_based(md))
3133                         start_queue(md->queue);
3134
3135                 unlock_fs(md);
3136                 dm_table_presuspend_undo_targets(map);
3137                 /* pushback list is already flushed, so skip flush */
3138         }
3139
3140         return r;
3141 }
3142
3143 /*
3144  * We need to be able to change a mapping table under a mounted
3145  * filesystem.  For example we might want to move some data in
3146  * the background.  Before the table can be swapped with
3147  * dm_bind_table, dm_suspend must be called to flush any in
3148  * flight bios and ensure that any further io gets deferred.
3149  */
3150 /*
3151  * Suspend mechanism in request-based dm.
3152  *
3153  * 1. Flush all I/Os by lock_fs() if needed.
3154  * 2. Stop dispatching any I/O by stopping the request_queue.
3155  * 3. Wait for all in-flight I/Os to be completed or requeued.
3156  *
3157  * To abort suspend, start the request_queue.
3158  */
3159 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
3160 {
3161         struct dm_table *map = NULL;
3162         int r = 0;
3163
3164 retry:
3165         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3166
3167         if (dm_suspended_md(md)) {
3168                 r = -EINVAL;
3169                 goto out_unlock;
3170         }
3171
3172         if (dm_suspended_internally_md(md)) {
3173                 /* already internally suspended, wait for internal resume */
3174                 mutex_unlock(&md->suspend_lock);
3175                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3176                 if (r)
3177                         return r;
3178                 goto retry;
3179         }
3180
3181         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3182
3183         r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
3184         if (r)
3185                 goto out_unlock;
3186
3187         set_bit(DMF_SUSPENDED, &md->flags);
3188
3189         dm_table_postsuspend_targets(map);
3190
3191 out_unlock:
3192         mutex_unlock(&md->suspend_lock);
3193         return r;
3194 }
3195
3196 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
3197 {
3198         if (map) {
3199                 int r = dm_table_resume_targets(map);
3200                 if (r)
3201                         return r;
3202         }
3203
3204         dm_queue_flush(md);
3205
3206         /*
3207          * Flushing deferred I/Os must be done after targets are resumed
3208          * so that mapping of targets can work correctly.
3209          * Request-based dm is queueing the deferred I/Os in its request_queue.
3210          */
3211         if (dm_request_based(md))
3212                 start_queue(md->queue);
3213
3214         unlock_fs(md);
3215
3216         return 0;
3217 }
3218
3219 int dm_resume(struct mapped_device *md)
3220 {
3221         int r = -EINVAL;
3222         struct dm_table *map = NULL;
3223
3224 retry:
3225         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3226
3227         if (!dm_suspended_md(md))
3228                 goto out;
3229
3230         if (dm_suspended_internally_md(md)) {
3231                 /* already internally suspended, wait for internal resume */
3232                 mutex_unlock(&md->suspend_lock);
3233                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3234                 if (r)
3235                         return r;
3236                 goto retry;
3237         }
3238
3239         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3240         if (!map || !dm_table_get_size(map))
3241                 goto out;
3242
3243         r = __dm_resume(md, map);
3244         if (r)
3245                 goto out;
3246
3247         clear_bit(DMF_SUSPENDED, &md->flags);
3248
3249         r = 0;
3250 out:
3251         mutex_unlock(&md->suspend_lock);
3252
3253         return r;
3254 }
3255
3256 /*
3257  * Internal suspend/resume works like userspace-driven suspend. It waits
3258  * until all bios finish and prevents issuing new bios to the target drivers.
3259  * It may be used only from the kernel.
3260  */
3261
3262 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
3263 {
3264         struct dm_table *map = NULL;
3265
3266         if (md->internal_suspend_count++)
3267                 return; /* nested internal suspend */
3268
3269         if (dm_suspended_md(md)) {
3270                 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3271                 return; /* nest suspend */
3272         }
3273
3274         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3275
3276         /*
3277          * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
3278          * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
3279          * would require changing .presuspend to return an error -- avoid this
3280          * until there is a need for more elaborate variants of internal suspend.
3281          */
3282         (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
3283
3284         set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3285
3286         dm_table_postsuspend_targets(map);
3287 }
3288
3289 static void __dm_internal_resume(struct mapped_device *md)
3290 {
3291         BUG_ON(!md->internal_suspend_count);
3292
3293         if (--md->internal_suspend_count)
3294                 return; /* resume from nested internal suspend */
3295
3296         if (dm_suspended_md(md))
3297                 goto done; /* resume from nested suspend */
3298
3299         /*
3300          * NOTE: existing callers don't need to call dm_table_resume_targets
3301          * (which may fail -- so best to avoid it for now by passing NULL map)
3302          */
3303         (void) __dm_resume(md, NULL);
3304
3305 done:
3306         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3307         smp_mb__after_atomic();
3308         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
3309 }
3310
3311 void dm_internal_suspend_noflush(struct mapped_device *md)
3312 {
3313         mutex_lock(&md->suspend_lock);
3314         __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
3315         mutex_unlock(&md->suspend_lock);
3316 }
3317 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
3318
3319 void dm_internal_resume(struct mapped_device *md)
3320 {
3321         mutex_lock(&md->suspend_lock);
3322         __dm_internal_resume(md);
3323         mutex_unlock(&md->suspend_lock);
3324 }
3325 EXPORT_SYMBOL_GPL(dm_internal_resume);
3326
3327 /*
3328  * Fast variants of internal suspend/resume hold md->suspend_lock,
3329  * which prevents interaction with userspace-driven suspend.
3330  */
3331
3332 void dm_internal_suspend_fast(struct mapped_device *md)
3333 {
3334         mutex_lock(&md->suspend_lock);
3335         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3336                 return;
3337
3338         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3339         synchronize_srcu(&md->io_barrier);
3340         flush_workqueue(md->wq);
3341         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
3342 }
3343 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
3344
3345 void dm_internal_resume_fast(struct mapped_device *md)
3346 {
3347         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3348                 goto done;
3349
3350         dm_queue_flush(md);
3351
3352 done:
3353         mutex_unlock(&md->suspend_lock);
3354 }
3355 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
3356
3357 /*-----------------------------------------------------------------
3358  * Event notification.
3359  *---------------------------------------------------------------*/
3360 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
3361                        unsigned cookie)
3362 {
3363         char udev_cookie[DM_COOKIE_LENGTH];
3364         char *envp[] = { udev_cookie, NULL };
3365
3366         if (!cookie)
3367                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
3368         else {
3369                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3370                          DM_COOKIE_ENV_VAR_NAME, cookie);
3371                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
3372                                           action, envp);
3373         }
3374 }
3375
3376 uint32_t dm_next_uevent_seq(struct mapped_device *md)
3377 {
3378         return atomic_add_return(1, &md->uevent_seq);
3379 }
3380
3381 uint32_t dm_get_event_nr(struct mapped_device *md)
3382 {
3383         return atomic_read(&md->event_nr);
3384 }
3385
3386 int dm_wait_event(struct mapped_device *md, int event_nr)
3387 {
3388         return wait_event_interruptible(md->eventq,
3389                         (event_nr != atomic_read(&md->event_nr)));
3390 }
3391
3392 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3393 {
3394         unsigned long flags;
3395
3396         spin_lock_irqsave(&md->uevent_lock, flags);
3397         list_add(elist, &md->uevent_list);
3398         spin_unlock_irqrestore(&md->uevent_lock, flags);
3399 }
3400
3401 /*
3402  * The gendisk is only valid as long as you have a reference
3403  * count on 'md'.
3404  */
3405 struct gendisk *dm_disk(struct mapped_device *md)
3406 {
3407         return md->disk;
3408 }
3409 EXPORT_SYMBOL_GPL(dm_disk);
3410
3411 struct kobject *dm_kobject(struct mapped_device *md)
3412 {
3413         return &md->kobj_holder.kobj;
3414 }
3415
3416 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3417 {
3418         struct mapped_device *md;
3419
3420         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3421
3422         if (test_bit(DMF_FREEING, &md->flags) ||
3423             dm_deleting_md(md))
3424                 return NULL;
3425
3426         dm_get(md);
3427         return md;
3428 }
3429
3430 int dm_suspended_md(struct mapped_device *md)
3431 {
3432         return test_bit(DMF_SUSPENDED, &md->flags);
3433 }
3434
3435 int dm_suspended_internally_md(struct mapped_device *md)
3436 {
3437         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3438 }
3439
3440 int dm_test_deferred_remove_flag(struct mapped_device *md)
3441 {
3442         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3443 }
3444
3445 int dm_suspended(struct dm_target *ti)
3446 {
3447         return dm_suspended_md(dm_table_get_md(ti->table));
3448 }
3449 EXPORT_SYMBOL_GPL(dm_suspended);
3450
3451 int dm_noflush_suspending(struct dm_target *ti)
3452 {
3453         return __noflush_suspending(dm_table_get_md(ti->table));
3454 }
3455 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3456
3457 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
3458                                             unsigned integrity, unsigned per_bio_data_size)
3459 {
3460         struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
3461         struct kmem_cache *cachep = NULL;
3462         unsigned int pool_size = 0;
3463         unsigned int front_pad;
3464
3465         if (!pools)
3466                 return NULL;
3467
3468         type = filter_md_type(type, md);
3469
3470         switch (type) {
3471         case DM_TYPE_BIO_BASED:
3472                 cachep = _io_cache;
3473                 pool_size = dm_get_reserved_bio_based_ios();
3474                 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3475                 break;
3476         case DM_TYPE_REQUEST_BASED:
3477                 cachep = _rq_tio_cache;
3478                 pool_size = dm_get_reserved_rq_based_ios();
3479                 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
3480                 if (!pools->rq_pool)
3481                         goto out;
3482                 /* fall through to setup remaining rq-based pools */
3483         case DM_TYPE_MQ_REQUEST_BASED:
3484                 if (!pool_size)
3485                         pool_size = dm_get_reserved_rq_based_ios();
3486                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3487                 /* per_bio_data_size is not used. See __bind_mempools(). */
3488                 WARN_ON(per_bio_data_size != 0);
3489                 break;
3490         default:
3491                 BUG();
3492         }
3493
3494         if (cachep) {
3495                 pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
3496                 if (!pools->io_pool)
3497                         goto out;
3498         }
3499
3500         pools->bs = bioset_create_nobvec(pool_size, front_pad);
3501         if (!pools->bs)
3502                 goto out;
3503
3504         if (integrity && bioset_integrity_create(pools->bs, pool_size))
3505                 goto out;
3506
3507         return pools;
3508
3509 out:
3510         dm_free_md_mempools(pools);
3511
3512         return NULL;
3513 }
3514
3515 void dm_free_md_mempools(struct dm_md_mempools *pools)
3516 {
3517         if (!pools)
3518                 return;
3519
3520         mempool_destroy(pools->io_pool);
3521         mempool_destroy(pools->rq_pool);
3522
3523         if (pools->bs)
3524                 bioset_free(pools->bs);
3525
3526         kfree(pools);
3527 }
3528
3529 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3530                           u32 flags)
3531 {
3532         struct mapped_device *md = bdev->bd_disk->private_data;
3533         const struct pr_ops *ops;
3534         fmode_t mode;
3535         int r;
3536
3537         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
3538         if (r < 0)
3539                 return r;
3540
3541         ops = bdev->bd_disk->fops->pr_ops;
3542         if (ops && ops->pr_register)
3543                 r = ops->pr_register(bdev, old_key, new_key, flags);
3544         else
3545                 r = -EOPNOTSUPP;
3546
3547         bdput(bdev);
3548         return r;
3549 }
3550
3551 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3552                          u32 flags)
3553 {
3554         struct mapped_device *md = bdev->bd_disk->private_data;
3555         const struct pr_ops *ops;
3556         fmode_t mode;
3557         int r;
3558
3559         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
3560         if (r < 0)
3561                 return r;
3562
3563         ops = bdev->bd_disk->fops->pr_ops;
3564         if (ops && ops->pr_reserve)
3565                 r = ops->pr_reserve(bdev, key, type, flags);
3566         else
3567                 r = -EOPNOTSUPP;
3568
3569         bdput(bdev);
3570         return r;
3571 }
3572
3573 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3574 {
3575         struct mapped_device *md = bdev->bd_disk->private_data;
3576         const struct pr_ops *ops;
3577         fmode_t mode;
3578         int r;
3579
3580         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
3581         if (r < 0)
3582                 return r;
3583
3584         ops = bdev->bd_disk->fops->pr_ops;
3585         if (ops && ops->pr_release)
3586                 r = ops->pr_release(bdev, key, type);
3587         else
3588                 r = -EOPNOTSUPP;
3589
3590         bdput(bdev);
3591         return r;
3592 }
3593
3594 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3595                          enum pr_type type, bool abort)
3596 {
3597         struct mapped_device *md = bdev->bd_disk->private_data;
3598         const struct pr_ops *ops;
3599         fmode_t mode;
3600         int r;
3601
3602         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
3603         if (r < 0)
3604                 return r;
3605
3606         ops = bdev->bd_disk->fops->pr_ops;
3607         if (ops && ops->pr_preempt)
3608                 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3609         else
3610                 r = -EOPNOTSUPP;
3611
3612         bdput(bdev);
3613         return r;
3614 }
3615
3616 static int dm_pr_clear(struct block_device *bdev, u64 key)
3617 {
3618         struct mapped_device *md = bdev->bd_disk->private_data;
3619         const struct pr_ops *ops;
3620         fmode_t mode;
3621         int r;
3622
3623         r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
3624         if (r < 0)
3625                 return r;
3626
3627         ops = bdev->bd_disk->fops->pr_ops;
3628         if (ops && ops->pr_clear)
3629                 r = ops->pr_clear(bdev, key);
3630         else
3631                 r = -EOPNOTSUPP;
3632
3633         bdput(bdev);
3634         return r;
3635 }
3636
3637 static const struct pr_ops dm_pr_ops = {
3638         .pr_register    = dm_pr_register,
3639         .pr_reserve     = dm_pr_reserve,
3640         .pr_release     = dm_pr_release,
3641         .pr_preempt     = dm_pr_preempt,
3642         .pr_clear       = dm_pr_clear,
3643 };
3644
3645 static const struct block_device_operations dm_blk_dops = {
3646         .open = dm_blk_open,
3647         .release = dm_blk_close,
3648         .ioctl = dm_blk_ioctl,
3649         .getgeo = dm_blk_getgeo,
3650         .pr_ops = &dm_pr_ops,
3651         .owner = THIS_MODULE
3652 };
3653
3654 /*
3655  * module hooks
3656  */
3657 module_init(dm_init);
3658 module_exit(dm_exit);
3659
3660 module_param(major, uint, 0);
3661 MODULE_PARM_DESC(major, "The major number of the device mapper");
3662
3663 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3664 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3665
3666 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
3667 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
3668
3669 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
3670 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
3671
3672 MODULE_DESCRIPTION(DM_NAME " driver");
3673 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3674 MODULE_LICENSE("GPL");