dm: allocate blk_mq_tag_set rather than embed in mapped_device
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5df4048..e903595 100644
@@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info {
        struct bio clone;
 };
 
-union map_info *dm_get_rq_mapinfo(struct request *rq)
-{
-       if (rq && rq->end_io_data)
-               return &((struct dm_rq_target_io *)rq->end_io_data)->info;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
-
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -162,6 +154,7 @@ struct mapped_device {
        /* Protect queue and type against concurrent access. */
        struct mutex type_lock;
 
+       struct dm_target *immutable_target;
        struct target_type *immutable_target_type;
 
        struct gendisk *disk;
@@ -230,7 +223,7 @@ struct mapped_device {
        ktime_t last_rq_start_time;
 
        /* for blk-mq request-based DM support */
-       struct blk_mq_tag_set tag_set;
+       struct blk_mq_tag_set *tag_set;
        bool use_blk_mq;
 };
 
@@ -240,6 +233,12 @@ static bool use_blk_mq = true;
 static bool use_blk_mq = false;
 #endif
 
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+
 bool dm_use_blk_mq(struct mapped_device *md)
 {
        return md->use_blk_mq;
@@ -310,6 +309,17 @@ unsigned dm_get_reserved_rq_based_ios(void)
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+       return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+       return __dm_get_module_param(&dm_mq_queue_depth,
+                                    DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
 static int __init local_init(void)
 {
        int r = -ENOMEM;
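
Both getters funnel through __dm_get_module_param(), the helper that already exists further up in dm.c to sanitize writable module parameters on every read. A paraphrase of its clamping logic (not the verbatim body; the real helper also writes the clamped value back with cmpxchg() so sysfs reflects the effective setting):

static unsigned clamp_module_param(unsigned param, unsigned def, unsigned max)
{
	if (!param)		/* 0 means "use the built-in default" */
		return def;
	if (param > max)	/* silently cap runaway values */
		return max;
	return param;
}

So dm_mq_queue_depth=0 resolves to DM_MQ_QUEUE_DEPTH (2048) and anything above BLK_MQ_MAX_DEPTH is capped, while dm_mq_nr_hw_queues is kept within [1, 32].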
@@ -556,16 +566,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
        return dm_get_geometry(md, geo);
 }
 
-static int dm_get_live_table_for_ioctl(struct mapped_device *md,
-               struct dm_target **tgt, struct block_device **bdev,
-               fmode_t *mode, int *srcu_idx)
+static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
+                                 struct block_device **bdev,
+                                 fmode_t *mode)
 {
+       struct dm_target *tgt;
        struct dm_table *map;
-       int r;
+       int srcu_idx, r;
 
 retry:
        r = -ENOTTY;
-       map = dm_get_live_table(md, srcu_idx);
+       map = dm_get_live_table(md, &srcu_idx);
        if (!map || !dm_table_get_size(map))
                goto out;
 
@@ -573,9 +584,8 @@ retry:
        if (dm_table_get_num_targets(map) != 1)
                goto out;
 
-       *tgt = dm_table_get_target(map, 0);
-
-       if (!(*tgt)->type->prepare_ioctl)
+       tgt = dm_table_get_target(map, 0);
+       if (!tgt->type->prepare_ioctl)
                goto out;
 
        if (dm_suspended_md(md)) {
@@ -583,14 +593,16 @@ retry:
                goto out;
        }
 
-       r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+       r = tgt->type->prepare_ioctl(tgt, bdev, mode);
        if (r < 0)
                goto out;
 
+       bdgrab(*bdev);
+       dm_put_live_table(md, srcu_idx);
        return r;
 
 out:
-       dm_put_live_table(md, *srcu_idx);
+       dm_put_live_table(md, srcu_idx);
        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
                msleep(10);
                goto retry;
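
The lifetime model changes here: the helper pins the target's block device with bdgrab() and drops the SRCU read lock before returning, so callers no longer hold the live table across the operation. Each caller must instead balance the reference with bdput(). A minimal caller sketch (dm_do_bdev_work() is a hypothetical name; the real callers are dm_blk_ioctl() and the dm_pr_* hooks below):

static int dm_do_bdev_work(struct mapped_device *md, struct block_device *bdev)
{
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);	/* takes a bdev ref */
	if (r < 0)
		return r;

	/* ... use bdev; no live-table SRCU lock is held here ... */

	bdput(bdev);	/* balances the bdgrab() inside the helper */
	return r;
}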
@@ -602,11 +614,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
-       struct dm_target *tgt;
-       struct block_device *tgt_bdev = NULL;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -621,9 +631,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
                        goto out;
        }
 
-       r =  __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
+       r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 out:
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
@@ -1109,12 +1119,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
         * back into ->request_fn() could deadlock attempting to grab the
         * queue lock again.
         */
-       if (run_queue) {
-               if (md->queue->mq_ops)
-                       blk_mq_run_hw_queues(md->queue, true);
-               else
-                       blk_run_queue_async(md->queue);
-       }
+       if (!md->queue->mq_ops && run_queue)
+               blk_run_queue_async(md->queue);
 
        /*
         * dm_put() must be at the end of this function. See the comment above
@@ -1191,6 +1197,8 @@ static void dm_unprep_request(struct request *rq)
 
        if (clone)
                free_rq_clone(clone);
+       else if (!tio->md->queue->mq_ops)
+               free_rq_tio(tio);
 }
 
 /*
@@ -1334,7 +1342,10 @@ static void dm_complete_request(struct request *rq, int error)
        struct dm_rq_target_io *tio = tio_from_request(rq);
 
        tio->error = error;
-       blk_complete_request(rq);
+       if (!rq->q->mq_ops)
+               blk_complete_request(rq);
+       else
+               blk_mq_complete_request(rq, error);
 }
 
 /*
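dm_complete_request() only stashes the error and kicks the completion machinery; the actual unwinding happens later in dm_softirq_done(), in softirq context for the legacy path and from the blk-mq completion path otherwise. In paraphrased form (per-md accounting elided; not the verbatim body):

static void dm_softirq_done_sketch(struct request *rq)
{
	struct dm_rq_target_io *tio = tio_from_request(rq);

	if (!tio->clone) {
		/* No clone was made: end the original request directly. */
		if (!rq->q->mq_ops)
			blk_end_request_all(rq, tio->error);
		else
			blk_mq_end_request(rq, tio->error);
		return;
	}
	/* Otherwise finish through the clone's bookkeeping. */
	dm_done(tio->clone, tio->error, true);
}
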
@@ -2077,12 +2088,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
 static void dm_request_fn(struct request_queue *q)
 {
        struct mapped_device *md = q->queuedata;
-       int srcu_idx;
-       struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-       struct dm_target *ti;
+       struct dm_target *ti = md->immutable_target;
        struct request *rq;
        struct dm_rq_target_io *tio;
-       sector_t pos;
+       sector_t pos = 0;
+
+       if (unlikely(!ti)) {
+               int srcu_idx;
+               struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+               ti = dm_table_find_target(map, pos);
+               dm_put_live_table(md, srcu_idx);
+       }
 
        /*
         * For suspend, check blk_queue_stopped() and increment
@@ -2093,33 +2110,21 @@ static void dm_request_fn(struct request_queue *q)
        while (!blk_queue_stopped(q)) {
                rq = blk_peek_request(q);
                if (!rq)
-                       goto out;
+                       return;
 
                /* always use block 0 to find the target for flushes for now */
                pos = 0;
                if (!(rq->cmd_flags & REQ_FLUSH))
                        pos = blk_rq_pos(rq);
 
-               ti = dm_table_find_target(map, pos);
-               if (!dm_target_is_valid(ti)) {
-                       /*
-                        * Must perform setup, that rq_completed() requires,
-                        * before calling dm_kill_unmapped_request
-                        */
-                       DMERR_LIMIT("request attempted access beyond the end of device");
-                       dm_start_request(md, rq);
-                       dm_kill_unmapped_request(rq, -EIO);
-                       continue;
+               if ((dm_request_peeked_before_merge_deadline(md) &&
+                    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+                    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+                   (ti->type->busy && ti->type->busy(ti))) {
+                       blk_delay_queue(q, HZ / 100);
+                       return;
                }
 
-               if (dm_request_peeked_before_merge_deadline(md) &&
-                   md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
-                   md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
-                       goto delay_and_out;
-
-               if (ti->type->busy && ti->type->busy(ti))
-                       goto delay_and_out;
-
                dm_start_request(md, rq);
 
                tio = tio_from_request(rq);
@@ -2128,13 +2133,6 @@ static void dm_request_fn(struct request_queue *q)
                queue_kthread_work(&md->kworker, &tio->work);
                BUG_ON(!irqs_disabled());
        }
-
-       goto out;
-
-delay_and_out:
-       blk_delay_queue(q, HZ / 100);
-out:
-       dm_put_live_table(md, srcu_idx);
 }
 
 static int dm_any_congested(void *congested_data, int bdi_bits)
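
The rewrite hinges on request-based tables having exactly one target, so the target can be resolved once, ahead of the dispatch loop, rather than per request; the fallback lookup probes sector 0, which is as good as any. The resulting control flow, condensed:

/*
 * dm_request_fn(), new shape:
 *   1. ti = md->immutable_target (cached by __bind(), see below), else a
 *      one-off dm_table_find_target(map, 0) under SRCU;
 *   2. peek a request; return if the queue is stopped or empty;
 *   3. back off with blk_delay_queue(q, HZ / 100) if the merge-deadline
 *      heuristic wants to wait or the target reports ->busy();
 *   4. otherwise start the request and punt the map work to md->kworker.
 */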
@@ -2144,19 +2142,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
        struct dm_table *map;
 
        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-               map = dm_get_live_table_fast(md);
-               if (map) {
+               if (dm_request_based(md)) {
                        /*
-                        * Request-based dm cares about only own queue for
-                        * the query about congestion status of request_queue
+                        * With request-based DM we only need to check the
+                        * top-level queue for congestion.
                         */
-                       if (dm_request_based(md))
-                               r = md->queue->backing_dev_info.wb.state &
-                                   bdi_bits;
-                       else
+                       r = md->queue->backing_dev_info.wb.state & bdi_bits;
+               } else {
+                       map = dm_get_live_table_fast(md);
+                       if (map)
                                r = dm_table_any_congested(map, bdi_bits);
+                       dm_put_live_table_fast(md);
                }
-               dm_put_live_table_fast(md);
        }
 
        return r;
@@ -2391,8 +2388,10 @@ static void free_dev(struct mapped_device *md)
        unlock_fs(md);
 
        cleanup_mapped_device(md);
-       if (md->use_blk_mq)
-               blk_mq_free_tag_set(&md->tag_set);
+       if (md->tag_set) {
+               blk_mq_free_tag_set(md->tag_set);
+               kfree(md->tag_set);
+       }
 
        free_table_devices(&md->table_devices);
        dm_stats_cleanup(&md->stats);
@@ -2500,8 +2499,15 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
         * This must be done before setting the queue restrictions,
         * because request-based dm may be run just after the setting.
         */
-       if (dm_table_request_based(t))
+       if (dm_table_request_based(t)) {
                stop_queue(q);
+               /*
+                * Leverage the fact that request-based DM targets are
+                * immutable singletons and establish md->immutable_target
+                * - used to optimize both dm_request_fn and dm_mq_queue_rq
+                */
+               md->immutable_target = dm_table_get_immutable_target(t);
+       }
 
        __bind_mempools(md, t);
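
md->immutable_target can only be non-NULL when the table holds a single target whose type is marked immutable, which is exactly what request-based targets such as multipath are. A sketch of what dm_table_get_immutable_target() checks (paraphrased from dm-table.c):

struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
{
	/* An immutable target is implicitly a singleton. */
	if (t->num_targets > 1 ||
	    !dm_target_is_immutable(t->targets[0].type))
		return NULL;

	return t->targets;
}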
 
@@ -2572,7 +2578,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type)
 
 unsigned dm_get_md_type(struct mapped_device *md)
 {
-       BUG_ON(!mutex_is_locked(&md->type_lock));
        return md->type;
 }
 
@@ -2649,28 +2654,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct request *rq = bd->rq;
        struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
        struct mapped_device *md = tio->md;
-       int srcu_idx;
-       struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-       struct dm_target *ti;
-       sector_t pos;
+       struct dm_target *ti = md->immutable_target;
 
-       /* always use block 0 to find the target for flushes for now */
-       pos = 0;
-       if (!(rq->cmd_flags & REQ_FLUSH))
-               pos = blk_rq_pos(rq);
+       if (unlikely(!ti)) {
+               int srcu_idx;
+               struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 
-       ti = dm_table_find_target(map, pos);
-       if (!dm_target_is_valid(ti)) {
+               ti = dm_table_find_target(map, 0);
                dm_put_live_table(md, srcu_idx);
-               DMERR_LIMIT("request attempted access beyond the end of device");
-               /*
-                * Must perform setup, that rq_completed() requires,
-                * before returning BLK_MQ_RQ_QUEUE_ERROR
-                */
-               dm_start_request(md, rq);
-               return BLK_MQ_RQ_QUEUE_ERROR;
        }
-       dm_put_live_table(md, srcu_idx);
 
        if (ti->type->busy && ti->type->busy(ti))
                return BLK_MQ_RQ_QUEUE_BUSY;
@@ -2686,8 +2678,10 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
         */
        tio->ti = ti;
 
-       /* Clone the request if underlying devices aren't blk-mq */
-       if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+       /*
+        * Neither the table nor the md type can change after the initial table load
+        */
+       if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
                /* clone request is allocated at the end of the pdu */
                tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
                (void) clone_rq(rq, md, tio, GFP_ATOMIC);
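
The pointer arithmetic above depends on the pdu layout set up in dm_init_request_based_blk_mq_queue() below: for DM_TYPE_REQUEST_BASED (non-blk-mq paths underneath), cmd_size covers the target io and a legacy clone request back to back:

/*
 * Per-request pdu when cmd_size = sizeof(struct dm_rq_target_io) +
 *                                 sizeof(struct request):
 *
 *   blk_mq_rq_to_pdu(rq) ---> +-------------------------+
 *                             | struct dm_rq_target_io  |
 *   tio->clone ------------>  +-------------------------+
 *                             | struct request (clone)  |
 *                             +-------------------------+
 */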
@@ -2718,24 +2712,28 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
        struct request_queue *q;
        int err;
 
-       memset(&md->tag_set, 0, sizeof(md->tag_set));
-       md->tag_set.ops = &dm_mq_ops;
-       md->tag_set.queue_depth = BLKDEV_MAX_RQ;
-       md->tag_set.numa_node = NUMA_NO_NODE;
-       md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-       md->tag_set.nr_hw_queues = 1;
+       md->tag_set = kzalloc(sizeof(struct blk_mq_tag_set), GFP_KERNEL);
+       if (!md->tag_set)
+               return -ENOMEM;
+
+       md->tag_set->ops = &dm_mq_ops;
+       md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+       md->tag_set->numa_node = NUMA_NO_NODE;
+       md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+       md->tag_set->driver_data = md;
+
+       md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
        if (md_type == DM_TYPE_REQUEST_BASED) {
-               /* make the memory for non-blk-mq clone part of the pdu */
-               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
-       } else
-               md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
-       md->tag_set.driver_data = md;
+               /* put the memory for non-blk-mq clone at the end of the pdu */
+               md->tag_set->cmd_size += sizeof(struct request);
+       }
 
-       err = blk_mq_alloc_tag_set(&md->tag_set);
+       err = blk_mq_alloc_tag_set(md->tag_set);
        if (err)
-               return err;
+               goto out_kfree_tag_set;
 
-       q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+       q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
        if (IS_ERR(q)) {
                err = PTR_ERR(q);
                goto out_tag_set;
@@ -2752,7 +2750,10 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
        return 0;
 
 out_tag_set:
-       blk_mq_free_tag_set(&md->tag_set);
+       blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+       kfree(md->tag_set);
+
        return err;
 }
 
@@ -3552,15 +3553,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 }
 
 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
-               u32 flags)
+                         u32 flags)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
-       struct dm_target *tgt;
        fmode_t mode;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -3570,20 +3570,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
        else
                r = -EOPNOTSUPP;
 
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
-               u32 flags)
+                        u32 flags)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
-       struct dm_target *tgt;
        fmode_t mode;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -3593,7 +3592,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
        else
                r = -EOPNOTSUPP;
 
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
@@ -3601,11 +3600,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
-       struct dm_target *tgt;
        fmode_t mode;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -3615,20 +3613,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
        else
                r = -EOPNOTSUPP;
 
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
-               enum pr_type type, bool abort)
+                        enum pr_type type, bool abort)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
-       struct dm_target *tgt;
        fmode_t mode;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -3638,7 +3635,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
        else
                r = -EOPNOTSUPP;
 
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
@@ -3646,11 +3643,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
-       struct dm_target *tgt;
        fmode_t mode;
-       int srcu_idx, r;
+       int r;
 
-       r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+       r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;
 
@@ -3660,7 +3656,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
        else
                r = -EOPNOTSUPP;
 
-       dm_put_live_table(md, srcu_idx);
+       bdput(bdev);
        return r;
 }
 
@@ -3699,6 +3695,12 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"
 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
 
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
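
Assuming the standard module-parameter sysfs layout (dm.c is built into dm_mod), the new knobs are tunable at runtime; note that values are sanitized when a queue is created, not when they are written:

/*
 * /sys/module/dm_mod/parameters/dm_mq_nr_hw_queues
 *     effective range [1, 32]; 0 falls back to 1
 * /sys/module/dm_mod/parameters/dm_mq_queue_depth
 *     0 falls back to DM_MQ_QUEUE_DEPTH (2048); capped at BLK_MQ_MAX_DEPTH
 * Changes apply to dm-mq queues created after the write.
 */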