dm cache: fix writes to cache device in writethrough mode
[cascardo/linux.git] / drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19
20 #define DM_MSG_PREFIX "cache"
21
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23         "A percentage of time allocated for copying to and/or from cache");
24
25 /*----------------------------------------------------------------*/
26
27 /*
28  * Glossary:
29  *
30  * oblock: index of an origin block
31  * cblock: index of a cache block
32  * promotion: movement of a block from origin to cache
33  * demotion: movement of a block from cache to origin
34  * migration: movement of a block between the origin and cache device,
35  *            either direction
36  */
37
38 /*----------------------------------------------------------------*/
39
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
41 {
42         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43 }
44
45 static unsigned long *alloc_bitset(unsigned nr_entries)
46 {
47         size_t s = bitset_size_in_bytes(nr_entries);
48         return vzalloc(s);
49 }
50
51 static void clear_bitset(void *bitset, unsigned nr_entries)
52 {
53         size_t s = bitset_size_in_bytes(nr_entries);
54         memset(bitset, 0, s);
55 }
56
57 static void free_bitset(unsigned long *bits)
58 {
59         vfree(bits);
60 }
61
62 /*----------------------------------------------------------------*/
63
64 #define PRISON_CELLS 1024
65 #define MIGRATION_POOL_SIZE 128
66 #define COMMIT_PERIOD HZ
67 #define MIGRATION_COUNT_WINDOW 10
68
69 /*
70  * The block size of the device holding cache data must be >= 32KB
71  */
72 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
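/* With 512-byte sectors this evaluates to 64 sectors. */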
73
74 /*
75  * FIXME: the cache is read/write for the time being.
76  */
77 enum cache_mode {
78         CM_WRITE,               /* metadata may be changed */
79         CM_READ_ONLY,           /* metadata may not be changed */
80 };
81
82 struct cache_features {
83         enum cache_mode mode;
84         bool write_through:1;
85 };
86
87 struct cache_stats {
88         atomic_t read_hit;
89         atomic_t read_miss;
90         atomic_t write_hit;
91         atomic_t write_miss;
92         atomic_t demotion;
93         atomic_t promotion;
94         atomic_t copies_avoided;
95         atomic_t cache_cell_clash;
96         atomic_t commit_count;
97         atomic_t discard_count;
98 };
99
100 struct cache {
101         struct dm_target *ti;
102         struct dm_target_callbacks callbacks;
103
104         /*
105          * Metadata is written to this device.
106          */
107         struct dm_dev *metadata_dev;
108
109         /*
110          * The slower of the two data devices.  Typically a spindle.
111          */
112         struct dm_dev *origin_dev;
113
114         /*
115          * The faster of the two data devices.  Typically an SSD.
116          */
117         struct dm_dev *cache_dev;
118
119         /*
120          * Cache features such as write-through.
121          */
122         struct cache_features features;
123
124         /*
125          * Size of the origin device in _complete_ blocks and native sectors.
126          */
127         dm_oblock_t origin_blocks;
128         sector_t origin_sectors;
129
130         /*
131          * Size of the cache device in blocks.
132          */
133         dm_cblock_t cache_size;
134
135         /*
136          * Fields for converting from sectors to blocks.
137          */
138         uint32_t sectors_per_block;
139         int sectors_per_block_shift;
140
141         struct dm_cache_metadata *cmd;
142
143         spinlock_t lock;
144         struct bio_list deferred_bios;
145         struct bio_list deferred_flush_bios;
146         struct bio_list deferred_writethrough_bios;
147         struct list_head quiesced_migrations;
148         struct list_head completed_migrations;
149         struct list_head need_commit_migrations;
150         sector_t migration_threshold;
151         atomic_t nr_migrations;
152         wait_queue_head_t migration_wait;
153
154         /*
155          * cache_size entries, dirty if set
156          */
157         dm_cblock_t nr_dirty;
158         unsigned long *dirty_bitset;
159
160         /*
161          * origin_blocks entries, discarded if set.
162          */
163         uint32_t discard_block_size; /* a power of 2 times sectors per block */
164         dm_dblock_t discard_nr_blocks;
165         unsigned long *discard_bitset;
166
167         struct dm_kcopyd_client *copier;
168         struct workqueue_struct *wq;
169         struct work_struct worker;
170
171         struct delayed_work waker;
172         unsigned long last_commit_jiffies;
173
174         struct dm_bio_prison *prison;
175         struct dm_deferred_set *all_io_ds;
176
177         mempool_t *migration_pool;
178         struct dm_cache_migration *next_migration;
179
180         struct dm_cache_policy *policy;
181         unsigned policy_nr_args;
182
183         bool need_tick_bio:1;
184         bool sized:1;
185         bool quiescing:1;
186         bool commit_requested:1;
187         bool loaded_mappings:1;
188         bool loaded_discards:1;
189
190         struct cache_stats stats;
191
192         /*
193          * Rather than reconstructing the table line for the status we just
194          * save it and regurgitate.
195          */
196         unsigned nr_ctr_args;
197         const char **ctr_args;
198 };
199
200 struct per_bio_data {
201         bool tick:1;
202         unsigned req_nr:2;
203         struct dm_deferred_entry *all_io_entry;
204
205         /* writethrough fields */
206         struct cache *cache;
207         dm_cblock_t cblock;
208         bio_end_io_t *saved_bi_end_io;
209         struct dm_bio_details bio_details;
210 };
211
212 struct dm_cache_migration {
213         struct list_head list;
214         struct cache *cache;
215
216         unsigned long start_jiffies;
217         dm_oblock_t old_oblock;
218         dm_oblock_t new_oblock;
219         dm_cblock_t cblock;
220
221         bool err:1;
222         bool writeback:1;
223         bool demote:1;
224         bool promote:1;
225
226         struct dm_bio_prison_cell *old_ocell;
227         struct dm_bio_prison_cell *new_ocell;
228 };
229
230 /*
231  * Processing a bio in the worker thread may require these memory
232  * allocations.  We prealloc to avoid deadlocks (the same worker thread
233  * frees them back to the mempool).
234  */
235 struct prealloc {
236         struct dm_cache_migration *mg;
237         struct dm_bio_prison_cell *cell1;
238         struct dm_bio_prison_cell *cell2;
239 };
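/*
 * A rough sketch of the typical prealloc life cycle, as used by
 * process_deferred_bios() below:
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	while (there are deferred bios) {
 *		if (prealloc_data_structs(cache, &structs))
 *			break;		-- out of memory, try again later
 *		process_bio(cache, &structs, bio);
 *	}
 *	prealloc_free_structs(cache, &structs);
 */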
240
241 static void wake_worker(struct cache *cache)
242 {
243         queue_work(cache->wq, &cache->worker);
244 }
245
246 /*----------------------------------------------------------------*/
247
248 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
249 {
250         /* FIXME: change to use a local slab. */
251         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
252 }
253
254 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
255 {
256         dm_bio_prison_free_cell(cache->prison, cell);
257 }
258
259 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
260 {
261         if (!p->mg) {
262                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
263                 if (!p->mg)
264                         return -ENOMEM;
265         }
266
267         if (!p->cell1) {
268                 p->cell1 = alloc_prison_cell(cache);
269                 if (!p->cell1)
270                         return -ENOMEM;
271         }
272
273         if (!p->cell2) {
274                 p->cell2 = alloc_prison_cell(cache);
275                 if (!p->cell2)
276                         return -ENOMEM;
277         }
278
279         return 0;
280 }
281
282 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
283 {
284         if (p->cell2)
285                 free_prison_cell(cache, p->cell2);
286
287         if (p->cell1)
288                 free_prison_cell(cache, p->cell1);
289
290         if (p->mg)
291                 mempool_free(p->mg, cache->migration_pool);
292 }
293
294 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
295 {
296         struct dm_cache_migration *mg = p->mg;
297
298         BUG_ON(!mg);
299         p->mg = NULL;
300
301         return mg;
302 }
303
304 /*
305  * You must have a cell within the prealloc struct to return.  If not, this
306  * function will BUG() rather than returning NULL.
307  */
308 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
309 {
310         struct dm_bio_prison_cell *r = NULL;
311
312         if (p->cell1) {
313                 r = p->cell1;
314                 p->cell1 = NULL;
315
316         } else if (p->cell2) {
317                 r = p->cell2;
318                 p->cell2 = NULL;
319         } else
320                 BUG();
321
322         return r;
323 }
324
325 /*
326  * You can't have more than two cells in a prealloc struct.  BUG() will be
327  * called if you try to overfill.
328  */
329 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
330 {
331         if (!p->cell2)
332                 p->cell2 = cell;
333
334         else if (!p->cell1)
335                 p->cell1 = cell;
336
337         else
338                 BUG();
339 }
340
341 /*----------------------------------------------------------------*/
342
343 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
344 {
345         key->virtual = 0;
346         key->dev = 0;
347         key->block = from_oblock(oblock);
348 }
349
350 /*
351  * The caller hands in a preallocated cell, and a free function for it.
352  * The cell will be freed if there's an error, or if it wasn't used because
353  * a cell with that key already exists.
354  */
355 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
356
357 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
358                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
359                       cell_free_fn free_fn, void *free_context,
360                       struct dm_bio_prison_cell **cell_result)
361 {
362         int r;
363         struct dm_cell_key key;
364
365         build_key(oblock, &key);
366         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
367         if (r)
368                 free_fn(free_context, cell_prealloc);
369
370         return r;
371 }
372
373 static int get_cell(struct cache *cache,
374                     dm_oblock_t oblock,
375                     struct prealloc *structs,
376                     struct dm_bio_prison_cell **cell_result)
377 {
378         int r;
379         struct dm_cell_key key;
380         struct dm_bio_prison_cell *cell_prealloc;
381
382         cell_prealloc = prealloc_get_cell(structs);
383
384         build_key(oblock, &key);
385         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
386         if (r)
387                 prealloc_put_cell(structs, cell_prealloc);
388
389         return r;
390 }
391
392 /*----------------------------------------------------------------*/
393
394 static bool is_dirty(struct cache *cache, dm_cblock_t b)
395 {
396         return test_bit(from_cblock(b), cache->dirty_bitset);
397 }
398
399 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400 {
401         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
402                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
403                 policy_set_dirty(cache->policy, oblock);
404         }
405 }
406
407 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
408 {
409         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
410                 policy_clear_dirty(cache->policy, oblock);
411                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
412                 if (!from_cblock(cache->nr_dirty))
413                         dm_table_event(cache->ti->table);
414         }
415 }
416
417 /*----------------------------------------------------------------*/
418 static bool block_size_is_power_of_two(struct cache *cache)
419 {
420         return cache->sectors_per_block_shift >= 0;
421 }
422
423 static dm_block_t block_div(dm_block_t b, uint32_t n)
424 {
425         do_div(b, n);
426
427         return b;
428 }
429
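/*
 * discard_block_size is held in sectors and is a power of two multiple
 * of the cache block size, so dividing it by sectors_per_block gives
 * the number of origin blocks covered by one discard block.
 */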
430 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
431 {
432         uint32_t discard_blocks = cache->discard_block_size;
433         dm_block_t b = from_oblock(oblock);
434
435         if (!block_size_is_power_of_two(cache))
436                 discard_blocks = discard_blocks / cache->sectors_per_block;
437         else
438                 discard_blocks >>= cache->sectors_per_block_shift;
439
440         b = block_div(b, discard_blocks);
441
442         return to_dblock(b);
443 }
444
445 static void set_discard(struct cache *cache, dm_dblock_t b)
446 {
447         unsigned long flags;
448
449         atomic_inc(&cache->stats.discard_count);
450
451         spin_lock_irqsave(&cache->lock, flags);
452         set_bit(from_dblock(b), cache->discard_bitset);
453         spin_unlock_irqrestore(&cache->lock, flags);
454 }
455
456 static void clear_discard(struct cache *cache, dm_dblock_t b)
457 {
458         unsigned long flags;
459
460         spin_lock_irqsave(&cache->lock, flags);
461         clear_bit(from_dblock(b), cache->discard_bitset);
462         spin_unlock_irqrestore(&cache->lock, flags);
463 }
464
465 static bool is_discarded(struct cache *cache, dm_dblock_t b)
466 {
467         int r;
468         unsigned long flags;
469
470         spin_lock_irqsave(&cache->lock, flags);
471         r = test_bit(from_dblock(b), cache->discard_bitset);
472         spin_unlock_irqrestore(&cache->lock, flags);
473
474         return r;
475 }
476
477 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
478 {
479         int r;
480         unsigned long flags;
481
482         spin_lock_irqsave(&cache->lock, flags);
483         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
484                      cache->discard_bitset);
485         spin_unlock_irqrestore(&cache->lock, flags);
486
487         return r;
488 }
489
490 /*----------------------------------------------------------------*/
491
492 static void load_stats(struct cache *cache)
493 {
494         struct dm_cache_statistics stats;
495
496         dm_cache_metadata_get_stats(cache->cmd, &stats);
497         atomic_set(&cache->stats.read_hit, stats.read_hits);
498         atomic_set(&cache->stats.read_miss, stats.read_misses);
499         atomic_set(&cache->stats.write_hit, stats.write_hits);
500         atomic_set(&cache->stats.write_miss, stats.write_misses);
501 }
502
503 static void save_stats(struct cache *cache)
504 {
505         struct dm_cache_statistics stats;
506
507         stats.read_hits = atomic_read(&cache->stats.read_hit);
508         stats.read_misses = atomic_read(&cache->stats.read_miss);
509         stats.write_hits = atomic_read(&cache->stats.write_hit);
510         stats.write_misses = atomic_read(&cache->stats.write_miss);
511
512         dm_cache_metadata_set_stats(cache->cmd, &stats);
513 }
514
515 /*----------------------------------------------------------------
516  * Per bio data
517  *--------------------------------------------------------------*/
518 static struct per_bio_data *get_per_bio_data(struct bio *bio)
519 {
520         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
521         BUG_ON(!pb);
522         return pb;
523 }
524
525 static struct per_bio_data *init_per_bio_data(struct bio *bio)
526 {
527         struct per_bio_data *pb = get_per_bio_data(bio);
528
529         pb->tick = false;
530         pb->req_nr = dm_bio_get_target_bio_nr(bio);
531         pb->all_io_entry = NULL;
532
533         return pb;
534 }
535
536 /*----------------------------------------------------------------
537  * Remapping
538  *--------------------------------------------------------------*/
539 static void remap_to_origin(struct cache *cache, struct bio *bio)
540 {
541         bio->bi_bdev = cache->origin_dev->bdev;
542 }
543
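/*
 * The offset within the block is preserved, only the base changes.
 * e.g. with 512 sector cache blocks, origin sector 1027 (offset 3 into
 * its block) remaps to cache sector from_cblock(cblock) * 512 + 3.
 */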
544 static void remap_to_cache(struct cache *cache, struct bio *bio,
545                            dm_cblock_t cblock)
546 {
547         sector_t bi_sector = bio->bi_sector;
548
549         bio->bi_bdev = cache->cache_dev->bdev;
550         if (!block_size_is_power_of_two(cache))
551                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
552                                 sector_div(bi_sector, cache->sectors_per_block);
553         else
554                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
555                                 (bi_sector & (cache->sectors_per_block - 1));
556 }
557
558 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
559 {
560         unsigned long flags;
561         struct per_bio_data *pb = get_per_bio_data(bio);
562
563         spin_lock_irqsave(&cache->lock, flags);
564         if (cache->need_tick_bio &&
565             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
566                 pb->tick = true;
567                 cache->need_tick_bio = false;
568         }
569         spin_unlock_irqrestore(&cache->lock, flags);
570 }
571
572 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
573                                   dm_oblock_t oblock)
574 {
575         check_if_tick_bio_needed(cache, bio);
576         remap_to_origin(cache, bio);
577         if (bio_data_dir(bio) == WRITE)
578                 clear_discard(cache, oblock_to_dblock(cache, oblock));
579 }
580
581 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
582                                  dm_oblock_t oblock, dm_cblock_t cblock)
583 {
584         remap_to_cache(cache, bio, cblock);
585         if (bio_data_dir(bio) == WRITE) {
586                 set_dirty(cache, oblock, cblock);
587                 clear_discard(cache, oblock_to_dblock(cache, oblock));
588         }
589 }
590
591 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
592 {
593         sector_t block_nr = bio->bi_sector;
594
595         if (!block_size_is_power_of_two(cache))
596                 (void) sector_div(block_nr, cache->sectors_per_block);
597         else
598                 block_nr >>= cache->sectors_per_block_shift;
599
600         return to_oblock(block_nr);
601 }
602
603 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
604 {
605         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
606 }
607
608 static void issue(struct cache *cache, struct bio *bio)
609 {
610         unsigned long flags;
611
612         if (!bio_triggers_commit(cache, bio)) {
613                 generic_make_request(bio);
614                 return;
615         }
616
617         /*
618          * Batch together any bios that trigger commits and then issue a
619          * single commit for them in do_worker().
620          */
621         spin_lock_irqsave(&cache->lock, flags);
622         cache->commit_requested = true;
623         bio_list_add(&cache->deferred_flush_bios, bio);
624         spin_unlock_irqrestore(&cache->lock, flags);
625 }
626
627 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
628 {
629         unsigned long flags;
630
631         spin_lock_irqsave(&cache->lock, flags);
632         bio_list_add(&cache->deferred_writethrough_bios, bio);
633         spin_unlock_irqrestore(&cache->lock, flags);
634
635         wake_worker(cache);
636 }
637
638 static void writethrough_endio(struct bio *bio, int err)
639 {
640         struct per_bio_data *pb = get_per_bio_data(bio);
641         bio->bi_end_io = pb->saved_bi_end_io;
642
643         if (err) {
644                 bio_endio(bio, err);
645                 return;
646         }
647
648         dm_bio_restore(&pb->bio_details, bio);
649         remap_to_cache(pb->cache, bio, pb->cblock);
650
651         /*
652          * We can't issue this bio directly, since we're in interrupt
653          * context.  So it gets put on a bio list for processing by the
654          * worker thread.
655          */
656         defer_writethrough_bio(pb->cache, bio);
657 }
658
659 /*
660  * When running in writethrough mode we need to send writes to clean blocks
661  * to both the cache and origin devices.  In future we'd like to clone the
662  * bio and send them in parallel, but for now we're doing them in
663  * series as this is easier.
664  */
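/*
 * The round trip is: remap_to_origin_then_cache() saves the bio's
 * end_io and details, points bi_end_io at writethrough_endio() and
 * remaps the bio to the origin (the caller then issues it).  When the
 * origin write completes, writethrough_endio() - running in interrupt
 * context - restores the bio, remaps it to the cache device and defers
 * it to the worker thread, which reissues it via generic_make_request().
 */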
665 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
666                                        dm_oblock_t oblock, dm_cblock_t cblock)
667 {
668         struct per_bio_data *pb = get_per_bio_data(bio);
669
670         pb->cache = cache;
671         pb->cblock = cblock;
672         pb->saved_bi_end_io = bio->bi_end_io;
673         dm_bio_record(&pb->bio_details, bio);
674         bio->bi_end_io = writethrough_endio;
675
676         remap_to_origin_clear_discard(pb->cache, bio, oblock);
677 }
678
679 /*----------------------------------------------------------------
680  * Migration processing
681  *
682  * Migration covers moving data from the origin device to the cache, or
683  * vice versa.
684  *--------------------------------------------------------------*/
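/*
 * A migration moves through three lists, all drained by do_worker():
 *
 *   quiesced_migrations    -> issue_copy() kicks off the kcopyd copy
 *   completed_migrations   -> complete_migration() updates the on-disk
 *                             metadata (or unwinds on error)
 *   need_commit_migrations -> migration_success_post_commit() releases
 *                             the cells once the commit has hit disk
 */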
685 static void free_migration(struct dm_cache_migration *mg)
686 {
687         mempool_free(mg, mg->cache->migration_pool);
688 }
689
690 static void inc_nr_migrations(struct cache *cache)
691 {
692         atomic_inc(&cache->nr_migrations);
693 }
694
695 static void dec_nr_migrations(struct cache *cache)
696 {
697         atomic_dec(&cache->nr_migrations);
698
699         /*
700          * Wake the worker in case we're suspending the target.
701          */
702         wake_up(&cache->migration_wait);
703 }
704
705 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
706                          bool holder)
707 {
708         (holder ? dm_cell_release : dm_cell_release_no_holder)
709                 (cache->prison, cell, &cache->deferred_bios);
710         free_prison_cell(cache, cell);
711 }
712
713 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
714                        bool holder)
715 {
716         unsigned long flags;
717
718         spin_lock_irqsave(&cache->lock, flags);
719         __cell_defer(cache, cell, holder);
720         spin_unlock_irqrestore(&cache->lock, flags);
721
722         wake_worker(cache);
723 }
724
725 static void cleanup_migration(struct dm_cache_migration *mg)
726 {
727         dec_nr_migrations(mg->cache);
728         free_migration(mg);
729 }
730
731 static void migration_failure(struct dm_cache_migration *mg)
732 {
733         struct cache *cache = mg->cache;
734
735         if (mg->writeback) {
736                 DMWARN_LIMIT("writeback failed; couldn't copy block");
737                 set_dirty(cache, mg->old_oblock, mg->cblock);
738                 cell_defer(cache, mg->old_ocell, false);
739
740         } else if (mg->demote) {
741                 DMWARN_LIMIT("demotion failed; couldn't copy block");
742                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
743
744                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
745                 if (mg->promote)
746                         cell_defer(cache, mg->new_ocell, 1);
747         } else {
748                 DMWARN_LIMIT("promotion failed; couldn't copy block");
749                 policy_remove_mapping(cache->policy, mg->new_oblock);
750                 cell_defer(cache, mg->new_ocell, 1);
751         }
752
753         cleanup_migration(mg);
754 }
755
756 static void migration_success_pre_commit(struct dm_cache_migration *mg)
757 {
758         unsigned long flags;
759         struct cache *cache = mg->cache;
760
761         if (mg->writeback) {
762                 cell_defer(cache, mg->old_ocell, false);
763                 clear_dirty(cache, mg->old_oblock, mg->cblock);
764                 cleanup_migration(mg);
765                 return;
766
767         } else if (mg->demote) {
768                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
769                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
770                         policy_force_mapping(cache->policy, mg->new_oblock,
771                                              mg->old_oblock);
772                         if (mg->promote)
773                                 cell_defer(cache, mg->new_ocell, true);
774                         cleanup_migration(mg);
775                         return;
776                 }
777         } else {
778                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
779                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
780                         policy_remove_mapping(cache->policy, mg->new_oblock);
781                         cleanup_migration(mg);
782                         return;
783                 }
784         }
785
786         spin_lock_irqsave(&cache->lock, flags);
787         list_add_tail(&mg->list, &cache->need_commit_migrations);
788         cache->commit_requested = true;
789         spin_unlock_irqrestore(&cache->lock, flags);
790 }
791
792 static void migration_success_post_commit(struct dm_cache_migration *mg)
793 {
794         unsigned long flags;
795         struct cache *cache = mg->cache;
796
797         if (mg->writeback) {
798                 DMWARN("writeback unexpectedly triggered commit");
799                 return;
800
801         } else if (mg->demote) {
802                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
803
804                 if (mg->promote) {
805                         mg->demote = false;
806
807                         spin_lock_irqsave(&cache->lock, flags);
808                         list_add_tail(&mg->list, &cache->quiesced_migrations);
809                         spin_unlock_irqrestore(&cache->lock, flags);
810
811                 } else
812                         cleanup_migration(mg);
813
814         } else {
815                 cell_defer(cache, mg->new_ocell, true);
816                 clear_dirty(cache, mg->new_oblock, mg->cblock);
817                 cleanup_migration(mg);
818         }
819 }
820
821 static void copy_complete(int read_err, unsigned long write_err, void *context)
822 {
823         unsigned long flags;
824         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
825         struct cache *cache = mg->cache;
826
827         if (read_err || write_err)
828                 mg->err = true;
829
830         spin_lock_irqsave(&cache->lock, flags);
831         list_add_tail(&mg->list, &cache->completed_migrations);
832         spin_unlock_irqrestore(&cache->lock, flags);
833
834         wake_worker(cache);
835 }
836
837 static void issue_copy_real(struct dm_cache_migration *mg)
838 {
839         int r;
840         struct dm_io_region o_region, c_region;
841         struct cache *cache = mg->cache;
842
843         o_region.bdev = cache->origin_dev->bdev;
844         o_region.count = cache->sectors_per_block;
845
846         c_region.bdev = cache->cache_dev->bdev;
847         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
848         c_region.count = cache->sectors_per_block;
849
850         if (mg->writeback || mg->demote) {
851                 /* demote */
852                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
853                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
854         } else {
855                 /* promote */
856                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
857                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
858         }
859
860         if (r < 0)
861                 migration_failure(mg);
862 }
863
864 static void avoid_copy(struct dm_cache_migration *mg)
865 {
866         atomic_inc(&mg->cache->stats.copies_avoided);
867         migration_success_pre_commit(mg);
868 }
869
870 static void issue_copy(struct dm_cache_migration *mg)
871 {
872         bool avoid;
873         struct cache *cache = mg->cache;
874
875         if (mg->writeback || mg->demote)
876                 avoid = !is_dirty(cache, mg->cblock) ||
877                         is_discarded_oblock(cache, mg->old_oblock);
878         else
879                 avoid = is_discarded_oblock(cache, mg->new_oblock);
880
881         avoid ? avoid_copy(mg) : issue_copy_real(mg);
882 }
883
884 static void complete_migration(struct dm_cache_migration *mg)
885 {
886         if (mg->err)
887                 migration_failure(mg);
888         else
889                 migration_success_pre_commit(mg);
890 }
891
892 static void process_migrations(struct cache *cache, struct list_head *head,
893                                void (*fn)(struct dm_cache_migration *))
894 {
895         unsigned long flags;
896         struct list_head list;
897         struct dm_cache_migration *mg, *tmp;
898
899         INIT_LIST_HEAD(&list);
900         spin_lock_irqsave(&cache->lock, flags);
901         list_splice_init(head, &list);
902         spin_unlock_irqrestore(&cache->lock, flags);
903
904         list_for_each_entry_safe(mg, tmp, &list, list)
905                 fn(mg);
906 }
907
908 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
909 {
910         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
911 }
912
913 static void queue_quiesced_migration(struct dm_cache_migration *mg)
914 {
915         unsigned long flags;
916         struct cache *cache = mg->cache;
917
918         spin_lock_irqsave(&cache->lock, flags);
919         __queue_quiesced_migration(mg);
920         spin_unlock_irqrestore(&cache->lock, flags);
921
922         wake_worker(cache);
923 }
924
925 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
926 {
927         unsigned long flags;
928         struct dm_cache_migration *mg, *tmp;
929
930         spin_lock_irqsave(&cache->lock, flags);
931         list_for_each_entry_safe(mg, tmp, work, list)
932                 __queue_quiesced_migration(mg);
933         spin_unlock_irqrestore(&cache->lock, flags);
934
935         wake_worker(cache);
936 }
937
938 static void check_for_quiesced_migrations(struct cache *cache,
939                                           struct per_bio_data *pb)
940 {
941         struct list_head work;
942
943         if (!pb->all_io_entry)
944                 return;
945
946         INIT_LIST_HEAD(&work);
947         if (pb->all_io_entry)
948                 dm_deferred_entry_dec(pb->all_io_entry, &work);
949
950         if (!list_empty(&work))
951                 queue_quiesced_migrations(cache, &work);
952 }
953
954 static void quiesce_migration(struct dm_cache_migration *mg)
955 {
956         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
957                 queue_quiesced_migration(mg);
958 }
959
960 static void promote(struct cache *cache, struct prealloc *structs,
961                     dm_oblock_t oblock, dm_cblock_t cblock,
962                     struct dm_bio_prison_cell *cell)
963 {
964         struct dm_cache_migration *mg = prealloc_get_migration(structs);
965
966         mg->err = false;
967         mg->writeback = false;
968         mg->demote = false;
969         mg->promote = true;
970         mg->cache = cache;
971         mg->new_oblock = oblock;
972         mg->cblock = cblock;
973         mg->old_ocell = NULL;
974         mg->new_ocell = cell;
975         mg->start_jiffies = jiffies;
976
977         inc_nr_migrations(cache);
978         quiesce_migration(mg);
979 }
980
981 static void writeback(struct cache *cache, struct prealloc *structs,
982                       dm_oblock_t oblock, dm_cblock_t cblock,
983                       struct dm_bio_prison_cell *cell)
984 {
985         struct dm_cache_migration *mg = prealloc_get_migration(structs);
986
987         mg->err = false;
988         mg->writeback = true;
989         mg->demote = false;
990         mg->promote = false;
991         mg->cache = cache;
992         mg->old_oblock = oblock;
993         mg->cblock = cblock;
994         mg->old_ocell = cell;
995         mg->new_ocell = NULL;
996         mg->start_jiffies = jiffies;
997
998         inc_nr_migrations(cache);
999         quiesce_migration(mg);
1000 }
1001
1002 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1003                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1004                                 dm_cblock_t cblock,
1005                                 struct dm_bio_prison_cell *old_ocell,
1006                                 struct dm_bio_prison_cell *new_ocell)
1007 {
1008         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1009
1010         mg->err = false;
1011         mg->writeback = false;
1012         mg->demote = true;
1013         mg->promote = true;
1014         mg->cache = cache;
1015         mg->old_oblock = old_oblock;
1016         mg->new_oblock = new_oblock;
1017         mg->cblock = cblock;
1018         mg->old_ocell = old_ocell;
1019         mg->new_ocell = new_ocell;
1020         mg->start_jiffies = jiffies;
1021
1022         inc_nr_migrations(cache);
1023         quiesce_migration(mg);
1024 }
1025
1026 /*----------------------------------------------------------------
1027  * bio processing
1028  *--------------------------------------------------------------*/
1029 static void defer_bio(struct cache *cache, struct bio *bio)
1030 {
1031         unsigned long flags;
1032
1033         spin_lock_irqsave(&cache->lock, flags);
1034         bio_list_add(&cache->deferred_bios, bio);
1035         spin_unlock_irqrestore(&cache->lock, flags);
1036
1037         wake_worker(cache);
1038 }
1039
1040 static void process_flush_bio(struct cache *cache, struct bio *bio)
1041 {
1042         struct per_bio_data *pb = get_per_bio_data(bio);
1043
1044         BUG_ON(bio->bi_size);
1045         if (!pb->req_nr)
1046                 remap_to_origin(cache, bio);
1047         else
1048                 remap_to_cache(cache, bio, 0);
1049
1050         issue(cache, bio);
1051 }
1052
1053 /*
1054  * People generally discard large parts of a device, e.g. the whole device
1055  * when formatting.  Splitting these large discards up into cache block
1056  * sized I/Os and then quiescing (always necessary for discard) takes too
1057  * long.
1058  *
1059  * We keep it simple, and allow any size of discard to come in, and just
1060  * mark off blocks on the discard bitset.  No passdown occurs!
1061  *
1062  * To implement passdown we need to change the bio_prison such that a cell
1063  * can have a key that spans many blocks.
1064  */
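/*
 * Note that only discard blocks completely covered by the bio get
 * marked: the start is rounded up and the end rounded down to
 * discard_block_size boundaries.
 */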
1065 static void process_discard_bio(struct cache *cache, struct bio *bio)
1066 {
1067         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1068                                                   cache->discard_block_size);
1069         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1070         dm_block_t b;
1071
1072         end_block = block_div(end_block, cache->discard_block_size);
1073
1074         for (b = start_block; b < end_block; b++)
1075                 set_discard(cache, to_dblock(b));
1076
1077         bio_endio(bio, 0);
1078 }
1079
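/*
 * migration_threshold is expressed in sectors: refuse to start another
 * background migration once the blocks already in flight, plus the
 * prospective one, would reach that many sectors.
 */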
1080 static bool spare_migration_bandwidth(struct cache *cache)
1081 {
1082         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1083                 cache->sectors_per_block;
1084         return current_volume < cache->migration_threshold;
1085 }
1086
1087 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1088                                dm_cblock_t cblock)
1089 {
1090         return bio_data_dir(bio) == WRITE &&
1091                 cache->features.write_through && !is_dirty(cache, cblock);
1092 }
1093
1094 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1095 {
1096         atomic_inc(bio_data_dir(bio) == READ ?
1097                    &cache->stats.read_hit : &cache->stats.write_hit);
1098 }
1099
1100 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1101 {
1102         atomic_inc(bio_data_dir(bio) == READ ?
1103                    &cache->stats.read_miss : &cache->stats.write_miss);
1104 }
1105
1106 static void process_bio(struct cache *cache, struct prealloc *structs,
1107                         struct bio *bio)
1108 {
1109         int r;
1110         bool release_cell = true;
1111         dm_oblock_t block = get_bio_block(cache, bio);
1112         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1113         struct policy_result lookup_result;
1114         struct per_bio_data *pb = get_per_bio_data(bio);
1115         bool discarded_block = is_discarded_oblock(cache, block);
1116         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1117
1118         /*
1119          * Check to see if that block is currently migrating.
1120          */
1121         cell_prealloc = prealloc_get_cell(structs);
1122         r = bio_detain(cache, block, bio, cell_prealloc,
1123                        (cell_free_fn) prealloc_put_cell,
1124                        structs, &new_ocell);
1125         if (r > 0)
1126                 return;
1127
1128         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1129                        bio, &lookup_result);
1130
1131         if (r == -EWOULDBLOCK)
1132                 /* migration has been denied */
1133                 lookup_result.op = POLICY_MISS;
1134
1135         switch (lookup_result.op) {
1136         case POLICY_HIT:
1137                 inc_hit_counter(cache, bio);
1138                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1139
1140                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1141                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1142                 else
1143                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1144
1145                 issue(cache, bio);
1146                 break;
1147
1148         case POLICY_MISS:
1149                 inc_miss_counter(cache, bio);
1150                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1151                 remap_to_origin_clear_discard(cache, bio, block);
1152                 issue(cache, bio);
1153                 break;
1154
1155         case POLICY_NEW:
1156                 atomic_inc(&cache->stats.promotion);
1157                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1158                 release_cell = false;
1159                 break;
1160
1161         case POLICY_REPLACE:
1162                 cell_prealloc = prealloc_get_cell(structs);
1163                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1164                                (cell_free_fn) prealloc_put_cell,
1165                                structs, &old_ocell);
1166                 if (r > 0) {
1167                         /*
1168                          * We have to be careful to avoid lock inversion of
1169                          * the cells.  So we back off, and wait for the
1170                          * old_ocell to become free.
1171                          */
1172                         policy_force_mapping(cache->policy, block,
1173                                              lookup_result.old_oblock);
1174                         atomic_inc(&cache->stats.cache_cell_clash);
1175                         break;
1176                 }
1177                 atomic_inc(&cache->stats.demotion);
1178                 atomic_inc(&cache->stats.promotion);
1179
1180                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1181                                     block, lookup_result.cblock,
1182                                     old_ocell, new_ocell);
1183                 release_cell = false;
1184                 break;
1185
1186         default:
1187                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1188                             (unsigned) lookup_result.op);
1189                 bio_io_error(bio);
1190         }
1191
1192         if (release_cell)
1193                 cell_defer(cache, new_ocell, false);
1194 }
1195
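/*
 * Commit at least once every COMMIT_PERIOD jiffies; the first
 * comparison copes with jiffies wrapping around.
 */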
1196 static int need_commit_due_to_time(struct cache *cache)
1197 {
1198         return jiffies < cache->last_commit_jiffies ||
1199                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1200 }
1201
1202 static int commit_if_needed(struct cache *cache)
1203 {
1204         if (dm_cache_changed_this_transaction(cache->cmd) &&
1205             (cache->commit_requested || need_commit_due_to_time(cache))) {
1206                 atomic_inc(&cache->stats.commit_count);
1207                 cache->last_commit_jiffies = jiffies;
1208                 cache->commit_requested = false;
1209                 return dm_cache_commit(cache->cmd, false);
1210         }
1211
1212         return 0;
1213 }
1214
1215 static void process_deferred_bios(struct cache *cache)
1216 {
1217         unsigned long flags;
1218         struct bio_list bios;
1219         struct bio *bio;
1220         struct prealloc structs;
1221
1222         memset(&structs, 0, sizeof(structs));
1223         bio_list_init(&bios);
1224
1225         spin_lock_irqsave(&cache->lock, flags);
1226         bio_list_merge(&bios, &cache->deferred_bios);
1227         bio_list_init(&cache->deferred_bios);
1228         spin_unlock_irqrestore(&cache->lock, flags);
1229
1230         while (!bio_list_empty(&bios)) {
1231                 /*
1232                  * If we've got no free migration structs, and processing
1233                  * this bio might require one, we pause until there are some
1234                  * prepared mappings to process.
1235                  */
1236                 if (prealloc_data_structs(cache, &structs)) {
1237                         spin_lock_irqsave(&cache->lock, flags);
1238                         bio_list_merge(&cache->deferred_bios, &bios);
1239                         spin_unlock_irqrestore(&cache->lock, flags);
1240                         break;
1241                 }
1242
1243                 bio = bio_list_pop(&bios);
1244
1245                 if (bio->bi_rw & REQ_FLUSH)
1246                         process_flush_bio(cache, bio);
1247                 else if (bio->bi_rw & REQ_DISCARD)
1248                         process_discard_bio(cache, bio);
1249                 else
1250                         process_bio(cache, &structs, bio);
1251         }
1252
1253         prealloc_free_structs(cache, &structs);
1254 }
1255
1256 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1257 {
1258         unsigned long flags;
1259         struct bio_list bios;
1260         struct bio *bio;
1261
1262         bio_list_init(&bios);
1263
1264         spin_lock_irqsave(&cache->lock, flags);
1265         bio_list_merge(&bios, &cache->deferred_flush_bios);
1266         bio_list_init(&cache->deferred_flush_bios);
1267         spin_unlock_irqrestore(&cache->lock, flags);
1268
1269         while ((bio = bio_list_pop(&bios)))
1270                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1271 }
1272
1273 static void process_deferred_writethrough_bios(struct cache *cache)
1274 {
1275         unsigned long flags;
1276         struct bio_list bios;
1277         struct bio *bio;
1278
1279         bio_list_init(&bios);
1280
1281         spin_lock_irqsave(&cache->lock, flags);
1282         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1283         bio_list_init(&cache->deferred_writethrough_bios);
1284         spin_unlock_irqrestore(&cache->lock, flags);
1285
1286         while ((bio = bio_list_pop(&bios)))
1287                 generic_make_request(bio);
1288 }
1289
1290 static void writeback_some_dirty_blocks(struct cache *cache)
1291 {
1292         int r = 0;
1293         dm_oblock_t oblock;
1294         dm_cblock_t cblock;
1295         struct prealloc structs;
1296         struct dm_bio_prison_cell *old_ocell;
1297
1298         memset(&structs, 0, sizeof(structs));
1299
1300         while (spare_migration_bandwidth(cache)) {
1301                 if (prealloc_data_structs(cache, &structs))
1302                         break;
1303
1304                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1305                 if (r)
1306                         break;
1307
1308                 r = get_cell(cache, oblock, &structs, &old_ocell);
1309                 if (r) {
1310                         policy_set_dirty(cache->policy, oblock);
1311                         break;
1312                 }
1313
1314                 writeback(cache, &structs, oblock, cblock, old_ocell);
1315         }
1316
1317         prealloc_free_structs(cache, &structs);
1318 }
1319
1320 /*----------------------------------------------------------------
1321  * Main worker loop
1322  *--------------------------------------------------------------*/
1323 static void start_quiescing(struct cache *cache)
1324 {
1325         unsigned long flags;
1326
1327         spin_lock_irqsave(&cache->lock, flags);
1328         cache->quiescing = 1;
1329         spin_unlock_irqrestore(&cache->lock, flags);
1330 }
1331
1332 static void stop_quiescing(struct cache *cache)
1333 {
1334         unsigned long flags;
1335
1336         spin_lock_irqsave(&cache->lock, flags);
1337         cache->quiescing = 0;
1338         spin_unlock_irqrestore(&cache->lock, flags);
1339 }
1340
1341 static bool is_quiescing(struct cache *cache)
1342 {
1343         int r;
1344         unsigned long flags;
1345
1346         spin_lock_irqsave(&cache->lock, flags);
1347         r = cache->quiescing;
1348         spin_unlock_irqrestore(&cache->lock, flags);
1349
1350         return r;
1351 }
1352
1353 static void wait_for_migrations(struct cache *cache)
1354 {
1355         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1356 }
1357
1358 static void stop_worker(struct cache *cache)
1359 {
1360         cancel_delayed_work(&cache->waker);
1361         flush_workqueue(cache->wq);
1362 }
1363
1364 static void requeue_deferred_io(struct cache *cache)
1365 {
1366         struct bio *bio;
1367         struct bio_list bios;
1368
1369         bio_list_init(&bios);
1370         bio_list_merge(&bios, &cache->deferred_bios);
1371         bio_list_init(&cache->deferred_bios);
1372
1373         while ((bio = bio_list_pop(&bios)))
1374                 bio_endio(bio, DM_ENDIO_REQUEUE);
1375 }
1376
1377 static int more_work(struct cache *cache)
1378 {
1379         if (is_quiescing(cache))
1380                 return !list_empty(&cache->quiesced_migrations) ||
1381                         !list_empty(&cache->completed_migrations) ||
1382                         !list_empty(&cache->need_commit_migrations);
1383         else
1384                 return !bio_list_empty(&cache->deferred_bios) ||
1385                         !bio_list_empty(&cache->deferred_flush_bios) ||
1386                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1387                         !list_empty(&cache->quiesced_migrations) ||
1388                         !list_empty(&cache->completed_migrations) ||
1389                         !list_empty(&cache->need_commit_migrations);
1390 }
1391
1392 static void do_worker(struct work_struct *ws)
1393 {
1394         struct cache *cache = container_of(ws, struct cache, worker);
1395
1396         do {
1397                 if (!is_quiescing(cache))
1398                         process_deferred_bios(cache);
1399
1400                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1401                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1402
1403                 writeback_some_dirty_blocks(cache);
1404
1405                 process_deferred_writethrough_bios(cache);
1406
1407                 if (commit_if_needed(cache)) {
1408                         process_deferred_flush_bios(cache, false);
1409
1410                         /*
1411                          * FIXME: rollback metadata or just go into a
1412                          * failure mode and error everything
1413                          */
1414                 } else {
1415                         process_deferred_flush_bios(cache, true);
1416                         process_migrations(cache, &cache->need_commit_migrations,
1417                                            migration_success_post_commit);
1418                 }
1419         } while (more_work(cache));
1420 }
1421
1422 /*
1423  * We want to commit periodically so that not too much
1424  * unwritten metadata builds up.
1425  */
1426 static void do_waker(struct work_struct *ws)
1427 {
1428         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1429         wake_worker(cache);
1430         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1431 }
1432
1433 /*----------------------------------------------------------------*/
1434
1435 static int is_congested(struct dm_dev *dev, int bdi_bits)
1436 {
1437         struct request_queue *q = bdev_get_queue(dev->bdev);
1438         return bdi_congested(&q->backing_dev_info, bdi_bits);
1439 }
1440
1441 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1442 {
1443         struct cache *cache = container_of(cb, struct cache, callbacks);
1444
1445         return is_congested(cache->origin_dev, bdi_bits) ||
1446                 is_congested(cache->cache_dev, bdi_bits);
1447 }
1448
1449 /*----------------------------------------------------------------
1450  * Target methods
1451  *--------------------------------------------------------------*/
1452
1453 /*
1454  * This function gets called on the error paths of the constructor, so we
1455  * have to cope with a partially initialised struct.
1456  */
1457 static void destroy(struct cache *cache)
1458 {
1459         unsigned i;
1460
1461         if (cache->next_migration)
1462                 mempool_free(cache->next_migration, cache->migration_pool);
1463
1464         if (cache->migration_pool)
1465                 mempool_destroy(cache->migration_pool);
1466
1467         if (cache->all_io_ds)
1468                 dm_deferred_set_destroy(cache->all_io_ds);
1469
1470         if (cache->prison)
1471                 dm_bio_prison_destroy(cache->prison);
1472
1473         if (cache->wq)
1474                 destroy_workqueue(cache->wq);
1475
1476         if (cache->dirty_bitset)
1477                 free_bitset(cache->dirty_bitset);
1478
1479         if (cache->discard_bitset)
1480                 free_bitset(cache->discard_bitset);
1481
1482         if (cache->copier)
1483                 dm_kcopyd_client_destroy(cache->copier);
1484
1485         if (cache->cmd)
1486                 dm_cache_metadata_close(cache->cmd);
1487
1488         if (cache->metadata_dev)
1489                 dm_put_device(cache->ti, cache->metadata_dev);
1490
1491         if (cache->origin_dev)
1492                 dm_put_device(cache->ti, cache->origin_dev);
1493
1494         if (cache->cache_dev)
1495                 dm_put_device(cache->ti, cache->cache_dev);
1496
1497         if (cache->policy)
1498                 dm_cache_policy_destroy(cache->policy);
1499
1500         for (i = 0; i < cache->nr_ctr_args ; i++)
1501                 kfree(cache->ctr_args[i]);
1502         kfree(cache->ctr_args);
1503
1504         kfree(cache);
1505 }
1506
1507 static void cache_dtr(struct dm_target *ti)
1508 {
1509         struct cache *cache = ti->private;
1510
1511         destroy(cache);
1512 }
1513
1514 static sector_t get_dev_size(struct dm_dev *dev)
1515 {
1516         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1517 }
1518
1519 /*----------------------------------------------------------------*/
1520
1521 /*
1522  * Construct a cache device mapping.
1523  *
1524  * cache <metadata dev> <cache dev> <origin dev> <block size>
1525  *       <#feature args> [<feature arg>]*
1526  *       <policy> <#policy args> [<policy arg>]*
1527  *
1528  * metadata dev    : fast device holding the persistent metadata
1529  * cache dev       : fast device holding cached data blocks
1530  * origin dev      : slow device holding original data blocks
1531  * block size      : cache unit size in sectors
1532  *
1533  * #feature args   : number of feature arguments passed
1534  * feature args    : writethrough.  (The default is writeback.)
1535  *
1536  * policy          : the replacement policy to use
1537  * #policy args    : an even number of policy arguments corresponding
1538  *                   to key/value pairs passed to the policy
1539  * policy args     : key/value pairs passed to the policy
1540  *                   E.g. 'sequential_threshold 1024'
1541  *                   See cache-policies.txt for details.
1542  *
1543  * Optional feature arguments are:
1544  *   writethrough  : write through caching that prohibits cache block
1545  *                   content from being different from origin block content.
1546  *                   Without this argument, the default behaviour is to write
1547  *                   back cache block contents later for performance reasons,
1548  *                   so they may differ from the corresponding origin blocks.
1549  */
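/*
 * An example table line (the device names are purely illustrative):
 *
 *   0 41943040 cache /dev/mapper/metadata /dev/mapper/ssd \
 *	/dev/mapper/origin 512 1 writethrough default 0
 *
 * This caches a 41943040 sector origin on the ssd device using
 * 512 sector (256KB) cache blocks, keeps the metadata on its own
 * device, runs in writethrough mode and uses the default policy with
 * no policy arguments.
 */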
1550 struct cache_args {
1551         struct dm_target *ti;
1552
1553         struct dm_dev *metadata_dev;
1554
1555         struct dm_dev *cache_dev;
1556         sector_t cache_sectors;
1557
1558         struct dm_dev *origin_dev;
1559         sector_t origin_sectors;
1560
1561         uint32_t block_size;
1562
1563         const char *policy_name;
1564         int policy_argc;
1565         const char **policy_argv;
1566
1567         struct cache_features features;
1568 };
1569
1570 static void destroy_cache_args(struct cache_args *ca)
1571 {
1572         if (ca->metadata_dev)
1573                 dm_put_device(ca->ti, ca->metadata_dev);
1574
1575         if (ca->cache_dev)
1576                 dm_put_device(ca->ti, ca->cache_dev);
1577
1578         if (ca->origin_dev)
1579                 dm_put_device(ca->ti, ca->origin_dev);
1580
1581         kfree(ca);
1582 }
1583
1584 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1585 {
1586         if (!as->argc) {
1587                 *error = "Insufficient args";
1588                 return false;
1589         }
1590
1591         return true;
1592 }
1593
1594 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1595                               char **error)
1596 {
1597         int r;
1598         sector_t metadata_dev_size;
1599         char b[BDEVNAME_SIZE];
1600
1601         if (!at_least_one_arg(as, error))
1602                 return -EINVAL;
1603
1604         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1605                           &ca->metadata_dev);
1606         if (r) {
1607                 *error = "Error opening metadata device";
1608                 return r;
1609         }
1610
1611         metadata_dev_size = get_dev_size(ca->metadata_dev);
1612         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1613                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1614                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);
1615
1616         return 0;
1617 }
1618
1619 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1620                            char **error)
1621 {
1622         int r;
1623
1624         if (!at_least_one_arg(as, error))
1625                 return -EINVAL;
1626
1627         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1628                           &ca->cache_dev);
1629         if (r) {
1630                 *error = "Error opening cache device";
1631                 return r;
1632         }
1633         ca->cache_sectors = get_dev_size(ca->cache_dev);
1634
1635         return 0;
1636 }
1637
1638 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1639                             char **error)
1640 {
1641         int r;
1642
1643         if (!at_least_one_arg(as, error))
1644                 return -EINVAL;
1645
1646         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1647                           &ca->origin_dev);
1648         if (r) {
1649                 *error = "Error opening origin device";
1650                 return r;
1651         }
1652
1653         ca->origin_sectors = get_dev_size(ca->origin_dev);
1654         if (ca->ti->len > ca->origin_sectors) {
1655                 *error = "Device size larger than cached device";
1656                 return -EINVAL;
1657         }
1658
1659         return 0;
1660 }
1661
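/*
 * The block size is given in 512-byte sectors.  As a hypothetical example
 * of the checks below: 64 sectors (32KB), 192 sectors (96KB) and 512
 * sectors (256KB) are all accepted, since each is a multiple of
 * DATA_DEV_BLOCK_SIZE_MIN_SECTORS, whereas 96 sectors is rejected.  Note
 * that the block size need not be a power of two; the non-power-of-two
 * case is handled separately in cache_create() via sectors_per_block_shift.
 */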
1662 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1663                             char **error)
1664 {
1665         unsigned long tmp;
1666
1667         if (!at_least_one_arg(as, error))
1668                 return -EINVAL;
1669
1670         if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1671             tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1672             tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1673                 *error = "Invalid data block size";
1674                 return -EINVAL;
1675         }
1676
1677         if (tmp > ca->cache_sectors) {
1678                 *error = "Data block size is larger than the cache device";
1679                 return -EINVAL;
1680         }
1681
1682         ca->block_size = tmp;
1683
1684         return 0;
1685 }
1686
1687 static void init_features(struct cache_features *cf)
1688 {
1689         cf->mode = CM_WRITE;
1690         cf->write_through = false;
1691 }
1692
1693 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1694                           char **error)
1695 {
1696         static struct dm_arg _args[] = {
1697                 {0, 1, "Invalid number of cache feature arguments"},
1698         };
1699
1700         int r;
1701         unsigned argc;
1702         const char *arg;
1703         struct cache_features *cf = &ca->features;
1704
1705         init_features(cf);
1706
1707         r = dm_read_arg_group(_args, as, &argc, error);
1708         if (r)
1709                 return -EINVAL;
1710
1711         while (argc--) {
1712                 arg = dm_shift_arg(as);
1713
1714                 if (!strcasecmp(arg, "writeback"))
1715                         cf->write_through = false;
1716
1717                 else if (!strcasecmp(arg, "writethrough"))
1718                         cf->write_through = true;
1719
1720                 else {
1721                         *error = "Unrecognised cache feature requested";
1722                         return -EINVAL;
1723                 }
1724         }
1725
1726         return 0;
1727 }
1728
1729 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1730                         char **error)
1731 {
1732         static struct dm_arg _args[] = {
1733                 {0, 1024, "Invalid number of policy arguments"},
1734         };
1735
1736         int r;
1737
1738         if (!at_least_one_arg(as, error))
1739                 return -EINVAL;
1740
1741         ca->policy_name = dm_shift_arg(as);
1742
1743         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1744         if (r)
1745                 return -EINVAL;
1746
1747         ca->policy_argv = (const char **)as->argv;
1748         dm_consume_args(as, ca->policy_argc);
1749
1750         return 0;
1751 }
1752
1753 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1754                             char **error)
1755 {
1756         int r;
1757         struct dm_arg_set as;
1758
1759         as.argc = argc;
1760         as.argv = argv;
1761
1762         r = parse_metadata_dev(ca, &as, error);
1763         if (r)
1764                 return r;
1765
1766         r = parse_cache_dev(ca, &as, error);
1767         if (r)
1768                 return r;
1769
1770         r = parse_origin_dev(ca, &as, error);
1771         if (r)
1772                 return r;
1773
1774         r = parse_block_size(ca, &as, error);
1775         if (r)
1776                 return r;
1777
1778         r = parse_features(ca, &as, error);
1779         if (r)
1780                 return r;
1781
1782         r = parse_policy(ca, &as, error);
1783         if (r)
1784                 return r;
1785
1786         return 0;
1787 }
1788
1789 /*----------------------------------------------------------------*/
1790
1791 static struct kmem_cache *migration_cache;
1792
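/*
 * Hand the <key> <value> pairs from the table line to the policy.  For
 * example (values hypothetical), a policy_argv of
 * { "sequential_threshold", "1024" } with argc == 2 results in a single
 * policy_set_config_value() call; an odd argc is rejected below.
 */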
1793 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1794 {
1795         int r = 0;
1796
1797         if (argc & 1) {
1798                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1799                 return -EINVAL;
1800         }
1801
1802         while (argc) {
1803                 r = policy_set_config_value(p, argv[0], argv[1]);
1804                 if (r) {
1805                         DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1806                                argv[0], argv[1]);
1807                         return r;
1808                 }
1809
1810                 argc -= 2;
1811                 argv += 2;
1812         }
1813
1814         return r;
1815 }
1816
1817 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1818                                char **error)
1819 {
1820         int r;
1821
1822         cache->policy = dm_cache_policy_create(ca->policy_name,
1823                                                cache->cache_size,
1824                                                cache->origin_sectors,
1825                                                cache->sectors_per_block);
1826         if (!cache->policy) {
1827                 *error = "Error creating cache's policy";
1828                 return -ENOMEM;
1829         }
1830
1831         r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1832         if (r) {
1833                 *error = "Error setting cache policy's config values";
1834                 dm_cache_policy_destroy(cache->policy);
1835                 cache->policy = NULL;
1836         }
1837
1838         return r;
1839 }
1840
1841 /*
1842  * We want the discard block size to be a power of two, at least as
1843  * large as the cache block size, and to give no more than 2^14 discard
1844  * blocks across the origin.
1845  */
1846 #define MAX_DISCARD_BLOCKS (1 << 14)
1847
1848 static bool too_many_discard_blocks(sector_t discard_block_size,
1849                                     sector_t origin_size)
1850 {
1851         (void) sector_div(origin_size, discard_block_size);
1852
1853         return origin_size > MAX_DISCARD_BLOCKS;
1854 }
1855
1856 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1857                                              sector_t origin_size)
1858 {
1859         sector_t discard_block_size;
1860
1861         discard_block_size = roundup_pow_of_two(cache_block_size);
1862
1863         if (origin_size)
1864                 while (too_many_discard_blocks(discard_block_size, origin_size))
1865                         discard_block_size *= 2;
1866
1867         return discard_block_size;
1868 }
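/*
 * A worked example with hypothetical sizes: for a cache block size of 512
 * sectors (256KB) the discard block size starts at 512 sectors.  If the
 * origin holds 2^24 sectors (8GB), that gives 2^24 / 2^9 = 32768 discard
 * blocks, which exceeds MAX_DISCARD_BLOCKS (2^14 = 16384), so the size is
 * doubled once to 1024 sectors, giving exactly 16384 discard blocks.
 */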
1869
1870 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1871
1872 static int cache_create(struct cache_args *ca, struct cache **result)
1873 {
1874         int r = 0;
1875         char **error = &ca->ti->error;
1876         struct cache *cache;
1877         struct dm_target *ti = ca->ti;
1878         dm_block_t origin_blocks;
1879         struct dm_cache_metadata *cmd;
1880         bool may_format = ca->features.mode == CM_WRITE;
1881
1882         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1883         if (!cache)
1884                 return -ENOMEM;
1885
1886         cache->ti = ca->ti;
1887         ti->private = cache;
1888         ti->per_bio_data_size = sizeof(struct per_bio_data);
1889         ti->num_flush_bios = 2;
1890         ti->flush_supported = true;
1891
1892         ti->num_discard_bios = 1;
1893         ti->discards_supported = true;
1894         ti->discard_zeroes_data_unsupported = true;
1895
1896         memcpy(&cache->features, &ca->features, sizeof(cache->features));
1897
1898         cache->callbacks.congested_fn = cache_is_congested;
1899         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1900
1901         cache->metadata_dev = ca->metadata_dev;
1902         cache->origin_dev = ca->origin_dev;
1903         cache->cache_dev = ca->cache_dev;
1904
1905         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1906
1907         /* FIXME: factor out this whole section */
1908         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1909         origin_blocks = block_div(origin_blocks, ca->block_size);
1910         cache->origin_blocks = to_oblock(origin_blocks);
1911
1912         cache->sectors_per_block = ca->block_size;
1913         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1914                 r = -EINVAL;
1915                 goto bad;
1916         }
1917
1918         if (ca->block_size & (ca->block_size - 1)) {
1919                 dm_block_t cache_size = ca->cache_sectors;
1920
1921                 cache->sectors_per_block_shift = -1;
1922                 cache_size = block_div(cache_size, ca->block_size);
1923                 cache->cache_size = to_cblock(cache_size);
1924         } else {
1925                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1926                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1927         }
1928
1929         r = create_cache_policy(cache, ca, error);
1930         if (r)
1931                 goto bad;
1932         cache->policy_nr_args = ca->policy_argc;
1933
1934         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1935                                      ca->block_size, may_format,
1936                                      dm_cache_policy_get_hint_size(cache->policy));
1937         if (IS_ERR(cmd)) {
1938                 *error = "Error creating metadata object";
1939                 r = PTR_ERR(cmd);
1940                 goto bad;
1941         }
1942         cache->cmd = cmd;
1943
1944         spin_lock_init(&cache->lock);
1945         bio_list_init(&cache->deferred_bios);
1946         bio_list_init(&cache->deferred_flush_bios);
1947         bio_list_init(&cache->deferred_writethrough_bios);
1948         INIT_LIST_HEAD(&cache->quiesced_migrations);
1949         INIT_LIST_HEAD(&cache->completed_migrations);
1950         INIT_LIST_HEAD(&cache->need_commit_migrations);
1951         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1952         atomic_set(&cache->nr_migrations, 0);
1953         init_waitqueue_head(&cache->migration_wait);
1954         r = -ENOMEM;    /* default error for the bare 'goto bad' paths below */
1955         cache->nr_dirty = 0;
1956         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1957         if (!cache->dirty_bitset) {
1958                 *error = "could not allocate dirty bitset";
1959                 goto bad;
1960         }
1961         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1962
1963         cache->discard_block_size =
1964                 calculate_discard_block_size(cache->sectors_per_block,
1965                                              cache->origin_sectors);
1966         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1967         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1968         if (!cache->discard_bitset) {
1969                 *error = "could not allocate discard bitset";
1970                 goto bad;
1971         }
1972         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1973
1974         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1975         if (IS_ERR(cache->copier)) {
1976                 *error = "could not create kcopyd client";
1977                 r = PTR_ERR(cache->copier);
1978                 goto bad;
1979         }
1980
1981         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1982         if (!cache->wq) {
1983                 *error = "could not create workqueue for metadata object";
1984                 goto bad;
1985         }
1986         INIT_WORK(&cache->worker, do_worker);
1987         INIT_DELAYED_WORK(&cache->waker, do_waker);
1988         cache->last_commit_jiffies = jiffies;
1989
1990         cache->prison = dm_bio_prison_create(PRISON_CELLS);
1991         if (!cache->prison) {
1992                 *error = "could not create bio prison";
1993                 goto bad;
1994         }
1995
1996         cache->all_io_ds = dm_deferred_set_create();
1997         if (!cache->all_io_ds) {
1998                 *error = "could not create all_io deferred set";
1999                 goto bad;
2000         }
2001
2002         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2003                                                          migration_cache);
2004         if (!cache->migration_pool) {
2005                 *error = "Error creating cache's migration mempool";
2006                 goto bad;
2007         }
2008
2009         cache->next_migration = NULL;
2010
2011         cache->need_tick_bio = true;
2012         cache->sized = false;
2013         cache->quiescing = false;
2014         cache->commit_requested = false;
2015         cache->loaded_mappings = false;
2016         cache->loaded_discards = false;
2017
2018         load_stats(cache);
2019
2020         atomic_set(&cache->stats.demotion, 0);
2021         atomic_set(&cache->stats.promotion, 0);
2022         atomic_set(&cache->stats.copies_avoided, 0);
2023         atomic_set(&cache->stats.cache_cell_clash, 0);
2024         atomic_set(&cache->stats.commit_count, 0);
2025         atomic_set(&cache->stats.discard_count, 0);
2026
2027         *result = cache;
2028         return 0;
2029
2030 bad:
2031         destroy(cache);
2032         return r;
2033 }
2034
2035 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2036 {
2037         unsigned i;
2038         const char **copy;
2039
2040         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2041         if (!copy)
2042                 return -ENOMEM;
2043         for (i = 0; i < argc; i++) {
2044                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2045                 if (!copy[i]) {
2046                         while (i--)
2047                                 kfree(copy[i]);
2048                         kfree(copy);
2049                         return -ENOMEM;
2050                 }
2051         }
2052
2053         cache->nr_ctr_args = argc;
2054         cache->ctr_args = copy;
2055
2056         return 0;
2057 }
2058
2059 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2060 {
2061         int r = -EINVAL;
2062         struct cache_args *ca;
2063         struct cache *cache = NULL;
2064
2065         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2066         if (!ca) {
2067                 ti->error = "Error allocating memory for cache";
2068                 return -ENOMEM;
2069         }
2070         ca->ti = ti;
2071
2072         r = parse_cache_args(ca, argc, argv, &ti->error);
2073         if (r)
2074                 goto out;
2075
2076         r = cache_create(ca, &cache);
2077         if (r)
2078                 goto out;
2079
2080         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2081         if (r) {
2082                 destroy(cache);
2083                 goto out;
2084         }
2085
2086         ti->private = cache;
2087
2088 out:
2089         destroy_cache_args(ca);
2090         return r;
2091 }
2092
2093 static int cache_map(struct dm_target *ti, struct bio *bio)
2094 {
2095         struct cache *cache = ti->private;
2096
2097         int r;
2098         dm_oblock_t block = get_bio_block(cache, bio);
2099         bool can_migrate = false;
2100         bool discarded_block;
2101         struct dm_bio_prison_cell *cell;
2102         struct policy_result lookup_result;
2103         struct per_bio_data *pb;
2104
2105         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2106                 /*
2107                  * This can only occur if the io goes to a partial block at
2108                  * the end of the origin device.  We don't cache these.
2109                  * Just remap to the origin and carry on.
2110                  */
2111                 remap_to_origin_clear_discard(cache, bio, block);
2112                 return DM_MAPIO_REMAPPED;
2113         }
2114
2115         pb = init_per_bio_data(bio);
2116
2117         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2118                 defer_bio(cache, bio);
2119                 return DM_MAPIO_SUBMITTED;
2120         }
2121
2122         /*
2123          * Check to see if that block is currently migrating.
2124          */
2125         cell = alloc_prison_cell(cache);
2126         if (!cell) {
2127                 defer_bio(cache, bio);
2128                 return DM_MAPIO_SUBMITTED;
2129         }
2130
2131         r = bio_detain(cache, block, bio, cell,
2132                        (cell_free_fn) free_prison_cell,
2133                        cache, &cell);
2134         if (r) {
2135                 if (r < 0)
2136                         defer_bio(cache, bio);
2137
2138                 return DM_MAPIO_SUBMITTED;
2139         }
2140
2141         discarded_block = is_discarded_oblock(cache, block);
2142
2143         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2144                        bio, &lookup_result);
2145         if (r == -EWOULDBLOCK) {
2146                 cell_defer(cache, cell, true);
2147                 return DM_MAPIO_SUBMITTED;
2148
2149         } else if (r) {
2150                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2151                 bio_io_error(bio);
2152                 return DM_MAPIO_SUBMITTED;
2153         }
2154
2155         switch (lookup_result.op) {
2156         case POLICY_HIT:
2157                 inc_hit_counter(cache, bio);
2158                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2159
2160                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2161                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2162                 else
2163                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2164
2165                 cell_defer(cache, cell, false);
2166                 break;
2167
2168         case POLICY_MISS:
2169                 inc_miss_counter(cache, bio);
2170                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2171
2172                 if (pb->req_nr != 0) {
2173                         /*
2174                          * This is a duplicate writethrough io that is no
2175                          * longer needed because the block has been demoted.
2176                          */
2177                         bio_endio(bio, 0);
2178                         cell_defer(cache, cell, false);
2179                         return DM_MAPIO_SUBMITTED;
2180                 } else {
2181                         remap_to_origin_clear_discard(cache, bio, block);
2182                         cell_defer(cache, cell, false);
2183                 }
2184                 break;
2185
2186         default:
2187                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2188                             (unsigned) lookup_result.op);
2189                 bio_io_error(bio);
2190                 return DM_MAPIO_SUBMITTED;
2191         }
2192
2193         return DM_MAPIO_REMAPPED;
2194 }
2195
2196 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2197 {
2198         struct cache *cache = ti->private;
2199         unsigned long flags;
2200         struct per_bio_data *pb = get_per_bio_data(bio);
2201
2202         if (pb->tick) {
2203                 policy_tick(cache->policy);
2204
2205                 spin_lock_irqsave(&cache->lock, flags);
2206                 cache->need_tick_bio = true;
2207                 spin_unlock_irqrestore(&cache->lock, flags);
2208         }
2209
2210         check_for_quiesced_migrations(cache, pb);
2211
2212         return 0;
2213 }
2214
2215 static int write_dirty_bitset(struct cache *cache)
2216 {
2217         unsigned i;
2218         int r;
2219         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2220                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2221                                        is_dirty(cache, to_cblock(i)));
2222                 if (r)
2223                         return r;
2224         }
2225
2226         return 0;
2227 }
2228
2229 static int write_discard_bitset(struct cache *cache)
2230 {
2231         unsigned i;
2232         int r;
2233         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2234                                            cache->discard_nr_blocks);
2235         if (r) {
2236                 DMERR("could not resize on-disk discard bitset");
2237                 return r;
2238         }
2239
2240         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2241                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2242                                          is_discarded(cache, to_dblock(i)));
2243                 if (r)
2244                         return r;
2245         }
2246
2247         return 0;
2248 }
2249
2250 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2251                      uint32_t hint)
2252 {
2253         struct cache *cache = context;
2254         return dm_cache_save_hint(cache->cmd, cblock, hint);
2255 }
2256
2257 static int write_hints(struct cache *cache)
2258 {
2259         int r;
2260
2261         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2262         if (r) {
2263                 DMERR("dm_cache_begin_hints failed");
2264                 return r;
2265         }
2266
2267         r = policy_walk_mappings(cache->policy, save_hint, cache);
2268         if (r)
2269                 DMERR("policy_walk_mappings failed");
2270
2271         return r;
2272 }
2273
2274 /*
2275  * returns true on success
2276  */
2277 static bool sync_metadata(struct cache *cache)
2278 {
2279         int r1, r2, r3, r4;
2280
2281         r1 = write_dirty_bitset(cache);
2282         if (r1)
2283                 DMERR("could not write dirty bitset");
2284
2285         r2 = write_discard_bitset(cache);
2286         if (r2)
2287                 DMERR("could not write discard bitset");
2288
2289         save_stats(cache);
2290
2291         r3 = write_hints(cache);
2292         if (r3)
2293                 DMERR("could not write hints");
2294
2295         /*
2296          * If writing the above metadata failed, we still commit, but don't
2297          * set the clean shutdown flag.  This will effectively force every
2298          * dirty bit to be set on reload.
2299          */
2300         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2301         if (r4)
2302                 DMERR("could not write cache metadata.  Data loss may occur.");
2303
2304         return !r1 && !r2 && !r3 && !r4;
2305 }
2306
2307 static void cache_postsuspend(struct dm_target *ti)
2308 {
2309         struct cache *cache = ti->private;
2310
2311         start_quiescing(cache);
2312         wait_for_migrations(cache);
2313         stop_worker(cache);
2314         requeue_deferred_io(cache);
2315         stop_quiescing(cache);
2316
2317         (void) sync_metadata(cache);
2318 }
2319
2320 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2321                         bool dirty, uint32_t hint, bool hint_valid)
2322 {
2323         int r;
2324         struct cache *cache = context;
2325
2326         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2327         if (r)
2328                 return r;
2329
2330         if (dirty)
2331                 set_dirty(cache, oblock, cblock);
2332         else
2333                 clear_dirty(cache, oblock, cblock);
2334
2335         return 0;
2336 }
2337
2338 static int load_discard(void *context, sector_t discard_block_size,
2339                         dm_dblock_t dblock, bool discard)
2340 {
2341         struct cache *cache = context;
2342
2343         /* FIXME: handle mis-matched block size */
2344
2345         if (discard)
2346                 set_discard(cache, dblock);
2347         else
2348                 clear_discard(cache, dblock);
2349
2350         return 0;
2351 }
2352
2353 static int cache_preresume(struct dm_target *ti)
2354 {
2355         int r = 0;
2356         struct cache *cache = ti->private;
2357         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2358         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2359
2360         /*
2361          * Check to see if the cache has resized.
2362          */
2363         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2364                 cache->cache_size = to_cblock(actual_cache_size);
2365
2366                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2367                 if (r) {
2368                         DMERR("could not resize cache metadata");
2369                         return r;
2370                 }
2371
2372                 cache->sized = true;
2373         }
2374
2375         if (!cache->loaded_mappings) {
2376                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2377                                            load_mapping, cache);
2378                 if (r) {
2379                         DMERR("could not load cache mappings");
2380                         return r;
2381                 }
2382
2383                 cache->loaded_mappings = true;
2384         }
2385
2386         if (!cache->loaded_discards) {
2387                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2388                 if (r) {
2389                         DMERR("could not load origin discards");
2390                         return r;
2391                 }
2392
2393                 cache->loaded_discards = true;
2394         }
2395
2396         return r;
2397 }
2398
2399 static void cache_resume(struct dm_target *ti)
2400 {
2401         struct cache *cache = ti->private;
2402
2403         cache->need_tick_bio = true;
2404         do_waker(&cache->waker.work);
2405 }
2406
2407 /*
2408  * Status format:
2409  *
2410  * <#used metadata blocks>/<#total metadata blocks>
2411  * <#read hits> <#read misses> <#write hits> <#write misses>
2412  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2413  * <#features> <features>*
2414  * <#core args> <core args>
2415  * <#policy args> <policy args>*
2416  */
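/*
 * For illustration only, an INFO status line might look like (all figures
 * hypothetical):
 *
 *   89/4096 1203 455 871 254 16 21 10232 128 1 writethrough 2
 *   migration_threshold 204800 ...
 *
 * i.e. 89 of 4096 metadata blocks used, the read/write hit and miss
 * counters, 16 demotions, 21 promotions, 10232 blocks resident in the
 * cache, 128 of them dirty, one feature argument, the two core arguments
 * (migration_threshold and its value), and finally whatever config values
 * the policy chooses to emit.
 */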
2417 static void cache_status(struct dm_target *ti, status_type_t type,
2418                          unsigned status_flags, char *result, unsigned maxlen)
2419 {
2420         int r = 0;
2421         unsigned i;
2422         ssize_t sz = 0;
2423         dm_block_t nr_free_blocks_metadata = 0;
2424         dm_block_t nr_blocks_metadata = 0;
2425         char buf[BDEVNAME_SIZE];
2426         struct cache *cache = ti->private;
2427         dm_cblock_t residency;
2428
2429         switch (type) {
2430         case STATUSTYPE_INFO:
2431                 /* Commit to ensure statistics aren't out-of-date */
2432                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2433                         r = dm_cache_commit(cache->cmd, false);
2434                         if (r)
2435                                 DMERR("could not commit metadata for accurate status");
2436                 }
2437
2438                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2439                                                            &nr_free_blocks_metadata);
2440                 if (r) {
2441                         DMERR("could not get metadata free block count");
2442                         goto err;
2443                 }
2444
2445                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2446                 if (r) {
2447                         DMERR("could not get metadata device size");
2448                         goto err;
2449                 }
2450
2451                 residency = policy_residency(cache->policy);
2452
2453                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2454                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2455                        (unsigned long long)nr_blocks_metadata,
2456                        (unsigned) atomic_read(&cache->stats.read_hit),
2457                        (unsigned) atomic_read(&cache->stats.read_miss),
2458                        (unsigned) atomic_read(&cache->stats.write_hit),
2459                        (unsigned) atomic_read(&cache->stats.write_miss),
2460                        (unsigned) atomic_read(&cache->stats.demotion),
2461                        (unsigned) atomic_read(&cache->stats.promotion),
2462                        (unsigned long long) from_cblock(residency),
2463                        cache->nr_dirty);
2464
2465                 if (cache->features.write_through)
2466                         DMEMIT("1 writethrough ");
2467                 else
2468                         DMEMIT("0 ");
2469
2470                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2471                 if (sz < maxlen) {
2472                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2473                         if (r)
2474                                 DMERR("policy_emit_config_values returned %d", r);
2475                 }
2476
2477                 break;
2478
2479         case STATUSTYPE_TABLE:
2480                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2481                 DMEMIT("%s ", buf);
2482                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2483                 DMEMIT("%s ", buf);
2484                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2485                 DMEMIT("%s", buf);
2486
2487                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2488                         DMEMIT(" %s", cache->ctr_args[i]);
2489                 if (cache->nr_ctr_args)
2490                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2491         }
2492
2493         return;
2494
2495 err:
2496         DMEMIT("Error");
2497 }
2498
2499 #define NOT_CORE_OPTION 1
2500
2501 static int process_config_option(struct cache *cache, char **argv)
2502 {
2503         unsigned long tmp;
2504
2505         if (!strcasecmp(argv[0], "migration_threshold")) {
2506                 if (kstrtoul(argv[1], 10, &tmp))
2507                         return -EINVAL;
2508
2509                 cache->migration_threshold = tmp;
2510                 return 0;
2511         }
2512
2513         return NOT_CORE_OPTION;
2514 }
2515
2516 /*
2517  * Supports <key> <value>.
2518  *
2519  * The key migration_threshold is supported by the cache target core.
2520  */
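/*
 * For example (device name hypothetical):
 *
 *   dmsetup message my-cache 0 migration_threshold 204800
 *
 * Any key not recognised by process_config_option() above is passed on to
 * the policy via policy_set_config_value().
 */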
2521 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2522 {
2523         int r;
2524         struct cache *cache = ti->private;
2525
2526         if (argc != 2)
2527                 return -EINVAL;
2528
2529         r = process_config_option(cache, argv);
2530         if (r == NOT_CORE_OPTION)
2531                 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2532
2533         return r;
2534 }
2535
2536 static int cache_iterate_devices(struct dm_target *ti,
2537                                  iterate_devices_callout_fn fn, void *data)
2538 {
2539         int r = 0;
2540         struct cache *cache = ti->private;
2541
2542         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2543         if (!r)
2544                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2545
2546         return r;
2547 }
2548
2549 /*
2550  * We assume I/O is going to the origin, which is the device more likely
2551  * to have restrictions (e.g. by being striped).  Looking up the exact
2552  * location of the data would be expensive and could well be out of date
2553  * by the time the bio is submitted anyway.
2554  */
2555 static int cache_bvec_merge(struct dm_target *ti,
2556                             struct bvec_merge_data *bvm,
2557                             struct bio_vec *biovec, int max_size)
2558 {
2559         struct cache *cache = ti->private;
2560         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2561
2562         if (!q->merge_bvec_fn)
2563                 return max_size;
2564
2565         bvm->bi_bdev = cache->origin_dev->bdev;
2566         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2567 }
2568
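/*
 * As a hypothetical example of the limits set below: with a discard block
 * size of 1024 sectors, discard_granularity becomes 1024 << 9 = 512KB and
 * max_discard_sectors becomes 1024 * 1024 sectors (512MB).
 */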
2569 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2570 {
2571         /*
2572          * FIXME: these limits may be incompatible with the cache device
2573          */
2574         limits->max_discard_sectors = cache->discard_block_size * 1024;
2575         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2576 }
2577
2578 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2579 {
2580         struct cache *cache = ti->private;
2581
2582         blk_limits_io_min(limits, 0);
2583         blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2584         set_discard_limits(cache, limits);
2585 }
2586
2587 /*----------------------------------------------------------------*/
2588
2589 static struct target_type cache_target = {
2590         .name = "cache",
2591         .version = {1, 1, 0},
2592         .module = THIS_MODULE,
2593         .ctr = cache_ctr,
2594         .dtr = cache_dtr,
2595         .map = cache_map,
2596         .end_io = cache_end_io,
2597         .postsuspend = cache_postsuspend,
2598         .preresume = cache_preresume,
2599         .resume = cache_resume,
2600         .status = cache_status,
2601         .message = cache_message,
2602         .iterate_devices = cache_iterate_devices,
2603         .merge = cache_bvec_merge,
2604         .io_hints = cache_io_hints,
2605 };
2606
2607 static int __init dm_cache_init(void)
2608 {
2609         int r;
2610
2611         r = dm_register_target(&cache_target);
2612         if (r) {
2613                 DMERR("cache target registration failed: %d", r);
2614                 return r;
2615         }
2616
2617         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2618         if (!migration_cache) {
2619                 dm_unregister_target(&cache_target);
2620                 return -ENOMEM;
2621         }
2622
2623         return 0;
2624 }
2625
2626 static void __exit dm_cache_exit(void)
2627 {
2628         dm_unregister_target(&cache_target);
2629         kmem_cache_destroy(migration_cache);
2630 }
2631
2632 module_init(dm_cache_init);
2633 module_exit(dm_cache_exit);
2634
2635 MODULE_DESCRIPTION(DM_NAME " cache target");
2636 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2637 MODULE_LICENSE("GPL");