dm cache: fix writes to cache device in writethrough mode
[cascardo/linux.git] / drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19
20 #define DM_MSG_PREFIX "cache"
21
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23         "A percentage of time allocated for copying to and/or from cache");
24
25 /*----------------------------------------------------------------*/
26
27 /*
28  * Glossary:
29  *
30  * oblock: index of an origin block
31  * cblock: index of a cache block
32  * promotion: movement of a block from origin to cache
33  * demotion: movement of a block from cache to origin
34  * migration: movement of a block between the origin and cache device,
35  *            either direction
36  */
37
38 /*----------------------------------------------------------------*/
39
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
41 {
42         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43 }
44
45 static unsigned long *alloc_bitset(unsigned nr_entries)
46 {
47         size_t s = bitset_size_in_bytes(nr_entries);
48         return vzalloc(s);
49 }
50
51 static void clear_bitset(void *bitset, unsigned nr_entries)
52 {
53         size_t s = bitset_size_in_bytes(nr_entries);
54         memset(bitset, 0, s);
55 }
56
57 static void free_bitset(unsigned long *bits)
58 {
59         vfree(bits);
60 }
61
62 /*----------------------------------------------------------------*/
63
64 #define PRISON_CELLS 1024
65 #define MIGRATION_POOL_SIZE 128
66 #define COMMIT_PERIOD HZ
67 #define MIGRATION_COUNT_WINDOW 10
68
69 /*
70  * The block size of the device holding cache data must be >= 32KB
71  */
72 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
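/* With 512-byte sectors this evaluates to 64 sectors. */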
73
74 /*
75  * FIXME: the cache is read/write for the time being.
76  */
77 enum cache_mode {
78         CM_WRITE,               /* metadata may be changed */
79         CM_READ_ONLY,           /* metadata may not be changed */
80 };
81
82 struct cache_features {
83         enum cache_mode mode;
84         bool write_through:1;
85 };
86
87 struct cache_stats {
88         atomic_t read_hit;
89         atomic_t read_miss;
90         atomic_t write_hit;
91         atomic_t write_miss;
92         atomic_t demotion;
93         atomic_t promotion;
94         atomic_t copies_avoided;
95         atomic_t cache_cell_clash;
96         atomic_t commit_count;
97         atomic_t discard_count;
98 };
99
100 struct cache {
101         struct dm_target *ti;
102         struct dm_target_callbacks callbacks;
103
104         /*
105          * Metadata is written to this device.
106          */
107         struct dm_dev *metadata_dev;
108
109         /*
110          * The slower of the two data devices.  Typically a spindle.
111          */
112         struct dm_dev *origin_dev;
113
114         /*
115          * The faster of the two data devices.  Typically an SSD.
116          */
117         struct dm_dev *cache_dev;
118
119         /*
120          * Cache features such as write-through.
121          */
122         struct cache_features features;
123
124         /*
125          * Size of the origin device in _complete_ blocks and native sectors.
126          */
127         dm_oblock_t origin_blocks;
128         sector_t origin_sectors;
129
130         /*
131          * Size of the cache device in blocks.
132          */
133         dm_cblock_t cache_size;
134
135         /*
136          * Fields for converting from sectors to blocks.
137          */
138         uint32_t sectors_per_block;
139         int sectors_per_block_shift;
140
141         struct dm_cache_metadata *cmd;
142
143         spinlock_t lock;
144         struct bio_list deferred_bios;
145         struct bio_list deferred_flush_bios;
146         struct bio_list deferred_writethrough_bios;
147         struct list_head quiesced_migrations;
148         struct list_head completed_migrations;
149         struct list_head need_commit_migrations;
150         sector_t migration_threshold;
151         atomic_t nr_migrations;
152         wait_queue_head_t migration_wait;
153
154         /*
155          * cache_size entries, dirty if set
156          */
157         dm_cblock_t nr_dirty;
158         unsigned long *dirty_bitset;
159
160         /*
161          * origin_blocks entries, discarded if set.
162          */
163         uint32_t discard_block_size; /* a power of 2 times sectors per block */
164         dm_dblock_t discard_nr_blocks;
165         unsigned long *discard_bitset;
166
167         struct dm_kcopyd_client *copier;
168         struct workqueue_struct *wq;
169         struct work_struct worker;
170
171         struct delayed_work waker;
172         unsigned long last_commit_jiffies;
173
174         struct dm_bio_prison *prison;
175         struct dm_deferred_set *all_io_ds;
176
177         mempool_t *migration_pool;
178         struct dm_cache_migration *next_migration;
179
180         struct dm_cache_policy *policy;
181         unsigned policy_nr_args;
182
183         bool need_tick_bio:1;
184         bool sized:1;
185         bool quiescing:1;
186         bool commit_requested:1;
187         bool loaded_mappings:1;
188         bool loaded_discards:1;
189
190         struct cache_stats stats;
191
192         /*
193          * Rather than reconstructing the table line for the status we just
194          * save it and regurgitate.
195          */
196         unsigned nr_ctr_args;
197         const char **ctr_args;
198 };
199
200 struct per_bio_data {
201         bool tick:1;
202         unsigned req_nr:2;
203         struct dm_deferred_entry *all_io_entry;
204
205         /* writethrough fields */
206         struct cache *cache;
207         dm_cblock_t cblock;
208         bio_end_io_t *saved_bi_end_io;
209         struct dm_bio_details bio_details;
210 };
211
212 struct dm_cache_migration {
213         struct list_head list;
214         struct cache *cache;
215
216         unsigned long start_jiffies;
217         dm_oblock_t old_oblock;
218         dm_oblock_t new_oblock;
219         dm_cblock_t cblock;
220
221         bool err:1;
222         bool writeback:1;
223         bool demote:1;
224         bool promote:1;
225
226         struct dm_bio_prison_cell *old_ocell;
227         struct dm_bio_prison_cell *new_ocell;
228 };
229
230 /*
231  * Processing a bio in the worker thread may require these memory
232  * allocations.  We prealloc to avoid deadlocks (the same worker thread
233  * frees them back to the mempool).
234  */
235 struct prealloc {
236         struct dm_cache_migration *mg;
237         struct dm_bio_prison_cell *cell1;
238         struct dm_bio_prison_cell *cell2;
239 };
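/*
 * A rough sketch of the typical prealloc life cycle, as used by
 * process_deferred_bios() below:
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	while (there are deferred bios) {
 *		if (prealloc_data_structs(cache, &structs))
 *			break;		-- out of memory, try again later
 *		process_bio(cache, &structs, bio);
 *	}
 *	prealloc_free_structs(cache, &structs);
 */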
240
241 static void wake_worker(struct cache *cache)
242 {
243         queue_work(cache->wq, &cache->worker);
244 }
245
246 /*----------------------------------------------------------------*/
247
248 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
249 {
250         /* FIXME: change to use a local slab. */
251         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
252 }
253
254 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
255 {
256         dm_bio_prison_free_cell(cache->prison, cell);
257 }
258
259 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
260 {
261         if (!p->mg) {
262                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
263                 if (!p->mg)
264                         return -ENOMEM;
265         }
266
267         if (!p->cell1) {
268                 p->cell1 = alloc_prison_cell(cache);
269                 if (!p->cell1)
270                         return -ENOMEM;
271         }
272
273         if (!p->cell2) {
274                 p->cell2 = alloc_prison_cell(cache);
275                 if (!p->cell2)
276                         return -ENOMEM;
277         }
278
279         return 0;
280 }
281
282 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
283 {
284         if (p->cell2)
285                 free_prison_cell(cache, p->cell2);
286
287         if (p->cell1)
288                 free_prison_cell(cache, p->cell1);
289
290         if (p->mg)
291                 mempool_free(p->mg, cache->migration_pool);
292 }
293
294 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
295 {
296         struct dm_cache_migration *mg = p->mg;
297
298         BUG_ON(!mg);
299         p->mg = NULL;
300
301         return mg;
302 }
303
304 /*
305  * You must have a cell within the prealloc struct to return.  If not, this
306  * function will BUG() rather than returning NULL.
307  */
308 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
309 {
310         struct dm_bio_prison_cell *r = NULL;
311
312         if (p->cell1) {
313                 r = p->cell1;
314                 p->cell1 = NULL;
315
316         } else if (p->cell2) {
317                 r = p->cell2;
318                 p->cell2 = NULL;
319         } else
320                 BUG();
321
322         return r;
323 }
324
325 /*
326  * You can't have more than two cells in a prealloc struct.  BUG() will be
327  * called if you try to overfill.
328  */
329 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
330 {
331         if (!p->cell2)
332                 p->cell2 = cell;
333
334         else if (!p->cell1)
335                 p->cell1 = cell;
336
337         else
338                 BUG();
339 }
340
341 /*----------------------------------------------------------------*/
342
343 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
344 {
345         key->virtual = 0;
346         key->dev = 0;
347         key->block = from_oblock(oblock);
348 }
349
350 /*
351  * The caller hands in a preallocated cell, and a free function for it.
352  * The cell will be freed if there's an error, or if it wasn't used because
353  * a cell with that key already exists.
354  */
355 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
356
357 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
358                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
359                       cell_free_fn free_fn, void *free_context,
360                       struct dm_bio_prison_cell **cell_result)
361 {
362         int r;
363         struct dm_cell_key key;
364
365         build_key(oblock, &key);
366         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
367         if (r)
368                 free_fn(free_context, cell_prealloc);
369
370         return r;
371 }
372
373 static int get_cell(struct cache *cache,
374                     dm_oblock_t oblock,
375                     struct prealloc *structs,
376                     struct dm_bio_prison_cell **cell_result)
377 {
378         int r;
379         struct dm_cell_key key;
380         struct dm_bio_prison_cell *cell_prealloc;
381
382         cell_prealloc = prealloc_get_cell(structs);
383
384         build_key(oblock, &key);
385         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
386         if (r)
387                 prealloc_put_cell(structs, cell_prealloc);
388
389         return r;
390 }
391
392 /*----------------------------------------------------------------*/
393
394 static bool is_dirty(struct cache *cache, dm_cblock_t b)
395 {
396         return test_bit(from_cblock(b), cache->dirty_bitset);
397 }
398
399 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400 {
401         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
402                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
403                 policy_set_dirty(cache->policy, oblock);
404         }
405 }
406
407 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
408 {
409         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
410                 policy_clear_dirty(cache->policy, oblock);
411                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
412                 if (!from_cblock(cache->nr_dirty))
413                         dm_table_event(cache->ti->table);
414         }
415 }
416
417 /*----------------------------------------------------------------*/
418 static bool block_size_is_power_of_two(struct cache *cache)
419 {
420         return cache->sectors_per_block_shift >= 0;
421 }
422
423 static dm_block_t block_div(dm_block_t b, uint32_t n)
424 {
425         do_div(b, n);
426
427         return b;
428 }
429
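/*
 * discard_block_size is held in sectors and is a power of two multiple
 * of the cache block size, so dividing it by sectors_per_block gives
 * the number of origin blocks covered by one discard block.
 */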
430 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
431 {
432         uint32_t discard_blocks = cache->discard_block_size;
433         dm_block_t b = from_oblock(oblock);
434
435         if (!block_size_is_power_of_two(cache))
436                 discard_blocks = discard_blocks / cache->sectors_per_block;
437         else
438                 discard_blocks >>= cache->sectors_per_block_shift;
439
440         b = block_div(b, discard_blocks);
441
442         return to_dblock(b);
443 }
444
445 static void set_discard(struct cache *cache, dm_dblock_t b)
446 {
447         unsigned long flags;
448
449         atomic_inc(&cache->stats.discard_count);
450
451         spin_lock_irqsave(&cache->lock, flags);
452         set_bit(from_dblock(b), cache->discard_bitset);
453         spin_unlock_irqrestore(&cache->lock, flags);
454 }
455
456 static void clear_discard(struct cache *cache, dm_dblock_t b)
457 {
458         unsigned long flags;
459
460         spin_lock_irqsave(&cache->lock, flags);
461         clear_bit(from_dblock(b), cache->discard_bitset);
462         spin_unlock_irqrestore(&cache->lock, flags);
463 }
464
465 static bool is_discarded(struct cache *cache, dm_dblock_t b)
466 {
467         int r;
468         unsigned long flags;
469
470         spin_lock_irqsave(&cache->lock, flags);
471         r = test_bit(from_dblock(b), cache->discard_bitset);
472         spin_unlock_irqrestore(&cache->lock, flags);
473
474         return r;
475 }
476
477 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
478 {
479         int r;
480         unsigned long flags;
481
482         spin_lock_irqsave(&cache->lock, flags);
483         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
484                      cache->discard_bitset);
485         spin_unlock_irqrestore(&cache->lock, flags);
486
487         return r;
488 }
489
490 /*----------------------------------------------------------------*/
491
492 static void load_stats(struct cache *cache)
493 {
494         struct dm_cache_statistics stats;
495
496         dm_cache_metadata_get_stats(cache->cmd, &stats);
497         atomic_set(&cache->stats.read_hit, stats.read_hits);
498         atomic_set(&cache->stats.read_miss, stats.read_misses);
499         atomic_set(&cache->stats.write_hit, stats.write_hits);
500         atomic_set(&cache->stats.write_miss, stats.write_misses);
501 }
502
503 static void save_stats(struct cache *cache)
504 {
505         struct dm_cache_statistics stats;
506
507         stats.read_hits = atomic_read(&cache->stats.read_hit);
508         stats.read_misses = atomic_read(&cache->stats.read_miss);
509         stats.write_hits = atomic_read(&cache->stats.write_hit);
510         stats.write_misses = atomic_read(&cache->stats.write_miss);
511
512         dm_cache_metadata_set_stats(cache->cmd, &stats);
513 }
514
515 /*----------------------------------------------------------------
516  * Per bio data
517  *--------------------------------------------------------------*/
518 static struct per_bio_data *get_per_bio_data(struct bio *bio)
519 {
520         struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
521         BUG_ON(!pb);
522         return pb;
523 }
524
525 static struct per_bio_data *init_per_bio_data(struct bio *bio)
526 {
527         struct per_bio_data *pb = get_per_bio_data(bio);
528
529         pb->tick = false;
530         pb->req_nr = dm_bio_get_target_bio_nr(bio);
531         pb->all_io_entry = NULL;
532
533         return pb;
534 }
535
536 /*----------------------------------------------------------------
537  * Remapping
538  *--------------------------------------------------------------*/
539 static void remap_to_origin(struct cache *cache, struct bio *bio)
540 {
541         bio->bi_bdev = cache->origin_dev->bdev;
542 }
543
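/*
 * The offset within the block is preserved, only the base changes.
 * e.g. with 512 sector cache blocks, origin sector 1027 (offset 3 into
 * its block) remaps to cache sector from_cblock(cblock) * 512 + 3.
 */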
544 static void remap_to_cache(struct cache *cache, struct bio *bio,
545                            dm_cblock_t cblock)
546 {
547         sector_t bi_sector = bio->bi_sector;
548
549         bio->bi_bdev = cache->cache_dev->bdev;
550         if (!block_size_is_power_of_two(cache))
551                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
552                                 sector_div(bi_sector, cache->sectors_per_block);
553         else
554                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
555                                 (bi_sector & (cache->sectors_per_block - 1));
556 }
557
558 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
559 {
560         unsigned long flags;
561         struct per_bio_data *pb = get_per_bio_data(bio);
562
563         spin_lock_irqsave(&cache->lock, flags);
564         if (cache->need_tick_bio &&
565             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
566                 pb->tick = true;
567                 cache->need_tick_bio = false;
568         }
569         spin_unlock_irqrestore(&cache->lock, flags);
570 }
571
572 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
573                                   dm_oblock_t oblock)
574 {
575         check_if_tick_bio_needed(cache, bio);
576         remap_to_origin(cache, bio);
577         if (bio_data_dir(bio) == WRITE)
578                 clear_discard(cache, oblock_to_dblock(cache, oblock));
579 }
580
581 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
582                                  dm_oblock_t oblock, dm_cblock_t cblock)
583 {
584         remap_to_cache(cache, bio, cblock);
585         if (bio_data_dir(bio) == WRITE) {
586                 set_dirty(cache, oblock, cblock);
587                 clear_discard(cache, oblock_to_dblock(cache, oblock));
588         }
589 }
590
591 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
592 {
593         sector_t block_nr = bio->bi_sector;
594
595         if (!block_size_is_power_of_two(cache))
596                 (void) sector_div(block_nr, cache->sectors_per_block);
597         else
598                 block_nr >>= cache->sectors_per_block_shift;
599
600         return to_oblock(block_nr);
601 }
602
603 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
604 {
605         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
606 }
607
608 static void issue(struct cache *cache, struct bio *bio)
609 {
610         unsigned long flags;
611
612         if (!bio_triggers_commit(cache, bio)) {
613                 generic_make_request(bio);
614                 return;
615         }
616
617         /*
618          * Batch together any bios that trigger commits and then issue a
619          * single commit for them in do_worker().
620          */
621         spin_lock_irqsave(&cache->lock, flags);
622         cache->commit_requested = true;
623         bio_list_add(&cache->deferred_flush_bios, bio);
624         spin_unlock_irqrestore(&cache->lock, flags);
625 }
626
627 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
628 {
629         unsigned long flags;
630
631         spin_lock_irqsave(&cache->lock, flags);
632         bio_list_add(&cache->deferred_writethrough_bios, bio);
633         spin_unlock_irqrestore(&cache->lock, flags);
634
635         wake_worker(cache);
636 }
637
638 static void writethrough_endio(struct bio *bio, int err)
639 {
640         struct per_bio_data *pb = get_per_bio_data(bio);
641         bio->bi_end_io = pb->saved_bi_end_io;
642
643         if (err) {
644                 bio_endio(bio, err);
645                 return;
646         }
647
648         dm_bio_restore(&pb->bio_details, bio);
649         remap_to_cache(pb->cache, bio, pb->cblock);
650
651         /*
652          * We can't issue this bio directly, since we're in interrupt
653          * context.  So it gets put on a bio list for processing by the
654          * worker thread.
655          */
656         defer_writethrough_bio(pb->cache, bio);
657 }
658
659 /*
660  * When running in writethrough mode we need to send writes to clean blocks
661  * to both the cache and origin devices.  In future we'd like to clone the
662  * bio and send them in parallel, but for now we're doing them in
663  * series as this is easier.
664  */
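/*
 * The round trip is: remap_to_origin_then_cache() saves the bio's
 * end_io and details, points bi_end_io at writethrough_endio() and
 * remaps the bio to the origin (the caller then issues it).  When the
 * origin write completes, writethrough_endio() - running in interrupt
 * context - restores the bio, remaps it to the cache device and defers
 * it to the worker thread, which reissues it via generic_make_request().
 */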
665 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
666                                        dm_oblock_t oblock, dm_cblock_t cblock)
667 {
668         struct per_bio_data *pb = get_per_bio_data(bio);
669
670         pb->cache = cache;
671         pb->cblock = cblock;
672         pb->saved_bi_end_io = bio->bi_end_io;
673         dm_bio_record(&pb->bio_details, bio);
674         bio->bi_end_io = writethrough_endio;
675
676         remap_to_origin_clear_discard(pb->cache, bio, oblock);
677 }
678
679 /*----------------------------------------------------------------
680  * Migration processing
681  *
682  * Migration covers moving data from the origin device to the cache, or
683  * vice versa.
684  *--------------------------------------------------------------*/
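/*
 * A migration moves through three lists, all drained by do_worker():
 *
 *   quiesced_migrations    -> issue_copy() kicks off the kcopyd copy
 *   completed_migrations   -> complete_migration() updates the on-disk
 *                             metadata (or unwinds on error)
 *   need_commit_migrations -> migration_success_post_commit() releases
 *                             the cells once the commit has hit disk
 */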
685 static void free_migration(struct dm_cache_migration *mg)
686 {
687         mempool_free(mg, mg->cache->migration_pool);
688 }
689
690 static void inc_nr_migrations(struct cache *cache)
691 {
692         atomic_inc(&cache->nr_migrations);
693 }
694
695 static void dec_nr_migrations(struct cache *cache)
696 {
697         atomic_dec(&cache->nr_migrations);
698
699         /*
700          * Wake the worker in case we're suspending the target.
701          */
702         wake_up(&cache->migration_wait);
703 }
704
705 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
706                          bool holder)
707 {
708         (holder ? dm_cell_release : dm_cell_release_no_holder)
709                 (cache->prison, cell, &cache->deferred_bios);
710         free_prison_cell(cache, cell);
711 }
712
713 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
714                        bool holder)
715 {
716         unsigned long flags;
717
718         spin_lock_irqsave(&cache->lock, flags);
719         __cell_defer(cache, cell, holder);
720         spin_unlock_irqrestore(&cache->lock, flags);
721
722         wake_worker(cache);
723 }
724
725 static void cleanup_migration(struct dm_cache_migration *mg)
726 {
727         dec_nr_migrations(mg->cache);
728         free_migration(mg);
729 }
730
731 static void migration_failure(struct dm_cache_migration *mg)
732 {
733         struct cache *cache = mg->cache;
734
735         if (mg->writeback) {
736                 DMWARN_LIMIT("writeback failed; couldn't copy block");
737                 set_dirty(cache, mg->old_oblock, mg->cblock);
738                 cell_defer(cache, mg->old_ocell, false);
739
740         } else if (mg->demote) {
741                 DMWARN_LIMIT("demotion failed; couldn't copy block");
742                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
743
744                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
745                 if (mg->promote)
746                         cell_defer(cache, mg->new_ocell, 1);
747         } else {
748                 DMWARN_LIMIT("promotion failed; couldn't copy block");
749                 policy_remove_mapping(cache->policy, mg->new_oblock);
750                 cell_defer(cache, mg->new_ocell, 1);
751         }
752
753         cleanup_migration(mg);
754 }
755
756 static void migration_success_pre_commit(struct dm_cache_migration *mg)
757 {
758         unsigned long flags;
759         struct cache *cache = mg->cache;
760
761         if (mg->writeback) {
762                 cell_defer(cache, mg->old_ocell, false);
763                 clear_dirty(cache, mg->old_oblock, mg->cblock);
764                 cleanup_migration(mg);
765                 return;
766
767         } else if (mg->demote) {
768                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
769                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
770                         policy_force_mapping(cache->policy, mg->new_oblock,
771                                              mg->old_oblock);
772                         if (mg->promote)
773                                 cell_defer(cache, mg->new_ocell, true);
774                         cleanup_migration(mg);
775                         return;
776                 }
777         } else {
778                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
779                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
780                         policy_remove_mapping(cache->policy, mg->new_oblock);
781                         cleanup_migration(mg);
782                         return;
783                 }
784         }
785
786         spin_lock_irqsave(&cache->lock, flags);
787         list_add_tail(&mg->list, &cache->need_commit_migrations);
788         cache->commit_requested = true;
789         spin_unlock_irqrestore(&cache->lock, flags);
790 }
791
792 static void migration_success_post_commit(struct dm_cache_migration *mg)
793 {
794         unsigned long flags;
795         struct cache *cache = mg->cache;
796
797         if (mg->writeback) {
798                 DMWARN("writeback unexpectedly triggered commit");
799                 return;
800
801         } else if (mg->demote) {
802                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
803
804                 if (mg->promote) {
805                         mg->demote = false;
806
807                         spin_lock_irqsave(&cache->lock, flags);
808                         list_add_tail(&mg->list, &cache->quiesced_migrations);
809                         spin_unlock_irqrestore(&cache->lock, flags);
810
811                 } else
812                         cleanup_migration(mg);
813
814         } else {
815                 cell_defer(cache, mg->new_ocell, true);
816                 clear_dirty(cache, mg->new_oblock, mg->cblock);
817                 cleanup_migration(mg);
818         }
819 }
820
821 static void copy_complete(int read_err, unsigned long write_err, void *context)
822 {
823         unsigned long flags;
824         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
825         struct cache *cache = mg->cache;
826
827         if (read_err || write_err)
828                 mg->err = true;
829
830         spin_lock_irqsave(&cache->lock, flags);
831         list_add_tail(&mg->list, &cache->completed_migrations);
832         spin_unlock_irqrestore(&cache->lock, flags);
833
834         wake_worker(cache);
835 }
836
837 static void issue_copy_real(struct dm_cache_migration *mg)
838 {
839         int r;
840         struct dm_io_region o_region, c_region;
841         struct cache *cache = mg->cache;
842
843         o_region.bdev = cache->origin_dev->bdev;
844         o_region.count = cache->sectors_per_block;
845
846         c_region.bdev = cache->cache_dev->bdev;
847         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
848         c_region.count = cache->sectors_per_block;
849
850         if (mg->writeback || mg->demote) {
851                 /* demote */
852                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
853                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
854         } else {
855                 /* promote */
856                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
857                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
858         }
859
860         if (r < 0)
861                 migration_failure(mg);
862 }
863
864 static void avoid_copy(struct dm_cache_migration *mg)
865 {
866         atomic_inc(&mg->cache->stats.copies_avoided);
867         migration_success_pre_commit(mg);
868 }
869
870 static void issue_copy(struct dm_cache_migration *mg)
871 {
872         bool avoid;
873         struct cache *cache = mg->cache;
874
875         if (mg->writeback || mg->demote)
876                 avoid = !is_dirty(cache, mg->cblock) ||
877                         is_discarded_oblock(cache, mg->old_oblock);
878         else
879                 avoid = is_discarded_oblock(cache, mg->new_oblock);
880
881         avoid ? avoid_copy(mg) : issue_copy_real(mg);
882 }
883
884 static void complete_migration(struct dm_cache_migration *mg)
885 {
886         if (mg->err)
887                 migration_failure(mg);
888         else
889                 migration_success_pre_commit(mg);
890 }
891
892 static void process_migrations(struct cache *cache, struct list_head *head,
893                                void (*fn)(struct dm_cache_migration *))
894 {
895         unsigned long flags;
896         struct list_head list;
897         struct dm_cache_migration *mg, *tmp;
898
899         INIT_LIST_HEAD(&list);
900         spin_lock_irqsave(&cache->lock, flags);
901         list_splice_init(head, &list);
902         spin_unlock_irqrestore(&cache->lock, flags);
903
904         list_for_each_entry_safe(mg, tmp, &list, list)
905                 fn(mg);
906 }
907
908 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
909 {
910         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
911 }
912
913 static void queue_quiesced_migration(struct dm_cache_migration *mg)
914 {
915         unsigned long flags;
916         struct cache *cache = mg->cache;
917
918         spin_lock_irqsave(&cache->lock, flags);
919         __queue_quiesced_migration(mg);
920         spin_unlock_irqrestore(&cache->lock, flags);
921
922         wake_worker(cache);
923 }
924
925 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
926 {
927         unsigned long flags;
928         struct dm_cache_migration *mg, *tmp;
929
930         spin_lock_irqsave(&cache->lock, flags);
931         list_for_each_entry_safe(mg, tmp, work, list)
932                 __queue_quiesced_migration(mg);
933         spin_unlock_irqrestore(&cache->lock, flags);
934
935         wake_worker(cache);
936 }
937
938 static void check_for_quiesced_migrations(struct cache *cache,
939                                           struct per_bio_data *pb)
940 {
941         struct list_head work;
942
943         if (!pb->all_io_entry)
944                 return;
945
946         INIT_LIST_HEAD(&work);
947         if (pb->all_io_entry)
948                 dm_deferred_entry_dec(pb->all_io_entry, &work);
949
950         if (!list_empty(&work))
951                 queue_quiesced_migrations(cache, &work);
952 }
953
954 static void quiesce_migration(struct dm_cache_migration *mg)
955 {
956         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
957                 queue_quiesced_migration(mg);
958 }
959
960 static void promote(struct cache *cache, struct prealloc *structs,
961                     dm_oblock_t oblock, dm_cblock_t cblock,
962                     struct dm_bio_prison_cell *cell)
963 {
964         struct dm_cache_migration *mg = prealloc_get_migration(structs);
965
966         mg->err = false;
967         mg->writeback = false;
968         mg->demote = false;
969         mg->promote = true;
970         mg->cache = cache;
971         mg->new_oblock = oblock;
972         mg->cblock = cblock;
973         mg->old_ocell = NULL;
974         mg->new_ocell = cell;
975         mg->start_jiffies = jiffies;
976
977         inc_nr_migrations(cache);
978         quiesce_migration(mg);
979 }
980
981 static void writeback(struct cache *cache, struct prealloc *structs,
982                       dm_oblock_t oblock, dm_cblock_t cblock,
983                       struct dm_bio_prison_cell *cell)
984 {
985         struct dm_cache_migration *mg = prealloc_get_migration(structs);
986
987         mg->err = false;
988         mg->writeback = true;
989         mg->demote = false;
990         mg->promote = false;
991         mg->cache = cache;
992         mg->old_oblock = oblock;
993         mg->cblock = cblock;
994         mg->old_ocell = cell;
995         mg->new_ocell = NULL;
996         mg->start_jiffies = jiffies;
997
998         inc_nr_migrations(cache);
999         quiesce_migration(mg);
1000 }
1001
1002 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1003                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1004                                 dm_cblock_t cblock,
1005                                 struct dm_bio_prison_cell *old_ocell,
1006                                 struct dm_bio_prison_cell *new_ocell)
1007 {
1008         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1009
1010         mg->err = false;
1011         mg->writeback = false;
1012         mg->demote = true;
1013         mg->promote = true;
1014         mg->cache = cache;
1015         mg->old_oblock = old_oblock;
1016         mg->new_oblock = new_oblock;
1017         mg->cblock = cblock;
1018         mg->old_ocell = old_ocell;
1019         mg->new_ocell = new_ocell;
1020         mg->start_jiffies = jiffies;
1021
1022         inc_nr_migrations(cache);
1023         quiesce_migration(mg);
1024 }
1025
1026 /*----------------------------------------------------------------
1027  * bio processing
1028  *--------------------------------------------------------------*/
1029 static void defer_bio(struct cache *cache, struct bio *bio)
1030 {
1031         unsigned long flags;
1032
1033         spin_lock_irqsave(&cache->lock, flags);
1034         bio_list_add(&cache->deferred_bios, bio);
1035         spin_unlock_irqrestore(&cache->lock, flags);
1036
1037         wake_worker(cache);
1038 }
1039
1040 static void process_flush_bio(struct cache *cache, struct bio *bio)
1041 {
1042         struct per_bio_data *pb = get_per_bio_data(bio);
1043
1044         BUG_ON(bio->bi_size);
1045         if (!pb->req_nr)
1046                 remap_to_origin(cache, bio);
1047         else
1048                 remap_to_cache(cache, bio, 0);
1049
1050         issue(cache, bio);
1051 }
1052
1053 /*
1054  * People generally discard large parts of a device, e.g. the whole device
1055  * when formatting.  Splitting these large discards up into cache block
1056  * sized I/Os and then quiescing (always necessary for discard) takes too
1057  * long.
1058  *
1059  * We keep it simple, and allow any size of discard to come in, and just
1060  * mark off blocks on the discard bitset.  No passdown occurs!
1061  *
1062  * To implement passdown we need to change the bio_prison such that a cell
1063  * can have a key that spans many blocks.
1064  */
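/*
 * Note that only discard blocks completely covered by the bio get
 * marked: the start is rounded up and the end rounded down to
 * discard_block_size boundaries.
 */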
1065 static void process_discard_bio(struct cache *cache, struct bio *bio)
1066 {
1067         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1068                                                   cache->discard_block_size);
1069         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1070         dm_block_t b;
1071
1072         end_block = block_div(end_block, cache->discard_block_size);
1073
1074         for (b = start_block; b < end_block; b++)
1075                 set_discard(cache, to_dblock(b));
1076
1077         bio_endio(bio, 0);
1078 }
1079
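/*
 * migration_threshold is expressed in sectors: refuse to start another
 * background migration once the blocks already in flight, plus the
 * prospective one, would reach that many sectors.
 */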
1080 static bool spare_migration_bandwidth(struct cache *cache)
1081 {
1082         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1083                 cache->sectors_per_block;
1084         return current_volume < cache->migration_threshold;
1085 }
1086
1087 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1088                                dm_cblock_t cblock)
1089 {
1090         return bio_data_dir(bio) == WRITE &&
1091                 cache->features.write_through && !is_dirty(cache, cblock);
1092 }
1093
1094 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1095 {
1096         atomic_inc(bio_data_dir(bio) == READ ?
1097                    &cache->stats.read_hit : &cache->stats.write_hit);
1098 }
1099
1100 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1101 {
1102         atomic_inc(bio_data_dir(bio) == READ ?
1103                    &cache->stats.read_miss : &cache->stats.write_miss);
1104 }
1105
1106 static void process_bio(struct cache *cache, struct prealloc *structs,
1107                         struct bio *bio)
1108 {
1109         int r;
1110         bool release_cell = true;
1111         dm_oblock_t block = get_bio_block(cache, bio);
1112         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1113         struct policy_result lookup_result;
1114         struct per_bio_data *pb = get_per_bio_data(bio);
1115         bool discarded_block = is_discarded_oblock(cache, block);
1116         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1117
1118         /*
1119          * Check to see if that block is currently migrating.
1120          */
1121         cell_prealloc = prealloc_get_cell(structs);
1122         r = bio_detain(cache, block, bio, cell_prealloc,
1123                        (cell_free_fn) prealloc_put_cell,
1124                        structs, &new_ocell);
1125         if (r > 0)
1126                 return;
1127
1128         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1129                        bio, &lookup_result);
1130
1131         if (r == -EWOULDBLOCK)
1132                 /* migration has been denied */
1133                 lookup_result.op = POLICY_MISS;
1134
1135         switch (lookup_result.op) {
1136         case POLICY_HIT:
1137                 inc_hit_counter(cache, bio);
1138                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1139
1140                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1141                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1142                 else
1143                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1144
1145                 issue(cache, bio);
1146                 break;
1147
1148         case POLICY_MISS:
1149                 inc_miss_counter(cache, bio);
1150                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1151                 remap_to_origin_clear_discard(cache, bio, block);
1152                 issue(cache, bio);
1153                 break;
1154
1155         case POLICY_NEW:
1156                 atomic_inc(&cache->stats.promotion);
1157                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1158                 release_cell = false;
1159                 break;
1160
1161         case POLICY_REPLACE:
1162                 cell_prealloc = prealloc_get_cell(structs);
1163                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1164                                (cell_free_fn) prealloc_put_cell,
1165                                structs, &old_ocell);
1166                 if (r > 0) {
1167                         /*
1168                          * We have to be careful to avoid lock inversion of
1169                          * the cells.  So we back off, and wait for the
1170                          * old_ocell to become free.
1171                          */
1172                         policy_force_mapping(cache->policy, block,
1173                                              lookup_result.old_oblock);
1174                         atomic_inc(&cache->stats.cache_cell_clash);
1175                         break;
1176                 }
1177                 atomic_inc(&cache->stats.demotion);
1178                 atomic_inc(&cache->stats.promotion);
1179
1180                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1181                                     block, lookup_result.cblock,
1182                                     old_ocell, new_ocell);
1183                 release_cell = false;
1184                 break;
1185
1186         default:
1187                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1188                             (unsigned) lookup_result.op);
1189                 bio_io_error(bio);
1190         }
1191
1192         if (release_cell)
1193                 cell_defer(cache, new_ocell, false);
1194 }
1195
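/*
 * Commit at least once every COMMIT_PERIOD jiffies; the first
 * comparison copes with jiffies wrapping around.
 */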
1196 static int need_commit_due_to_time(struct cache *cache)
1197 {
1198         return jiffies < cache->last_commit_jiffies ||
1199                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1200 }
1201
1202 static int commit_if_needed(struct cache *cache)
1203 {
1204         if (dm_cache_changed_this_transaction(cache->cmd) &&
1205             (cache->commit_requested || need_commit_due_to_time(cache))) {
1206                 atomic_inc(&cache->stats.commit_count);
1207                 cache->last_commit_jiffies = jiffies;
1208                 cache->commit_requested = false;
1209                 return dm_cache_commit(cache->cmd, false);
1210         }
1211
1212         return 0;
1213 }
1214
1215 static void process_deferred_bios(struct cache *cache)
1216 {
1217         unsigned long flags;
1218         struct bio_list bios;
1219         struct bio *bio;
1220         struct prealloc structs;
1221
1222         memset(&structs, 0, sizeof(structs));
1223         bio_list_init(&bios);
1224
1225         spin_lock_irqsave(&cache->lock, flags);
1226         bio_list_merge(&bios, &cache->deferred_bios);
1227         bio_list_init(&cache->deferred_bios);
1228         spin_unlock_irqrestore(&cache->lock, flags);
1229
1230         while (!bio_list_empty(&bios)) {
1231                 /*
1232                  * If we've got no free migration structs, and processing
1233                  * this bio might require one, we pause until there are some
1234                  * prepared mappings to process.
1235                  */
1236                 if (prealloc_data_structs(cache, &structs)) {
1237                         spin_lock_irqsave(&cache->lock, flags);
1238                         bio_list_merge(&cache->deferred_bios, &bios);
1239                         spin_unlock_irqrestore(&cache->lock, flags);
1240                         break;
1241                 }
1242
1243                 bio = bio_list_pop(&bios);
1244
1245                 if (bio->bi_rw & REQ_FLUSH)
1246                         process_flush_bio(cache, bio);
1247                 else if (bio->bi_rw & REQ_DISCARD)
1248                         process_discard_bio(cache, bio);
1249                 else
1250                         process_bio(cache, &structs, bio);
1251         }
1252
1253         prealloc_free_structs(cache, &structs);
1254 }
1255
1256 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1257 {
1258         unsigned long flags;
1259         struct bio_list bios;
1260         struct bio *bio;
1261
1262         bio_list_init(&bios);
1263
1264         spin_lock_irqsave(&cache->lock, flags);
1265         bio_list_merge(&bios, &cache->deferred_flush_bios);
1266         bio_list_init(&cache->deferred_flush_bios);
1267         spin_unlock_irqrestore(&cache->lock, flags);
1268
1269         while ((bio = bio_list_pop(&bios)))
1270                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1271 }
1272
1273 static void process_deferred_writethrough_bios(struct cache *cache)
1274 {
1275         unsigned long flags;
1276         struct bio_list bios;
1277         struct bio *bio;
1278
1279         bio_list_init(&bios);
1280
1281         spin_lock_irqsave(&cache->lock, flags);
1282         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1283         bio_list_init(&cache->deferred_writethrough_bios);
1284         spin_unlock_irqrestore(&cache->lock, flags);
1285
1286         while ((bio = bio_list_pop(&bios)))
1287                 generic_make_request(bio);
1288 }
1289
1290 static void writeback_some_dirty_blocks(struct cache *cache)
1291 {
1292         int r = 0;
1293         dm_oblock_t oblock;
1294         dm_cblock_t cblock;
1295         struct prealloc structs;
1296         struct dm_bio_prison_cell *old_ocell;
1297
1298         memset(&structs, 0, sizeof(structs));
1299
1300         while (spare_migration_bandwidth(cache)) {
1301                 if (prealloc_data_structs(cache, &structs))
1302                         break;
1303
1304                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1305                 if (r)
1306                         break;
1307
1308                 r = get_cell(cache, oblock, &structs, &old_ocell);
1309                 if (r) {
1310                         policy_set_dirty(cache->policy, oblock);
1311                         break;
1312                 }
1313
1314                 writeback(cache, &structs, oblock, cblock, old_ocell);
1315         }
1316
1317         prealloc_free_structs(cache, &structs);
1318 }
1319
1320 /*----------------------------------------------------------------
1321  * Main worker loop
1322  *--------------------------------------------------------------*/
1323 static void start_quiescing(struct cache *cache)
1324 {
1325         unsigned long flags;
1326
1327         spin_lock_irqsave(&cache->lock, flags);
1328         cache->quiescing = 1;
1329         spin_unlock_irqrestore(&cache->lock, flags);
1330 }
1331
1332 static void stop_quiescing(struct cache *cache)
1333 {
1334         unsigned long flags;
1335
1336         spin_lock_irqsave(&cache->lock, flags);
1337         cache->quiescing = 0;
1338         spin_unlock_irqrestore(&cache->lock, flags);
1339 }
1340
1341 static bool is_quiescing(struct cache *cache)
1342 {
1343         int r;
1344         unsigned long flags;
1345
1346         spin_lock_irqsave(&cache->lock, flags);
1347         r = cache->quiescing;
1348         spin_unlock_irqrestore(&cache->lock, flags);
1349
1350         return r;
1351 }
1352
1353 static void wait_for_migrations(struct cache *cache)
1354 {
1355         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1356 }
1357
1358 static void stop_worker(struct cache *cache)
1359 {
1360         cancel_delayed_work(&cache->waker);
1361         flush_workqueue(cache->wq);
1362 }
1363
1364 static void requeue_deferred_io(struct cache *cache)
1365 {
1366         struct bio *bio;
1367         struct bio_list bios;
1368
1369         bio_list_init(&bios);
1370         bio_list_merge(&bios, &cache->deferred_bios);
1371         bio_list_init(&cache->deferred_bios);
1372
1373         while ((bio = bio_list_pop(&bios)))
1374                 bio_endio(bio, DM_ENDIO_REQUEUE);
1375 }
1376
1377 static int more_work(struct cache *cache)
1378 {
1379         if (is_quiescing(cache))
1380                 return !list_empty(&cache->quiesced_migrations) ||
1381                         !list_empty(&cache->completed_migrations) ||
1382                         !list_empty(&cache->need_commit_migrations);
1383         else
1384                 return !bio_list_empty(&cache->deferred_bios) ||
1385                         !bio_list_empty(&cache->deferred_flush_bios) ||
1386                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1387                         !list_empty(&cache->quiesced_migrations) ||
1388                         !list_empty(&cache->completed_migrations) ||
1389                         !list_empty(&cache->need_commit_migrations);
1390 }
1391
1392 static void do_worker(struct work_struct *ws)
1393 {
1394         struct cache *cache = container_of(ws, struct cache, worker);
1395
1396         do {
1397                 if (!is_quiescing(cache))
1398                         process_deferred_bios(cache);
1399
1400                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1401                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1402
1403                 writeback_some_dirty_blocks(cache);
1404
1405                 process_deferred_writethrough_bios(cache);
1406
1407                 if (commit_if_needed(cache)) {
1408                         process_deferred_flush_bios(cache, false);
1409
1410                         /*
1411                          * FIXME: rollback metadata or just go into a
1412                          * failure mode and error everything
1413                          */
1414                 } else {
1415                         process_deferred_flush_bios(cache, true);
1416                         process_migrations(cache, &cache->need_commit_migrations,
1417                                            migration_success_post_commit);
1418                 }
1419         } while (more_work(cache));
1420 }
1421
1422 /*
1423  * We want to commit periodically so that not too much
1424  * unwritten metadata builds up.
1425  */
1426 static void do_waker(struct work_struct *ws)
1427 {
1428         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1429         wake_worker(cache);
1430         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1431 }
1432
1433 /*----------------------------------------------------------------*/
1434
1435 static int is_congested(struct dm_dev *dev, int bdi_bits)
1436 {
1437         struct request_queue *q = bdev_get_queue(dev->bdev);
1438         return bdi_congested(&q->backing_dev_info, bdi_bits);
1439 }
1440
1441 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1442 {
1443         struct cache *cache = container_of(cb, struct cache, callbacks);
1444
1445         return is_congested(cache->origin_dev, bdi_bits) ||
1446                 is_congested(cache->cache_dev, bdi_bits);
1447 }
1448
1449 /*----------------------------------------------------------------
1450  * Target methods
1451  *--------------------------------------------------------------*/
1452
1453 /*
1454  * This function gets called on the error paths of the constructor, so we
1455  * have to cope with a partially initialised struct.
1456  */
1457 static void destroy(struct cache *cache)
1458 {
1459         unsigned i;
1460
1461         if (cache->next_migration)
1462                 mempool_free(cache->next_migration, cache->migration_pool);
1463
1464         if (cache->migration_pool)
1465                 mempool_destroy(cache->migration_pool);
1466
1467         if (cache->all_io_ds)
1468                 dm_deferred_set_destroy(cache->all_io_ds);
1469
1470         if (cache->prison)
1471                 dm_bio_prison_destroy(cache->prison);
1472
1473         if (cache->wq)
1474                 destroy_workqueue(cache->wq);
1475
1476         if (cache->dirty_bitset)
1477                 free_bitset(cache->dirty_bitset);
1478
1479         if (cache->discard_bitset)
1480                 free_bitset(cache->discard_bitset);
1481
1482         if (cache->copier)
1483                 dm_kcopyd_client_destroy(cache->copier);
1484
1485         if (cache->cmd)
1486                 dm_cache_metadata_close(cache->cmd);
1487
1488         if (cache->metadata_dev)
1489                 dm_put_device(cache->ti, cache->metadata_dev);
1490
1491         if (cache->origin_dev)
1492                 dm_put_device(cache->ti, cache->origin_dev);
1493
1494         if (cache->cache_dev)
1495                 dm_put_device(cache->ti, cache->cache_dev);
1496
1497         if (cache->policy)
1498                 dm_cache_policy_destroy(cache->policy);
1499
1500         for (i = 0; i < cache->nr_ctr_args ; i++)
1501                 kfree(cache->ctr_args[i]);
1502         kfree(cache->ctr_args);
1503
1504         kfree(cache);
1505 }
1506
1507 static void cache_dtr(struct dm_target *ti)
1508 {
1509         struct cache *cache = ti->private;
1510
1511         destroy(cache);
1512 }
1513
1514 static sector_t get_dev_size(struct dm_dev *dev)
1515 {
1516         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1517 }
1518
1519 /*----------------------------------------------------------------*/
1520
1521 /*
1522  * Construct a cache device mapping.
1523  *
1524  * cache <metadata dev> <cache dev> <origin dev> <block size>
1525  *       <#feature args> [<feature arg>]*
1526  *       <policy> <#policy args> [<policy arg>]*
1527  *
1528  * metadata dev    : fast device holding the persistent metadata
1529  * cache dev       : fast device holding cached data blocks
1530  * origin dev      : slow device holding original data blocks
1531  * block size      : cache unit size in sectors
1532  *
1533  * #feature args   : number of feature arguments passed
1534  * feature args    : writethrough.  (The default is writeback.)
1535  *
1536  * policy          : the replacement policy to use
1537  * #policy args    : an even number of policy arguments corresponding
1538  *                   to key/value pairs passed to the policy
1539  * policy args     : key/value pairs passed to the policy
1540  *                   E.g. 'sequential_threshold 1024'
1541  *                   See cache-policies.txt for details.
1542  *
1543  * Optional feature arguments are:
1544  *   writethrough  : write through caching that prohibits cache block
1545  *                   content from being different from origin block content.
1546  *                   Without this argument, the default behaviour is to write
1547  *                   back cache block contents later for performance reasons,
1548  *                   so they may differ from the corresponding origin blocks.
1549  */
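/*
 * An example table line (the device names are purely illustrative):
 *
 *   0 41943040 cache /dev/mapper/metadata /dev/mapper/ssd \
 *	/dev/mapper/origin 512 1 writethrough default 0
 *
 * This caches a 41943040 sector origin on the ssd device using
 * 512 sector (256KB) cache blocks, keeps the metadata on its own
 * device, runs in writethrough mode and uses the default policy with
 * no policy arguments.
 */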
1550 struct cache_args {
1551         struct dm_target *ti;
1552
1553         struct dm_dev *metadata_dev;
1554
1555         struct dm_dev *cache_dev;
1556         sector_t cache_sectors;
1557
1558         struct dm_dev *origin_dev;
1559         sector_t origin_sectors;
1560
1561         uint32_t block_size;
1562
1563         const char *policy_name;
1564         int policy_argc;
1565         const char **policy_argv;
1566
1567         struct cache_features features;
1568 };
1569
1570 static void destroy_cache_args(struct cache_args *ca)
1571 {
1572         if (ca->metadata_dev)
1573                 dm_put_device(ca->ti, ca->metadata_dev);
1574
1575         if (ca->cache_dev)
1576                 dm_put_device(ca->ti, ca->cache_dev);
1577
1578         if (ca->origin_dev)
1579                 dm_put_device(ca->ti, ca->origin_dev);
1580
1581         kfree(ca);
1582 }
1583
1584 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1585 {
1586         if (!as->argc) {
1587                 *error = "Insufficient args";
1588                 return false;
1589         }
1590
1591         return true;
1592 }
1593
1594 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1595                               char **error)
1596 {
1597         int r;
1598         sector_t metadata_dev_size;
1599         char b[BDEVNAME_SIZE];
1600
1601         if (!at_least_one_arg(as, error))
1602                 return -EINVAL;
1603
1604         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1605                           &ca->metadata_dev);
1606         if (r) {
1607                 *error = "Error opening metadata device";
1608                 return r;
1609         }
1610
1611         metadata_dev_size = get_dev_size(ca->metadata_dev);
1612         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1613                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1614                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);
1615
1616         return 0;
1617 }
1618
1619 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1620                            char **error)
1621 {
1622         int r;
1623
1624         if (!at_least_one_arg(as, error))
1625                 return -EINVAL;
1626
1627         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1628                           &ca->cache_dev);
1629         if (r) {
1630                 *error = "Error opening cache device";
1631                 return r;
1632         }
1633         ca->cache_sectors = get_dev_size(ca->cache_dev);
1634
1635         return 0;
1636 }
1637
1638 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1639                             char **error)
1640 {
1641         int r;
1642
1643         if (!at_least_one_arg(as, error))
1644                 return -EINVAL;
1645
1646         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1647                           &ca->origin_dev);
1648         if (r) {
1649                 *error = "Error opening origin device";
1650                 return r;
1651         }
1652
1653         ca->origin_sectors = get_dev_size(ca->origin_dev);
1654         if (ca->ti->len > ca->origin_sectors) {
1655                 *error = "Device size larger than cached device";
1656                 return -EINVAL;
1657         }
1658
1659         return 0;
1660 }
1661
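/*
 * The block size is given in 512-byte sectors.  As a hypothetical example
 * of the checks below: 64 sectors (32KB), 192 sectors (96KB) and 512
 * sectors (256KB) are all accepted, since each is a multiple of
 * DATA_DEV_BLOCK_SIZE_MIN_SECTORS, whereas 96 sectors is rejected.  Note
 * that the block size need not be a power of two; the non-power-of-two
 * case is handled separately in cache_create() via sectors_per_block_shift.
 */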
1662 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1663                             char **error)
1664 {
1665         unsigned long tmp;
1666
1667         if (!at_least_one_arg(as, error))
1668                 return -EINVAL;
1669
1670         if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1671             tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1672             tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1673                 *error = "Invalid data block size";
1674                 return -EINVAL;
1675         }
1676
1677         if (tmp > ca->cache_sectors) {
1678                 *error = "Data block size is larger than the cache device";
1679                 return -EINVAL;
1680         }
1681
1682         ca->block_size = tmp;
1683
1684         return 0;
1685 }
1686
1687 static void init_features(struct cache_features *cf)
1688 {
1689         cf->mode = CM_WRITE;
1690         cf->write_through = false;
1691 }
1692
1693 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1694                           char **error)
1695 {
1696         static struct dm_arg _args[] = {
1697                 {0, 1, "Invalid number of cache feature arguments"},
1698         };
1699
1700         int r;
1701         unsigned argc;
1702         const char *arg;
1703         struct cache_features *cf = &ca->features;
1704
1705         init_features(cf);
1706
1707         r = dm_read_arg_group(_args, as, &argc, error);
1708         if (r)
1709                 return -EINVAL;
1710
1711         while (argc--) {
1712                 arg = dm_shift_arg(as);
1713
1714                 if (!strcasecmp(arg, "writeback"))
1715                         cf->write_through = false;
1716
1717                 else if (!strcasecmp(arg, "writethrough"))
1718                         cf->write_through = true;
1719
1720                 else {
1721                         *error = "Unrecognised cache feature requested";
1722                         return -EINVAL;
1723                 }
1724         }
1725
1726         return 0;
1727 }
1728
1729 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1730                         char **error)
1731 {
1732         static struct dm_arg _args[] = {
1733                 {0, 1024, "Invalid number of policy arguments"},
1734         };
1735
1736         int r;
1737
1738         if (!at_least_one_arg(as, error))
1739                 return -EINVAL;
1740
1741         ca->policy_name = dm_shift_arg(as);
1742
1743         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1744         if (r)
1745                 return -EINVAL;
1746
1747         ca->policy_argv = (const char **)as->argv;
1748         dm_consume_args(as, ca->policy_argc);
1749
1750         return 0;
1751 }
1752
1753 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1754                             char **error)
1755 {
1756         int r;
1757         struct dm_arg_set as;
1758
1759         as.argc = argc;
1760         as.argv = argv;
1761
1762         r = parse_metadata_dev(ca, &as, error);
1763         if (r)
1764                 return r;
1765
1766         r = parse_cache_dev(ca, &as, error);
1767         if (r)
1768                 return r;
1769
1770         r = parse_origin_dev(ca, &as, error);
1771         if (r)
1772                 return r;
1773
1774         r = parse_block_size(ca, &as, error);
1775         if (r)
1776                 return r;
1777
1778         r = parse_features(ca, &as, error);
1779         if (r)
1780                 return r;
1781
1782         r = parse_policy(ca, &as, error);
1783         if (r)
1784                 return r;
1785
1786         return 0;
1787 }
1788
1789 /*----------------------------------------------------------------*/
1790
1791 static struct kmem_cache *migration_cache;
1792
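/*
 * Hand the <key> <value> pairs from the table line to the policy.  For
 * example (values hypothetical), a policy_argv of
 * { "sequential_threshold", "1024" } with argc == 2 results in a single
 * policy_set_config_value() call; an odd argc is rejected below.
 */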
1793 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1794 {
1795         int r = 0;
1796
1797         if (argc & 1) {
1798                 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1799                 return -EINVAL;
1800         }
1801
1802         while (argc) {
1803                 r = policy_set_config_value(p, argv[0], argv[1]);
1804                 if (r) {
1805                         DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1806                                argv[0], argv[1]);
1807                         return r;
1808                 }
1809
1810                 argc -= 2;
1811                 argv += 2;
1812         }
1813
1814         return r;
1815 }
1816
1817 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1818                                char **error)
1819 {
1820         int r;
1821
1822         cache->policy = dm_cache_policy_create(ca->policy_name,
1823                                                cache->cache_size,
1824                                                cache->origin_sectors,
1825                                                cache->sectors_per_block);
1826         if (!cache->policy) {
1827                 *error = "Error creating cache's policy";
1828                 return -ENOMEM;
1829         }
1830
1831         r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1832         if (r) {
1833                 *error = "Error setting cache policy's config values";
1834                 dm_cache_policy_destroy(cache->policy);
1835                 cache->policy = NULL;
1836         }
1837
1838         return r;
1839 }
1840
1841 /*
1842  * We want the discard block size to be a power of two, at least as
1843  * large as the cache block size, and to give no more than 2^14 discard
1844  * blocks across the origin.
1845  */
1846 #define MAX_DISCARD_BLOCKS (1 << 14)
1847
1848 static bool too_many_discard_blocks(sector_t discard_block_size,
1849                                     sector_t origin_size)
1850 {
1851         (void) sector_div(origin_size, discard_block_size);
1852
1853         return origin_size > MAX_DISCARD_BLOCKS;
1854 }
1855
1856 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1857                                              sector_t origin_size)
1858 {
1859         sector_t discard_block_size;
1860
1861         discard_block_size = roundup_pow_of_two(cache_block_size);
1862
1863         if (origin_size)
1864                 while (too_many_discard_blocks(discard_block_size, origin_size))
1865                         discard_block_size *= 2;
1866
1867         return discard_block_size;
1868 }
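/*
 * A worked example with hypothetical sizes: for a cache block size of 512
 * sectors (256KB) the discard block size starts at 512 sectors.  If the
 * origin holds 2^24 sectors (8GB), that gives 2^24 / 2^9 = 32768 discard
 * blocks, which exceeds MAX_DISCARD_BLOCKS (2^14 = 16384), so the size is
 * doubled once to 1024 sectors, giving exactly 16384 discard blocks.
 */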
1869
1870 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1871
1872 static int cache_create(struct cache_args *ca, struct cache **result)
1873 {
1874         int r = 0;
1875         char **error = &ca->ti->error;
1876         struct cache *cache;
1877         struct dm_target *ti = ca->ti;
1878         dm_block_t origin_blocks;
1879         struct dm_cache_metadata *cmd;
1880         bool may_format = ca->features.mode == CM_WRITE;
1881
1882         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1883         if (!cache)
1884                 return -ENOMEM;
1885
1886         cache->ti = ca->ti;
1887         ti->private = cache;
1888         ti->per_bio_data_size = sizeof(struct per_bio_data);
1889         ti->num_flush_bios = 2;
1890         ti->flush_supported = true;
1891
1892         ti->num_discard_bios = 1;
1893         ti->discards_supported = true;
1894         ti->discard_zeroes_data_unsupported = true;
1895
1896         memcpy(&cache->features, &ca->features, sizeof(cache->features));
1897
1898         cache->callbacks.congested_fn = cache_is_congested;
1899         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1900
1901         cache->metadata_dev = ca->metadata_dev;
1902         cache->origin_dev = ca->origin_dev;
1903         cache->cache_dev = ca->cache_dev;
1904
1905         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1906
1907         /* FIXME: factor out this whole section */
1908         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1909         origin_blocks = block_div(origin_blocks, ca->block_size);
1910         cache->origin_blocks = to_oblock(origin_blocks);
1911
1912         cache->sectors_per_block = ca->block_size;
1913         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1914                 r = -EINVAL;
1915                 goto bad;
1916         }
1917
1918         if (ca->block_size & (ca->block_size - 1)) {
1919                 dm_block_t cache_size = ca->cache_sectors;
1920
1921                 cache->sectors_per_block_shift = -1;
1922                 cache_size = block_div(cache_size, ca->block_size);
1923                 cache->cache_size = to_cblock(cache_size);
1924         } else {
1925                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1926                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1927         }
1928
1929         r = create_cache_policy(cache, ca, error);
1930         if (r)
1931                 goto bad;
1932         cache->policy_nr_args = ca->policy_argc;
1933
1934         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1935                                      ca->block_size, may_format,
1936                                      dm_cache_policy_get_hint_size(cache->policy));
1937         if (IS_ERR(cmd)) {
1938                 *error = "Error creating metadata object";
1939                 r = PTR_ERR(cmd);
1940                 goto bad;
1941         }
1942         cache->cmd = cmd;
1943
1944         spin_lock_init(&cache->lock);
1945         bio_list_init(&cache->deferred_bios);
1946         bio_list_init(&cache->deferred_flush_bios);
1947         bio_list_init(&cache->deferred_writethrough_bios);
1948         INIT_LIST_HEAD(&cache->quiesced_migrations);
1949         INIT_LIST_HEAD(&cache->completed_migrations);
1950         INIT_LIST_HEAD(&cache->need_commit_migrations);
1951         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1952         atomic_set(&cache->nr_migrations, 0);
1953         init_waitqueue_head(&cache->migration_wait);
1954         r = -ENOMEM;    /* default error for the bare 'goto bad' paths below */
1955         cache->nr_dirty = 0;
1956         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1957         if (!cache->dirty_bitset) {
1958                 *error = "could not allocate dirty bitset";
1959                 goto bad;
1960         }
1961         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1962
1963         cache->discard_block_size =
1964                 calculate_discard_block_size(cache->sectors_per_block,
1965                                              cache->origin_sectors);
1966         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1967         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1968         if (!cache->discard_bitset) {
1969                 *error = "could not allocate discard bitset";
1970                 goto bad;
1971         }
1972         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1973
1974         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1975         if (IS_ERR(cache->copier)) {
1976                 *error = "could not create kcopyd client";
1977                 r = PTR_ERR(cache->copier);
1978                 goto bad;
1979         }
1980
1981         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1982         if (!cache->wq) {
1983                 *error = "could not create workqueue for metadata object";
1984                 goto bad;
1985         }
1986         INIT_WORK(&cache->worker, do_worker);
1987         INIT_DELAYED_WORK(&cache->waker, do_waker);
1988         cache->last_commit_jiffies = jiffies;
1989
1990         cache->prison = dm_bio_prison_create(PRISON_CELLS);
1991         if (!cache->prison) {
1992                 *error = "could not create bio prison";
1993                 goto bad;
1994         }
1995
1996         cache->all_io_ds = dm_deferred_set_create();
1997         if (!cache->all_io_ds) {
1998                 *error = "could not create all_io deferred set";
1999                 goto bad;
2000         }
2001
2002         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2003                                                          migration_cache);
2004         if (!cache->migration_pool) {
2005                 *error = "Error creating cache's migration mempool";
2006                 goto bad;
2007         }
2008
2009         cache->next_migration = NULL;
2010
2011         cache->need_tick_bio = true;
2012         cache->sized = false;
2013         cache->quiescing = false;
2014         cache->commit_requested = false;
2015         cache->loaded_mappings = false;
2016         cache->loaded_discards = false;
2017
2018         load_stats(cache);
2019
2020         atomic_set(&cache->stats.demotion, 0);
2021         atomic_set(&cache->stats.promotion, 0);
2022         atomic_set(&cache->stats.copies_avoided, 0);
2023         atomic_set(&cache->stats.cache_cell_clash, 0);
2024         atomic_set(&cache->stats.commit_count, 0);
2025         atomic_set(&cache->stats.discard_count, 0);
2026
2027         *result = cache;
2028         return 0;
2029
2030 bad:
2031         destroy(cache);
2032         return r;
2033 }
2034
2035 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2036 {
2037         unsigned i;
2038         const char **copy;
2039
2040         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2041         if (!copy)
2042                 return -ENOMEM;
2043         for (i = 0; i < argc; i++) {
2044                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2045                 if (!copy[i]) {
2046                         while (i--)
2047                                 kfree(copy[i]);
2048                         kfree(copy);
2049                         return -ENOMEM;
2050                 }
2051         }
2052
2053         cache->nr_ctr_args = argc;
2054         cache->ctr_args = copy;
2055
2056         return 0;
2057 }
2058
2059 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2060 {
2061         int r = -EINVAL;
2062         struct cache_args *ca;
2063         struct cache *cache = NULL;
2064
2065         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2066         if (!ca) {
2067                 ti->error = "Error allocating memory for cache";
2068                 return -ENOMEM;
2069         }
2070         ca->ti = ti;
2071
2072         r = parse_cache_args(ca, argc, argv, &ti->error);
2073         if (r)
2074                 goto out;
2075
2076         r = cache_create(ca, &cache);
2077         if (r)
2078                 goto out;
2079
2080         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2081         if (r) {
2082                 destroy(cache);
2083                 goto out;
2084         }
2085
2086         ti->private = cache;
2087
2088 out:
2089         destroy_cache_args(ca);
2090         return r;
2091 }
2092
2093 static int cache_map(struct dm_target *ti, struct bio *bio)
2094 {
2095         struct cache *cache = ti->private;
2096
2097         int r;
2098         dm_oblock_t block = get_bio_block(cache, bio);
2099         bool can_migrate = false;
2100         bool discarded_block;
2101         struct dm_bio_prison_cell *cell;
2102         struct policy_result lookup_result;
2103         struct per_bio_data *pb;
2104
2105         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2106                 /*
2107                  * This can only occur if the io goes to a partial block at
2108                  * the end of the origin device.  We don't cache these.
2109                  * Just remap to the origin and carry on.
2110                  */
2111                 remap_to_origin_clear_discard(cache, bio, block);
2112                 return DM_MAPIO_REMAPPED;
2113         }
2114
2115         pb = init_per_bio_data(bio);
2116
2117         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2118                 defer_bio(cache, bio);
2119                 return DM_MAPIO_SUBMITTED;
2120         }
2121
2122         /*
2123          * Check to see if that block is currently migrating.
2124          */
2125         cell = alloc_prison_cell(cache);
2126         if (!cell) {
2127                 defer_bio(cache, bio);
2128                 return DM_MAPIO_SUBMITTED;
2129         }
2130
2131         r = bio_detain(cache, block, bio, cell,
2132                        (cell_free_fn) free_prison_cell,
2133                        cache, &cell);
2134         if (r) {
2135                 if (r < 0)
2136                         defer_bio(cache, bio);
2137
2138                 return DM_MAPIO_SUBMITTED;
2139         }
2140
2141         discarded_block = is_discarded_oblock(cache, block);
2142
2143         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2144                        bio, &lookup_result);
2145         if (r == -EWOULDBLOCK) {
2146                 cell_defer(cache, cell, true);
2147                 return DM_MAPIO_SUBMITTED;
2148
2149         } else if (r) {
2150                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2151                 bio_io_error(bio);
2152                 return DM_MAPIO_SUBMITTED;
2153         }
2154
2155         switch (lookup_result.op) {
2156         case POLICY_HIT:
2157                 inc_hit_counter(cache, bio);
2158                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2159
2160                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2161                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2162                 else
2163                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2164
2165                 cell_defer(cache, cell, false);
2166                 break;
2167
2168         case POLICY_MISS:
2169                 inc_miss_counter(cache, bio);
2170                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2171
2172                 if (pb->req_nr != 0) {
2173                         /*
2174                          * This is a duplicate writethrough io that is no
2175                          * longer needed because the block has been demoted.
2176                          */
2177                         bio_endio(bio, 0);
2178                         cell_defer(cache, cell, false);
2179                         return DM_MAPIO_SUBMITTED;
2180                 } else {
2181                         remap_to_origin_clear_discard(cache, bio, block);
2182                         cell_defer(cache, cell, false);
2183                 }
2184                 break;
2185
2186         default:
2187                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2188                             (unsigned) lookup_result.op);
2189                 bio_io_error(bio);
2190                 return DM_MAPIO_SUBMITTED;
2191         }
2192
2193         return DM_MAPIO_REMAPPED;
2194 }
2195
2196 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2197 {
2198         struct cache *cache = ti->private;
2199         unsigned long flags;
2200         struct per_bio_data *pb = get_per_bio_data(bio);
2201
2202         if (pb->tick) {
2203                 policy_tick(cache->policy);
2204
2205                 spin_lock_irqsave(&cache->lock, flags);
2206                 cache->need_tick_bio = true;
2207                 spin_unlock_irqrestore(&cache->lock, flags);
2208         }
2209
2210         check_for_quiesced_migrations(cache, pb);
2211
2212         return 0;
2213 }
2214
2215 static int write_dirty_bitset(struct cache *cache)
2216 {
2217         unsigned i;
2218         int r;
2219         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2220                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2221                                        is_dirty(cache, to_cblock(i)));
2222                 if (r)
2223                         return r;
2224         }
2225
2226         return 0;
2227 }
2228
2229 static int write_discard_bitset(struct cache *cache)
2230 {
2231         unsigned i;
2232         int r;
2233         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2234                                            cache->discard_nr_blocks);
2235         if (r) {
2236                 DMERR("could not resize on-disk discard bitset");
2237                 return r;
2238         }
2239
2240         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2241                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2242                                          is_discarded(cache, to_dblock(i)));
2243                 if (r)
2244                         return r;
2245         }
2246
2247         return 0;
2248 }
2249
2250 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2251                      uint32_t hint)
2252 {
2253         struct cache *cache = context;
2254         return dm_cache_save_hint(cache->cmd, cblock, hint);
2255 }
2256
2257 static int write_hints(struct cache *cache)
2258 {
2259         int r;
2260
2261         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2262         if (r) {
2263                 DMERR("dm_cache_begin_hints failed");
2264                 return r;
2265         }
2266
2267         r = policy_walk_mappings(cache->policy, save_hint, cache);
2268         if (r)
2269                 DMERR("policy_walk_mappings failed");
2270
2271         return r;
2272 }
2273
2274 /*
2275  * returns true on success
2276  */
2277 static bool sync_metadata(struct cache *cache)
2278 {
2279         int r1, r2, r3, r4;
2280
2281         r1 = write_dirty_bitset(cache);
2282         if (r1)
2283                 DMERR("could not write dirty bitset");
2284
2285         r2 = write_discard_bitset(cache);
2286         if (r2)
2287                 DMERR("could not write discard bitset");
2288
2289         save_stats(cache);
2290
2291         r3 = write_hints(cache);
2292         if (r3)
2293                 DMERR("could not write hints");
2294
2295         /*
2296          * If writing the above metadata failed, we still commit, but don't
2297          * set the clean shutdown flag.  This will effectively force every
2298          * dirty bit to be set on reload.
2299          */
2300         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2301         if (r4)
2302                 DMERR("could not write cache metadata.  Data loss may occur.");
2303
2304         return !r1 && !r2 && !r3 && !r4;
2305 }
2306
2307 static void cache_postsuspend(struct dm_target *ti)
2308 {
2309         struct cache *cache = ti->private;
2310
2311         start_quiescing(cache);
2312         wait_for_migrations(cache);
2313         stop_worker(cache);
2314         requeue_deferred_io(cache);
2315         stop_quiescing(cache);
2316
2317         (void) sync_metadata(cache);
2318 }
2319
2320 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2321                         bool dirty, uint32_t hint, bool hint_valid)
2322 {
2323         int r;
2324         struct cache *cache = context;
2325
2326         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2327         if (r)
2328                 return r;
2329
2330         if (dirty)
2331                 set_dirty(cache, oblock, cblock);
2332         else
2333                 clear_dirty(cache, oblock, cblock);
2334
2335         return 0;
2336 }
2337
2338 static int load_discard(void *context, sector_t discard_block_size,
2339                         dm_dblock_t dblock, bool discard)
2340 {
2341         struct cache *cache = context;
2342
2343         /* FIXME: handle mis-matched block size */
2344
2345         if (discard)
2346                 set_discard(cache, dblock);
2347         else
2348                 clear_discard(cache, dblock);
2349
2350         return 0;
2351 }
2352
2353 static int cache_preresume(struct dm_target *ti)
2354 {
2355         int r = 0;
2356         struct cache *cache = ti->private;
2357         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2358         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2359
2360         /*
2361          * Check to see if the cache has resized.
2362          */
2363         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2364                 cache->cache_size = to_cblock(actual_cache_size);
2365
2366                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2367                 if (r) {
2368                         DMERR("could not resize cache metadata");
2369                         return r;
2370                 }
2371
2372                 cache->sized = true;
2373         }
2374
2375         if (!cache->loaded_mappings) {
2376                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2377                                            load_mapping, cache);
2378                 if (r) {
2379                         DMERR("could not load cache mappings");
2380                         return r;
2381                 }
2382
2383                 cache->loaded_mappings = true;
2384         }
2385
2386         if (!cache->loaded_discards) {
2387                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2388                 if (r) {
2389                         DMERR("could not load origin discards");
2390                         return r;
2391                 }
2392
2393                 cache->loaded_discards = true;
2394         }
2395
2396         return r;
2397 }
2398
2399 static void cache_resume(struct dm_target *ti)
2400 {
2401         struct cache *cache = ti->private;
2402
2403         cache->need_tick_bio = true;
2404         do_waker(&cache->waker.work);
2405 }
2406
2407 /*
2408  * Status format:
2409  *
2410  * <#used metadata blocks>/<#total metadata blocks>
2411  * <#read hits> <#read misses> <#write hits> <#write misses>
2412  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2413  * <#features> <features>*
2414  * <#core args> <core args>
2415  * <#policy args> <policy args>*
2416  */
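/*
 * For illustration only, an INFO status line might look like (all figures
 * hypothetical):
 *
 *   89/4096 1203 455 871 254 16 21 10232 128 1 writethrough 2
 *   migration_threshold 204800 ...
 *
 * i.e. 89 of 4096 metadata blocks used, the read/write hit and miss
 * counters, 16 demotions, 21 promotions, 10232 blocks resident in the
 * cache, 128 of them dirty, one feature argument, the two core arguments
 * (migration_threshold and its value), and finally whatever config values
 * the policy chooses to emit.
 */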
2417 static void cache_status(struct dm_target *ti, status_type_t type,
2418                          unsigned status_flags, char *result, unsigned maxlen)
2419 {
2420         int r = 0;
2421         unsigned i;
2422         ssize_t sz = 0;
2423         dm_block_t nr_free_blocks_metadata = 0;
2424         dm_block_t nr_blocks_metadata = 0;
2425         char buf[BDEVNAME_SIZE];
2426         struct cache *cache = ti->private;
2427         dm_cblock_t residency;
2428
2429         switch (type) {
2430         case STATUSTYPE_INFO:
2431                 /* Commit to ensure statistics aren't out-of-date */
2432                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2433                         r = dm_cache_commit(cache->cmd, false);
2434                         if (r)
2435                                 DMERR("could not commit metadata for accurate status");
2436                 }
2437
2438                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2439                                                            &nr_free_blocks_metadata);
2440                 if (r) {
2441                         DMERR("could not get metadata free block count");
2442                         goto err;
2443                 }
2444
2445                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2446                 if (r) {
2447                         DMERR("could not get metadata device size");
2448                         goto err;
2449                 }
2450
2451                 residency = policy_residency(cache->policy);
2452
2453                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2454                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2455                        (unsigned long long)nr_blocks_metadata,
2456                        (unsigned) atomic_read(&cache->stats.read_hit),
2457                        (unsigned) atomic_read(&cache->stats.read_miss),
2458                        (unsigned) atomic_read(&cache->stats.write_hit),
2459                        (unsigned) atomic_read(&cache->stats.write_miss),
2460                        (unsigned) atomic_read(&cache->stats.demotion),
2461                        (unsigned) atomic_read(&cache->stats.promotion),
2462                        (unsigned long long) from_cblock(residency),
2463                        cache->nr_dirty);
2464
2465                 if (cache->features.write_through)
2466                         DMEMIT("1 writethrough ");
2467                 else
2468                         DMEMIT("0 ");
2469
2470                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2471                 if (sz < maxlen) {
2472                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2473                         if (r)
2474                                 DMERR("policy_emit_config_values returned %d", r);
2475                 }
2476
2477                 break;
2478
2479         case STATUSTYPE_TABLE:
2480                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2481                 DMEMIT("%s ", buf);
2482                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2483                 DMEMIT("%s ", buf);
2484                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2485                 DMEMIT("%s", buf);
2486
2487                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2488                         DMEMIT(" %s", cache->ctr_args[i]);
2489                 if (cache->nr_ctr_args)
2490                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2491         }
2492
2493         return;
2494
2495 err:
2496         DMEMIT("Error");
2497 }
2498
2499 #define NOT_CORE_OPTION 1
2500
2501 static int process_config_option(struct cache *cache, char **argv)
2502 {
2503         unsigned long tmp;
2504
2505         if (!strcasecmp(argv[0], "migration_threshold")) {
2506                 if (kstrtoul(argv[1], 10, &tmp))
2507                         return -EINVAL;
2508
2509                 cache->migration_threshold = tmp;
2510                 return 0;
2511         }
2512
2513         return NOT_CORE_OPTION;
2514 }
2515
2516 /*
2517  * Supports <key> <value>.
2518  *
2519  * The key migration_threshold is supported by the cache target core.
2520  */
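/*
 * For example (device name hypothetical):
 *
 *   dmsetup message my-cache 0 migration_threshold 204800
 *
 * Any key not recognised by process_config_option() above is passed on to
 * the policy via policy_set_config_value().
 */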
2521 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2522 {
2523         int r;
2524         struct cache *cache = ti->private;
2525
2526         if (argc != 2)
2527                 return -EINVAL;
2528
2529         r = process_config_option(cache, argv);
2530         if (r == NOT_CORE_OPTION)
2531                 return policy_set_config_value(cache->policy, argv[0], argv[1]);
2532
2533         return r;
2534 }
2535
2536 static int cache_iterate_devices(struct dm_target *ti,
2537                                  iterate_devices_callout_fn fn, void *data)
2538 {
2539         int r = 0;
2540         struct cache *cache = ti->private;
2541
2542         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2543         if (!r)
2544                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2545
2546         return r;
2547 }
2548
2549 /*
2550  * We assume I/O is going to the origin, which is the device more likely
2551  * to have restrictions (e.g. by being striped).  Looking up the exact
2552  * location of the data would be expensive and could well be out of date
2553  * by the time the bio is submitted anyway.
2554  */
2555 static int cache_bvec_merge(struct dm_target *ti,
2556                             struct bvec_merge_data *bvm,
2557                             struct bio_vec *biovec, int max_size)
2558 {
2559         struct cache *cache = ti->private;
2560         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2561
2562         if (!q->merge_bvec_fn)
2563                 return max_size;
2564
2565         bvm->bi_bdev = cache->origin_dev->bdev;
2566         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2567 }
2568
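/*
 * As a hypothetical example of the limits set below: with a discard block
 * size of 1024 sectors, discard_granularity becomes 1024 << 9 = 512KB and
 * max_discard_sectors becomes 1024 * 1024 sectors (512MB).
 */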
2569 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2570 {
2571         /*
2572          * FIXME: these limits may be incompatible with the cache device
2573          */
2574         limits->max_discard_sectors = cache->discard_block_size * 1024;
2575         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2576 }
2577
2578 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2579 {
2580         struct cache *cache = ti->private;
2581
2582         blk_limits_io_min(limits, 0);
2583         blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2584         set_discard_limits(cache, limits);
2585 }
2586
2587 /*----------------------------------------------------------------*/
2588
2589 static struct target_type cache_target = {
2590         .name = "cache",
2591         .version = {1, 1, 0},
2592         .module = THIS_MODULE,
2593         .ctr = cache_ctr,
2594         .dtr = cache_dtr,
2595         .map = cache_map,
2596         .end_io = cache_end_io,
2597         .postsuspend = cache_postsuspend,
2598         .preresume = cache_preresume,
2599         .resume = cache_resume,
2600         .status = cache_status,
2601         .message = cache_message,
2602         .iterate_devices = cache_iterate_devices,
2603         .merge = cache_bvec_merge,
2604         .io_hints = cache_io_hints,
2605 };
2606
2607 static int __init dm_cache_init(void)
2608 {
2609         int r;
2610
2611         r = dm_register_target(&cache_target);
2612         if (r) {
2613                 DMERR("cache target registration failed: %d", r);
2614                 return r;
2615         }
2616
2617         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2618         if (!migration_cache) {
2619                 dm_unregister_target(&cache_target);
2620                 return -ENOMEM;
2621         }
2622
2623         return 0;
2624 }
2625
2626 static void __exit dm_cache_exit(void)
2627 {
2628         dm_unregister_target(&cache_target);
2629         kmem_cache_destroy(migration_cache);
2630 }
2631
2632 module_init(dm_cache_init);
2633 module_exit(dm_cache_exit);
2634
2635 MODULE_DESCRIPTION(DM_NAME " cache target");
2636 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2637 MODULE_LICENSE("GPL");