/*
 * zcache.c
 *
 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
 * Copyright (c) 2010,2011, Nitin Gupta
 *
 * Zcache provides an in-kernel "host implementation" for transcendent memory
 * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
 * page-accessible memory [1] interfaces, both utilizing the crypto compression
 * API:
 * 1) "compression buddies" ("zbud") is used for ephemeral pages
 * 2) zsmalloc is used for persistent pages.
 * Zsmalloc (a slab-based allocator) has very low fragmentation
 * so maximizes space efficiency, while zbud allows pairs (and potentially,
 * in the future, more than a pair of) compressed pages to be closely linked
 * so that reclaiming can be done via the kernel's physical-page-oriented
 * "shrinker" interface.
 *
 * [1] For a definition of page-accessible memory (aka PAM), see:
 *     http://marc.info/?l=linux-mm&m=127811271605009
 */
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/math64.h>
#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/idr.h>
#include "tmem.h"

#include "../zsmalloc/zsmalloc.h"
#ifdef CONFIG_CLEANCACHE
#include <linux/cleancache.h>
#endif
#ifdef CONFIG_FRONTSWAP
#include <linux/frontswap.h>
#endif
#if 0
/* this is more aggressive but may cause other problems? */
#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
#else
#define ZCACHE_GFP_MASK \
	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
#endif
#define MAX_CLIENTS 16
#define LOCAL_CLIENT ((uint16_t)-1)

MODULE_LICENSE("GPL");

struct zcache_client {
	struct idr tmem_pools;
	struct zs_pool *zspool;
	bool allocated;
	atomic_t refcount;
};

static struct zcache_client zcache_host;
static struct zcache_client zcache_clients[MAX_CLIENTS];

static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
{
	if (cli == &zcache_host)
		return LOCAL_CLIENT;
	return cli - &zcache_clients[0];
}

static inline bool is_local_client(struct zcache_client *cli)
{
	return cli == &zcache_host;
}
/* crypto API for zcache */
#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
static char zcache_comp_name[ZCACHE_COMP_NAME_SZ];
static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;

enum comp_op {
	ZCACHE_COMPOP_COMPRESS,
	ZCACHE_COMPOP_DECOMPRESS
};
static inline int zcache_comp_op(enum comp_op op,
				const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_comp *tfm;
	int ret;

	BUG_ON(!zcache_comp_pcpu_tfms);
	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
	switch (op) {
	case ZCACHE_COMPOP_COMPRESS:
		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
		break;
	case ZCACHE_COMPOP_DECOMPRESS:
		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
		break;
	default:
		ret = -EINVAL;
	}
	put_cpu();
	return ret;
}
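/*
 * Illustrative sketch (not part of the original driver): how a caller
 * might drive zcache_comp_op() to compress one page worth of data.
 * "src", "dst" and "dst_room" are hypothetical caller-supplied buffers;
 * dst must be large enough for incompressible worst-case output.
 */
static int __maybe_unused zcache_comp_op_example(const u8 *src, u8 *dst,
					unsigned int dst_room)
{
	unsigned int dlen = dst_room;	/* in: capacity, out: actual size */
	int ret;

	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE,
				dst, &dlen);
	if (!ret)
		pr_debug("zcache: compressed %lu bytes down to %u\n",
				(unsigned long)PAGE_SIZE, dlen);
	return ret;
}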
/*
 * Compression buddies ("zbud") provides for packing two (or, possibly
 * in the future, more) compressed ephemeral pages into a single "raw"
 * (physical) page and tracking them with data structures so that
 * the raw pages can be easily reclaimed.
 *
 * A zbud page ("zbpg") is an aligned page containing a list_head,
 * a lock, and two "zbud headers".  The remainder of the physical
 * page is divided up into aligned 64-byte "chunks" which contain
 * the compressed data for zero, one, or two zbuds.  Each zbpg
 * resides on: (1) an "unused list" if it has no zbuds; (2) a
 * "buddied" list if it is fully populated with two zbuds; or
 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
 * the one unbuddied zbud uses.  The data inside a zbpg cannot be
 * read or written unless the zbpg's lock is held.
 */
#define ZBH_SENTINEL  0x43214321
#define ZBPG_SENTINEL  0xdeadbeef

#define ZBUD_MAX_BUDS 2

struct zbud_hdr {
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	uint16_t size; /* compressed size in bytes, zero means unused */
	DECL_SENTINEL
};

struct zbud_page {
	struct list_head bud_list;
	spinlock_t lock;
	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
	DECL_SENTINEL
	/* followed by NCHUNKS aligned CHUNK_SIZE-byte chunks */
};

#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
				CHUNK_MASK) >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)
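/*
 * Worked example (illustrative; the exact value depends on the size of
 * struct zbud_page for a given config): with 4KiB pages and a header of
 * roughly 100 bytes, (4096 - 100) = 3996, rounded down to a 64-byte
 * boundary by CHUNK_MASK gives 3968, so NCHUNKS = 3968 >> 6 = 62 and
 * MAX_CHUNK = 61.
 */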
static struct {
	struct list_head list;
	unsigned count;
} zbud_unbuddied[NCHUNKS];
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static unsigned long zbud_cumul_chunk_counts[NCHUNKS];

static struct list_head zbud_buddied_list;
static unsigned long zcache_zbud_buddied_count;

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_budlists_spinlock);

static LIST_HEAD(zbpg_unused_list);
static unsigned long zcache_zbpg_unused_list_count;

/* protects the unused page list */
static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);

static atomic_t zcache_zbud_curr_raw_pages;
static atomic_t zcache_zbud_curr_zpages;
static unsigned long zcache_zbud_curr_zbytes;
static unsigned long zcache_zbud_cumul_zpages;
static unsigned long zcache_zbud_cumul_zbytes;
static unsigned long zcache_compress_poor;
static unsigned long zcache_mean_compress_poor;

/* forward references */
static void *zcache_get_free_page(void);
static void zcache_free_page(void *p);
/*
 * zbud helper functions
 */

static inline unsigned zbud_max_buddy_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
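/*
 * Worked example: a bud that compressed to 65 bytes occupies
 * (65 + 63) >> 6 = 2 chunks, while a 64-byte bud needs exactly 1.
 */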
static inline int zbud_budnum(struct zbud_hdr *zh)
{
	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
	struct zbud_page *zbpg = NULL;
	unsigned budnum = -1U;
	int i;

	for (i = 0; i < ZBUD_MAX_BUDS; i++)
		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
			budnum = i;
			break;
		}
	BUG_ON(budnum == -1U);
	return budnum;
}
static char *zbud_data(struct zbud_hdr *zh, unsigned size)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	ASSERT_SPINLOCK(&zbpg->lock);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
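/*
 * Placement note (descriptive): bud 0's data begins at the first chunk
 * boundary past the struct zbud_page header, while bud 1's data is placed
 * so that it ends flush against the end of the page; the two buddies
 * therefore grow toward each other from opposite ends.
 */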
/*
 * zbud raw page management
 */

static struct zbud_page *zbud_alloc_raw_page(void)
{
	struct zbud_page *zbpg = NULL;
	struct zbud_hdr *zh0, *zh1;
	bool recycled = 0;

	/* if any pages on the zbpg list, use one */
	spin_lock(&zbpg_unused_list_spinlock);
	if (!list_empty(&zbpg_unused_list)) {
		zbpg = list_first_entry(&zbpg_unused_list,
				struct zbud_page, bud_list);
		list_del_init(&zbpg->bud_list);
		zcache_zbpg_unused_list_count--;
		recycled = 1;
	}
	spin_unlock(&zbpg_unused_list_spinlock);
	if (zbpg == NULL)
		/* none on zbpg list, try to get a kernel page */
		zbpg = zcache_get_free_page();
	if (likely(zbpg != NULL)) {
		INIT_LIST_HEAD(&zbpg->bud_list);
		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
		spin_lock_init(&zbpg->lock);
		if (recycled) {
			ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
			SET_SENTINEL(zbpg, ZBPG);
			BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
			BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
		} else {
			atomic_inc(&zcache_zbud_curr_raw_pages);
			INIT_LIST_HEAD(&zbpg->bud_list);
			SET_SENTINEL(zbpg, ZBPG);
			zh0->size = 0; zh1->size = 0;
			tmem_oid_set_invalid(&zh0->oid);
			tmem_oid_set_invalid(&zh1->oid);
		}
	}
	return zbpg;
}
289 static void zbud_free_raw_page(struct zbud_page *zbpg)
291 struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
293 ASSERT_SENTINEL(zbpg, ZBPG);
294 BUG_ON(!list_empty(&zbpg->bud_list));
295 ASSERT_SPINLOCK(&zbpg->lock);
296 BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
297 BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
298 INVERT_SENTINEL(zbpg, ZBPG);
299 spin_unlock(&zbpg->lock);
300 spin_lock(&zbpg_unused_list_spinlock);
301 list_add(&zbpg->bud_list, &zbpg_unused_list);
302 zcache_zbpg_unused_list_count++;
303 spin_unlock(&zbpg_unused_list_spinlock);
/*
 * core zbud handling routines
 */

static unsigned zbud_free(struct zbud_hdr *zh)
{
	unsigned size;

	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(!tmem_oid_valid(&zh->oid));
	size = zh->size;
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	zh->size = 0;
	tmem_oid_set_invalid(&zh->oid);
	INVERT_SENTINEL(zh, ZBH);
	zcache_zbud_curr_zbytes -= size;
	atomic_dec(&zcache_zbud_curr_zpages);
	return size;
}
static void zbud_free_and_delist(struct zbud_hdr *zh)
{
	unsigned chunks;
	struct zbud_hdr *zh_other;
	unsigned budnum = zbud_budnum(zh), size;
	struct zbud_page *zbpg =
		container_of(zh, struct zbud_page, buddy[budnum]);

	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		spin_unlock(&zbpg->lock);
		spin_unlock(&zbud_budlists_spinlock);
		return;
	}
	size = zbud_free(zh);
	ASSERT_SPINLOCK(&zbpg->lock);
	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(size);
		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
		list_del_init(&zbpg->bud_list);
		zbud_unbuddied[chunks].count--;
		spin_unlock(&zbud_budlists_spinlock);
		zbud_free_raw_page(zbpg);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(zh_other->size);
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
		zbud_unbuddied[chunks].count++;
		spin_unlock(&zbud_budlists_spinlock);
		spin_unlock(&zbpg->lock);
	}
}
static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
					struct tmem_oid *oid,
					uint32_t index, struct page *page,
					void *cdata, unsigned size)
{
	struct zbud_hdr *zh0, *zh1, *zh = NULL;
	struct zbud_page *zbpg = NULL, *ztmp;
	unsigned nchunks;
	char *to;
	int i, found_good_buddy = 0;

	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(&zbud_budlists_spinlock);
		if (!list_empty(&zbud_unbuddied[i].list)) {
			list_for_each_entry_safe(zbpg, ztmp,
				    &zbud_unbuddied[i].list, bud_list) {
				if (spin_trylock(&zbpg->lock)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(&zbud_budlists_spinlock);
	}
	/* didn't find a good buddy, try allocating a new page */
	zbpg = zbud_alloc_raw_page();
	if (unlikely(zbpg == NULL))
		goto out;
	/* ok, have a page, now compress the data before taking locks */
	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
	zbud_unbuddied[nchunks].count++;
	zh = &zbpg->buddy[0];
	goto init_zh;

found_unbuddied:
	ASSERT_SPINLOCK(&zbpg->lock);
	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
		ASSERT_SENTINEL(zh0, ZBH);
		zh = zh1;
	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
		ASSERT_SENTINEL(zh1, ZBH);
		zh = zh0;
	}
	list_del_init(&zbpg->bud_list);
	zbud_unbuddied[found_good_buddy].count--;
	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
	zcache_zbud_buddied_count++;

init_zh:
	SET_SENTINEL(zh, ZBH);
	zh->size = size;
	zh->index = index;
	zh->oid = *oid;
	zh->pool_id = pool_id;
	zh->client_id = client_id;
	to = zbud_data(zh, size);
	memcpy(to, cdata, size);
	spin_unlock(&zbpg->lock);
	spin_unlock(&zbud_budlists_spinlock);

	zbud_cumul_chunk_counts[nchunks]++;
	atomic_inc(&zcache_zbud_curr_zpages);
	zcache_zbud_cumul_zpages++;
	zcache_zbud_curr_zbytes += size;
	zcache_zbud_cumul_zbytes += size;
out:
	return zh;
}
static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	unsigned budnum = zbud_budnum(zh);
	unsigned int out_len = PAGE_SIZE;
	char *to_va, *from_va;
	unsigned size;
	int ret = 0;

	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		ret = -EINVAL;
		goto out;
	}
	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	to_va = kmap_atomic(page);
	size = zh->size;
	from_va = zbud_data(zh, size);
	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
				to_va, &out_len);
	BUG_ON(ret);
	BUG_ON(out_len != PAGE_SIZE);
	kunmap_atomic(to_va);
out:
	spin_unlock(&zbpg->lock);
	return ret;
}
/*
 * The following routines handle shrinking of ephemeral pages by evicting
 * pages "least valuable" first.
 */

static unsigned long zcache_evicted_raw_pages;
static unsigned long zcache_evicted_buddied_pages;
static unsigned long zcache_evicted_unbuddied_pages;

static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
						uint16_t poolid);
static void zcache_put_pool(struct tmem_pool *pool);
/*
 * Flush and free all zbuds in a zbpg, then free the pageframe
 */
static void zbud_evict_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh;
	int i, j;
	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
	uint32_t index[ZBUD_MAX_BUDS];
	struct tmem_oid oid[ZBUD_MAX_BUDS];
	struct tmem_pool *pool;

	ASSERT_SPINLOCK(&zbpg->lock);
	BUG_ON(!list_empty(&zbpg->bud_list));
	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
		zh = &zbpg->buddy[i];
		if (zh->size) {
			client_id[j] = zh->client_id;
			pool_id[j] = zh->pool_id;
			oid[j] = zh->oid;
			index[j] = zh->index;
			j++;
			zbud_free(zh);
		}
	}
	spin_unlock(&zbpg->lock);
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		if (pool != NULL) {
			tmem_flush_page(pool, &oid[i], index[i]);
			zcache_put_pool(pool);
		}
	}
	ASSERT_SENTINEL(zbpg, ZBPG);
	spin_lock(&zbpg->lock);
	zbud_free_raw_page(zbpg);
}
/*
 * Free nr pages.  This code is funky because we want to hold the locks
 * protecting various lists for as short a time as possible, and in some
 * circumstances the list may change asynchronously when the list lock is
 * not held.  In some cases we also trylock not only to avoid waiting on a
 * page in use by another cpu, but also to avoid potential deadlock due to
 * lock inversion.
 */
static void zbud_evict_pages(int nr)
{
	struct zbud_page *zbpg;
	int i;

	/* first try freeing any pages on unused list */
retry_unused_list:
	spin_lock_bh(&zbpg_unused_list_spinlock);
	if (!list_empty(&zbpg_unused_list)) {
		/* can't walk list here, since it may change when unlocked */
		zbpg = list_first_entry(&zbpg_unused_list,
				struct zbud_page, bud_list);
		list_del_init(&zbpg->bud_list);
		zcache_zbpg_unused_list_count--;
		atomic_dec(&zcache_zbud_curr_raw_pages);
		spin_unlock_bh(&zbpg_unused_list_spinlock);
		zcache_free_page(zbpg);
		zcache_evicted_raw_pages++;
		if (--nr <= 0)
			goto out;
		goto retry_unused_list;
	}
	spin_unlock_bh(&zbpg_unused_list_spinlock);

	/* now try freeing unbuddied pages, starting with least space avail */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			continue;
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue;
			list_del_init(&zbpg->bud_list);
			zbud_unbuddied[i].count--;
			spin_unlock(&zbud_budlists_spinlock);
			zcache_evicted_unbuddied_pages++;
			/* want budlists unlocked when doing zbpg eviction */
			zbud_evict_zbpg(zbpg);
			local_bh_enable();
			if (--nr <= 0)
				goto out;
			goto retry_unbud_list_i;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
	}

	/* as a last resort, free buddied pages */
retry_bud_list:
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list)) {
		spin_unlock_bh(&zbud_budlists_spinlock);
		goto out;
	}
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue;
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		spin_unlock(&zbud_budlists_spinlock);
		zcache_evicted_buddied_pages++;
		/* want budlists unlocked when doing zbpg eviction */
		zbud_evict_zbpg(zbpg);
		local_bh_enable();
		if (--nr <= 0)
			goto out;
		goto retry_bud_list;
	}
	spin_unlock_bh(&zbud_budlists_spinlock);
out:
	return;
}
static void __init zbud_init(void)
{
	int i;

	INIT_LIST_HEAD(&zbud_buddied_list);
	zcache_zbud_buddied_count = 0;
	for (i = 0; i < NCHUNKS; i++)
		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
}
/*
 * These sysfs routines show a nice distribution of how many zbpg's are
 * currently (and have ever been placed) in each unbuddied list.  It's fun
 * to watch but can probably go away before final merge.
 */
static int zbud_show_unbuddied_list_counts(char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++)
		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
	return p - buf;
}
static int zbud_show_cumul_chunk_counts(char *buf)
{
	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
	unsigned long total_chunks_lte_42 = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
		chunks += zbud_cumul_chunk_counts[i];
		total_chunks += zbud_cumul_chunk_counts[i];
		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
		if (i == 21)
			total_chunks_lte_21 = total_chunks;
		if (i == 32)
			total_chunks_lte_32 = total_chunks;
		if (i == 42)
			total_chunks_lte_42 = total_chunks;
	}
	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
656 * This "zv" PAM implementation combines the slab-based zsmalloc
657 * with the crypto compression API to maximize the amount of data that can
658 * be packed into a physical page.
660 * Zv represents a PAM page with the index and object (plus a "size" value
661 * necessary for decompression) immediately preceding the compressed data.
664 #define ZVH_SENTINEL 0x43214321
674 /* rudimentary policy limits */
675 /* total number of persistent pages may not exceed this percentage */
676 static unsigned int zv_page_count_policy_percent = 75;
678 * byte count defining poor compression; pages with greater zsize will be
681 static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
683 * byte count defining poor *mean* compression; pages with greater zsize
684 * will be rejected until sufficient better-compressed pages are accepted
685 * driving the mean below this threshold
687 static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
689 static atomic_t zv_curr_dist_counts[NCHUNKS];
690 static atomic_t zv_cumul_dist_counts[NCHUNKS];
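/*
 * Worked defaults (illustrative, assuming 4KiB pages): zv_max_zsize =
 * (4096 / 8) * 7 = 3584 bytes and zv_max_mean_zsize = (4096 / 8) * 5 =
 * 2560 bytes, i.e. an individual page may compress poorly as long as the
 * running mean stays at or below 62.5% of a page.
 */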
static unsigned long zv_create(struct zs_pool *pool, uint32_t pool_id,
				struct tmem_oid *oid, uint32_t index,
				void *cdata, unsigned clen)
{
	struct zv_hdr *zv;
	u32 size = clen + sizeof(struct zv_hdr);
	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	unsigned long handle = 0;

	BUG_ON(!irqs_disabled());
	BUG_ON(chunks >= NCHUNKS);
	handle = zs_malloc(pool, size);
	if (!handle)
		goto out;
	atomic_inc(&zv_curr_dist_counts[chunks]);
	atomic_inc(&zv_cumul_dist_counts[chunks]);
	zv = zs_map_object(pool, handle);
	zv->index = index;
	zv->oid = *oid;
	zv->pool_id = pool_id;
	zv->size = clen;
	SET_SENTINEL(zv, ZVH);
	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
	zs_unmap_object(pool, handle);
out:
	return handle;
}
static void zv_free(struct zs_pool *pool, unsigned long handle)
{
	unsigned long flags;
	struct zv_hdr *zv;
	uint16_t size;
	int chunks;

	zv = zs_map_object(pool, handle);
	ASSERT_SENTINEL(zv, ZVH);
	size = zv->size + sizeof(struct zv_hdr);
	INVERT_SENTINEL(zv, ZVH);
	zs_unmap_object(pool, handle);

	chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	BUG_ON(chunks >= NCHUNKS);
	atomic_dec(&zv_curr_dist_counts[chunks]);

	local_irq_save(flags);
	zs_free(pool, handle);
	local_irq_restore(flags);
}
static void zv_decompress(struct page *page, unsigned long handle)
{
	unsigned int clen = PAGE_SIZE;
	char *to_va;
	int ret;
	struct zv_hdr *zv;

	zv = zs_map_object(zcache_host.zspool, handle);
	BUG_ON(zv->size == 0);
	ASSERT_SENTINEL(zv, ZVH);
	to_va = kmap_atomic(page);
	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv),
				zv->size, to_va, &clen);
	kunmap_atomic(to_va);
	zs_unmap_object(zcache_host.zspool, handle);
	BUG_ON(ret);
	BUG_ON(clen != PAGE_SIZE);
}
/*
 * show a distribution of compression stats for zv pages.
 */

static int zv_curr_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_curr_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}

static int zv_cumul_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_cumul_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
/*
 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected.  We don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_zsize);
}

static ssize_t zv_max_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_zsize = val;
	return count;
}
/*
 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected UNLESS the mean compression is also smaller
 * than this value.  In other words, we are load-balancing-by-zsize the
 * accepted pages.  Again, we don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_mean_zsize);
}

static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_mean_zsize = val;
	return count;
}
/*
 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
 * persistent (e.g. swap) pages that will be retained according to:
 *     (zv_page_count_policy_percent * totalram_pages) / 100
 * when that limit is reached, further puts will be rejected (until
 * some pages have been flushed).  Note that, due to compression,
 * this number may exceed 100; it defaults to 75 and we set an
 * arbitrary limit of 150.  A poor choice will almost certainly result
 * in OOMs, so this value should only be changed prudently.
 */
static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
						 struct kobj_attribute *attr,
						 char *buf)
{
	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
}

static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
						  struct kobj_attribute *attr,
						  const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > 150))
		return -EINVAL;
	zv_page_count_policy_percent = val;
	return count;
}
static struct kobj_attribute zcache_zv_max_zsize_attr = {
	.attr = { .name = "zv_max_zsize", .mode = 0644 },
	.show = zv_max_zsize_show,
	.store = zv_max_zsize_store,
};

static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
	.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
	.show = zv_max_mean_zsize_show,
	.store = zv_max_mean_zsize_store,
};

static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
	.attr = { .name = "zv_page_count_policy_percent",
		  .mode = 0644 },
	.show = zv_page_count_policy_percent_show,
	.store = zv_page_count_policy_percent_store,
};
/*
 * zcache core code starts here
 */

/* useful stats not collected by cleancache or frontswap */
static unsigned long zcache_flush_total;
static unsigned long zcache_flush_found;
static unsigned long zcache_flobj_total;
static unsigned long zcache_flobj_found;
static unsigned long zcache_failed_eph_puts;
static unsigned long zcache_failed_pers_puts;
/*
 * Tmem operations assume the poolid implies the invoking client.
 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 * RAMster has each client numbered by cluster node, and a KVM version
 * of zcache would have one client per guest and each client might
 * have multiple threads of execution.
 */
static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else {
		if (cli_id >= MAX_CLIENTS)
			goto out;
		cli = &zcache_clients[cli_id];
		if (cli == NULL)
			goto out;
	}
	atomic_inc(&cli->refcount);
	pool = idr_find(&cli->tmem_pools, poolid);
	if (pool != NULL)
		atomic_inc(&pool->refcount);
out:
	return pool;
}

static void zcache_put_pool(struct tmem_pool *pool)
{
	struct zcache_client *cli = NULL;

	if (pool == NULL)
		BUG();
	cli = pool->client;
	atomic_dec(&pool->refcount);
	atomic_dec(&cli->refcount);
}
int zcache_new_client(uint16_t cli_id)
{
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	if (cli->allocated)
		goto out;
	cli->allocated = 1;
#ifdef CONFIG_FRONTSWAP
	cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK);
	if (cli->zspool == NULL)
		goto out;
	idr_init(&cli->tmem_pools);
#endif
	ret = 0;
out:
	return ret;
}
/* counters for debugging */
static unsigned long zcache_failed_get_free_pages;
static unsigned long zcache_failed_alloc;
static unsigned long zcache_put_to_flush;

/*
 * for now, use named slabs so we can easily track usage; later we can
 * either just use kmalloc, or perhaps add a slab-like allocator
 * to more carefully manage total memory utilization
 */
static struct kmem_cache *zcache_objnode_cache;
static struct kmem_cache *zcache_obj_cache;
static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_obj_count_max;
static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_objnode_count_max;
/*
 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
 * preload all necessary data structures so the hostops callbacks never
 * actually do a malloc
 */
struct zcache_preload {
	void *page;
	struct tmem_obj *obj;
	int nr;
	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
};
static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
static int zcache_do_preload(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct tmem_objnode *objnode;
	struct tmem_obj *obj;
	void *page;
	int ret = -ENOMEM;

	if (unlikely(zcache_objnode_cache == NULL))
		goto out;
	if (unlikely(zcache_obj_cache == NULL))
		goto out;
	/* IRQ has already been disabled. */
	kp = &__get_cpu_var(zcache_preloads);
	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
		objnode = kmem_cache_alloc(zcache_objnode_cache,
				ZCACHE_GFP_MASK);
		if (unlikely(objnode == NULL)) {
			zcache_failed_alloc++;
			goto out;
		}
		kp->objnodes[kp->nr++] = objnode;
	}
	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
	if (unlikely(obj == NULL)) {
		zcache_failed_alloc++;
		goto out;
	}
	page = (void *)__get_free_page(ZCACHE_GFP_MASK);
	if (unlikely(page == NULL)) {
		zcache_failed_get_free_pages++;
		kmem_cache_free(zcache_obj_cache, obj);
		goto out;
	}
	if (kp->obj == NULL)
		kp->obj = obj;
	else
		kmem_cache_free(zcache_obj_cache, obj);
	if (kp->page == NULL)
		kp->page = page;
	else
		free_page((unsigned long)page);
	ret = 0;
out:
	return ret;
}

static void *zcache_get_free_page(void)
{
	struct zcache_preload *kp;
	void *page;

	kp = &__get_cpu_var(zcache_preloads);
	page = kp->page;
	BUG_ON(page == NULL);
	kp->page = NULL;
	return page;
}

static void zcache_free_page(void *p)
{
	free_page((unsigned long)p);
}
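/*
 * Illustrative call pattern: zcache_put_page() below invokes
 * zcache_do_preload() with IRQs disabled, after which tmem's callbacks
 * consume the per-cpu reserves via zcache_obj_alloc(),
 * zcache_objnode_alloc() and zcache_get_free_page() without ever
 * allocating in the hostops path.
 */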
/*
 * zcache implementation for tmem host ops
 */

static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
{
	struct tmem_objnode *objnode = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	if (kp->nr <= 0)
		goto out;
	objnode = kp->objnodes[kp->nr - 1];
	BUG_ON(objnode == NULL);
	kp->objnodes[kp->nr - 1] = NULL;
	kp->nr--;
	count = atomic_inc_return(&zcache_curr_objnode_count);
	if (count > zcache_curr_objnode_count_max)
		zcache_curr_objnode_count_max = count;
out:
	return objnode;
}

static void zcache_objnode_free(struct tmem_objnode *objnode,
				struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_objnode_count);
	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
	kmem_cache_free(zcache_objnode_cache, objnode);
}
static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
{
	struct tmem_obj *obj = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	obj = kp->obj;
	BUG_ON(obj == NULL);
	kp->obj = NULL;
	count = atomic_inc_return(&zcache_curr_obj_count);
	if (count > zcache_curr_obj_count_max)
		zcache_curr_obj_count_max = count;
	return obj;
}

static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_obj_count);
	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
	kmem_cache_free(zcache_obj_cache, obj);
}

static struct tmem_hostops zcache_hostops = {
	.obj_alloc = zcache_obj_alloc,
	.obj_free = zcache_obj_free,
	.objnode_alloc = zcache_objnode_alloc,
	.objnode_free = zcache_objnode_free,
};
/*
 * zcache implementations for PAM page descriptor ops
 */

static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_eph_pampd_count_max;
static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_pers_pampd_count_max;

/* forward reference */
static int zcache_compress(struct page *from, void **out_va, unsigned *out_len);
static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index)
{
	void *pampd = NULL, *cdata;
	unsigned clen;
	int ret;
	unsigned long count;
	struct page *page = (struct page *)(data);
	struct zcache_client *cli = pool->client;
	uint16_t client_id = get_client_id_from_client(cli);
	unsigned long zv_mean_zsize;
	unsigned long curr_pers_pampd_count;
	u64 total_zsize;

	if (eph) {
		ret = zcache_compress(page, &cdata, &clen);
		if (ret == 0)
			goto out;
		if (clen == 0 || clen > zbud_max_buddy_size()) {
			zcache_compress_poor++;
			goto out;
		}
		pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
						index, page, cdata, clen);
		if (pampd != NULL) {
			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
			if (count > zcache_curr_eph_pampd_count_max)
				zcache_curr_eph_pampd_count_max = count;
		}
	} else {
		curr_pers_pampd_count =
			atomic_read(&zcache_curr_pers_pampd_count);
		if (curr_pers_pampd_count >
		    (zv_page_count_policy_percent * totalram_pages) / 100)
			goto out;
		ret = zcache_compress(page, &cdata, &clen);
		if (ret == 0)
			goto out;
		/* reject if compression is too poor */
		if (clen > zv_max_zsize) {
			zcache_compress_poor++;
			goto out;
		}
		/* reject if mean compression is too poor */
		if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
			total_zsize = zs_get_total_size_bytes(cli->zspool);
			zv_mean_zsize = div_u64(total_zsize,
						curr_pers_pampd_count);
			if (zv_mean_zsize > zv_max_mean_zsize) {
				zcache_mean_compress_poor++;
				goto out;
			}
		}
		pampd = (void *)zv_create(cli->zspool, pool->pool_id,
						oid, index, cdata, clen);
		if (pampd == NULL)
			goto out;
		count = atomic_inc_return(&zcache_curr_pers_pampd_count);
		if (count > zcache_curr_pers_pampd_count_max)
			zcache_curr_pers_pampd_count_max = count;
	}
out:
	return pampd;
}
/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(is_ephemeral(pool));
	zv_decompress((struct page *)(data), (unsigned long)pampd);
	return ret;
}

/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(!is_ephemeral(pool));
	zbud_decompress((struct page *)(data), pampd);
	zbud_free_and_delist((struct zbud_hdr *)pampd);
	atomic_dec(&zcache_curr_eph_pampd_count);
	return ret;
}
/*
 * free the pampd and remove it from any zcache lists
 * pampd must no longer be pointed to from any tmem data structures!
 */
static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index)
{
	struct zcache_client *cli = pool->client;

	if (is_ephemeral(pool)) {
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		atomic_dec(&zcache_curr_eph_pampd_count);
		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
	} else {
		zv_free(cli->zspool, (unsigned long)pampd);
		atomic_dec(&zcache_curr_pers_pampd_count);
		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
	}
}

static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
{
}

static void zcache_pampd_new_obj(struct tmem_obj *obj)
{
}

static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
{
	return -1;
}

static bool zcache_pampd_is_remote(void *pampd)
{
	return 0;
}
static struct tmem_pamops zcache_pamops = {
	.create = zcache_pampd_create,
	.get_data = zcache_pampd_get_data,
	.get_data_and_free = zcache_pampd_get_data_and_free,
	.free = zcache_pampd_free,
	.free_obj = zcache_pampd_free_obj,
	.new_obj = zcache_pampd_new_obj,
	.replace_in_obj = zcache_pampd_replace_in_obj,
	.is_remote = zcache_pampd_is_remote,
};
/*
 * zcache compression/decompression and related per-cpu stuff
 */

static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
#define ZCACHE_DSTMEM_ORDER 1

static int zcache_compress(struct page *from, void **out_va, unsigned *out_len)
{
	int ret = 0;
	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
	char *from_va;

	BUG_ON(!irqs_disabled());
	if (unlikely(dmem == NULL))
		goto out;	/* no buffer or no compressor so can't compress */
	*out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
	from_va = kmap_atomic(from);
	mb();
	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
				out_len);
	BUG_ON(ret);
	*out_va = dmem;
	kunmap_atomic(from_va);
	ret = 1;
out:
	return ret;
}
static int zcache_comp_cpu_up(int cpu)
{
	struct crypto_comp *tfm;

	tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
	if (IS_ERR(tfm))
		return NOTIFY_BAD;
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
	return NOTIFY_OK;
}

static void zcache_comp_cpu_down(int cpu)
{
	struct crypto_comp *tfm;

	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
	crypto_free_comp(tfm);
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
}
static int zcache_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct zcache_preload *kp;

	switch (action) {
	case CPU_UP_PREPARE:
		ret = zcache_comp_cpu_up(cpu);
		if (ret != NOTIFY_OK) {
			pr_err("zcache: can't allocate compressor transform\n");
			return ret;
		}
		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
			GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		zcache_comp_cpu_down(cpu);
		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
			ZCACHE_DSTMEM_ORDER);
		per_cpu(zcache_dstmem, cpu) = NULL;
		kp = &per_cpu(zcache_preloads, cpu);
		while (kp->nr) {
			kmem_cache_free(zcache_objnode_cache,
					kp->objnodes[kp->nr - 1]);
			kp->objnodes[kp->nr - 1] = NULL;
			kp->nr--;
		}
		if (kp->obj) {
			kmem_cache_free(zcache_obj_cache, kp->obj);
			kp->obj = NULL;
		}
		if (kp->page) {
			free_page((unsigned long)kp->page);
			kp->page = NULL;
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
static struct notifier_block zcache_cpu_notifier_block = {
	.notifier_call = zcache_cpu_notifier
};
#ifdef CONFIG_SYSFS
#define ZCACHE_SYSFS_RO(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", zcache_##_name); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return _func(buf); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}
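/*
 * Expansion example (illustrative): ZCACHE_SYSFS_RO(flush_total) defines
 * zcache_flush_total_show() and zcache_flush_total_attr; once the
 * attribute group below is registered on mm_kobj in zcache_init(), that
 * attribute appears as the read-only file
 * /sys/kernel/mm/zcache/flush_total.
 */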
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(zbpg_unused_list_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
static struct attribute *zcache_attrs[] = {
	&zcache_curr_obj_count_attr.attr,
	&zcache_curr_obj_count_max_attr.attr,
	&zcache_curr_objnode_count_attr.attr,
	&zcache_curr_objnode_count_max_attr.attr,
	&zcache_flush_total_attr.attr,
	&zcache_flobj_total_attr.attr,
	&zcache_flush_found_attr.attr,
	&zcache_flobj_found_attr.attr,
	&zcache_failed_eph_puts_attr.attr,
	&zcache_failed_pers_puts_attr.attr,
	&zcache_compress_poor_attr.attr,
	&zcache_mean_compress_poor_attr.attr,
	&zcache_zbud_curr_raw_pages_attr.attr,
	&zcache_zbud_curr_zpages_attr.attr,
	&zcache_zbud_curr_zbytes_attr.attr,
	&zcache_zbud_cumul_zpages_attr.attr,
	&zcache_zbud_cumul_zbytes_attr.attr,
	&zcache_zbud_buddied_count_attr.attr,
	&zcache_zbpg_unused_list_count_attr.attr,
	&zcache_evicted_raw_pages_attr.attr,
	&zcache_evicted_unbuddied_pages_attr.attr,
	&zcache_evicted_buddied_pages_attr.attr,
	&zcache_failed_get_free_pages_attr.attr,
	&zcache_failed_alloc_attr.attr,
	&zcache_put_to_flush_attr.attr,
	&zcache_zbud_unbuddied_list_counts_attr.attr,
	&zcache_zbud_cumul_chunk_counts_attr.attr,
	&zcache_zv_curr_dist_counts_attr.attr,
	&zcache_zv_cumul_dist_counts_attr.attr,
	&zcache_zv_max_zsize_attr.attr,
	&zcache_zv_max_mean_zsize_attr.attr,
	&zcache_zv_page_count_policy_percent_attr.attr,
	NULL,
};

static struct attribute_group zcache_attr_group = {
	.attrs = zcache_attrs,
	.name = "zcache",
};
#endif /* CONFIG_SYSFS */
/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory allocation)
 * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
 * data consistency requires all puts while frozen to be converted into
 * flushes.
 */
static bool zcache_freeze;
/*
 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
 */
static int shrink_zcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int ret = -1;
	int nr = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;

	if (nr >= 0) {
		if (!(gfp_mask & __GFP_FS))
			/* does this case really need to be skipped? */
			goto out;
		zbud_evict_pages(nr);
	}
	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
out:
	return ret;
}

static struct shrinker zcache_shrinker = {
	.shrink = shrink_zcache_memory,
	.seeks = DEFAULT_SEEKS,
};
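/*
 * Note (descriptive): once registered in zcache_init(), memory pressure
 * causes the VM to call shrink_zcache_memory(), which evicts up to
 * nr_to_scan raw zbud pages via zbud_evict_pages() and reports the
 * remaining raw page count back to the shrinker core.
 */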
1557 * zcache shims between cleancache/frontswap ops and tmem
1560 static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1561 uint32_t index, struct page *page)
1563 struct tmem_pool *pool;
1566 BUG_ON(!irqs_disabled());
1567 pool = zcache_get_pool_by_id(cli_id, pool_id);
1568 if (unlikely(pool == NULL))
1570 if (!zcache_freeze && zcache_do_preload(pool) == 0) {
1571 /* preload does preempt_disable on success */
1572 ret = tmem_put(pool, oidp, index, (char *)(page),
1573 PAGE_SIZE, 0, is_ephemeral(pool));
1575 if (is_ephemeral(pool))
1576 zcache_failed_eph_puts++;
1578 zcache_failed_pers_puts++;
1580 zcache_put_pool(pool);
1582 zcache_put_to_flush++;
1583 if (atomic_read(&pool->obj_count) > 0)
1584 /* the put fails whether the flush succeeds or not */
1585 (void)tmem_flush_page(pool, oidp, index);
1586 zcache_put_pool(pool);
static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
				uint32_t index, struct page *page)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;
	size_t size = PAGE_SIZE;

	local_irq_save(flags);
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_get(pool, oidp, index, (char *)(page),
					&size, 0, is_ephemeral(pool));
		zcache_put_pool(pool);
	}
	local_irq_restore(flags);
	return ret;
}
static int zcache_flush_page(int cli_id, int pool_id,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flush_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flush_found++;
	local_irq_restore(flags);
	return ret;
}
static int zcache_flush_object(int cli_id, int pool_id,
				struct tmem_oid *oidp)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flobj_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_object(pool, oidp);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flobj_found++;
	local_irq_restore(flags);
	return ret;
}
static int zcache_destroy_pool(int cli_id, int pool_id)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (pool_id < 0)
		goto out;
	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = idr_find(&cli->tmem_pools, pool_id);
	if (pool == NULL)
		goto out;
	idr_remove(&cli->tmem_pools, pool_id);
	/* wait for pool activity on other cpus to quiesce */
	while (atomic_read(&pool->refcount) != 0)
		;
	atomic_dec(&cli->refcount);
	local_bh_disable();
	ret = tmem_destroy_pool(pool);
	local_bh_enable();
	kfree(pool);
	pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
			pool_id, cli_id);
out:
	return ret;
}

static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
{
	int poolid = -1;
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;
	int r;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
	if (pool == NULL) {
		pr_info("zcache: pool creation failed: out of memory\n");
		goto out;
	}
	do {
		r = idr_pre_get(&cli->tmem_pools, GFP_ATOMIC);
		if (r != 1) {
			kfree(pool);
			pr_info("zcache: pool creation failed: out of memory\n");
			goto out;
		}
		r = idr_get_new(&cli->tmem_pools, pool, &poolid);
	} while (r == -EAGAIN);
	if (r) {
		pr_info("zcache: pool creation failed: error %d\n", r);
		kfree(pool);
		goto out;
	}
	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = poolid;
	tmem_new_pool(pool, flags);
	pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
		poolid, cli_id);
out:
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return poolid;
}
/*
 * Two kernel functionalities currently can be layered on top of tmem.
 * These are "cleancache" which is used as a second-chance cache for clean
 * page cache pages; and "frontswap" which is used for swap pages
 * to avoid writes to disk.  A generic "shim" is provided here for each
 * to translate in-kernel semantics to zcache semantics.
 */
#ifdef CONFIG_CLEANCACHE
static void zcache_cleancache_put_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
}

static int zcache_cleancache_get_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;
	int ret = -1;

	if (likely(ind == index))
		ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
	return ret;
}

static void zcache_cleancache_flush_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
}

static void zcache_cleancache_flush_inode(int pool_id,
					struct cleancache_filekey key)
{
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
}

static void zcache_cleancache_flush_fs(int pool_id)
{
	if (pool_id >= 0)
		(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
}

static int zcache_cleancache_init_fs(size_t pagesize)
{
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_new_pool(LOCAL_CLIENT, 0);
}

static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
{
	/* shared pools are unsupported and map to private */
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_new_pool(LOCAL_CLIENT, 0);
}

static struct cleancache_ops zcache_cleancache_ops = {
	.put_page = zcache_cleancache_put_page,
	.get_page = zcache_cleancache_get_page,
	.invalidate_page = zcache_cleancache_flush_page,
	.invalidate_inode = zcache_cleancache_flush_inode,
	.invalidate_fs = zcache_cleancache_flush_fs,
	.init_shared_fs = zcache_cleancache_init_shared_fs,
	.init_fs = zcache_cleancache_init_fs
};

struct cleancache_ops zcache_cleancache_register_ops(void)
{
	struct cleancache_ops old_ops =
		cleancache_register_ops(&zcache_cleancache_ops);

	return old_ops;
}
#endif
#ifdef CONFIG_FRONTSWAP
/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
static int zcache_frontswap_poolid = -1;

/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
 * frontswap_load(), but has side-effects.  Hence using 8.
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
#define iswiz(_ind)		(_ind >> SWIZ_BITS)

static inline struct tmem_oid oswiz(unsigned type, u32 ind)
{
	struct tmem_oid oid = { .oid = { 0 } };

	oid.oid[0] = _oswiz(type, ind);
	return oid;
}
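/*
 * Worked example (SWIZ_BITS = 8): swap offset 0x12345 for type 1 yields
 * oid.oid[0] = _oswiz(1, 0x12345) = (1 << 8) | 0x45 = 0x145 and
 * iswiz(0x12345) = 0x123 as the index within that object, so consecutive
 * offsets are spread across 256 tmem objects per swaptype.
 */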
static int zcache_frontswap_store(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;
	unsigned long flags;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		local_irq_save(flags);
		ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind), page);
		local_irq_restore(flags);
	}
	return ret;
}
/* returns 0 if the page was successfully gotten from frontswap, -1 if
 * it was not present (should never happen!) */
static int zcache_frontswap_load(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind))
		ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind), page);
	return ret;
}
/* flush a single page from frontswap */
static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);

	if (likely(ind64 == ind))
		(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind));
}
/* flush all pages from the passed swaptype */
static void zcache_frontswap_flush_area(unsigned type)
{
	struct tmem_oid oid;
	int ind;

	for (ind = SWIZ_MASK; ind >= 0; ind--) {
		oid = oswiz(type, ind);
		(void)zcache_flush_object(LOCAL_CLIENT,
						zcache_frontswap_poolid, &oid);
	}
}
static void zcache_frontswap_init(unsigned ignored)
{
	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
	if (zcache_frontswap_poolid < 0)
		zcache_frontswap_poolid =
			zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
}

static struct frontswap_ops zcache_frontswap_ops = {
	.store = zcache_frontswap_store,
	.load = zcache_frontswap_load,
	.invalidate_page = zcache_frontswap_flush_page,
	.invalidate_area = zcache_frontswap_flush_area,
	.init = zcache_frontswap_init
};

struct frontswap_ops zcache_frontswap_register_ops(void)
{
	struct frontswap_ops old_ops =
		frontswap_register_ops(&zcache_frontswap_ops);

	return old_ops;
}
#endif
/*
 * zcache initialization
 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
 * NOTHING HAPPENS!
 */

static int zcache_enabled;

static int __init enable_zcache(char *s)
{
	zcache_enabled = 1;
	return 1;
}
__setup("zcache", enable_zcache);
/* allow independent dynamic disabling of cleancache and frontswap */

static int use_cleancache = 1;

static int __init no_cleancache(char *s)
{
	use_cleancache = 0;
	return 1;
}
__setup("nocleancache", no_cleancache);

static int use_frontswap = 1;

static int __init no_frontswap(char *s)
{
	use_frontswap = 0;
	return 1;
}
__setup("nofrontswap", no_frontswap);

static int __init enable_zcache_compressor(char *s)
{
	strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ);
	zcache_enabled = 1;
	return 1;
}
__setup("zcache=", enable_zcache_compressor);
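/*
 * Boot-time usage (illustrative): passing "zcache" on the kernel command
 * line enables zcache with the default compressor, while e.g. "zcache=lzo"
 * both enables it and selects the named crypto compression algorithm;
 * "nocleancache" and "nofrontswap" disable the respective frontends.
 */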
static int __init zcache_comp_init(void)
{
	int ret = 0;

	/* check crypto algorithm */
	if (*zcache_comp_name != '\0') {
		ret = crypto_has_comp(zcache_comp_name, 0, 0);
		if (!ret)
			pr_info("zcache: %s not supported\n",
					zcache_comp_name);
	}
	if (!ret)
		strcpy(zcache_comp_name, "lzo");
	ret = crypto_has_comp(zcache_comp_name, 0, 0);
	if (!ret) {
		ret = 1;
		goto out;
	}
	pr_info("zcache: using %s compressor\n", zcache_comp_name);

	/* alloc percpu transforms */
	ret = 0;
	zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	if (!zcache_comp_pcpu_tfms)
		ret = 1;
out:
	return ret;
}
static int __init zcache_init(void)
{
	int ret = 0;

#ifdef CONFIG_SYSFS
	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
	if (ret) {
		pr_err("zcache: can't create sysfs\n");
		goto out;
	}
#endif /* CONFIG_SYSFS */

	if (zcache_enabled) {
		unsigned int cpu;

		tmem_register_hostops(&zcache_hostops);
		tmem_register_pamops(&zcache_pamops);
		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
		if (ret) {
			pr_err("zcache: can't register cpu notifier\n");
			goto out;
		}
		ret = zcache_comp_init();
		if (ret) {
			pr_err("zcache: compressor initialization failed\n");
			goto out;
		}
		for_each_online_cpu(cpu) {
			void *pcpu = (void *)(long)cpu;
			zcache_cpu_notifier(&zcache_cpu_notifier_block,
				CPU_UP_PREPARE, pcpu);
		}
	}
	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
				sizeof(struct tmem_objnode), 0, 0, NULL);
	zcache_obj_cache = kmem_cache_create("zcache_obj",
				sizeof(struct tmem_obj), 0, 0, NULL);
	ret = zcache_new_client(LOCAL_CLIENT);
	if (ret) {
		pr_err("zcache: can't create client\n");
		goto out;
	}

#ifdef CONFIG_CLEANCACHE
	if (zcache_enabled && use_cleancache) {
		struct cleancache_ops old_ops;

		zbud_init();
		register_shrinker(&zcache_shrinker);
		old_ops = zcache_cleancache_register_ops();
		pr_info("zcache: cleancache enabled using kernel "
			"transcendent memory and compression buddies\n");
		if (old_ops.init_fs != NULL)
			pr_warning("zcache: cleancache_ops overridden");
	}
#endif
#ifdef CONFIG_FRONTSWAP
	if (zcache_enabled && use_frontswap) {
		struct frontswap_ops old_ops;

		old_ops = zcache_frontswap_register_ops();
		pr_info("zcache: frontswap enabled using kernel "
			"transcendent memory and zsmalloc\n");
		if (old_ops.init != NULL)
			pr_warning("zcache: frontswap_ops overridden");
	}
#endif
out:
	return ret;
}

module_init(zcache_init)