PM / Hibernate: Implement position keeping in radix tree
1 /*
2  * linux/kernel/power/snapshot.c
3  *
4  * This file provides system snapshot/restore functionality for swsusp.
5  *
6  * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8  *
9  * This file is released under the GPLv2.
10  *
11  */
12
13 #include <linux/version.h>
14 #include <linux/module.h>
15 #include <linux/mm.h>
16 #include <linux/suspend.h>
17 #include <linux/delay.h>
18 #include <linux/bitops.h>
19 #include <linux/spinlock.h>
20 #include <linux/kernel.h>
21 #include <linux/pm.h>
22 #include <linux/device.h>
23 #include <linux/init.h>
24 #include <linux/bootmem.h>
25 #include <linux/syscalls.h>
26 #include <linux/console.h>
27 #include <linux/highmem.h>
28 #include <linux/list.h>
29 #include <linux/slab.h>
30 #include <linux/compiler.h>
31
32 #include <asm/uaccess.h>
33 #include <asm/mmu_context.h>
34 #include <asm/pgtable.h>
35 #include <asm/tlbflush.h>
36 #include <asm/io.h>
37
38 #include "power.h"
39
40 static int swsusp_page_is_free(struct page *);
41 static void swsusp_set_page_forbidden(struct page *);
42 static void swsusp_unset_page_forbidden(struct page *);
43
44 /*
45  * Number of bytes to reserve for memory allocations made by device drivers
46  * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
47  * cause image creation to fail (tunable via /sys/power/reserved_size).
48  */
49 unsigned long reserved_size;
50
51 void __init hibernate_reserved_size_init(void)
52 {
53         reserved_size = SPARE_PAGES * PAGE_SIZE;
54 }
55
56 /*
57  * Preferred image size in bytes (tunable via /sys/power/image_size).
58  * When it is set to N, swsusp will do its best to ensure the image
59  * size will not exceed N bytes, but if that is impossible, it will
60  * try to create the smallest image possible.
61  */
62 unsigned long image_size;
63
64 void __init hibernate_image_size_init(void)
65 {
66         image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
67 }
68
69 /* List of PBEs needed for restoring the pages that were allocated before
70  * the suspend and included in the suspend image, but have also been
71  * allocated by the "resume" kernel, so their contents cannot be written
72  * directly to their "original" page frames.
73  */
74 struct pbe *restore_pblist;
75
76 /* Pointer to an auxiliary buffer (1 page) */
77 static void *buffer;
78
79 /**
80  *      @safe_needed - on resume, for storing the PBE list and the image,
81  *      we can only use memory pages that do not conflict with the pages
82  *      used before suspend.  The unsafe pages have PageNosaveFree set
83  *      and we count them using unsafe_pages.
84  *
85  *      Each allocated image page is marked as PageNosave and PageNosaveFree
86  *      so that swsusp_free() can release it.
87  */
88
89 #define PG_ANY          0
90 #define PG_SAFE         1
91 #define PG_UNSAFE_CLEAR 1
92 #define PG_UNSAFE_KEEP  0
93
94 static unsigned int allocated_unsafe_pages;
95
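/*
 * get_image_page - allocate one zeroed page for image/bookkeeping data.
 *
 * If @safe_needed is set (resume path), pages that may conflict with the
 * image being restored (their "nosave free" bit is set) are rejected and
 * only marked forbidden, so that swsusp_free() can release them later;
 * the allocation is retried until a "safe" page is found.
 */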
96 static void *get_image_page(gfp_t gfp_mask, int safe_needed)
97 {
98         void *res;
99
100         res = (void *)get_zeroed_page(gfp_mask);
101         if (safe_needed)
102                 while (res && swsusp_page_is_free(virt_to_page(res))) {
103                         /* The page is unsafe, mark it for swsusp_free() */
104                         swsusp_set_page_forbidden(virt_to_page(res));
105                         allocated_unsafe_pages++;
106                         res = (void *)get_zeroed_page(gfp_mask);
107                 }
108         if (res) {
109                 swsusp_set_page_forbidden(virt_to_page(res));
110                 swsusp_set_page_free(virt_to_page(res));
111         }
112         return res;
113 }
114
115 unsigned long get_safe_page(gfp_t gfp_mask)
116 {
117         return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
118 }
119
120 static struct page *alloc_image_page(gfp_t gfp_mask)
121 {
122         struct page *page;
123
124         page = alloc_page(gfp_mask);
125         if (page) {
126                 swsusp_set_page_forbidden(page);
127                 swsusp_set_page_free(page);
128         }
129         return page;
130 }
131
132 /**
133  *      free_image_page - free page represented by @addr, allocated with
134  *      get_image_page (page flags set by it must be cleared)
135  */
136
137 static inline void free_image_page(void *addr, int clear_nosave_free)
138 {
139         struct page *page;
140
141         BUG_ON(!virt_addr_valid(addr));
142
143         page = virt_to_page(addr);
144
145         swsusp_unset_page_forbidden(page);
146         if (clear_nosave_free)
147                 swsusp_unset_page_free(page);
148
149         __free_page(page);
150 }
151
152 /* struct linked_page is used to build chains of pages */
153
154 #define LINKED_PAGE_DATA_SIZE   (PAGE_SIZE - sizeof(void *))
155
156 struct linked_page {
157         struct linked_page *next;
158         char data[LINKED_PAGE_DATA_SIZE];
159 } __packed;
160
161 static inline void
162 free_list_of_pages(struct linked_page *list, int clear_page_nosave)
163 {
164         while (list) {
165                 struct linked_page *lp = list->next;
166
167                 free_image_page(list, clear_page_nosave);
168                 list = lp;
169         }
170 }
171
172 /**
173   *     struct chain_allocator is used for allocating small objects out of
174   *     a linked list of pages called 'the chain'.
175   *
176   *     The chain grows whenever there is no room for a new object in
177   *     the current page.  The allocated objects cannot be freed individually.
178   *     It is only possible to free them all at once, by freeing the entire
179   *     chain.
180   *
181   *     NOTE: The chain allocator may be inefficient if the allocated objects
182   *     are not much smaller than PAGE_SIZE.
183   */
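
/*
 * Typical usage (illustrative sketch, not a verbatim caller):
 *
 *	struct chain_allocator ca;
 *	struct bm_block *bb;
 *
 *	chain_init(&ca, GFP_KERNEL, PG_ANY);
 *	bb = chain_alloc(&ca, sizeof(struct bm_block));
 *	...
 *	free_list_of_pages(ca.chain, PG_UNSAFE_KEEP);
 */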
184
185 struct chain_allocator {
186         struct linked_page *chain;      /* the chain */
187         unsigned int used_space;        /* total size of objects allocated out
188                                          * of the current page
189                                          */
190         gfp_t gfp_mask;         /* mask for allocating pages */
191         int safe_needed;        /* if set, only "safe" pages are allocated */
192 };
193
194 static void
195 chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
196 {
197         ca->chain = NULL;
198         ca->used_space = LINKED_PAGE_DATA_SIZE;
199         ca->gfp_mask = gfp_mask;
200         ca->safe_needed = safe_needed;
201 }
202
203 static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
204 {
205         void *ret;
206
207         if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
208                 struct linked_page *lp;
209
210                 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
211                 if (!lp)
212                         return NULL;
213
214                 lp->next = ca->chain;
215                 ca->chain = lp;
216                 ca->used_space = 0;
217         }
218         ret = ca->chain->data + ca->used_space;
219         ca->used_space += size;
220         return ret;
221 }
222
223 /**
224  *      Data types related to memory bitmaps.
225  *
226  *      Memory bitmap is a structure consisting of many linked lists of
227  *      objects.  The main list's elements are of type struct zone_bitmap
228  *      and each of them corresponds to one zone.  For each zone bitmap
229  *      object there is a list of objects of type struct bm_block that
230  *      represent the blocks of the bitmap in which the information is stored.
231  *
232  *      struct memory_bitmap contains a pointer to the main list of zone
233  *      bitmap objects, a struct bm_position used for browsing the bitmap,
234  *      and a pointer to the list of pages used for allocating all of the
235  *      zone bitmap objects and bitmap block objects.
236  *
237  *      NOTE: It has to be possible to lay out the bitmap in memory
238  *      using only allocations of order 0.  Additionally, the bitmap is
239  *      designed to work with an arbitrary number of zones (this is over the
240  *      top for now, but let's avoid making unnecessary assumptions ;-).
241  *
242  *      struct zone_bitmap contains a pointer to a list of bitmap block
243  *      objects and a pointer to the bitmap block object that has been
244  *      most recently used for setting bits.  Additionally, it contains the
245  *      pfns that correspond to the start and end of the represented zone.
246  *
247  *      struct bm_block contains a pointer to the memory page in which
248  *      information is stored (in the form of a block of the bitmap).
249  *      It also contains the pfns that correspond to the start and end of
250  *      the represented memory area.
251  *
252  *      The memory bitmap is organized as a radix tree to guarantee fast random
253  *      access to the bits. There is one radix tree for each zone (as returned
254  *      from create_mem_extents).
255  *
256  *      One radix tree is represented by one struct mem_zone_bm_rtree. There are
257  *      two linked lists for the nodes of the tree, one for the inner nodes and
258  *      one for the leaf nodes. The linked leaf nodes are used for fast linear
259  *      access to the memory bitmap.
260  *
261  *      The struct rtree_node represents one node of the radix tree.
262  */
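
/*
 * Rough picture of the data structures described above (illustrative):
 *
 *	memory_bitmap
 *	  |-- zones  --> list of mem_zone_bm_rtree (one per zone)
 *	  |                |-- rtree  --> radix tree root (inner nodes)
 *	  |                `-- leaves --> linked list of leaf nodes
 *	  |-- blocks --> list of bm_block (linear representation)
 *	  `-- cur    --> most recently used bit position (cache)
 */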
263
264 #define BM_END_OF_MAP   (~0UL)
265
266 #define BM_BITS_PER_BLOCK       (PAGE_SIZE * BITS_PER_BYTE)
267 #define BM_BLOCK_SHIFT          (PAGE_SHIFT + 3)
268 #define BM_BLOCK_MASK           ((1UL << BM_BLOCK_SHIFT) - 1)
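
/*
 * For example, with 4 KiB pages BM_BLOCK_SHIFT is 15, so a single bitmap
 * block covers 2^15 = 32768 page frames (128 MiB of memory) and a
 * zone-relative pfn splits into
 *
 *	block = pfn >> BM_BLOCK_SHIFT;
 *	bit   = pfn & BM_BLOCK_MASK;
 */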
269
270 struct bm_block {
271         struct list_head hook;  /* hook into a list of bitmap blocks */
272         unsigned long start_pfn;        /* pfn represented by the first bit */
273         unsigned long end_pfn;  /* pfn represented by the last bit plus 1 */
274         unsigned long *data;    /* bitmap representing pages */
275 };
276
277 static inline unsigned long bm_block_bits(struct bm_block *bb)
278 {
279         return bb->end_pfn - bb->start_pfn;
280 }
281
282 /*
283  * struct rtree_node is a wrapper struct to link the nodes
284  * of the rtree together for easy linear iteration over
285  * bits and easy freeing
286  */
287 struct rtree_node {
288         struct list_head list;
289         unsigned long *data;
290 };
291
292 /*
293  * struct mem_zone_bm_rtree represents a bitmap used for one
294  * populated memory zone.
295  */
296 struct mem_zone_bm_rtree {
297         struct list_head list;          /* Link Zones together         */
298         struct list_head nodes;         /* Radix Tree inner nodes      */
299         struct list_head leaves;        /* Radix Tree leaves           */
300         unsigned long start_pfn;        /* Zone start page frame       */
301         unsigned long end_pfn;          /* Zone end page frame + 1     */
302         struct rtree_node *rtree;       /* Radix Tree Root             */
303         int levels;                     /* Number of Radix Tree Levels */
304         unsigned int blocks;            /* Number of Bitmap Blocks     */
305 };
306
307 /* struct bm_position is used for browsing memory bitmaps */
308
309 struct bm_position {
310         struct bm_block *block;
311         int bit;
312
313         struct mem_zone_bm_rtree *zone;
314         struct rtree_node *node;
315         unsigned long node_pfn;
316         int node_bit;
317 };
318
319 struct memory_bitmap {
320         struct list_head zones;
321         struct list_head blocks;        /* list of bitmap blocks */
322         struct linked_page *p_list;     /* list of pages used to store zone
323                                          * bitmap objects and bitmap block
324                                          * objects
325                                          */
326         struct bm_position cur; /* most recently used bit position */
327 };
328
329 /* Functions that operate on memory bitmaps */
330
331 #define BM_ENTRIES_PER_LEVEL    (PAGE_SIZE / sizeof(unsigned long))
332 #if BITS_PER_LONG == 32
333 #define BM_RTREE_LEVEL_SHIFT    (PAGE_SHIFT - 2)
334 #else
335 #define BM_RTREE_LEVEL_SHIFT    (PAGE_SHIFT - 3)
336 #endif
337 #define BM_RTREE_LEVEL_MASK     ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
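
/*
 * With 4 KiB pages and 64-bit longs, BM_ENTRIES_PER_LEVEL is 512 and
 * BM_RTREE_LEVEL_SHIFT is 9: every inner node holds up to 512 child
 * pointers, so a tree with n inner levels can address 512^n leaf blocks.
 */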
338
339 /*
340  *      alloc_rtree_node - Allocate a new node and add it to the radix tree.
341  *
342  *      This function is used to allocate inner nodes as well as the
343  *      leaf nodes of the radix tree. It also adds the node to the
344  *      corresponding linked list passed in by the *list parameter.
345  */
346 static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
347                                            struct chain_allocator *ca,
348                                            struct list_head *list)
349 {
350         struct rtree_node *node;
351
352         node = chain_alloc(ca, sizeof(struct rtree_node));
353         if (!node)
354                 return NULL;
355
356         node->data = get_image_page(gfp_mask, safe_needed);
357         if (!node->data)
358                 return NULL;
359
360         list_add_tail(&node->list, list);
361
362         return node;
363 }
364
365 /*
366  *      add_rtree_block - Add a new leaf node to the radix tree
367  *
368  *      The leaf nodes need to be allocated in ascending block-number
369  *      order so that the leaves linked list stays sorted.  This ordering
370  *      is guaranteed by the zone->blocks counter.
371  */
372 static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
373                            int safe_needed, struct chain_allocator *ca)
374 {
375         struct rtree_node *node, *block, **dst;
376         unsigned int levels_needed, block_nr;
377         int i;
378
379         block_nr = zone->blocks;
380         levels_needed = 0;
381
382         /* How many levels do we need for this block nr? */
383         while (block_nr) {
384                 levels_needed += 1;
385                 block_nr >>= BM_RTREE_LEVEL_SHIFT;
386         }
387
388         /* Make sure the rtree has enough levels */
389         for (i = zone->levels; i < levels_needed; i++) {
390                 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
391                                         &zone->nodes);
392                 if (!node)
393                         return -ENOMEM;
394
395                 node->data[0] = (unsigned long)zone->rtree;
396                 zone->rtree = node;
397                 zone->levels += 1;
398         }
399
400         /* Allocate new block */
401         block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
402         if (!block)
403                 return -ENOMEM;
404
405         /* Now walk the rtree to insert the block */
406         node = zone->rtree;
407         dst = &zone->rtree;
408         block_nr = zone->blocks;
409         for (i = zone->levels; i > 0; i--) {
410                 int index;
411
412                 if (!node) {
413                         node = alloc_rtree_node(gfp_mask, safe_needed, ca,
414                                                 &zone->nodes);
415                         if (!node)
416                                 return -ENOMEM;
417                         *dst = node;
418                 }
419
420                 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
421                 index &= BM_RTREE_LEVEL_MASK;
422                 dst = (struct rtree_node **)&((*dst)->data[index]);
423                 node = *dst;
424         }
425
426         zone->blocks += 1;
427         *dst = block;
428
429         return 0;
430 }
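
/*
 * Illustration (assuming BM_RTREE_LEVEL_SHIFT == 9): with zone->levels
 * == 2, the pointer to leaf block 'block_nr' ends up at
 *
 *	root->data[(block_nr >> 9) & BM_RTREE_LEVEL_MASK]	(level 2)
 *	    ->data[ block_nr       & BM_RTREE_LEVEL_MASK]	(level 1)
 */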
431
432 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
433                                int clear_nosave_free);
434
435 /*
436  *      create_zone_bm_rtree - create a radix tree for one zone
437  *
438  *      Allocates the mem_zone_bm_rtree structure and initializes it.
439  *      This function also allocates and builds the radix tree for the
440  *      zone.
441  */
442 static struct mem_zone_bm_rtree *
443 create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
444                      struct chain_allocator *ca,
445                      unsigned long start, unsigned long end)
446 {
447         struct mem_zone_bm_rtree *zone;
448         unsigned int i, nr_blocks;
449         unsigned long pages;
450
451         pages = end - start;
452         zone  = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
453         if (!zone)
454                 return NULL;
455
456         INIT_LIST_HEAD(&zone->nodes);
457         INIT_LIST_HEAD(&zone->leaves);
458         zone->start_pfn = start;
459         zone->end_pfn = end;
460         nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
461
462         for (i = 0; i < nr_blocks; i++) {
463                 if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
464                         free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
465                         return NULL;
466                 }
467         }
468
469         return zone;
470 }
471
472 /*
473  *      free_zone_bm_rtree - Free the memory of the radix tree
474  *
475  *      Free all node pages of the radix tree. The mem_zone_bm_rtree
476  *      structure itself is not freed here nor are the rtree_node
477  *      structs.
478  */
479 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
480                                int clear_nosave_free)
481 {
482         struct rtree_node *node;
483
484         list_for_each_entry(node, &zone->nodes, list)
485                 free_image_page(node->data, clear_nosave_free);
486
487         list_for_each_entry(node, &zone->leaves, list)
488                 free_image_page(node->data, clear_nosave_free);
489 }
490
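/*
 * memory_bm_position_reset - rewind the 'cursor' of @bm to the first bit
 * of the first bitmap block and to the first leaf node of the first zone,
 * so that a subsequent walk with memory_bm_next_pfn() starts from the
 * beginning of the bitmap.
 */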
491 static void memory_bm_position_reset(struct memory_bitmap *bm)
492 {
493         bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
494         bm->cur.bit = 0;
495
496         bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
497                                   list);
498         bm->cur.node = list_entry(bm->cur.zone->leaves.next,
499                                   struct rtree_node, list);
500         bm->cur.node_pfn = 0;
501         bm->cur.node_bit = 0;
502 }
503
504 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
505
506 /**
507  *      create_bm_block_list - create a list of block bitmap objects
508  *      @pages - number of pages to track
509  *      @list - list to put the allocated blocks into
510  *      @ca - chain allocator to be used for allocating memory
511  */
512 static int create_bm_block_list(unsigned long pages,
513                                 struct list_head *list,
514                                 struct chain_allocator *ca)
515 {
516         unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
517
518         while (nr_blocks-- > 0) {
519                 struct bm_block *bb;
520
521                 bb = chain_alloc(ca, sizeof(struct bm_block));
522                 if (!bb)
523                         return -ENOMEM;
524                 list_add(&bb->hook, list);
525         }
526
527         return 0;
528 }
529
530 struct mem_extent {
531         struct list_head hook;
532         unsigned long start;
533         unsigned long end;
534 };
535
536 /**
537  *      free_mem_extents - free a list of memory extents
538  *      @list - list of extents to empty
539  */
540 static void free_mem_extents(struct list_head *list)
541 {
542         struct mem_extent *ext, *aux;
543
544         list_for_each_entry_safe(ext, aux, list, hook) {
545                 list_del(&ext->hook);
546                 kfree(ext);
547         }
548 }
549
550 /**
551  *      create_mem_extents - create a list of memory extents representing
552  *                           contiguous ranges of PFNs
553  *      @list - list to put the extents into
554  *      @gfp_mask - mask to use for memory allocations
555  */
556 static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
557 {
558         struct zone *zone;
559
560         INIT_LIST_HEAD(list);
561
562         for_each_populated_zone(zone) {
563                 unsigned long zone_start, zone_end;
564                 struct mem_extent *ext, *cur, *aux;
565
566                 zone_start = zone->zone_start_pfn;
567                 zone_end = zone_end_pfn(zone);
568
569                 list_for_each_entry(ext, list, hook)
570                         if (zone_start <= ext->end)
571                                 break;
572
573                 if (&ext->hook == list || zone_end < ext->start) {
574                         /* New extent is necessary */
575                         struct mem_extent *new_ext;
576
577                         new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
578                         if (!new_ext) {
579                                 free_mem_extents(list);
580                                 return -ENOMEM;
581                         }
582                         new_ext->start = zone_start;
583                         new_ext->end = zone_end;
584                         list_add_tail(&new_ext->hook, &ext->hook);
585                         continue;
586                 }
587
588                 /* Merge this zone's range of PFNs with the existing one */
589                 if (zone_start < ext->start)
590                         ext->start = zone_start;
591                 if (zone_end > ext->end)
592                         ext->end = zone_end;
593
594                 /* More merging may be possible */
595                 cur = ext;
596                 list_for_each_entry_safe_continue(cur, aux, list, hook) {
597                         if (zone_end < cur->start)
598                                 break;
599                         if (zone_end < cur->end)
600                                 ext->end = cur->end;
601                         list_del(&cur->hook);
602                         kfree(cur);
603                 }
604         }
605
606         return 0;
607 }
608
609 /**
610   *     memory_bm_create - allocate memory for a memory bitmap
611   */
612 static int
613 memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
614 {
615         struct chain_allocator ca;
616         struct list_head mem_extents;
617         struct mem_extent *ext;
618         int error;
619
620         chain_init(&ca, gfp_mask, safe_needed);
621         INIT_LIST_HEAD(&bm->blocks);
622         INIT_LIST_HEAD(&bm->zones);
623
624         error = create_mem_extents(&mem_extents, gfp_mask);
625         if (error)
626                 return error;
627
628         list_for_each_entry(ext, &mem_extents, hook) {
629                 struct mem_zone_bm_rtree *zone;
630                 struct bm_block *bb;
631                 unsigned long pfn = ext->start;
632                 unsigned long pages = ext->end - ext->start;
633
634                 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
635
636                 error = create_bm_block_list(pages, bm->blocks.prev, &ca);
637                 if (error)
638                         goto Error;
639
640                 list_for_each_entry_continue(bb, &bm->blocks, hook) {
641                         bb->data = get_image_page(gfp_mask, safe_needed);
642                         if (!bb->data) {
643                                 error = -ENOMEM;
644                                 goto Error;
645                         }
646
647                         bb->start_pfn = pfn;
648                         if (pages >= BM_BITS_PER_BLOCK) {
649                                 pfn += BM_BITS_PER_BLOCK;
650                                 pages -= BM_BITS_PER_BLOCK;
651                         } else {
652                                 /* This is executed only once in the loop */
653                                 pfn += pages;
654                         }
655                         bb->end_pfn = pfn;
656                 }
657
658                 zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
659                                             ext->start, ext->end);
660                 if (!zone)
661                         goto Error;
662                 list_add_tail(&zone->list, &bm->zones);
663         }
664
665         bm->p_list = ca.chain;
666         memory_bm_position_reset(bm);
667  Exit:
668         free_mem_extents(&mem_extents);
669         return error;
670
671  Error:
672         bm->p_list = ca.chain;
673         memory_bm_free(bm, PG_UNSAFE_CLEAR);
674         goto Exit;
675 }
676
677 /**
678   *     memory_bm_free - free memory occupied by the memory bitmap @bm
679   */
680 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
681 {
682         struct mem_zone_bm_rtree *zone;
683         struct bm_block *bb;
684
685         list_for_each_entry(bb, &bm->blocks, hook)
686                 if (bb->data)
687                         free_image_page(bb->data, clear_nosave_free);
688
689         list_for_each_entry(zone, &bm->zones, list)
690                 free_zone_bm_rtree(zone, clear_nosave_free);
691
692         free_list_of_pages(bm->p_list, clear_nosave_free);
693
694         INIT_LIST_HEAD(&bm->zones);
695         INIT_LIST_HEAD(&bm->blocks);
696 }
697
698 /**
699  *      memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
700  *      to the given pfn.  The cur.block and cur.bit members of @bm are
701  *      updated as a side effect.
702  */
703 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
704                                 void **addr, unsigned int *bit_nr)
705 {
706         struct bm_block *bb;
707
708         /*
709          * Check if the pfn corresponds to the current bitmap block and find
710          * the block where it fits if this is not the case.
711          */
712         bb = bm->cur.block;
713         if (pfn < bb->start_pfn)
714                 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
715                         if (pfn >= bb->start_pfn)
716                                 break;
717
718         if (pfn >= bb->end_pfn)
719                 list_for_each_entry_continue(bb, &bm->blocks, hook)
720                         if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
721                                 break;
722
723         if (&bb->hook == &bm->blocks)
724                 return -EFAULT;
725
726         /* The block has been found */
727         bm->cur.block = bb;
728         pfn -= bb->start_pfn;
729         bm->cur.bit = pfn + 1;
730         *bit_nr = pfn;
731         *addr = bb->data;
732         return 0;
733 }
734
735 /*
736  *      memory_rtree_find_bit - Find the bit for pfn in the memory
737  *                              bitmap
738  *
739  *      Walks the radix tree to find the page which contains the bit for
740  *      pfn; returns that page in *addr and the bit's index within it in *bit_nr.
741  */
742 static int memory_rtree_find_bit(struct memory_bitmap *bm, unsigned long pfn,
743                                  void **addr, unsigned int *bit_nr)
744 {
745         struct mem_zone_bm_rtree *curr, *zone;
746         struct rtree_node *node;
747         int i, block_nr;
748
749         zone = bm->cur.zone;
750
751         if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
752                 goto zone_found;
753
754         zone = NULL;
755
756         /* Find the right zone */
757         list_for_each_entry(curr, &bm->zones, list) {
758                 if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
759                         zone = curr;
760                         break;
761                 }
762         }
763
764         if (!zone)
765                 return -EFAULT;
766
767 zone_found:
768         /*
769          * We have a zone. Now walk the radix tree to find the leaf
770          * node for our pfn.
771          */
772
773         node = bm->cur.node;
774         if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
775                 goto node_found;
776
777         node      = zone->rtree;
778         block_nr  = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
779
780         for (i = zone->levels; i > 0; i--) {
781                 int index;
782
783                 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
784                 index &= BM_RTREE_LEVEL_MASK;
785                 BUG_ON(node->data[index] == 0);
786                 node = (struct rtree_node *)node->data[index];
787         }
788
789 node_found:
790         /* Update last position */
791         bm->cur.zone = zone;
792         bm->cur.node = node;
793         bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
794
795         /* Set return values */
796         *addr = node->data;
797         *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
798
799         return 0;
800 }
801
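/*
 * The bit manipulation helpers below update both representations of the
 * bitmap: the bm_block list and the radix tree.  The test/lookup variants
 * additionally compare the two and WARN_ON_ONCE() if they ever disagree,
 * which keeps the old and the new data structure in lockstep while the
 * radix tree is being phased in.
 */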
802 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
803 {
804         void *addr;
805         unsigned int bit;
806         int error;
807
808         error = memory_bm_find_bit(bm, pfn, &addr, &bit);
809         BUG_ON(error);
810         set_bit(bit, addr);
811
812         error = memory_rtree_find_bit(bm, pfn, &addr, &bit);
813         BUG_ON(error);
814         set_bit(bit, addr);
815 }
816
817 static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
818 {
819         void *addr;
820         unsigned int bit;
821         int error;
822
823         error = memory_bm_find_bit(bm, pfn, &addr, &bit);
824         if (!error)
825                 set_bit(bit, addr);
826         else
827                 return error;
828
829         error = memory_rtree_find_bit(bm, pfn, &addr, &bit);
830         if (!error)
831                 set_bit(bit, addr);
832
833         return error;
834 }
835
836 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
837 {
838         void *addr;
839         unsigned int bit;
840         int error;
841
842         error = memory_bm_find_bit(bm, pfn, &addr, &bit);
843         BUG_ON(error);
844         clear_bit(bit, addr);
845
846         error = memory_rtree_find_bit(bm, pfn, &addr, &bit);
847         BUG_ON(error);
848         clear_bit(bit, addr);
849 }
850
851 static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
852 {
853         void *addr;
854         unsigned int bit;
855         int error, error2;
856         int v;
857
858         error = memory_bm_find_bit(bm, pfn, &addr, &bit);
859         BUG_ON(error);
860         v = test_bit(bit, addr);
861
862         error2 = memory_rtree_find_bit(bm, pfn, &addr, &bit);
863         BUG_ON(error2);
864
865         WARN_ON_ONCE(v != test_bit(bit, addr));
866
867         return v;
868 }
869
870 static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
871 {
872         void *addr;
873         unsigned int bit;
874         int present;
875
876         present = !memory_bm_find_bit(bm, pfn, &addr, &bit);
877
878         WARN_ON_ONCE(present != !memory_rtree_find_bit(bm, pfn, &addr, &bit));
879
880         return present;
881 }
882
883 /**
884  *      memory_bm_next_pfn - find the pfn that corresponds to the next set bit
885  *      in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
886  *      returned.
887  *
888  *      It is required to run memory_bm_position_reset() before the first call to
889  *      this function.
890  */
891
892 static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm);
893
894 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
895 {
896         unsigned long rtree_pfn;
897         struct bm_block *bb;
898         int bit;
899
900         rtree_pfn = memory_bm_rtree_next_pfn(bm);
901
902         bb = bm->cur.block;
903         do {
904                 bit = bm->cur.bit;
905                 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
906                 if (bit < bm_block_bits(bb))
907                         goto Return_pfn;
908
909                 bb = list_entry(bb->hook.next, struct bm_block, hook);
910                 bm->cur.block = bb;
911                 bm->cur.bit = 0;
912         } while (&bb->hook != &bm->blocks);
913
914         memory_bm_position_reset(bm);
915         WARN_ON_ONCE(rtree_pfn != BM_END_OF_MAP);
916         return BM_END_OF_MAP;
917
918  Return_pfn:
919         WARN_ON_ONCE(bb->start_pfn + bit != rtree_pfn);
920         bm->cur.bit = bit + 1;
921         return bb->start_pfn + bit;
922 }
923
924 /*
925  *      rtree_next_node - Jumps to the next leaf node
926  *
927  *      Sets the position to the beginning of the next node in the
928  *      memory bitmap. This is either the next node in the current
929  *      zone's radix tree or the first node in the radix tree of the
930  *      next zone.
931  *
932  *      Returns true if there is a next node, false otherwise.
933  */
934 static bool rtree_next_node(struct memory_bitmap *bm)
935 {
936         bm->cur.node = list_entry(bm->cur.node->list.next,
937                                   struct rtree_node, list);
938         if (&bm->cur.node->list != &bm->cur.zone->leaves) {
939                 bm->cur.node_pfn += BM_BITS_PER_BLOCK;
940                 bm->cur.node_bit  = 0;
941                 return true;
942         }
943
944         /* No more nodes, goto next zone */
945         bm->cur.zone = list_entry(bm->cur.zone->list.next,
946                                   struct mem_zone_bm_rtree, list);
947         if (&bm->cur.zone->list != &bm->zones) {
948                 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
949                                           struct rtree_node, list);
950                 bm->cur.node_pfn = 0;
951                 bm->cur.node_bit = 0;
952                 return true;
953         }
954
955         /* No more zones */
956         return false;
957 }
958
959 /*
960  *      memory_bm_rtree_next_pfn - Find the next set bit
961  *
962  *      Starting from the last returned position this function searches
963  *      for the next set bit in the memory bitmap and returns its
964  *      number.  If no more bits are set, BM_END_OF_MAP is returned.
965  */
966 static unsigned long memory_bm_rtree_next_pfn(struct memory_bitmap *bm)
967 {
968         unsigned long bits, pfn, pages;
969         int bit;
970
971         do {
972                 pages     = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
973                 bits      = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
974                 bit       = find_next_bit(bm->cur.node->data, bits,
975                                           bm->cur.node_bit);
976                 if (bit < bits) {
977                         pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
978                         bm->cur.node_bit = bit + 1;
979                         return pfn;
980                 }
981         } while (rtree_next_node(bm));
982
983         return BM_END_OF_MAP;
984 }
985
986 /**
987  *      This structure represents a range of page frames the contents of which
988  *      should not be saved during the suspend.
989  */
990
991 struct nosave_region {
992         struct list_head list;
993         unsigned long start_pfn;
994         unsigned long end_pfn;
995 };
996
997 static LIST_HEAD(nosave_regions);
998
999 /**
1000  *      register_nosave_region - register a range of page frames the contents
1001  *      of which should not be saved during the suspend (to be used in the early
1002  *      initialization code)
1003  */
1004
1005 void __init
1006 __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
1007                          int use_kmalloc)
1008 {
1009         struct nosave_region *region;
1010
1011         if (start_pfn >= end_pfn)
1012                 return;
1013
1014         if (!list_empty(&nosave_regions)) {
1015                 /* Try to extend the previous region (they should be sorted) */
1016                 region = list_entry(nosave_regions.prev,
1017                                         struct nosave_region, list);
1018                 if (region->end_pfn == start_pfn) {
1019                         region->end_pfn = end_pfn;
1020                         goto Report;
1021                 }
1022         }
1023         if (use_kmalloc) {
1024                 /* during init, this shouldn't fail */
1025                 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
1026                 BUG_ON(!region);
1027         } else
1028                 /* This allocation cannot fail */
1029                 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
1030         region->start_pfn = start_pfn;
1031         region->end_pfn = end_pfn;
1032         list_add_tail(&region->list, &nosave_regions);
1033  Report:
1034         printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
1035                 (unsigned long long) start_pfn << PAGE_SHIFT,
1036                 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1037 }
1038
1039 /*
1040  * Set bits in this map correspond to the page frames the contents of which
1041  * should not be saved during the suspend.
1042  */
1043 static struct memory_bitmap *forbidden_pages_map;
1044
1045 /* Set bits in this map correspond to free page frames. */
1046 static struct memory_bitmap *free_pages_map;
1047
1048 /*
1049  * Each page frame allocated for creating the image is marked by setting the
1050  * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
1051  */
1052
1053 void swsusp_set_page_free(struct page *page)
1054 {
1055         if (free_pages_map)
1056                 memory_bm_set_bit(free_pages_map, page_to_pfn(page));
1057 }
1058
1059 static int swsusp_page_is_free(struct page *page)
1060 {
1061         return free_pages_map ?
1062                 memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
1063 }
1064
1065 void swsusp_unset_page_free(struct page *page)
1066 {
1067         if (free_pages_map)
1068                 memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
1069 }
1070
1071 static void swsusp_set_page_forbidden(struct page *page)
1072 {
1073         if (forbidden_pages_map)
1074                 memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
1075 }
1076
1077 int swsusp_page_is_forbidden(struct page *page)
1078 {
1079         return forbidden_pages_map ?
1080                 memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
1081 }
1082
1083 static void swsusp_unset_page_forbidden(struct page *page)
1084 {
1085         if (forbidden_pages_map)
1086                 memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
1087 }
1088
1089 /**
1090  *      mark_nosave_pages - set bits corresponding to the page frames the
1091  *      contents of which should not be saved in a given bitmap.
1092  */
1093
1094 static void mark_nosave_pages(struct memory_bitmap *bm)
1095 {
1096         struct nosave_region *region;
1097
1098         if (list_empty(&nosave_regions))
1099                 return;
1100
1101         list_for_each_entry(region, &nosave_regions, list) {
1102                 unsigned long pfn;
1103
1104                 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
1105                          (unsigned long long) region->start_pfn << PAGE_SHIFT,
1106                          ((unsigned long long) region->end_pfn << PAGE_SHIFT)
1107                                 - 1);
1108
1109                 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
1110                         if (pfn_valid(pfn)) {
1111                                 /*
1112                                  * It is safe to ignore the result of
1113                                  * mem_bm_set_bit_check() here, since we won't
1114                                  * touch the PFNs for which the error is
1115                                  * returned anyway.
1116                                  */
1117                                 mem_bm_set_bit_check(bm, pfn);
1118                         }
1119         }
1120 }
1121
1122 /**
1123  *      create_basic_memory_bitmaps - create bitmaps needed for marking page
1124  *      frames that should not be saved and free page frames.  The pointers
1125  *      forbidden_pages_map and free_pages_map are only modified if everything
1126  *      goes well, because we don't want the bits to be used before both bitmaps
1127  *      are set up.
1128  */
1129
1130 int create_basic_memory_bitmaps(void)
1131 {
1132         struct memory_bitmap *bm1, *bm2;
1133         int error = 0;
1134
1135         if (forbidden_pages_map && free_pages_map)
1136                 return 0;
1137         else
1138                 BUG_ON(forbidden_pages_map || free_pages_map);
1139
1140         bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
1141         if (!bm1)
1142                 return -ENOMEM;
1143
1144         error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
1145         if (error)
1146                 goto Free_first_object;
1147
1148         bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
1149         if (!bm2)
1150                 goto Free_first_bitmap;
1151
1152         error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
1153         if (error)
1154                 goto Free_second_object;
1155
1156         forbidden_pages_map = bm1;
1157         free_pages_map = bm2;
1158         mark_nosave_pages(forbidden_pages_map);
1159
1160         pr_debug("PM: Basic memory bitmaps created\n");
1161
1162         return 0;
1163
1164  Free_second_object:
1165         kfree(bm2);
1166  Free_first_bitmap:
1167         memory_bm_free(bm1, PG_UNSAFE_CLEAR);
1168  Free_first_object:
1169         kfree(bm1);
1170         return -ENOMEM;
1171 }
1172
1173 /**
1174  *      free_basic_memory_bitmaps - free memory bitmaps allocated by
1175  *      create_basic_memory_bitmaps().  The auxiliary pointers are necessary
1176  *      so that the bitmaps themselves are not referred to while they are being
1177  *      freed.
1178  */
1179
1180 void free_basic_memory_bitmaps(void)
1181 {
1182         struct memory_bitmap *bm1, *bm2;
1183
1184         if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
1185                 return;
1186
1187         bm1 = forbidden_pages_map;
1188         bm2 = free_pages_map;
1189         forbidden_pages_map = NULL;
1190         free_pages_map = NULL;
1191         memory_bm_free(bm1, PG_UNSAFE_CLEAR);
1192         kfree(bm1);
1193         memory_bm_free(bm2, PG_UNSAFE_CLEAR);
1194         kfree(bm2);
1195
1196         pr_debug("PM: Basic memory bitmaps freed\n");
1197 }
1198
1199 /**
1200  *      snapshot_additional_pages - estimate the number of additional pages
1201  *      needed for setting up the suspend image data structures for the given
1202  *      zone (usually the returned value is greater than the exact number)
1203  */
1204
1205 unsigned int snapshot_additional_pages(struct zone *zone)
1206 {
1207         unsigned int rtree, nodes;
1208         unsigned int res;
1209
1210         res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
1211         res += DIV_ROUND_UP(res * sizeof(struct bm_block),
1212                             LINKED_PAGE_DATA_SIZE);
1213         rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
1214         rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
1215                               LINKED_PAGE_DATA_SIZE);
1216         while (nodes > 1) {
1217                 nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
1218                 rtree += nodes;
1219         }
1220
1221         return 2 * (res + rtree);
1222 }
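
/*
 * Rough worked example for snapshot_additional_pages() above (assuming
 * 4 KiB pages and 64-bit pointers): for a zone spanning 4 GiB (2^20
 * pages) there are 32 bitmap blocks, so 'res' evaluates to 33 pages
 * (32 data pages plus one page of bm_block structs) and 'rtree' to 34
 * pages (32 leaf pages, one page of rtree_node structs and one inner
 * node), giving 2 * (33 + 34) = 134 additional pages.
 */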
1223
1224 #ifdef CONFIG_HIGHMEM
1225 /**
1226  *      count_free_highmem_pages - compute the total number of free highmem
1227  *      pages, system-wide.
1228  */
1229
1230 static unsigned int count_free_highmem_pages(void)
1231 {
1232         struct zone *zone;
1233         unsigned int cnt = 0;
1234
1235         for_each_populated_zone(zone)
1236                 if (is_highmem(zone))
1237                         cnt += zone_page_state(zone, NR_FREE_PAGES);
1238
1239         return cnt;
1240 }
1241
1242 /**
1243  *      saveable_highmem_page - Determine whether a highmem page should be
1244  *      included in the suspend image.
1245  *
1246  *      We should save the page if it isn't Nosave or NosaveFree, or Reserved,
1247  *      and it isn't a part of a free chunk of pages.
1248  */
1249 static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1250 {
1251         struct page *page;
1252
1253         if (!pfn_valid(pfn))
1254                 return NULL;
1255
1256         page = pfn_to_page(pfn);
1257         if (page_zone(page) != zone)
1258                 return NULL;
1259
1260         BUG_ON(!PageHighMem(page));
1261
1262         if (swsusp_page_is_forbidden(page) ||  swsusp_page_is_free(page) ||
1263             PageReserved(page))
1264                 return NULL;
1265
1266         if (page_is_guard(page))
1267                 return NULL;
1268
1269         return page;
1270 }
1271
1272 /**
1273  *      count_highmem_pages - compute the total number of saveable highmem
1274  *      pages.
1275  */
1276
1277 static unsigned int count_highmem_pages(void)
1278 {
1279         struct zone *zone;
1280         unsigned int n = 0;
1281
1282         for_each_populated_zone(zone) {
1283                 unsigned long pfn, max_zone_pfn;
1284
1285                 if (!is_highmem(zone))
1286                         continue;
1287
1288                 mark_free_pages(zone);
1289                 max_zone_pfn = zone_end_pfn(zone);
1290                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1291                         if (saveable_highmem_page(zone, pfn))
1292                                 n++;
1293         }
1294         return n;
1295 }
1296 #else
1297 static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
1298 {
1299         return NULL;
1300 }
1301 #endif /* CONFIG_HIGHMEM */
1302
1303 /**
1304  *      saveable_page - Determine whether a non-highmem page should be included
1305  *      in the suspend image.
1306  *
1307  *      We should save the page if it isn't Nosave, and is not in the range
1308  *      of pages statically defined as 'unsaveable', and it isn't a part of
1309  *      a free chunk of pages.
1310  */
1311 static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1312 {
1313         struct page *page;
1314
1315         if (!pfn_valid(pfn))
1316                 return NULL;
1317
1318         page = pfn_to_page(pfn);
1319         if (page_zone(page) != zone)
1320                 return NULL;
1321
1322         BUG_ON(PageHighMem(page));
1323
1324         if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
1325                 return NULL;
1326
1327         if (PageReserved(page)
1328             && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
1329                 return NULL;
1330
1331         if (page_is_guard(page))
1332                 return NULL;
1333
1334         return page;
1335 }
1336
1337 /**
1338  *      count_data_pages - compute the total number of saveable non-highmem
1339  *      pages.
1340  */
1341
1342 static unsigned int count_data_pages(void)
1343 {
1344         struct zone *zone;
1345         unsigned long pfn, max_zone_pfn;
1346         unsigned int n = 0;
1347
1348         for_each_populated_zone(zone) {
1349                 if (is_highmem(zone))
1350                         continue;
1351
1352                 mark_free_pages(zone);
1353                 max_zone_pfn = zone_end_pfn(zone);
1354                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1355                         if (saveable_page(zone, pfn))
1356                                 n++;
1357         }
1358         return n;
1359 }
1360
1361 /* This is needed because copy_page and memcpy are not usable for copying
1362  * task structs.
1363  */
1364 static inline void do_copy_page(long *dst, long *src)
1365 {
1366         int n;
1367
1368         for (n = PAGE_SIZE / sizeof(long); n; n--)
1369                 *dst++ = *src++;
1370 }
1371
1372
1373 /**
1374  *      safe_copy_page - check if the page we are going to copy is marked as
1375  *              present in the kernel page tables (this always is the case if
1376  *              CONFIG_DEBUG_PAGEALLOC is not set and in that case
1377  *              kernel_page_present() always returns 'true').
1378  */
1379 static void safe_copy_page(void *dst, struct page *s_page)
1380 {
1381         if (kernel_page_present(s_page)) {
1382                 do_copy_page(dst, page_address(s_page));
1383         } else {
1384                 kernel_map_pages(s_page, 1, 1);
1385                 do_copy_page(dst, page_address(s_page));
1386                 kernel_map_pages(s_page, 1, 0);
1387         }
1388 }
1389
1390
1391 #ifdef CONFIG_HIGHMEM
1392 static inline struct page *
1393 page_is_saveable(struct zone *zone, unsigned long pfn)
1394 {
1395         return is_highmem(zone) ?
1396                 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
1397 }
1398
1399 static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1400 {
1401         struct page *s_page, *d_page;
1402         void *src, *dst;
1403
1404         s_page = pfn_to_page(src_pfn);
1405         d_page = pfn_to_page(dst_pfn);
1406         if (PageHighMem(s_page)) {
1407                 src = kmap_atomic(s_page);
1408                 dst = kmap_atomic(d_page);
1409                 do_copy_page(dst, src);
1410                 kunmap_atomic(dst);
1411                 kunmap_atomic(src);
1412         } else {
1413                 if (PageHighMem(d_page)) {
1414                         /* Page pointed to by src may contain some kernel
1415                          * data modified by kmap_atomic()
1416                          */
1417                         safe_copy_page(buffer, s_page);
1418                         dst = kmap_atomic(d_page);
1419                         copy_page(dst, buffer);
1420                         kunmap_atomic(dst);
1421                 } else {
1422                         safe_copy_page(page_address(d_page), s_page);
1423                 }
1424         }
1425 }
1426 #else
1427 #define page_is_saveable(zone, pfn)     saveable_page(zone, pfn)
1428
1429 static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1430 {
1431         safe_copy_page(page_address(pfn_to_page(dst_pfn)),
1432                                 pfn_to_page(src_pfn));
1433 }
1434 #endif /* CONFIG_HIGHMEM */
1435
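/*
 * copy_data_pages - first mark every saveable page in @orig_bm, then walk
 * @orig_bm and @copy_bm in lockstep, copying each saveable page into the
 * page frame that was preallocated for it (as recorded in @copy_bm).
 */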
1436 static void
1437 copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1438 {
1439         struct zone *zone;
1440         unsigned long pfn;
1441
1442         for_each_populated_zone(zone) {
1443                 unsigned long max_zone_pfn;
1444
1445                 mark_free_pages(zone);
1446                 max_zone_pfn = zone_end_pfn(zone);
1447                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1448                         if (page_is_saveable(zone, pfn))
1449                                 memory_bm_set_bit(orig_bm, pfn);
1450         }
1451         memory_bm_position_reset(orig_bm);
1452         memory_bm_position_reset(copy_bm);
1453         for(;;) {
1454                 pfn = memory_bm_next_pfn(orig_bm);
1455                 if (unlikely(pfn == BM_END_OF_MAP))
1456                         break;
1457                 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
1458         }
1459 }
1460
1461 /* Total number of image pages */
1462 static unsigned int nr_copy_pages;
1463 /* Number of pages needed for saving the original pfns of the image pages */
1464 static unsigned int nr_meta_pages;
1465 /*
1466  * Numbers of normal and highmem page frames allocated for hibernation image
1467  * before suspending devices.
1468  */
1469 unsigned int alloc_normal, alloc_highmem;
1470 /*
1471  * Memory bitmap used for marking saveable pages (during hibernation) or
1472  * hibernation image pages (during restore)
1473  */
1474 static struct memory_bitmap orig_bm;
1475 /*
1476  * Memory bitmap used during hibernation for marking allocated page frames that
1477  * will contain copies of saveable pages.  During restore it is initially used
1478  * for marking hibernation image pages, but then the set bits from it are
1479  * duplicated in @orig_bm and it is released.  On highmem systems it is next
1480  * used for marking "safe" highmem pages, but it has to be reinitialized for
1481  * this purpose.
1482  */
1483 static struct memory_bitmap copy_bm;
1484
1485 /**
1486  *      swsusp_free - free pages allocated for the suspend.
1487  *
1488  *      Suspend pages are allocated before the atomic copy is made, so we
1489  *      need to release them after the resume.
1490  */
1491
1492 void swsusp_free(void)
1493 {
1494         struct zone *zone;
1495         unsigned long pfn, max_zone_pfn;
1496
1497         for_each_populated_zone(zone) {
1498                 max_zone_pfn = zone_end_pfn(zone);
1499                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1500                         if (pfn_valid(pfn)) {
1501                                 struct page *page = pfn_to_page(pfn);
1502
1503                                 if (swsusp_page_is_forbidden(page) &&
1504                                     swsusp_page_is_free(page)) {
1505                                         swsusp_unset_page_forbidden(page);
1506                                         swsusp_unset_page_free(page);
1507                                         __free_page(page);
1508                                 }
1509                         }
1510         }
1511         nr_copy_pages = 0;
1512         nr_meta_pages = 0;
1513         restore_pblist = NULL;
1514         buffer = NULL;
1515         alloc_normal = 0;
1516         alloc_highmem = 0;
1517 }
1518
1519 /* Helper functions used for the shrinking of memory. */
1520
1521 #define GFP_IMAGE       (GFP_KERNEL | __GFP_NOWARN)
1522
1523 /**
1524  * preallocate_image_pages - Allocate a number of pages for hibernation image
1525  * @nr_pages: Number of page frames to allocate.
1526  * @mask: GFP flags to use for the allocation.
1527  *
1528  * Return value: Number of page frames actually allocated
1529  */
1530 static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1531 {
1532         unsigned long nr_alloc = 0;
1533
1534         while (nr_pages > 0) {
1535                 struct page *page;
1536
1537                 page = alloc_image_page(mask);
1538                 if (!page)
1539                         break;
1540                 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1541                 if (PageHighMem(page))
1542                         alloc_highmem++;
1543                 else
1544                         alloc_normal++;
1545                 nr_pages--;
1546                 nr_alloc++;
1547         }
1548
1549         return nr_alloc;
1550 }
1551
1552 static unsigned long preallocate_image_memory(unsigned long nr_pages,
1553                                               unsigned long avail_normal)
1554 {
1555         unsigned long alloc;
1556
1557         if (avail_normal <= alloc_normal)
1558                 return 0;
1559
1560         alloc = avail_normal - alloc_normal;
1561         if (nr_pages < alloc)
1562                 alloc = nr_pages;
1563
1564         return preallocate_image_pages(alloc, GFP_IMAGE);
1565 }
1566
1567 #ifdef CONFIG_HIGHMEM
1568 static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1569 {
1570         return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1571 }
1572
1573 /**
1574  *  __fraction - Compute (an approximation of) x * (multiplier / base)
1575  */
1576 static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1577 {
1578         x *= multiplier;
1579         do_div(x, base);
1580         return (unsigned long)x;
1581 }
1582
1583 static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1584                                                 unsigned long highmem,
1585                                                 unsigned long total)
1586 {
1587         unsigned long alloc = __fraction(nr_pages, highmem, total);
1588
1589         return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1590 }
1591 #else /* CONFIG_HIGHMEM */
1592 static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1593 {
1594         return 0;
1595 }
1596
1597 static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1598                                                 unsigned long highmem,
1599                                                 unsigned long total)
1600 {
1601         return 0;
1602 }
1603 #endif /* CONFIG_HIGHMEM */
1604
1605 /**
1606  * free_unnecessary_pages - Release preallocated pages not needed for the image
1607  */
1608 static void free_unnecessary_pages(void)
1609 {
1610         unsigned long save, to_free_normal, to_free_highmem;
1611
1612         save = count_data_pages();
1613         if (alloc_normal >= save) {
1614                 to_free_normal = alloc_normal - save;
1615                 save = 0;
1616         } else {
1617                 to_free_normal = 0;
1618                 save -= alloc_normal;
1619         }
1620         save += count_highmem_pages();
1621         if (alloc_highmem >= save) {
1622                 to_free_highmem = alloc_highmem - save;
1623         } else {
1624                 to_free_highmem = 0;
1625                 save -= alloc_highmem;
1626                 if (to_free_normal > save)
1627                         to_free_normal -= save;
1628                 else
1629                         to_free_normal = 0;
1630         }
1631
1632         memory_bm_position_reset(&copy_bm);
1633
1634         while (to_free_normal > 0 || to_free_highmem > 0) {
1635                 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1636                 struct page *page = pfn_to_page(pfn);
1637
1638                 if (PageHighMem(page)) {
1639                         if (!to_free_highmem)
1640                                 continue;
1641                         to_free_highmem--;
1642                         alloc_highmem--;
1643                 } else {
1644                         if (!to_free_normal)
1645                                 continue;
1646                         to_free_normal--;
1647                         alloc_normal--;
1648                 }
1649                 memory_bm_clear_bit(&copy_bm, pfn);
1650                 swsusp_unset_page_forbidden(page);
1651                 swsusp_unset_page_free(page);
1652                 __free_page(page);
1653         }
1654 }
1655
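     /*
      * A rough trace of free_unnecessary_pages() with made-up numbers:
      * suppose 70000 normal and 15000 highmem page frames were
      * preallocated, and the image now needs 60000 normal plus 20000
      * highmem data pages.  The first test gives
      * to_free_normal = 70000 - 60000 = 10000 and save = 0; after adding
      * the highmem count, save = 20000 > alloc_highmem = 15000, so
      * to_free_highmem stays 0 and the remaining save = 5000 highmem pages
      * will have to be copied into normal page frames, which shrinks
      * to_free_normal to 5000.
      */
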
1656 /**
1657  * minimum_image_size - Estimate the minimum acceptable size of an image
1658  * @saveable: Number of saveable pages in the system.
1659  *
1660  * We want to avoid attempting to free too much memory too hard, so estimate the
1661  * minimum acceptable size of a hibernation image to use as the lower limit for
1662  * preallocating memory.
1663  *
1664  * We assume that the minimum image size should be proportional to
1665  *
1666  * [number of saveable pages] - [number of pages that can be freed in theory]
1667  *
1668  * where the second term is the sum of (1) reclaimable slab pages, (2) active
1669  * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
1670  * minus mapped file pages.
1671  */
1672 static unsigned long minimum_image_size(unsigned long saveable)
1673 {
1674         unsigned long size;
1675
1676         size = global_page_state(NR_SLAB_RECLAIMABLE)
1677                 + global_page_state(NR_ACTIVE_ANON)
1678                 + global_page_state(NR_INACTIVE_ANON)
1679                 + global_page_state(NR_ACTIVE_FILE)
1680                 + global_page_state(NR_INACTIVE_FILE)
1681                 - global_page_state(NR_FILE_MAPPED);
1682
1683         return saveable <= size ? 0 : saveable - size;
1684 }
1685
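     /*
      * For example (hypothetical counters): with 200000 saveable pages,
      * 30000 reclaimable slab, 40000 + 20000 anonymous, 50000 + 40000 file
      * and 10000 mapped file pages, minimum_image_size() estimates
      * 30000 + 40000 + 20000 + 50000 + 40000 - 10000 = 170000 freeable
      * pages, so the minimum acceptable image size is
      * 200000 - 170000 = 30000 pages.
      */
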
1686 /**
1687  * hibernate_preallocate_memory - Preallocate memory for hibernation image
1688  *
1689  * To create a hibernation image it is necessary to make a copy of every page
1690  * frame in use.  We also need a number of page frames to be free during
1691  * hibernation for allocations made while saving the image and for device
1692  * drivers, in case they need to allocate memory from their hibernation
1693  * callbacks (these two numbers are given by PAGES_FOR_IO, which is a rough
1694  * estimate, and reserved_size divided by PAGE_SIZE, which is tunable via
1695  * /sys/power/reserved_size, respectively).  To make this happen, we compute the
1696  * total number of available page frames and allocate at least
1697  *
1698  * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1699  *  + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1700  *
1701  * of them, which corresponds to the maximum size of a hibernation image.
1702  *
1703  * If image_size is set below the number following from the above formula,
1704  * the preallocation of memory is continued until the total number of saveable
1705  * pages in the system is below the requested image size or the minimum
1706  * acceptable image size returned by minimum_image_size(), whichever is greater.
1707  */
1708 int hibernate_preallocate_memory(void)
1709 {
1710         struct zone *zone;
1711         unsigned long saveable, size, max_size, count, highmem, pages = 0;
1712         unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1713         struct timeval start, stop;
1714         int error;
1715
1716         printk(KERN_INFO "PM: Preallocating image memory... ");
1717         do_gettimeofday(&start);
1718
1719         error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1720         if (error)
1721                 goto err_out;
1722
1723         error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1724         if (error)
1725                 goto err_out;
1726
1727         alloc_normal = 0;
1728         alloc_highmem = 0;
1729
1730         /* Count the number of saveable data pages. */
1731         save_highmem = count_highmem_pages();
1732         saveable = count_data_pages();
1733
1734         /*
1735          * Compute the total number of page frames we can use (count) and the
1736          * number of pages needed for image metadata (size).
1737          */
1738         count = saveable;
1739         saveable += save_highmem;
1740         highmem = save_highmem;
1741         size = 0;
1742         for_each_populated_zone(zone) {
1743                 size += snapshot_additional_pages(zone);
1744                 if (is_highmem(zone))
1745                         highmem += zone_page_state(zone, NR_FREE_PAGES);
1746                 else
1747                         count += zone_page_state(zone, NR_FREE_PAGES);
1748         }
1749         avail_normal = count;
1750         count += highmem;
1751         count -= totalreserve_pages;
1752
1753         /* Add number of pages required for page keys (s390 only). */
1754         size += page_key_additional_pages(saveable);
1755
1756         /* Compute the maximum number of saveable pages to leave in memory. */
1757         max_size = (count - (size + PAGES_FOR_IO)) / 2
1758                         - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1759         /* Compute the desired number of image pages specified by image_size. */
1760         size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1761         if (size > max_size)
1762                 size = max_size;
1763         /*
1764          * If the desired number of image pages is at least as large as the
1765          * current number of saveable pages in memory, allocate page frames for
1766          * the image and we're done.
1767          */
1768         if (size >= saveable) {
1769                 pages = preallocate_image_highmem(save_highmem);
1770                 pages += preallocate_image_memory(saveable - pages, avail_normal);
1771                 goto out;
1772         }
1773
1774         /* Estimate the minimum size of the image. */
1775         pages = minimum_image_size(saveable);
1776         /*
1777          * To avoid excessive pressure on the normal zone, leave room in it to
1778          * accommodate an image of the minimum size (unless it's already too
1779          * small, in which case don't preallocate pages from it at all).
1780          */
1781         if (avail_normal > pages)
1782                 avail_normal -= pages;
1783         else
1784                 avail_normal = 0;
1785         if (size < pages)
1786                 size = min_t(unsigned long, pages, max_size);
1787
1788         /*
1789          * Let the memory management subsystem know that we're going to need a
1790          * large number of page frames to allocate and make it free some memory.
1791          * NOTE: If this is not done, performance will be hurt badly in some
1792          * test cases.
1793          */
1794         shrink_all_memory(saveable - size);
1795
1796         /*
1797          * The number of saveable pages in memory was too high, so apply some
1798          * pressure to decrease it.  First, make room for the largest possible
1799          * image and fail if that doesn't work.  Next, try to decrease the size
1800          * of the image as much as indicated by 'size' using allocations from
1801          * highmem and non-highmem zones separately.
1802          */
1803         pages_highmem = preallocate_image_highmem(highmem / 2);
1804         alloc = count - max_size;
1805         if (alloc > pages_highmem)
1806                 alloc -= pages_highmem;
1807         else
1808                 alloc = 0;
1809         pages = preallocate_image_memory(alloc, avail_normal);
1810         if (pages < alloc) {
1811                 /* We have exhausted non-highmem pages, try highmem. */
1812                 alloc -= pages;
1813                 pages += pages_highmem;
1814                 pages_highmem = preallocate_image_highmem(alloc);
1815                 if (pages_highmem < alloc)
1816                         goto err_out;
1817                 pages += pages_highmem;
1818                 /*
1819                  * size is the desired number of saveable pages to leave in
1820                  * memory, so try to preallocate (all memory - size) pages.
1821                  */
1822                 alloc = (count - pages) - size;
1823                 pages += preallocate_image_highmem(alloc);
1824         } else {
1825                 /*
1826                  * There are approximately max_size saveable pages at this point
1827                  * and we want to reduce this number down to size.
1828                  */
1829                 alloc = max_size - size;
1830                 size = preallocate_highmem_fraction(alloc, highmem, count);
1831                 pages_highmem += size;
1832                 alloc -= size;
1833                 size = preallocate_image_memory(alloc, avail_normal);
1834                 pages_highmem += preallocate_image_highmem(alloc - size);
1835                 pages += pages_highmem + size;
1836         }
1837
1838         /*
1839          * We only need as many page frames for the image as there are saveable
1840          * pages in memory, but we have allocated more.  Release the excessive
1841          * ones now.
1842          */
1843         free_unnecessary_pages();
1844
1845  out:
1846         do_gettimeofday(&stop);
1847         printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1848         swsusp_show_speed(&start, &stop, pages, "Allocated");
1849
1850         return 0;
1851
1852  err_out:
1853         printk(KERN_CONT "\n");
1854         swsusp_free();
1855         return -ENOMEM;
1856 }
1857
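     /*
      * A worked example of the sizing in hibernate_preallocate_memory(),
      * with made-up numbers: assume 1000000 usable page frames (count),
      * 2000 metadata pages (size), PAGES_FOR_IO of 1024 and reserved_size
      * of 1 MB, i.e. 256 extra page frames with 4 KB pages.  Then
      *
      *   max_size = (1000000 - (2000 + 1024)) / 2 - 2 * 256 = 497976
      *
      * pages may be left in memory, so at least 1000000 - 497976 = 502024
      * page frames are preallocated, which matches the formula above:
      * (1000000 + 1024 + 2000) / 2 + 2 * 256 = 502024.  If image_size
      * requests, say, 200000 pages and that is above minimum_image_size(),
      * preallocation continues until roughly 200000 saveable pages remain.
      */
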
1858 #ifdef CONFIG_HIGHMEM
1859 /**
1860  *      count_pages_for_highmem - compute the number of non-highmem pages
1861  *      that will be necessary for creating copies of highmem pages.
1862  */
1863
1864 static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1865 {
1866         unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1867
1868         if (free_highmem >= nr_highmem)
1869                 nr_highmem = 0;
1870         else
1871                 nr_highmem -= free_highmem;
1872
1873         return nr_highmem;
1874 }
1875 #else
1876 static unsigned int
1877 count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1878 #endif /* CONFIG_HIGHMEM */
1879
1880 /**
1881  *      enough_free_mem - Make sure we have enough free memory for the
1882  *      snapshot image.
1883  */
1884
1885 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1886 {
1887         struct zone *zone;
1888         unsigned int free = alloc_normal;
1889
1890         for_each_populated_zone(zone)
1891                 if (!is_highmem(zone))
1892                         free += zone_page_state(zone, NR_FREE_PAGES);
1893
1894         nr_pages += count_pages_for_highmem(nr_highmem);
1895         pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1896                 nr_pages, PAGES_FOR_IO, free);
1897
1898         return free > nr_pages + PAGES_FOR_IO;
1899 }
1900
1901 #ifdef CONFIG_HIGHMEM
1902 /**
1903  *      get_highmem_buffer - if there are some highmem pages in the suspend
1904  *      image, we may need the buffer to copy them and/or load their data.
1905  */
1906
1907 static inline int get_highmem_buffer(int safe_needed)
1908 {
1909         buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
1910         return buffer ? 0 : -ENOMEM;
1911 }
1912
1913 /**
1914  *      alloc_highmem_pages - allocate some highmem pages for the image.
1915  *      Try to allocate as many pages as needed, but if the number of free
1916  *      highmem pages is smaller than that, allocate them all.
1917  */
1918
1919 static inline unsigned int
1920 alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1921 {
1922         unsigned int to_alloc = count_free_highmem_pages();
1923
1924         if (to_alloc > nr_highmem)
1925                 to_alloc = nr_highmem;
1926
1927         nr_highmem -= to_alloc;
1928         while (to_alloc-- > 0) {
1929                 struct page *page;
1930
1931                 page = alloc_image_page(__GFP_HIGHMEM);
1932                 memory_bm_set_bit(bm, page_to_pfn(page));
1933         }
1934         return nr_highmem;
1935 }
1936 #else
1937 static inline int get_highmem_buffer(int safe_needed) { return 0; }
1938
1939 static inline unsigned int
1940 alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1941 #endif /* CONFIG_HIGHMEM */
1942
1943 /**
1944  *      swsusp_alloc - allocate memory for the suspend image
1945  *
1946  *      We first try to allocate as many highmem pages as there are
1947  *      saveable highmem pages in the system.  If that fails, we allocate
1948  *      non-highmem pages for the copies of the remaining highmem ones.
1949  *
1950  *      In this approach it is likely that the copies of highmem pages will
1951  *      also be located in the high memory, because of the way in which
1952  *      copy_data_pages() works.
1953  */
1954
1955 static int
1956 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1957                 unsigned int nr_pages, unsigned int nr_highmem)
1958 {
1959         if (nr_highmem > 0) {
1960                 if (get_highmem_buffer(PG_ANY))
1961                         goto err_out;
1962                 if (nr_highmem > alloc_highmem) {
1963                         nr_highmem -= alloc_highmem;
1964                         nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1965                 }
1966         }
1967         if (nr_pages > alloc_normal) {
1968                 nr_pages -= alloc_normal;
1969                 while (nr_pages-- > 0) {
1970                         struct page *page;
1971
1972                         page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1973                         if (!page)
1974                                 goto err_out;
1975                         memory_bm_set_bit(copy_bm, page_to_pfn(page));
1976                 }
1977         }
1978
1979         return 0;
1980
1981  err_out:
1982         swsusp_free();
1983         return -ENOMEM;
1984 }
1985
1986 asmlinkage __visible int swsusp_save(void)
1987 {
1988         unsigned int nr_pages, nr_highmem;
1989
1990         printk(KERN_INFO "PM: Creating hibernation image:\n");
1991
1992         drain_local_pages(NULL);
1993         nr_pages = count_data_pages();
1994         nr_highmem = count_highmem_pages();
1995         printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
1996
1997         if (!enough_free_mem(nr_pages, nr_highmem)) {
1998                 printk(KERN_ERR "PM: Not enough free memory\n");
1999                 return -ENOMEM;
2000         }
2001
2002         if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
2003                 printk(KERN_ERR "PM: Memory allocation failed\n");
2004                 return -ENOMEM;
2005         }
2006
2007         /* During the allocation of the suspend pagedir, new cold pages
2008          * may appear.  Kill them.
2009          */
2010         drain_local_pages(NULL);
2011         copy_data_pages(&copy_bm, &orig_bm);
2012
2013         /*
2014          * End of critical section. From now on, we can write to memory,
2015          * but we should not touch disk. This especially means we must _not_
2016          * touch swap space! Except we must write out our image of course.
2017          */
2018
2019         nr_pages += nr_highmem;
2020         nr_copy_pages = nr_pages;
2021         nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
2022
2023         printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
2024                 nr_pages);
2025
2026         return 0;
2027 }
2028
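     /*
      * To put the metadata overhead computed at the end of swsusp_save()
      * in perspective (made-up numbers): each meta page holds
      * PAGE_SIZE / sizeof(long) pfns, i.e. 512 on a 64-bit system with
      * 4 KB pages, so copying 100000 pages requires
      * DIV_ROUND_UP(100000 * 8, 4096) = 196 extra meta pages.
      */
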
2029 #ifndef CONFIG_ARCH_HIBERNATION_HEADER
2030 static int init_header_complete(struct swsusp_info *info)
2031 {
2032         memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
2033         info->version_code = LINUX_VERSION_CODE;
2034         return 0;
2035 }
2036
2037 static char *check_image_kernel(struct swsusp_info *info)
2038 {
2039         if (info->version_code != LINUX_VERSION_CODE)
2040                 return "kernel version";
2041         if (strcmp(info->uts.sysname, init_utsname()->sysname))
2042                 return "system type";
2043         if (strcmp(info->uts.release, init_utsname()->release))
2044                 return "kernel release";
2045         if (strcmp(info->uts.version, init_utsname()->version))
2046                 return "version";
2047         if (strcmp(info->uts.machine, init_utsname()->machine))
2048                 return "machine";
2049         return NULL;
2050 }
2051 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
2052
2053 unsigned long snapshot_get_image_size(void)
2054 {
2055         return nr_copy_pages + nr_meta_pages + 1;
2056 }
2057
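     /*
      * The "+ 1" accounts for the header page, so the image produced by
      * snapshot_read_next() consists of one swsusp_info header page,
      * followed by nr_meta_pages pages of packed pfns and then
      * nr_copy_pages pages of data.
      */
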
2058 static int init_header(struct swsusp_info *info)
2059 {
2060         memset(info, 0, sizeof(struct swsusp_info));
2061         info->num_physpages = get_num_physpages();
2062         info->image_pages = nr_copy_pages;
2063         info->pages = snapshot_get_image_size();
2064         info->size = info->pages;
2065         info->size <<= PAGE_SHIFT;
2066         return init_header_complete(info);
2067 }
2068
2069 /**
2070  *      pack_pfns - store the pfns corresponding to the set bits found in the
2071  *      bitmap @bm in the array @buf[] (1 page at a time)
2072  */
2073
2074 static inline void
2075 pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
2076 {
2077         int j;
2078
2079         for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
2080                 buf[j] = memory_bm_next_pfn(bm);
2081                 if (unlikely(buf[j] == BM_END_OF_MAP))
2082                         break;
2083                 /* Save page key for data page (s390 only). */
2084                 page_key_read(buf + j);
2085         }
2086 }
2087
2088 /**
2089  *      snapshot_read_next - used for reading the system memory snapshot.
2090  *
2091  *      On the first call, @handle should point to a zeroed
2092  *      snapshot_handle structure.  The structure gets updated and a pointer
2093  *      to it should be passed to this function on every subsequent call.
2094  *
2095  *      On success the function returns a positive number.  Then, the caller
2096  *      is allowed to read up to the returned number of bytes from the memory
2097  *      location computed by the data_of() macro.
2098  *
2099  *      The function returns 0 to indicate the end of data stream condition,
2100  *      and a negative number is returned on error.  In such cases the
2101  *      structure pointed to by @handle is not updated and should not be used
2102  *      any more.
2103  */
2104
2105 int snapshot_read_next(struct snapshot_handle *handle)
2106 {
2107         if (handle->cur > nr_meta_pages + nr_copy_pages)
2108                 return 0;
2109
2110         if (!buffer) {
2111                 /* This makes the buffer be freed by swsusp_free() */
2112                 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
2113                 if (!buffer)
2114                         return -ENOMEM;
2115         }
2116         if (!handle->cur) {
2117                 int error;
2118
2119                 error = init_header((struct swsusp_info *)buffer);
2120                 if (error)
2121                         return error;
2122                 handle->buffer = buffer;
2123                 memory_bm_position_reset(&orig_bm);
2124                 memory_bm_position_reset(&copy_bm);
2125         } else if (handle->cur <= nr_meta_pages) {
2126                 clear_page(buffer);
2127                 pack_pfns(buffer, &orig_bm);
2128         } else {
2129                 struct page *page;
2130
2131                 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
2132                 if (PageHighMem(page)) {
2133                         /* Highmem pages are copied to the buffer,
2134                          * because we can't return with a kmapped
2135                          * highmem page (we may not be called again).
2136                          */
2137                         void *kaddr;
2138
2139                         kaddr = kmap_atomic(page);
2140                         copy_page(buffer, kaddr);
2141                         kunmap_atomic(kaddr);
2142                         handle->buffer = buffer;
2143                 } else {
2144                         handle->buffer = page_address(page);
2145                 }
2146         }
2147         handle->cur++;
2148         return PAGE_SIZE;
2149 }
2150
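     /*
      * A minimal sketch of a caller driving the interface above.  The
      * write_page_to_swap() helper is hypothetical and merely stands in
      * for whatever actually consumes the data; see kernel/power/swap.c
      * for the real user of snapshot_read_next().
      */
     int write_page_to_swap(void *buf);     /* hypothetical consumer */

     static int __maybe_unused save_image_sketch(void)
     {
             struct snapshot_handle handle;
             int ret;

             memset(&handle, 0, sizeof(struct snapshot_handle));
             while ((ret = snapshot_read_next(&handle)) > 0) {
                     /* data_of(handle) points to ret readable bytes (one page) */
                     int error = write_page_to_swap(data_of(handle));

                     if (error)
                             return error;
             }
             return ret;     /* 0 at the end of the data stream, negative on error */
     }
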
2151 /**
2152  *      mark_unsafe_pages - mark the pages that cannot be used for storing
2153  *      the image during resume, because they conflict with the pages that
2154  *      had been used before suspend
2155  */
2156
2157 static int mark_unsafe_pages(struct memory_bitmap *bm)
2158 {
2159         struct zone *zone;
2160         unsigned long pfn, max_zone_pfn;
2161
2162         /* Clear page flags */
2163         for_each_populated_zone(zone) {
2164                 max_zone_pfn = zone_end_pfn(zone);
2165                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2166                         if (pfn_valid(pfn))
2167                                 swsusp_unset_page_free(pfn_to_page(pfn));
2168         }
2169
2170         /* Mark pages that correspond to the "original" pfns as "unsafe" */
2171         memory_bm_position_reset(bm);
2172         do {
2173                 pfn = memory_bm_next_pfn(bm);
2174                 if (likely(pfn != BM_END_OF_MAP)) {
2175                         if (likely(pfn_valid(pfn)))
2176                                 swsusp_set_page_free(pfn_to_page(pfn));
2177                         else
2178                                 return -EFAULT;
2179                 }
2180         } while (pfn != BM_END_OF_MAP);
2181
2182         allocated_unsafe_pages = 0;
2183
2184         return 0;
2185 }
2186
2187 static void
2188 duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
2189 {
2190         unsigned long pfn;
2191
2192         memory_bm_position_reset(src);
2193         pfn = memory_bm_next_pfn(src);
2194         while (pfn != BM_END_OF_MAP) {
2195                 memory_bm_set_bit(dst, pfn);
2196                 pfn = memory_bm_next_pfn(src);
2197         }
2198 }
2199
2200 static int check_header(struct swsusp_info *info)
2201 {
2202         char *reason;
2203
2204         reason = check_image_kernel(info);
2205         if (!reason && info->num_physpages != get_num_physpages())
2206                 reason = "memory size";
2207         if (reason) {
2208                 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
2209                 return -EPERM;
2210         }
2211         return 0;
2212 }
2213
2214 /**
2215  *      load_header - check the image header and copy data from it
2216  */
2217
2218 static int
2219 load_header(struct swsusp_info *info)
2220 {
2221         int error;
2222
2223         restore_pblist = NULL;
2224         error = check_header(info);
2225         if (!error) {
2226                 nr_copy_pages = info->image_pages;
2227                 nr_meta_pages = info->pages - info->image_pages - 1;
2228         }
2229         return error;
2230 }
2231
2232 /**
2233  *      unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
2234  *      the corresponding bit in the memory bitmap @bm
2235  */
2236 static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2237 {
2238         int j;
2239
2240         for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
2241                 if (unlikely(buf[j] == BM_END_OF_MAP))
2242                         break;
2243
2244                 /* Extract and buffer page key for data page (s390 only). */
2245                 page_key_memorize(buf + j);
2246
2247                 if (memory_bm_pfn_present(bm, buf[j]))
2248                         memory_bm_set_bit(bm, buf[j]);
2249                 else
2250                         return -EFAULT;
2251         }
2252
2253         return 0;
2254 }
2255
2256 /* List of "safe" pages that may be used to store data loaded from the suspend
2257  * image
2258  */
2259 static struct linked_page *safe_pages_list;
2260
2261 #ifdef CONFIG_HIGHMEM
2262 /* struct highmem_pbe is used for creating the list of highmem pages that
2263  * should be restored atomically during the resume from disk, because the page
2264  * frames they have occupied before the suspend are in use.
2265  */
2266 struct highmem_pbe {
2267         struct page *copy_page; /* data is here now */
2268         struct page *orig_page; /* data was here before the suspend */
2269         struct highmem_pbe *next;
2270 };
2271
2272 /* List of highmem PBEs needed for restoring the highmem pages that were
2273  * allocated before the suspend and included in the suspend image, but have
2274  * also been allocated by the "resume" kernel, so their contents cannot be
2275  * written directly to their "original" page frames.
2276  */
2277 static struct highmem_pbe *highmem_pblist;
2278
2279 /**
2280  *      count_highmem_image_pages - compute the number of highmem pages in the
2281  *      suspend image.  The bits in the memory bitmap @bm that correspond to the
2282  *      image pages are assumed to be set.
2283  */
2284
2285 static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
2286 {
2287         unsigned long pfn;
2288         unsigned int cnt = 0;
2289
2290         memory_bm_position_reset(bm);
2291         pfn = memory_bm_next_pfn(bm);
2292         while (pfn != BM_END_OF_MAP) {
2293                 if (PageHighMem(pfn_to_page(pfn)))
2294                         cnt++;
2295
2296                 pfn = memory_bm_next_pfn(bm);
2297         }
2298         return cnt;
2299 }
2300
2301 /**
2302  *      prepare_highmem_image - try to allocate as many highmem pages as
2303  *      there are highmem image pages (@nr_highmem_p points to the variable
2304  *      containing the number of highmem image pages).  The pages that are
2305  *      "safe" (ie. will not be overwritten when the suspend image is
2306  *      restored) have the corresponding bits set in @bm (it must be
2307  *      uninitialized).
2308  *
2309  *      NOTE: This function should not be called if there are no highmem
2310  *      image pages.
2311  */
2312
2313 static unsigned int safe_highmem_pages;
2314
2315 static struct memory_bitmap *safe_highmem_bm;
2316
2317 static int
2318 prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
2319 {
2320         unsigned int to_alloc;
2321
2322         if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
2323                 return -ENOMEM;
2324
2325         if (get_highmem_buffer(PG_SAFE))
2326                 return -ENOMEM;
2327
2328         to_alloc = count_free_highmem_pages();
2329         if (to_alloc > *nr_highmem_p)
2330                 to_alloc = *nr_highmem_p;
2331         else
2332                 *nr_highmem_p = to_alloc;
2333
2334         safe_highmem_pages = 0;
2335         while (to_alloc-- > 0) {
2336                 struct page *page;
2337
2338                 page = alloc_page(__GFP_HIGHMEM);
2339                 if (!swsusp_page_is_free(page)) {
2340                         /* The page is "safe", set its bit in the bitmap */
2341                         memory_bm_set_bit(bm, page_to_pfn(page));
2342                         safe_highmem_pages++;
2343                 }
2344                 /* Mark the page as allocated */
2345                 swsusp_set_page_forbidden(page);
2346                 swsusp_set_page_free(page);
2347         }
2348         memory_bm_position_reset(bm);
2349         safe_highmem_bm = bm;
2350         return 0;
2351 }
2352
2353 /**
2354  *      get_highmem_page_buffer - for a given highmem image page, find the buffer
2355  *      that snapshot_write_next() should set for its caller to write to.
2356  *
2357  *      If the page is to be saved to its "original" page frame or a copy of
2358  *      the page is to be made in the highmem, @buffer is returned.  Otherwise,
2359  *      the copy of the page is to be made in normal memory, so the address of
2360  *      the copy is returned.
2361  *
2362  *      If @buffer is returned, the caller of snapshot_write_next() will write
2363  *      the page's contents to @buffer, so they will have to be copied to the
2364  *      right location on the next call to snapshot_write_next() and it is done
2365  *      with the help of copy_last_highmem_page().  For this purpose, if
2366  *      @buffer is returned, @last_highmem_page is set to the page to which
2367  *      the data will have to be copied from @buffer.
2368  */
2369
2370 static struct page *last_highmem_page;
2371
2372 static void *
2373 get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2374 {
2375         struct highmem_pbe *pbe;
2376         void *kaddr;
2377
2378         if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
2379                 /* We have allocated the "original" page frame and we can
2380                  * use it directly to store the loaded page.
2381                  */
2382                 last_highmem_page = page;
2383                 return buffer;
2384         }
2385         /* The "original" page frame has not been allocated and we have to
2386          * use a "safe" page frame to store the loaded page.
2387          */
2388         pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
2389         if (!pbe) {
2390                 swsusp_free();
2391                 return ERR_PTR(-ENOMEM);
2392         }
2393         pbe->orig_page = page;
2394         if (safe_highmem_pages > 0) {
2395                 struct page *tmp;
2396
2397                 /* Copy of the page will be stored in high memory */
2398                 kaddr = buffer;
2399                 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
2400                 safe_highmem_pages--;
2401                 last_highmem_page = tmp;
2402                 pbe->copy_page = tmp;
2403         } else {
2404                 /* Copy of the page will be stored in normal memory */
2405                 kaddr = safe_pages_list;
2406                 safe_pages_list = safe_pages_list->next;
2407                 pbe->copy_page = virt_to_page(kaddr);
2408         }
2409         pbe->next = highmem_pblist;
2410         highmem_pblist = pbe;
2411         return kaddr;
2412 }
2413
2414 /**
2415  *      copy_last_highmem_page - copy the contents of a highmem image from
2416  *      @buffer, where the caller of snapshot_write_next() has placed them,
2417  *      to the right location represented by @last_highmem_page.
2418  */
2419
2420 static void copy_last_highmem_page(void)
2421 {
2422         if (last_highmem_page) {
2423                 void *dst;
2424
2425                 dst = kmap_atomic(last_highmem_page);
2426                 copy_page(dst, buffer);
2427                 kunmap_atomic(dst);
2428                 last_highmem_page = NULL;
2429         }
2430 }
2431
2432 static inline int last_highmem_page_copied(void)
2433 {
2434         return !last_highmem_page;
2435 }
2436
2437 static inline void free_highmem_data(void)
2438 {
2439         if (safe_highmem_bm)
2440                 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
2441
2442         if (buffer)
2443                 free_image_page(buffer, PG_UNSAFE_CLEAR);
2444 }
2445 #else
2446 static inline int get_safe_write_buffer(void) { return 0; }
2447
2448 static unsigned int
2449 count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2450
2451 static inline int
2452 prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
2453 {
2454         return 0;
2455 }
2456
2457 static inline void *
2458 get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2459 {
2460         return ERR_PTR(-EINVAL);
2461 }
2462
2463 static inline void copy_last_highmem_page(void) {}
2464 static inline int last_highmem_page_copied(void) { return 1; }
2465 static inline void free_highmem_data(void) {}
2466 #endif /* CONFIG_HIGHMEM */
2467
2468 /**
2469  *      prepare_image - use the memory bitmap @bm to mark the pages that will
2470  *      be overwritten in the process of restoring the system memory state
2471  *      from the suspend image ("unsafe" pages) and allocate memory for the
2472  *      image.
2473  *
2474  *      The idea is to allocate a new memory bitmap first and then allocate
2475  *      as many pages as needed for the image data, but not to assign these
2476  *      pages to specific tasks initially.  Instead, we just mark them as
2477  *      allocated and create a lists of "safe" pages that will be used
2478  *      later.  On systems with high memory a list of "safe" highmem pages is
2479  *      also created.
2480  */
2481
2482 #define PBES_PER_LINKED_PAGE    (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
2483
2484 static int
2485 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2486 {
2487         unsigned int nr_pages, nr_highmem;
2488         struct linked_page *sp_list, *lp;
2489         int error;
2490
2491         /* If there is no highmem, the buffer will not be necessary */
2492         free_image_page(buffer, PG_UNSAFE_CLEAR);
2493         buffer = NULL;
2494
2495         nr_highmem = count_highmem_image_pages(bm);
2496         error = mark_unsafe_pages(bm);
2497         if (error)
2498                 goto Free;
2499
2500         error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
2501         if (error)
2502                 goto Free;
2503
2504         duplicate_memory_bitmap(new_bm, bm);
2505         memory_bm_free(bm, PG_UNSAFE_KEEP);
2506         if (nr_highmem > 0) {
2507                 error = prepare_highmem_image(bm, &nr_highmem);
2508                 if (error)
2509                         goto Free;
2510         }
2511         /* Reserve some safe pages for potential later use.
2512          *
2513          * NOTE: This way we make sure there will be enough safe pages for the
2514          * chain_alloc() in get_buffer().  It is a bit wasteful, but
2515          * nr_copy_pages cannot be greater than 50% of the memory anyway.
2516          */
2517         sp_list = NULL;
2518         /* nr_copy_pages cannot be less than allocated_unsafe_pages */
2519         nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2520         nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
2521         while (nr_pages > 0) {
2522                 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
2523                 if (!lp) {
2524                         error = -ENOMEM;
2525                         goto Free;
2526                 }
2527                 lp->next = sp_list;
2528                 sp_list = lp;
2529                 nr_pages--;
2530         }
2531         /* Preallocate memory for the image */
2532         safe_pages_list = NULL;
2533         nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2534         while (nr_pages > 0) {
2535                 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
2536                 if (!lp) {
2537                         error = -ENOMEM;
2538                         goto Free;
2539                 }
2540                 if (!swsusp_page_is_free(virt_to_page(lp))) {
2541                         /* The page is "safe", add it to the list */
2542                         lp->next = safe_pages_list;
2543                         safe_pages_list = lp;
2544                 }
2545                 /* Mark the page as allocated */
2546                 swsusp_set_page_forbidden(virt_to_page(lp));
2547                 swsusp_set_page_free(virt_to_page(lp));
2548                 nr_pages--;
2549         }
2550         /* Free the reserved safe pages so that chain_alloc() can use them */
2551         while (sp_list) {
2552                 lp = sp_list->next;
2553                 free_image_page(sp_list, PG_UNSAFE_CLEAR);
2554                 sp_list = lp;
2555         }
2556         return 0;
2557
2558  Free:
2559         swsusp_free();
2560         return error;
2561 }
2562
2563 /**
2564  *      get_buffer - compute the address that snapshot_write_next() should
2565  *      set for its caller to write to.
2566  */
2567
2568 static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2569 {
2570         struct pbe *pbe;
2571         struct page *page;
2572         unsigned long pfn = memory_bm_next_pfn(bm);
2573
2574         if (pfn == BM_END_OF_MAP)
2575                 return ERR_PTR(-EFAULT);
2576
2577         page = pfn_to_page(pfn);
2578         if (PageHighMem(page))
2579                 return get_highmem_page_buffer(page, ca);
2580
2581         if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
2582                 /* We have allocated the "original" page frame and we can
2583                  * use it directly to store the loaded page.
2584                  */
2585                 return page_address(page);
2586
2587         /* The "original" page frame has not been allocated and we have to
2588          * use a "safe" page frame to store the loaded page.
2589          */
2590         pbe = chain_alloc(ca, sizeof(struct pbe));
2591         if (!pbe) {
2592                 swsusp_free();
2593                 return ERR_PTR(-ENOMEM);
2594         }
2595         pbe->orig_address = page_address(page);
2596         pbe->address = safe_pages_list;
2597         safe_pages_list = safe_pages_list->next;
2598         pbe->next = restore_pblist;
2599         restore_pblist = pbe;
2600         return pbe->address;
2601 }
2602
2603 /**
2604  *      snapshot_write_next - used for writing the system memory snapshot.
2605  *
2606  *      On the first call, @handle should point to a zeroed
2607  *      snapshot_handle structure.  The structure gets updated and a pointer
2608  *      to it should be passed to this function on every subsequent call.
2609  *
2610  *      On success the function returns a positive number.  Then, the caller
2611  *      is allowed to write up to the returned number of bytes to the memory
2612  *      location computed by the data_of() macro.
2613  *
2614  *      The function returns 0 to indicate the "end of file" condition,
2615  *      and a negative number is returned on error.  In such cases the
2616  *      structure pointed to by @handle is not updated and should not be used
2617  *      any more.
2618  */
2619
2620 int snapshot_write_next(struct snapshot_handle *handle)
2621 {
2622         static struct chain_allocator ca;
2623         int error = 0;
2624
2625         /* Check if we have already loaded the entire image */
2626         if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2627                 return 0;
2628
2629         handle->sync_read = 1;
2630
2631         if (!handle->cur) {
2632                 if (!buffer)
2633                         /* This makes the buffer be freed by swsusp_free() */
2634                         buffer = get_image_page(GFP_ATOMIC, PG_ANY);
2635
2636                 if (!buffer)
2637                         return -ENOMEM;
2638
2639                 handle->buffer = buffer;
2640         } else if (handle->cur == 1) {
2641                 error = load_header(buffer);
2642                 if (error)
2643                         return error;
2644
2645                 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2646                 if (error)
2647                         return error;
2648
2649                 /* Allocate buffer for page keys. */
2650                 error = page_key_alloc(nr_copy_pages);
2651                 if (error)
2652                         return error;
2653
2654         } else if (handle->cur <= nr_meta_pages + 1) {
2655                 error = unpack_orig_pfns(buffer, &copy_bm);
2656                 if (error)
2657                         return error;
2658
2659                 if (handle->cur == nr_meta_pages + 1) {
2660                         error = prepare_image(&orig_bm, &copy_bm);
2661                         if (error)
2662                                 return error;
2663
2664                         chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2665                         memory_bm_position_reset(&orig_bm);
2666                         restore_pblist = NULL;
2667                         handle->buffer = get_buffer(&orig_bm, &ca);
2668                         handle->sync_read = 0;
2669                         if (IS_ERR(handle->buffer))
2670                                 return PTR_ERR(handle->buffer);
2671                 }
2672         } else {
2673                 copy_last_highmem_page();
2674                 /* Restore page key for data page (s390 only). */
2675                 page_key_write(handle->buffer);
2676                 handle->buffer = get_buffer(&orig_bm, &ca);
2677                 if (IS_ERR(handle->buffer))
2678                         return PTR_ERR(handle->buffer);
2679                 if (handle->buffer != buffer)
2680                         handle->sync_read = 0;
2681         }
2682         handle->cur++;
2683         return PAGE_SIZE;
2684 }
2685
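     /*
      * The restore-side counterpart, sketched with a hypothetical
      * read_page_from_swap() standing in for the real data source; again,
      * see kernel/power/swap.c for the actual user of this interface.
      */
     int read_page_from_swap(void *buf);     /* hypothetical producer */

     static int __maybe_unused load_image_sketch(void)
     {
             struct snapshot_handle handle;
             int ret;

             memset(&handle, 0, sizeof(struct snapshot_handle));
             while ((ret = snapshot_write_next(&handle)) > 0) {
                     /* data_of(handle) points to ret writable bytes (one page) */
                     int error = read_page_from_swap(data_of(handle));

                     if (error)
                             return error;
             }
             if (ret)
                     return ret;
             snapshot_write_finalize(&handle);
             return snapshot_image_loaded(&handle) ? 0 : -ENODATA;
     }
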
2686 /**
2687  *      snapshot_write_finalize - must be called after the last call to
2688  *      snapshot_write_next() in case the last page in the image happens
2689  *      to be a highmem page and its contents should be stored in
2690  *      highmem.  Additionally, it releases the memory that will not be
2691  *      used any more.
2692  */
2693
2694 void snapshot_write_finalize(struct snapshot_handle *handle)
2695 {
2696         copy_last_highmem_page();
2697         /* Restore page key for data page (s390 only). */
2698         page_key_write(handle->buffer);
2699         page_key_free();
2700         /* Free only if we have loaded the image entirely */
2701         if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2702                 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2703                 free_highmem_data();
2704         }
2705 }
2706
2707 int snapshot_image_loaded(struct snapshot_handle *handle)
2708 {
2709         return !(!nr_copy_pages || !last_highmem_page_copied() ||
2710                         handle->cur <= nr_meta_pages + nr_copy_pages);
2711 }
2712
2713 #ifdef CONFIG_HIGHMEM
2714 /* Assumes that @buf is ready and points to a "safe" page */
2715 static inline void
2716 swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2717 {
2718         void *kaddr1, *kaddr2;
2719
2720         kaddr1 = kmap_atomic(p1);
2721         kaddr2 = kmap_atomic(p2);
2722         copy_page(buf, kaddr1);
2723         copy_page(kaddr1, kaddr2);
2724         copy_page(kaddr2, buf);
2725         kunmap_atomic(kaddr2);
2726         kunmap_atomic(kaddr1);
2727 }
2728
2729 /**
2730  *      restore_highmem - for each highmem page that was allocated before
2731  *      the suspend and included in the suspend image, and also has been
2732  *      allocated by the "resume" kernel, swap its current (ie. "before
2733  *      resume") contents with the previous (ie. "before suspend") one.
2734  *
2735  *      If the resume eventually fails, we can call this function once
2736  *      again and restore the "before resume" highmem state.
2737  */
2738
2739 int restore_highmem(void)
2740 {
2741         struct highmem_pbe *pbe = highmem_pblist;
2742         void *buf;
2743
2744         if (!pbe)
2745                 return 0;
2746
2747         buf = get_image_page(GFP_ATOMIC, PG_SAFE);
2748         if (!buf)
2749                 return -ENOMEM;
2750
2751         while (pbe) {
2752                 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
2753                 pbe = pbe->next;
2754         }
2755         free_image_page(buf, PG_UNSAFE_CLEAR);
2756         return 0;
2757 }
2758 #endif /* CONFIG_HIGHMEM */