mm: page_alloc: cache the last node whose dirty limit is reached
[cascardo/linux.git] mm/page_alloc.c
index 8b3e134..7427e0e 100644
@@ -63,6 +63,7 @@
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
+#include <linux/memcontrol.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -294,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
        return false;
 }
 
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
-       if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
-               return true;
-
-       return false;
-}
-
 /*
  * Returns false when the remaining initialisation should be deferred until
  * later in the boot cycle when it can be parallelised.
@@ -341,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
        return false;
 }
 
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
-       return false;
-}
-
 static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
@@ -1006,6 +994,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
 
                VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
+               if (compound)
+                       ClearPageDoubleMap(page);
                for (i = 1; i < (1 << order); i++) {
                        if (compound)
                                bad += free_tail_pages_check(page, page + i);
@@ -1016,8 +1006,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
                        (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                }
        }
-       if (PageAnonHead(page))
+       if (PageMappingFlags(page))
                page->mapping = NULL;
+       if (memcg_kmem_enabled() && PageKmemcg(page)) {
+               memcg_kmem_uncharge(page, order);
+               __ClearPageKmemcg(page);
+       }
        if (check_free)
                bad += free_pages_check(page);
        if (bad)
@@ -1084,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
        spin_lock(&zone->lock);
        isolated_pageblocks = has_isolate_pageblock(zone);
-       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
        while (count) {
                struct page *page;
@@ -1141,9 +1135,9 @@ static void free_one_page(struct zone *zone,
 {
        unsigned long nr_scanned;
        spin_lock(&zone->lock);
-       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
        if (unlikely(has_isolate_pageblock(zone) ||
                is_migrate_isolate(migratetype))) {
@@ -1724,6 +1718,19 @@ static bool check_new_pages(struct page *page, unsigned int order)
        return false;
 }
 
+inline void post_alloc_hook(struct page *page, unsigned int order,
+                               gfp_t gfp_flags)
+{
+       set_page_private(page, 0);
+       set_page_refcounted(page);
+
+       arch_alloc_page(page, order);
+       kernel_map_pages(page, 1 << order, 1);
+       kernel_poison_pages(page, 1 << order, 1);
+       kasan_alloc_pages(page, order);
+       set_page_owner(page, order, gfp_flags);
+}
+
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
                                                        unsigned int alloc_flags)
 {
@@ -1736,13 +1743,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
                        poisoned &= page_is_poisoned(p);
        }
 
-       set_page_private(page, 0);
-       set_page_refcounted(page);
-
-       arch_alloc_page(page, order);
-       kernel_map_pages(page, 1 << order, 1);
-       kernel_poison_pages(page, 1 << order, 1);
-       kasan_alloc_pages(page, order);
+       post_alloc_hook(page, order, gfp_flags);
 
        if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
                for (i = 0; i < (1 << order); i++)
@@ -1751,8 +1752,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
        if (order && (gfp_flags & __GFP_COMP))
                prep_compound_page(page, order);
 
-       set_page_owner(page, order, gfp_flags);
-
        /*
         * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
         * allocate the page. The expectation is that the caller is taking
@@ -2461,7 +2460,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 void split_page(struct page *page, unsigned int order)
 {
        int i;
-       gfp_t gfp_mask;
 
        VM_BUG_ON_PAGE(PageCompound(page), page);
        VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2475,12 +2473,9 @@ void split_page(struct page *page, unsigned int order)
                split_page(virt_to_page(page[0].shadow), order);
 #endif
 
-       gfp_mask = get_page_owner_gfp(page);
-       set_page_owner(page, 0, gfp_mask);
-       for (i = 1; i < (1 << order); i++) {
+       for (i = 1; i < (1 << order); i++)
                set_page_refcounted(page + i);
-               set_page_owner(page + i, 0, gfp_mask);
-       }
+       split_page_owner(page, order);
 }
 EXPORT_SYMBOL_GPL(split_page);
 
@@ -2509,9 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
        zone->free_area[order].nr_free--;
        rmv_page_order(page);
 
-       set_page_owner(page, order, __GFP_MOVABLE);
-
-       /* Set the pageblock if the isolated page is at least a pageblock */
+       /*
+        * Set the pageblock if the isolated page is at least half of a
+        * pageblock
+        */
        if (order >= pageblock_order - 1) {
                struct page *endpage = page + (1 << order) - 1;
                for (; page < endpage; page += pageblock_nr_pages) {
@@ -2526,33 +2522,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
        return 1UL << order;
 }
 
-/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_pages() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
-       unsigned int order;
-       int nr_pages;
-
-       order = page_order(page);
-
-       nr_pages = __isolate_free_page(page, order);
-       if (!nr_pages)
-               return 0;
-
-       /* Split into individual pages */
-       set_page_refcounted(page);
-       split_page(page, order);
-       return nr_pages;
-}
-
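
With split_free_page() gone, a caller that still needs this behaviour has to open-code it. The sketch below simply mirrors the removed body (the function name is hypothetical); the caveats from the old comment still apply, and since __isolate_free_page() no longer sets the page owner, the caller now has to take care of that side as well, e.g. via post_alloc_hook() once it is safe to run.

        static int example_split_free_page(struct page *page)
        {
                unsigned int order = page_order(page);
                int nr_pages;

                nr_pages = __isolate_free_page(page, order);
                if (!nr_pages)
                        return 0;

                /*
                 * Allocation-side initialisation is the caller's job now:
                 * __isolate_free_page() no longer calls set_page_owner().
                 */
                set_page_refcounted(page);
                split_page(page, order);
                return nr_pages;
        }
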
 /*
  * Update NUMA hit/miss statistics
  *
@@ -2618,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                        else
                                page = list_first_entry(list, struct page, lru);
 
-                       __dec_zone_state(zone, NR_ALLOC_BATCH);
                        list_del(&page->lru);
                        pcp->count--;
 
@@ -2644,15 +2612,10 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
                __mod_zone_freepage_state(zone, -(1 << order),
                                          get_pcppage_migratetype(page));
        }
 
-       if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
-           !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
-               set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
@@ -2863,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-       return local_zone->node == zone->node;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
 }
 #else  /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-       return true;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return true;
 }
 #endif /* CONFIG_NUMA */
 
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
-       struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
-       do {
-               mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-               clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-       } while (zone++ != preferred_zone);
-}
-
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -2907,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 {
        struct zoneref *z = ac->preferred_zoneref;
        struct zone *zone;
-       bool fair_skipped = false;
-       bool apply_fair = (alloc_flags & ALLOC_FAIR);
+       struct pglist_data *last_pgdat_dirty_limit = NULL;
 
-zonelist_scan:
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2924,51 +2863,34 @@ zonelist_scan:
                        (alloc_flags & ALLOC_CPUSET) &&
                        !__cpuset_zone_allowed(zone, gfp_mask))
                                continue;
-               /*
-                * Distribute pages in proportion to the individual
-                * zone size to ensure fair page aging.  The zone a
-                * page was allocated in should have no effect on the
-                * time the page has in memory before being reclaimed.
-                */
-               if (apply_fair) {
-                       if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-                               fair_skipped = true;
-                               continue;
-                       }
-                       if (!zone_local(ac->preferred_zoneref->zone, zone)) {
-                               if (fair_skipped)
-                                       goto reset_fair;
-                               apply_fair = false;
-                       }
-               }
                /*
                 * When allocating a page cache page for writing, we
-                * want to get it from a zone that is within its dirty
-                * limit, such that no single zone holds more than its
+                * want to get it from a node that is within its dirty
+                * limit, such that no single node holds more than its
                 * proportional share of globally allowed dirty pages.
-                * The dirty limits take into account the zone's
+                * The dirty limits take into account the node's
                 * lowmem reserves and high watermark so that kswapd
                 * should be able to balance it without having to
                 * write pages from its LRU list.
                 *
-                * This may look like it could increase pressure on
-                * lower zones by failing allocations in higher zones
-                * before they are full.  But the pages that do spill
-                * over are limited as the lower zones are protected
-                * by this very same mechanism.  It should not become
-                * a practical burden to them.
-                *
                 * XXX: For now, allow allocations to potentially
-                * exceed the per-zone dirty limit in the slowpath
+                * exceed the per-node dirty limit in the slowpath
                 * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
-                * zones are together not big enough to reach the
+                * nodes are together not big enough to reach the
                 * global limit.  The proper fix for these situations
-                * will require awareness of zones in the
+                * will require awareness of nodes in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
-                       continue;
+               if (ac->spread_dirty_pages) {
+                       if (last_pgdat_dirty_limit == zone->zone_pgdat)
+                               continue;
+
+                       if (!node_dirty_ok(zone->zone_pgdat)) {
+                               last_pgdat_dirty_limit = zone->zone_pgdat;
+                               continue;
+                       }
+               }
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                if (!zone_watermark_fast(zone, order, mark,
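
The spread_dirty_pages block above is the change named in the title: node_dirty_ok() is evaluated per node, and the last node found over its dirty limit is cached in last_pgdat_dirty_limit so the check is not repeated for that node's remaining zones during the scan. A small self-contained userspace model of the caching pattern follows; every name in it is made up for illustration and is not kernel API.

        #include <stdbool.h>
        #include <stdio.h>

        struct model_node { int id; bool over_dirty_limit; };
        struct model_zone { struct model_node *node; };

        /* Stand-in for node_dirty_ok(); prints so the caching effect is visible. */
        static bool model_node_dirty_ok(struct model_node *node)
        {
                printf("checking dirty limit of node %d\n", node->id);
                return !node->over_dirty_limit;
        }

        static struct model_zone *model_pick_zone(struct model_zone *zonelist, int nr)
        {
                struct model_node *last_failed = NULL;  /* like last_pgdat_dirty_limit */
                int i;

                for (i = 0; i < nr; i++) {
                        struct model_zone *zone = &zonelist[i];

                        if (zone->node == last_failed)
                                continue;               /* known to be over its limit */
                        if (!model_node_dirty_ok(zone->node)) {
                                last_failed = zone->node;
                                continue;
                        }
                        return zone;                    /* node is within its dirty limit */
                }
                return NULL;
        }

        int main(void)
        {
                struct model_node n0 = { 0, true }, n1 = { 1, false };
                struct model_zone zonelist[] = { { &n0 }, { &n0 }, { &n1 } };
                struct model_zone *zone = model_pick_zone(zonelist, 3);

                printf("picked a zone on node %d\n", zone ? zone->node->id : -1);
                return 0;
        }
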
@@ -2980,16 +2902,16 @@ zonelist_scan:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (zone_reclaim_mode == 0 ||
+                       if (node_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                continue;
 
-                       ret = zone_reclaim(zone, gfp_mask, order);
+                       ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
                        switch (ret) {
-                       case ZONE_RECLAIM_NOSCAN:
+                       case NODE_RECLAIM_NOSCAN:
                                /* did not scan */
                                continue;
-                       case ZONE_RECLAIM_FULL:
+                       case NODE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
                                continue;
                        default:
@@ -3019,23 +2941,6 @@ try_this_zone:
                }
        }
 
-       /*
-        * The first pass makes sure allocations are spread fairly within the
-        * local node.  However, the local node might have free pages left
-        * after the fairness batches are exhausted, and remote zones haven't
-        * even been considered yet.  Try once more without fairness, and
-        * include remote zones now, before entering the slowpath and waking
-        * kswapd: prefer spilling to a remote zone over swapping locally.
-        */
-       if (fair_skipped) {
-reset_fair:
-               apply_fair = false;
-               fair_skipped = false;
-               reset_alloc_batches(ac->preferred_zoneref->zone);
-               z = ac->preferred_zoneref;
-               goto zonelist_scan;
-       }
-
        return NULL;
 }
 
@@ -3105,6 +3010,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        struct oom_control oc = {
                .zonelist = ac->zonelist,
                .nodemask = ac->nodemask,
+               .memcg = NULL,
                .gfp_mask = gfp_mask,
                .order = order,
        };
@@ -3404,10 +3310,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 {
        struct zoneref *z;
        struct zone *zone;
+       pg_data_t *last_pgdat = NULL;
 
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-                                               ac->high_zoneidx, ac->nodemask)
-               wakeup_kswapd(zone, order, ac_classzone_idx(ac));
+                                       ac->high_zoneidx, ac->nodemask) {
+               if (last_pgdat != zone->zone_pgdat)
+                       wakeup_kswapd(zone, order, ac->high_zoneidx);
+               last_pgdat = zone->zone_pgdat;
+       }
 }
 
 static inline unsigned int
@@ -3529,14 +3439,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                         * prevent premature OOM
                         */
                        if (!did_some_progress) {
-                               unsigned long writeback;
-                               unsigned long dirty;
+                               unsigned long write_pending;
 
-                               writeback = zone_page_state_snapshot(zone,
-                                                                    NR_WRITEBACK);
-                               dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+                               write_pending = zone_page_state_snapshot(zone,
+                                                       NR_ZONE_WRITE_PENDING);
 
-                               if (2*(writeback + dirty) > reclaimable) {
+                               if (2 * write_pending > reclaimable) {
                                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                                        return true;
                                }
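
The separate dirty and writeback snapshots collapse into the single NR_ZONE_WRITE_PENDING counter, and the throttle still fires when write-pending pages exceed half of what the scan considers reclaimable. A short illustration of that threshold with made-up numbers:

        #include <stdbool.h>
        #include <stdio.h>

        /* Same shape as the check above: back off when writes dominate reclaimable memory. */
        static bool model_should_throttle(unsigned long write_pending, unsigned long reclaimable)
        {
                return 2 * write_pending > reclaimable;
        }

        int main(void)
        {
                /* Hypothetical page counts. */
                printf("%d\n", model_should_throttle(300, 1000));      /* 0: keep retrying */
                printf("%d\n", model_should_throttle(600, 1000));      /* 1: wait for writeback */
                return 0;
        }
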
@@ -3781,7 +3689,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page;
        unsigned int cpuset_mems_cookie;
-       unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+       unsigned int alloc_flags = ALLOC_WMARK_LOW;
        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
                .high_zoneidx = gfp_zone(gfp_mask),
@@ -3868,6 +3776,14 @@ no_zone:
        }
 
 out:
+       if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
+               if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
+                       __free_pages(page, order);
+                       page = NULL;
+               } else
+                       __SetPageKmemcg(page);
+       }
+
        if (kmemcheck_enabled && page)
                kmemcheck_pagealloc_alloc(page, order, gfp_mask);
 
@@ -4023,56 +3939,6 @@ void __free_page_frag(void *addr)
 }
 EXPORT_SYMBOL(__free_page_frag);
 
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
- * equivalent to alloc_pages.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
-       struct page *page;
-
-       page = alloc_pages(gfp_mask, order);
-       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
-               __free_pages(page, order);
-               page = NULL;
-       }
-       return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
-       struct page *page;
-
-       page = alloc_pages_node(nid, gfp_mask, order);
-       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
-               __free_pages(page, order);
-               page = NULL;
-       }
-       return page;
-}
-
-/*
- * __free_kmem_pages and free_kmem_pages will free pages allocated with
- * alloc_kmem_pages.
- */
-void __free_kmem_pages(struct page *page, unsigned int order)
-{
-       memcg_kmem_uncharge(page, order);
-       __free_pages(page, order);
-}
-
-void free_kmem_pages(unsigned long addr, unsigned int order)
-{
-       if (addr != 0) {
-               VM_BUG_ON(!virt_addr_valid((void *)addr));
-               __free_kmem_pages(virt_to_page((void *)addr), order);
-       }
-}
-
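
These helpers are superseded by the generic paths changed earlier in this diff: __alloc_pages_nodemask() charges __GFP_ACCOUNT allocations and tags them PageKmemcg, and free_pages_prepare() uncharges them on free. A minimal sketch of what a converted caller might look like (both function names below are hypothetical):

        /* Hypothetical caller wanting a large, memcg-accounted kernel buffer. */
        static void *example_alloc_accounted(size_t size)
        {
                struct page *page;

                /* The allocator charges the current memcg because of __GFP_ACCOUNT. */
                page = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, get_order(size));
                if (!page)
                        return NULL;

                return page_address(page);
        }

        static void example_free_accounted(void *addr, size_t size)
        {
                /* free_pages_prepare() spots PageKmemcg and uncharges automatically. */
                __free_pages(virt_to_page(addr), get_order(size));
        }
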
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
                size_t size)
 {
@@ -4254,7 +4120,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
 void si_meminfo(struct sysinfo *val)
 {
        val->totalram = totalram_pages;
-       val->sharedram = global_page_state(NR_SHMEM);
+       val->sharedram = global_node_page_state(NR_SHMEM);
        val->freeram = global_page_state(NR_FREE_PAGES);
        val->bufferram = nr_blockdev_pages();
        val->totalhigh = totalhigh_pages;
@@ -4276,8 +4142,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
                managed_pages += pgdat->node_zones[zone_type].managed_pages;
        val->totalram = managed_pages;
-       val->sharedram = node_page_state(nid, NR_SHMEM);
-       val->freeram = node_page_state(nid, NR_FREE_PAGES);
+       val->sharedram = node_page_state(pgdat, NR_SHMEM);
+       val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                struct zone *zone = &pgdat->node_zones[zone_type];
@@ -4360,6 +4226,7 @@ void show_free_areas(unsigned int filter)
        unsigned long free_pcp = 0;
        int cpu;
        struct zone *zone;
+       pg_data_t *pgdat;
 
        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4375,26 +4242,71 @@ void show_free_areas(unsigned int filter)
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
                " free:%lu free_pcp:%lu free_cma:%lu\n",
-               global_page_state(NR_ACTIVE_ANON),
-               global_page_state(NR_INACTIVE_ANON),
-               global_page_state(NR_ISOLATED_ANON),
-               global_page_state(NR_ACTIVE_FILE),
-               global_page_state(NR_INACTIVE_FILE),
-               global_page_state(NR_ISOLATED_FILE),
-               global_page_state(NR_UNEVICTABLE),
-               global_page_state(NR_FILE_DIRTY),
-               global_page_state(NR_WRITEBACK),
-               global_page_state(NR_UNSTABLE_NFS),
+               global_node_page_state(NR_ACTIVE_ANON),
+               global_node_page_state(NR_INACTIVE_ANON),
+               global_node_page_state(NR_ISOLATED_ANON),
+               global_node_page_state(NR_ACTIVE_FILE),
+               global_node_page_state(NR_INACTIVE_FILE),
+               global_node_page_state(NR_ISOLATED_FILE),
+               global_node_page_state(NR_UNEVICTABLE),
+               global_node_page_state(NR_FILE_DIRTY),
+               global_node_page_state(NR_WRITEBACK),
+               global_node_page_state(NR_UNSTABLE_NFS),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
-               global_page_state(NR_FILE_MAPPED),
-               global_page_state(NR_SHMEM),
+               global_node_page_state(NR_FILE_MAPPED),
+               global_node_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
                global_page_state(NR_FREE_PAGES),
                free_pcp,
                global_page_state(NR_FREE_CMA_PAGES));
 
+       for_each_online_pgdat(pgdat) {
+               printk("Node %d"
+                       " active_anon:%lukB"
+                       " inactive_anon:%lukB"
+                       " active_file:%lukB"
+                       " inactive_file:%lukB"
+                       " unevictable:%lukB"
+                       " isolated(anon):%lukB"
+                       " isolated(file):%lukB"
+                       " mapped:%lukB"
+                       " dirty:%lukB"
+                       " writeback:%lukB"
+                       " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       " shmem_thp: %lukB"
+                       " shmem_pmdmapped: %lukB"
+                       " anon_thp: %lukB"
+#endif
+                       " writeback_tmp:%lukB"
+                       " unstable:%lukB"
+                       " all_unreclaimable? %s"
+                       "\n",
+                       pgdat->node_id,
+                       K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+                       K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+                       K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+                       K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+                       K(node_page_state(pgdat, NR_UNEVICTABLE)),
+                       K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+                       K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+                       K(node_page_state(pgdat, NR_FILE_MAPPED)),
+                       K(node_page_state(pgdat, NR_FILE_DIRTY)),
+                       K(node_page_state(pgdat, NR_WRITEBACK)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+                       K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+                                       * HPAGE_PMD_NR),
+                       K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+                       K(node_page_state(pgdat, NR_SHMEM)),
+                       K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+                       K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+       }
+
        for_each_populated_zone(zone) {
                int i;
 
@@ -4411,66 +4323,37 @@ void show_free_areas(unsigned int filter)
                        " min:%lukB"
                        " low:%lukB"
                        " high:%lukB"
-                       " active_anon:%lukB"
-                       " inactive_anon:%lukB"
-                       " active_file:%lukB"
-                       " inactive_file:%lukB"
-                       " unevictable:%lukB"
-                       " isolated(anon):%lukB"
-                       " isolated(file):%lukB"
                        " present:%lukB"
                        " managed:%lukB"
                        " mlocked:%lukB"
-                       " dirty:%lukB"
-                       " writeback:%lukB"
-                       " mapped:%lukB"
-                       " shmem:%lukB"
                        " slab_reclaimable:%lukB"
                        " slab_unreclaimable:%lukB"
                        " kernel_stack:%lukB"
                        " pagetables:%lukB"
-                       " unstable:%lukB"
                        " bounce:%lukB"
                        " free_pcp:%lukB"
                        " local_pcp:%ukB"
                        " free_cma:%lukB"
-                       " writeback_tmp:%lukB"
-                       " pages_scanned:%lu"
-                       " all_unreclaimable? %s"
+                       " node_pages_scanned:%lu"
                        "\n",
                        zone->name,
                        K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
-                       K(zone_page_state(zone, NR_ACTIVE_ANON)),
-                       K(zone_page_state(zone, NR_INACTIVE_ANON)),
-                       K(zone_page_state(zone, NR_ACTIVE_FILE)),
-                       K(zone_page_state(zone, NR_INACTIVE_FILE)),
-                       K(zone_page_state(zone, NR_UNEVICTABLE)),
-                       K(zone_page_state(zone, NR_ISOLATED_ANON)),
-                       K(zone_page_state(zone, NR_ISOLATED_FILE)),
                        K(zone->present_pages),
                        K(zone->managed_pages),
                        K(zone_page_state(zone, NR_MLOCK)),
-                       K(zone_page_state(zone, NR_FILE_DIRTY)),
-                       K(zone_page_state(zone, NR_WRITEBACK)),
-                       K(zone_page_state(zone, NR_FILE_MAPPED)),
-                       K(zone_page_state(zone, NR_SHMEM)),
                        K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
                        K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
                        zone_page_state(zone, NR_KERNEL_STACK) *
                                THREAD_SIZE / 1024,
                        K(zone_page_state(zone, NR_PAGETABLE)),
-                       K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(free_pcp),
                        K(this_cpu_read(zone->pageset->pcp.count)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
-                       K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-                       K(zone_page_state(zone, NR_PAGES_SCANNED)),
-                       (!zone_reclaimable(zone) ? "yes" : "no")
-                       );
+                       K(node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED)));
                printk("lowmem_reserve[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
                        printk(" %ld", zone->lowmem_reserve[i]);
@@ -4512,7 +4395,7 @@ void show_free_areas(unsigned int filter)
 
        hugetlb_show_meminfo();
 
-       printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+       printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
 
        show_swap_cache_info();
 }
@@ -5383,6 +5266,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
        zone->pageset = alloc_percpu(struct per_cpu_pageset);
        for_each_possible_cpu(cpu)
                zone_pageset_init(zone, cpu);
+
+       if (!zone->zone_pgdat->per_cpu_nodestats) {
+               zone->zone_pgdat->per_cpu_nodestats =
+                       alloc_percpu(struct per_cpu_nodestat);
+       }
 }
 
 /*
@@ -5952,6 +5840,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
        init_waitqueue_head(&pgdat->kcompactd_wait);
 #endif
        pgdat_page_ext_init(pgdat);
+       spin_lock_init(&pgdat->lru_lock);
+       lruvec_init(node_lruvec(pgdat));
 
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
@@ -6001,21 +5891,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
                zone->node = nid;
-               zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+               pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
                                                / 100;
-               zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+               pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
 #endif
                zone->name = zone_names[j];
+               zone->zone_pgdat = pgdat;
                spin_lock_init(&zone->lock);
-               spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
-               zone->zone_pgdat = pgdat;
                zone_pcp_init(zone);
 
-               /* For bootup, initialized properly in watermark setup */
-               mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
-               lruvec_init(&zone->lruvec);
                if (!size)
                        continue;
 
@@ -6081,11 +5966,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long end_pfn = 0;
 
        /* pg_data_t should be reset to zero when it's allocated */
-       WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+       WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
        reset_deferred_meminit(pgdat);
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
+       pgdat->per_cpu_nodestats = NULL;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6467,15 +6353,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                                sizeof(arch_zone_lowest_possible_pfn));
        memset(arch_zone_highest_possible_pfn, 0,
                                sizeof(arch_zone_highest_possible_pfn));
-       arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
-       arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
-       for (i = 1; i < MAX_NR_ZONES; i++) {
+
+       start_pfn = find_min_pfn_with_active_regions();
+
+       for (i = 0; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
-               arch_zone_lowest_possible_pfn[i] =
-                       arch_zone_highest_possible_pfn[i-1];
-               arch_zone_highest_possible_pfn[i] =
-                       max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+
+               end_pfn = max(max_zone_pfn[i], start_pfn);
+               arch_zone_lowest_possible_pfn[i] = start_pfn;
+               arch_zone_highest_possible_pfn[i] = end_pfn;
+
+               start_pfn = end_pfn;
        }
        arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
        arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
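
The rewritten loop derives each zone's possible PFN range by carrying start_pfn forward from zone to zone, with max() collapsing empty zones into a zero-length range instead of special-casing zone 0. A tiny standalone model of the same arithmetic with made-up zone boundaries (nothing in it is kernel API):

        #include <stdio.h>

        #define MAX_NR_ZONES    4
        #define ZONE_MOVABLE    3       /* handled separately, as in the code above */

        int main(void)
        {
                /* Hypothetical max_zone_pfn[]: two zones sharing a boundary, then one big zone. */
                unsigned long max_zone_pfn[MAX_NR_ZONES] = { 4096, 4096, 1UL << 20, 0 };
                unsigned long lowest[MAX_NR_ZONES] = { 0 }, highest[MAX_NR_ZONES] = { 0 };
                unsigned long start_pfn = 1;    /* stands in for find_min_pfn_with_active_regions() */
                int i;

                for (i = 0; i < MAX_NR_ZONES; i++) {
                        unsigned long end_pfn;

                        if (i == ZONE_MOVABLE)
                                continue;

                        /* max() keeps ranges monotonic; an empty zone becomes [start, start). */
                        end_pfn = max_zone_pfn[i] > start_pfn ? max_zone_pfn[i] : start_pfn;
                        lowest[i] = start_pfn;
                        highest[i] = end_pfn;
                        start_pfn = end_pfn;
                }

                for (i = 0; i < MAX_NR_ZONES; i++)
                        printf("zone %d: PFNs [%lu, %lu)\n", i, lowest[i], highest[i]);
                return 0;
        }
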
@@ -6739,6 +6628,9 @@ static void calculate_totalreserve_pages(void)
        enum zone_type i, j;
 
        for_each_online_pgdat(pgdat) {
+
+               pgdat->totalreserve_pages = 0;
+
                for (i = 0; i < MAX_NR_ZONES; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        long max = 0;
@@ -6755,7 +6647,7 @@ static void calculate_totalreserve_pages(void)
                        if (max > zone->managed_pages)
                                max = zone->managed_pages;
 
-                       zone->totalreserve_pages = max;
+                       pgdat->totalreserve_pages += max;
 
                        reserve_pages += max;
                }
@@ -6856,10 +6748,6 @@ static void __setup_per_zone_wmarks(void)
                zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
                zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
@@ -6970,6 +6858,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -6977,8 +6866,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_unmapped_pages = 0;
+
        for_each_zone(zone)
-               zone->min_unmapped_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
                                sysctl_min_unmapped_ratio) / 100;
        return 0;
 }
@@ -6986,6 +6878,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -6993,8 +6886,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_slab_pages = 0;
+
        for_each_zone(zone)
-               zone->min_slab_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
                                sysctl_min_slab_ratio) / 100;
        return 0;
 }