mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd

[cascardo/linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 48aaf7b..70461f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -169,12 +169,12 @@ void pm_restrict_gfp_mask(void)
         WARN_ON(!mutex_is_locked(&pm_mutex));
         WARN_ON(saved_gfp_mask);
         saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
  }
  
  bool pm_suspended_storage(void)
  {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                 return false;
         return true;
  }
@@ -2159,13 +2159,13 @@ failed:
  static struct {
         struct fault_attr attr;
  
-       u32 ignore_gfp_highmem;
-       u32 ignore_gfp_wait;
+       bool ignore_gfp_highmem;
+       bool ignore_gfp_wait;
         u32 min_order;
  } fail_page_alloc = {
         .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = 1,
-       .ignore_gfp_highmem = 1,
+       .ignore_gfp_wait = true,
+       .ignore_gfp_highmem = true,
         .min_order = 1,
  };
  
@@ -2183,7 +2183,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
                 return false;
         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                 return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM))
                 return false;
  
         return should_fail(&fail_page_alloc.attr, 1 << order);
@@ -2249,6 +2249,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
+
  #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
         if (!(alloc_flags & ALLOC_CMA))
@@ -2278,14 +2279,14 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  }
  
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
+                       unsigned long mark, int classzone_idx)
  {
         long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
                                                                 free_pages);
  }
  
@@ -2477,8 +2478,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
         nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
         int zlc_active = 0;             /* set if using zonelist_cache */
         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
         int nr_fair_skipped = 0;
         bool zonelist_rescan;
  
@@ -2533,14 +2532,14 @@ zonelist_scan:
                  *
                  * XXX: For now, allow allocations to potentially
                  * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                  * which is important when on a NUMA setup the allowed
                  * zones are together not big enough to reach the
                  * global limit.  The proper fix for these situations
                  * will require awareness of zones in the
                  * dirty-throttling and the flusher threads.
                  */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                         continue;
  
                 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
@@ -2686,7 +2685,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                 if (test_thread_flag(TIF_MEMDIE) ||
                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
                         filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                 filter &= ~SHOW_MEM_FILTER_NODES;
  
         if (fmt) {
@@ -2946,7 +2945,6 @@ static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2955,11 +2953,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
          */
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                 /*
                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                  * if it can't schedule.
@@ -2996,11 +2994,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
         return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
  }
  
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
+
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                 struct alloc_context *ac)
  {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
         struct page *page = NULL;
         int alloc_flags;
         unsigned long pages_reclaimed = 0;
@@ -3020,16 +3023,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                 return NULL;
         }
  
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
+
         /*
          * If this allocation cannot block and it is for a specific node, then
          * fail early.  There's no need to wakeup kswapd or retry for a
          * speculative node-specific allocation.
          */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                 goto nopage;
  
  retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
         /*
@@ -3072,8 +3083,8 @@ retry:
                 }
         }
  
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                 /*
                  * All existing users of the deprecated __GFP_NOFAIL are
                  * blockable, so warn of any new users that actually allow this
@@ -3103,7 +3114,7 @@ retry:
                 goto got_pg;
  
         /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                 /*
                  * If compaction is deferred for high-order allocations, it is
                  * because sync compaction recently failed. If this is the case
@@ -3138,8 +3149,7 @@ retry:
          * fault, so use asynchronous memory compaction for THP unless it is
          * khugepaged trying to collapse.
          */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                 migration_mode = MIGRATE_SYNC_LIGHT;
  
         /* Try direct reclaim and then allocating */
@@ -3210,7 +3220,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  
         lockdep_trace_alloc(gfp_mask);
  
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
  
         if (should_fail_alloc_page(gfp_mask, order))
                 return NULL;
@@ -3231,6 +3241,10 @@ retry_cpuset:
  
         /* We set it here, as __alloc_pages_slowpath might have changed it */
         ac.zonelist = zonelist;
+
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
         /* The preferred zone is used for statistics later */
         preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                 ac.nodemask ? : &cpuset_current_mems_allowed,
@@ -3249,6 +3263,7 @@ retry_cpuset:
                  * complete.
                  */
                 alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
  
                 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
         }
@@ -3428,24 +3443,24 @@ EXPORT_SYMBOL(__free_page_frag);
  struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
  {
         struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
         page = alloc_pages(gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
         return page;
  }
  
  struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
  {
         struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
         page = alloc_pages_node(nid, gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
         return page;
  }
  
@@ -3455,7 +3470,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
   */
  void __free_kmem_pages(struct page *page, unsigned int order)
  {
-       memcg_kmem_uncharge_pages(page, order);
+       memcg_kmem_uncharge(page, order);
         __free_pages(page, order);
  }
  
@@ -4900,8 +4915,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
  
  int __meminit init_currently_empty_zone(struct zone *zone,
                                         unsigned long zone_start_pfn,
-                                       unsigned long size,
-                                       enum memmap_context context)
+                                       unsigned long size)
  {
         struct pglist_data *pgdat = zone->zone_pgdat;
         int ret;
@@ -5413,8 +5427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
  
                 set_pageblock_order();
                 setup_usemap(pgdat, zone, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn,
-                                               size, MEMMAP_EARLY);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                 BUG_ON(ret);
                 memmap_init(size, nid, j, zone_start_pfn);
                 zone_start_pfn += size;
@@ -5423,6 +5436,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
  
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
  {
+       unsigned long __maybe_unused offset = 0;
+
         /* Skip empty nodes */
         if (!pgdat->node_spanned_pages)
                 return;
@@ -5439,6 +5454,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                  * for the buddy allocator to function correctly.
                  */
                 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+               offset = pgdat->node_start_pfn - start;
                 end = pgdat_end_pfn(pgdat);
                 end = ALIGN(end, MAX_ORDER_NR_PAGES);
                 size =  (end - start) * sizeof(struct page);
@@ -5446,7 +5462,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 if (!map)
                         map = memblock_virt_alloc_node_nopanic(size,
                                                                pgdat->node_id);
-               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+               pgdat->node_mem_map = map + offset;
         }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
         /*
@@ -5454,9 +5470,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
          */
         if (pgdat == NODE_DATA(0)) {
                 mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-                       mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+                       mem_map -= offset;
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
         }
  #endif
@@ -5668,13 +5684,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                  */
                 required_movablecore =
                         roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+               required_movablecore = min(totalpages, required_movablecore);
                 corepages = totalpages - required_movablecore;
  
                 required_kernelcore = max(required_kernelcore, corepages);
         }
  
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If kernelcore was not specified or kernelcore size is larger
+        * than totalpages, there is no ZONE_MOVABLE.
+        */
+       if (!required_kernelcore || required_kernelcore >= totalpages)
                 goto out;
  
         /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */