mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd

[cascardo/linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 446bb36..70461f3 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -169,12 +169,12 @@ void pm_restrict_gfp_mask(void)
         WARN_ON(!mutex_is_locked(&pm_mutex));
         WARN_ON(saved_gfp_mask);
         saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
  }
  
  bool pm_suspended_storage(void)
  {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                 return false;
         return true;
  }
@@ -2183,7 +2183,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
                 return false;
         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                 return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM))
                 return false;
  
         return should_fail(&fail_page_alloc.attr, 1 << order);
@@ -2249,6 +2249,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
+
  #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
         if (!(alloc_flags & ALLOC_CMA))
@@ -2278,14 +2279,14 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  }
  
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
+                       unsigned long mark, int classzone_idx)
  {
         long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
                                                                 free_pages);
  }
  
@@ -2477,8 +2478,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
         nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
         int zlc_active = 0;             /* set if using zonelist_cache */
         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
         int nr_fair_skipped = 0;
         bool zonelist_rescan;
  
@@ -2533,14 +2532,14 @@ zonelist_scan:
                  *
                  * XXX: For now, allow allocations to potentially
                  * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                  * which is important when on a NUMA setup the allowed
                  * zones are together not big enough to reach the
                  * global limit.  The proper fix for these situations
                  * will require awareness of zones in the
                  * dirty-throttling and the flusher threads.
                  */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                         continue;
  
                 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
@@ -2686,7 +2685,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                 if (test_thread_flag(TIF_MEMDIE) ||
                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
                         filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                 filter &= ~SHOW_MEM_FILTER_NODES;
  
         if (fmt) {
@@ -2946,7 +2945,6 @@ static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2955,11 +2953,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
          */
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                 /*
                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                  * if it can't schedule.
@@ -2996,11 +2994,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
         return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
  }
  
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
+
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                 struct alloc_context *ac)
  {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
         struct page *page = NULL;
         int alloc_flags;
         unsigned long pages_reclaimed = 0;
@@ -3020,16 +3023,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                 return NULL;
         }
  
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
+
         /*
          * If this allocation cannot block and it is for a specific node, then
          * fail early.  There's no need to wakeup kswapd or retry for a
          * speculative node-specific allocation.
          */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                 goto nopage;
  
  retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
         /*
@@ -3072,8 +3083,8 @@ retry:
                 }
         }
  
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                 /*
                  * All existing users of the deprecated __GFP_NOFAIL are
                  * blockable, so warn of any new users that actually allow this
@@ -3103,7 +3114,7 @@ retry:
                 goto got_pg;
  
         /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                 /*
                  * If compaction is deferred for high-order allocations, it is
                  * because sync compaction recently failed. If this is the case
@@ -3138,8 +3149,7 @@ retry:
          * fault, so use asynchronous memory compaction for THP unless it is
          * khugepaged trying to collapse.
          */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                 migration_mode = MIGRATE_SYNC_LIGHT;
  
         /* Try direct reclaim and then allocating */
@@ -3210,7 +3220,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  
         lockdep_trace_alloc(gfp_mask);
  
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
  
         if (should_fail_alloc_page(gfp_mask, order))
                 return NULL;
@@ -3231,6 +3241,10 @@ retry_cpuset:
  
         /* We set it here, as __alloc_pages_slowpath might have changed it */
         ac.zonelist = zonelist;
+
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
         /* The preferred zone is used for statistics later */
         preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                 ac.nodemask ? : &cpuset_current_mems_allowed,
@@ -3249,6 +3263,7 @@ retry_cpuset:
                  * complete.
                  */
                 alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
  
                 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
         }