diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8634958..5bb2f76 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        if (!p)
                return 0;
 
+       /*
+        * Do not even consider tasks which are explicitly marked oom
+        * unkillable or have already been oom reaped.
+        */
        adj = (long)p->signal->oom_score_adj;
-       if (adj == OOM_SCORE_ADJ_MIN) {
+       if (adj == OOM_SCORE_ADJ_MIN ||
+                       test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
                task_unlock(p);
                return 0;
        }
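
MMF_OOM_REAPED is a new mm->flags bit which the oom_reaper sets once it has
torn down a victim's address space. A minimal sketch of the declaration this
hunk assumes, next to the other MMF_* bits in include/linux/sched.h (the
exact bit number here is a guess):

	#define MMF_OOM_REAPED	21	/* mm has been already reaped */

With the check above, oom_badness() reports an already-reaped task as
unkillable instead of letting the OOM killer select the same victim again.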
@@ -278,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves.
         */
-       if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-               if (!is_sysrq_oom(oc))
-                       return OOM_SCAN_ABORT;
-       }
-       if (!task->mm)
-               return OOM_SCAN_CONTINUE;
+       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
+               return OOM_SCAN_ABORT;
 
        /*
         * If task is allocating a lot of memory and has been marked to be
@@ -302,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 static struct task_struct *select_bad_process(struct oom_control *oc,
                unsigned int *ppoints, unsigned long totalpages)
 {
-       struct task_struct *g, *p;
+       struct task_struct *p;
        struct task_struct *chosen = NULL;
        unsigned long chosen_points = 0;
 
        rcu_read_lock();
-       for_each_process_thread(g, p) {
+       for_each_process(p) {
                unsigned int points;
 
                switch (oom_scan_process_thread(oc, p, totalpages)) {
@@ -326,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
                points = oom_badness(p, NULL, oc->nodemask, totalpages);
                if (!points || points < chosen_points)
                        continue;
-               /* Prefer thread group leaders for display purposes */
-               if (points == chosen_points && thread_group_leader(chosen))
-                       continue;
 
                chosen = p;
                chosen_points = points;
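
Iterating processes instead of every thread suffices here because
oom_badness() already walks the whole thread group looking for a thread with
a live mm, via the existing helper in this file, which looks roughly like
this (reproduced as a sketch; details may differ between trees):

	struct task_struct *find_lock_task_mm(struct task_struct *p)
	{
		struct task_struct *t;

		rcu_read_lock();
		for_each_thread(p, t) {
			task_lock(t);
			if (likely(t->mm))
				goto found;
			task_unlock(t);
		}
		t = NULL;
	found:
		rcu_read_unlock();
		return t;
	}

Scoring each thread separately therefore only recomputed the same per-group
result, and the "prefer thread group leaders" tie-break can go as well.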
@@ -412,6 +410,25 @@ bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+       struct task_struct *t;
+
+       for_each_thread(p, t) {
+               struct mm_struct *t_mm = READ_ONCE(t->mm);
+               if (t_mm)
+                       return t_mm == mm;
+       }
+       return false;
+}
+
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -422,7 +439,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-
 static bool __oom_reap_task(struct task_struct *tsk)
 {
        struct mmu_gather tlb;
@@ -491,16 +507,17 @@ static bool __oom_reap_task(struct task_struct *tsk)
        up_read(&mm->mmap_sem);
 
        /*
-        * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-        * reasonably reclaimable memory anymore. OOM killer can continue
-        * by selecting other victim if unmapping hasn't led to any
-        * improvements. This also means that selecting this task doesn't
-        * make any sense.
+        * This task can be safely ignored because we cannot do much more
+        * to release its memory.
         */
-       tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
-       exit_oom_victim(tsk);
+       set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
-       mmput(mm);
+       /*
+        * Drop our reference but make sure the mmput slow path is called from
+        * a different context, because we shouldn't risk getting stuck there
+        * and putting the oom_reaper out of the way.
+        */
+       mmput_async(mm);
        return ret;
 }
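
mmput_async() defers the final __mmput() teardown so the oom_reaper thread
can never block in it. A sketch of the shape such a helper could take,
assuming an async_put_work member in struct mm_struct and a small
mmput_async_fn() wrapper around __mmput() (both are assumptions here):

	void mmput_async(struct mm_struct *mm)
	{
		if (atomic_dec_and_test(&mm->mm_users)) {
			/* run the expensive teardown from a workqueue */
			INIT_WORK(&mm->async_put_work, mmput_async_fn);
			schedule_work(&mm->async_put_work);
		}
	}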
 
@@ -519,6 +536,15 @@ static void oom_reap_task(struct task_struct *tsk)
                debug_show_all_locks();
        }
 
+       /*
+        * Clear TIF_MEMDIE because the task either shouldn't be sitting on
+        * reasonably reclaimable memory anymore, or it is not a good oom
+        * victim candidate right now because its memory cannot be released
+        * either by the task itself or by the oom reaper.
+        */
+       tsk->oom_reaper_list = NULL;
+       exit_oom_victim(tsk);
+
        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
 }
@@ -563,6 +589,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
        wake_up(&oom_reaper_wait);
 }
 
+/* Check whether we can reap the given task. This has to be called with a
+ * stable tsk->mm.
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+       struct mm_struct *mm = tsk->mm;
+       struct task_struct *p;
+
+       if (!mm)
+               return;
+
+       /*
+        * There might be other threads/processes which are either not
+        * dying or even not killable.
+        */
+       if (atomic_read(&mm->mm_users) > 1) {
+               rcu_read_lock();
+               for_each_process(p) {
+                       bool exiting;
+
+                       if (!process_shares_mm(p, mm))
+                               continue;
+                       if (same_thread_group(p, tsk))
+                               continue;
+                       if (fatal_signal_pending(p))
+                               continue;
+
+                       /*
+                        * If the task is exiting, make sure the whole thread
+                        * group is exiting and cannot access the mm anymore.
+                        */
+                       spin_lock_irq(&p->sighand->siglock);
+                       exiting = signal_group_exit(p->signal);
+                       spin_unlock_irq(&p->sighand->siglock);
+                       if (exiting)
+                               continue;
+
+                       /* Give up */
+                       rcu_read_unlock();
+                       return;
+               }
+               rcu_read_unlock();
+       }
+
+       wake_oom_reaper(tsk);
+}
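
The signal_group_exit() test distinguishes a single exiting thread from a
whole thread group that is going away. For reference, the predicate is
roughly the following (sketch of the include/linux/sched.h definition):

	static inline bool signal_group_exit(const struct signal_struct *sig)
	{
		return (sig->flags & SIGNAL_GROUP_EXIT) ||
		       (sig->group_exit_task != NULL);
	}

Only when every user of the mm is in the caller's thread group, has a fatal
signal pending, or belongs to a fully exiting group is it safe to hand the
shared address space to the oom_reaper.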
+
 static int __init oom_init(void)
 {
        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -593,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk)
        /* OOM killer might race with memcg OOM */
        if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
                return;
+       atomic_inc(&tsk->signal->oom_victims);
        /*
         * Make sure that the task is woken up from uninterruptible sleep
         * if it is frozen because OOM killer wouldn't be able to free
@@ -610,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk)
 {
        if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
                return;
+       atomic_dec(&tsk->signal->oom_victims);
 
        if (!atomic_dec_return(&oom_victims))
                wake_up_all(&oom_victims_wait);
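
The inc/dec pair lets oom_scan_process_thread() answer "does this thread
group already contain a TIF_MEMDIE thread?" in O(1), without walking
threads. It assumes a counter added to struct signal_struct, roughly:

	/* in struct signal_struct (sketch; exact placement is an assumption) */
	atomic_t oom_victims; /* # of TIF_MEMDIE threads in this thread group */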
@@ -652,24 +727,6 @@ void oom_killer_enable(void)
        oom_killer_disabled = false;
 }
 
-/*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-       struct task_struct *t;
-
-       for_each_thread(p, t) {
-               struct mm_struct *t_mm = READ_ONCE(t->mm);
-               if (t_mm)
-                       return t_mm == mm;
-       }
-       return false;
-}
-
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -694,6 +751,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
        task_lock(p);
        if (p->mm && task_will_free_mem(p)) {
                mark_oom_victim(p);
+               try_oom_reaper(p);
                task_unlock(p);
                put_task_struct(p);
                return;
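
This fast path fires when the selected task is already exiting, so sending
another SIGKILL is pointless; marking it a victim and kicking the reaper is
enough. For reference, task_will_free_mem() at this point is roughly the
following predicate (sketch of the include/linux/oom.h definition; a
coredumping group is excluded because it can block with the mm still
mapped):

	static inline bool task_will_free_mem(struct task_struct *task)
	{
		return (task->flags & PF_EXITING) &&
			!(task->signal->flags & SIGNAL_GROUP_COREDUMP);
	}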
@@ -873,9 +931,19 @@ bool out_of_memory(struct oom_control *oc)
        if (current->mm &&
            (fatal_signal_pending(current) || task_will_free_mem(current))) {
                mark_oom_victim(current);
+               try_oom_reaper(current);
                return true;
        }
 
+       /*
+        * The OOM killer does not compensate for IO-less reclaim.
+        * pagefault_out_of_memory lost its gfp context, so we have to make
+        * sure to exclude the 0 mask - all other users should have at least
+        * ___GFP_DIRECT_RECLAIM to get here.
+        */
+       if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+               return true;
+
        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA) that may require different handling.
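
The gfp_mask check makes the OOM killer a no-op for allocations that may not
touch the filesystem during reclaim. A small hypothetical predicate, for
exposition only, mirroring the added test:

	static inline bool oom_skips_kill(gfp_t gfp_mask)
	{
		/* same condition as the check added above */
		return gfp_mask && !(gfp_mask & (__GFP_FS | __GFP_NOFAIL));
	}

	/*
	 * oom_skips_kill(GFP_NOFS)   -> true:  back off without killing
	 * oom_skips_kill(GFP_KERNEL) -> false: normal OOM handling
	 * oom_skips_kill(0)          -> false: pagefault_out_of_memory,
	 *                                      which lost its gfp context
	 */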