Merge branch 'linus' into sched/core, to resolve conflicts

author Ingo Molnar <mingo@kernel.org>

Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)

committer Ingo Molnar <mingo@kernel.org>

Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)
author Ingo Molnar <mingo@kernel.org>
Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)
committer Ingo Molnar <mingo@kernel.org>
Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt

index e55124e..04bf16a 100644 (file)
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -441,8 +441,7 @@ feature should be disabled. Otherwise, if the system overhead from the
  feature is too high then the rate the kernel samples for NUMA hinting
  faults may be controlled by the numa_balancing_scan_period_min_ms,
  numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
-numa_balancing_migrate_deferred.
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
  
  ==============================================================
  
@@ -483,13 +482,6 @@ rate for each task.
  numa_balancing_scan_size_mb is how many megabytes worth of pages are
  scanned for a given scan.
  
-numa_balancing_migrate_deferred is how many page migrations get skipped
-unconditionally, after a page migration is skipped because a page is shared
-with other tasks. This reduces page migration overhead, and determines
-how much stronger the "move task near its memory" policy scheduler becomes,
-versus the "move memory near its task" memory management policy, for workloads
-with shared memory.
-
  ==============================================================
  
  osrelease, ostype & version:
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 68a0e84..ed86779 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1469,9 +1469,10 @@ struct task_struct {
         unsigned int numa_scan_period;
         unsigned int numa_scan_period_max;
         int numa_preferred_nid;
-       int numa_migrate_deferred;
         unsigned long numa_migrate_retry;
         u64 node_stamp;                 /* migration stamp  */
+       u64 last_task_numa_placement;
+       u64 last_sum_exec_runtime;
         struct callback_head numa_work;
  
         struct list_head numa_entry;
@@ -1482,15 +1483,22 @@ struct task_struct {
          * Scheduling placement decisions are made based on the these counts.
          * The values remain static for the duration of a PTE scan
          */
-       unsigned long *numa_faults;
+       unsigned long *numa_faults_memory;
         unsigned long total_numa_faults;
  
         /*
          * numa_faults_buffer records faults per node during the current
-        * scan window. When the scan completes, the counts in numa_faults
-        * decay and these values are copied.
+        * scan window. When the scan completes, the counts in
+        * numa_faults_memory decay and these values are copied.
          */
-       unsigned long *numa_faults_buffer;
+       unsigned long *numa_faults_buffer_memory;
+
+       /*
+        * Track the nodes the process was running on when a NUMA hinting
+        * fault was incurred.
+        */
+       unsigned long *numa_faults_cpu;
+       unsigned long *numa_faults_buffer_cpu;
  
         /*
          * numa_faults_locality tracks if faults recorded during the last
@@ -1595,8 +1603,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
  extern pid_t task_numa_group_id(struct task_struct *p);
  extern void set_numabalancing_state(bool enabled);
  extern void task_numa_free(struct task_struct *p);
-
-extern unsigned int sysctl_numa_balancing_migrate_deferred;
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+                                       int src_nid, int dst_cpu);
  #else
  static inline void task_numa_fault(int last_node, int node, int pages,
                                    int flags)
@@ -1612,6 +1620,11 @@ static inline void set_numabalancing_state(bool enabled)
  static inline void task_numa_free(struct task_struct *p)
  {
  }
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+                               struct page *page, int src_nid, int dst_cpu)
+{
+       return true;
+}
  #endif
  
  static inline struct pid *task_pid(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index b46131e..210a12a 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
         p->numa_work.next = &p->numa_work;
-       p->numa_faults = NULL;
-       p->numa_faults_buffer = NULL;
+       p->numa_faults_memory = NULL;
+       p->numa_faults_buffer_memory = NULL;
+       p->last_task_numa_placement = 0;
+       p->last_sum_exec_runtime = 0;
  
         INIT_LIST_HEAD(&p->numa_entry);
         p->numa_group = NULL;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index dd52e7f..31b908d 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
                         unsigned long nr_faults = -1;
                         int cpu_current, home_node;
  
-                       if (p->numa_faults)
-                               nr_faults = p->numa_faults[2*node + i];
+                       if (p->numa_faults_memory)
+                               nr_faults = p->numa_faults_memory[2*node + i];
  
                         cpu_current = !i ? (task_node(p) == node) :
                                 (pol && node_isset(node, pol->v.nodes));
  
                         home_node = (p->numa_preferred_nid == node);
  
-                       SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+                       SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
                                 i, node, cpu_current, home_node, nr_faults);
                 }
         }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 966cc2b..4caa803 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
  static unsigned int task_nr_scan_windows(struct task_struct *p)
  {
         unsigned long rss = 0;
@@ -893,10 +885,26 @@ struct numa_group {
         struct list_head task_list;
  
         struct rcu_head rcu;
+       nodemask_t active_nodes;
         unsigned long total_faults;
+       /*
+        * Faults_cpu is used to decide whether memory should move
+        * towards the CPU. As a consequence, these stats are weighted
+        * more by CPU use than by memory faults.
+        */
+       unsigned long *faults_cpu;
         unsigned long faults[0];
  };
  
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
  pid_t task_numa_group_id(struct task_struct *p)
  {
         return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +912,16 @@ pid_t task_numa_group_id(struct task_struct *p)
  
  static inline int task_faults_idx(int nid, int priv)
  {
-       return 2 * nid + priv;
+       return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
  }
  
  static inline unsigned long task_faults(struct task_struct *p, int nid)
  {
-       if (!p->numa_faults)
+       if (!p->numa_faults_memory)
                 return 0;
  
-       return p->numa_faults[task_faults_idx(nid, 0)] +
-               p->numa_faults[task_faults_idx(nid, 1)];
+       return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+               p->numa_faults_memory[task_faults_idx(nid, 1)];
  }
  
  static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +933,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
                 p->numa_group->faults[task_faults_idx(nid, 1)];
  }
  
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+       return group->faults_cpu[task_faults_idx(nid, 0)] +
+               group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
  /*
   * These return the fraction of accesses done by a particular task, or
   * task group, on a particular numa node.  The group weight is given a
@@ -935,7 +949,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
  {
         unsigned long total_faults;
  
-       if (!p->numa_faults)
+       if (!p->numa_faults_memory)
                 return 0;
  
         total_faults = p->total_numa_faults;
@@ -954,6 +968,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
         return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
  }
  
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+                               int src_nid, int dst_cpu)
+{
+       struct numa_group *ng = p->numa_group;
+       int dst_nid = cpu_to_node(dst_cpu);
+       int last_cpupid, this_cpupid;
+
+       this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+       /*
+        * Multi-stage node selection is used in conjunction with a periodic
+        * migration fault to build a temporal task<->page relation. By using
+        * a two-stage filter we remove short/unlikely relations.
+        *
+        * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+        * a task's usage of a particular page (n_p) per total usage of this
+        * page (n_t) (in a given time-span) to a probability.
+        *
+        * Our periodic faults will sample this probability and getting the
+        * same result twice in a row, given these samples are fully
+        * independent, is then given by P(n)^2, provided our sample period
+        * is sufficiently short compared to the usage pattern.
+        *
+        * This quadratic squishes small probabilities, making it less likely we
+        * act on an unlikely task<->page relation.
+        */
+       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+       if (!cpupid_pid_unset(last_cpupid) &&
+                               cpupid_to_nid(last_cpupid) != dst_nid)
+               return false;
+
+       /* Always allow migrate on private faults */
+       if (cpupid_match_pid(p, last_cpupid))
+               return true;
+
+       /* A shared fault, but p->numa_group has not been set up yet. */
+       if (!ng)
+               return true;
+
+       /*
+        * Do not migrate if the destination is not a node that
+        * is actively used by this numa group.
+        */
+       if (!node_isset(dst_nid, ng->active_nodes))
+               return false;
+
+       /*
+        * Source is a node that is not actively used by this
+        * numa group, while the destination is. Migrate.
+        */
+       if (!node_isset(src_nid, ng->active_nodes))
+               return true;
+
+       /*
+        * Both source and destination are nodes in active
+        * use by this numa group. Maximize memory bandwidth
+        * by migrating from more heavily used nodes, to less
+        * heavily used ones, spreading the load around.
+        * Use a 1/4 hysteresis to avoid spurious page movement.
+        */
+       return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
  static unsigned long weighted_cpuload(const int cpu);
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1344,7 @@ static int task_numa_migrate(struct task_struct *p)
  static void numa_migrate_preferred(struct task_struct *p)
  {
         /* This task has no NUMA fault statistics yet */
-       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
                 return;
  
         /* Periodically retry migrating the task to the preferred node */
@@ -1281,6 +1358,38 @@ static void numa_migrate_preferred(struct task_struct *p)
         task_numa_migrate(p);
  }
  
+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations. To prevent flip-flopping and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+       unsigned long faults, max_faults = 0;
+       int nid;
+
+       for_each_online_node(nid) {
+               faults = group_faults_cpu(numa_group, nid);
+               if (faults > max_faults)
+                       max_faults = faults;
+       }
+
+       for_each_online_node(nid) {
+               faults = group_faults_cpu(numa_group, nid);
+               if (!node_isset(nid, numa_group->active_nodes)) {
+                       if (faults > max_faults * 6 / 16)
+                               node_set(nid, numa_group->active_nodes);
+               } else if (faults < max_faults * 3 / 16)
+                       node_clear(nid, numa_group->active_nodes);
+       }
+}
+
  /*
   * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   * increments. The more local the fault statistics are, the higher the scan
@@ -1355,11 +1464,41 @@ static void update_task_scan_period(struct task_struct *p,
         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
  }
  
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+       u64 runtime, delta, now;
+       /* Use the start of this time slice to avoid calculations. */
+       now = p->se.exec_start;
+       runtime = p->se.sum_exec_runtime;
+
+       if (p->last_task_numa_placement) {
+               delta = runtime - p->last_sum_exec_runtime;
+               *period = now - p->last_task_numa_placement;
+       } else {
+               delta = p->se.avg.runnable_avg_sum;
+               *period = p->se.avg.runnable_avg_period;
+       }
+
+       p->last_sum_exec_runtime = runtime;
+       p->last_task_numa_placement = now;
+
+       return delta;
+}
+
  static void task_numa_placement(struct task_struct *p)
  {
         int seq, nid, max_nid = -1, max_group_nid = -1;
         unsigned long max_faults = 0, max_group_faults = 0;
         unsigned long fault_types[2] = { 0, 0 };
+       unsigned long total_faults;
+       u64 runtime, period;
         spinlock_t *group_lock = NULL;
  
         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1507,10 @@ static void task_numa_placement(struct task_struct *p)
         p->numa_scan_seq = seq;
         p->numa_scan_period_max = task_scan_max(p);
  
+       total_faults = p->numa_faults_locality[0] +
+                      p->numa_faults_locality[1];
+       runtime = numa_get_avg_runtime(p, &period);
+
         /* If the task is part of a group prevent parallel updates to group stats */
         if (p->numa_group) {
                 group_lock = &p->numa_group->lock;
@@ -1379,24 +1522,37 @@ static void task_numa_placement(struct task_struct *p)
                 unsigned long faults = 0, group_faults = 0;
                 int priv, i;
  
-               for (priv = 0; priv < 2; priv++) {
-                       long diff;
+               for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+                       long diff, f_diff, f_weight;
  
                         i = task_faults_idx(nid, priv);
-                       diff = -p->numa_faults[i];
  
                         /* Decay existing window, copy faults since last scan */
-                       p->numa_faults[i] >>= 1;
-                       p->numa_faults[i] += p->numa_faults_buffer[i];
-                       fault_types[priv] += p->numa_faults_buffer[i];
-                       p->numa_faults_buffer[i] = 0;
+                       diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+                       fault_types[priv] += p->numa_faults_buffer_memory[i];
+                       p->numa_faults_buffer_memory[i] = 0;
  
-                       faults += p->numa_faults[i];
-                       diff += p->numa_faults[i];
+                       /*
+                        * Normalize the faults_cpu, so all tasks in a group
+                        * count according to CPU use, instead of by the raw
+                        * number of faults. Tasks with little runtime have
+                        * little over-all impact on throughput, and thus their
+                        * faults are less important.
+                        */
+                       f_weight = div64_u64(runtime << 16, period + 1);
+                       f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+                                  (total_faults + 1);
+                       f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+                       p->numa_faults_buffer_cpu[i] = 0;
+
+                       p->numa_faults_memory[i] += diff;
+                       p->numa_faults_cpu[i] += f_diff;
+                       faults += p->numa_faults_memory[i];
                         p->total_numa_faults += diff;
                         if (p->numa_group) {
                                 /* safe because we can only change our own group */
                                 p->numa_group->faults[i] += diff;
+                               p->numa_group->faults_cpu[i] += f_diff;
                                 p->numa_group->total_faults += diff;
                                 group_faults += p->numa_group->faults[i];
                         }
@@ -1416,6 +1572,7 @@ static void task_numa_placement(struct task_struct *p)
         update_task_scan_period(p, fault_types[0], fault_types[1]);
  
         if (p->numa_group) {
+               update_numa_active_node_mask(p->numa_group);
                 /*
                  * If the preferred task and group nids are different,
                  * iterate over the nodes again to find the best place.
@@ -1465,7 +1622,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
  
         if (unlikely(!p->numa_group)) {
                 unsigned int size = sizeof(struct numa_group) +
-                                   2*nr_node_ids*sizeof(unsigned long);
+                                   4*nr_node_ids*sizeof(unsigned long);
  
                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
                 if (!grp)
@@ -1475,9 +1632,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                 spin_lock_init(&grp->lock);
                 INIT_LIST_HEAD(&grp->task_list);
                 grp->gid = p->pid;
+               /* Second half of the array tracks nids where faults happen */
+               grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+                                               nr_node_ids;
+
+               node_set(task_node(current), grp->active_nodes);
  
-               for (i = 0; i < 2*nr_node_ids; i++)
-                       grp->faults[i] = p->numa_faults[i];
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+                       grp->faults[i] = p->numa_faults_memory[i];
  
                 grp->total_faults = p->total_numa_faults;
  
@@ -1534,9 +1696,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
  
         double_lock(&my_grp->lock, &grp->lock);
  
-       for (i = 0; i < 2*nr_node_ids; i++) {
-               my_grp->faults[i] -= p->numa_faults[i];
-               grp->faults[i] += p->numa_faults[i];
+       for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+               my_grp->faults[i] -= p->numa_faults_memory[i];
+               grp->faults[i] += p->numa_faults_memory[i];
         }
         my_grp->total_faults -= p->total_numa_faults;
         grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1724,12 @@ void task_numa_free(struct task_struct *p)
  {
         struct numa_group *grp = p->numa_group;
         int i;
-       void *numa_faults = p->numa_faults;
+       void *numa_faults = p->numa_faults_memory;
  
         if (grp) {
                 spin_lock(&grp->lock);
-               for (i = 0; i < 2*nr_node_ids; i++)
-                       grp->faults[i] -= p->numa_faults[i];
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+                       grp->faults[i] -= p->numa_faults_memory[i];
                 grp->total_faults -= p->total_numa_faults;
  
                 list_del(&p->numa_entry);
@@ -1577,18 +1739,21 @@ void task_numa_free(struct task_struct *p)
                 put_numa_group(grp);
         }
  
-       p->numa_faults = NULL;
-       p->numa_faults_buffer = NULL;
+       p->numa_faults_memory = NULL;
+       p->numa_faults_buffer_memory = NULL;
+       p->numa_faults_cpu= NULL;
+       p->numa_faults_buffer_cpu = NULL;
         kfree(numa_faults);
  }
  
  /*
   * Got a PROT_NONE fault for a page on @node.
   */
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
  {
         struct task_struct *p = current;
         bool migrated = flags & TNF_MIGRATED;
+       int cpu_node = task_node(current);
         int priv;
  
         if (!numabalancing_enabled)
@@ -1603,16 +1768,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
                 return;
  
         /* Allocate buffer to track faults on a per-node basis */
-       if (unlikely(!p->numa_faults)) {
-               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+       if (unlikely(!p->numa_faults_memory)) {
+               int size = sizeof(*p->numa_faults_memory) *
+                          NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
  
-               /* numa_faults and numa_faults_buffer share the allocation */
-               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
-               if (!p->numa_faults)
+               p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+               if (!p->numa_faults_memory)
                         return;
  
-               BUG_ON(p->numa_faults_buffer);
-               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+               BUG_ON(p->numa_faults_buffer_memory);
+               /*
+                * The averaged statistics, shared & private, memory & cpu,
+                * occupy the first half of the array. The second half of the
+                * array is for current counters, which are averaged into the
+                * first set by task_numa_placement.
+                */
+               p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+               p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+               p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
                 p->total_numa_faults = 0;
                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
         }
@@ -1641,7 +1814,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
         if (migrated)
                 p->numa_pages_migrated += pages;
  
-       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+       p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+       p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
         p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
  }
  
@@ -4783,7 +4957,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  {
         int src_nid, dst_nid;
  
-       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
             !(env->sd->flags & SD_NUMA)) {
                 return false;
         }
@@ -4814,7 +4988,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
                 return false;
  
-       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+       if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
                 return false;
  
         src_nid = cpu_to_node(env->src_cpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 49e13e1..7754ff1 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -385,13 +385,6 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
-       {
-               .procname       = "numa_balancing_migrate_deferred",
-               .data           = &sysctl_numa_balancing_migrate_deferred,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
         {
                 .procname       = "numa_balancing",
                 .data           = NULL, /* filled in by handler */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index ae3c8f3..f520b9d 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n)
         kmem_cache_free(sn_cache, n);
  }
  
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
-       /* Never defer a private fault */
-       if (cpupid_match_pid(p, last_cpupid))
-               return false;
-
-       if (p->numa_migrate_deferred) {
-               p->numa_migrate_deferred--;
-               return true;
-       }
-       return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-       p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
-}
-#else
-static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
-       return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
  /**
   * mpol_misplaced - check whether current page node is valid in policy
   *
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
  
         /* Migrate the page towards the node whose CPU is referencing it */
         if (pol->flags & MPOL_F_MORON) {
-               int last_cpupid;
-               int this_cpupid;
-
                 polnid = thisnid;
-               this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
-
-               /*
-                * Multi-stage node selection is used in conjunction
-                * with a periodic migration fault to build a temporal
-                * task<->page relation. By using a two-stage filter we
-                * remove short/unlikely relations.
-                *
-                * Using P(p) ~ n_p / n_t as per frequentist
-                * probability, we can equate a task's usage of a
-                * particular page (n_p) per total usage of this
-                * page (n_t) (in a given time-span) to a probability.
-                *
-                * Our periodic faults will sample this probability and
-                * getting the same result twice in a row, given these
-                * samples are fully independent, is then given by
-                * P(n)^2, provided our sample period is sufficiently
-                * short compared to the usage pattern.
-                *
-                * This quadric squishes small probabilities, making
-                * it less likely we act on an unlikely task<->page
-                * relation.
-                */
-               last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-               if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
  
-                       /* See sysctl_numa_balancing_migrate_deferred comment */
-                       if (!cpupid_match_pid(current, last_cpupid))
-                               defer_numa_migrate(current);
-
-                       goto out;
-               }
-
-               /*
-                * The quadratic filter above reduces extraneous migration
-                * of shared pages somewhat. This code reduces it even more,
-                * reducing the overhead of page migrations of shared pages.
-                * This makes workloads with shared pages rely more on
-                * "move task near its memory", and less on "move memory
-                * towards its task", which is exactly what we want.
-                */
-               if (numa_migrate_deferred(current, last_cpupid))
+               if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
                         goto out;
         }
author	Ingo Molnar <mingo@kernel.org>
	Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Sun, 2 Feb 2014 08:45:39 +0000 (09:45 +0100)
Documentation/sysctl/kernel.txt		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history