Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9..9af5af9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * NUMA task scan period settings, in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
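+       /* Update placement at most once per mm-wide scan pass */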
+       if (p->numa_scan_seq == seq)
+               return;
+       p->numa_scan_seq = seq;
+
+       /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+       struct task_struct *p = current;
+
+       if (!sched_feat_numa(NUMA))
+               return;
+
+       /* FIXME: Allocate task-specific structure for placement policy here */
+
+       /*
+        * If pages are properly placed (did not migrate) then scan slower.
+        * This is reset periodically in case of phase changes
+        */
+       if (!migrated)
+               p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+                       p->numa_scan_period + jiffies_to_msecs(10));
+
+       task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
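+       /* New scan pass: bump the mm-wide generation and rewind the offset */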
+       ACCESS_ONCE(p->mm->numa_scan_seq)++;
+       p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+       unsigned long migrate, next_scan, now = jiffies;
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+       struct vm_area_struct *vma;
+       unsigned long start, end;
+       long pages;
+
+       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+       work->next = work; /* protect against double add */
+       /*
+        * Who cares about NUMA placement when they're dying.
+        *
+        * NOTE: make sure not to dereference p->mm before this check,
+        * exit_task_work() happens _after_ exit_mm() so we could be called
+        * without p->mm even though we still had it when we enqueued this
+        * work.
+        */
+       if (p->flags & PF_EXITING)
+               return;
+
+       /*
+        * We do not care about task placement until a task runs on a node
+        * other than the first one used by the address space. This is
+        * largely because migrations are driven by what CPU the task
+        * is running on. If it's never scheduled on another node, it'll
+        * not migrate so why bother trapping the fault.
+        */
+       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+               mm->first_nid = numa_node_id();
+       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+               /* Are we running on a new node yet? */
+               if (numa_node_id() == mm->first_nid &&
+                   !sched_feat_numa(NUMA_FORCE))
+                       return;
+
+               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+       }
+
+       /*
+        * Reset the scan period if enough time has gone by. Objective is that
+        * scanning will be reduced if pages are properly placed. As tasks
+        * can enter different phases this needs to be re-examined. Lacking
+        * proper tracking of reference behaviour, this blunt hammer is used.
+        */
+       migrate = mm->numa_next_reset;
+       if (time_after(now, migrate)) {
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+               next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+               xchg(&mm->numa_next_reset, next_scan);
+       }
+
+       /*
+        * Enforce the maximum scan/migration frequency.
+        */
+       migrate = mm->numa_next_scan;
+       if (time_before(now, migrate))
+               return;
+
+       if (p->numa_scan_period == 0)
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
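+       /*
+        * Claim this scan window with cmpxchg; if another thread sharing
+        * the mm already advanced numa_next_scan, let that thread scan.
+        */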
+       next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+               return;
+
+       /*
+        * Do not set pte_numa if the current running node is rate-limited.
+        * This loses statistics on the fault but if we are unwilling to
+        * migrate to this node, it is less likely we can do useful work
+        */
+       if (migrate_ratelimited(numa_node_id()))
+               return;
+
+       start = mm->numa_scan_offset;
+       pages = sysctl_numa_balancing_scan_size;
+       pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+       if (!pages)
+               return;
+
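+       /*
+        * Resume from where the last pass left off; wrap back to the first
+        * VMA if the saved offset runs past the end of the address space.
+        */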
+       down_read(&mm->mmap_sem);
+       vma = find_vma(mm, start);
+       if (!vma) {
+               reset_ptenuma_scan(p);
+               start = 0;
+               vma = mm->mmap;
+       }
+       for (; vma; vma = vma->vm_next) {
+               if (!vma_migratable(vma))
+                       continue;
+
+               /* Skip small VMAs. They are not likely to be of relevance */
+               if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+                       continue;
+
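+               /*
+                * Mark this VMA pte_numa in HPAGE_SIZE-aligned chunks so
+                * that later accesses fault into task_numa_fault(), and
+                * stop once the page budget for this pass is spent.
+                */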
+               do {
+                       start = max(start, vma->vm_start);
+                       end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+                       end = min(end, vma->vm_end);
+                       pages -= change_prot_numa(vma, start, end);
+
+                       start = end;
+                       if (pages <= 0)
+                               goto out;
+               } while (end != vma->vm_end);
+       }
+
+out:
+       /*
+        * It is possible to reach the end of the VMA list but the last few
+        * VMAs are not guaranteed to be vma_migratable. If they are not, we
+        * would find the !migratable VMA on the next scan but not reset the
+        * scanner to the start so check it now.
+        */
+       if (vma)
+               mm->numa_scan_offset = start;
+       else
+               reset_ptenuma_scan(p);
+       up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults.
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+       struct callback_head *work = &curr->numa_work;
+       u64 period, now;
+
+       /*
+        * We don't care about NUMA placement if we don't have memory.
+        */
+       if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+               return;
+
+       /*
+        * Using runtime rather than walltime has the dual advantage that
+        * we (mostly) drive the selection from busy threads and that the
+        * task needs to have done some actual work before we bother with
+        * NUMA placement.
+        */
+       now = curr->se.sum_exec_runtime;
+       period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
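+       /*
+        * Once a full scan period of CPU time has accumulated, consider
+        * queueing task_numa_work() to do the actual PTE scan.
+        */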
+       if (now - curr->node_stamp > period) {
+               if (!curr->node_stamp)
+                       curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+               curr->node_stamp = now;
+
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                       task_work_add(curr, work, true);
+               }
+       }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                entity_tick(cfs_rq, se, queued);
        }
 
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
+
        update_rq_runnable_avg(rq, 1);
 }