Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 25 Jul 2016 21:43:00 +0000 (14:43 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 25 Jul 2016 21:43:00 +0000 (14:43 -0700)
Pull NOHZ updates from Ingo Molnar:

 - fix system/idle cputime leaked on cputime accounting (all nohz
   configs) (Rik van Riel)

 - remove the messy, ad-hoc irqtime accounting on nohz-full and make it
   compatible with CONFIG_IRQ_TIME_ACCOUNTING=y instead (Rik van Riel)

 - cleanups (Frederic Weisbecker)

 - remove unnecessary irq disablement in the irqtime code (Rik van Riel)

* 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/cputime: Drop local_irq_save/restore from irqtime_account_irq()
  sched/cputime: Reorganize vtime native irqtime accounting headers
  sched/cputime: Clean up the old vtime gen irqtime accounting completely
  sched/cputime: Replace VTIME_GEN irq time code with IRQ_TIME_ACCOUNTING code
  sched/cputime: Count actually elapsed irq & softirq time

include/asm-generic/cputime_nsecs.h
include/linux/vtime.h
init/Kconfig
kernel/sched/cputime.c

index 0f1c6f3..a84e28e 100644 (file)
@@ -50,6 +50,8 @@ typedef u64 __nocast cputime64_t;
        (__force u64)(__ct)
 #define nsecs_to_cputime(__nsecs)      \
        (__force cputime_t)(__nsecs)
+#define nsecs_to_cputime64(__nsecs)    \
+       (__force cputime64_t)(__nsecs)
 
 
 /*
index fa21969..aa9bfea 100644 (file)
@@ -12,11 +12,9 @@ struct task_struct;
 /*
  * vtime_accounting_cpu_enabled() definitions/declarations
  */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
 static inline bool vtime_accounting_cpu_enabled(void) { return true; }
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
 /*
  * Checks if vtime is enabled on some CPU. Cputime readers want to be careful
  * in that case and compute the tickless cputime.
@@ -37,11 +35,9 @@ static inline bool vtime_accounting_cpu_enabled(void)
 
        return false;
 }
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 static inline bool vtime_accounting_cpu_enabled(void) { return false; }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#endif
 
 
 /*
@@ -64,35 +60,15 @@ extern void vtime_account_system(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
 extern void vtime_account_user(struct task_struct *tsk);
 
-#ifdef __ARCH_HAS_VTIME_ACCOUNT
-extern void vtime_account_irq_enter(struct task_struct *tsk);
-#else
-extern void vtime_common_account_irq_enter(struct task_struct *tsk);
-static inline void vtime_account_irq_enter(struct task_struct *tsk)
-{
-       if (vtime_accounting_cpu_enabled())
-               vtime_common_account_irq_enter(tsk);
-}
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 static inline void vtime_task_switch(struct task_struct *prev) { }
 static inline void vtime_account_system(struct task_struct *tsk) { }
 static inline void vtime_account_user(struct task_struct *tsk) { }
-static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
-
-static inline void vtime_account_irq_exit(struct task_struct *tsk)
-{
-       if (vtime_accounting_cpu_enabled())
-               vtime_gen_account_irq_exit(tsk);
-}
-
 extern void vtime_user_enter(struct task_struct *tsk);
 
 static inline void vtime_user_exit(struct task_struct *tsk)
@@ -103,11 +79,6 @@ extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_account_irq_exit(struct task_struct *tsk)
-{
-       /* On hard|softirq exit we always account to hard|softirq cputime */
-       vtime_account_system(tsk);
-}
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
@@ -115,6 +86,19 @@ static inline void vtime_guest_exit(struct task_struct *tsk) { }
 static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
 #endif
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+extern void vtime_account_irq_enter(struct task_struct *tsk);
+static inline void vtime_account_irq_exit(struct task_struct *tsk)
+{
+       /* On hard|softirq exit we always account to hard|softirq cputime */
+       vtime_account_system(tsk);
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
+static inline void vtime_account_irq_exit(struct task_struct *tsk) { }
+#endif
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 extern void irqtime_account_irq(struct task_struct *tsk);
 #else
index 7e0b24f..557bdf1 100644 (file)
@@ -375,9 +375,11 @@ config VIRT_CPU_ACCOUNTING_GEN
 
          If unsure, say N.
 
+endchoice
+
 config IRQ_TIME_ACCOUNTING
        bool "Fine granularity task level IRQ time accounting"
-       depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
+       depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
        help
          Select this option to enable fine granularity task irq time
          accounting. This is done by reading a timestamp on each
@@ -386,8 +388,6 @@ config IRQ_TIME_ACCOUNTING
 
          If in doubt, say N here.
 
-endchoice
-
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
index 3d60e5d..ea0f6f3 100644 (file)
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  */
 void irqtime_account_irq(struct task_struct *curr)
 {
-       unsigned long flags;
        s64 delta;
        int cpu;
 
        if (!sched_clock_irqtime)
                return;
 
-       local_irq_save(flags);
-
        cpu = smp_processor_id();
        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
        __this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
                __this_cpu_add(cpu_softirq_time, delta);
 
        irq_time_write_end();
-       local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
+       cputime_t irq_cputime;
 
        local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
-               ret = 1;
+       irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+                     cpustat[CPUTIME_IRQ];
+       irq_cputime = min(irq_cputime, maxtime);
+       cpustat[CPUTIME_IRQ] += irq_cputime;
        local_irq_restore(flags);
-       return ret;
+       return irq_cputime;
 }
 
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
 {
        u64 *cpustat = kcpustat_this_cpu->cpustat;
        unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
+       cputime_t softirq_cputime;
 
        local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_softirq_time);
-       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
-               ret = 1;
+       softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+                         cpustat[CPUTIME_SOFTIRQ];
+       softirq_cputime = min(softirq_cputime, maxtime);
+       cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
        local_irq_restore(flags);
-       return ret;
+       return softirq_cputime;
 }
 
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #define sched_clock_irqtime    (0)
 
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+       return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+       return 0;
+}
+
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
 static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,31 +263,44 @@ void account_idle_time(cputime_t cputime)
                cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
-static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies)
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
+               cputime_t steal_cputime;
                u64 steal;
-               unsigned long steal_jiffies;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
-               /*
-                * steal is in nsecs but our caller is expecting steal
-                * time in jiffies. Lets cast the result to jiffies
-                * granularity and account the rest on the next rounds.
-                */
-               steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies);
-               this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+               steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+               account_steal_time(steal_cputime);
+               this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
 
-               account_steal_time(jiffies_to_cputime(steal_jiffies));
-               return steal_jiffies;
+               return steal_cputime;
        }
 #endif
        return 0;
 }
 
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+       cputime_t accounted;
+
+       accounted = steal_account_process_time(max);
+
+       if (accounted < max)
+               accounted += irqtime_account_hi_update(max - accounted);
+
+       if (accounted < max)
+               accounted += irqtime_account_si_update(max - accounted);
+
+       return accounted;
+}
+
 /*
  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  * tasks (sum on group iteration) belonging to @tsk's group.
@@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                         struct rq *rq, int ticks)
 {
-       cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
-       u64 cputime = (__force u64) cputime_one_jiffy;
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+       cputime_t scaled, other;
 
-       if (steal_account_process_tick(ULONG_MAX))
+       /*
+        * When returning from idle, many ticks can get accounted at
+        * once, including some ticks of steal, irq, and softirq time.
+        * Subtract those ticks from the amount of time accounted to
+        * idle, or potentially user or system time. Due to rounding,
+        * other time can exceed ticks occasionally.
+        */
+       other = account_other_time(cputime);
+       if (other >= cputime)
                return;
+       cputime -= other;
+       scaled = cputime_to_scaled(cputime);
 
-       cputime *= ticks;
-       scaled *= ticks;
-
-       if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += cputime;
-       } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += cputime;
-       } else if (this_cpu_ksoftirqd() == p) {
+       if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
@@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev)
 }
 #endif
 
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Archs that account the whole time spent in the idle task
  * (outside irq) as idle time can rely on this and just implement
@@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-       if (!in_interrupt()) {
-               /*
-                * If we interrupted user, context_tracking_in_user()
-                * is 1 because the context tracking don't hook
-                * on irq entry/exit. This way we know if
-                * we need to flush user time on kernel entry.
-                */
-               if (context_tracking_in_user()) {
-                       vtime_account_user(tsk);
-                       return;
-               }
-
-               if (is_idle_task(tsk)) {
-                       vtime_account_idle(tsk);
-                       return;
-               }
-       }
-       vtime_account_system(tsk);
+       if (!in_interrupt() && is_idle_task(tsk))
+               vtime_account_idle(tsk);
+       else
+               vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        *ut = p->utime;
@@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime_t cputime, scaled, steal;
        struct rq *rq = this_rq();
 
        if (vtime_accounting_cpu_enabled())
@@ -477,16 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;
        }
 
-       if (steal_account_process_tick(ULONG_MAX))
+       cputime = cputime_one_jiffy;
+       steal = steal_account_process_time(cputime);
+
+       if (steal >= cputime)
                return;
 
+       cputime -= steal;
+       scaled = cputime_to_scaled(cputime);
+
        if (user_tick)
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_user_time(p, cputime, scaled);
        else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-               account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-                                   one_jiffy_scaled);
+               account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
        else
-               account_idle_time(cputime_one_jiffy);
+               account_idle_time(cputime);
 }
 
 /*
@@ -681,14 +694,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
 static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
        unsigned long now = READ_ONCE(jiffies);
-       unsigned long delta_jiffies, steal_jiffies;
+       cputime_t delta, other;
 
-       delta_jiffies = now - tsk->vtime_snap;
-       steal_jiffies = steal_account_process_tick(delta_jiffies);
+       delta = jiffies_to_cputime(now - tsk->vtime_snap);
+       other = account_other_time(delta);
        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
        tsk->vtime_snap = now;
 
-       return jiffies_to_cputime(delta_jiffies - steal_jiffies);
+       return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
@@ -708,16 +721,6 @@ void vtime_account_system(struct task_struct *tsk)
        write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
-       write_seqcount_begin(&tsk->vtime_seqcount);
-       if (vtime_delta(tsk))
-               __vtime_account_system(tsk);
-       if (context_tracking_in_user())
-               tsk->vtime_snap_whence = VTIME_USER;
-       write_seqcount_end(&tsk->vtime_seqcount);
-}
-
 void vtime_account_user(struct task_struct *tsk)
 {
        cputime_t delta_cpu;