Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
author    Linus Torvalds <torvalds@linux-foundation.org>  Fri, 14 Oct 2016 18:46:25 +0000 (11:46 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>  Fri, 14 Oct 2016 18:46:25 +0000 (11:46 -0700)
Pull percpu updates from Tejun Heo:

 - Nick improved the generic implementations of percpu operations that
   modify the variable and return a value so that they calculate the
   percpu address only once (see the sketch after this list).

 - percpu_ref percpu <-> atomic mode switching improvements.  The
   patchset was originally posted about a year ago but fell through
   the cracks (a usage sketch follows the shortlog below).

 - misc non-critical fixes.
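
As an illustration of the first item, here is a minimal userspace C sketch
(compiled with GNU extensions) of what the modify-return change amounts to.
my_cpu_ptr(), old_add_return() and new_add_return() are made-up stand-ins for
the kernel's raw_cpu_ptr()/raw_cpu_generic_add_return() machinery; the real
change is the include/asm-generic/percpu.h hunk further down.

  /* Illustrative stand-ins only -- not kernel APIs. */
  #include <stdio.h>

  #define NR_CPUS 4
  static int this_cpu;                          /* pretend current CPU id */
  #define my_cpu_ptr(var) (&(var)[this_cpu])    /* stand-in for raw_cpu_ptr() */

  /* Old expansion: the percpu pointer is computed twice, once per accessor. */
  #define old_add_return(var, val) ({                   \
          *my_cpu_ptr(var) += (val);                    \
          *my_cpu_ptr(var);                             \
  })

  /* New expansion (cf. raw_cpu_generic_add_return): pointer computed once. */
  #define new_add_return(var, val) ({                   \
          __typeof__(&(var)[0]) __p = my_cpu_ptr(var);  \
          *__p += (val);                                \
          *__p;                                         \
  })

  int main(void)
  {
          static int counter[NR_CPUS];

          printf("%d\n", old_add_return(counter, 2));   /* prints 2 */
          printf("%d\n", new_add_return(counter, 3));   /* prints 5 */
          return 0;
  }

The point is simply that __p is evaluated a single time, which lets the
compiler emit a single address computation on architectures without dedicated
percpu addressing.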

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu:
  mm/percpu.c: fix potential memory leakage for pcpu_embed_first_chunk()
  mm/percpu.c: correct max_distance calculation for pcpu_embed_first_chunk()
  percpu: eliminate two sparse warnings
  percpu: improve generic percpu modify-return implementation
  percpu-refcount: init ->confirm_switch member properly
  percpu_ref: allow operation mode switching operations to be called concurrently
  percpu_ref: restructure operation mode switching
  percpu_ref: unify staggered atomic switching wait behavior
  percpu_ref: reorganize __percpu_ref_switch_to_atomic() and relocate percpu_ref_switch_to_atomic()
  percpu_ref: remove unnecessary RCU grace period for staggered atomic switching confirmation
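
For context on the percpu_ref rework below (lib/percpu-refcount.c), a minimal
kernel-style sketch of the usage pattern the reworked switching paths serve.
It will not build outside a kernel tree; struct my_obj, my_obj_release() and
the surrounding functions are invented for illustration, while the
percpu_ref_* calls are the existing API from include/linux/percpu-refcount.h.

  #include <linux/kernel.h>
  #include <linux/percpu-refcount.h>
  #include <linux/slab.h>

  struct my_obj {
          struct percpu_ref ref;
          /* ... payload ... */
  };

  static void my_obj_release(struct percpu_ref *ref)
  {
          struct my_obj *obj = container_of(ref, struct my_obj, ref);

          kfree(obj);
  }

  static struct my_obj *my_obj_create(void)
  {
          struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

          if (!obj)
                  return NULL;
          /* starts in percpu mode; get/put remain cheap per-CPU ops */
          if (percpu_ref_init(&obj->ref, my_obj_release, 0, GFP_KERNEL)) {
                  kfree(obj);
                  return NULL;
          }
          return obj;
  }

  static void my_obj_quiesce(struct my_obj *obj)
  {
          /*
           * Collapse the per-CPU counts into the atomic counter.  After
           * this series, concurrent switch/kill/reinit callers are
           * serialized by percpu_ref_switch_lock rather than racing on
           * ->confirm_switch.
           */
          percpu_ref_switch_to_atomic(&obj->ref, NULL);
  }

  static void my_obj_destroy(struct my_obj *obj)
  {
          /* drop the initial ref; my_obj_release() runs once refs hit zero */
          percpu_ref_kill(&obj->ref);
  }

As the updated comments in the patch note, the switching calls may block if a
previous switch to atomic mode is still in flight.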

arch/x86/include/asm/percpu.h
include/asm-generic/percpu.h
lib/percpu-refcount.c
mm/percpu.c

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e02e3f8..84f58de 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -521,7 +521,8 @@ do {                                                                        \
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
                         const unsigned long __percpu *addr)
 {
-       unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+       unsigned long __percpu *a =
+               (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
 
 #ifdef CONFIG_X86_64
        return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
        asm volatile("bt "__percpu_arg(2)",%1\n\t"
                        CC_SET(c)
                        : CC_OUT(c) (oldbit)
-                       : "m" (*(unsigned long *)addr), "Ir" (nr));
+                       : "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
 
        return oldbit;
 }
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)                                      \
+({                                                                     \
+       *raw_cpu_ptr(&(pcp));                                           \
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)                            \
 do {                                                                   \
        *raw_cpu_ptr(&(pcp)) op val;                                    \
@@ -72,34 +77,39 @@ do {                                                                        \
 
 #define raw_cpu_generic_add_return(pcp, val)                           \
 ({                                                                     \
-       raw_cpu_add(pcp, val);                                          \
-       raw_cpu_read(pcp);                                              \
+       typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));                       \
+                                                                       \
+       *__p += val;                                                    \
+       *__p;                                                           \
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)                                        \
 ({                                                                     \
+       typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));                       \
        typeof(pcp) __ret;                                              \
-       __ret = raw_cpu_read(pcp);                                      \
-       raw_cpu_write(pcp, nval);                                       \
+       __ret = *__p;                                                   \
+       *__p = nval;                                                    \
        __ret;                                                          \
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)                       \
 ({                                                                     \
+       typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));                       \
        typeof(pcp) __ret;                                              \
-       __ret = raw_cpu_read(pcp);                                      \
+       __ret = *__p;                                                   \
        if (__ret == (oval))                                            \
-               raw_cpu_write(pcp, nval);                               \
+               *__p = nval;                                            \
        __ret;                                                          \
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({                                                                     \
+       typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));                    \
+       typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));                    \
        int __ret = 0;                                                  \
-       if (raw_cpu_read(pcp1) == (oval1) &&                            \
-                        raw_cpu_read(pcp2)  == (oval2)) {              \
-               raw_cpu_write(pcp1, nval1);                             \
-               raw_cpu_write(pcp2, nval2);                             \
+       if (*__p1 == (oval1) && *__p2  == (oval2)) {                    \
+               *__p1 = nval1;                                          \
+               *__p2 = nval2;                                          \
                __ret = 1;                                              \
        }                                                               \
        (__ret);                                                        \
@@ -109,7 +119,7 @@ do {                                                                        \
 ({                                                                     \
        typeof(pcp) __ret;                                              \
        preempt_disable();                                              \
-       __ret = *this_cpu_ptr(&(pcp));                                  \
+       __ret = raw_cpu_generic_read(pcp);                              \
        preempt_enable();                                               \
        __ret;                                                          \
 })
@@ -118,17 +128,17 @@ do {                                                                      \
 do {                                                                   \
        unsigned long __flags;                                          \
        raw_local_irq_save(__flags);                                    \
-       *raw_cpu_ptr(&(pcp)) op val;                                    \
+       raw_cpu_generic_to_op(pcp, val, op);                            \
        raw_local_irq_restore(__flags);                                 \
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)                          \
 ({                                                                     \
        typeof(pcp) __ret;                                              \
        unsigned long __flags;                                          \
        raw_local_irq_save(__flags);                                    \
-       raw_cpu_add(pcp, val);                                          \
-       __ret = raw_cpu_read(pcp);                                      \
+       __ret = raw_cpu_generic_add_return(pcp, val);                   \
        raw_local_irq_restore(__flags);                                 \
        __ret;                                                          \
 })
@@ -138,8 +148,7 @@ do {                                                                        \
        typeof(pcp) __ret;                                              \
        unsigned long __flags;                                          \
        raw_local_irq_save(__flags);                                    \
-       __ret = raw_cpu_read(pcp);                                      \
-       raw_cpu_write(pcp, nval);                                       \
+       __ret = raw_cpu_generic_xchg(pcp, nval);                        \
        raw_local_irq_restore(__flags);                                 \
        __ret;                                                          \
 })
@@ -149,9 +158,7 @@ do {                                                                        \
        typeof(pcp) __ret;                                              \
        unsigned long __flags;                                          \
        raw_local_irq_save(__flags);                                    \
-       __ret = raw_cpu_read(pcp);                                      \
-       if (__ret == (oval))                                            \
-               raw_cpu_write(pcp, nval);                               \
+       __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);               \
        raw_local_irq_restore(__flags);                                 \
        __ret;                                                          \
 })
@@ -168,16 +175,16 @@ do {                                                                      \
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)            (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)            raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)            (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)            raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)            (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)            raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)            (*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)            raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 27fe749..9ac959e 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,6 +33,7 @@
 
 #define PERCPU_COUNT_BIAS      (1LU << (BITS_PER_LONG - 1))
 
+static DEFINE_SPINLOCK(percpu_ref_switch_lock);
 static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
 
 static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
        atomic_long_set(&ref->count, start_count);
 
        ref->release = release;
+       ref->confirm_switch = NULL;
        return 0;
 }
 EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
 
        if (percpu_count) {
+               /* non-NULL confirm_switch indicates switching in progress */
+               WARN_ON_ONCE(ref->confirm_switch);
                free_percpu(percpu_count);
                ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
        }
@@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
 static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                          percpu_ref_func_t *confirm_switch)
 {
-       if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
-               /* switching from percpu to atomic */
-               ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-
-               /*
-                * Non-NULL ->confirm_switch is used to indicate that
-                * switching is in progress.  Use noop one if unspecified.
-                */
-               WARN_ON_ONCE(ref->confirm_switch);
-               ref->confirm_switch =
-                       confirm_switch ?: percpu_ref_noop_confirm_switch;
-
-               percpu_ref_get(ref);    /* put after confirmation */
-               call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
-       } else if (confirm_switch) {
-               /*
-                * Somebody already set ATOMIC.  Switching may still be in
-                * progress.  @confirm_switch must be invoked after the
-                * switching is complete and a full sched RCU grace period
-                * has passed.  Wait synchronously for the previous
-                * switching and schedule @confirm_switch invocation.
-                */
-               wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-               ref->confirm_switch = confirm_switch;
-
-               percpu_ref_get(ref);    /* put after confirmation */
-               call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
+       if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
+               if (confirm_switch)
+                       confirm_switch(ref);
+               return;
        }
-}
 
-/**
- * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
- * @ref: percpu_ref to switch to atomic mode
- * @confirm_switch: optional confirmation callback
- *
- * There's no reason to use this function for the usual reference counting.
- * Use percpu_ref_kill[_and_confirm]().
- *
- * Schedule switching of @ref to atomic mode.  All its percpu counts will
- * be collected to the main atomic counter.  On completion, when all CPUs
- * are guaraneed to be in atomic mode, @confirm_switch, which may not
- * block, is invoked.  This function may be invoked concurrently with all
- * the get/put operations and can safely be mixed with kill and reinit
- * operations.  Note that @ref will stay in atomic mode across kill/reinit
- * cycles until percpu_ref_switch_to_percpu() is called.
- *
- * This function normally doesn't block and can be called from any context
- * but it may block if @confirm_kill is specified and @ref is already in
- * the process of switching to atomic mode.  In such cases, @confirm_switch
- * will be invoked after the switching is complete.
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
- */
-void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
-                                percpu_ref_func_t *confirm_switch)
-{
-       ref->force_atomic = true;
-       __percpu_ref_switch_to_atomic(ref, confirm_switch);
+       /* switching from percpu to atomic */
+       ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+
+       /*
+        * Non-NULL ->confirm_switch is used to indicate that switching is
+        * in progress.  Use noop one if unspecified.
+        */
+       ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
+
+       percpu_ref_get(ref);    /* put after confirmation */
+       call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
 }
 
 static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
@@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
        if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
                return;
 
-       wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-
        atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
 
        /*
@@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
                          ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
 }
 
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+                                    percpu_ref_func_t *confirm_switch)
+{
+       lockdep_assert_held(&percpu_ref_switch_lock);
+
+       /*
+        * If the previous ATOMIC switching hasn't finished yet, wait for
+        * its completion.  If the caller ensures that ATOMIC switching
+        * isn't in progress, this function can be called from any context.
+        */
+       wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
+                           percpu_ref_switch_lock);
+
+       if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
+               __percpu_ref_switch_to_atomic(ref, confirm_switch);
+       else
+               __percpu_ref_switch_to_percpu(ref);
+}
+
+/**
+ * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ * @confirm_switch: optional confirmation callback
+ *
+ * There's no reason to use this function for the usual reference counting.
+ * Use percpu_ref_kill[_and_confirm]().
+ *
+ * Schedule switching of @ref to atomic mode.  All its percpu counts will
+ * be collected to the main atomic counter.  On completion, when all CPUs
+ * are guaraneed to be in atomic mode, @confirm_switch, which may not
+ * block, is invoked.  This function may be invoked concurrently with all
+ * the get/put operations and can safely be mixed with kill and reinit
+ * operations.  Note that @ref will stay in atomic mode across kill/reinit
+ * cycles until percpu_ref_switch_to_percpu() is called.
+ *
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
+ */
+void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+                                percpu_ref_func_t *confirm_switch)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+       ref->force_atomic = true;
+       __percpu_ref_switch_mode(ref, confirm_switch);
+
+       spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+
 /**
  * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
  * @ref: percpu_ref to switch to percpu mode
@@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  * dying or dead, the actual switching takes place on the following
  * percpu_ref_reinit().
  *
- * This function normally doesn't block and can be called from any context
- * but it may block if @ref is in the process of switching to atomic mode
- * by percpu_ref_switch_atomic().
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
  */
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 {
+       unsigned long flags;
+
+       spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
        ref->force_atomic = false;
+       __percpu_ref_switch_mode(ref, NULL);
 
-       /* a dying or dead ref can't be switched to percpu mode w/o reinit */
-       if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
-               __percpu_ref_switch_to_percpu(ref);
+       spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 
 /**
@@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  *
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
- * process of switching to atomic mode by percpu_ref_switch_atomic().
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * process of switching to atomic mode by percpu_ref_switch_to_atomic().
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill)
 {
+       unsigned long flags;
+
+       spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
        WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
                  "%s called more than once on %pf!", __func__, ref->release);
 
        ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
-       __percpu_ref_switch_to_atomic(ref, confirm_kill);
+       __percpu_ref_switch_mode(ref, confirm_kill);
        percpu_ref_put(ref);
+
+       spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
 
@@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
  */
 void percpu_ref_reinit(struct percpu_ref *ref)
 {
+       unsigned long flags;
+
+       spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
        WARN_ON_ONCE(!percpu_ref_is_zero(ref));
 
        ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
        percpu_ref_get(ref);
-       if (!ref->force_atomic)
-               __percpu_ref_switch_to_percpu(ref);
+       __percpu_ref_switch_mode(ref, NULL);
+
+       spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_reinit);
diff --git a/mm/percpu.c b/mm/percpu.c
index 9903830..2557143 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1961,8 +1961,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
-       size_t size_sum, areas_size, max_distance;
-       int group, i, rc;
+       size_t size_sum, areas_size;
+       unsigned long max_distance;
+       int group, i, highest_group, rc;
 
        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
@@ -1978,7 +1979,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                goto out_free;
        }
 
-       /* allocate, copy and determine base address */
+       /* allocate, copy and determine base address & max_distance */
+       highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
@@ -1999,6 +2001,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                areas[group] = ptr;
 
                base = min(ptr, base);
+               if (ptr > areas[highest_group])
+                       highest_group = group;
+       }
+       max_distance = areas[highest_group] - base;
+       max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
+
+       /* warn if maximum distance is further than 75% of vmalloc space */
+       if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+               pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
+                               max_distance, VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+               /* and fail if we have fallback */
+               rc = -EINVAL;
+               goto out_free_areas;
+#endif
        }
 
        /*
@@ -2023,23 +2040,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
        }
 
        /* base address is now known, determine group base offsets */
-       max_distance = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
-               max_distance = max_t(size_t, max_distance,
-                                    ai->groups[group].base_offset);
-       }
-       max_distance += ai->unit_size;
-
-       /* warn if maximum distance is further than 75% of vmalloc space */
-       if (max_distance > VMALLOC_TOTAL * 3 / 4) {
-               pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
-                       max_distance, VMALLOC_TOTAL);
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
-               /* and fail if we have fallback */
-               rc = -EINVAL;
-               goto out_free;
-#endif
        }
 
        pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
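
As a footnote on the mm/percpu.c hunk above: the max_distance fix is easiest
to see with made-up numbers.  The old code added a single unit_size to the
largest group offset, while the new code accounts for every unit of the
highest-placed group; the standalone C sketch below (hypothetical sizes) shows
how the old formula underestimates when that group has more than one unit.

  #include <stdio.h>

  struct group { unsigned long base_offset; unsigned int nr_units; };

  int main(void)
  {
          const unsigned long unit_size = 1UL << 20;   /* 1 MiB per unit */
          /* group 1 sits highest in the layout and has 4 units */
          struct group g[] = {
                  { .base_offset = 0,          .nr_units = 2 },
                  { .base_offset = 16UL << 20, .nr_units = 4 },
          };

          /* old formula: highest base_offset + one unit_size -> 17 MiB */
          unsigned long old = g[1].base_offset + unit_size;

          /* new formula: highest group's base + all of its units -> 20 MiB */
          unsigned long new = g[1].base_offset + unit_size * g[1].nr_units;

          printf("old=%lu MiB new=%lu MiB\n", old >> 20, new >> 20);
          return 0;
  }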