Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu

author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Mar 2010 15:34:18 +0000 (07:34 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Mar 2010 15:34:18 +0000 (07:34 -0800)
diff --combined arch/powerpc/include/asm/local.h

index ce58c80,227753d..c2410af
--- 1/arch/powerpc/include/asm/local.h
--- 2/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@@ -24,7 -24,7 +24,7 @@@ static __inline__ long local_add_return
         long t;
   
         __asm__ __volatile__(
- -"1:"  PPC_LLARX       "%0,0,%2                # local_add_return\n\
+ +"1:"  PPC_LLARX(%0,0,%2,0) "                  # local_add_return\n\
         add     %0,%1,%0\n"
         PPC405_ERR77(0,%2)
         PPC_STLCX       "%0,0,%2 \n\
@@@ -43,7 -43,7 +43,7 @@@ static __inline__ long local_sub_return
         long t;
   
         __asm__ __volatile__(
- -"1:"  PPC_LLARX       "%0,0,%2                # local_sub_return\n\
+ +"1:"  PPC_LLARX(%0,0,%2,0) "                  # local_sub_return\n\
         subf    %0,%1,%0\n"
         PPC405_ERR77(0,%2)
         PPC_STLCX       "%0,0,%2 \n\
@@@ -60,7 -60,7 +60,7 @@@ static __inline__ long local_inc_return
         long t;
   
         __asm__ __volatile__(
- -"1:"  PPC_LLARX       "%0,0,%1                # local_inc_return\n\
+ +"1:"  PPC_LLARX(%0,0,%1,0) "                  # local_inc_return\n\
         addic   %0,%0,1\n"
         PPC405_ERR77(0,%1)
         PPC_STLCX       "%0,0,%1 \n\
@@@ -87,7 -87,7 +87,7 @@@ static __inline__ long local_dec_return
         long t;
   
         __asm__ __volatile__(
- -"1:"  PPC_LLARX       "%0,0,%1                # local_dec_return\n\
+ +"1:"  PPC_LLARX(%0,0,%1,0) "                  # local_dec_return\n\
         addic   %0,%0,-1\n"
         PPC405_ERR77(0,%1)
         PPC_STLCX       "%0,0,%1\n\
@@@ -117,7 -117,7 +117,7 @@@ static __inline__ int local_add_unless(
         long t;
   
         __asm__ __volatile__ (
- -"1:"  PPC_LLARX       "%0,0,%1                # local_add_unless\n\
+ +"1:"  PPC_LLARX(%0,0,%1,0) "                  # local_add_unless\n\
         cmpw    0,%0,%3 \n\
         beq-    2f \n\
         add     %0,%2,%0 \n"
@@@ -147,7 -147,7 +147,7 @@@ static __inline__ long local_dec_if_pos
         long t;
   
         __asm__ __volatile__(
- -"1:"  PPC_LLARX       "%0,0,%1                # local_dec_if_positive\n\
+ +"1:"  PPC_LLARX(%0,0,%1,0) "                  # local_dec_if_positive\n\
         cmpwi   %0,1\n\
         addi    %0,%0,-1\n\
         blt-    2f\n"
@@@ -172,29 -172,4 +172,4 @@@
   #define __local_add(i,l)      ((l)->a.counter+=(i))
   #define __local_sub(i,l)      ((l)->a.counter-=(i))
   
- /* Need to disable preemption for the cpu local counters otherwise we could
-    still access a variable of a previous CPU in a non atomic way. */
- #define cpu_local_wrap_v(l)           \
-       ({ local_t res__;               \
-          preempt_disable();           \
-          res__ = (l);                 \
-          preempt_enable();            \
-          res__; })
- #define cpu_local_wrap(l)             \
-       ({ preempt_disable();           \
-          l;                           \
-          preempt_enable(); })         \
- 
- #define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
- #define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
- #define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var(l)))
- #define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var(l)))
- #define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
- #define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
- 
- #define __cpu_local_inc(l)    cpu_local_inc(l)
- #define __cpu_local_dec(l)    cpu_local_dec(l)
- #define __cpu_local_add(i, l) cpu_local_add((i), (l))
- #define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
- 
   #endif /* _ARCH_POWERPC_LOCAL_H */
diff --combined arch/x86/include/asm/system.h

index e04740f,e529f26..b8fe48e
--- 1/arch/x86/include/asm/system.h
--- 2/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@@ -11,9 -11,9 +11,9 @@@
   #include <linux/irqflags.h>
   
   /* entries in ARCH_DLINFO: */
- -#ifdef CONFIG_IA32_EMULATION
+ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
   # define AT_VECTOR_SIZE_ARCH 2
- -#else
+ +#else /* else it's non-compat x86-64 */
   # define AT_VECTOR_SIZE_ARCH 1
   #endif
   
@@@ -32,7 -32,7 +32,7 @@@ extern void show_regs_common(void)
         "movl %P[task_canary](%[next]), %%ebx\n\t"                      \
         "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
   #define __switch_canary_oparam                                                \
-       , [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
+       , [stack_canary] "=m" (stack_canary.canary)
   #define __switch_canary_iparam                                                \
         , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
   #else /* CC_STACKPROTECTOR */
@@@ -114,7 -114,7 +114,7 @@@ do {                                                                       
         "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
         "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
   #define __switch_canary_oparam                                                  \
-       , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+       , [gs_canary] "=m" (irq_stack_union.stack_canary)
   #define __switch_canary_iparam                                                  \
         , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
   #else /* CC_STACKPROTECTOR */
@@@ -133,7 -133,7 +133,7 @@@
              __switch_canary                                              \
              "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
              "movq %%rax,%%rdi\n\t"                                       \
-            "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"         \
+            "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"                 \
              "jnz   ret_from_fork\n\t"                                    \
              RESTORE_CONTEXT                                              \
              : "=a" (last)                                                \
@@@ -143,7 -143,7 +143,7 @@@
                [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
                [_tif_fork] "i" (_TIF_FORK),                               \
                [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-              [current_task] "m" (per_cpu_var(current_task))             \
+              [current_task] "m" (current_task)                          \
                __switch_canary_iparam                                     \
              : "memory", "cc" __EXTRA_CLOBBER)
   #endif
diff --combined drivers/acpi/processor_perflib.c

index a959f6a,8c6a649..d648a98
--- 1/drivers/acpi/processor_perflib.c
--- 2/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@@ -413,11 -413,7 +413,11 @@@ static int acpi_processor_get_performan
         if (result)
                 goto update_bios;
   
- -      return 0;
+ +      /* We need to call _PPC once when cpufreq starts */
+ +      if (ignore_ppc != 1)
+ +              result = acpi_processor_get_platform_limit(pr);
+ +
+ +      return result;
   
         /*
          * Having _PPC but missing frequencies (_PSS, _PCT) is a very good hint that
@@@ -561,7 -557,7 +561,7 @@@ end
   }
   
   int acpi_processor_preregister_performance(
-               struct acpi_processor_performance *performance)
+               struct acpi_processor_performance __percpu *performance)
   {
         int count, count_target;
         int retval = 0;
diff --combined drivers/dma/dmaengine.c

index e7a3230,4eadd98..87399ca
--- 1/drivers/dma/dmaengine.c
--- 2/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@@ -284,7 -284,7 +284,7 @@@ struct dma_chan_tbl_ent 
   /**
    * channel_table - percpu lookup table for memory-to-memory offload providers
    */
- static struct dma_chan_tbl_ent *channel_table[DMA_TX_TYPE_END];
+ static struct dma_chan_tbl_ent __percpu *channel_table[DMA_TX_TYPE_END];
   
   static int __init dma_channel_table_init(void)
   {
@@@ -826,7 -826,6 +826,7 @@@ void dma_async_device_unregister(struc
                 chan->dev->chan = NULL;
                 mutex_unlock(&dma_list_mutex);
                 device_unregister(&chan->dev->device);
+ +              free_percpu(chan->local);
         }
   }
   EXPORT_SYMBOL(dma_async_device_unregister);
diff --combined drivers/edac/amd64_edac.c

index 3391e67,7b36c88..7cd1cdc
--- 1/drivers/edac/amd64_edac.c
--- 2/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@@ -13,7 -13,7 +13,7 @@@ module_param(report_gart_errors, int, 0
   static int ecc_enable_override;
   module_param(ecc_enable_override, int, 0644);
   
- static struct msr *msrs;
+ static struct msr __percpu *msrs;
   
   /* Lookup table for all possible MC control instances */
   struct amd64_pvt;
@@@ -2658,11 -2658,10 +2658,11 @@@ static void amd64_restore_ecc_error_rep
    * the memory system completely. A command line option allows to force-enable
    * hardware ECC later in amd64_enable_ecc_error_reporting().
    */
- -static const char *ecc_warning =
- -      "WARNING: ECC is disabled by BIOS. Module will NOT be loaded.\n"
- -      " Either Enable ECC in the BIOS, or set 'ecc_enable_override'.\n"
- -      " Also, use of the override can cause unknown side effects.\n";
+ +static const char *ecc_msg =
+ +      "ECC disabled in the BIOS or no ECC capability, module will not load.\n"
+ +      " Either enable ECC checking or force module loading by setting "
+ +      "'ecc_enable_override'.\n"
+ +      " (Note that use of the override may cause unknown side effects.)\n";
   
   static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
   {
@@@ -2674,7 -2673,7 +2674,7 @@@
   
         ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
         if (!ecc_enabled)
- -              amd64_printk(KERN_WARNING, "This node reports that Memory ECC "
+ +              amd64_printk(KERN_NOTICE, "This node reports that Memory ECC "
                              "is currently disabled, set F3x%x[22] (%s).\n",
                              K8_NBCFG, pci_name(pvt->misc_f3_ctl));
         else
@@@ -2682,13 -2681,13 +2682,13 @@@
   
         nb_mce_en = amd64_nb_mce_bank_enabled_on_node(pvt->mc_node_id);
         if (!nb_mce_en)
- -              amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR "
+ +              amd64_printk(KERN_NOTICE, "NB MCE bank disabled, set MSR "
                              "0x%08x[4] on node %d to enable.\n",
                              MSR_IA32_MCG_CTL, pvt->mc_node_id);
   
         if (!ecc_enabled || !nb_mce_en) {
                 if (!ecc_enable_override) {
- -                      amd64_printk(KERN_WARNING, "%s", ecc_warning);
+ +                      amd64_printk(KERN_NOTICE, "%s", ecc_msg);
                         return -ENODEV;
                 }
                 ecc_enable_override = 0;
diff --combined drivers/md/raid5.c

index 509c8f3,77cb3ab..70ffbd0
--- 1/drivers/md/raid5.c
--- 2/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@@ -3739,7 -3739,7 +3739,7 @@@ static int bio_fits_rdev(struct bio *bi
         if ((bi->bi_size>>9) > queue_max_sectors(q))
                 return 0;
         blk_recount_segments(q, bi);
- -      if (bi->bi_phys_segments > queue_max_phys_segments(q))
+ +      if (bi->bi_phys_segments > queue_max_segments(q))
                 return 0;
   
         if (q->merge_bvec_fn)
@@@ -4680,7 -4680,7 +4680,7 @@@ static int raid5_alloc_percpu(raid5_con
   {
         unsigned long cpu;
         struct page *spare_page;
-       struct raid5_percpu *allcpus;
+       struct raid5_percpu __percpu *allcpus;
         void *scribble;
         int err;
   
@@@ -5136,8 -5136,9 +5136,8 @@@ static int stop(mddev_t *mddev
         mddev->thread = NULL;
         mddev->queue->backing_dev_info.congested_fn = NULL;
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
- -      sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
         free_conf(conf);
- -      mddev->private = NULL;
+ +      mddev->private = &raid5_attrs_group;
         return 0;
   }
   
@@@ -5463,11 -5464,11 +5463,11 @@@ static int raid5_start_reshape(mddev_t 
                     !test_bit(Faulty, &rdev->flags)) {
                         if (raid5_add_disk(mddev, rdev) == 0) {
                                 char nm[20];
- -                              if (rdev->raid_disk >= conf->previous_raid_disks)
+ +                              if (rdev->raid_disk >= conf->previous_raid_disks) {
                                         set_bit(In_sync, &rdev->flags);
- -                              else
+ +                                      added_devices++;
+ +                              } else
                                         rdev->recovery_offset = 0;
- -                              added_devices++;
                                 sprintf(nm, "rd%d", rdev->raid_disk);
                                 if (sysfs_create_link(&mddev->kobj,
                                                       &rdev->kobj, nm))
@@@ -5479,12 -5480,9 +5479,12 @@@
                                 break;
                 }
   
+ +      /* When a reshape changes the number of devices, ->degraded
+ +       * is measured against the larger of the pre and post number of
+ +       * devices.*/
         if (mddev->delta_disks > 0) {
                 spin_lock_irqsave(&conf->device_lock, flags);
- -              mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
+ +              mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
                         - added_devices;
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
diff --combined fs/xfs/xfs_mount.h

index 70504fc,24c8887..14dafd6
--- 1/fs/xfs/xfs_mount.h
--- 2/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@@ -78,8 -78,7 +78,8 @@@ typedef int   (*xfs_send_destroy_t)(struc
   typedef int   (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
                         struct xfs_inode *, dm_right_t,
                         struct xfs_inode *, dm_right_t,
- -                      const char *, const char *, mode_t, int, int);
+ +                      const unsigned char *, const unsigned char *,
+ +                      mode_t, int, int);
   typedef int   (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
                         char *, char *);
   typedef void  (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@@ -208,8 -207,8 +208,8 @@@ typedef struct xfs_mount 
         uint                    m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
         uint                    m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
         uint                    m_in_maxlevels; /* max inobt btree levels. */
- -      struct xfs_perag        *m_perag;       /* per-ag accounting info */
- -      struct rw_semaphore     m_peraglock;    /* lock for m_perag (pointer) */
+ +      struct radix_tree_root  m_perag_tree;   /* per-ag accounting info */
+ +      spinlock_t              m_perag_lock;   /* lock for m_perag_tree */
         struct mutex            m_growlock;     /* growfs mutex */
         int                     m_fixedfsid[2]; /* unchanged for life of FS */
         uint                    m_dmevmask;     /* DMI events for this FS */
@@@ -225,7 -224,6 +225,7 @@@
         __uint64_t              m_maxioffset;   /* maximum inode offset */
         __uint64_t              m_resblks;      /* total reserved blocks */
         __uint64_t              m_resblks_avail;/* available reserved blocks */
+ +      __uint64_t              m_resblks_save; /* reserved blks @ remount,ro */
         int                     m_dalign;       /* stripe unit */
         int                     m_swidth;       /* stripe width */
         int                     m_sinoalign;    /* stripe unit inode alignment */
@@@ -245,7 -243,7 +245,7 @@@
         struct xfs_qmops        *m_qm_ops;      /* vector of XQM ops */
         atomic_t                m_active_trans; /* number trans frozen */
   #ifdef HAVE_PERCPU_SB
-       xfs_icsb_cnts_t         *m_sb_cnts;     /* per-cpu superblock counters */
+       xfs_icsb_cnts_t __percpu *m_sb_cnts;    /* per-cpu superblock counters */
         unsigned long           m_icsb_counters; /* disabled per-cpu counters */
         struct notifier_block   m_icsb_notifier; /* hotplug cpu notifier */
         struct mutex            m_icsb_mutex;   /* balancer sync lock */
@@@ -386,10 -384,19 +386,10 @@@ xfs_daddr_to_agbno(struct xfs_mount *mp
   }
   
   /*
- - * perag get/put wrappers for eventual ref counting
+ + * perag get/put wrappers for ref counting
    */
- -static inline xfs_perag_t *
- -xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
- -{
- -      return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
- -}
- -
- -static inline void
- -xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
- -{
- -      /* nothing to see here, move along */
- -}
+ +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+ +void  xfs_perag_put(struct xfs_perag *pag);
   
   /*
    * Per-cpu superblock locking functions
@@@ -421,7 -428,6 +421,7 @@@ typedef struct xfs_mod_sb 
   } xfs_mod_sb_t;
   
   extern int    xfs_log_sbcount(xfs_mount_t *, uint);
+ +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
   extern int    xfs_mountfs(xfs_mount_t *mp);
   
   extern void   xfs_unmountfs(xfs_mount_t *);
@@@ -444,8 -450,7 +444,8 @@@ extern struct xfs_dmops xfs_dmcore_xfs
   #endif        /* __KERNEL__ */
   
   extern void   xfs_mod_sb(struct xfs_trans *, __int64_t);
- -extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+ +extern int    xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+ +                                      xfs_agnumber_t *);
   extern void   xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
   extern void   xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
   
diff --combined include/acpi/processor.h

index 2983176,477544f..1172c27
--- 1/include/acpi/processor.h
--- 2/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@@ -92,11 -92,11 +92,11 @@@ struct acpi_processor_power 
   /* Performance Management */
   
   struct acpi_psd_package {
- -      acpi_integer num_entries;
- -      acpi_integer revision;
- -      acpi_integer domain;
- -      acpi_integer coord_type;
- -      acpi_integer num_processors;
+ +      u64 num_entries;
+ +      u64 revision;
+ +      u64 domain;
+ +      u64 coord_type;
+ +      u64 num_processors;
   } __attribute__ ((packed));
   
   struct acpi_pct_register {
@@@ -110,12 -110,12 +110,12 @@@
   } __attribute__ ((packed));
   
   struct acpi_processor_px {
- -      acpi_integer core_frequency;    /* megahertz */
- -      acpi_integer power;     /* milliWatts */
- -      acpi_integer transition_latency;        /* microseconds */
- -      acpi_integer bus_master_latency;        /* microseconds */
- -      acpi_integer control;   /* control value */
- -      acpi_integer status;    /* success indicator */
+ +      u64 core_frequency;     /* megahertz */
+ +      u64 power;      /* milliWatts */
+ +      u64 transition_latency; /* microseconds */
+ +      u64 bus_master_latency; /* microseconds */
+ +      u64 control;    /* control value */
+ +      u64 status;     /* success indicator */
   };
   
   struct acpi_processor_performance {
@@@ -133,11 -133,11 +133,11 @@@
   /* Throttling Control */
   
   struct acpi_tsd_package {
- -      acpi_integer num_entries;
- -      acpi_integer revision;
- -      acpi_integer domain;
- -      acpi_integer coord_type;
- -      acpi_integer num_processors;
+ +      u64 num_entries;
+ +      u64 revision;
+ +      u64 domain;
+ +      u64 coord_type;
+ +      u64 num_processors;
   } __attribute__ ((packed));
   
   struct acpi_ptc_register {
@@@ -151,11 -151,11 +151,11 @@@
   } __attribute__ ((packed));
   
   struct acpi_processor_tx_tss {
- -      acpi_integer freqpercentage;    /* */
- -      acpi_integer power;     /* milliWatts */
- -      acpi_integer transition_latency;        /* microseconds */
- -      acpi_integer control;   /* control value */
- -      acpi_integer status;    /* success indicator */
+ +      u64 freqpercentage;     /* */
+ +      u64 power;      /* milliWatts */
+ +      u64 transition_latency; /* microseconds */
+ +      u64 control;    /* control value */
+ +      u64 status;     /* success indicator */
   };
   struct acpi_processor_tx {
         u16 power;
@@@ -238,7 -238,7 +238,7 @@@ struct acpi_processor_errata 
   
   extern int acpi_processor_preregister_performance(struct
                                                   acpi_processor_performance
-                                                 *performance);
+                                                 __percpu *performance);
   
   extern int acpi_processor_register_performance(struct acpi_processor_performance
                                                *performance, unsigned int cpu);
diff --combined include/linux/mm.h

index 8b2fa85,91d2ba1..2e724c8
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -265,8 -265,6 +265,8 @@@ static inline int get_page_unless_zero(
         return atomic_inc_not_zero(&page->_count);
   }
   
+ +extern int page_is_ram(unsigned long pfn);
+ +
   /* Support for virtually mapped pages */
   struct page *vmalloc_to_page(const void *addr);
   unsigned long vmalloc_to_pfn(const void *addr);
@@@ -1081,11 -1079,7 +1081,7 @@@ extern void si_meminfo(struct sysinfo 
   extern void si_meminfo_node(struct sysinfo *val, int nid);
   extern int after_bootmem;
   
- #ifdef CONFIG_NUMA
   extern void setup_per_cpu_pageset(void);
- #else
- static inline void setup_per_cpu_pageset(void) {}
- #endif
   
   extern void zone_pcp_update(struct zone *zone);
   
diff --combined include/linux/percpu_counter.h

index 794662b,9bd103c..c88d67b
--- 1/include/linux/percpu_counter.h
--- 2/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@@ -21,7 -21,7 +21,7 @@@ struct percpu_counter 
   #ifdef CONFIG_HOTPLUG_CPU
         struct list_head list;  /* All percpu_counters are on a list */
   #endif
-       s32 *counters;
+       s32 __percpu *counters;
   };
   
   extern int percpu_counter_batch;
@@@ -98,6 -98,9 +98,6 @@@ static inline void percpu_counter_set(s
         fbc->count = amount;
   }
   
- -#define __percpu_counter_add(fbc, amount, batch) \
- -      percpu_counter_add(fbc, amount)
- -
   static inline void
   percpu_counter_add(struct percpu_counter *fbc, s64 amount)
   {
@@@ -106,12 -109,6 +106,12 @@@
         preempt_enable();
   }
   
+ +static inline void
+ +__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+ +{
+ +      percpu_counter_add(fbc, amount);
+ +}
+ +
   static inline s64 percpu_counter_read(struct percpu_counter *fbc)
   {
         return fbc->count;
diff --combined include/linux/srcu.h

index 3084f80,41eedcc..4d5ecb2
--- 1/include/linux/srcu.h
--- 2/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@@ -33,11 -33,8 +33,11 @@@ struct srcu_struct_array 
   
   struct srcu_struct {
         int completed;
-       struct srcu_struct_array *per_cpu_ref;
+       struct srcu_struct_array __percpu *per_cpu_ref;
         struct mutex mutex;
+ +#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ +      struct lockdep_map dep_map;
+ +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
   };
   
   #ifndef CONFIG_PREEMPT
@@@ -46,100 -43,12 +46,100 @@@
   #define srcu_barrier()
   #endif /* #else #ifndef CONFIG_PREEMPT */
   
+ +#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ +
+ +int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ +                     struct lock_class_key *key);
+ +
+ +#define init_srcu_struct(sp) \
+ +({ \
+ +      static struct lock_class_key __srcu_key; \
+ +      \
+ +      __init_srcu_struct((sp), #sp, &__srcu_key); \
+ +})
+ +
+ +# define srcu_read_acquire(sp) \
+ +              lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_)
+ +# define srcu_read_release(sp) \
+ +              lock_release(&(sp)->dep_map, 1, _THIS_IP_)
+ +
+ +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+ +
   int init_srcu_struct(struct srcu_struct *sp);
+ +
+ +# define srcu_read_acquire(sp)  do { } while (0)
+ +# define srcu_read_release(sp)  do { } while (0)
+ +
+ +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+ +
   void cleanup_srcu_struct(struct srcu_struct *sp);
- -int srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
- -void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
+ +int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
+ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
   void synchronize_srcu(struct srcu_struct *sp);
   void synchronize_srcu_expedited(struct srcu_struct *sp);
   long srcu_batches_completed(struct srcu_struct *sp);
   
+ +#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ +
+ +/**
+ + * srcu_read_lock_held - might we be in SRCU read-side critical section?
+ + *
+ + * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in
+ + * an SRCU read-side critical section.  In absence of CONFIG_PROVE_LOCKING,
+ + * this assumes we are in an SRCU read-side critical section unless it can
+ + * prove otherwise.
+ + */
+ +static inline int srcu_read_lock_held(struct srcu_struct *sp)
+ +{
+ +      if (debug_locks)
+ +              return lock_is_held(&sp->dep_map);
+ +      return 1;
+ +}
+ +
+ +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+ +
+ +static inline int srcu_read_lock_held(struct srcu_struct *sp)
+ +{
+ +      return 1;
+ +}
+ +
+ +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+ +
+ +/**
+ + * srcu_dereference - fetch SRCU-protected pointer with checking
+ + *
+ + * Makes rcu_dereference_check() do the dirty work.
+ + */
+ +#define srcu_dereference(p, sp) \
+ +              rcu_dereference_check(p, srcu_read_lock_held(sp))
+ +
+ +/**
+ + * srcu_read_lock - register a new reader for an SRCU-protected structure.
+ + * @sp: srcu_struct in which to register the new reader.
+ + *
+ + * Enter an SRCU read-side critical section.  Note that SRCU read-side
+ + * critical sections may be nested.
+ + */
+ +static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
+ +{
+ +      int retval = __srcu_read_lock(sp);
+ +
+ +      srcu_read_acquire(sp);
+ +      return retval;
+ +}
+ +
+ +/**
+ + * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
+ + * @sp: srcu_struct in which to unregister the old reader.
+ + * @idx: return value from corresponding srcu_read_lock().
+ + *
+ + * Exit an SRCU read-side critical section.
+ + */
+ +static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
+ +      __releases(sp)
+ +{
+ +      srcu_read_release(sp);
+ +      __srcu_read_unlock(sp, idx);
+ +}
+ +
   #endif
diff --combined kernel/rcutorture.c

index 258cdf0,0b52175..58df55b
--- 1/kernel/rcutorture.c
--- 2/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@@ -61,9 -61,6 +61,9 @@@ static int test_no_idle_hz;   /* Test RCU
   static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
   static int stutter = 5;               /* Start/stop testing interval (in sec) */
   static int irqreader = 1;     /* RCU readers from irq (timers). */
+ +static int fqs_duration = 0;  /* Duration of bursts (us), 0 to disable. */
+ +static int fqs_holdoff = 0;   /* Hold time within burst (us). */
+ +static int fqs_stutter = 3;   /* Wait time between bursts (s). */
   static char *torture_type = "rcu"; /* What RCU implementation to torture. */
   
   module_param(nreaders, int, 0444);
@@@ -82,12 -79,6 +82,12 @@@ module_param(stutter, int, 0444)
   MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
   module_param(irqreader, int, 0444);
   MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
+ +module_param(fqs_duration, int, 0444);
+ +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
+ +module_param(fqs_holdoff, int, 0444);
+ +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
+ +module_param(fqs_stutter, int, 0444);
+ +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
   module_param(torture_type, charp, 0444);
   MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
   
@@@ -108,7 -99,6 +108,7 @@@ static struct task_struct **reader_task
   static struct task_struct *stats_task;
   static struct task_struct *shuffler_task;
   static struct task_struct *stutter_task;
+ +static struct task_struct *fqs_task;
   
   #define RCU_TORTURE_PIPE_LEN 10
   
@@@ -273,7 -263,6 +273,7 @@@ struct rcu_torture_ops 
         void (*deferred_free)(struct rcu_torture *p);
         void (*sync)(void);
         void (*cb_barrier)(void);
+ +      void (*fqs)(void);
         int (*stats)(char *page);
         int irq_capable;
         char *name;
@@@ -358,7 -347,6 +358,7 @@@ static struct rcu_torture_ops rcu_ops 
         .deferred_free  = rcu_torture_deferred_free,
         .sync           = synchronize_rcu,
         .cb_barrier     = rcu_barrier,
+ +      .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "rcu"
@@@ -400,7 -388,6 +400,7 @@@ static struct rcu_torture_ops rcu_sync_
         .deferred_free  = rcu_sync_torture_deferred_free,
         .sync           = synchronize_rcu,
         .cb_barrier     = NULL,
+ +      .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "rcu_sync"
@@@ -416,7 -403,6 +416,7 @@@ static struct rcu_torture_ops rcu_exped
         .deferred_free  = rcu_sync_torture_deferred_free,
         .sync           = synchronize_rcu_expedited,
         .cb_barrier     = NULL,
+ +      .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "rcu_expedited"
@@@ -479,7 -465,6 +479,7 @@@ static struct rcu_torture_ops rcu_bh_op
         .deferred_free  = rcu_bh_torture_deferred_free,
         .sync           = rcu_bh_torture_synchronize,
         .cb_barrier     = rcu_barrier_bh,
+ +      .fqs            = rcu_bh_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "rcu_bh"
@@@ -495,7 -480,6 +495,7 @@@ static struct rcu_torture_ops rcu_bh_sy
         .deferred_free  = rcu_sync_torture_deferred_free,
         .sync           = rcu_bh_torture_synchronize,
         .cb_barrier     = NULL,
+ +      .fqs            = rcu_bh_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "rcu_bh_sync"
@@@ -637,7 -621,6 +637,7 @@@ static struct rcu_torture_ops sched_op
         .deferred_free  = rcu_sched_torture_deferred_free,
         .sync           = sched_torture_synchronize,
         .cb_barrier     = rcu_barrier_sched,
+ +      .fqs            = rcu_sched_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
         .name           = "sched"
@@@ -653,7 -636,6 +653,7 @@@ static struct rcu_torture_ops sched_syn
         .deferred_free  = rcu_sync_torture_deferred_free,
         .sync           = sched_torture_synchronize,
         .cb_barrier     = NULL,
+ +      .fqs            = rcu_sched_force_quiescent_state,
         .stats          = NULL,
         .name           = "sched_sync"
   };
@@@ -668,44 -650,11 +668,44 @@@ static struct rcu_torture_ops sched_exp
         .deferred_free  = rcu_sync_torture_deferred_free,
         .sync           = synchronize_sched_expedited,
         .cb_barrier     = NULL,
+ +      .fqs            = rcu_sched_force_quiescent_state,
         .stats          = rcu_expedited_torture_stats,
         .irq_capable    = 1,
         .name           = "sched_expedited"
   };
   
+ +/*
+ + * RCU torture force-quiescent-state kthread.  Repeatedly induces
+ + * bursts of calls to force_quiescent_state(), increasing the probability
+ + * of occurrence of some important types of race conditions.
+ + *
+ + * Each pass of the outer loop:
+ + *  1. sleeps via schedule_timeout_interruptible(1) until fqs_stutter * HZ
+ + *     jiffies have elapsed (the "jiffies - fqs_resume_time > LONG_MAX"
+ + *     test is an unsigned-wraparound-safe way of asking whether jiffies
+ + *     is still before fqs_resume_time);
+ + *  2. runs a burst that calls cur_ops->fqs() and then udelay(fqs_holdoff),
+ + *     charging fqs_holdoff against a budget of fqs_duration per burst;
+ + *  3. calls rcu_stutter_wait() before starting the next pass.
+ + *
+ + * The outer loop exits once kthread_should_stop() is true or fullstop is
+ + * no longer FULLSTOP_DONTSTOP; the thread then idles until it is stopped.
+ + * @arg is unused.  Always returns 0.
+ + *
+ + * NOTE(review): the burst budget only shrinks by fqs_holdoff, so the
+ + * burst loop makes no progress if fqs_holdoff is 0 while fqs_duration is
+ + * positive -- confirm the module parameters rule that combination out.
+ + * Neither inner loop checks kthread_should_stop(), so a stop request is
+ + * only noticed at the end of a pass.
+ + */
+ +static int
+ +rcu_torture_fqs(void *arg)
+ +{
+ +      unsigned long fqs_resume_time;
+ +      int fqs_burst_remaining;
+ +
+ +      VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
+ +      do {
+ +              fqs_resume_time = jiffies + fqs_stutter * HZ;
+ +              while (jiffies - fqs_resume_time > LONG_MAX) {
+ +                      schedule_timeout_interruptible(1);
+ +              }
+ +              fqs_burst_remaining = fqs_duration;
+ +              while (fqs_burst_remaining > 0) {
+ +                      cur_ops->fqs();
+ +                      udelay(fqs_holdoff);
+ +                      fqs_burst_remaining -= fqs_holdoff;
+ +              }
+ +              rcu_stutter_wait("rcu_torture_fqs");
+ +      } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ +      VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
+ +      rcutorture_shutdown_absorb("rcu_torture_fqs");
+ +      while (!kthread_should_stop())
+ +              schedule_timeout_uninterruptible(1);
+ +      return 0;
+ +}
+ +
   /*
    * RCU torture writer kthread.  Repeatedly substitutes a new structure
    * for that pointed to by rcu_torture_current, freeing the old structure
@@@ -796,11 -745,7 +796,11 @@@ static void rcu_torture_timer(unsigned 
   
         idx = cur_ops->readlock();
         completed = cur_ops->completed();
- -      p = rcu_dereference(rcu_torture_current);
+ +      p = rcu_dereference_check(rcu_torture_current,
+ +                                rcu_read_lock_held() ||
+ +                                rcu_read_lock_bh_held() ||
+ +                                rcu_read_lock_sched_held() ||
+ +                                srcu_read_lock_held(&srcu_ctl));
         if (p == NULL) {
                 /* Leave because rcu_torture_writer is not yet underway */
                 cur_ops->readunlock(idx);
@@@ -818,13 -763,13 +818,13 @@@
                 /* Should not happen, but... */
                 pipe_count = RCU_TORTURE_PIPE_LEN;
         }
-       __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
+       __this_cpu_inc(rcu_torture_count[pipe_count]);
         completed = cur_ops->completed() - completed;
         if (completed > RCU_TORTURE_PIPE_LEN) {
                 /* Should not happen, but... */
                 completed = RCU_TORTURE_PIPE_LEN;
         }
-       __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
+       __this_cpu_inc(rcu_torture_batch[completed]);
         preempt_enable();
         cur_ops->readunlock(idx);
   }
@@@ -853,15 -798,11 +853,15 @@@ rcu_torture_reader(void *arg
         do {
                 if (irqreader && cur_ops->irq_capable) {
                         if (!timer_pending(&t))
- -                              mod_timer(&t, 1);
+ +                              mod_timer(&t, jiffies + 1);
                 }
                 idx = cur_ops->readlock();
                 completed = cur_ops->completed();
- -              p = rcu_dereference(rcu_torture_current);
+ +              p = rcu_dereference_check(rcu_torture_current,
+ +                                        rcu_read_lock_held() ||
+ +                                        rcu_read_lock_bh_held() ||
+ +                                        rcu_read_lock_sched_held() ||
+ +                                        srcu_read_lock_held(&srcu_ctl));
                 if (p == NULL) {
                         /* Wait for rcu_torture_writer to get underway */
                         cur_ops->readunlock(idx);
@@@ -877,13 -818,13 +877,13 @@@
                         /* Should not happen, but... */
                         pipe_count = RCU_TORTURE_PIPE_LEN;
                 }
-               __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
+               __this_cpu_inc(rcu_torture_count[pipe_count]);
                 completed = cur_ops->completed() - completed;
                 if (completed > RCU_TORTURE_PIPE_LEN) {
                         /* Should not happen, but... */
                         completed = RCU_TORTURE_PIPE_LEN;
                 }
-               __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
+               __this_cpu_inc(rcu_torture_batch[completed]);
                 preempt_enable();
                 cur_ops->readunlock(idx);
                 schedule();
@@@ -1089,11 -1030,10 +1089,11 @@@ rcu_torture_print_module_parms(char *ta
         printk(KERN_ALERT "%s" TORTURE_FLAG
                 "--- %s: nreaders=%d nfakewriters=%d "
                 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
- -              "shuffle_interval=%d stutter=%d irqreader=%d\n",
+ +              "shuffle_interval=%d stutter=%d irqreader=%d "
+ +              "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
                 torture_type, tag, nrealreaders, nfakewriters,
                 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- -              stutter, irqreader);
+ +              stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
   }
   
   static struct notifier_block rcutorture_nb = {
@@@ -1169,12 -1109,6 +1169,12 @@@ rcu_torture_cleanup(void
         }
         stats_task = NULL;
   
+ +      if (fqs_task) {
+ +              VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
+ +              kthread_stop(fqs_task);
+ +      }
+ +      fqs_task = NULL;
+ +
         /* Wait for all RCU callbacks to fire.  */
   
         if (cur_ops->cb_barrier != NULL)
@@@ -1220,11 -1154,6 +1220,11 @@@ rcu_torture_init(void
                 mutex_unlock(&fullstop_mutex);
                 return -EINVAL;
         }
+ +      if (cur_ops->fqs == NULL && fqs_duration != 0) {
+ +              printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
+ +                                "fqs_duration, fqs disabled.\n");
+ +              fqs_duration = 0;
+ +      }
         if (cur_ops->init)
                 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
   
@@@ -1353,19 -1282,6 +1353,19 @@@
                         goto unwind;
                 }
         }
+ +      if (fqs_duration < 0)
+ +              fqs_duration = 0;
+ +      if (fqs_duration) {
+ +              /* Create the fqs thread */
+ +              fqs_task = kthread_run(rcu_torture_fqs, NULL,
+ +                                     "rcu_torture_fqs");
+ +              if (IS_ERR(fqs_task)) {
+ +                      firsterr = PTR_ERR(fqs_task);
+ +                      VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
+ +                      fqs_task = NULL;
+ +                      goto unwind;
+ +              }
+ +      }
         register_reboot_notifier(&rcutorture_nb);
         mutex_unlock(&fullstop_mutex);
         return 0;
diff --combined kernel/sched.c

index 6a212c9,978edfd..abb36b1
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -233,7 -233,7 +233,7 @@@ static void destroy_rt_bandwidth(struc
    */
   static DEFINE_MUTEX(sched_domains_mutex);
   
- -#ifdef CONFIG_GROUP_SCHED
+ +#ifdef CONFIG_CGROUP_SCHED
   
   #include <linux/cgroup.h>
   
@@@ -243,7 -243,13 +243,7 @@@ static LIST_HEAD(task_groups)
   
   /* task group related information */
   struct task_group {
- -#ifdef CONFIG_CGROUP_SCHED
         struct cgroup_subsys_state css;
- -#endif
- -
- -#ifdef CONFIG_USER_SCHED
- -      uid_t uid;
- -#endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
         /* schedulable entities of this group on each cpu */
@@@ -268,7 -274,35 +268,7 @@@
         struct list_head children;
   };
   
- -#ifdef CONFIG_USER_SCHED
- -
- -/* Helper function to pass uid information to create_sched_user() */
- -void set_tg_uid(struct user_struct *user)
- -{
- -      user->tg->uid = user->uid;
- -}
- -
- -/*
- - * Root task group.
- - *    Every UID task group (including init_task_group aka UID-0) will
- - *    be a child to this group.
- - */
- -struct task_group root_task_group;
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -/* Default task group's sched entity on each cpu */
- -static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
- -/* Default task group's cfs_rq on each cpu */
- -static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
- -#endif /* CONFIG_FAIR_GROUP_SCHED */
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
- -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
- -#endif /* CONFIG_RT_GROUP_SCHED */
- -#else /* !CONFIG_USER_SCHED */
   #define root_task_group init_task_group
- -#endif /* CONFIG_USER_SCHED */
   
   /* task_group_lock serializes add/remove of task groups and also changes to
    * a task group's cpu shares.
@@@ -284,7 -318,11 +284,7 @@@ static int root_task_group_empty(void
   }
   #endif
   
- -#ifdef CONFIG_USER_SCHED
- -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
- -#else /* !CONFIG_USER_SCHED */
   # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
- -#endif /* CONFIG_USER_SCHED */
   
   /*
    * A weight of 0 or 1 can cause arithmetics problems.
@@@ -310,7 -348,11 +310,7 @@@ static inline struct task_group *task_g
   {
         struct task_group *tg;
   
- -#ifdef CONFIG_USER_SCHED
- -      rcu_read_lock();
- -      tg = __task_cred(p)->user->tg;
- -      rcu_read_unlock();
- -#elif defined(CONFIG_CGROUP_SCHED)
+ +#ifdef CONFIG_CGROUP_SCHED
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                 struct task_group, css);
   #else
@@@ -341,7 -383,7 +341,7 @@@ static inline struct task_group *task_g
         return NULL;
   }
   
- -#endif        /* CONFIG_GROUP_SCHED */
+ +#endif        /* CONFIG_CGROUP_SCHED */
   
   /* CFS-related fields in a runqueue */
   struct cfs_rq {
@@@ -436,6 -478,7 +436,6 @@@ struct rt_rq 
         struct rq *rq;
         struct list_head leaf_rt_rq_list;
         struct task_group *tg;
- -      struct sched_rt_entity *rt_se;
   #endif
   };
   
@@@ -602,11 -645,6 +602,11 @@@ static inline int cpu_of(struct rq *rq
   #endif
   }
   
+ +#define rcu_dereference_check_sched_domain(p) \
+ +      rcu_dereference_check((p), \
+ +                            rcu_read_lock_sched_held() || \
+ +                            lockdep_is_held(&sched_domains_mutex))
+ +
   /*
    * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
    * See detach_destroy_domains: synchronize_sched for details.
@@@ -615,7 -653,7 +615,7 @@@
    * preempt-disabled sections.
    */
   #define for_each_domain(cpu, __sd) \
- -      for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+ +      for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
   
   #define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
   #define this_rq()             (&__get_cpu_var(runqueues))
@@@ -902,19 -940,6 +902,19 @@@ static inline void finish_lock_switch(s
   }
   #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
   
+ +/*
+ + * Check whether the task is waking; we use this to synchronize against
+ + * ttwu() so that task_cpu() reports a stable number.
+ + *
+ + * We need to make an exception for PF_STARTING tasks because the fork
+ + * path might require task_rq_lock() to work, e.g. it can call
+ + * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ + *
+ + * Returns nonzero when p->state is TASK_WAKING and PF_STARTING is clear
+ + * in p->flags, zero otherwise.  The result is wrapped in unlikely(), i.e.
+ + * the not-waking case is treated as the expected one.
+ + */
+ +static inline int task_is_waking(struct task_struct *p)
+ +{
+ +      return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+ +}
+ +
   /*
    * __task_rq_lock - lock the runqueue a given task resides on.
    * Must be called interrupts disabled.
@@@ -922,14 -947,10 +922,14 @@@
   static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
   {
+ +      struct rq *rq;
+ +
         for (;;) {
- -              struct rq *rq = task_rq(p);
+ +              while (task_is_waking(p))
+ +                      cpu_relax();
+ +              rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
- -              if (likely(rq == task_rq(p)))
+ +              if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock(&rq->lock);
         }
@@@ -946,12 -967,10 +946,12 @@@ static struct rq *task_rq_lock(struct t
         struct rq *rq;
   
         for (;;) {
+ +              while (task_is_waking(p))
+ +                      cpu_relax();
                 local_irq_save(*flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
- -              if (likely(rq == task_rq(p)))
+ +              if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock_irqrestore(&rq->lock, *flags);
         }
@@@ -1371,6 -1390,32 +1371,6 @@@ static const u32 prio_to_wmult[40] = 
    /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
   };
   
- -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
- -
- -/*
- - * runqueue iterator, to support SMP load-balancing between different
- - * scheduling classes, without having to expose their internal data
- - * structures to the load-balancing proper:
- - */
- -struct rq_iterator {
- -      void *arg;
- -      struct task_struct *(*start)(void *);
- -      struct task_struct *(*next)(void *);
- -};
- -
- -#ifdef CONFIG_SMP
- -static unsigned long
- -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -            unsigned long max_load_move, struct sched_domain *sd,
- -            enum cpu_idle_type idle, int *all_pinned,
- -            int *this_best_prio, struct rq_iterator *iterator);
- -
- -static int
- -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -                 struct sched_domain *sd, enum cpu_idle_type idle,
- -                 struct rq_iterator *iterator);
- -#endif
- -
   /* Time spent by the tasks of the cpu accounting group executing in ... */
   enum cpuacct_stat_index {
         CPUACCT_STAT_USER,      /* ... user mode */
@@@ -1486,7 -1531,7 +1486,7 @@@ static unsigned long target_load(int cp
   
   static struct sched_group *group_of(int cpu)
   {
- -      struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+ +      struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
   
         if (!sd)
                 return NULL;
@@@ -1521,7 -1566,7 +1521,7 @@@ static unsigned long cpu_avg_load_per_t
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   
- static __read_mostly unsigned long *update_shares_data;
+ static __read_mostly unsigned long __percpu *update_shares_data;
   
   static void __set_se_shares(struct sched_entity *se, unsigned long shares);
   
@@@ -1656,6 -1701,16 +1656,6 @@@ static void update_shares(struct sched_
         }
   }
   
- -static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
- -{
- -      if (root_task_group_empty())
- -              return;
- -
- -      raw_spin_unlock(&rq->lock);
- -      update_shares(sd);
- -      raw_spin_lock(&rq->lock);
- -}
- -
   static void update_h_load(long cpu)
   {
         if (root_task_group_empty())
@@@ -1670,6 -1725,10 +1670,6 @@@ static inline void update_shares(struc
   {
   }
   
- -static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
- -{
- -}
- -
   #endif
   
   #ifdef CONFIG_PREEMPT
@@@ -1746,51 -1805,6 +1746,51 @@@ static inline void double_unlock_balanc
         raw_spin_unlock(&busiest->lock);
         lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
   }
+ +
+ +/*
+ + * double_rq_lock - safely lock two runqueues
+ + *
+ + * Note this does not disable interrupts like task_rq_lock,
+ + * you need to do so manually before calling.
+ + *
+ + * Hits BUG_ON() if interrupts are enabled on entry.  When rq1 and rq2 are
+ + * the same runqueue the lock is taken once and the second acquisition is
+ + * only the __acquire() annotation.  Otherwise the runqueue at the lower
+ + * address is always locked first, so every caller takes the pair in the
+ + * same order; the second lock uses SINGLE_DEPTH_NESTING.  Both runqueue
+ + * clocks are refreshed with update_rq_clock() before returning.
+ + */
+ +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ +      __acquires(rq1->lock)
+ +      __acquires(rq2->lock)
+ +{
+ +      BUG_ON(!irqs_disabled());
+ +      if (rq1 == rq2) {
+ +              raw_spin_lock(&rq1->lock);
+ +              __acquire(rq2->lock);   /* Fake it out ;) */
+ +      } else {
+ +              if (rq1 < rq2) {
+ +                      raw_spin_lock(&rq1->lock);
+ +                      raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ +              } else {
+ +                      raw_spin_lock(&rq2->lock);
+ +                      raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ +              }
+ +      }
+ +      update_rq_clock(rq1);
+ +      update_rq_clock(rq2);
+ +}
+ +
+ +/*
+ + * double_rq_unlock - safely unlock two runqueues
+ + *
+ + * Note this does not restore interrupts like task_rq_unlock,
+ + * you need to do so manually after calling.
+ + *
+ + * rq1's lock is always released.  rq2's lock is released only when it is
+ + * a different runqueue; when rq1 == rq2 only the __release() annotation
+ + * is emitted for it, mirroring the single acquisition in double_rq_lock().
+ + */
+ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ +      __releases(rq1->lock)
+ +      __releases(rq2->lock)
+ +{
+ +      raw_spin_unlock(&rq1->lock);
+ +      if (rq1 != rq2)
+ +              raw_spin_unlock(&rq2->lock);
+ +      else
+ +              __release(rq2->lock);
+ +}
+ +
   #endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1820,14 -1834,18 +1820,14 @@@ static inline void __set_task_cpu(struc
   #endif
   }
   
- -#include "sched_stats.h"
- -#include "sched_idletask.c"
- -#include "sched_fair.c"
- -#include "sched_rt.c"
- -#ifdef CONFIG_SCHED_DEBUG
- -# include "sched_debug.c"
- -#endif
+ +static const struct sched_class rt_sched_class;
   
   #define sched_class_highest (&rt_sched_class)
   #define for_each_class(class) \
      for (class = sched_class_highest; class; class = class->next)
   
+ +#include "sched_stats.h"
+ +
   static void inc_nr_running(struct rq *rq)
   {
         rq->nr_running++;
@@@ -1865,14 -1883,13 +1865,14 @@@ static void update_avg(u64 *avg, u64 sa
         *avg += diff >> 3;
   }
   
- -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+ +static void
+ +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
   {
         if (wakeup)
                 p->se.start_runtime = p->se.sum_exec_runtime;
   
         sched_info_queued(p);
- -      p->sched_class->enqueue_task(rq, p, wakeup);
+ +      p->sched_class->enqueue_task(rq, p, wakeup, head);
         p->se.on_rq = 1;
   }
   
@@@ -1894,37 -1911,6 +1894,37 @@@ static void dequeue_task(struct rq *rq
         p->se.on_rq = 0;
   }
   
+ +/*
+ + * activate_task - move a task to the runqueue.
+ + *
+ + * If task_contributes_to_load(p) is true, rq->nr_uninterruptible is
+ + * decremented first.  The task is then queued through enqueue_task() with
+ + * @wakeup passed through and head == false, and inc_nr_running() is
+ + * called on @rq.
+ + */
+ +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+ +{
+ +      if (task_contributes_to_load(p))
+ +              rq->nr_uninterruptible--;
+ +
+ +      enqueue_task(rq, p, wakeup, false);
+ +      inc_nr_running(rq);
+ +}
+ +
+ +/*
+ + * deactivate_task - remove a task from the runqueue.
+ + *
+ + * Mirror image of activate_task(): if task_contributes_to_load(p) is
+ + * true, rq->nr_uninterruptible is incremented first.  The task is then
+ + * removed through dequeue_task() with @sleep passed through, and
+ + * dec_nr_running() is called on @rq.
+ + */
+ +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+ +{
+ +      if (task_contributes_to_load(p))
+ +              rq->nr_uninterruptible++;
+ +
+ +      dequeue_task(rq, p, sleep);
+ +      dec_nr_running(rq);
+ +}
+ +
+ +#include "sched_idletask.c"
+ +#include "sched_fair.c"
+ +#include "sched_rt.c"
+ +#ifdef CONFIG_SCHED_DEBUG
+ +# include "sched_debug.c"
+ +#endif
+ +
   /*
    * __normal_prio - return the priority that is based on the static prio
    */
@@@ -1971,6 -1957,30 +1971,6 @@@ static int effective_prio(struct task_s
         return p->prio;
   }
   
- -/*
- - * activate_task - move a task to the runqueue.
- - */
- -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
- -{
- -      if (task_contributes_to_load(p))
- -              rq->nr_uninterruptible--;
- -
- -      enqueue_task(rq, p, wakeup);
- -      inc_nr_running(rq);
- -}
- -
- -/*
- - * deactivate_task - remove a task from the runqueue.
- - */
- -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
- -{
- -      if (task_contributes_to_load(p))
- -              rq->nr_uninterruptible++;
- -
- -      dequeue_task(rq, p, sleep);
- -      dec_nr_running(rq);
- -}
- -
   /**
    * task_curr - is this task currently executing on a CPU?
    * @p: the task in question.
@@@ -2398,27 -2408,14 +2398,27 @@@ static int try_to_wake_up(struct task_s
         __task_rq_unlock(rq);
   
         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- -      if (cpu != orig_cpu)
+ +      if (cpu != orig_cpu) {
+ +              /*
+ +               * Since we migrate the task without holding any rq->lock,
+ +               * we need to be careful with task_rq_lock(), since that
+ +               * might end up locking an invalid rq.
+ +               */
                 set_task_cpu(p, cpu);
+ +      }
   
- -      rq = __task_rq_lock(p);
+ +      rq = cpu_rq(cpu);
+ +      raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
   
+ +      /*
+ +       * We migrated the task without holding either rq->lock, however
+ +       * since the task is not on the task list itself, nobody else
+ +       * will try and migrate the task, hence the rq should match the
+ +       * cpu we just moved it to.
+ +       */
+ +      WARN_ON(task_cpu(p) != cpu);
         WARN_ON(p->state != TASK_WAKING);
- -      cpu = task_cpu(p);
   
   #ifdef CONFIG_SCHEDSTATS
         schedstat_inc(rq, ttwu_count);
@@@ -2666,13 -2663,7 +2666,13 @@@ void wake_up_new_task(struct task_struc
         set_task_cpu(p, cpu);
   #endif
   
- -      rq = task_rq_lock(p, &flags);
+ +      /*
+ +       * Since the task is not on the rq and we still have TASK_WAKING set
+ +       * nobody else will migrate this task.
+ +       */
+ +      rq = cpu_rq(cpu);
+ +      raw_spin_lock_irqsave(&rq->lock, flags);
+ +
         BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
         update_rq_clock(rq);
@@@ -2803,13 -2794,7 +2803,13 @@@ static void finish_task_switch(struct r
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
- -      perf_event_task_sched_in(current, cpu_of(rq));
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +      local_irq_disable();
+ +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+ +      perf_event_task_sched_in(current);
+ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ +      local_irq_enable();
+ +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
   
         fire_sched_in_preempt_notifiers(current);
@@@ -2957,208 -2942,2017 +2957,208 @@@ unsigned long nr_running(void
         return sum;
   }
   
- -unsigned long nr_uninterruptible(void)
- -{
- -      unsigned long i, sum = 0;
- -
- -      for_each_possible_cpu(i)
- -              sum += cpu_rq(i)->nr_uninterruptible;
- -
- -      /*
- -       * Since we read the counters lockless, it might be slightly
- -       * inaccurate. Do not allow it to go below zero though:
- -       */
- -      if (unlikely((long)sum < 0))
- -              sum = 0;
- -
- -      return sum;
- -}
- -
- -unsigned long long nr_context_switches(void)
- -{
- -      int i;
- -      unsigned long long sum = 0;
- -
- -      for_each_possible_cpu(i)
- -              sum += cpu_rq(i)->nr_switches;
- -
- -      return sum;
- -}
- -
- -unsigned long nr_iowait(void)
- -{
- -      unsigned long i, sum = 0;
- -
- -      for_each_possible_cpu(i)
- -              sum += atomic_read(&cpu_rq(i)->nr_iowait);
- -
- -      return sum;
- -}
- -
- -unsigned long nr_iowait_cpu(void)
- -{
- -      struct rq *this = this_rq();
- -      return atomic_read(&this->nr_iowait);
- -}
- -
- -unsigned long this_cpu_load(void)
- -{
- -      struct rq *this = this_rq();
- -      return this->cpu_load[0];
- -}
- -
- -
- -/* Variables and functions for calc_load */
- -static atomic_long_t calc_load_tasks;
- -static unsigned long calc_load_update;
- -unsigned long avenrun[3];
- -EXPORT_SYMBOL(avenrun);
- -
- -/**
- - * get_avenrun - get the load average array
- - * @loads:    pointer to dest load array
- - * @offset:   offset to add
- - * @shift:    shift count to shift the result left
- - *
- - * These values are estimates at best, so no need for locking.
- - */
- -void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
- -{
- -      loads[0] = (avenrun[0] + offset) << shift;
- -      loads[1] = (avenrun[1] + offset) << shift;
- -      loads[2] = (avenrun[2] + offset) << shift;
- -}
- -
- -static unsigned long
- -calc_load(unsigned long load, unsigned long exp, unsigned long active)
- -{
- -      load *= exp;
- -      load += active * (FIXED_1 - exp);
- -      return load >> FSHIFT;
- -}
- -
- -/*
- - * calc_load - update the avenrun load estimates 10 ticks after the
- - * CPUs have updated calc_load_tasks.
- - */
- -void calc_global_load(void)
- -{
- -      unsigned long upd = calc_load_update + 10;
- -      long active;
- -
- -      if (time_before(jiffies, upd))
- -              return;
- -
- -      active = atomic_long_read(&calc_load_tasks);
- -      active = active > 0 ? active * FIXED_1 : 0;
- -
- -      avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- -      avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- -      avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- -
- -      calc_load_update += LOAD_FREQ;
- -}
- -
- -/*
- - * Either called from update_cpu_load() or from a cpu going idle
- - */
- -static void calc_load_account_active(struct rq *this_rq)
- -{
- -      long nr_active, delta;
- -
- -      nr_active = this_rq->nr_running;
- -      nr_active += (long) this_rq->nr_uninterruptible;
- -
- -      if (nr_active != this_rq->calc_load_active) {
- -              delta = nr_active - this_rq->calc_load_active;
- -              this_rq->calc_load_active = nr_active;
- -              atomic_long_add(delta, &calc_load_tasks);
- -      }
- -}
- -
- -/*
- - * Update rq->cpu_load[] statistics. This function is usually called every
- - * scheduler tick (TICK_NSEC).
- - */
- -static void update_cpu_load(struct rq *this_rq)
- -{
- -      unsigned long this_load = this_rq->load.weight;
- -      int i, scale;
- -
- -      this_rq->nr_load_updates++;
- -
- -      /* Update our load: */
- -      for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- -              unsigned long old_load, new_load;
- -
- -              /* scale is effectively 1 << i now, and >> i divides by scale */
- -
- -              old_load = this_rq->cpu_load[i];
- -              new_load = this_load;
- -              /*
- -               * Round up the averaging division if load is increasing. This
- -               * prevents us from getting stuck on 9 if the load is 10, for
- -               * example.
- -               */
- -              if (new_load > old_load)
- -                      new_load += scale-1;
- -              this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
- -      }
- -
- -      if (time_after_eq(jiffies, this_rq->calc_load_update)) {
- -              this_rq->calc_load_update += LOAD_FREQ;
- -              calc_load_account_active(this_rq);
- -      }
- -}
- -
- -#ifdef CONFIG_SMP
- -
- -/*
- - * double_rq_lock - safely lock two runqueues
- - *
- - * Note this does not disable interrupts like task_rq_lock,
- - * you need to do so manually before calling.
- - */
- -static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- -      __acquires(rq1->lock)
- -      __acquires(rq2->lock)
- -{
- -      BUG_ON(!irqs_disabled());
- -      if (rq1 == rq2) {
- -              raw_spin_lock(&rq1->lock);
- -              __acquire(rq2->lock);   /* Fake it out ;) */
- -      } else {
- -              if (rq1 < rq2) {
- -                      raw_spin_lock(&rq1->lock);
- -                      raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- -              } else {
- -                      raw_spin_lock(&rq2->lock);
- -                      raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- -              }
- -      }
- -      update_rq_clock(rq1);
- -      update_rq_clock(rq2);
- -}
- -
- -/*
- - * double_rq_unlock - safely unlock two runqueues
- - *
- - * Note this does not restore interrupts like task_rq_unlock,
- - * you need to do so manually after calling.
- - */
- -static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- -      __releases(rq1->lock)
- -      __releases(rq2->lock)
- -{
- -      raw_spin_unlock(&rq1->lock);
- -      if (rq1 != rq2)
- -              raw_spin_unlock(&rq2->lock);
- -      else
- -              __release(rq2->lock);
- -}
- -
- -/*
- - * sched_exec - execve() is a valuable balancing opportunity, because at
- - * this point the task has the smallest effective memory and cache footprint.
- - */
- -void sched_exec(void)
- -{
- -      struct task_struct *p = current;
- -      struct migration_req req;
- -      int dest_cpu, this_cpu;
- -      unsigned long flags;
- -      struct rq *rq;
- -
- -again:
- -      this_cpu = get_cpu();
- -      dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
- -      if (dest_cpu == this_cpu) {
- -              put_cpu();
- -              return;
- -      }
- -
- -      rq = task_rq_lock(p, &flags);
- -      put_cpu();
- -
- -      /*
- -       * select_task_rq() can race against ->cpus_allowed
- -       */
- -      if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
- -          || unlikely(!cpu_active(dest_cpu))) {
- -              task_rq_unlock(rq, &flags);
- -              goto again;
- -      }
- -
- -      /* force the process onto the specified CPU */
- -      if (migrate_task(p, dest_cpu, &req)) {
- -              /* Need to wait for migration thread (might exit: take ref). */
- -              struct task_struct *mt = rq->migration_thread;
- -
- -              get_task_struct(mt);
- -              task_rq_unlock(rq, &flags);
- -              wake_up_process(mt);
- -              put_task_struct(mt);
- -              wait_for_completion(&req.done);
- -
- -              return;
- -      }
- -      task_rq_unlock(rq, &flags);
- -}
- -
- -/*
- - * pull_task - move a task from a remote runqueue to the local runqueue.
- - * Both runqueues must be locked.
- - */
- -static void pull_task(struct rq *src_rq, struct task_struct *p,
- -                    struct rq *this_rq, int this_cpu)
- -{
- -      deactivate_task(src_rq, p, 0);
- -      set_task_cpu(p, this_cpu);
- -      activate_task(this_rq, p, 0);
- -      check_preempt_curr(this_rq, p, 0);
- -}
- -
- -/*
- - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
- - */
- -static
- -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
- -                   struct sched_domain *sd, enum cpu_idle_type idle,
- -                   int *all_pinned)
- -{
- -      int tsk_cache_hot = 0;
- -      /*
- -       * We do not migrate tasks that are:
- -       * 1) running (obviously), or
- -       * 2) cannot be migrated to this CPU due to cpus_allowed, or
- -       * 3) are cache-hot on their current CPU.
- -       */
- -      if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- -              schedstat_inc(p, se.nr_failed_migrations_affine);
- -              return 0;
- -      }
- -      *all_pinned = 0;
- -
- -      if (task_running(rq, p)) {
- -              schedstat_inc(p, se.nr_failed_migrations_running);
- -              return 0;
- -      }
- -
- -      /*
- -       * Aggressive migration if:
- -       * 1) task is cache cold, or
- -       * 2) too many balance attempts have failed.
- -       */
- -
- -      tsk_cache_hot = task_hot(p, rq->clock, sd);
- -      if (!tsk_cache_hot ||
- -              sd->nr_balance_failed > sd->cache_nice_tries) {
- -#ifdef CONFIG_SCHEDSTATS
- -              if (tsk_cache_hot) {
- -                      schedstat_inc(sd, lb_hot_gained[idle]);
- -                      schedstat_inc(p, se.nr_forced_migrations);
- -              }
- -#endif
- -              return 1;
- -      }
- -
- -      if (tsk_cache_hot) {
- -              schedstat_inc(p, se.nr_failed_migrations_hot);
- -              return 0;
- -      }
- -      return 1;
- -}
- -
- -static unsigned long
- -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -            unsigned long max_load_move, struct sched_domain *sd,
- -            enum cpu_idle_type idle, int *all_pinned,
- -            int *this_best_prio, struct rq_iterator *iterator)
- -{
- -      int loops = 0, pulled = 0, pinned = 0;
- -      struct task_struct *p;
- -      long rem_load_move = max_load_move;
- -
- -      if (max_load_move == 0)
- -              goto out;
- -
- -      pinned = 1;
- -
- -      /*
- -       * Start the load-balancing iterator:
- -       */
- -      p = iterator->start(iterator->arg);
- -next:
- -      if (!p || loops++ > sysctl_sched_nr_migrate)
- -              goto out;
- -
- -      if ((p->se.load.weight >> 1) > rem_load_move ||
- -          !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- -              p = iterator->next(iterator->arg);
- -              goto next;
- -      }
- -
- -      pull_task(busiest, p, this_rq, this_cpu);
- -      pulled++;
- -      rem_load_move -= p->se.load.weight;
- -
- -#ifdef CONFIG_PREEMPT
- -      /*
- -       * NEWIDLE balancing is a source of latency, so preemptible kernels
- -       * will stop after the first task is pulled to minimize the critical
- -       * section.
- -       */
- -      if (idle == CPU_NEWLY_IDLE)
- -              goto out;
- -#endif
- -
- -      /*
- -       * We only want to steal up to the prescribed amount of weighted load.
- -       */
- -      if (rem_load_move > 0) {
- -              if (p->prio < *this_best_prio)
- -                      *this_best_prio = p->prio;
- -              p = iterator->next(iterator->arg);
- -              goto next;
- -      }
- -out:
- -      /*
- -       * Right now, this is one of only two places pull_task() is called,
- -       * so we can safely collect pull_task() stats here rather than
- -       * inside pull_task().
- -       */
- -      schedstat_add(sd, lb_gained[idle], pulled);
- -
- -      if (all_pinned)
- -              *all_pinned = pinned;
- -
- -      return max_load_move - rem_load_move;
- -}
- -
- -/*
- - * move_tasks tries to move up to max_load_move weighted load from busiest to
- - * this_rq, as part of a balancing operation within domain "sd".
- - * Returns 1 if successful and 0 otherwise.
- - *
- - * Called with both runqueues locked.
- - */
- -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -                    unsigned long max_load_move,
- -                    struct sched_domain *sd, enum cpu_idle_type idle,
- -                    int *all_pinned)
- -{
- -      const struct sched_class *class = sched_class_highest;
- -      unsigned long total_load_moved = 0;
- -      int this_best_prio = this_rq->curr->prio;
- -
- -      do {
- -              total_load_moved +=
- -                      class->load_balance(this_rq, this_cpu, busiest,
- -                              max_load_move - total_load_moved,
- -                              sd, idle, all_pinned, &this_best_prio);
- -              class = class->next;
- -
- -#ifdef CONFIG_PREEMPT
- -              /*
- -               * NEWIDLE balancing is a source of latency, so preemptible
- -               * kernels will stop after the first task is pulled to minimize
- -               * the critical section.
- -               */
- -              if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
- -                      break;
- -#endif
- -      } while (class && max_load_move > total_load_moved);
- -
- -      return total_load_moved > 0;
- -}
- -
- -static int
- -iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -                 struct sched_domain *sd, enum cpu_idle_type idle,
- -                 struct rq_iterator *iterator)
- -{
- -      struct task_struct *p = iterator->start(iterator->arg);
- -      int pinned = 0;
- -
- -      while (p) {
- -              if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
- -                      pull_task(busiest, p, this_rq, this_cpu);
- -                      /*
- -                       * Right now, this is only the second place pull_task()
- -                       * is called, so we can safely collect pull_task()
- -                       * stats here rather than inside pull_task().
- -                       */
- -                      schedstat_inc(sd, lb_gained[idle]);
- -
- -                      return 1;
- -              }
- -              p = iterator->next(iterator->arg);
- -      }
- -
- -      return 0;
- -}
- -
- -/*
- - * move_one_task tries to move exactly one task from busiest to this_rq, as
- - * part of active balancing operations within "domain".
- - * Returns 1 if successful and 0 otherwise.
- - *
- - * Called with both runqueues locked.
- - */
- -static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
- -                       struct sched_domain *sd, enum cpu_idle_type idle)
- -{
- -      const struct sched_class *class;
- -
- -      for_each_class(class) {
- -              if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
- -                      return 1;
- -      }
- -
- -      return 0;
- -}
- -/********** Helpers for find_busiest_group ************************/
- -/*
- - * sd_lb_stats - Structure to store the statistics of a sched_domain
- - *            during load balancing.
- - */
- -struct sd_lb_stats {
- -      struct sched_group *busiest; /* Busiest group in this sd */
- -      struct sched_group *this;  /* Local group in this sd */
- -      unsigned long total_load;  /* Total load of all groups in sd */
- -      unsigned long total_pwr;   /*   Total power of all groups in sd */
- -      unsigned long avg_load;    /* Average load across all groups in sd */
- -
- -      /** Statistics of this group */
- -      unsigned long this_load;
- -      unsigned long this_load_per_task;
- -      unsigned long this_nr_running;
- -
- -      /* Statistics of the busiest group */
- -      unsigned long max_load;
- -      unsigned long busiest_load_per_task;
- -      unsigned long busiest_nr_running;
- -
- -      int group_imb; /* Is there imbalance in this sd */
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -      int power_savings_balance; /* Is powersave balance needed for this sd */
- -      struct sched_group *group_min; /* Least loaded group in sd */
- -      struct sched_group *group_leader; /* Group which relieves group_min */
- -      unsigned long min_load_per_task; /* load_per_task in group_min */
- -      unsigned long leader_nr_running; /* Nr running of group_leader */
- -      unsigned long min_nr_running; /* Nr running of group_min */
- -#endif
- -};
- -
- -/*
- - * sg_lb_stats - stats of a sched_group required for load_balancing
- - */
- -struct sg_lb_stats {
- -      unsigned long avg_load; /*Avg load across the CPUs of the group */
- -      unsigned long group_load; /* Total load over the CPUs of the group */
- -      unsigned long sum_nr_running; /* Nr tasks running in the group */
- -      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
- -      unsigned long group_capacity;
- -      int group_imb; /* Is there an imbalance in the group ? */
- -};
- -
- -/**
- - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- - * @group: The group whose first cpu is to be returned.
- - */
- -static inline unsigned int group_first_cpu(struct sched_group *group)
- -{
- -      return cpumask_first(sched_group_cpus(group));
- -}
- -
- -/**
- - * get_sd_load_idx - Obtain the load index for a given sched domain.
- - * @sd: The sched_domain whose load_idx is to be obtained.
- - * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
- - */
- -static inline int get_sd_load_idx(struct sched_domain *sd,
- -                                      enum cpu_idle_type idle)
- -{
- -      int load_idx;
- -
- -      switch (idle) {
- -      case CPU_NOT_IDLE:
- -              load_idx = sd->busy_idx;
- -              break;
- -
- -      case CPU_NEWLY_IDLE:
- -              load_idx = sd->newidle_idx;
- -              break;
- -      default:
- -              load_idx = sd->idle_idx;
- -              break;
- -      }
- -
- -      return load_idx;
- -}
- -
- -
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -/**
- - * init_sd_power_savings_stats - Initialize power savings statistics for
- - * the given sched_domain, during load balancing.
- - *
- - * @sd: Sched domain whose power-savings statistics are to be initialized.
- - * @sds: Variable containing the statistics for sd.
- - * @idle: Idle status of the CPU at which we're performing load-balancing.
- - */
- -static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- -      struct sd_lb_stats *sds, enum cpu_idle_type idle)
- -{
- -      /*
- -       * Busy processors will not participate in power savings
- -       * balance.
- -       */
- -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- -              sds->power_savings_balance = 0;
- -      else {
- -              sds->power_savings_balance = 1;
- -              sds->min_nr_running = ULONG_MAX;
- -              sds->leader_nr_running = 0;
- -      }
- -}
- -
- -/**
- - * update_sd_power_savings_stats - Update the power saving stats for a
- - * sched_domain while performing load balancing.
- - *
- - * @group: sched_group belonging to the sched_domain under consideration.
- - * @sds: Variable containing the statistics of the sched_domain
- - * @local_group: Does group contain the CPU for which we're performing
- - *            load balancing ?
- - * @sgs: Variable containing the statistics of the group.
- - */
- -static inline void update_sd_power_savings_stats(struct sched_group *group,
- -      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
- -{
- -
- -      if (!sds->power_savings_balance)
- -              return;
- -
- -      /*
- -       * If the local group is idle or completely loaded
- -       * no need to do power savings balance at this domain
- -       */
- -      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- -                              !sds->this_nr_running))
- -              sds->power_savings_balance = 0;
- -
- -      /*
- -       * If a group is already running at full capacity or idle,
- -       * don't include that group in power savings calculations
- -       */
- -      if (!sds->power_savings_balance ||
- -              sgs->sum_nr_running >= sgs->group_capacity ||
- -              !sgs->sum_nr_running)
- -              return;
- -
- -      /*
- -       * Calculate the group which has the least non-idle load.
- -       * This is the group from where we need to pick up the load
- -       * for saving power
- -       */
- -      if ((sgs->sum_nr_running < sds->min_nr_running) ||
- -          (sgs->sum_nr_running == sds->min_nr_running &&
- -           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
- -              sds->group_min = group;
- -              sds->min_nr_running = sgs->sum_nr_running;
- -              sds->min_load_per_task = sgs->sum_weighted_load /
- -                                              sgs->sum_nr_running;
- -      }
- -
- -      /*
- -       * Calculate the group which is almost near its
- -       * capacity but still has some space to pick up some load
- -       * from other group and save more power
- -       */
- -      if (sgs->sum_nr_running + 1 > sgs->group_capacity)
- -              return;
- -
- -      if (sgs->sum_nr_running > sds->leader_nr_running ||
- -          (sgs->sum_nr_running == sds->leader_nr_running &&
- -           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
- -              sds->group_leader = group;
- -              sds->leader_nr_running = sgs->sum_nr_running;
- -      }
- -}
- -
- -/**
- - * check_power_save_busiest_group - see if there is potential for some power-savings balance
- - * @sds: Variable containing the statistics of the sched_domain
- - *    under consideration.
- - * @this_cpu: Cpu at which we're currently performing load-balancing.
- - * @imbalance: Variable to store the imbalance.
- - *
- - * Description:
- - * Check if we have potential to perform some power-savings balance.
- - * If yes, set the busiest group to be the least loaded group in the
- - * sched_domain, so that its CPUs can be put to idle.
- - *
- - * Returns 1 if there is potential to perform power-savings balance.
- - * Else returns 0.
- - */
- -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- -                                      int this_cpu, unsigned long *imbalance)
- -{
- -      if (!sds->power_savings_balance)
- -              return 0;
- -
- -      if (sds->this != sds->group_leader ||
- -                      sds->group_leader == sds->group_min)
- -              return 0;
- -
- -      *imbalance = sds->min_load_per_task;
- -      sds->busiest = sds->group_min;
- -
- -      return 1;
- -
- -}
- -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
- -static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- -      struct sd_lb_stats *sds, enum cpu_idle_type idle)
- -{
- -      return;
- -}
- -
- -static inline void update_sd_power_savings_stats(struct sched_group *group,
- -      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
- -{
- -      return;
- -}
- -
- -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- -                                      int this_cpu, unsigned long *imbalance)
- -{
- -      return 0;
- -}
- -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
- -
- -
- -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
- -{
- -      return SCHED_LOAD_SCALE;
- -}
- -
- -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
- -{
- -      return default_scale_freq_power(sd, cpu);
- -}
- -
- -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
- -{
- -      unsigned long weight = cpumask_weight(sched_domain_span(sd));
- -      unsigned long smt_gain = sd->smt_gain;
- -
- -      smt_gain /= weight;
- -
- -      return smt_gain;
- -}
- -
- -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
- -{
- -      return default_scale_smt_power(sd, cpu);
- -}
- -
- -unsigned long scale_rt_power(int cpu)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -      u64 total, available;
- -
- -      sched_avg_update(rq);
- -
- -      total = sched_avg_period() + (rq->clock - rq->age_stamp);
- -      available = total - rq->rt_avg;
- -
- -      if (unlikely((s64)total < SCHED_LOAD_SCALE))
- -              total = SCHED_LOAD_SCALE;
- -
- -      total >>= SCHED_LOAD_SHIFT;
- -
- -      return div_u64(available, total);
- -}
- -
- -static void update_cpu_power(struct sched_domain *sd, int cpu)
- -{
- -      unsigned long weight = cpumask_weight(sched_domain_span(sd));
- -      unsigned long power = SCHED_LOAD_SCALE;
- -      struct sched_group *sdg = sd->groups;
- -
- -      if (sched_feat(ARCH_POWER))
- -              power *= arch_scale_freq_power(sd, cpu);
- -      else
- -              power *= default_scale_freq_power(sd, cpu);
- -
- -      power >>= SCHED_LOAD_SHIFT;
- -
- -      if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- -              if (sched_feat(ARCH_POWER))
- -                      power *= arch_scale_smt_power(sd, cpu);
- -              else
- -                      power *= default_scale_smt_power(sd, cpu);
- -
- -              power >>= SCHED_LOAD_SHIFT;
- -      }
- -
- -      power *= scale_rt_power(cpu);
- -      power >>= SCHED_LOAD_SHIFT;
- -
- -      if (!power)
- -              power = 1;
- -
- -      sdg->cpu_power = power;
- -}
- -
- -static void update_group_power(struct sched_domain *sd, int cpu)
- -{
- -      struct sched_domain *child = sd->child;
- -      struct sched_group *group, *sdg = sd->groups;
- -      unsigned long power;
- -
- -      if (!child) {
- -              update_cpu_power(sd, cpu);
- -              return;
- -      }
- -
- -      power = 0;
- -
- -      group = child->groups;
- -      do {
- -              power += group->cpu_power;
- -              group = group->next;
- -      } while (group != child->groups);
- -
- -      sdg->cpu_power = power;
- -}
- -
- -/**
- - * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- - * @sd: The sched_domain whose statistics are to be updated.
- - * @group: sched_group whose statistics are to be updated.
- - * @this_cpu: Cpu for which load balance is currently performed.
- - * @idle: Idle status of this_cpu
- - * @load_idx: Load index of sched_domain of this_cpu for load calc.
- - * @sd_idle: Idle status of the sched_domain containing group.
- - * @local_group: Does group contain this_cpu.
- - * @cpus: Set of cpus considered for load balancing.
- - * @balance: Should we balance.
- - * @sgs: variable to hold the statistics for this group.
- - */
- -static inline void update_sg_lb_stats(struct sched_domain *sd,
- -                      struct sched_group *group, int this_cpu,
- -                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
- -                      int local_group, const struct cpumask *cpus,
- -                      int *balance, struct sg_lb_stats *sgs)
- -{
- -      unsigned long load, max_cpu_load, min_cpu_load;
- -      int i;
- -      unsigned int balance_cpu = -1, first_idle_cpu = 0;
- -      unsigned long sum_avg_load_per_task;
- -      unsigned long avg_load_per_task;
- -
- -      if (local_group) {
- -              balance_cpu = group_first_cpu(group);
- -              if (balance_cpu == this_cpu)
- -                      update_group_power(sd, this_cpu);
- -      }
- -
- -      /* Tally up the load of all CPUs in the group */
- -      sum_avg_load_per_task = avg_load_per_task = 0;
- -      max_cpu_load = 0;
- -      min_cpu_load = ~0UL;
- -
- -      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- -              struct rq *rq = cpu_rq(i);
- -
- -              if (*sd_idle && rq->nr_running)
- -                      *sd_idle = 0;
- -
- -              /* Bias balancing toward cpus of our domain */
- -              if (local_group) {
- -                      if (idle_cpu(i) && !first_idle_cpu) {
- -                              first_idle_cpu = 1;
- -                              balance_cpu = i;
- -                      }
- -
- -                      load = target_load(i, load_idx);
- -              } else {
- -                      load = source_load(i, load_idx);
- -                      if (load > max_cpu_load)
- -                              max_cpu_load = load;
- -                      if (min_cpu_load > load)
- -                              min_cpu_load = load;
- -              }
- -
- -              sgs->group_load += load;
- -              sgs->sum_nr_running += rq->nr_running;
- -              sgs->sum_weighted_load += weighted_cpuload(i);
- -
- -              sum_avg_load_per_task += cpu_avg_load_per_task(i);
- -      }
- -
- -      /*
- -       * First idle cpu or the first cpu(busiest) in this sched group
- -       * is eligible for doing load balancing at this and above
- -       * domains. In the newly idle case, we will allow all the cpu's
- -       * to do the newly idle load balance.
- -       */
- -      if (idle != CPU_NEWLY_IDLE && local_group &&
- -          balance_cpu != this_cpu && balance) {
- -              *balance = 0;
- -              return;
- -      }
- -
- -      /* Adjust by relative CPU power of the group */
- -      sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
- -
- -
- -      /*
- -       * Consider the group unbalanced when the imbalance is larger
- -       * than the average weight of two tasks.
- -       *
- -       * APZ: with cgroup the avg task weight can vary wildly and
- -       *      might not be a suitable number - should we keep a
- -       *      normalized nr_running number somewhere that negates
- -       *      the hierarchy?
- -       */
- -      avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
- -              group->cpu_power;
- -
- -      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- -              sgs->group_imb = 1;
- -
- -      sgs->group_capacity =
- -              DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
- -}
- -
- -/**
- - * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- - * @sd: sched_domain whose statistics are to be updated.
- - * @this_cpu: Cpu for which load balance is currently performed.
- - * @idle: Idle status of this_cpu
- - * @sd_idle: Idle status of the sched_domain containing group.
- - * @cpus: Set of cpus considered for load balancing.
- - * @balance: Should we balance.
- - * @sds: variable to hold the statistics for this sched_domain.
- - */
- -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- -                      enum cpu_idle_type idle, int *sd_idle,
- -                      const struct cpumask *cpus, int *balance,
- -                      struct sd_lb_stats *sds)
- -{
- -      struct sched_domain *child = sd->child;
- -      struct sched_group *group = sd->groups;
- -      struct sg_lb_stats sgs;
- -      int load_idx, prefer_sibling = 0;
- -
- -      if (child && child->flags & SD_PREFER_SIBLING)
- -              prefer_sibling = 1;
- -
- -      init_sd_power_savings_stats(sd, sds, idle);
- -      load_idx = get_sd_load_idx(sd, idle);
- -
- -      do {
- -              int local_group;
- -
- -              local_group = cpumask_test_cpu(this_cpu,
- -                                             sched_group_cpus(group));
- -              memset(&sgs, 0, sizeof(sgs));
- -              update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
- -                              local_group, cpus, balance, &sgs);
- -
- -              if (local_group && balance && !(*balance))
- -                      return;
- -
- -              sds->total_load += sgs.group_load;
- -              sds->total_pwr += group->cpu_power;
- -
- -              /*
- -               * In case the child domain prefers tasks go to siblings
- -               * first, lower the group capacity to one so that we'll try
- -               * and move all the excess tasks away.
- -               */
- -              if (prefer_sibling)
- -                      sgs.group_capacity = min(sgs.group_capacity, 1UL);
- -
- -              if (local_group) {
- -                      sds->this_load = sgs.avg_load;
- -                      sds->this = group;
- -                      sds->this_nr_running = sgs.sum_nr_running;
- -                      sds->this_load_per_task = sgs.sum_weighted_load;
- -              } else if (sgs.avg_load > sds->max_load &&
- -                         (sgs.sum_nr_running > sgs.group_capacity ||
- -                              sgs.group_imb)) {
- -                      sds->max_load = sgs.avg_load;
- -                      sds->busiest = group;
- -                      sds->busiest_nr_running = sgs.sum_nr_running;
- -                      sds->busiest_load_per_task = sgs.sum_weighted_load;
- -                      sds->group_imb = sgs.group_imb;
- -              }
- -
- -              update_sd_power_savings_stats(group, sds, local_group, &sgs);
- -              group = group->next;
- -      } while (group != sd->groups);
- -}
- -
- -/**
- - * fix_small_imbalance - Calculate the minor imbalance that exists
- - *                    amongst the groups of a sched_domain, during
- - *                    load balancing.
- - * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- - * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- - * @imbalance: Variable to store the imbalance.
- - */
- -static inline void fix_small_imbalance(struct sd_lb_stats *sds,
- -                              int this_cpu, unsigned long *imbalance)
- -{
- -      unsigned long tmp, pwr_now = 0, pwr_move = 0;
- -      unsigned int imbn = 2;
- -
- -      if (sds->this_nr_running) {
- -              sds->this_load_per_task /= sds->this_nr_running;
- -              if (sds->busiest_load_per_task >
- -                              sds->this_load_per_task)
- -                      imbn = 1;
- -      } else
- -              sds->this_load_per_task =
- -                      cpu_avg_load_per_task(this_cpu);
- -
- -      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
- -                      sds->busiest_load_per_task * imbn) {
- -              *imbalance = sds->busiest_load_per_task;
- -              return;
- -      }
- -
- -      /*
- -       * OK, we don't have enough imbalance to justify moving tasks,
- -       * however we may be able to increase total CPU power used by
- -       * moving them.
- -       */
- -
- -      pwr_now += sds->busiest->cpu_power *
- -                      min(sds->busiest_load_per_task, sds->max_load);
- -      pwr_now += sds->this->cpu_power *
- -                      min(sds->this_load_per_task, sds->this_load);
- -      pwr_now /= SCHED_LOAD_SCALE;
- -
- -      /* Amount of load we'd subtract */
- -      tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
- -              sds->busiest->cpu_power;
- -      if (sds->max_load > tmp)
- -              pwr_move += sds->busiest->cpu_power *
- -                      min(sds->busiest_load_per_task, sds->max_load - tmp);
- -
- -      /* Amount of load we'd add */
- -      if (sds->max_load * sds->busiest->cpu_power <
- -              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
- -              tmp = (sds->max_load * sds->busiest->cpu_power) /
- -                      sds->this->cpu_power;
- -      else
- -              tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
- -                      sds->this->cpu_power;
- -      pwr_move += sds->this->cpu_power *
- -                      min(sds->this_load_per_task, sds->this_load + tmp);
- -      pwr_move /= SCHED_LOAD_SCALE;
- -
- -      /* Move if we gain throughput */
- -      if (pwr_move > pwr_now)
- -              *imbalance = sds->busiest_load_per_task;
- -}
- -
- -/**
- - * calculate_imbalance - Calculate the amount of imbalance present within the
- - *                     groups of a given sched_domain during load balance.
- - * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- - * @this_cpu: Cpu for which currently load balance is being performed.
- - * @imbalance: The variable to store the imbalance.
- - */
- -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
- -              unsigned long *imbalance)
- -{
- -      unsigned long max_pull;
- -      /*
- -       * In the presence of smp nice balancing, certain scenarios can have
- -       * max load less than avg load(as we skip the groups at or below
- -       * its cpu_power, while calculating max_load..)
- -       */
- -      if (sds->max_load < sds->avg_load) {
- -              *imbalance = 0;
- -              return fix_small_imbalance(sds, this_cpu, imbalance);
- -      }
- -
- -      /* Don't want to pull so many tasks that a group would go idle */
- -      max_pull = min(sds->max_load - sds->avg_load,
- -                      sds->max_load - sds->busiest_load_per_task);
- -
- -      /* How much load to actually move to equalise the imbalance */
- -      *imbalance = min(max_pull * sds->busiest->cpu_power,
- -              (sds->avg_load - sds->this_load) * sds->this->cpu_power)
- -                      / SCHED_LOAD_SCALE;
- -
- -      /*
- -       * if *imbalance is less than the average load per runnable task
- -       * there is no guarantee that any tasks will be moved so we'll have
- -       * a think about bumping its value to force at least one task to be
- -       * moved
- -       */
- -      if (*imbalance < sds->busiest_load_per_task)
- -              return fix_small_imbalance(sds, this_cpu, imbalance);
- -
- -}
- -/******* find_busiest_group() helpers end here *********************/
- -
- -/**
- - * find_busiest_group - Returns the busiest group within the sched_domain
- - * if there is an imbalance. If there isn't an imbalance, and
- - * the user has opted for power-savings, it returns a group whose
- - * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- - * such a group exists.
- - *
- - * Also calculates the amount of weighted load which should be moved
- - * to restore balance.
- - *
- - * @sd: The sched_domain whose busiest group is to be returned.
- - * @this_cpu: The cpu for which load balancing is currently being performed.
- - * @imbalance: Variable which stores amount of weighted load which should
- - *            be moved to restore balance/put a group to idle.
- - * @idle: The idle status of this_cpu.
- - * @sd_idle: The idleness of sd
- - * @cpus: The set of CPUs under consideration for load-balancing.
- - * @balance: Pointer to a variable indicating if this_cpu
- - *    is the appropriate cpu to perform load balancing at this_level.
- - *
- - * Returns:   - the busiest group if imbalance exists.
- - *            - If no imbalance and user has opted for power-savings balance,
- - *               return the least loaded group whose CPUs can be
- - *               put to idle by rebalancing its tasks onto our group.
- - */
- -static struct sched_group *
- -find_busiest_group(struct sched_domain *sd, int this_cpu,
- -                 unsigned long *imbalance, enum cpu_idle_type idle,
- -                 int *sd_idle, const struct cpumask *cpus, int *balance)
- -{
- -      struct sd_lb_stats sds;
- -
- -      memset(&sds, 0, sizeof(sds));
- -
- -      /*
- -       * Compute the various statistics relevant for load balancing at
- -       * this level.
- -       */
- -      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
- -                                      balance, &sds);
- -
- -      /* Cases where imbalance does not exist from POV of this_cpu */
- -      /* 1) this_cpu is not the appropriate cpu to perform load balancing
- -       *    at this level.
- -       * 2) There is no busy sibling group to pull from.
- -       * 3) This group is the busiest group.
- -       * 4) This group is more busy than the avg busyness at this
- -       *    sched_domain.
- -       * 5) The imbalance is within the specified limit.
- -       * 6) Any rebalance would lead to ping-pong
- -       */
- -      if (balance && !(*balance))
- -              goto ret;
- -
- -      if (!sds.busiest || sds.busiest_nr_running == 0)
- -              goto out_balanced;
- -
- -      if (sds.this_load >= sds.max_load)
- -              goto out_balanced;
- -
- -      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
- -
- -      if (sds.this_load >= sds.avg_load)
- -              goto out_balanced;
- -
- -      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- -              goto out_balanced;
- -
- -      sds.busiest_load_per_task /= sds.busiest_nr_running;
- -      if (sds.group_imb)
- -              sds.busiest_load_per_task =
- -                      min(sds.busiest_load_per_task, sds.avg_load);
- -
- -      /*
- -       * We're trying to get all the cpus to the average_load, so we don't
- -       * want to push ourselves above the average load, nor do we wish to
- -       * reduce the max loaded cpu below the average load, as either of these
- -       * actions would just result in more rebalancing later, and ping-pong
- -       * tasks around. Thus we look for the minimum possible imbalance.
- -       * Negative imbalances (*we* are more loaded than anyone else) will
- -       * be counted as no imbalance for these purposes -- we can't fix that
- -       * by pulling tasks to us. Be careful of negative numbers as they'll
- -       * appear as very large values with unsigned longs.
- -       */
- -      if (sds.max_load <= sds.busiest_load_per_task)
- -              goto out_balanced;
- -
- -      /* Looks like there is an imbalance. Compute it */
- -      calculate_imbalance(&sds, this_cpu, imbalance);
- -      return sds.busiest;
- -
- -out_balanced:
- -      /*
- -       * There is no obvious imbalance. But check if we can do some balancing
- -       * to save power.
- -       */
- -      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
- -              return sds.busiest;
- -ret:
- -      *imbalance = 0;
- -      return NULL;
- -}
- -
- -/*
- - * find_busiest_queue - find the busiest runqueue among the cpus in group.
- - */
- -static struct rq *
- -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- -                 unsigned long imbalance, const struct cpumask *cpus)
- -{
- -      struct rq *busiest = NULL, *rq;
- -      unsigned long max_load = 0;
- -      int i;
- -
- -      for_each_cpu(i, sched_group_cpus(group)) {
- -              unsigned long power = power_of(i);
- -              unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
- -              unsigned long wl;
- -
- -              if (!cpumask_test_cpu(i, cpus))
- -                      continue;
- -
- -              rq = cpu_rq(i);
- -              wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
- -              wl /= power;
- -
- -              if (capacity && rq->nr_running == 1 && wl > imbalance)
- -                      continue;
- -
- -              if (wl > max_load) {
- -                      max_load = wl;
- -                      busiest = rq;
- -              }
- -      }
- -
- -      return busiest;
- -}
- -
- -/*
- - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
- - * so long as it is large enough.
- - */
- -#define MAX_PINNED_INTERVAL   512
- -
- -/* Working cpumask for load_balance and load_balance_newidle. */
- -static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
- -
- -/*
- - * Check this_cpu to ensure it is balanced within domain. Attempt to move
- - * tasks if there is an imbalance.
- - */
- -static int load_balance(int this_cpu, struct rq *this_rq,
- -                      struct sched_domain *sd, enum cpu_idle_type idle,
- -                      int *balance)
- -{
- -      int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
- -      struct sched_group *group;
- -      unsigned long imbalance;
- -      struct rq *busiest;
- -      unsigned long flags;
- -      struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
- -
- -      cpumask_copy(cpus, cpu_active_mask);
- -
- -      /*
- -       * When power savings policy is enabled for the parent domain, idle
- -       * sibling can pick up load irrespective of busy siblings. In this case,
- -       * let the state of idle sibling percolate up as CPU_IDLE, instead of
- -       * portraying it as CPU_NOT_IDLE.
- -       */
- -      if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- -          !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              sd_idle = 1;
- -
- -      schedstat_inc(sd, lb_count[idle]);
- -
- -redo:
- -      update_shares(sd);
- -      group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
- -                                 cpus, balance);
- -
- -      if (*balance == 0)
- -              goto out_balanced;
- -
- -      if (!group) {
- -              schedstat_inc(sd, lb_nobusyg[idle]);
- -              goto out_balanced;
- -      }
- -
- -      busiest = find_busiest_queue(group, idle, imbalance, cpus);
- -      if (!busiest) {
- -              schedstat_inc(sd, lb_nobusyq[idle]);
- -              goto out_balanced;
- -      }
- -
- -      BUG_ON(busiest == this_rq);
- -
- -      schedstat_add(sd, lb_imbalance[idle], imbalance);
- -
- -      ld_moved = 0;
- -      if (busiest->nr_running > 1) {
- -              /*
- -               * Attempt to move tasks. If find_busiest_group has found
- -               * an imbalance but busiest->nr_running <= 1, the group is
- -               * still unbalanced. ld_moved simply stays zero, so it is
- -               * correctly treated as an imbalance.
- -               */
- -              local_irq_save(flags);
- -              double_rq_lock(this_rq, busiest);
- -              ld_moved = move_tasks(this_rq, this_cpu, busiest,
- -                                    imbalance, sd, idle, &all_pinned);
- -              double_rq_unlock(this_rq, busiest);
- -              local_irq_restore(flags);
- -
- -              /*
- -               * some other cpu did the load balance for us.
- -               */
- -              if (ld_moved && this_cpu != smp_processor_id())
- -                      resched_cpu(this_cpu);
- -
- -              /* All tasks on this runqueue were pinned by CPU affinity */
- -              if (unlikely(all_pinned)) {
- -                      cpumask_clear_cpu(cpu_of(busiest), cpus);
- -                      if (!cpumask_empty(cpus))
- -                              goto redo;
- -                      goto out_balanced;
- -              }
- -      }
- -
- -      if (!ld_moved) {
- -              schedstat_inc(sd, lb_failed[idle]);
- -              sd->nr_balance_failed++;
- -
- -              if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- -
- -                      raw_spin_lock_irqsave(&busiest->lock, flags);
- -
- -                      /* don't kick the migration_thread, if the curr
- -                       * task on busiest cpu can't be moved to this_cpu
- -                       */
- -                      if (!cpumask_test_cpu(this_cpu,
- -                                            &busiest->curr->cpus_allowed)) {
- -                              raw_spin_unlock_irqrestore(&busiest->lock,
- -                                                          flags);
- -                              all_pinned = 1;
- -                              goto out_one_pinned;
- -                      }
- -
- -                      if (!busiest->active_balance) {
- -                              busiest->active_balance = 1;
- -                              busiest->push_cpu = this_cpu;
- -                              active_balance = 1;
- -                      }
- -                      raw_spin_unlock_irqrestore(&busiest->lock, flags);
- -                      if (active_balance)
- -                              wake_up_process(busiest->migration_thread);
- -
- -                      /*
- -                       * We've kicked active balancing, reset the failure
- -                       * counter.
- -                       */
- -                      sd->nr_balance_failed = sd->cache_nice_tries+1;
- -              }
- -      } else
- -              sd->nr_balance_failed = 0;
- -
- -      if (likely(!active_balance)) {
- -              /* We were unbalanced, so reset the balancing interval */
- -              sd->balance_interval = sd->min_interval;
- -      } else {
- -              /*
- -               * If we've begun active balancing, start to back off. This
- -               * case may not be covered by the all_pinned logic if there
- -               * is only 1 task on the busy runqueue (because we don't call
- -               * move_tasks).
- -               */
- -              if (sd->balance_interval < sd->max_interval)
- -                      sd->balance_interval *= 2;
- -      }
- -
- -      if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- -          !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              ld_moved = -1;
- -
- -      goto out;
- -
- -out_balanced:
- -      schedstat_inc(sd, lb_balanced[idle]);
- -
- -      sd->nr_balance_failed = 0;
- -
- -out_one_pinned:
- -      /* tune up the balancing interval */
- -      if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
- -                      (sd->balance_interval < sd->max_interval))
- -              sd->balance_interval *= 2;
- -
- -      if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- -          !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              ld_moved = -1;
- -      else
- -              ld_moved = 0;
- -out:
- -      if (ld_moved)
- -              update_shares(sd);
- -      return ld_moved;
- -}
- -
- -/*
- - * Check this_cpu to ensure it is balanced within domain. Attempt to move
- - * tasks if there is an imbalance.
- - *
- - * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
- - * this_rq is locked.
- - */
- -static int
- -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
- -{
- -      struct sched_group *group;
- -      struct rq *busiest = NULL;
- -      unsigned long imbalance;
- -      int ld_moved = 0;
- -      int sd_idle = 0;
- -      int all_pinned = 0;
- -      struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
- -
- -      cpumask_copy(cpus, cpu_active_mask);
- -
- -      /*
- -       * When power savings policy is enabled for the parent domain, idle
- -       * sibling can pick up load irrespective of busy siblings. In this case,
- -       * let the state of idle sibling percolate up as IDLE, instead of
- -       * portraying it as CPU_NOT_IDLE.
- -       */
- -      if (sd->flags & SD_SHARE_CPUPOWER &&
- -          !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              sd_idle = 1;
- -
- -      schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
- -redo:
- -      update_shares_locked(this_rq, sd);
- -      group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
- -                                 &sd_idle, cpus, NULL);
- -      if (!group) {
- -              schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
- -              goto out_balanced;
- -      }
- -
- -      busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
- -      if (!busiest) {
- -              schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
- -              goto out_balanced;
- -      }
- -
- -      BUG_ON(busiest == this_rq);
- -
- -      schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
- -
- -      ld_moved = 0;
- -      if (busiest->nr_running > 1) {
- -              /* Attempt to move tasks */
- -              double_lock_balance(this_rq, busiest);
- -              /* this_rq->clock is already updated */
- -              update_rq_clock(busiest);
- -              ld_moved = move_tasks(this_rq, this_cpu, busiest,
- -                                      imbalance, sd, CPU_NEWLY_IDLE,
- -                                      &all_pinned);
- -              double_unlock_balance(this_rq, busiest);
- -
- -              if (unlikely(all_pinned)) {
- -                      cpumask_clear_cpu(cpu_of(busiest), cpus);
- -                      if (!cpumask_empty(cpus))
- -                              goto redo;
- -              }
- -      }
- -
- -      if (!ld_moved) {
- -              int active_balance = 0;
- -
- -              schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
- -              if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- -                  !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -                      return -1;
- -
- -              if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- -                      return -1;
- -
- -              if (sd->nr_balance_failed++ < 2)
- -                      return -1;
- -
- -              /*
- -               * The only task running in a non-idle cpu can be moved to this
- -               * cpu in an attempt to completely freeup the other CPU
- -               * package. The same method used to move task in load_balance()
- -               * have been extended for load_balance_newidle() to speedup
- -               * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
- -               *
- -               * The package power saving logic comes from
- -               * find_busiest_group().  If there are no imbalance, then
- -               * f_b_g() will return NULL.  However when sched_mc={1,2} then
- -               * f_b_g() will select a group from which a running task may be
- -               * pulled to this cpu in order to make the other package idle.
- -               * If there is no opportunity to make a package idle and if
- -               * there are no imbalance, then f_b_g() will return NULL and no
- -               * action will be taken in load_balance_newidle().
- -               *
- -               * Under normal task pull operation due to imbalance, there
- -               * will be more than one task in the source run queue and
- -               * move_tasks() will succeed.  ld_moved will be true and this
- -               * active balance code will not be triggered.
- -               */
- -
- -              /* Lock busiest in correct order while this_rq is held */
- -              double_lock_balance(this_rq, busiest);
- -
- -              /*
- -               * don't kick the migration_thread, if the curr
- -               * task on busiest cpu can't be moved to this_cpu
- -               */
- -              if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
- -                      double_unlock_balance(this_rq, busiest);
- -                      all_pinned = 1;
- -                      return ld_moved;
- -              }
- -
- -              if (!busiest->active_balance) {
- -                      busiest->active_balance = 1;
- -                      busiest->push_cpu = this_cpu;
- -                      active_balance = 1;
- -              }
- -
- -              double_unlock_balance(this_rq, busiest);
- -              /*
- -               * Should not call ttwu while holding a rq->lock
- -               */
- -              raw_spin_unlock(&this_rq->lock);
- -              if (active_balance)
- -                      wake_up_process(busiest->migration_thread);
- -              raw_spin_lock(&this_rq->lock);
- -
- -      } else
- -              sd->nr_balance_failed = 0;
- -
- -      update_shares_locked(this_rq, sd);
- -      return ld_moved;
- -
- -out_balanced:
- -      schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
- -      if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- -          !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              return -1;
- -      sd->nr_balance_failed = 0;
- -
- -      return 0;
- -}
- -
- -/*
- - * idle_balance is called by schedule() if this_cpu is about to become
- - * idle. Attempts to pull tasks from other CPUs.
- - */
- -static void idle_balance(int this_cpu, struct rq *this_rq)
- -{
- -      struct sched_domain *sd;
- -      int pulled_task = 0;
- -      unsigned long next_balance = jiffies + HZ;
- -
- -      this_rq->idle_stamp = this_rq->clock;
- -
- -      if (this_rq->avg_idle < sysctl_sched_migration_cost)
- -              return;
- -
- -      for_each_domain(this_cpu, sd) {
- -              unsigned long interval;
- -
- -              if (!(sd->flags & SD_LOAD_BALANCE))
- -                      continue;
- -
- -              if (sd->flags & SD_BALANCE_NEWIDLE)
- -                      /* If we've pulled tasks over stop searching: */
- -                      pulled_task = load_balance_newidle(this_cpu, this_rq,
- -                                                         sd);
- -
- -              interval = msecs_to_jiffies(sd->balance_interval);
- -              if (time_after(next_balance, sd->last_balance + interval))
- -                      next_balance = sd->last_balance + interval;
- -              if (pulled_task) {
- -                      this_rq->idle_stamp = 0;
- -                      break;
- -              }
- -      }
- -      if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- -              /*
- -               * We are going idle. next_balance may be set based on
- -               * a busy processor. So reset next_balance.
- -               */
- -              this_rq->next_balance = next_balance;
- -      }
- -}
- -
- -/*
- - * active_load_balance is run by migration threads. It pushes running tasks
- - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- - * running on each physical CPU where possible, and avoids physical /
- - * logical imbalances.
- - *
- - * Called with busiest_rq locked.
- - */
- -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
- -{
- -      int target_cpu = busiest_rq->push_cpu;
- -      struct sched_domain *sd;
- -      struct rq *target_rq;
- -
- -      /* Is there any task to move? */
- -      if (busiest_rq->nr_running <= 1)
- -              return;
- -
- -      target_rq = cpu_rq(target_cpu);
- -
- -      /*
- -       * This condition is "impossible", if it occurs
- -       * we need to fix it. Originally reported by
- -       * Bjorn Helgaas on a 128-cpu setup.
- -       */
- -      BUG_ON(busiest_rq == target_rq);
- -
- -      /* move a task from busiest_rq to target_rq */
- -      double_lock_balance(busiest_rq, target_rq);
- -      update_rq_clock(busiest_rq);
- -      update_rq_clock(target_rq);
- -
- -      /* Search for an sd spanning us and the target CPU. */
- -      for_each_domain(target_cpu, sd) {
- -              if ((sd->flags & SD_LOAD_BALANCE) &&
- -                  cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- -                              break;
- -      }
- -
- -      if (likely(sd)) {
- -              schedstat_inc(sd, alb_count);
- -
- -              if (move_one_task(target_rq, target_cpu, busiest_rq,
- -                                sd, CPU_IDLE))
- -                      schedstat_inc(sd, alb_pushed);
- -              else
- -                      schedstat_inc(sd, alb_failed);
- -      }
- -      double_unlock_balance(busiest_rq, target_rq);
- -}
- -
- -#ifdef CONFIG_NO_HZ
- -static struct {
- -      atomic_t load_balancer;
- -      cpumask_var_t cpu_mask;
- -      cpumask_var_t ilb_grp_nohz_mask;
- -} nohz ____cacheline_aligned = {
- -      .load_balancer = ATOMIC_INIT(-1),
- -};
- -
- -int get_nohz_load_balancer(void)
- -{
- -      return atomic_read(&nohz.load_balancer);
- -}
- -
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -/**
- - * lowest_flag_domain - Return lowest sched_domain containing flag.
- - * @cpu:      The cpu whose lowest level of sched domain is to
- - *            be returned.
- - * @flag:     The flag to check for the lowest sched_domain
- - *            for the given cpu.
- - *
- - * Returns the lowest sched_domain of a cpu which contains the given flag.
- - */
- -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
- -{
- -      struct sched_domain *sd;
- -
- -      for_each_domain(cpu, sd)
- -              if (sd && (sd->flags & flag))
- -                      break;
- -
- -      return sd;
- -}
- -
- -/**
- - * for_each_flag_domain - Iterates over sched_domains containing the flag.
- - * @cpu:      The cpu whose domains we're iterating over.
- - * @sd:               variable holding the value of the power_savings_sd
- - *            for cpu.
- - * @flag:     The flag to filter the sched_domains to be iterated.
- - *
- - * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- - * set, starting from the lowest sched_domain to the highest.
- - */
- -#define for_each_flag_domain(cpu, sd, flag) \
- -      for (sd = lowest_flag_domain(cpu, flag); \
- -              (sd && (sd->flags & flag)); sd = sd->parent)
- -
- -/**
- - * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- - * @ilb_group:        group to be checked for semi-idleness
- - *
- - * Returns:   1 if the group is semi-idle. 0 otherwise.
- - *
- - * We define a sched_group to be semi idle if it has atleast one idle-CPU
- - * and atleast one non-idle CPU. This helper function checks if the given
- - * sched_group is semi-idle or not.
- - */
- -static inline int is_semi_idle_group(struct sched_group *ilb_group)
- -{
- -      cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
- -                                      sched_group_cpus(ilb_group));
- -
- -      /*
- -       * A sched_group is semi-idle when it has atleast one busy cpu
- -       * and atleast one idle cpu.
- -       */
- -      if (cpumask_empty(nohz.ilb_grp_nohz_mask))
- -              return 0;
- -
- -      if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
- -              return 0;
- -
- -      return 1;
- -}
- -/**
- - * find_new_ilb - Finds the optimum idle load balancer for nomination.
- - * @cpu:      The cpu which is nominating a new idle_load_balancer.
- - *
- - * Returns:   Returns the id of the idle load balancer if it exists,
- - *            Else, returns >= nr_cpu_ids.
- - *
- - * This algorithm picks the idle load balancer such that it belongs to a
- - * semi-idle powersavings sched_domain. The idea is to try and avoid
- - * completely idle packages/cores just for the purpose of idle load balancing
- - * when there are other idle cpu's which are better suited for that job.
- - */
- -static int find_new_ilb(int cpu)
- -{
- -      struct sched_domain *sd;
- -      struct sched_group *ilb_group;
- -
- -      /*
- -       * Have idle load balancer selection from semi-idle packages only
- -       * when power-aware load balancing is enabled
- -       */
- -      if (!(sched_smt_power_savings || sched_mc_power_savings))
- -              goto out_done;
- -
- -      /*
- -       * Optimize for the case when we have no idle CPUs or only one
- -       * idle CPU. Don't walk the sched_domain hierarchy in such cases
- -       */
- -      if (cpumask_weight(nohz.cpu_mask) < 2)
- -              goto out_done;
- -
- -      for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- -              ilb_group = sd->groups;
- -
- -              do {
- -                      if (is_semi_idle_group(ilb_group))
- -                              return cpumask_first(nohz.ilb_grp_nohz_mask);
+ +unsigned long nr_uninterruptible(void)
+ +{
+ +      unsigned long i, sum = 0;
   
- -                      ilb_group = ilb_group->next;
+ +      for_each_possible_cpu(i)
+ +              sum += cpu_rq(i)->nr_uninterruptible;
   
- -              } while (ilb_group != sd->groups);
- -      }
+ +      /*
+ +       * Since we read the counters lockless, it might be slightly
+ +       * inaccurate. Do not allow it to go below zero though:
+ +       */
+ +      if (unlikely((long)sum < 0))
+ +              sum = 0;
   
- -out_done:
- -      return cpumask_first(nohz.cpu_mask);
- -}
- -#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
- -static inline int find_new_ilb(int call_cpu)
- -{
- -      return cpumask_first(nohz.cpu_mask);
+ +      return sum;
   }
- -#endif
   
- -/*
- - * This routine will try to nominate the ilb (idle load balancing)
- - * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- - * load balancing on behalf of all those cpus. If all the cpus in the system
- - * go into this tickless mode, then there will be no ilb owner (as there is
- - * no need for one) and all the cpus will sleep till the next wakeup event
- - * arrives...
- - *
- - * For the ilb owner, tick is not stopped. And this tick will be used
- - * for idle load balancing. ilb owner will still be part of
- - * nohz.cpu_mask..
- - *
- - * While stopping the tick, this cpu will become the ilb owner if there
- - * is no other owner. And will be the owner till that cpu becomes busy
- - * or if all cpus in the system stop their ticks at which point
- - * there is no need for ilb owner.
- - *
- - * When the ilb owner becomes busy, it nominates another owner, during the
- - * next busy scheduler_tick()
- - */
- -int select_nohz_load_balancer(int stop_tick)
+ +unsigned long long nr_context_switches(void)
   {
- -      int cpu = smp_processor_id();
+ +      int i;
+ +      unsigned long long sum = 0;
   
- -      if (stop_tick) {
- -              cpu_rq(cpu)->in_nohz_recently = 1;
+ +      for_each_possible_cpu(i)
+ +              sum += cpu_rq(i)->nr_switches;
   
- -              if (!cpu_active(cpu)) {
- -                      if (atomic_read(&nohz.load_balancer) != cpu)
- -                              return 0;
+ +      return sum;
+ +}
   
- -                      /*
- -                       * If we are going offline and still the leader,
- -                       * give up!
- -                       */
- -                      if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
- -                              BUG();
+ +unsigned long nr_iowait(void)
+ +{
+ +      unsigned long i, sum = 0;
   
- -                      return 0;
- -              }
+ +      for_each_possible_cpu(i)
+ +              sum += atomic_read(&cpu_rq(i)->nr_iowait);
   
- -              cpumask_set_cpu(cpu, nohz.cpu_mask);
+ +      return sum;
+ +}
   
- -              /* time for ilb owner also to sleep */
- -              if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- -                      if (atomic_read(&nohz.load_balancer) == cpu)
- -                              atomic_set(&nohz.load_balancer, -1);
- -                      return 0;
- -              }
+ +unsigned long nr_iowait_cpu(void)
+ +{
+ +      struct rq *this = this_rq();
+ +      return atomic_read(&this->nr_iowait);
+ +}
   
- -              if (atomic_read(&nohz.load_balancer) == -1) {
- -                      /* make me the ilb owner */
- -                      if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- -                              return 1;
- -              } else if (atomic_read(&nohz.load_balancer) == cpu) {
- -                      int new_ilb;
+ +unsigned long this_cpu_load(void)
+ +{
+ +      struct rq *this = this_rq();
+ +      return this->cpu_load[0];
+ +}
   
- -                      if (!(sched_smt_power_savings ||
- -                                              sched_mc_power_savings))
- -                              return 1;
- -                      /*
- -                       * Check to see if there is a more power-efficient
- -                       * ilb.
- -                       */
- -                      new_ilb = find_new_ilb(cpu);
- -                      if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- -                              atomic_set(&nohz.load_balancer, -1);
- -                              resched_cpu(new_ilb);
- -                              return 0;
- -                      }
- -                      return 1;
- -              }
- -      } else {
- -              if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- -                      return 0;
   
- -              cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ +/* Variables and functions for calc_load */
+ +static atomic_long_t calc_load_tasks;
+ +static unsigned long calc_load_update;
+ +unsigned long avenrun[3];
+ +EXPORT_SYMBOL(avenrun);
   
- -              if (atomic_read(&nohz.load_balancer) == cpu)
- -                      if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
- -                              BUG();
- -      }
- -      return 0;
+ +/**
+ + * get_avenrun - get the load average array
+ + * @loads:    pointer to dest load array
+ + * @offset:   offset to add
+ + * @shift:    shift count to shift the result left
+ + *
+ + * These values are estimates at best, so no need for locking.
+ + */
+ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+ +{
+ +      loads[0] = (avenrun[0] + offset) << shift;
+ +      loads[1] = (avenrun[1] + offset) << shift;
+ +      loads[2] = (avenrun[2] + offset) << shift;
   }
- -#endif
   
- -static DEFINE_SPINLOCK(balancing);
+ +static unsigned long
+ +calc_load(unsigned long load, unsigned long exp, unsigned long active)
+ +{
+ +      load *= exp;
+ +      load += active * (FIXED_1 - exp);
+ +      return load >> FSHIFT;
+ +}
   
   /*
- - * It checks each scheduling domain to see if it is due to be balanced,
- - * and initiates a balancing operation if so.
- - *
- - * Balancing parameters are set up in arch_init_sched_domains.
+ + * calc_load - update the avenrun load estimates 10 ticks after the
+ + * CPUs have updated calc_load_tasks.
    */
- -static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+ +void calc_global_load(void)
   {
- -      int balance = 1;
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long interval;
- -      struct sched_domain *sd;
- -      /* Earliest time when we have to do rebalance again */
- -      unsigned long next_balance = jiffies + 60*HZ;
- -      int update_next_balance = 0;
- -      int need_serialize;
+ +      unsigned long upd = calc_load_update + 10;
+ +      long active;
   
- -      for_each_domain(cpu, sd) {
- -              if (!(sd->flags & SD_LOAD_BALANCE))
- -                      continue;
+ +      if (time_before(jiffies, upd))
+ +              return;
   
- -              interval = sd->balance_interval;
- -              if (idle != CPU_IDLE)
- -                      interval *= sd->busy_factor;
+ +      active = atomic_long_read(&calc_load_tasks);
+ +      active = active > 0 ? active * FIXED_1 : 0;
   
- -              /* scale ms to jiffies */
- -              interval = msecs_to_jiffies(interval);
- -              if (unlikely(!interval))
- -                      interval = 1;
- -              if (interval > HZ*NR_CPUS/10)
- -                      interval = HZ*NR_CPUS/10;
+ +      avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ +      avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ +      avenrun[2] = calc_load(avenrun[2], EXP_15, active);
   
- -              need_serialize = sd->flags & SD_SERIALIZE;
+ +      calc_load_update += LOAD_FREQ;
+ +}
   
- -              if (need_serialize) {
- -                      if (!spin_trylock(&balancing))
- -                              goto out;
- -              }
+ +/*
+ + * Either called from update_cpu_load() or from a cpu going idle
+ + */
+ +static void calc_load_account_active(struct rq *this_rq)
+ +{
+ +      long nr_active, delta;
   
- -              if (time_after_eq(jiffies, sd->last_balance + interval)) {
- -                      if (load_balance(cpu, rq, sd, idle, &balance)) {
- -                              /*
- -                               * We've pulled tasks over so either we're no
- -                               * longer idle, or one of our SMT siblings is
- -                               * not idle.
- -                               */
- -                              idle = CPU_NOT_IDLE;
- -                      }
- -                      sd->last_balance = jiffies;
- -              }
- -              if (need_serialize)
- -                      spin_unlock(&balancing);
- -out:
- -              if (time_after(next_balance, sd->last_balance + interval)) {
- -                      next_balance = sd->last_balance + interval;
- -                      update_next_balance = 1;
- -              }
+ +      nr_active = this_rq->nr_running;
+ +      nr_active += (long) this_rq->nr_uninterruptible;
   
- -              /*
- -               * Stop the load balance at this level. There is another
- -               * CPU in our sched group which is doing load balancing more
- -               * actively.
- -               */
- -              if (!balance)
- -                      break;
+ +      if (nr_active != this_rq->calc_load_active) {
+ +              delta = nr_active - this_rq->calc_load_active;
+ +              this_rq->calc_load_active = nr_active;
+ +              atomic_long_add(delta, &calc_load_tasks);
         }
- -
- -      /*
- -       * next_balance will be updated only when there is a need.
- -       * When the cpu is attached to null domain for ex, it will not be
- -       * updated.
- -       */
- -      if (likely(update_next_balance))
- -              rq->next_balance = next_balance;
   }
   
   /*
- - * run_rebalance_domains is triggered when needed from the scheduler tick.
- - * In CONFIG_NO_HZ case, the idle load balance owner will do the
- - * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ + * Update rq->cpu_load[] statistics. This function is usually called every
+ + * scheduler tick (TICK_NSEC).
    */
- -static void run_rebalance_domains(struct softirq_action *h)
+ +static void update_cpu_load(struct rq *this_rq)
   {
- -      int this_cpu = smp_processor_id();
- -      struct rq *this_rq = cpu_rq(this_cpu);
- -      enum cpu_idle_type idle = this_rq->idle_at_tick ?
- -                                              CPU_IDLE : CPU_NOT_IDLE;
- -
- -      rebalance_domains(this_cpu, idle);
+ +      unsigned long this_load = this_rq->load.weight;
+ +      int i, scale;
   
- -#ifdef CONFIG_NO_HZ
- -      /*
- -       * If this cpu is the owner for idle load balancing, then do the
- -       * balancing on behalf of the other idle cpus whose ticks are
- -       * stopped.
- -       */
- -      if (this_rq->idle_at_tick &&
- -          atomic_read(&nohz.load_balancer) == this_cpu) {
- -              struct rq *rq;
- -              int balance_cpu;
+ +      this_rq->nr_load_updates++;
   
- -              for_each_cpu(balance_cpu, nohz.cpu_mask) {
- -                      if (balance_cpu == this_cpu)
- -                              continue;
+ +      /* Update our load: */
+ +      for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ +              unsigned long old_load, new_load;
   
- -                      /*
- -                       * If this cpu gets work to do, stop the load balancing
- -                       * work being done for other cpus. Next load
- -                       * balancing owner will pick it up.
- -                       */
- -                      if (need_resched())
- -                              break;
+ +              /* scale is effectively 1 << i now, and >> i divides by scale */
   
- -                      rebalance_domains(balance_cpu, CPU_IDLE);
+ +              old_load = this_rq->cpu_load[i];
+ +              new_load = this_load;
+ +              /*
+ +               * Round up the averaging division if load is increasing. This
+ +               * prevents us from getting stuck on 9 if the load is 10, for
+ +               * example.
+ +               */
+ +              if (new_load > old_load)
+ +                      new_load += scale-1;
+ +              this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ +      }
   
- -                      rq = cpu_rq(balance_cpu);
- -                      if (time_after(this_rq->next_balance, rq->next_balance))
- -                              this_rq->next_balance = rq->next_balance;
- -              }
+ +      if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ +              this_rq->calc_load_update += LOAD_FREQ;
+ +              calc_load_account_active(this_rq);
         }
- -#endif
   }
   
- -static inline int on_null_domain(int cpu)
- -{
- -      return !rcu_dereference(cpu_rq(cpu)->sd);
- -}
+ +#ifdef CONFIG_SMP
   
   /*
- - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- - *
- - * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- - * idle load balancing owner or decide to stop the periodic load balancing,
- - * if the whole system is idle.
+ + * sched_exec - execve() is a valuable balancing opportunity, because at
+ + * this point the task has the smallest effective memory and cache footprint.
    */
- -static inline void trigger_load_balance(struct rq *rq, int cpu)
+ +void sched_exec(void)
   {
- -#ifdef CONFIG_NO_HZ
- -      /*
- -       * If we were in the nohz mode recently and busy at the current
- -       * scheduler tick, then check if we need to nominate new idle
- -       * load balancer.
- -       */
- -      if (rq->in_nohz_recently && !rq->idle_at_tick) {
- -              rq->in_nohz_recently = 0;
- -
- -              if (atomic_read(&nohz.load_balancer) == cpu) {
- -                      cpumask_clear_cpu(cpu, nohz.cpu_mask);
- -                      atomic_set(&nohz.load_balancer, -1);
- -              }
- -
- -              if (atomic_read(&nohz.load_balancer) == -1) {
- -                      int ilb = find_new_ilb(cpu);
+ +      struct task_struct *p = current;
+ +      struct migration_req req;
+ +      int dest_cpu, this_cpu;
+ +      unsigned long flags;
+ +      struct rq *rq;
   
- -                      if (ilb < nr_cpu_ids)
- -                              resched_cpu(ilb);
- -              }
+ +again:
+ +      this_cpu = get_cpu();
+ +      dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
+ +      if (dest_cpu == this_cpu) {
+ +              put_cpu();
+ +              return;
         }
   
+ +      rq = task_rq_lock(p, &flags);
+ +      put_cpu();
+ +
         /*
- -       * If this cpu is idle and doing idle load balancing for all the
- -       * cpus with ticks stopped, is it time for that to stop?
+ +       * select_task_rq() can race against ->cpus_allowed
          */
- -      if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- -          cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- -              resched_cpu(cpu);
- -              return;
+ +      if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
+ +          || unlikely(!cpu_active(dest_cpu))) {
+ +              task_rq_unlock(rq, &flags);
+ +              goto again;
         }
   
- -      /*
- -       * If this cpu is idle and the idle load balancing is done by
- -       * someone else, then no need raise the SCHED_SOFTIRQ
- -       */
- -      if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- -          cpumask_test_cpu(cpu, nohz.cpu_mask))
- -              return;
- -#endif
- -      /* Don't need to rebalance while attached to NULL domain */
- -      if (time_after_eq(jiffies, rq->next_balance) &&
- -          likely(!on_null_domain(cpu)))
- -              raise_softirq(SCHED_SOFTIRQ);
- -}
+ +      /* force the process onto the specified CPU */
+ +      if (migrate_task(p, dest_cpu, &req)) {
+ +              /* Need to wait for migration thread (might exit: take ref). */
+ +              struct task_struct *mt = rq->migration_thread;
   
- -#else /* CONFIG_SMP */
+ +              get_task_struct(mt);
+ +              task_rq_unlock(rq, &flags);
+ +              wake_up_process(mt);
+ +              put_task_struct(mt);
+ +              wait_for_completion(&req.done);
   
- -/*
- - * on UP we do not need to balance between CPUs:
- - */
- -static inline void idle_balance(int cpu, struct rq *rq)
- -{
+ +              return;
+ +      }
+ +      task_rq_unlock(rq, &flags);
   }
   
   #endif
@@@ -3515,7 -5309,7 +3515,7 @@@ void scheduler_tick(void
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
   
- -      perf_event_task_tick(curr, cpu);
+ +      perf_event_task_tick(curr);
   
   #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@@ -3729,7 -5523,7 +3729,7 @@@ need_resched_nonpreemptible
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
- -              perf_event_task_sched_out(prev, next, cpu);
+ +              perf_event_task_sched_out(prev, next);
   
                 rq->nr_switches++;
                 rq->curr = next;
@@@ -4260,7 -6054,7 +4260,7 @@@ void rt_mutex_setprio(struct task_struc
         unsigned long flags;
         int oldprio, on_rq, running;
         struct rq *rq;
- -      const struct sched_class *prev_class = p->sched_class;
+ +      const struct sched_class *prev_class;
   
         BUG_ON(prio < 0 || prio > MAX_PRIO);
   
@@@ -4268,7 -6062,6 +4268,7 @@@
         update_rq_clock(rq);
   
         oldprio = p->prio;
+ +      prev_class = p->sched_class;
         on_rq = p->se.on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@@ -4286,7 -6079,7 +4286,7 @@@
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq) {
- -              enqueue_task(rq, p, 0);
+ +              enqueue_task(rq, p, 0, oldprio < prio);
   
                 check_class_changed(rq, p, prev_class, oldprio, running);
         }
@@@ -4330,7 -6123,7 +4330,7 @@@ void set_user_nice(struct task_struct *
         delta = p->prio - old_prio;
   
         if (on_rq) {
- -              enqueue_task(rq, p, 0);
+ +              enqueue_task(rq, p, 0, false);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@@ -4488,7 -6281,7 +4488,7 @@@ static int __sched_setscheduler(struct 
   {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
- -      const struct sched_class *prev_class = p->sched_class;
+ +      const struct sched_class *prev_class;
         struct rq *rq;
         int reset_on_fork;
   
@@@ -4602,7 -6395,6 +4602,7 @@@ recheck
         p->sched_reset_on_fork = reset_on_fork;
   
         oldprio = p->prio;
+ +      prev_class = p->sched_class;
         __setscheduler(rq, p, policy, param->sched_priority);
   
         if (running)
@@@ -5353,8 -7145,27 +5353,8 @@@ int set_cpus_allowed_ptr(struct task_st
         struct rq *rq;
         int ret = 0;
   
- -      /*
- -       * Since we rely on wake-ups to migrate sleeping tasks, don't change
- -       * the ->cpus_allowed mask from under waking tasks, which would be
- -       * possible when we change rq->lock in ttwu(), so synchronize against
- -       * TASK_WAKING to avoid that.
- -       *
- -       * Make an exception for freshly cloned tasks, since cpuset namespaces
- -       * might move the task about, we have to validate the target in
- -       * wake_up_new_task() anyway since the cpu might have gone away.
- -       */
- -again:
- -      while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
- -              cpu_relax();
- -
         rq = task_rq_lock(p, &flags);
   
- -      if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
- -              task_rq_unlock(rq, &flags);
- -              goto again;
- -      }
- -
         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                 ret = -EINVAL;
                 goto out;
@@@ -7641,6 -9452,7 +7641,6 @@@ static void init_tg_rt_entry(struct tas
         tg->rt_rq[cpu] = rt_rq;
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
- -      rt_rq->rt_se = rt_se;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
         if (add)
                 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@@ -7671,6 -9483,9 +7671,6 @@@ void __init sched_init(void
   #ifdef CONFIG_RT_GROUP_SCHED
         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
   #endif
- -#ifdef CONFIG_USER_SCHED
- -      alloc_size *= 2;
- -#endif
   #ifdef CONFIG_CPUMASK_OFFSTACK
         alloc_size += num_possible_cpus() * cpumask_size();
   #endif
@@@ -7684,6 -9499,13 +7684,6 @@@
                 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
- -#ifdef CONFIG_USER_SCHED
- -              root_task_group.se = (struct sched_entity **)ptr;
- -              ptr += nr_cpu_ids * sizeof(void **);
- -
- -              root_task_group.cfs_rq = (struct cfs_rq **)ptr;
- -              ptr += nr_cpu_ids * sizeof(void **);
- -#endif /* CONFIG_USER_SCHED */
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   #ifdef CONFIG_RT_GROUP_SCHED
                 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@@ -7692,6 -9514,13 +7692,6 @@@
                 init_task_group.rt_rq = (struct rt_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
   
- -#ifdef CONFIG_USER_SCHED
- -              root_task_group.rt_se = (struct sched_rt_entity **)ptr;
- -              ptr += nr_cpu_ids * sizeof(void **);
- -
- -              root_task_group.rt_rq = (struct rt_rq **)ptr;
- -              ptr += nr_cpu_ids * sizeof(void **);
- -#endif /* CONFIG_USER_SCHED */
   #endif /* CONFIG_RT_GROUP_SCHED */
   #ifdef CONFIG_CPUMASK_OFFSTACK
                 for_each_possible_cpu(i) {
@@@ -7711,13 -9540,22 +7711,13 @@@
   #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&init_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
- -#ifdef CONFIG_USER_SCHED
- -      init_rt_bandwidth(&root_task_group.rt_bandwidth,
- -                      global_rt_period(), RUNTIME_INF);
- -#endif /* CONFIG_USER_SCHED */
   #endif /* CONFIG_RT_GROUP_SCHED */
   
- -#ifdef CONFIG_GROUP_SCHED
+ +#ifdef CONFIG_CGROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
         INIT_LIST_HEAD(&init_task_group.children);
   
- -#ifdef CONFIG_USER_SCHED
- -      INIT_LIST_HEAD(&root_task_group.children);
- -      init_task_group.parent = &root_task_group;
- -      list_add(&init_task_group.siblings, &root_task_group.children);
- -#endif /* CONFIG_USER_SCHED */
- -#endif /* CONFIG_GROUP_SCHED */
+ +#endif /* CONFIG_CGROUP_SCHED */
   
   #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
         update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@@ -7757,6 -9595,25 +7757,6 @@@
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                  */
                 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
- -#elif defined CONFIG_USER_SCHED
- -              root_task_group.shares = NICE_0_LOAD;
- -              init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
- -              /*
- -               * In case of task-groups formed thr' the user id of tasks,
- -               * init_task_group represents tasks belonging to root user.
- -               * Hence it forms a sibling of all subsequent groups formed.
- -               * In this case, init_task_group gets only a fraction of overall
- -               * system cpu resource, based on the weight assigned to root
- -               * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
- -               * by letting tasks of init_task_group sit in a separate cfs_rq
- -               * (init_tg_cfs_rq) and having one entity represent this group of
- -               * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
- -               */
- -              init_tg_cfs_entry(&init_task_group,
- -                              &per_cpu(init_tg_cfs_rq, i),
- -                              &per_cpu(init_sched_entity, i), i, 1,
- -                              root_task_group.se[i]);
- -
   #endif
   #endif /* CONFIG_FAIR_GROUP_SCHED */
   
@@@ -7765,6 -9622,12 +7765,6 @@@
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
   #ifdef CONFIG_CGROUP_SCHED
                 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
- -#elif defined CONFIG_USER_SCHED
- -              init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
- -              init_tg_rt_entry(&init_task_group,
- -                              &per_cpu(init_rt_rq_var, i),
- -                              &per_cpu(init_sched_rt_entity, i), i, 1,
- -                              root_task_group.rt_se[i]);
   #endif
   #endif
   
@@@ -7849,7 -9712,7 +7849,7 @@@ static inline int preempt_count_equals(
         return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
   }
   
- -void __might_sleep(char *file, int line, int preempt_offset)
+ +void __might_sleep(const char *file, int line, int preempt_offset)
   {
   #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
@@@ -8160,7 -10023,7 +8160,7 @@@ static inline void unregister_rt_sched_
   }
   #endif /* CONFIG_RT_GROUP_SCHED */
   
- -#ifdef CONFIG_GROUP_SCHED
+ +#ifdef CONFIG_CGROUP_SCHED
   static void free_sched_group(struct task_group *tg)
   {
         free_fair_sched_group(tg);
@@@ -8265,11 -10128,11 +8265,11 @@@ void sched_move_task(struct task_struc
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (on_rq)
- -              enqueue_task(rq, tsk, 0);
+ +              enqueue_task(rq, tsk, 0, false);
   
         task_rq_unlock(rq, &flags);
   }
- -#endif /* CONFIG_GROUP_SCHED */
+ +#endif /* CONFIG_CGROUP_SCHED */
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
   static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@@ -8411,6 -10274,13 +8411,6 @@@ static int tg_schedulable(struct task_g
                 runtime = d->rt_runtime;
         }
   
- -#ifdef CONFIG_USER_SCHED
- -      if (tg == &root_task_group) {
- -              period = global_rt_period();
- -              runtime = global_rt_runtime();
- -      }
- -#endif
- -
         /*
          * Cannot have more runtime than the period.
          */
@@@ -8813,7 -10683,7 +8813,7 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 *cpuusage;
+       u64 __percpu *cpuusage;
         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
   };
@@@ -9029,23 -10899,6 +9029,23 @@@ static void cpuacct_charge(struct task_
         rcu_read_unlock();
   }
   
+ +/*
+ + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ + * in cputime_t units. As a result, cpuacct_update_stats calls
+ + * percpu_counter_add with values large enough to always overflow the
+ + * per cpu batch limit causing bad SMP scalability.
+ + *
+ + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ + * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ + */
+ +#ifdef CONFIG_SMP
+ +#define CPUACCT_BATCH \
+ +      min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+ +#else
+ +#define CPUACCT_BATCH 0
+ +#endif
+ +
   /*
    * Charge the system/user time to the task's accounting group.
    */
@@@ -9053,7 -10906,6 +9053,7 @@@ static void cpuacct_update_stats(struc
                 enum cpuacct_stat_index idx, cputime_t val)
   {
         struct cpuacct *ca;
+ +      int batch = CPUACCT_BATCH;
   
         if (unlikely(!cpuacct_subsys.active))
                 return;
@@@ -9062,7 -10914,7 +9062,7 @@@
         ca = task_ca(tsk);
   
         do {
- -              percpu_counter_add(&ca->cpustat[idx], val);
+ +              __percpu_counter_add(&ca->cpustat[idx], val, batch);
                 ca = ca->parent;
         } while (ca);
         rcu_read_unlock();
diff --combined kernel/trace/trace.c

index 032c57c,667ba80..ed01fdb
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -32,7 -32,6 +32,7 @@@
   #include <linux/splice.h>
   #include <linux/kdebug.h>
   #include <linux/string.h>
+ +#include <linux/rwsem.h>
   #include <linux/ctype.h>
   #include <linux/init.h>
   #include <linux/poll.h>
@@@ -92,17 -91,20 +92,17 @@@ DEFINE_PER_CPU(int, ftrace_cpu_disabled
   static inline void ftrace_disable_cpu(void)
   {
         preempt_disable();
-       __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
+       __this_cpu_inc(ftrace_cpu_disabled);
   }
   
   static inline void ftrace_enable_cpu(void)
   {
-       __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
+       __this_cpu_dec(ftrace_cpu_disabled);
         preempt_enable();
   }
   
   static cpumask_var_t __read_mostly    tracing_buffer_mask;
   
- -/* Define which cpu buffers are currently read in trace_pipe */
- -static cpumask_var_t                  tracing_reader_cpumask;
- -
   #define for_each_tracing_cpu(cpu)     \
         for_each_cpu(cpu, tracing_buffer_mask)
   
@@@ -241,91 -243,12 +241,91 @@@ static struct tracer            *current_trace __
   
   /*
    * trace_types_lock is used to protect the trace_types list.
- - * This lock is also used to keep user access serialized.
- - * Accesses from userspace will grab this lock while userspace
- - * activities happen inside the kernel.
    */
   static DEFINE_MUTEX(trace_types_lock);
   
+ +/*
+ + * serialize the access of the ring buffer
+ + *
+ + * The ring buffer serializes readers, but that is only low-level protection.
+ + * The validity of the events (as returned by ring_buffer_peek() etc.)
+ + * is not protected by the ring buffer.
+ + *
+ + * The content of events may become garbage if we allow another process to
+ + * consume these events concurrently:
+ + *   A) The page of the consumed events may become a normal page
+ + *      (not a reader page) in the ring buffer, and this page will be
+ + *      rewritten by the events producer.
+ + *   B) The page of the consumed events may become a page for splice_read,
+ + *      and this page will be returned to the system.
+ + *
+ + * These primitives allow multiple processes to access different cpu ring
+ + * buffers concurrently.
+ + *
+ + * These primitives don't distinguish read-only from read-consume access.
+ + * Multiple read-only accesses are also serialized.
+ + */
+ +
+ +#ifdef CONFIG_SMP
+ +static DECLARE_RWSEM(all_cpu_access_lock);
+ +static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+ +
+ +static inline void trace_access_lock(int cpu)
+ +{
+ +      if (cpu == TRACE_PIPE_ALL_CPU) {
+ +              /* gain it for accessing the whole ring buffer. */
+ +              down_write(&all_cpu_access_lock);
+ +      } else {
+ +              /* gain it for accessing a cpu ring buffer. */
+ +
+ +              /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+ +              down_read(&all_cpu_access_lock);
+ +
+ +              /* Secondly block other access to this @cpu ring buffer. */
+ +              mutex_lock(&per_cpu(cpu_access_lock, cpu));
+ +      }
+ +}
+ +
+ +static inline void trace_access_unlock(int cpu)
+ +{
+ +      if (cpu == TRACE_PIPE_ALL_CPU) {
+ +              up_write(&all_cpu_access_lock);
+ +      } else {
+ +              mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+ +              up_read(&all_cpu_access_lock);
+ +      }
+ +}
+ +
+ +static inline void trace_access_lock_init(void)
+ +{
+ +      int cpu;
+ +
+ +      for_each_possible_cpu(cpu)
+ +              mutex_init(&per_cpu(cpu_access_lock, cpu));
+ +}
+ +
+ +#else
+ +
+ +static DEFINE_MUTEX(access_lock);
+ +
+ +static inline void trace_access_lock(int cpu)
+ +{
+ +      (void)cpu;
+ +      mutex_lock(&access_lock);
+ +}
+ +
+ +static inline void trace_access_unlock(int cpu)
+ +{
+ +      (void)cpu;
+ +      mutex_unlock(&access_lock);
+ +}
+ +
+ +static inline void trace_access_lock_init(void)
+ +{
+ +}
+ +
+ +#endif
+ +
   /* trace_wait is a waitqueue for tasks blocked on trace_poll */
   static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
   
@@@ -1166,7 -1089,7 +1166,7 @@@ trace_function(struct trace_array *tr
         struct ftrace_entry *entry;
   
         /* If we are reading the ring buffer, don't trace */
-       if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
+       if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                 return;
   
         event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@@ -1397,10 -1320,8 +1397,10 @@@ int trace_vbprintk(unsigned long ip, co
         entry->fmt                      = fmt;
   
         memcpy(entry->buf, trace_buf, sizeof(u32) * len);
- -      if (!filter_check_discard(call, entry, buffer, event))
+ +      if (!filter_check_discard(call, entry, buffer, event)) {
                 ring_buffer_unlock_commit(buffer, event);
+ +              ftrace_trace_stack(buffer, flags, 6, pc);
+ +      }
   
   out_unlock:
         arch_spin_unlock(&trace_buf_lock);
@@@ -1473,10 -1394,8 +1473,10 @@@ int trace_array_vprintk(struct trace_ar
   
         memcpy(&entry->buf, trace_buf, len);
         entry->buf[len] = '\0';
- -      if (!filter_check_discard(call, entry, buffer, event))
+ +      if (!filter_check_discard(call, entry, buffer, event)) {
                 ring_buffer_unlock_commit(buffer, event);
+ +              ftrace_trace_stack(buffer, irq_flags, 6, pc);
+ +      }
   
    out_unlock:
         arch_spin_unlock(&trace_buf_lock);
@@@ -1666,6 -1585,12 +1666,6 @@@ static void tracing_iter_reset(struct t
   }
   
   /*
- - * No necessary locking here. The worst thing which can
- - * happen is loosing events consumed at the same time
- - * by a trace_pipe reader.
- - * Other than that, we don't risk to crash the ring buffer
- - * because it serializes the readers.
- - *
    * The current tracer is copied to avoid a global locking
    * all around.
    */
@@@ -1720,16 -1645,12 +1720,16 @@@ static void *s_start(struct seq_file *m
         }
   
         trace_event_read_lock();
+ +      trace_access_lock(cpu_file);
         return p;
   }
   
   static void s_stop(struct seq_file *m, void *p)
   {
+ +      struct trace_iterator *iter = m->private;
+ +
         atomic_dec(&trace_record_cmdline_disabled);
+ +      trace_access_unlock(iter->cpu_file);
         trace_event_read_unlock();
   }
   
@@@ -2920,6 -2841,22 +2920,6 @@@ static int tracing_open_pipe(struct ino
   
         mutex_lock(&trace_types_lock);
   
- -      /* We only allow one reader per cpu */
- -      if (cpu_file == TRACE_PIPE_ALL_CPU) {
- -              if (!cpumask_empty(tracing_reader_cpumask)) {
- -                      ret = -EBUSY;
- -                      goto out;
- -              }
- -              cpumask_setall(tracing_reader_cpumask);
- -      } else {
- -              if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
- -                      cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
- -              else {
- -                      ret = -EBUSY;
- -                      goto out;
- -              }
- -      }
- -
         /* create a buffer to store the information to pass to userspace */
         iter = kzalloc(sizeof(*iter), GFP_KERNEL);
         if (!iter) {
@@@ -2975,6 -2912,12 +2975,6 @@@ static int tracing_release_pipe(struct 
   
         mutex_lock(&trace_types_lock);
   
- -      if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
- -              cpumask_clear(tracing_reader_cpumask);
- -      else
- -              cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
- -
- -
         if (iter->trace->pipe_close)
                 iter->trace->pipe_close(iter);
   
@@@ -3136,7 -3079,6 +3136,7 @@@ waitagain
         iter->pos = -1;
   
         trace_event_read_lock();
+ +      trace_access_lock(iter->cpu_file);
         while (find_next_entry_inc(iter) != NULL) {
                 enum print_line_t ret;
                 int len = iter->seq.len;
@@@ -3153,7 -3095,6 +3153,7 @@@
                 if (iter->seq.len >= cnt)
                         break;
         }
+ +      trace_access_unlock(iter->cpu_file);
         trace_event_read_unlock();
   
         /* Now copy what we have to the user */
@@@ -3279,7 -3220,6 +3279,7 @@@ static ssize_t tracing_splice_read_pipe
         }
   
         trace_event_read_lock();
+ +      trace_access_lock(iter->cpu_file);
   
         /* Fill as many pages as possible. */
         for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@@ -3303,7 -3243,6 +3303,7 @@@
                 trace_seq_init(&iter->seq);
         }
   
+ +      trace_access_unlock(iter->cpu_file);
         trace_event_read_unlock();
         mutex_unlock(&iter->mutex);
   
@@@ -3605,12 -3544,10 +3605,12 @@@ tracing_buffers_read(struct file *filp
   
         info->read = 0;
   
+ +      trace_access_lock(info->cpu);
         ret = ring_buffer_read_page(info->tr->buffer,
                                     &info->spare,
                                     count,
                                     info->cpu, 0);
+ +      trace_access_unlock(info->cpu);
         if (ret < 0)
                 return 0;
   
@@@ -3738,7 -3675,6 +3738,7 @@@ tracing_buffers_splice_read(struct fil
                 len &= PAGE_MASK;
         }
   
+ +      trace_access_lock(info->cpu);
         entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
   
         for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@@ -3786,7 -3722,6 +3786,7 @@@
                 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
         }
   
+ +      trace_access_unlock(info->cpu);
         spd.nr_pages = i;
   
         /* did we read anything? */
@@@ -4223,8 -4158,6 +4223,8 @@@ static __init int tracer_init_debugfs(v
         struct dentry *d_tracer;
         int cpu;
   
+ +      trace_access_lock_init();
+ +
         d_tracer = tracing_init_dentry();
   
         trace_create_file("tracing_enabled", 0644, d_tracer,
@@@ -4459,6 -4392,9 +4459,6 @@@ __init static int tracer_alloc_buffers(
         if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
                 goto out_free_buffer_mask;
   
- -      if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
- -              goto out_free_tracing_cpumask;
- -
         /* To save memory, keep the ring buffer size to its minimum */
         if (ring_buffer_expanded)
                 ring_buf_size = trace_buf_size;
@@@ -4516,6 -4452,8 +4516,6 @@@
         return 0;
   
   out_free_cpumask:
- -      free_cpumask_var(tracing_reader_cpumask);
- -out_free_tracing_cpumask:
         free_cpumask_var(tracing_cpumask);
   out_free_buffer_mask:
         free_cpumask_var(tracing_buffer_mask);
diff --combined kernel/trace/trace_functions_graph.c

index e998a82,9d976f3..3fc2a57
--- 1/kernel/trace/trace_functions_graph.c
--- 2/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@@ -18,7 -18,6 +18,7 @@@ struct fgraph_cpu_data 
         pid_t           last_pid;
         int             depth;
         int             ignore;
+ +      unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
   };
   
   struct fgraph_data {
@@@ -188,7 -187,7 +188,7 @@@ static int __trace_graph_entry(struct t
         struct ring_buffer *buffer = tr->buffer;
         struct ftrace_graph_ent_entry *entry;
   
-       if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
+       if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                 return 0;
   
         event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@@ -213,11 -212,13 +213,11 @@@ int trace_graph_entry(struct ftrace_gra
         int cpu;
         int pc;
   
- -      if (unlikely(!tr))
- -              return 0;
- -
         if (!ftrace_trace_task(current))
                 return 0;
   
- -      if (!ftrace_graph_addr(trace->func))
+ +      /* trace it when it is nested in, or is itself, an enabled function. */
+ +      if (!(trace->depth || ftrace_graph_addr(trace->func)))
                 return 0;
   
         local_irq_save(flags);
@@@ -230,6 -231,9 +230,6 @@@
         } else {
                 ret = 0;
         }
- -      /* Only do the atomic if it is not already set */
- -      if (!test_tsk_trace_graph(current))
- -              set_tsk_trace_graph(current);
   
         atomic_dec(&data->disabled);
         local_irq_restore(flags);
@@@ -247,7 -251,7 +247,7 @@@ static void __trace_graph_return(struc
         struct ring_buffer *buffer = tr->buffer;
         struct ftrace_graph_ret_entry *entry;
   
-       if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
+       if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                 return;
   
         event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@@ -277,24 -281,17 +277,24 @@@ void trace_graph_return(struct ftrace_g
                 pc = preempt_count();
                 __trace_graph_return(tr, trace, flags, pc);
         }
- -      if (!trace->depth)
- -              clear_tsk_trace_graph(current);
         atomic_dec(&data->disabled);
         local_irq_restore(flags);
   }
   
+ +void set_graph_array(struct trace_array *tr)
+ +{
+ +      graph_array = tr;
+ +
+ +      /* Make graph_array visible before we start tracing */
+ +
+ +      smp_mb();
+ +}
+ +
   static int graph_trace_init(struct trace_array *tr)
   {
         int ret;
   
- -      graph_array = tr;
+ +      set_graph_array(tr);
         ret = register_ftrace_graph(&trace_graph_return,
                                     &trace_graph_entry);
         if (ret)
@@@ -304,6 -301,11 +304,6 @@@
         return 0;
   }
   
- -void set_graph_array(struct trace_array *tr)
- -{
- -      graph_array = tr;
- -}
- -
   static void graph_trace_reset(struct trace_array *tr)
   {
         tracing_stop_cmdline_record();
@@@ -671,21 -673,15 +671,21 @@@ print_graph_entry_leaf(struct trace_ite
         duration = graph_ret->rettime - graph_ret->calltime;
   
         if (data) {
+ +              struct fgraph_cpu_data *cpu_data;
                 int cpu = iter->cpu;
- -              int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+ +
+ +              cpu_data = per_cpu_ptr(data->cpu_data, cpu);
   
                 /*
                  * Comments display at + 1 to depth. Since
                  * this is a leaf function, keep the comments
                  * equal to this depth.
                  */
- -              *depth = call->depth - 1;
+ +              cpu_data->depth = call->depth - 1;
+ +
+ +              /* No need to keep this function around for this depth */
+ +              if (call->depth < FTRACE_RETFUNC_DEPTH)
+ +                      cpu_data->enter_funcs[call->depth] = 0;
         }
   
         /* Overhead */
@@@ -725,15 -721,10 +725,15 @@@ print_graph_entry_nested(struct trace_i
         int i;
   
         if (data) {
+ +              struct fgraph_cpu_data *cpu_data;
                 int cpu = iter->cpu;
- -              int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
   
- -              *depth = call->depth;
+ +              cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ +              cpu_data->depth = call->depth;
+ +
+ +              /* Save this function pointer to see if the exit matches */
+ +              if (call->depth < FTRACE_RETFUNC_DEPTH)
+ +                      cpu_data->enter_funcs[call->depth] = call->func;
         }
   
         /* No overhead */
@@@ -863,28 -854,19 +863,28 @@@ print_graph_return(struct ftrace_graph_
         struct fgraph_data *data = iter->private;
         pid_t pid = ent->pid;
         int cpu = iter->cpu;
+ +      int func_match = 1;
         int ret;
         int i;
   
         if (data) {
+ +              struct fgraph_cpu_data *cpu_data;
                 int cpu = iter->cpu;
- -              int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+ +
+ +              cpu_data = per_cpu_ptr(data->cpu_data, cpu);
   
                 /*
                  * Comments display at + 1 to depth. This is the
                  * return from a function, we now want the comments
                  * to display at the same level of the bracket.
                  */
- -              *depth = trace->depth - 1;
+ +              cpu_data->depth = trace->depth - 1;
+ +
+ +              if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ +                      if (cpu_data->enter_funcs[trace->depth] != trace->func)
+ +                              func_match = 0;
+ +                      cpu_data->enter_funcs[trace->depth] = 0;
+ +              }
         }
   
         if (print_graph_prologue(iter, s, 0, 0))
@@@ -909,21 -891,9 +909,21 @@@
                         return TRACE_TYPE_PARTIAL_LINE;
         }
   
- -      ret = trace_seq_printf(s, "}\n");
- -      if (!ret)
- -              return TRACE_TYPE_PARTIAL_LINE;
+ +      /*
+ +       * If the return function does not have a matching entry,
+ +       * then the entry was lost. Instead of just printing
+ +       * the '}' and letting the user guess what function this
+ +       * belongs to, write out the function name.
+ +       */
+ +      if (func_match) {
+ +              ret = trace_seq_printf(s, "}\n");
+ +              if (!ret)
+ +                      return TRACE_TYPE_PARTIAL_LINE;
+ +      } else {
+ +              ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
+ +              if (!ret)
+ +                      return TRACE_TYPE_PARTIAL_LINE;
+ +      }
   
         /* Overrun */
         if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 3 Mar 2010 15:34:18 +0000 (07:34 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 3 Mar 2010 15:34:18 +0000 (07:34 -0800)
		1	2
arch/powerpc/include/asm/local.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/system.h	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/acpi/processor_perflib.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/dma/dmaengine.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/edac/amd64_edac.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/raid5.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_mount.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/acpi/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/percpu_counter.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/srcu.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcutorture.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace_functions_graph.c	patch \|	diff1 \|	diff2 \|	blob \| history