Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 19:18:50 +0000 (12:18 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2016 19:18:50 +0000 (12:18 -0700)
Pull cgroup updates from Tejun Heo:

 - tracepoints for basic cgroup management operations added

 - kernfs and cgroup path formatting functions updated to behave in the
   style of strlcpy() (see the usage sketch after this list)

 - non-critical bug fixes
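
A minimal sketch of the new strlcpy()-style convention (illustrative only;
the surrounding function, the "cgrp" pointer, and the error handling are
assumed here, not taken from the merge itself): callers now check an
integer length instead of a returned pointer:

	char buf[PATH_MAX];
	int len;

	/* cgroup_path()/kernfs_path() now return the full path length */
	len = cgroup_path(cgrp, buf, PATH_MAX);
	if (len < 0)
		return len;		/* -errno on failure */
	if (len >= PATH_MAX)
		return -ENAMETOOLONG;	/* buf holds a truncated, NUL-terminated path */
	pr_info("cgroup path: %s\n", buf);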

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  blkcg: Unlock blkcg_pol_mutex only once when cpd == NULL
  cgroup: fix error handling regressions in proc_cgroup_show() and cgroup_release_agent()
  cpuset: fix error handling regression in proc_cpuset_show()
  cgroup: add tracepoints for basic operations
  cgroup: make cgroup_path() and friends behave in the style of strlcpy()
  kernfs: remove kernfs_path_len()
  kernfs: make kernfs_path*() behave in the style of strlcpy()
  kernfs: add dummy implementation of kernfs_path_from_node()

fs/kernfs/dir.c
include/linux/blk-cgroup.h
include/linux/cgroup.h
kernel/cgroup.c
kernel/cpuset.c
kernel/sched/debug.c

diff --combined fs/kernfs/dir.c
@@@ -110,8 -110,9 +110,9 @@@ static struct kernfs_node *kernfs_commo
   * kn_to:   /n1/n2/n3         [depth=3]
   * result:  /../..
   *
-  * return value: length of the string.  If greater than buflen,
-  * then contents of buf are undefined.  On error, -1 is returned.
+  * Returns the length of the full path.  If the full length is equal to or
+  * greater than @buflen, @buf contains the truncated path with the trailing
+  * '\0'.  On error, -errno is returned.
   */
  static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
                                        struct kernfs_node *kn_from,
  {
        struct kernfs_node *kn, *common;
        const char parent_str[] = "/..";
-       size_t depth_from, depth_to, len = 0, nlen = 0;
-       char *p;
-       int i;
+       size_t depth_from, depth_to, len = 0;
+       int i, j;
  
        if (!kn_from)
                kn_from = kernfs_root(kn_to)->kn;
  
        common = kernfs_common_ancestor(kn_from, kn_to);
        if (WARN_ON(!common))
-               return -1;
+               return -EINVAL;
  
        depth_to = kernfs_depth(common, kn_to);
        depth_from = kernfs_depth(common, kn_from);
                               len < buflen ? buflen - len : 0);
  
        /* Calculate how many bytes we need for the rest */
-       for (kn = kn_to; kn != common; kn = kn->parent)
-               nlen += strlen(kn->name) + 1;
-       if (len + nlen >= buflen)
-               return len + nlen;
-       p = buf + len + nlen;
-       *p = '\0';
-       for (kn = kn_to; kn != common; kn = kn->parent) {
-               size_t tmp = strlen(kn->name);
-               p -= tmp;
-               memcpy(p, kn->name, tmp);
-               *(--p) = '/';
+       for (i = depth_to - 1; i >= 0; i--) {
+               for (kn = kn_to, j = 0; j < i; j++)
+                       kn = kn->parent;
+               len += strlcpy(buf + len, "/",
+                              len < buflen ? buflen - len : 0);
+               len += strlcpy(buf + len, kn->name,
+                              len < buflen ? buflen - len : 0);
        }
  
-       return len + nlen;
+       return len;
  }
  
  /**
@@@ -185,29 -179,6 +179,6 @@@ int kernfs_name(struct kernfs_node *kn
        return ret;
  }
  
- /**
-  * kernfs_path_len - determine the length of the full path of a given node
-  * @kn: kernfs_node of interest
-  *
-  * The returned length doesn't include the space for the terminating '\0'.
-  */
- size_t kernfs_path_len(struct kernfs_node *kn)
- {
-       size_t len = 0;
-       unsigned long flags;
-       spin_lock_irqsave(&kernfs_rename_lock, flags);
-       do {
-               len += strlen(kn->name) + 1;
-               kn = kn->parent;
-       } while (kn && kn->parent);
-       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
-       return len;
- }
  /**
   * kernfs_path_from_node - build path of node @to relative to @from.
   * @from: parent kernfs_node relative to which we need to build the path
   * path (which includes '..'s) as needed to reach from @from to @to is
   * returned.
   *
-  * If @buf isn't long enough, the return value will be greater than @buflen
-  * and @buf contents are undefined.
+  * Returns the length of the full path.  If the full length is equal to or
+  * greater than @buflen, @buf contains the truncated path with the trailing
+  * '\0'.  On error, -errno is returned.
   */
  int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
                          char *buf, size_t buflen)
  }
  EXPORT_SYMBOL_GPL(kernfs_path_from_node);
  
- /**
-  * kernfs_path - build full path of a given node
-  * @kn: kernfs_node of interest
-  * @buf: buffer to copy @kn's name into
-  * @buflen: size of @buf
-  *
-  * Builds and returns the full path of @kn in @buf of @buflen bytes.  The
-  * path is built from the end of @buf so the returned pointer usually
-  * doesn't match @buf.  If @buf isn't long enough, @buf is nul terminated
-  * and %NULL is returned.
-  */
- char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
- {
-       int ret;
-       ret = kernfs_path_from_node(kn, NULL, buf, buflen);
-       if (ret < 0 || ret >= buflen)
-               return NULL;
-       return buf;
- }
- EXPORT_SYMBOL_GPL(kernfs_path);
  /**
   * pr_cont_kernfs_name - pr_cont name of a kernfs_node
   * @kn: kernfs_node of interest
@@@ -1096,17 -1046,13 +1046,17 @@@ static int kernfs_iop_rmdir(struct inod
  }
  
  static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
 -                           struct inode *new_dir, struct dentry *new_dentry)
 +                           struct inode *new_dir, struct dentry *new_dentry,
 +                           unsigned int flags)
  {
        struct kernfs_node *kn  = old_dentry->d_fsdata;
        struct kernfs_node *new_parent = new_dir->i_private;
        struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
        int ret;
  
 +      if (flags)
 +              return -EINVAL;
 +
        if (!scops || !scops->rename)
                return -EPERM;
  
@@@ -1130,6 -1076,9 +1080,6 @@@ const struct inode_operations kernfs_di
        .permission     = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
        .getattr        = kernfs_iop_getattr,
 -      .setxattr       = kernfs_iop_setxattr,
 -      .removexattr    = kernfs_iop_removexattr,
 -      .getxattr       = kernfs_iop_getxattr,
        .listxattr      = kernfs_iop_listxattr,
  
        .mkdir          = kernfs_iop_mkdir,

diff --combined include/linux/blk-cgroup.h
@@@ -45,7 -45,7 +45,7 @@@ struct blkcg 
        spinlock_t                      lock;
  
        struct radix_tree_root          blkg_tree;
 -      struct blkcg_gq                 *blkg_hint;
 +      struct blkcg_gq __rcu           *blkg_hint;
        struct hlist_head               blkg_list;
  
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];
@@@ -343,16 -343,7 +343,7 @@@ static inline struct blkcg *cpd_to_blkc
   */
  static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
  {
-       char *p;
-       p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-       if (!p) {
-               strncpy(buf, "<unavailable>", buflen);
-               return -ENAMETOOLONG;
-       }
-       memmove(buf, p, buf + buflen - p);
-       return 0;
+       return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
  }
  
  /**
diff --combined include/linux/cgroup.h
@@@ -97,7 -97,7 +97,7 @@@ int cgroup_add_legacy_cftypes(struct cg
  int cgroup_rm_cftypes(struct cftype *cfts);
  void cgroup_file_notify(struct cgroup_file *cfile);
  
- char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
  int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
  int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);
@@@ -497,23 -497,6 +497,23 @@@ static inline bool cgroup_is_descendant
        return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
  }
  
 +/**
 + * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 + * @task: the task to be tested
 + * @ancestor: possible ancestor of @task's cgroup
 + *
 + * Tests whether the cgroup of @task in the default hierarchy is a
 + * descendant of @ancestor.  It follows all the same rules as
 + * cgroup_is_descendant() and only applies to the default hierarchy.
 + */
 +static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 +                                             struct cgroup *ancestor)
 +{
 +      struct css_set *cset = task_css_set(task);
 +
 +      return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
 +}
 +
  /* no synchronization, the result can only be used as a hint */
  static inline bool cgroup_is_populated(struct cgroup *cgrp)
  {
@@@ -555,8 -538,7 +555,7 @@@ static inline int cgroup_name(struct cg
        return kernfs_name(cgrp->kn, buf, buflen);
  }
  
- static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
-                                             size_t buflen)
+ static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
  {
        return kernfs_path(cgrp->kn, buf, buflen);
  }
@@@ -574,7 -556,6 +573,7 @@@ static inline void pr_cont_cgroup_path(
  #else /* !CONFIG_CGROUPS */
  
  struct cgroup_subsys_state;
 +struct cgroup;
  
  static inline void css_put(struct cgroup_subsys_state *css) {}
  static inline int cgroup_attach_task_all(struct task_struct *from,
@@@ -592,11 -573,6 +591,11 @@@ static inline void cgroup_free(struct t
  static inline int cgroup_init_early(void) { return 0; }
  static inline int cgroup_init(void) { return 0; }
  
 +static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 +                                             struct cgroup *ancestor)
 +{
 +      return true;
 +}
  #endif /* !CONFIG_CGROUPS */
  
  /*
@@@ -644,7 -620,6 +643,7 @@@ struct cgroup_namespace 
        atomic_t                count;
        struct ns_common        ns;
        struct user_namespace   *user_ns;
 +      struct ucounts          *ucounts;
        struct css_set          *root_cset;
  };
  
@@@ -658,8 -633,8 +657,8 @@@ struct cgroup_namespace *copy_cgroup_ns
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);
  
- char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
-                    struct cgroup_namespace *ns);
+ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                  struct cgroup_namespace *ns);
  
  #else /* !CONFIG_CGROUPS */
  
diff --combined kernel/cgroup.c
@@@ -64,6 -64,9 +64,9 @@@
  #include <linux/file.h>
  #include <net/sock.h>
  
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/cgroup.h>
  /*
   * pidlists linger the following amount before being destroyed.  The goal
   * is avoiding frequent destruction in the middle of consecutive read calls
@@@ -1176,6 -1179,8 +1179,8 @@@ static void cgroup_destroy_root(struct 
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;
  
+       trace_cgroup_destroy_root(root);
        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
  
        BUG_ON(atomic_read(&root->nr_cgrps));
@@@ -1874,6 -1879,9 +1879,9 @@@ static int cgroup_remount(struct kernfs
                strcpy(root->release_agent_path, opts.release_agent);
                spin_unlock(&release_agent_path_lock);
        }
+       trace_cgroup_remount(root);
   out_unlock:
        kfree(opts.release_agent);
        kfree(opts.name);
@@@ -2031,6 -2039,8 +2039,8 @@@ static int cgroup_setup_root(struct cgr
        if (ret)
                goto destroy_root;
  
+       trace_cgroup_setup_root(root);
        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
@@@ -2315,22 -2325,18 +2325,18 @@@ static struct file_system_type cgroup2_
        .fs_flags = FS_USERNS_MOUNT,
  };
  
- static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-                                  struct cgroup_namespace *ns)
+ static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                                struct cgroup_namespace *ns)
  {
        struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
-       int ret;
  
-       ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
-       if (ret < 0 || ret >= buflen)
-               return NULL;
-       return buf;
+       return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
  }
  
- char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
-                    struct cgroup_namespace *ns)
+ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                  struct cgroup_namespace *ns)
  {
-       char *ret;
+       int ret;
  
        mutex_lock(&cgroup_mutex);
        spin_lock_irq(&css_set_lock);
@@@ -2357,12 -2363,12 +2363,12 @@@ EXPORT_SYMBOL_GPL(cgroup_path_ns)
   *
   * Return value is the same as kernfs_path().
   */
- char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
  {
        struct cgroup_root *root;
        struct cgroup *cgrp;
        int hierarchy_id = 1;
-       char *path = NULL;
+       int ret;
  
        mutex_lock(&cgroup_mutex);
        spin_lock_irq(&css_set_lock);
  
        if (root) {
                cgrp = task_cgroup_from_root(task, root);
-               path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
+               ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
        } else {
                /* if no hierarchy exists, everyone is in "/" */
-               if (strlcpy(buf, "/", buflen) < buflen)
-                       path = buf;
+               ret = strlcpy(buf, "/", buflen);
        }
  
        spin_unlock_irq(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
-       return path;
+       return ret;
  }
  EXPORT_SYMBOL_GPL(task_cgroup_path);
  
@@@ -2830,6 -2835,10 +2835,10 @@@ static int cgroup_attach_task(struct cg
                ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
  
        cgroup_migrate_finish(&preloaded_csets);
+       if (!ret)
+               trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
        return ret;
  }
  
@@@ -3446,28 -3455,9 +3455,28 @@@ static ssize_t cgroup_subtree_control_w
         * Except for the root, subtree_control must be zero for a cgroup
         * with tasks so that child cgroups don't compete against tasks.
         */
 -      if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
 -              ret = -EBUSY;
 -              goto out_unlock;
 +      if (enable && cgroup_parent(cgrp)) {
 +              struct cgrp_cset_link *link;
 +
 +              /*
 +               * Because namespaces pin csets too, @cgrp->cset_links
 +               * might not be empty even when @cgrp is empty.  Walk and
 +               * verify each cset.
 +               */
 +              spin_lock_irq(&css_set_lock);
 +
 +              ret = 0;
 +              list_for_each_entry(link, &cgrp->cset_links, cset_link) {
 +                      if (css_set_populated(link->cset)) {
 +                              ret = -EBUSY;
 +                              break;
 +                      }
 +              }
 +
 +              spin_unlock_irq(&css_set_lock);
 +
 +              if (ret)
 +                      goto out_unlock;
        }
  
        /* save and update control masks and prepare csses */
@@@ -3611,6 -3601,8 +3620,8 @@@ static int cgroup_rename(struct kernfs_
        mutex_lock(&cgroup_mutex);
  
        ret = kernfs_rename(kn, new_parent, new_name_str);
+       if (!ret)
+               trace_cgroup_rename(cgrp);
  
        mutex_unlock(&cgroup_mutex);
  
@@@ -3918,9 -3910,7 +3929,9 @@@ void cgroup_file_notify(struct cgroup_f
   * cgroup_task_count - count the number of tasks in a cgroup.
   * @cgrp: the cgroup in question
   *
 - * Return the number of tasks in the cgroup.
 + * Return the number of tasks in the cgroup.  The returned number can be
 + * higher than the actual number of tasks due to css_set references from
 + * namespace roots and temporary usages.
   */
  static int cgroup_task_count(const struct cgroup *cgrp)
  {
@@@ -4381,6 -4371,8 +4392,8 @@@ int cgroup_transfer_tasks(struct cgrou
  
                if (task) {
                        ret = cgroup_migrate(task, false, to->root);
+                       if (!ret)
+                               trace_cgroup_transfer_tasks(to, task, false);
                        put_task_struct(task);
                }
        } while (task && !ret);
@@@ -5046,6 -5038,8 +5059,8 @@@ static void css_release_work_fn(struct 
                        ss->css_released(css);
        } else {
                /* cgroup release path */
+               trace_cgroup_release(cgrp);
                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                cgrp->id = -1;
  
@@@ -5332,6 -5326,8 +5347,8 @@@ static int cgroup_mkdir(struct kernfs_n
        if (ret)
                goto out_destroy;
  
+       trace_cgroup_mkdir(cgrp);
        /* let's create and online css's */
        kernfs_activate(kn);
  
@@@ -5507,6 -5503,9 +5524,9 @@@ static int cgroup_rmdir(struct kernfs_n
  
        ret = cgroup_destroy_locked(cgrp);
  
+       if (!ret)
+               trace_cgroup_rmdir(cgrp);
        cgroup_kn_unlock(kn);
        return ret;
  }
@@@ -5627,12 -5626,6 +5647,12 @@@ int __init cgroup_init(void
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
 +      /*
 +       * The latency of the synchronize_sched() is too high for cgroups,
 +       * avoid it at the cost of forcing all readers into the slow path.
 +       */
 +      rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
 +
        get_user_ns(init_cgroup_ns.user_ns);
  
        mutex_lock(&cgroup_mutex);
@@@ -5743,7 -5736,7 +5763,7 @@@ core_initcall(cgroup_wq_init)
  int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
  {
-       char *buf, *path;
+       char *buf;
        int retval;
        struct cgroup_root *root;
  
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-                       path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+                       retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
                                                current->nsproxy->cgroup_ns);
-                       if (!path) {
+                       if (retval >= PATH_MAX)
                                retval = -ENAMETOOLONG;
+                       if (retval < 0)
                                goto out_unlock;
-                       }
+                       seq_puts(m, buf);
                } else {
-                       path = "/";
+                       seq_puts(m, "/");
                }
  
-               seq_puts(m, path);
                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
                        seq_puts(m, " (deleted)\n");
                else
@@@ -6062,8 -6055,9 +6082,9 @@@ static void cgroup_release_agent(struc
  {
        struct cgroup *cgrp =
                container_of(work, struct cgroup, release_agent_work);
-       char *pathbuf = NULL, *agentbuf = NULL, *path;
+       char *pathbuf = NULL, *agentbuf = NULL;
        char *argv[3], *envp[3];
+       int ret;
  
        mutex_lock(&cgroup_mutex);
  
                goto out;
  
        spin_lock_irq(&css_set_lock);
-       path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+       ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
        spin_unlock_irq(&css_set_lock);
-       if (!path)
+       if (ret < 0 || ret >= PATH_MAX)
                goto out;
  
        argv[0] = agentbuf;
-       argv[1] = path;
+       argv[1] = pathbuf;
        argv[2] = NULL;
  
        /* minimal command environment */
@@@ -6297,12 -6291,6 +6318,12 @@@ void cgroup_sk_alloc(struct sock_cgroup
        if (cgroup_sk_alloc_disabled)
                return;
  
 +      /* Socket clone path */
 +      if (skcd->val) {
 +              cgroup_get(sock_cgroup_ptr(skcd));
 +              return;
 +      }
 +
        rcu_read_lock();
  
        while (true) {
@@@ -6328,16 -6316,6 +6349,16 @@@ void cgroup_sk_free(struct sock_cgroup_
  
  /* cgroup namespaces */
  
 +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
 +{
 +      return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
 +}
 +
 +static void dec_cgroup_namespaces(struct ucounts *ucounts)
 +{
 +      dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
 +}
 +
  static struct cgroup_namespace *alloc_cgroup_ns(void)
  {
        struct cgroup_namespace *new_ns;
  void free_cgroup_ns(struct cgroup_namespace *ns)
  {
        put_css_set(ns->root_cset);
 +      dec_cgroup_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
@@@ -6371,7 -6348,6 +6392,7 @@@ struct cgroup_namespace *copy_cgroup_ns
                                        struct cgroup_namespace *old_ns)
  {
        struct cgroup_namespace *new_ns;
 +      struct ucounts *ucounts;
        struct css_set *cset;
  
        BUG_ON(!old_ns);
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
  
 +      ucounts = inc_cgroup_namespaces(user_ns);
 +      if (!ucounts)
 +              return ERR_PTR(-ENOSPC);
 +
        /* It is not safe to take cgroup_mutex here */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        new_ns = alloc_cgroup_ns();
        if (IS_ERR(new_ns)) {
                put_css_set(cset);
 +              dec_cgroup_namespaces(ucounts);
                return new_ns;
        }
  
        new_ns->user_ns = get_user_ns(user_ns);
 +      new_ns->ucounts = ucounts;
        new_ns->root_cset = cset;
  
        return new_ns;
@@@ -6454,18 -6424,12 +6475,18 @@@ static void cgroupns_put(struct ns_comm
        put_cgroup_ns(to_cg_ns(ns));
  }
  
 +static struct user_namespace *cgroupns_owner(struct ns_common *ns)
 +{
 +      return to_cg_ns(ns)->user_ns;
 +}
 +
  const struct proc_ns_operations cgroupns_operations = {
        .name           = "cgroup",
        .type           = CLONE_NEWCGROUP,
        .get            = cgroupns_get,
        .put            = cgroupns_put,
        .install        = cgroupns_install,
 +      .owner          = cgroupns_owner,
  };
  
  static __init int cgroup_namespaces_init(void)
diff --combined kernel/cpuset.c
@@@ -325,7 -325,8 +325,7 @@@ static struct file_system_type cpuset_f
  /*
   * Return in pmask the portion of a cpusets's cpus_allowed that
   * are online.  If none are online, walk up the cpuset hierarchy
 - * until we find one that does have some online cpus.  The top
 - * cpuset always has some cpus online.
 + * until we find one that does have some online cpus.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_mask.
   */
  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  {
 -      while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
 +      while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
                cs = parent_cs(cs);
 +              if (unlikely(!cs)) {
 +                      /*
 +                       * The top cpuset doesn't have any online cpu as a
 +                       * consequence of a race between cpuset_hotplug_work
 +                       * and cpu hotplug notifier.  But we know the top
 +                       * cpuset's effective_cpus is on its way to be
 +                       * identical to cpu_online_mask.
 +                       */
 +                      cpumask_copy(pmask, cpu_online_mask);
 +                      return;
 +              }
 +      }
        cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
  }
  
@@@ -2080,20 -2069,6 +2080,20 @@@ static void cpuset_bind(struct cgroup_s
        mutex_unlock(&cpuset_mutex);
  }
  
 +/*
 + * Make sure the new task conforms to the current state of its parent,
 + * which could have been changed by cpuset just after it inherits the
 + * state from the parent and before it sits on the cgroup's task list.
 + */
 +static void cpuset_fork(struct task_struct *task)
 +{
 +      if (task_css_is_root(task, cpuset_cgrp_id))
 +              return;
 +
 +      set_cpus_allowed_ptr(task, &current->cpus_allowed);
 +      task->mems_allowed = current->mems_allowed;
 +}
 +
  struct cgroup_subsys cpuset_cgrp_subsys = {
        .css_alloc      = cpuset_css_alloc,
        .css_online     = cpuset_css_online,
        .attach         = cpuset_attach,
        .post_attach    = cpuset_post_attach,
        .bind           = cpuset_bind,
 +      .fork           = cpuset_fork,
        .legacy_cftypes = files,
        .early_init     = true,
  };
@@@ -2715,7 -2689,7 +2715,7 @@@ void __cpuset_memory_pressure_bump(void
  int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
  {
-       char *buf, *p;
+       char *buf;
        struct cgroup_subsys_state *css;
        int retval;
  
        if (!buf)
                goto out;
  
-       retval = -ENAMETOOLONG;
        css = task_get_css(tsk, cpuset_cgrp_id);
-       p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-                          current->nsproxy->cgroup_ns);
+       retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+                               current->nsproxy->cgroup_ns);
        css_put(css);
-       if (!p)
+       if (retval >= PATH_MAX)
+               retval = -ENAMETOOLONG;
+       if (retval < 0)
                goto out_free;
-       seq_puts(m, p);
+       seq_puts(m, buf);
        seq_putc(m, '\n');
        retval = 0;
  out_free:
diff --combined kernel/sched/debug.c
@@@ -369,12 -369,8 +369,12 @@@ static void print_cfs_group_stats(struc
  
  #define P(F) \
        SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
 +#define P_SCHEDSTAT(F) \
 +      SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
  #define PN(F) \
        SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
 +#define PN_SCHEDSTAT(F) \
 +      SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
  
        if (!se)
                return;
        PN(se->exec_start);
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 -#ifdef CONFIG_SCHEDSTATS
        if (schedstat_enabled()) {
 -              PN(se->statistics.wait_start);
 -              PN(se->statistics.sleep_start);
 -              PN(se->statistics.block_start);
 -              PN(se->statistics.sleep_max);
 -              PN(se->statistics.block_max);
 -              PN(se->statistics.exec_max);
 -              PN(se->statistics.slice_max);
 -              PN(se->statistics.wait_max);
 -              PN(se->statistics.wait_sum);
 -              P(se->statistics.wait_count);
 +              PN_SCHEDSTAT(se->statistics.wait_start);
 +              PN_SCHEDSTAT(se->statistics.sleep_start);
 +              PN_SCHEDSTAT(se->statistics.block_start);
 +              PN_SCHEDSTAT(se->statistics.sleep_max);
 +              PN_SCHEDSTAT(se->statistics.block_max);
 +              PN_SCHEDSTAT(se->statistics.exec_max);
 +              PN_SCHEDSTAT(se->statistics.slice_max);
 +              PN_SCHEDSTAT(se->statistics.wait_max);
 +              PN_SCHEDSTAT(se->statistics.wait_sum);
 +              P_SCHEDSTAT(se->statistics.wait_count);
        }
 -#endif
        P(se->load.weight);
  #ifdef CONFIG_SMP
        P(se->avg.load_avg);
        P(se->avg.util_avg);
  #endif
 +
 +#undef PN_SCHEDSTAT
  #undef PN
 +#undef P_SCHEDSTAT
  #undef P
  }
  #endif
@@@ -415,7 -410,8 +415,8 @@@ static char *task_group_path(struct tas
        if (autogroup_path(tg, group_path, PATH_MAX))
                return group_path;
  
-       return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+       cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+       return group_path;
  }
  #endif
  
@@@ -434,9 -430,9 +435,9 @@@ print_task(struct seq_file *m, struct r
                p->prio);
  
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 -              SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
 +              SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
                SPLIT_NS(p->se.sum_exec_runtime),
 -              SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
 +              SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
  
  #ifdef CONFIG_NUMA_BALANCING
        SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@@ -631,7 -627,9 +632,7 @@@ do {                                                                       
  #undef P64
  #endif
  
 -#ifdef CONFIG_SCHEDSTATS
 -#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 -
 +#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
        if (schedstat_enabled()) {
                P(yld_count);
                P(sched_count);
                P(ttwu_count);
                P(ttwu_local);
        }
 -
  #undef P
 -#endif
 +
        spin_lock_irqsave(&sched_debug_lock, flags);
        print_cfs_stats(m, cpu);
        print_rt_stats(m, cpu);
@@@ -870,14 -869,10 +871,14 @@@ void proc_sched_show_task(struct task_s
        SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
  #define P(F) \
        SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
 +#define P_SCHEDSTAT(F) \
 +      SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
  #define __PN(F) \
        SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
  #define PN(F) \
        SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
 +#define PN_SCHEDSTAT(F) \
 +      SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
  
        PN(se.exec_start);
        PN(se.vruntime);
  
        P(se.nr_migrations);
  
 -#ifdef CONFIG_SCHEDSTATS
        if (schedstat_enabled()) {
                u64 avg_atom, avg_per_cpu;
  
 -              PN(se.statistics.sum_sleep_runtime);
 -              PN(se.statistics.wait_start);
 -              PN(se.statistics.sleep_start);
 -              PN(se.statistics.block_start);
 -              PN(se.statistics.sleep_max);
 -              PN(se.statistics.block_max);
 -              PN(se.statistics.exec_max);
 -              PN(se.statistics.slice_max);
 -              PN(se.statistics.wait_max);
 -              PN(se.statistics.wait_sum);
 -              P(se.statistics.wait_count);
 -              PN(se.statistics.iowait_sum);
 -              P(se.statistics.iowait_count);
 -              P(se.statistics.nr_migrations_cold);
 -              P(se.statistics.nr_failed_migrations_affine);
 -              P(se.statistics.nr_failed_migrations_running);
 -              P(se.statistics.nr_failed_migrations_hot);
 -              P(se.statistics.nr_forced_migrations);
 -              P(se.statistics.nr_wakeups);
 -              P(se.statistics.nr_wakeups_sync);
 -              P(se.statistics.nr_wakeups_migrate);
 -              P(se.statistics.nr_wakeups_local);
 -              P(se.statistics.nr_wakeups_remote);
 -              P(se.statistics.nr_wakeups_affine);
 -              P(se.statistics.nr_wakeups_affine_attempts);
 -              P(se.statistics.nr_wakeups_passive);
 -              P(se.statistics.nr_wakeups_idle);
 +              PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
 +              PN_SCHEDSTAT(se.statistics.wait_start);
 +              PN_SCHEDSTAT(se.statistics.sleep_start);
 +              PN_SCHEDSTAT(se.statistics.block_start);
 +              PN_SCHEDSTAT(se.statistics.sleep_max);
 +              PN_SCHEDSTAT(se.statistics.block_max);
 +              PN_SCHEDSTAT(se.statistics.exec_max);
 +              PN_SCHEDSTAT(se.statistics.slice_max);
 +              PN_SCHEDSTAT(se.statistics.wait_max);
 +              PN_SCHEDSTAT(se.statistics.wait_sum);
 +              P_SCHEDSTAT(se.statistics.wait_count);
 +              PN_SCHEDSTAT(se.statistics.iowait_sum);
 +              P_SCHEDSTAT(se.statistics.iowait_count);
 +              P_SCHEDSTAT(se.statistics.nr_migrations_cold);
 +              P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
 +              P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
 +              P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
 +              P_SCHEDSTAT(se.statistics.nr_forced_migrations);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_local);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
 +              P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
  
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
                __PN(avg_atom);
                __PN(avg_per_cpu);
        }
 -#endif
 +
        __P(nr_switches);
        SEQ_printf(m, "%-45s:%21Ld\n",
                   "nr_voluntary_switches", (long long)p->nvcsw);
  #endif
        P(policy);
        P(prio);
 +#undef PN_SCHEDSTAT
  #undef PN
  #undef __PN
 +#undef P_SCHEDSTAT
  #undef P
  #undef __P