Merge branch 'for-3.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author    Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 20 Feb 2013 17:16:21 +0000 (09:16 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 20 Feb 2013 17:16:21 +0000 (09:16 -0800)
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index f78b90a..f5635a0 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -4,8 +4,6 @@ blkio-controller.txt
         - Description for Block IO Controller, implementation and usage details.
  cgroups.txt
         - Control Groups definition, implementation details, examples and API.
-cgroup_event_listener.c
-       - A user program for cgroup listener.
  cpuacct.txt
         - CPU Accounting Controller; account CPU usage for groups of tasks.
  cpusets.txt
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c
deleted file mode 100644
index 3e082f9..0000000
--- a/Documentation/cgroups/cgroup_event_listener.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * cgroup_event_listener.c - Simple listener of cgroup events
- *
- * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
- */
-
-#include <assert.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libgen.h>
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <sys/eventfd.h>
-
-#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
-
-int main(int argc, char **argv)
-{
-       int efd = -1;
-       int cfd = -1;
-       int event_control = -1;
-       char event_control_path[PATH_MAX];
-       char line[LINE_MAX];
-       int ret;
-
-       if (argc != 3) {
-               fputs(USAGE_STR, stderr);
-               return 1;
-       }
-
-       cfd = open(argv[1], O_RDONLY);
-       if (cfd == -1) {
-               fprintf(stderr, "Cannot open %s: %s\n", argv[1],
-                               strerror(errno));
-               goto out;
-       }
-
-       ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
-                       dirname(argv[1]));
-       if (ret >= PATH_MAX) {
-               fputs("Path to cgroup.event_control is too long\n", stderr);
-               goto out;
-       }
-
-       event_control = open(event_control_path, O_WRONLY);
-       if (event_control == -1) {
-               fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
-                               strerror(errno));
-               goto out;
-       }
-
-       efd = eventfd(0, 0);
-       if (efd == -1) {
-               perror("eventfd() failed");
-               goto out;
-       }
-
-       ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
-       if (ret >= LINE_MAX) {
-               fputs("Arguments string is too long\n", stderr);
-               goto out;
-       }
-
-       ret = write(event_control, line, strlen(line) + 1);
-       if (ret == -1) {
-               perror("Cannot write to cgroup.event_control");
-               goto out;
-       }
-
-       while (1) {
-               uint64_t result;
-
-               ret = read(efd, &result, sizeof(result));
-               if (ret == -1) {
-                       if (errno == EINTR)
-                               continue;
-                       perror("Cannot read from eventfd");
-                       break;
-               }
-               assert(ret == sizeof(result));
-
-               ret = access(event_control_path, W_OK);
-               if ((ret == -1) && (errno == ENOENT)) {
-                               puts("The cgroup seems to have removed.");
-                               ret = 0;
-                               break;
-               }
-
-               if (ret == -1) {
-                       perror("cgroup.event_control "
-                                       "is not accessible any more");
-                       break;
-               }
-
-               printf("%s %s: crossed\n", argv[1], argv[2]);
-       }
-
-out:
-       if (efd >= 0)
-               close(efd);
-       if (event_control >= 0)
-               close(event_control);
-       if (cfd >= 0)
-               close(cfd);
-
-       return (ret != 0);
-}
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index fc8fa97..ce94a83 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -399,8 +399,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
  
   9.10 Memory thresholds
         Memory controller implements memory thresholds using cgroups notification
-       API. You can use Documentation/cgroups/cgroup_event_listener.c to test
-       it.
+       API. You can use tools/cgroup/cgroup_event_listener.c to test it.
  
         (Shell-A) Create cgroup and run event listener
         # mkdir /cgroup/A
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7d73905..900af59 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -203,6 +203,7 @@ struct cgroup {
  
         /* For RCU-protected deletion */
         struct rcu_head rcu_head;
+       struct work_struct free_work;
  
         /* List of events which userspace want to receive */
         struct list_head event_list;
@@ -558,6 +559,7 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
  
  struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
                                           struct cgroup *cgroup);
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  
  /**
   * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
@@ -706,7 +708,6 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
  static inline int cgroup_init_early(void) { return 0; }
  static inline int cgroup_init(void) { return 0; }
  static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_fork_callbacks(struct task_struct *p) {}
  static inline void cgroup_post_fork(struct task_struct *p) {}
  static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
  
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33cc421..e4112aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2659,7 +2659,10 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  extern struct task_group root_task_group;
  
  extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+                              struct task_group *parent);
  extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
  extern void sched_move_task(struct task_struct *tsk);
  #ifdef CONFIG_FAIR_GROUP_SCHED
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892..b5c6432 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
  #include <linux/module.h>
  #include <linux/delayacct.h>
  #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
  #include <linux/namei.h>
  #include <linux/pid_namespace.h>
  #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
   * account cgroups in empty hierarchies.
   */
  #define CSS_SET_HASH_BITS      7
-#define CSS_SET_TABLE_SIZE     (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
  
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  {
         int i;
-       int index;
-       unsigned long tmp = 0UL;
+       unsigned long key = 0UL;
  
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-               tmp += (unsigned long)css[i];
-       tmp = (tmp >> 16) ^ tmp;
+               key += (unsigned long)css[i];
+       key = (key >> 16) ^ key;
  
-       index = hash_long(tmp, CSS_SET_HASH_BITS);
-
-       return &css_set_table[index];
+       return key;
  }
  
  /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
         }
  
         /* This css_set is dead. unlink it and release cgroup refcounts */
-       hlist_del(&cg->hlist);
+       hash_del(&cg->hlist);
         css_set_count--;
  
         list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                 struct cgroup *cgrp = link->cgrp;
                 list_del(&link->cg_link_list);
                 list_del(&link->cgrp_link_list);
+
+               /*
+                * We may not be holding cgroup_mutex, and if cgrp->count is
+                * dropped to 0 the cgroup can be destroyed at any time, hence
+                * rcu_read_lock is used to keep it alive.
+                */
+               rcu_read_lock();
                 if (atomic_dec_and_test(&cgrp->count) &&
                     notify_on_release(cgrp)) {
                         if (taskexit)
                                 set_bit(CGRP_RELEASABLE, &cgrp->flags);
                         check_for_release(cgrp);
                 }
+               rcu_read_unlock();
  
                 kfree(link);
         }
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
  {
         int i;
         struct cgroupfs_root *root = cgrp->root;
-       struct hlist_head *hhead;
         struct hlist_node *node;
         struct css_set *cg;
+       unsigned long key;
  
         /*
          * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
                 }
         }
  
-       hhead = css_set_hash(template);
-       hlist_for_each_entry(cg, node, hhead, hlist) {
+       key = css_set_hash(template);
+       hash_for_each_possible(css_set_table, cg, node, hlist, key) {
                 if (!compare_css_sets(cg, oldcg, cgrp, template))
                         continue;
  
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
  
         struct list_head tmp_cg_links;
  
-       struct hlist_head *hhead;
         struct cg_cgroup_link *link;
+       unsigned long key;
  
         /* First see if we already have a cgroup group that matches
          * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
         css_set_count++;
  
         /* Add this cgroup group to the hash table */
-       hhead = css_set_hash(res->subsys);
-       hlist_add_head(&res->hlist, hhead);
+       key = css_set_hash(res->subsys);
+       hash_add(css_set_table, &res->hlist, key);
  
         write_unlock(&css_set_lock);
  
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
         return inode;
  }
  
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
  {
-       /* is dentry a directory ? if so, kfree() associated cgroup */
-       if (S_ISDIR(inode->i_mode)) {
-               struct cgroup *cgrp = dentry->d_fsdata;
-               struct cgroup_subsys *ss;
-               BUG_ON(!(cgroup_is_removed(cgrp)));
-               /* It's possible for external users to be holding css
-                * reference counts on a cgroup; css_put() needs to
-                * be able to access the cgroup after decrementing
-                * the reference count in order to know if it needs to
-                * queue the cgroup to be handled by the release
-                * agent */
-               synchronize_rcu();
+       struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+       struct cgroup_subsys *ss;
  
-               mutex_lock(&cgroup_mutex);
-               /*
-                * Release the subsystem state objects.
-                */
-               for_each_subsys(cgrp->root, ss)
-                       ss->css_free(cgrp);
+       mutex_lock(&cgroup_mutex);
+       /*
+        * Release the subsystem state objects.
+        */
+       for_each_subsys(cgrp->root, ss)
+               ss->css_free(cgrp);
  
-               cgrp->root->number_of_cgroups--;
-               mutex_unlock(&cgroup_mutex);
+       cgrp->root->number_of_cgroups--;
+       mutex_unlock(&cgroup_mutex);
  
-               /*
-                * Drop the active superblock reference that we took when we
-                * created the cgroup
-                */
-               deactivate_super(cgrp->root->sb);
+       /*
+        * Drop the active superblock reference that we took when we
+        * created the cgroup
+        */
+       deactivate_super(cgrp->root->sb);
  
-               /*
-                * if we're getting rid of the cgroup, refcount should ensure
-                * that there are no pidlists left.
-                */
-               BUG_ON(!list_empty(&cgrp->pidlists));
+       /*
+        * if we're getting rid of the cgroup, refcount should ensure
+        * that there are no pidlists left.
+        */
+       BUG_ON(!list_empty(&cgrp->pidlists));
  
-               simple_xattrs_free(&cgrp->xattrs);
+       simple_xattrs_free(&cgrp->xattrs);
  
-               ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-               kfree_rcu(cgrp, rcu_head);
+       ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+       kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+       struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+       schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+       /* is dentry a directory ? if so, kfree() associated cgroup */
+       if (S_ISDIR(inode->i_mode)) {
+               struct cgroup *cgrp = dentry->d_fsdata;
+
+               BUG_ON(!(cgroup_is_removed(cgrp)));
+               call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
         } else {
                 struct cfent *cfe = __d_cfe(dentry);
                 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
         dput(parent);
  }
  
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  {
         struct cfent *cfe;
  
         lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
+       /*
+        * If we're doing cleanup due to failure of cgroup_create(),
+        * the corresponding @cfe may not exist.
+        */
         list_for_each_entry(cfe, &cgrp->files, node) {
                 struct dentry *d = cfe->dentry;
  
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                 list_del_init(&cfe->node);
                 dput(d);
  
-               return 0;
+               break;
         }
-       return -ENOENT;
  }
  
  /**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                 }
         }
         root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-       synchronize_rcu();
  
         return 0;
  }
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->allcg_node);
         INIT_LIST_HEAD(&cgrp->release_list);
         INIT_LIST_HEAD(&cgrp->pidlists);
+       INIT_WORK(&cgrp->free_work, cgroup_free_fn);
         mutex_init(&cgrp->pidlist_mutex);
         INIT_LIST_HEAD(&cgrp->event_list);
         spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 struct cgroupfs_root *existing_root;
                 const struct cred *cred;
                 int i;
+               struct hlist_node *node;
+               struct css_set *cg;
  
                 BUG_ON(sb->s_root != NULL);
  
@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 /* Link the top cgroup in this hierarchy into all
                  * the css_set objects */
                 write_lock(&css_set_lock);
-               for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-                       struct hlist_head *hhead = &css_set_table[i];
-                       struct hlist_node *node;
-                       struct css_set *cg;
-
-                       hlist_for_each_entry(cg, node, hhead, hlist)
-                               link_css_set(&tmp_cg_links, cg, root_cgrp);
-               }
+               hash_for_each(css_set_table, i, node, cg, hlist)
+                       link_css_set(&tmp_cg_links, cg, root_cgrp);
                 write_unlock(&css_set_lock);
  
                 free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
         rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
                            "cgroup_path() called without proper locking");
  
-       if (!dentry || cgrp == dummytop) {
+       if (cgrp == dummytop) {
                 /*
                  * Inactive subsystems have no dentry for their root
                  * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                         ss->attach(cgrp, &tset);
         }
  
-       synchronize_rcu();
  out:
         if (retval) {
                 for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
         /*
          * step 5: success! and cleanup
          */
-       synchronize_rcu();
         retval = 0;
  out_put_css_set_refs:
         if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
                         continue;
  
-               if (is_add)
+               if (is_add) {
                         err = cgroup_add_file(cgrp, subsys, cft);
-               else
-                       err = cgroup_rm_file(cgrp, cft);
-               if (err) {
-                       pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-                                  is_add ? "add" : "remove", cft->name, err);
+                       if (err)
+                               pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+                                       cft->name, err);
                         ret = err;
+               } else {
+                       cgroup_rm_file(cgrp, cft);
                 }
         }
         return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
  }
  EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
  
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+       struct cgroup *last, *tmp;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       do {
+               last = pos;
+               /* ->prev isn't RCU safe, walk ->next till the end */
+               pos = NULL;
+               list_for_each_entry_rcu(tmp, &last->children, sibling)
+                       pos = tmp;
+       } while (pos);
+
+       return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
  static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
  {
         struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
                         remove);
         struct cgroup *cgrp = event->cgrp;
  
+       remove_wait_queue(event->wqh, &event->wait);
+
         event->cft->unregister_event(cgrp, event->cft, event->eventfd);
  
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd, 1);
+
         eventfd_ctx_put(event->eventfd);
         kfree(event);
         dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
         unsigned long flags = (unsigned long)key;
  
         if (flags & POLLHUP) {
-               __remove_wait_queue(event->wqh, &event->wait);
-               spin_lock(&cgrp->event_list_lock);
-               list_del_init(&event->list);
-               spin_unlock(&cgrp->event_list_lock);
                 /*
-                * We are in atomic context, but cgroup_event_remove() may
-                * sleep, so we have to call it in workqueue.
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will cleanup
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
                  */
-               schedule_work(&event->remove);
+               spin_lock(&cgrp->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but cgroup_event_remove()
+                        * may sleep, so we have to call it in workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&cgrp->event_list_lock);
         }
  
         return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                                       const char *buffer)
  {
         struct cgroup_event *event = NULL;
+       struct cgroup *cgrp_cfile;
         unsigned int efd, cfd;
         struct file *efile = NULL;
         struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                 goto fail;
         }
  
+       /*
+        * The file to be monitored must be in the same cgroup as
+        * cgroup.event_control is.
+        */
+       cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+       if (cgrp_cfile != cgrp) {
+               ret = -EINVAL;
+               goto fail;
+       }
+
         if (!event->cft->register_event || !event->cft->unregister_event) {
                 ret = -EINVAL;
                 goto fail;
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
  
         init_cgroup_housekeeping(cgrp);
  
+       dentry->d_fsdata = cgrp;
+       cgrp->dentry = dentry;
+
         cgrp->parent = parent;
         cgrp->root = parent->root;
         cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         lockdep_assert_held(&dentry->d_inode->i_mutex);
  
         /* allocation complete, commit to creation */
-       dentry->d_fsdata = cgrp;
-       cgrp->dentry = dentry;
         list_add_tail(&cgrp->allcg_node, &root->allcg_list);
         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
         root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         /*
          * Unregister events and notify userspace.
          * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace. Use
-        * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-        * cgroup_event_wake() is called with the wait queue head locked,
-        * remove_wait_queue() cannot be called while holding event_list_lock.
+        * directory to avoid race between userspace and kernelspace.
          */
         spin_lock(&cgrp->event_list_lock);
-       list_splice_init(&cgrp->event_list, &tmp_list);
-       spin_unlock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
                 list_del_init(&event->list);
-               remove_wait_queue(event->wqh, &event->wait);
-               eventfd_signal(event->eventfd, 1);
                 schedule_work(&event->remove);
         }
+       spin_unlock(&cgrp->event_list_lock);
  
         return 0;
  }
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
         struct cgroup_subsys_state *css;
         int i, ret;
+       struct hlist_node *node, *tmp;
+       struct css_set *cg;
+       unsigned long key;
  
         /* check name and function validity */
         if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          * this is all done under the css_set_lock.
          */
         write_lock(&css_set_lock);
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-               struct css_set *cg;
-               struct hlist_node *node, *tmp;
-               struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
-               hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
-                       /* skip entries that we already rehashed */
-                       if (cg->subsys[ss->subsys_id])
-                               continue;
-                       /* remove existing entry */
-                       hlist_del(&cg->hlist);
-                       /* set new value */
-                       cg->subsys[ss->subsys_id] = css;
-                       /* recompute hash and restore entry */
-                       new_bucket = css_set_hash(cg->subsys);
-                       hlist_add_head(&cg->hlist, new_bucket);
-               }
+       hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+               /* skip entries that we already rehashed */
+               if (cg->subsys[ss->subsys_id])
+                       continue;
+               /* remove existing entry */
+               hash_del(&cg->hlist);
+               /* set new value */
+               cg->subsys[ss->subsys_id] = css;
+               /* recompute hash and restore entry */
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, node, key);
         }
         write_unlock(&css_set_lock);
  
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  void cgroup_unload_subsys(struct cgroup_subsys *ss)
  {
         struct cg_cgroup_link *link;
-       struct hlist_head *hhead;
  
         BUG_ON(ss->module == NULL);
  
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         write_lock(&css_set_lock);
         list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
                 struct css_set *cg = link->cg;
+               unsigned long key;
  
-               hlist_del(&cg->hlist);
+               hash_del(&cg->hlist);
                 cg->subsys[ss->subsys_id] = NULL;
-               hhead = css_set_hash(cg->subsys);
-               hlist_add_head(&cg->hlist, hhead);
+               key = css_set_hash(cg->subsys);
+               hash_add(css_set_table, &cg->hlist, key);
         }
         write_unlock(&css_set_lock);
  
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
         list_add(&init_css_set_link.cg_link_list,
                  &init_css_set.cg_links);
  
-       for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-               INIT_HLIST_HEAD(&css_set_table[i]);
-
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
  
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
  {
         int err;
         int i;
-       struct hlist_head *hhead;
+       unsigned long key;
  
         err = bdi_init(&cgroup_backing_dev_info);
         if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
         }
  
         /* Add init_css_set to the hash table */
-       hhead = css_set_hash(init_css_set.subsys);
-       hlist_add_head(&init_css_set.hlist, hhead);
+       key = css_set_hash(init_css_set.subsys);
+       hash_add(css_set_table, &init_css_set.hlist, key);
         BUG_ON(!init_root_id(&rootnode));
  
         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         }
         task_unlock(tsk);
  
-       if (cg)
-               put_css_set_taskexit(cg);
+       put_css_set_taskexit(cg);
  }
  
  /**
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63ee..5bb9bf1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2511,8 +2511,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  
         dentry = task_cs(tsk)->css.cgroup->dentry;
         spin_lock(&cpuset_buffer_lock);
-       snprintf(cpuset_name, CPUSET_NAME_LEN,
-                dentry ? (const char *)dentry->d_name.name : "/");
+
+       if (!dentry) {
+               strcpy(cpuset_name, "/");
+       } else {
+               spin_lock(&dentry->d_lock);
+               strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+                       CPUSET_NAME_LEN);
+               spin_unlock(&dentry->d_lock);
+       }
+
         nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
                            tsk->mems_allowed);
         printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21..64de5f8 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
         ag->tg->rt_se = NULL;
         ag->tg->rt_rq = NULL;
  #endif
+       sched_offline_group(ag->tg);
         sched_destroy_group(ag->tg);
  }
  
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
         if (IS_ERR(tg))
                 goto out_free;
  
+       sched_online_group(tg, &root_task_group);
+
         kref_init(&ag->kref);
         init_rwsem(&ag->lock);
         ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 03d7784..3a673a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7161,7 +7161,6 @@ static void free_sched_group(struct task_group *tg)
  struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
-       unsigned long flags;
  
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@ -7173,6 +7172,17 @@ struct task_group *sched_create_group(struct task_group *parent)
         if (!alloc_rt_sched_group(tg, parent))
                 goto err;
  
+       return tg;
+
+err:
+       free_sched_group(tg);
+       return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+       unsigned long flags;
+
         spin_lock_irqsave(&task_group_lock, flags);
         list_add_rcu(&tg->list, &task_groups);
  
@@ -7182,12 +7192,6 @@ struct task_group *sched_create_group(struct task_group *parent)
         INIT_LIST_HEAD(&tg->children);
         list_add_rcu(&tg->siblings, &parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
-
-       return tg;
-
-err:
-       free_sched_group(tg);
-       return ERR_PTR(-ENOMEM);
  }
  
  /* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7203,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
  
  /* Destroy runqueue etc associated with a task group */
  void sched_destroy_group(struct task_group *tg)
+{
+       /* wait for possible concurrent references to cfs_rqs complete */
+       call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
  {
         unsigned long flags;
         int i;
@@ -7211,9 +7221,6 @@ void sched_destroy_group(struct task_group *tg)
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for possible concurrent references to cfs_rqs complete */
-       call_rcu(&tg->rcu, free_sched_group_rcu);
  }
  
  /* change task's runqueue when it moves between groups.
@@ -7584,6 +7591,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
         return &tg->css;
  }
  
+/*
+ * css_online callback: link the cgroup's task group under its parent's.
+ */
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+       struct cgroup *up = cgrp->parent;
+
+       /* A cgroup without a parent has nothing to be linked under. */
+       if (up)
+               sched_online_group(cgroup_tg(cgrp), cgroup_tg(up));
+       return 0;
+}
+
  static void cpu_cgroup_css_free(struct cgroup *cgrp)
  {
         struct task_group *tg = cgroup_tg(cgrp);
@@ -7591,6 +7611,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
         sched_destroy_group(tg);
  }
  
+/* css_offline callback: unlink the cgroup's task group. */
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+       /* Unlink only; cpu_cgroup_css_free() is what frees the group. */
+       sched_offline_group(cgroup_tg(cgrp));
+}
+
  static int cpu_cgroup_can_attach(struct cgroup *cgrp,
                                  struct cgroup_taskset *tset)
  {
@@ -7946,6 +7973,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .name           = "cpu",
         .css_alloc      = cpu_cgroup_css_alloc,
         .css_free       = cpu_cgroup_css_free,
+       .css_online     = cpu_cgroup_css_online,
+       .css_offline    = cpu_cgroup_css_offline,
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 7ae4c4c..557e7b5 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
         if (autogroup_path(tg, group_path, PATH_MAX))
                 return group_path;
  
-       /*
-        * May be NULL if the underlying cgroup isn't fully-created yet
-        */
-       if (!tg->css.cgroup) {
-               group_path[0] = '\0';
-               return group_path;
-       }
         cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
         return group_path;
  }
diff --git a/tools/Makefile b/tools/Makefile

index 798fa0e..fa36565 100644 (file)
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -3,6 +3,7 @@ include scripts/Makefile.include
  help:
         @echo 'Possible targets:'
         @echo ''
+       @echo '  cgroup     - cgroup tools'
         @echo '  cpupower   - a tool for all things x86 CPU power'
         @echo '  firewire   - the userspace part of nosy, an IEEE-1394 traffic sniffer'
         @echo '  lguest     - a minimal 32-bit x86 hypervisor'
@@ -33,7 +34,7 @@ help:
  cpupower: FORCE
         $(call descend,power/$@)
  
-firewire lguest perf usb virtio vm: FORCE
+cgroup firewire lguest perf usb virtio vm: FORCE
         $(call descend,$@)
  
  selftests: FORCE
@@ -45,7 +46,7 @@ turbostat x86_energy_perf_policy: FORCE
  cpupower_install:
         $(call descend,power/$(@:_install=),install)
  
-firewire_install lguest_install perf_install usb_install virtio_install vm_install:
+cgroup_install firewire_install lguest_install perf_install usb_install virtio_install vm_install:
         $(call descend,$(@:_install=),install)
  
  selftests_install:
@@ -54,14 +55,14 @@ selftests_install:
  turbostat_install x86_energy_perf_policy_install:
         $(call descend,power/x86/$(@:_install=),install)
  
-install: cpupower_install firewire_install lguest_install perf_install \
-               selftests_install turbostat_install usb_install virtio_install \
-               vm_install x86_energy_perf_policy_install
+install: cgroup_install cpupower_install firewire_install lguest_install \
+               perf_install selftests_install turbostat_install usb_install \
+               virtio_install vm_install x86_energy_perf_policy_install
  
  cpupower_clean:
         $(call descend,power/cpupower,clean)
  
-firewire_clean lguest_clean perf_clean usb_clean virtio_clean vm_clean:
+cgroup_clean firewire_clean lguest_clean perf_clean usb_clean virtio_clean vm_clean:
         $(call descend,$(@:_clean=),clean)
  
  selftests_clean:
@@ -70,8 +71,8 @@ selftests_clean:
  turbostat_clean x86_energy_perf_policy_clean:
         $(call descend,power/x86/$(@:_clean=),clean)
  
-clean: cpupower_clean firewire_clean lguest_clean perf_clean selftests_clean \
-               turbostat_clean usb_clean virtio_clean vm_clean \
-               x86_energy_perf_policy_clean
+clean: cgroup_clean cpupower_clean firewire_clean lguest_clean perf_clean \
+               selftests_clean turbostat_clean usb_clean virtio_clean \
+               vm_clean x86_energy_perf_policy_clean
  
  .PHONY: FORCE
diff --git a/tools/cgroup/.gitignore b/tools/cgroup/.gitignore

new file mode 100644 (file)

index 0000000..633cd9b
--- /dev/null
+++ b/tools/cgroup/.gitignore
@@ -0,0 +1 @@
+cgroup_event_listener
diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile

new file mode 100644 (file)

index 0000000..b428619
--- /dev/null
+++ b/tools/cgroup/Makefile
@@ -0,0 +1,13 @@
+# Makefile for cgroup tools
+
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall -Wextra
+
+all: cgroup_event_listener
+%: %.c
+       $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+       $(RM) cgroup_event_listener
+
+.PHONY: all clean
diff --git a/tools/cgroup/cgroup_event_listener.c b/tools/cgroup/cgroup_event_listener.c

new file mode 100644 (file)

index 0000000..4eb5507
--- /dev/null
+++ b/tools/cgroup/cgroup_event_listener.c
@@ -0,0 +1,103 @@
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>"
+
+/*
+ * Register an eventfd for the control file given in argv[1], passing argv[2]
+ * as the event arguments, then print a line each time the eventfd fires.
+ * Exits 0 once cgroup.event_control has disappeared, 1 on any error.
+ */
+int main(int argc, char **argv)
+{
+       int efd = -1;
+       int cfd = -1;
+       int event_control = -1;
+       char control_dir[PATH_MAX];
+       char event_control_path[PATH_MAX];
+       char line[LINE_MAX];
+       int ret;
+
+       if (argc != 3)
+               errx(1, "%s", USAGE_STR);
+
+       cfd = open(argv[1], O_RDONLY);
+       if (cfd == -1)
+               err(1, "Cannot open %s", argv[1]);
+
+       /*
+        * dirname() may modify the string it is given, so work on a copy:
+        * argv[1] must stay intact for the "crossed" message below.
+        */
+       ret = snprintf(control_dir, PATH_MAX, "%s", argv[1]);
+       if (ret < 0 || ret >= PATH_MAX)
+               errx(1, "Path to control file is too long");
+
+       ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+                       dirname(control_dir));
+       if (ret < 0 || ret >= PATH_MAX)
+               errx(1, "Path to cgroup.event_control is too long");
+
+       event_control = open(event_control_path, O_WRONLY);
+       if (event_control == -1)
+               err(1, "Cannot open %s", event_control_path);
+
+       efd = eventfd(0, 0);
+       if (efd == -1)
+               err(1, "eventfd() failed");
+
+       /* Registration format: "<event_fd> <control_fd> <args>" */
+       ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+       if (ret < 0 || ret >= LINE_MAX)
+               errx(1, "Arguments string is too long");
+
+       ret = write(event_control, line, strlen(line) + 1);
+       if (ret == -1)
+               err(1, "Cannot write to cgroup.event_control");
+
+       while (1) {
+               uint64_t result;
+               ssize_t len;
+
+               len = read(efd, &result, sizeof(result));
+               if (len == -1) {
+                       if (errno == EINTR)
+                               continue;
+                       err(1, "Cannot read from eventfd");
+               }
+               assert(len == (ssize_t)sizeof(result));
+
+               /*
+                * After each wakeup check that cgroup.event_control still
+                * exists; ENOENT is taken to mean the cgroup was removed.
+                */
+               ret = access(event_control_path, W_OK);
+               if ((ret == -1) && (errno == ENOENT)) {
+                       puts("The cgroup seems to have been removed.");
+                       break;
+               }
+
+               if (ret == -1)
+                       err(1, "cgroup.event_control is not accessible any more");
+
+               printf("%s %s: crossed\n", argv[1], argv[2]);
+       }
+
+       return 0;
+}
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 20 Feb 2013 17:16:21 +0000 (09:16 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 20 Feb 2013 17:16:21 +0000 (09:16 -0800)
Documentation/cgroups/00-INDEX		patch \| blob \| history
Documentation/cgroups/cgroup_event_listener.c	[deleted file]	patch \| blob \| history
Documentation/cgroups/memcg_test.txt		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history
kernel/sched/auto_group.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
tools/Makefile		patch \| blob \| history
tools/cgroup/.gitignore	[new file with mode: 0644]	patch \| blob
tools/cgroup/Makefile	[new file with mode: 0644]	patch \| blob
tools/cgroup/cgroup_event_listener.c	[new file with mode: 0644]	patch \| blob