mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and...
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 56e2b77..f1603c1 100644
@@ -45,7 +45,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/rwsem.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
@@ -76,20 +75,20 @@
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
 EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
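The conversion above sets the rule for the rest of this diff: css_set_rwsem, a sleeping reader/writer semaphore, becomes css_set_lock, a bottom-half-disabling spinlock, so every down_read()/up_read() and down_write()/up_write() pair below turns into a spin_lock_bh()/spin_unlock_bh() pair. A minimal sketch of the mapping:

        /* before: rwsem, holders may sleep, readers can share */
        down_read(&css_set_rwsem);
        /* ... walk task->cgroups / css_set lists ... */
        up_read(&css_set_rwsem);

        /* after: spinlock with softirqs disabled; same critical
         * sections, but holders must not sleep */
        spin_lock_bh(&css_set_lock);
        /* ... */
        spin_unlock_bh(&css_set_lock);
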
@@ -174,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_root_visible;
 
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
 /* some controllers are not supported in the default hierarchy */
 static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
@@ -207,6 +200,7 @@ static u64 css_serial_nr_next = 1;
  */
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
 static unsigned long have_canfork_callback __read_mostly;
@@ -216,6 +210,7 @@ static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                      bool visible);
@@ -304,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 
        idr_preload(gfp_mask);
        spin_lock_bh(&cgroup_idr_lock);
-       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
        spin_unlock_bh(&cgroup_idr_lock);
        idr_preload_end();
        return ret;
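This is the one hunk tied to the commit named in the title: __GFP_WAIT is split up, and __GFP_DIRECT_RECLAIM becomes the bit meaning "this allocation may sleep to reclaim memory". Clearing it keeps idr_alloc() atomic under cgroup_idr_lock, with idr_preload() above having stocked the per-cpu cache while sleeping was still allowed. That commit also adds a helper for callers that only need a yes/no answer; roughly:

        static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
        {
                return (bool)(gfp_flags & __GFP_DIRECT_RECLAIM);
        }
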
@@ -593,6 +588,7 @@ struct css_set init_css_set = {
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
+       .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
@@ -603,7 +599,7 @@ static int css_set_count    = 1;    /* 1 for init_css_set */
  */
 static bool css_set_populated(struct css_set *cset)
 {
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 }
@@ -626,7 +622,7 @@ static bool css_set_populated(struct css_set *cset)
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        do {
                bool trigger;
@@ -658,7 +654,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
 {
        struct cgrp_cset_link *link;
 
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
                cgroup_update_populated(link->cgrp, populated);
@@ -675,17 +671,33 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated_cnt updates but the caller
- * is responsible for managing @from_cset and @to_cset's reference counts.
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
  */
 static void css_set_move_task(struct task_struct *task,
                              struct css_set *from_cset, struct css_set *to_cset,
                              bool use_mg_tasks)
 {
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        if (from_cset) {
+               struct css_task_iter *it, *pos;
+
                WARN_ON_ONCE(list_empty(&task->cg_list));
+
+               /*
+                * @task is leaving, advance task iterators which are
+                * pointing to it so that they can resume at the next
+                * position.  Advancing an iterator might remove it from
+                * the list, use safe walk.  See css_task_iter_advance*()
+                * for details.
+                */
+               list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+                                        iters_node)
+                       if (it->task_pos == &task->cg_list)
+                               css_task_iter_advance(it);
+
                list_del_init(&task->cg_list);
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
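A typical caller is the migration commit point in cgroup_taskset_migrate() (hunk further below); condensed to a single task, the sequence under the lock is:

        spin_lock_bh(&css_set_lock);
        get_css_set(to_cset);                   /* ref for the new css_set */
        css_set_move_task(task, from_cset, to_cset, true);
        put_css_set_locked(from_cset);          /* caller drops the old ref */
        spin_unlock_bh(&css_set_lock);
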
@@ -737,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
        struct cgroup_subsys *ss;
        int ssid;
 
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        if (!atomic_dec_and_test(&cset->refcount))
                return;
@@ -769,9 +781,9 @@ static void put_css_set(struct css_set *cset)
        if (atomic_add_unless(&cset->refcount, -1, 1))
                return;
 
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        put_css_set_locked(cset);
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 }
 
 /*
@@ -994,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
        /* First see if we already have a cgroup group that matches
         * the desired set */
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        if (cset)
                return cset;
@@ -1019,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->mg_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);
+       INIT_LIST_HEAD(&cset->task_iters);
        INIT_HLIST_NODE(&cset->hlist);
 
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
@@ -1047,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
                list_add_tail(&cset->e_cset_node[ssid],
                              &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        return cset;
 }
@@ -1111,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
 
        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }
-       up_write(&css_set_rwsem);
+
+       spin_unlock_bh(&css_set_lock);
 
        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
@@ -1140,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
        struct cgroup *res = NULL;
 
        lockdep_assert_held(&cgroup_mutex);
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        if (cset == &init_css_set) {
                res = &root->cgrp;
@@ -1163,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroup_root *root)
@@ -1512,11 +1526,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
                ss->root = dst_root;
                css->cgroup = dcgrp;
 
-               down_write(&css_set_rwsem);
+               spin_lock_bh(&css_set_lock);
                hash_for_each(css_set_table, i, cset, hlist)
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
-               up_write(&css_set_rwsem);
+               spin_unlock_bh(&css_set_lock);
 
                src_root->subsys_mask &= ~(1 << ssid);
                scgrp->subtree_control &= ~(1 << ssid);
@@ -1793,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
        struct task_struct *p, *g;
 
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
 
        if (use_task_css_set_links)
                goto out_unlock;
@@ -1832,7 +1846,7 @@ static void cgroup_enable_task_cg_lists(void)
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
 out_unlock:
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1896,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
                goto out;
 
        /*
-        * We're accessing css_set_count without locking css_set_rwsem here,
+        * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us. The worst that can happen is that we
         * have some link structures left over
@@ -1938,13 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2177,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
        char *path = NULL;
 
        mutex_lock(&cgroup_mutex);
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
 
        root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2190,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
                        path = buf;
        }
 
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        return path;
 }
@@ -2239,7 +2253,7 @@ static void cgroup_taskset_add(struct task_struct *task,
 {
        struct css_set *cset;
 
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        /* @task either already exited or can't exit until the end */
        if (task->flags & PF_EXITING)
@@ -2345,7 +2359,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
         * the new cgroup.  There are no failure cases after here, so this
         * is the commit point.
         */
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry(cset, &tset->src_csets, mg_node) {
                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
                        struct css_set *from_cset = task_css_set(task);
@@ -2356,7 +2370,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
                        put_css_set_locked(from_cset);
                }
        }
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        /*
         * Migration is committed, all target tasks are now on dst_csets.
@@ -2380,13 +2394,13 @@ out_cancel_attach:
                        css->ss->cancel_attach(css, tset);
        }
 out_release_tset:
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
                list_del_init(&cset->mg_node);
        }
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        return ret;
 }
 
@@ -2403,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
        lockdep_assert_held(&cgroup_mutex);
 
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
        }
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -2436,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
        struct cgroup *src_cgrp;
 
        lockdep_assert_held(&cgroup_mutex);
-       lockdep_assert_held(&css_set_rwsem);
+       lockdep_assert_held(&css_set_lock);
 
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
@@ -2552,7 +2566,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
         * already PF_EXITING could be freed from underneath us unless we
         * take an rcu_read_lock.
         */
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
@@ -2561,7 +2575,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        return cgroup_taskset_migrate(&tset, cgrp);
 }
@@ -2582,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        int ret;
 
        /* look up all src csets */
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
@@ -2592,7 +2606,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        /* prepare dst csets and commit */
        ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
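Taken together, attaching is a three-step protocol: record the source css_sets under the spinlock, prepare destination css_sets outside it (lookups and allocations may sleep), then commit. A simplified sketch, error handling elided:

        /* 1. pin every source css_set (no sleeping, css_set_lock held) */
        spin_lock_bh(&css_set_lock);
        cgroup_migrate_add_src(task_css_set(leader), dst_cgrp, &preloaded_csets);
        spin_unlock_bh(&css_set_lock);

        /* 2. find or create the destination css_sets (may sleep) */
        ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

        /* 3. commit the move, then release the preloaded references */
        if (!ret)
                ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
        cgroup_migrate_finish(&preloaded_csets);
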
@@ -2625,9 +2639,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
                struct cgroup *cgrp;
                struct inode *inode;
 
-               down_read(&css_set_rwsem);
+               spin_lock_bh(&css_set_lock);
                cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-               up_read(&css_set_rwsem);
+               spin_unlock_bh(&css_set_lock);
 
                while (!cgroup_is_descendant(dst_cgrp, cgrp))
                        cgrp = cgroup_parent(cgrp);
@@ -2724,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
                if (root == &cgrp_dfl_root)
                        continue;
 
-               down_read(&css_set_rwsem);
+               spin_lock_bh(&css_set_lock);
                from_cgrp = task_cgroup_from_root(from, root);
-               up_read(&css_set_rwsem);
+               spin_unlock_bh(&css_set_lock);
 
                retval = cgroup_attach_task(from_cgrp, tsk, false);
                if (retval)
@@ -2851,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
        percpu_down_write(&cgroup_threadgroup_rwsem);
 
        /* look up all csses currently attached to @cgrp's subtree */
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
                struct cgrp_cset_link *link;
 
@@ -2863,14 +2877,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
                        cgroup_migrate_add_src(link->cset, cgrp,
                                               &preloaded_csets);
        }
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        /* NULL dst indicates self on default hierarchy */
        ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
        if (ret)
                goto out_finish;
 
-       down_write(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
                struct task_struct *task, *ntask;
 
@@ -2882,7 +2896,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
                        cgroup_taskset_add(task, &tset);
        }
-       up_write(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        ret = cgroup_taskset_migrate(&tset, cgrp);
 out_finish:
@@ -3533,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
        struct cftype *cft;
 
-       /*
-        * If legacy_flies_on_dfl, we want to show the legacy files on the
-        * dfl hierarchy but iff the target subsystem hasn't been updated
-        * for the dfl hierarchy yet.
-        */
-       if (!cgroup_legacy_files_on_dfl ||
-           ss->dfl_cftypes != ss->legacy_cftypes) {
-               for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-                       cft->flags |= __CFTYPE_NOT_ON_DFL;
-       }
-
+       for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+               cft->flags |= __CFTYPE_NOT_ON_DFL;
        return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -3558,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
        int count = 0;
        struct cgrp_cset_link *link;
 
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += atomic_read(&link->cset->refcount);
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        return count;
 }
 
@@ -3804,6 +3809,8 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
        struct cgrp_cset_link *link;
        struct css_set *cset;
 
+       lockdep_assert_held(&css_set_lock);
+
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
@@ -3831,12 +3838,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
        it->tasks_head = &cset->tasks;
        it->mg_tasks_head = &cset->mg_tasks;
+
+       /*
+        * We don't keep css_sets locked across iteration steps and thus
+        * need to take steps to ensure that iteration can be resumed after
+        * the lock is re-acquired.  Iteration is performed at two levels -
+        * css_sets and tasks in them.
+        *
+        * Once created, a css_set never leaves its cgroup lists, so a
+        * pinned css_set is guaranteed to stay put and we can resume
+        * iteration afterwards.
+        *
+        * Tasks may leave @cset across iteration steps.  This is resolved
+        * by registering each iterator with the css_set currently being
+        * walked and making css_set_move_task() advance iterators whose
+        * next task is leaving.
+        */
+       if (it->cur_cset) {
+               list_del(&it->iters_node);
+               put_css_set_locked(it->cur_cset);
+       }
+       get_css_set(cset);
+       it->cur_cset = cset;
+       list_add(&it->iters_node, &cset->task_iters);
 }
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
        struct list_head *l = it->task_pos;
 
+       lockdep_assert_held(&css_set_lock);
        WARN_ON_ONCE(!l);
 
        /*
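Concretely, the registration closes this window: an iterator parks on a task and drops the lock, then the task exits before the next step. Because the iterator sits on cset->task_iters, css_set_move_task() (earlier hunk) advances it before unlinking, so it->task_pos never points at a removed cg_list entry:

        /* CPU0 (iterating)                   CPU1 (task exiting)
         *
         * css_task_iter_next()
         *   lock; park task_pos on @task; unlock
         *                                    cgroup_exit()
         *                                      lock
         *                                      css_set_move_task()
         *                                        css_task_iter_advance(it)
         *                                        list_del_init(&task->cg_list)
         *                                      unlock
         * css_task_iter_next()
         *   resumes at the advanced position
         */
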
@@ -3864,19 +3895,16 @@ static void css_task_iter_advance(struct css_task_iter *it)
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
                         struct css_task_iter *it)
-       __acquires(css_set_rwsem)
 {
        /* no one should try to iterate before mounting cgroups */
        WARN_ON_ONCE(!use_task_css_set_links);
 
-       down_read(&css_set_rwsem);
+       memset(it, 0, sizeof(*it));
+
+       spin_lock_bh(&css_set_lock);
 
        it->ss = css->ss;
 
@@ -3888,6 +3916,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
        it->cset_head = it->cset_pos;
 
        css_task_iter_advance_css_set(it);
+
+       spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -3900,14 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-       struct task_struct *res;
+       if (it->cur_task) {
+               put_task_struct(it->cur_task);
+               it->cur_task = NULL;
+       }
 
-       if (!it->cset_pos)
-               return NULL;
+       spin_lock_bh(&css_set_lock);
 
-       res = list_entry(it->task_pos, struct task_struct, cg_list);
-       css_task_iter_advance(it);
-       return res;
+       if (it->task_pos) {
+               it->cur_task = list_entry(it->task_pos, struct task_struct,
+                                         cg_list);
+               get_task_struct(it->cur_task);
+               css_task_iter_advance(it);
+       }
+
+       spin_unlock_bh(&css_set_lock);
+
+       return it->cur_task;
 }
 
 /**
@@ -3917,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-       __releases(css_set_rwsem)
 {
-       up_read(&css_set_rwsem);
+       if (it->cur_cset) {
+               spin_lock_bh(&css_set_lock);
+               list_del(&it->iters_node);
+               put_css_set_locked(it->cur_cset);
+               spin_unlock_bh(&css_set_lock);
+       }
+
+       if (it->cur_task)
+               put_task_struct(it->cur_task);
 }
 
 /**
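With start/next/end reworked, a walk now looks like the sketch below: css_set_lock is held only inside each call, the returned task stays pinned until the following call, and the loop body is free to sleep (the per-task work is hypothetical):

        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it))) {
                /* @task holds a reference until the next iteration */
                do_per_task_work(task);         /* hypothetical; may sleep */
        }
        css_task_iter_end(&it);
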
@@ -3944,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
        mutex_lock(&cgroup_mutex);
 
        /* all tasks in @from are being moved, all csets are source */
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &from->cset_links, cset_link)
                cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
 
        ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
        if (ret)
@@ -5121,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
+       have_free_callback |= (bool)ss->free << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
        /* At system boot, before all subsystems have been
@@ -5181,7 +5228,7 @@ int __init cgroup_init(void)
 {
        struct cgroup_subsys *ss;
        unsigned long key;
-       int ssid, err;
+       int ssid;
 
        BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
@@ -5226,9 +5273,6 @@ int __init cgroup_init(void)
 
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-               if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
-                       ss->dfl_cftypes = ss->legacy_cftypes;
-
                if (!ss->dfl_cftypes)
                        cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
 
@@ -5243,17 +5287,10 @@ int __init cgroup_init(void)
                        ss->bind(init_css_set.subsys[ssid]);
        }
 
-       err = sysfs_create_mount_point(fs_kobj, "cgroup");
-       if (err)
-               return err;
-
-       err = register_filesystem(&cgroup_fs_type);
-       if (err < 0) {
-               sysfs_remove_mount_point(fs_kobj, "cgroup");
-               return err;
-       }
+       WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+       WARN_ON(register_filesystem(&cgroup_fs_type));
+       WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
 
-       proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
        return 0;
 }
 
@@ -5300,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                goto out;
 
        mutex_lock(&cgroup_mutex);
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
 
        for_each_root(root) {
                struct cgroup_subsys *ss;
@@ -5320,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');
+
                cgrp = task_cgroup_from_root(tsk, root);
-               path = cgroup_path(cgrp, buf, PATH_MAX);
-               if (!path) {
-                       retval = -ENAMETOOLONG;
-                       goto out_unlock;
+
+               /*
+                * On traditional hierarchies, all zombie tasks show up as
+                * belonging to the root cgroup.  On the default hierarchy,
+                * while a zombie doesn't show up in "cgroup.procs" and
+                * thus can't be migrated, its /proc/PID/cgroup keeps
+                * reporting the cgroup it belonged to before exiting.  If
+                * the cgroup is removed before the zombie is reaped,
+                * " (deleted)" is appended to the cgroup path.
+                */
+               if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+                       path = cgroup_path(cgrp, buf, PATH_MAX);
+                       if (!path) {
+                               retval = -ENAMETOOLONG;
+                               goto out_unlock;
+                       }
+               } else {
+                       path = "/";
                }
+
                seq_puts(m, path);
-               seq_putc(m, '\n');
+
+               if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+                       seq_puts(m, " (deleted)\n");
+               else
+                       seq_putc(m, '\n');
        }
 
        retval = 0;
 out_unlock:
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        kfree(buf);
 out:
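As a hypothetical example of the resulting /proc/PID/cgroup output for a zombie whose cgroup was removed before it was reaped (IDs, controllers, and paths invented; format abridged):

        # cat /proc/1234/cgroup
        4:memory:/                      zombies on legacy hierarchies report "/"
        0::/jobs/batch (deleted)        default hierarchy keeps the last path
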
@@ -5478,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
         * @child during its iteration.
         *
         * If we won the race, @child is associated with %current's
-        * css_set.  Grabbing css_set_rwsem guarantees both that the
+        * css_set.  Grabbing css_set_lock guarantees both that the
         * association is stable, and, on completion of the parent's
         * migration, @child is visible in the source of migration or
         * already in the destination cgroup.  This guarantee is necessary
@@ -5493,13 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
        if (use_task_css_set_links) {
                struct css_set *cset;
 
-               down_write(&css_set_rwsem);
+               spin_lock_bh(&css_set_lock);
                cset = task_css_set(current);
                if (list_empty(&child->cg_list)) {
                        get_css_set(cset);
                        css_set_move_task(child, NULL, cset, false);
                }
-               up_write(&css_set_rwsem);
+               spin_unlock_bh(&css_set_lock);
        }
 
        /*
@@ -5534,7 +5591,6 @@ void cgroup_exit(struct task_struct *tsk)
 {
        struct cgroup_subsys *ss;
        struct css_set *cset;
-       bool put_cset = false;
        int i;
 
        /*
@@ -5544,25 +5600,28 @@ void cgroup_exit(struct task_struct *tsk)
        cset = task_css_set(tsk);
 
        if (!list_empty(&tsk->cg_list)) {
-               down_write(&css_set_rwsem);
+               spin_lock_bh(&css_set_lock);
                css_set_move_task(tsk, cset, NULL, false);
-               up_write(&css_set_rwsem);
-               put_cset = true;
+               spin_unlock_bh(&css_set_lock);
+       } else {
+               get_css_set(cset);
        }
 
-       /* Reassign the task to the init_css_set. */
-       RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
        /* see cgroup_post_fork() for details */
-       for_each_subsys_which(ss, i, &have_exit_callback) {
-               struct cgroup_subsys_state *old_css = cset->subsys[i];
-               struct cgroup_subsys_state *css = task_css(tsk, i);
+       for_each_subsys_which(ss, i, &have_exit_callback)
+               ss->exit(tsk);
+}
 
-               ss->exit(css, old_css, tsk);
-       }
+void cgroup_free(struct task_struct *task)
+{
+       struct css_set *cset = task_css_set(task);
+       struct cgroup_subsys *ss;
+       int ssid;
 
-       if (put_cset)
-               put_css_set(cset);
+       for_each_subsys_which(ss, ssid, &have_free_callback)
+               ss->free(task);
+
+       put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
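cgroup_free() gives each subsystem a final callback on the task-struct free path, gated by the have_free_callback bitmask set up in cgroup_init_subsys() above. A hypothetical controller (all names invented) opts in like so:

        /* runs from the final put_task_struct(), after cgroup_exit() */
        static void mycg_free(struct task_struct *task)
        {
                release_mycg_state(task);       /* hypothetical helper */
        }

        struct cgroup_subsys mycg_cgrp_subsys = {
                .css_alloc      = mycg_css_alloc,       /* hypothetical */
                .css_free       = mycg_css_free,        /* hypothetical */
                .free           = mycg_free,    /* sets the have_free_callback bit */
        };
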
@@ -5653,14 +5712,6 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
-       printk("cgroup: using legacy files on the default hierarchy\n");
-       cgroup_legacy_files_on_dfl = true;
-       return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -5764,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
        if (!name_buf)
                return -ENOMEM;
 
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        cset = rcu_dereference(current->cgroups);
        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5775,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
                           c->root->hierarchy_id, name_buf);
        }
        rcu_read_unlock();
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        kfree(name_buf);
        return 0;
 }
@@ -5786,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
        struct cgroup_subsys_state *css = seq_css(seq);
        struct cgrp_cset_link *link;
 
-       down_read(&css_set_rwsem);
+       spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
                struct css_set *cset = link->cset;
                struct task_struct *task;
@@ -5809,7 +5860,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
        overflow:
                seq_puts(seq, "  ...\n");
        }
-       up_read(&css_set_rwsem);
+       spin_unlock_bh(&css_set_lock);
        return 0;
 }