kernel/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include <linux/cgroup.h>
  32 #include <linux/cred.h>
  33 #include <linux/ctype.h>
  34 #include <linux/errno.h>
  35 #include <linux/init_task.h>
  36 #include <linux/kernel.h>
  37 #include <linux/list.h>
  38 #include <linux/magic.h>
  39 #include <linux/mm.h>
  40 #include <linux/mutex.h>
  41 #include <linux/mount.h>
  42 #include <linux/pagemap.h>
  43 #include <linux/proc_fs.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/sched.h>
  46 #include <linux/slab.h>
  47 #include <linux/spinlock.h>
  48 #include <linux/percpu-rwsem.h>
  49 #include <linux/string.h>
  50 #include <linux/sort.h>
  51 #include <linux/kmod.h>
  52 #include <linux/delayacct.h>
  53 #include <linux/cgroupstats.h>
  54 #include <linux/hashtable.h>
  55 #include <linux/pid_namespace.h>
  56 #include <linux/idr.h>
  57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  58 #include <linux/kthread.h>
  59 #include <linux/delay.h>
  60 #include <linux/atomic.h>
  61 #include <linux/cpuset.h>
  62 #include <net/sock.h>
  63
  64 /*
  65  * pidlists linger the following amount before being destroyed.  The goal
  66  * is avoiding frequent destruction in the middle of consecutive read calls.
  67  * Expiring in the middle is a performance problem not a correctness one.
  68  * 1 sec should be enough.
  69  */
  70 #define CGROUP_PIDLIST_DESTROY_DELAY    HZ
  71
  72 #define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
  73                                          MAX_CFTYPE_NAME + 2)
  74
  75 /*
  76  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  77  * hierarchy must be performed while holding it.
  78  *
  79  * css_set_lock protects task->cgroups pointer, the list of css_set
  80  * objects, and the chain of tasks off each css_set.
  81  *
  82  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  83  * cgroup.h can use them for lockdep annotations.
  84  */
  85 #ifdef CONFIG_PROVE_RCU
  86 DEFINE_MUTEX(cgroup_mutex);
  87 DEFINE_SPINLOCK(css_set_lock);
  88 EXPORT_SYMBOL_GPL(cgroup_mutex);
  89 EXPORT_SYMBOL_GPL(css_set_lock);
  90 #else
  91 static DEFINE_MUTEX(cgroup_mutex);
  92 static DEFINE_SPINLOCK(css_set_lock);
  93 #endif
  94
  95 /*
  96  * Protects cgroup_idr and css_idr so that IDs can be released without
  97  * grabbing cgroup_mutex.
  98  */
  99 static DEFINE_SPINLOCK(cgroup_idr_lock);
 100
 101 /*
 102  * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 103  * against file removal/re-creation across css hiding.
 104  */
 105 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 106
 107 /*
 108  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 109  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 110  */
 111 static DEFINE_SPINLOCK(release_agent_path_lock);
 112
 113 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 114
 115 #define cgroup_assert_mutex_or_rcu_locked()                             \
 116         RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
 117                            !lockdep_is_held(&cgroup_mutex),             \
 118                            "cgroup_mutex or RCU read lock required");
 119
 120 /*
 121  * cgroup destruction makes heavy use of work items and there can be a lot
 122  * of concurrent destructions.  Use a separate workqueue so that cgroup
 123  * destruction work items don't end up filling up max_active of system_wq
 124  * which may lead to deadlock.
 125  */
 126 static struct workqueue_struct *cgroup_destroy_wq;
 127
 128 /*
 129  * pidlist destructions need to be flushed on cgroup destruction.  Use a
 130  * separate workqueue as flush domain.
 131  */
 132 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 133
 134 /* generate an array of cgroup subsystem pointers */
 135 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 136 static struct cgroup_subsys *cgroup_subsys[] = {
 137 #include <linux/cgroup_subsys.h>
 138 };
 139 #undef SUBSYS
 140
 141 /* array of cgroup subsystem names */
 142 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 143 static const char *cgroup_subsys_name[] = {
 144 #include <linux/cgroup_subsys.h>
 145 };
 146 #undef SUBSYS
 147
 148 /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
 149 #define SUBSYS(_x)                                                              \
 150         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 151         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 152         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 153         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 154 #include <linux/cgroup_subsys.h>
 155 #undef SUBSYS
 156
 157 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 158 static struct static_key_true *cgroup_subsys_enabled_key[] = {
 159 #include <linux/cgroup_subsys.h>
 160 };
 161 #undef SUBSYS
 162
 163 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 164 static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 165 #include <linux/cgroup_subsys.h>
 166 };
 167 #undef SUBSYS
 168
 169 /*
 170  * The default hierarchy, reserved for the subsystems that are otherwise
 171  * unattached - it never has more than a single cgroup, and all tasks are
 172  * part of that cgroup.
 173  */
 174 struct cgroup_root cgrp_dfl_root;
 175 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 176
 177 /*
 178  * The default hierarchy always exists but is hidden until mounted for the
 179  * first time.  This is for backward compatibility.
 180  */
 181 static bool cgrp_dfl_visible;
 182
 183 /* Controllers blocked by the commandline in v1 */
 184 static u16 cgroup_no_v1_mask;
 185
 186 /* some controllers are not supported in the default hierarchy */
 187 static u16 cgrp_dfl_inhibit_ss_mask;
 188
 189 /* The list of hierarchy roots */
 190
 191 static LIST_HEAD(cgroup_roots);
 192 static int cgroup_root_count;
 193
 194 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 195 static DEFINE_IDR(cgroup_hierarchy_idr);
 196
 197 /*
 198  * Assign a monotonically increasing serial number to csses.  It guarantees
 199  * cgroups with bigger numbers are newer than those with smaller numbers.
 200  * Also, as csses are always appended to the parent's ->children list, it
 201  * guarantees that sibling csses are always sorted in the ascending serial
 202  * number order on the list.  Protected by cgroup_mutex.
 203  */
 204 static u64 css_serial_nr_next = 1;
 205
 206 /*
 207  * These bitmask flags indicate whether tasks in the fork and exit paths have
 208  * fork/exit handlers to call. This avoids us having to do extra work in the
 209  * fork/exit path to check which subsystems have fork/exit callbacks.
 210  */
 211 static u16 have_fork_callback __read_mostly;
 212 static u16 have_exit_callback __read_mostly;
 213 static u16 have_free_callback __read_mostly;
 214
 215 /* Ditto for the can_fork callback. */
 216 static u16 have_canfork_callback __read_mostly;
 217
 218 static struct file_system_type cgroup2_fs_type;
 219 static struct cftype cgroup_dfl_base_files[];
 220 static struct cftype cgroup_legacy_base_files[];
 221
 222 static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 223 static void css_task_iter_advance(struct css_task_iter *it);
 224 static int cgroup_destroy_locked(struct cgroup *cgrp);
 225 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 226                                               struct cgroup_subsys *ss);
 227 static void css_release(struct percpu_ref *ref);
 228 static void kill_css(struct cgroup_subsys_state *css);
 229 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 230                               struct cgroup *cgrp, struct cftype cfts[],
 231                               bool is_add);
 232
 233 /**
 234  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 235  * @ssid: subsys ID of interest
 236  *
 237  * cgroup_subsys_enabled() can only be used with literal subsys names which
 238  * is fine for individual subsystems but unsuitable for cgroup core.  This
 239  * is slower static_key_enabled() based test indexed by @ssid.
 240  */
 241 static bool cgroup_ssid_enabled(int ssid)
 242 {
 243         return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 244 }
 245
 246 static bool cgroup_ssid_no_v1(int ssid)
 247 {
 248         return cgroup_no_v1_mask & (1 << ssid);
 249 }
 250
 251 /**
 252  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 253  * @cgrp: the cgroup of interest
 254  *
 255  * The default hierarchy is the v2 interface of cgroup and this function
 256  * can be used to test whether a cgroup is on the default hierarchy for
 257  * cases where a subsystem should behave differently depending on the
 258  * interface version.
 259  *
 260  * The set of behaviors which change on the default hierarchy are still
 261  * being determined and the mount option is prefixed with __DEVEL__.
 262  *
 263  * List of changed behaviors:
 264  *
 265  * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 266  *   and "name" are disallowed.
 267  *
 268  * - When mounting an existing superblock, mount options should match.
 269  *
 270  * - Remount is disallowed.
 271  *
 272  * - rename(2) is disallowed.
 273  *
 274  * - "tasks" is removed.  Everything should be at process granularity.  Use
 275  *   "cgroup.procs" instead.
 276  *
 277  * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 278  *   recycled in between reads.
 279  *
 280  * - "release_agent" and "notify_on_release" are removed.  Replacement
 281  *   notification mechanism will be implemented.
 282  *
 283  * - "cgroup.clone_children" is removed.
 284  *
 285  * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 286  *   and its descendants contain no task; otherwise, 1.  The file also
 287  *   generates kernfs notification which can be monitored through poll and
 288  *   [di]notify when the value of the file changes.
 289  *
 290  * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 291  *   take masks of ancestors with non-empty cpus/mems, instead of being
 292  *   moved to an ancestor.
 293  *
 294  * - cpuset: a task can be moved into an empty cpuset, and again it takes
 295  *   masks of ancestors.
 296  *
 297  * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 298  *   is not created.
 299  *
 300  * - blkcg: blk-throttle becomes properly hierarchical.
 301  *
 302  * - debug: disallowed on the default hierarchy.
 303  */
 304 static bool cgroup_on_dfl(const struct cgroup *cgrp)
 305 {
 306         return cgrp->root == &cgrp_dfl_root;
 307 }
 308
 309 /* IDR wrappers which synchronize using cgroup_idr_lock */
 310 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 311                             gfp_t gfp_mask)
 312 {
 313         int ret;
 314
 315         idr_preload(gfp_mask);
 316         spin_lock_bh(&cgroup_idr_lock);
 317         ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
 318         spin_unlock_bh(&cgroup_idr_lock);
 319         idr_preload_end();
 320         return ret;
 321 }
 322
 323 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 324 {
 325         void *ret;
 326
 327         spin_lock_bh(&cgroup_idr_lock);
 328         ret = idr_replace(idr, ptr, id);
 329         spin_unlock_bh(&cgroup_idr_lock);
 330         return ret;
 331 }
 332
/* idr_remove() wrapper: removes @id from @idr under cgroup_idr_lock. */
static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}
 339
 340 static struct cgroup *cgroup_parent(struct cgroup *cgrp)
 341 {
 342         struct cgroup_subsys_state *parent_css = cgrp->self.parent;
 343
 344         if (parent_css)
 345                 return container_of(parent_css, struct cgroup, self);
 346         return NULL;
 347 }
 348
 349 /* subsystems visibly enabled on a cgroup */
 350 static u16 cgroup_control(struct cgroup *cgrp)
 351 {
 352         struct cgroup *parent = cgroup_parent(cgrp);
 353         u16 root_ss_mask = cgrp->root->subsys_mask;
 354
 355         if (parent)
 356                 return parent->subtree_control;
 357
 358         if (cgroup_on_dfl(cgrp))
 359                 root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask;
 360
 361         return root_ss_mask;
 362 }
 363
 364 /* subsystems enabled on a cgroup */
 365 static u16 cgroup_ss_mask(struct cgroup *cgrp)
 366 {
 367         struct cgroup *parent = cgroup_parent(cgrp);
 368
 369         if (parent)
 370                 return parent->subtree_ss_mask;
 371
 372         return cgrp->root->subsys_mask;
 373 }
 374
 375 /**
 376  * cgroup_css - obtain a cgroup's css for the specified subsystem
 377  * @cgrp: the cgroup of interest
 378  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 379  *
 380  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 381  * function must be called either under cgroup_mutex or rcu_read_lock() and
 382  * the caller is responsible for pinning the returned css if it wants to
 383  * keep accessing it outside the said locks.  This function may return
 384  * %NULL if @cgrp doesn't have @subsys_id enabled.
 385  */
 386 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 387                                               struct cgroup_subsys *ss)
 388 {
 389         if (ss)
 390                 return rcu_dereference_check(cgrp->subsys[ss->id],
 391                                         lockdep_is_held(&cgroup_mutex));
 392         else
 393                 return &cgrp->self;
 394 }
 395
 396 /**
 397  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 398  * @cgrp: the cgroup of interest
 399  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 400  *
 401  * Similar to cgroup_css() but returns the effective css, which is defined
 402  * as the matching css of the nearest ancestor including self which has @ss
 403  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 404  * function is guaranteed to return non-NULL css.
 405  */
 406 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 407                                                 struct cgroup_subsys *ss)
 408 {
 409         lockdep_assert_held(&cgroup_mutex);
 410
 411         if (!ss)
 412                 return &cgrp->self;
 413
 414         /*
 415          * This function is used while updating css associations and thus
 416          * can't test the csses directly.  Test ss_mask.
 417          */
 418         while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
 419                 cgrp = cgroup_parent(cgrp);
 420                 if (!cgrp)
 421                         return NULL;
 422         }
 423
 424         return cgroup_css(cgrp, ss);
 425 }
 426
 427 /**
 428  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 429  * @cgrp: the cgroup of interest
 430  * @ss: the subsystem of interest
 431  *
 432  * Find and get the effective css of @cgrp for @ss.  The effective css is
 433  * defined as the matching css of the nearest ancestor including self which
 434  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 435  * the root css is returned, so this function always returns a valid css.
 436  * The returned css must be put using css_put().
 437  */
 438 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 439                                              struct cgroup_subsys *ss)
 440 {
 441         struct cgroup_subsys_state *css;
 442
 443         rcu_read_lock();
 444
 445         do {
 446                 css = cgroup_css(cgrp, ss);
 447
 448                 if (css && css_tryget_online(css))
 449                         goto out_unlock;
 450                 cgrp = cgroup_parent(cgrp);
 451         } while (cgrp);
 452
 453         css = init_css_set.subsys[ss->id];
 454         css_get(css);
 455 out_unlock:
 456         rcu_read_unlock();
 457         return css;
 458 }
 459
 460 /* convenient tests for these bits */
 461 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 462 {
 463         return !(cgrp->self.flags & CSS_ONLINE);
 464 }
 465
 466 static void cgroup_get(struct cgroup *cgrp)
 467 {
 468         WARN_ON_ONCE(cgroup_is_dead(cgrp));
 469         css_get(&cgrp->self);
 470 }
 471
 472 static bool cgroup_tryget(struct cgroup *cgrp)
 473 {
 474         return css_tryget(&cgrp->self);
 475 }
 476
 477 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 478 {
 479         struct cgroup *cgrp = of->kn->parent->priv;
 480         struct cftype *cft = of_cft(of);
 481
 482         /*
 483          * This is open and unprotected implementation of cgroup_css().
 484          * seq_css() is only called from a kernfs file operation which has
 485          * an active reference on the file.  Because all the subsystem
 486          * files are drained before a css is disassociated with a cgroup,
 487          * the matching css from the cgroup's subsys table is guaranteed to
 488          * be and stay valid until the enclosing operation is complete.
 489          */
 490         if (cft->ss)
 491                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 492         else
 493                 return &cgrp->self;
 494 }
 495 EXPORT_SYMBOL_GPL(of_css);
 496
/* Test the CGRP_NOTIFY_ON_RELEASE bit in @cgrp->flags. */
static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
 501
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Subsystem slots whose css pointer is %NULL are skipped: the loop body
 * supplied by the caller binds to the trailing "else".
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
 516
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Each css is looked up with cgroup_e_css(); subsystems for which it
 * returns %NULL are skipped.
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else
 530
/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * @ss is assigned from cgroup_subsys[@ssid] as part of the loop condition;
 * the "|| true" keeps the loop going regardless of the assigned value.
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 539
/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * @ss_mask is evaluated once and copied into a local.  This macro opens a
 * do { } while construct and nested braces which must be closed with
 * while_each_subsys_mask().
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* closes the braces and do-while opened by do_each_subsys_mask() */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
 563
/* iterate across the hierarchies, i.e. the entries on cgroup_roots */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
 567
/*
 * iterate over child cgrps, lock should be held throughout iteration
 *
 * Asserts that cgroup_mutex is held on every step and skips children for
 * which cgroup_is_dead() is true; the caller's loop body binds to the
 * trailing "else".
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
 575
 576 static void cgroup_release_agent(struct work_struct *work);
 577 static void check_for_release(struct cgroup *cgrp);
 578
 579 /*
 580  * A cgroup can be associated with multiple css_sets as different tasks may
 581  * belong to different cgroups on different hierarchies.  In the other
 582  * direction, a css_set is naturally associated with multiple cgroups.
 583  * This M:N relationship is represented by the following link structure
 584  * which exists for each association and allows traversing the associations
 585  * from both sides.
 586  */
/* one cgroup <-> css_set association, linked into a list on each side */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
 598
 599 /*
 600  * The default css_set - used by init and its children prior to any
 601  * hierarchies being mounted. It contains a pointer to the root state
 602  * for each subsystem. Also used to anchor the list of css_sets. Not
 603  * reference-counted, to improve performance when child cgroups
 604  * haven't been created.
 605  */
/* statically initialised: refcount starts at 1, list heads via LIST_HEAD_INIT */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
};
 615
 616 static int css_set_count        = 1;    /* 1 for init_css_set */
 617
 618 /**
 619  * css_set_populated - does a css_set contain any tasks?
 620  * @cset: target css_set
 621  */
 622 static bool css_set_populated(struct css_set *cset)
 623 {
 624         lockdep_assert_held(&css_set_lock);
 625
 626         return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 627 }
 628
 629 /**
 630  * cgroup_update_populated - updated populated count of a cgroup
 631  * @cgrp: the target cgroup
 632  * @populated: inc or dec populated count
 633  *
 634  * One of the css_sets associated with @cgrp is either getting its first
 635  * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 636  * count is propagated towards root so that a given cgroup's populated_cnt
 637  * is zero iff the cgroup and all its descendants don't contain any tasks.
 638  *
 639  * @cgrp's interface file "cgroup.populated" is zero if
 640  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 641  * changes from or to zero, userland is notified that the content of the
 642  * interface file has changed.  This can be used to detect when @cgrp and
 643  * its descendants become populated or empty.
 644  */
 645 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 646 {
 647         lockdep_assert_held(&css_set_lock);
 648
 649         do {
 650                 bool trigger;
 651
 652                 if (populated)
 653                         trigger = !cgrp->populated_cnt++;
 654                 else
 655                         trigger = !--cgrp->populated_cnt;
 656
 657                 if (!trigger)
 658                         break;
 659
 660                 check_for_release(cgrp);
 661                 cgroup_file_notify(&cgrp->events_file);
 662
 663                 cgrp = cgroup_parent(cgrp);
 664         } while (cgrp);
 665 }
 666
 667 /**
 668  * css_set_update_populated - update populated state of a css_set
 669  * @cset: target css_set
 670  * @populated: whether @cset is populated or depopulated
 671  *
 672  * @cset is either getting the first task or losing the last.  Update the
 673  * ->populated_cnt of all associated cgroups accordingly.
 674  */
 675 static void css_set_update_populated(struct css_set *cset, bool populated)
 676 {
 677         struct cgrp_cset_link *link;
 678
 679         lockdep_assert_held(&css_set_lock);
 680
 681         list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 682                 cgroup_update_populated(link->cgrp, populated);
 683 }
 684
 685 /**
 686  * css_set_move_task - move a task from one css_set to another
 687  * @task: task being moved
 688  * @from_cset: css_set @task currently belongs to (may be NULL)
 689  * @to_cset: new css_set @task is being moved to (may be NULL)
 690  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 691  *
 692  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 693  * css_set, @from_cset can be NULL.  If @task is being disassociated
 694  * instead of moved, @to_cset can be NULL.
 695  *
 696  * This function automatically handles populated_cnt updates and
 697  * css_task_iter adjustments but the caller is responsible for managing
 698  * @from_cset and @to_cset's reference counts.
 699  */
 700 static void css_set_move_task(struct task_struct *task,
 701                               struct css_set *from_cset, struct css_set *to_cset,
 702                               bool use_mg_tasks)
 703 {
 704         lockdep_assert_held(&css_set_lock);
 705
 706         if (to_cset && !css_set_populated(to_cset))
 707                 css_set_update_populated(to_cset, true);
 708
 709         if (from_cset) {
 710                 struct css_task_iter *it, *pos;
 711
 712                 WARN_ON_ONCE(list_empty(&task->cg_list));
 713
 714                 /*
 715                  * @task is leaving, advance task iterators which are
 716                  * pointing to it so that they can resume at the next
 717                  * position.  Advancing an iterator might remove it from
 718                  * the list, use safe walk.  See css_task_iter_advance*()
 719                  * for details.
 720                  */
 721                 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
 722                                          iters_node)
 723                         if (it->task_pos == &task->cg_list)
 724                                 css_task_iter_advance(it);
 725
 726                 list_del_init(&task->cg_list);
 727                 if (!css_set_populated(from_cset))
 728                         css_set_update_populated(from_cset, false);
 729         } else {
 730                 WARN_ON_ONCE(!list_empty(&task->cg_list));
 731         }
 732
 733         if (to_cset) {
 734                 /*
 735                  * We are synchronized through cgroup_threadgroup_rwsem
 736                  * against PF_EXITING setting such that we can't race
 737                  * against cgroup_exit() changing the css_set to
 738                  * init_css_set and dropping the old one.
 739                  */
 740                 WARN_ON_ONCE(task->flags & PF_EXITING);
 741
 742                 rcu_assign_pointer(task->cgroups, to_cset);
 743                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 744                                                              &to_cset->tasks);
 745         }
 746 }
 747
 748 /*
 749  * hash table for cgroup groups. This improves the performance to find
 750  * an existing css_set. This hash doesn't (currently) take into
 751  * account cgroups in empty hierarchies.
 752  */
 753 #define CSS_SET_HASH_BITS       7
 754 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 755
 756 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 757 {
 758         unsigned long key = 0UL;
 759         struct cgroup_subsys *ss;
 760         int i;
 761
 762         for_each_subsys(ss, i)
 763                 key += (unsigned long)css[i];
 764         key = (key >> 16) ^ key;
 765
 766         return key;
 767 }
 768
 769 static void put_css_set_locked(struct css_set *cset)
 770 {
 771         struct cgrp_cset_link *link, *tmp_link;
 772         struct cgroup_subsys *ss;
 773         int ssid;
 774
 775         lockdep_assert_held(&css_set_lock);
 776
 777         if (!atomic_dec_and_test(&cset->refcount))
 778                 return;
 779
 780         /* This css_set is dead. unlink it and release cgroup and css refs */
 781         for_each_subsys(ss, ssid) {
 782                 list_del(&cset->e_cset_node[ssid]);
 783                 css_put(cset->subsys[ssid]);
 784         }
 785         hash_del(&cset->hlist);
 786         css_set_count--;
 787
 788         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 789                 list_del(&link->cset_link);
 790                 list_del(&link->cgrp_link);
 791                 if (cgroup_parent(link->cgrp))
 792                         cgroup_put(link->cgrp);
 793                 kfree(link);
 794         }
 795
 796         kfree_rcu(cset, rcu_head);
 797 }
 798
 799 static void put_css_set(struct css_set *cset)
 800 {
 801         /*
 802          * Ensure that the refcount doesn't hit zero while any readers
 803          * can see it. Similar to atomic_dec_and_lock(), but for an
 804          * rwlock
 805          */
 806         if (atomic_add_unless(&cset->refcount, -1, 1))
 807                 return;
 808
 809         spin_lock_bh(&css_set_lock);
 810         put_css_set_locked(cset);
 811         spin_unlock_bh(&css_set_lock);
 812 }
 813
 814 /*
 815  * refcounted get/put for css_set objects
 816  */
 817 static inline void get_css_set(struct css_set *cset)
 818 {
 819         atomic_inc(&cset->refcount);
 820 }
 821
 822 /**
 823  * compare_css_sets - helper function for find_existing_css_set().
 824  * @cset: candidate css_set being tested
 825  * @old_cset: existing css_set for a task
 826  * @new_cgrp: cgroup that's being entered by the task
 827  * @template: desired set of css pointers in css_set (pre-calculated)
 828  *
 829  * Returns true if "cset" matches "old_cset" except for the hierarchy
 830  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 831  */
 832 static bool compare_css_sets(struct css_set *cset,
 833                              struct css_set *old_cset,
 834                              struct cgroup *new_cgrp,
 835                              struct cgroup_subsys_state *template[])
 836 {
 837         struct list_head *l1, *l2;
 838
 839         /*
 840          * On the default hierarchy, there can be csets which are
 841          * associated with the same set of cgroups but different csses.
 842          * Let's first ensure that csses match.
 843          */
 844         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 845                 return false;
 846
 847         /*
 848          * Compare cgroup pointers in order to distinguish between
 849          * different cgroups in hierarchies.  As different cgroups may
 850          * share the same effective css, this comparison is always
 851          * necessary.
 852          */
 853         l1 = &cset->cgrp_links;
 854         l2 = &old_cset->cgrp_links;
 855         while (1) {
 856                 struct cgrp_cset_link *link1, *link2;
 857                 struct cgroup *cgrp1, *cgrp2;
 858
 859                 l1 = l1->next;
 860                 l2 = l2->next;
 861                 /* See if we reached the end - both lists are equal length. */
 862                 if (l1 == &cset->cgrp_links) {
 863                         BUG_ON(l2 != &old_cset->cgrp_links);
 864                         break;
 865                 } else {
 866                         BUG_ON(l2 == &old_cset->cgrp_links);
 867                 }
 868                 /* Locate the cgroups associated with these links. */
 869                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 870                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 871                 cgrp1 = link1->cgrp;
 872                 cgrp2 = link2->cgrp;
 873                 /* Hierarchies should be linked in the same order. */
 874                 BUG_ON(cgrp1->root != cgrp2->root);
 875
 876                 /*
 877                  * If this hierarchy is the hierarchy of the cgroup
 878                  * that's changing, then we need to check that this
 879                  * css_set points to the new cgroup; if it's any other
 880                  * hierarchy, then this css_set should point to the
 881                  * same cgroup as the old css_set.
 882                  */
 883                 if (cgrp1->root == new_cgrp->root) {
 884                         if (cgrp1 != new_cgrp)
 885                                 return false;
 886                 } else {
 887                         if (cgrp1 != cgrp2)
 888                                 return false;
 889                 }
 890         }
 891         return true;
 892 }
 893
 894 /**
 895  * find_existing_css_set - init css array and find the matching css_set
 896  * @old_cset: the css_set that we're using before the cgroup transition
 897  * @cgrp: the cgroup that we're moving into
 898  * @template: out param for the new set of csses, should be clear on entry
 899  */
 900 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 901                                         struct cgroup *cgrp,
 902                                         struct cgroup_subsys_state *template[])
 903 {
 904         struct cgroup_root *root = cgrp->root;
 905         struct cgroup_subsys *ss;
 906         struct css_set *cset;
 907         unsigned long key;
 908         int i;
 909
 910         /*
 911          * Build the set of subsystem state objects that we want to see in the
 912          * new css_set. while subsystems can change globally, the entries here
 913          * won't change, so no need for locking.
 914          */
 915         for_each_subsys(ss, i) {
 916                 if (root->subsys_mask & (1UL << i)) {
 917                         /*
 918                          * @ss is in this hierarchy, so we want the
 919                          * effective css from @cgrp.
 920                          */
 921                         template[i] = cgroup_e_css(cgrp, ss);
 922                 } else {
 923                         /*
 924                          * @ss is not in this hierarchy, so we don't want
 925                          * to change the css.
 926                          */
 927                         template[i] = old_cset->subsys[i];
 928                 }
 929         }
 930
 931         key = css_set_hash(template);
 932         hash_for_each_possible(css_set_table, cset, hlist, key) {
 933                 if (!compare_css_sets(cset, old_cset, cgrp, template))
 934                         continue;
 935
 936                 /* This css_set matches what we need */
 937                 return cset;
 938         }
 939
 940         /* No existing cgroup group matched */
 941         return NULL;
 942 }
 943
 944 static void free_cgrp_cset_links(struct list_head *links_to_free)
 945 {
 946         struct cgrp_cset_link *link, *tmp_link;
 947
 948         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 949                 list_del(&link->cset_link);
 950                 kfree(link);
 951         }
 952 }
 953
 954 /**
 955  * allocate_cgrp_cset_links - allocate cgrp_cset_links
 956  * @count: the number of links to allocate
 957  * @tmp_links: list_head the allocated links are put on
 958  *
 959  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 960  * through ->cset_link.  Returns 0 on success or -errno.
 961  */
 962 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 963 {
 964         struct cgrp_cset_link *link;
 965         int i;
 966
 967         INIT_LIST_HEAD(tmp_links);
 968
 969         for (i = 0; i < count; i++) {
 970                 link = kzalloc(sizeof(*link), GFP_KERNEL);
 971                 if (!link) {
 972                         free_cgrp_cset_links(tmp_links);
 973                         return -ENOMEM;
 974                 }
 975                 list_add(&link->cset_link, tmp_links);
 976         }
 977         return 0;
 978 }
 979
 980 /**
 981  * link_css_set - a helper function to link a css_set to a cgroup
 982  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 983  * @cset: the css_set to be linked
 984  * @cgrp: the destination cgroup
 985  */
 986 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 987                          struct cgroup *cgrp)
 988 {
 989         struct cgrp_cset_link *link;
 990
 991         BUG_ON(list_empty(tmp_links));
 992
 993         if (cgroup_on_dfl(cgrp))
 994                 cset->dfl_cgrp = cgrp;
 995
 996         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 997         link->cset = cset;
 998         link->cgrp = cgrp;
 999
1000         /*
1001          * Always add links to the tail of the lists so that the lists are
1002          * in choronological order.
1003          */
1004         list_move_tail(&link->cset_link, &cgrp->cset_links);
1005         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1006
1007         if (cgroup_parent(cgrp))
1008                 cgroup_get(cgrp);
1009 }
1010
1011 /**
1012  * find_css_set - return a new css_set with one cgroup updated
1013  * @old_cset: the baseline css_set
1014  * @cgrp: the cgroup to be updated
1015  *
1016  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
1017  * substituted into the appropriate hierarchy.
1018  */
1019 static struct css_set *find_css_set(struct css_set *old_cset,
1020                                     struct cgroup *cgrp)
1021 {
1022         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1023         struct css_set *cset;
1024         struct list_head tmp_links;
1025         struct cgrp_cset_link *link;
1026         struct cgroup_subsys *ss;
1027         unsigned long key;
1028         int ssid;
1029
1030         lockdep_assert_held(&cgroup_mutex);
1031
1032         /* First see if we already have a cgroup group that matches
1033          * the desired set */
1034         spin_lock_bh(&css_set_lock);
1035         cset = find_existing_css_set(old_cset, cgrp, template);
1036         if (cset)
1037                 get_css_set(cset);
1038         spin_unlock_bh(&css_set_lock);
1039
1040         if (cset)
1041                 return cset;
1042
1043         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1044         if (!cset)
1045                 return NULL;
1046
1047         /* Allocate all the cgrp_cset_link objects that we'll need */
1048         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1049                 kfree(cset);
1050                 return NULL;
1051         }
1052
1053         atomic_set(&cset->refcount, 1);
1054         INIT_LIST_HEAD(&cset->cgrp_links);
1055         INIT_LIST_HEAD(&cset->tasks);
1056         INIT_LIST_HEAD(&cset->mg_tasks);
1057         INIT_LIST_HEAD(&cset->mg_preload_node);
1058         INIT_LIST_HEAD(&cset->mg_node);
1059         INIT_LIST_HEAD(&cset->task_iters);
1060         INIT_HLIST_NODE(&cset->hlist);
1061
1062         /* Copy the set of subsystem state objects generated in
1063          * find_existing_css_set() */
1064         memcpy(cset->subsys, template, sizeof(cset->subsys));
1065
1066         spin_lock_bh(&css_set_lock);
1067         /* Add reference counts and links from the new css_set. */
1068         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1069                 struct cgroup *c = link->cgrp;
1070
1071                 if (c->root == cgrp->root)
1072                         c = cgrp;
1073                 link_css_set(&tmp_links, cset, c);
1074         }
1075
1076         BUG_ON(!list_empty(&tmp_links));
1077
1078         css_set_count++;
1079
1080         /* Add @cset to the hash table */
1081         key = css_set_hash(cset->subsys);
1082         hash_add(css_set_table, &cset->hlist, key);
1083
1084         for_each_subsys(ss, ssid) {
1085                 struct cgroup_subsys_state *css = cset->subsys[ssid];
1086
1087                 list_add_tail(&cset->e_cset_node[ssid],
1088                               &css->cgroup->e_csets[ssid]);
1089                 css_get(css);
1090         }
1091
1092         spin_unlock_bh(&css_set_lock);
1093
1094         return cset;
1095 }
1096
1097 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1098 {
1099         struct cgroup *root_cgrp = kf_root->kn->priv;
1100
1101         return root_cgrp->root;
1102 }
1103
1104 static int cgroup_init_root_id(struct cgroup_root *root)
1105 {
1106         int id;
1107
1108         lockdep_assert_held(&cgroup_mutex);
1109
1110         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1111         if (id < 0)
1112                 return id;
1113
1114         root->hierarchy_id = id;
1115         return 0;
1116 }
1117
1118 static void cgroup_exit_root_id(struct cgroup_root *root)
1119 {
1120         lockdep_assert_held(&cgroup_mutex);
1121
1122         if (root->hierarchy_id) {
1123                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1124                 root->hierarchy_id = 0;
1125         }
1126 }
1127
1128 static void cgroup_free_root(struct cgroup_root *root)
1129 {
1130         if (root) {
1131                 /* hierarchy ID should already have been released */
1132                 WARN_ON_ONCE(root->hierarchy_id);
1133
1134                 idr_destroy(&root->cgroup_idr);
1135                 kfree(root);
1136         }
1137 }
1138
1139 static void cgroup_destroy_root(struct cgroup_root *root)
1140 {
1141         struct cgroup *cgrp = &root->cgrp;
1142         struct cgrp_cset_link *link, *tmp_link;
1143
1144         mutex_lock(&cgroup_mutex);
1145
1146         BUG_ON(atomic_read(&root->nr_cgrps));
1147         BUG_ON(!list_empty(&cgrp->self.children));
1148
1149         /* Rebind all subsystems back to the default hierarchy */
1150         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
1151
1152         /*
1153          * Release all the links from cset_links to this hierarchy's
1154          * root cgroup
1155          */
1156         spin_lock_bh(&css_set_lock);
1157
1158         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1159                 list_del(&link->cset_link);
1160                 list_del(&link->cgrp_link);
1161                 kfree(link);
1162         }
1163
1164         spin_unlock_bh(&css_set_lock);
1165
1166         if (!list_empty(&root->root_list)) {
1167                 list_del(&root->root_list);
1168                 cgroup_root_count--;
1169         }
1170
1171         cgroup_exit_root_id(root);
1172
1173         mutex_unlock(&cgroup_mutex);
1174
1175         kernfs_destroy_root(root->kf_root);
1176         cgroup_free_root(root);
1177 }
1178
1179 /* look up cgroup associated with given css_set on the specified hierarchy */
1180 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1181                                             struct cgroup_root *root)
1182 {
1183         struct cgroup *res = NULL;
1184
1185         lockdep_assert_held(&cgroup_mutex);
1186         lockdep_assert_held(&css_set_lock);
1187
1188         if (cset == &init_css_set) {
1189                 res = &root->cgrp;
1190         } else {
1191                 struct cgrp_cset_link *link;
1192
1193                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1194                         struct cgroup *c = link->cgrp;
1195
1196                         if (c->root == root) {
1197                                 res = c;
1198                                 break;
1199                         }
1200                 }
1201         }
1202
1203         BUG_ON(!res);
1204         return res;
1205 }
1206
/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex and css_set_lock held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroup_root *root)
{
        /*
         * The task itself doesn't need to be locked: with cgroup_mutex
         * held it can't change groups, so the only thing that can
         * happen is that it exits and its css is set back to
         * init_css_set.
         */
        struct css_set *cset = task_css_set(task);

        return cset_cgroup_from_root(cset, root);
}
1221
1222 /*
1223  * A task must hold cgroup_mutex to modify cgroups.
1224  *
1225  * Any task can increment and decrement the count field without lock.
1226  * So in general, code holding cgroup_mutex can't rely on the count
1227  * field not changing.  However, if the count goes to zero, then only
1228  * cgroup_attach_task() can increment it again.  Because a count of zero
1229  * means that no tasks are currently attached, therefore there is no
1230  * way a task attached to that cgroup can fork (the other way to
1231  * increment the count).  So code holding cgroup_mutex can safely
1232  * assume that if the count is zero, it will stay zero. Similarly, if
1233  * a task holds cgroup_mutex on a cgroup with zero count, it
1234  * knows that the cgroup won't be removed, as cgroup_rmdir()
1235  * needs that mutex.
1236  *
1237  * A cgroup can only be deleted if both its 'count' of using tasks
1238  * is zero, and its list of 'children' cgroups is empty.  Since all
1239  * tasks in the system use _some_ cgroup, and since there is always at
1240  * least one task in the system (init, pid == 1), therefore, root cgroup
1241  * always has either children cgroups and/or using tasks.  So we don't
1242  * need a special hack to ensure that root cgroup cannot be deleted.
1243  *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
1246  */
1247
/*
 * Declared here without initializers so that code earlier in the file
 * can refer to them; presumably the initialized definitions appear
 * further down - confirm.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
1250
1251 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1252                               char *buf)
1253 {
1254         struct cgroup_subsys *ss = cft->ss;
1255
1256         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1257             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1258                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1259                          cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1260                          cft->name);
1261         else
1262                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1263         return buf;
1264 }
1265
1266 /**
1267  * cgroup_file_mode - deduce file mode of a control file
1268  * @cft: the control file in question
1269  *
1270  * S_IRUGO for read, S_IWUSR for write.
1271  */
1272 static umode_t cgroup_file_mode(const struct cftype *cft)
1273 {
1274         umode_t mode = 0;
1275
1276         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1277                 mode |= S_IRUGO;
1278
1279         if (cft->write_u64 || cft->write_s64 || cft->write) {
1280                 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1281                         mode |= S_IWUGO;
1282                 else
1283                         mode |= S_IWUSR;
1284         }
1285
1286         return mode;
1287 }
1288
1289 /**
1290  * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
1291  * @cgrp: the target cgroup
1292  * @subtree_control: the new subtree_control mask to consider
1293  *
1294  * On the default hierarchy, a subsystem may request other subsystems to be
1295  * enabled together through its ->depends_on mask.  In such cases, more
1296  * subsystems than specified in "cgroup.subtree_control" may be enabled.
1297  *
1298  * This function calculates which subsystems need to be enabled if
1299  * @subtree_control is to be applied to @cgrp.  The returned mask is always
1300  * a superset of @subtree_control and follows the usual hierarchy rules.
1301  */
1302 static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control)
1303 {
1304         u16 cur_ss_mask = subtree_control;
1305         struct cgroup_subsys *ss;
1306         int ssid;
1307
1308         lockdep_assert_held(&cgroup_mutex);
1309
1310         if (!cgroup_on_dfl(cgrp))
1311                 return cur_ss_mask;
1312
1313         while (true) {
1314                 u16 new_ss_mask = cur_ss_mask;
1315
1316                 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1317                         new_ss_mask |= ss->depends_on;
1318                 } while_each_subsys_mask();
1319
1320                 /*
1321                  * Mask out subsystems which aren't available.  This can
1322                  * happen only if some depended-upon subsystems were bound
1323                  * to non-default hierarchies.
1324                  */
1325                 new_ss_mask &= cgroup_ss_mask(cgrp);
1326
1327                 if (new_ss_mask == cur_ss_mask)
1328                         break;
1329                 cur_ss_mask = new_ss_mask;
1330         }
1331
1332         return cur_ss_mask;
1333 }
1334
1335 /**
1336  * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask
1337  * @cgrp: the target cgroup
1338  *
1339  * Update @cgrp->subtree_ss_mask according to the current
1340  * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask().
1341  */
1342 static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp)
1343 {
1344         cgrp->subtree_ss_mask =
1345                 cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control);
1346 }
1347
1348 /**
1349  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1350  * @kn: the kernfs_node being serviced
1351  *
1352  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1353  * the method finishes if locking succeeded.  Note that once this function
1354  * returns the cgroup returned by cgroup_kn_lock_live() may become
1355  * inaccessible any time.  If the caller intends to continue to access the
1356  * cgroup, it should pin it before invoking this function.
1357  */
1358 static void cgroup_kn_unlock(struct kernfs_node *kn)
1359 {
1360         struct cgroup *cgrp;
1361
1362         if (kernfs_type(kn) == KERNFS_DIR)
1363                 cgrp = kn->priv;
1364         else
1365                 cgrp = kn->parent->priv;
1366
1367         mutex_unlock(&cgroup_mutex);
1368
1369         kernfs_unbreak_active_protection(kn);
1370         cgroup_put(cgrp);
1371 }
1372
1373 /**
1374  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1375  * @kn: the kernfs_node being serviced
1376  *
1377  * This helper is to be used by a cgroup kernfs method currently servicing
1378  * @kn.  It breaks the active protection, performs cgroup locking and
1379  * verifies that the associated cgroup is alive.  Returns the cgroup if
1380  * alive; otherwise, %NULL.  A successful return should be undone by a
1381  * matching cgroup_kn_unlock() invocation.
1382  *
1383  * Any cgroup kernfs method implementation which requires locking the
1384  * associated cgroup should use this helper.  It avoids nesting cgroup
1385  * locking under kernfs active protection and allows all kernfs operations
1386  * including self-removal.
1387  */
1388 static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1389 {
1390         struct cgroup *cgrp;
1391
1392         if (kernfs_type(kn) == KERNFS_DIR)
1393                 cgrp = kn->priv;
1394         else
1395                 cgrp = kn->parent->priv;
1396
1397         /*
1398          * We're gonna grab cgroup_mutex which nests outside kernfs
1399          * active_ref.  cgroup liveliness check alone provides enough
1400          * protection against removal.  Ensure @cgrp stays accessible and
1401          * break the active_ref protection.
1402          */
1403         if (!cgroup_tryget(cgrp))
1404                 return NULL;
1405         kernfs_break_active_protection(kn);
1406
1407         mutex_lock(&cgroup_mutex);
1408
1409         if (!cgroup_is_dead(cgrp))
1410                 return cgrp;
1411
1412         cgroup_kn_unlock(kn);
1413         return NULL;
1414 }
1415
1416 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1417 {
1418         char name[CGROUP_FILE_NAME_MAX];
1419
1420         lockdep_assert_held(&cgroup_mutex);
1421
1422         if (cft->file_offset) {
1423                 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1424                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1425
1426                 spin_lock_irq(&cgroup_file_kn_lock);
1427                 cfile->kn = NULL;
1428                 spin_unlock_irq(&cgroup_file_kn_lock);
1429         }
1430
1431         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1432 }
1433
1434 /**
1435  * css_clear_dir - remove subsys files in a cgroup directory
1436  * @css: taget css
1437  * @cgrp_override: specify if target cgroup is different from css->cgroup
1438  */
1439 static void css_clear_dir(struct cgroup_subsys_state *css,
1440                           struct cgroup *cgrp_override)
1441 {
1442         struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1443         struct cftype *cfts;
1444
1445         if (!(css->flags & CSS_VISIBLE))
1446                 return;
1447
1448         css->flags &= ~CSS_VISIBLE;
1449
1450         list_for_each_entry(cfts, &css->ss->cfts, node)
1451                 cgroup_addrm_files(css, cgrp, cfts, false);
1452 }
1453
1454 /**
1455  * css_populate_dir - create subsys files in a cgroup directory
1456  * @css: target css
1457  * @cgrp_overried: specify if target cgroup is different from css->cgroup
1458  *
1459  * On failure, no file is added.
1460  */
1461 static int css_populate_dir(struct cgroup_subsys_state *css,
1462                             struct cgroup *cgrp_override)
1463 {
1464         struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1465         struct cftype *cfts, *failed_cfts;
1466         int ret;
1467
1468         if (css->flags & CSS_VISIBLE)
1469                 return 0;
1470
1471         if (!css->ss) {
1472                 if (cgroup_on_dfl(cgrp))
1473                         cfts = cgroup_dfl_base_files;
1474                 else
1475                         cfts = cgroup_legacy_base_files;
1476
1477                 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1478         }
1479
1480         list_for_each_entry(cfts, &css->ss->cfts, node) {
1481                 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1482                 if (ret < 0) {
1483                         failed_cfts = cfts;
1484                         goto err;
1485                 }
1486         }
1487
1488         css->flags |= CSS_VISIBLE;
1489
1490         return 0;
1491 err:
1492         list_for_each_entry(cfts, &css->ss->cfts, node) {
1493                 if (cfts == failed_cfts)
1494                         break;
1495                 cgroup_addrm_files(css, cgrp, cfts, false);
1496         }
1497         return ret;
1498 }
1499
/**
 * rebind_subsystems - move the subsystems in @ss_mask to @dst_root
 * @dst_root: the hierarchy the subsystems are being bound to
 * @ss_mask: bitmask of the IDs of the subsystems to move
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns 0 on success.  Returns -EBUSY, without changing anything, if a
 * subsystem's css on its current root cgroup has a child css, or if the
 * move would be between two roots neither of which is the default root.
 * If creating the files on a non-default @dst_root fails, the files
 * created so far are removed and the error from css_populate_dir() is
 * returned.  File creation failures on the default root only produce a
 * warning (when cgrp_dfl_visible is set) and the rebind still goes ahead.
 */
static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        u16 tmp_ss_mask;
        int ssid, i, ret;

        lockdep_assert_held(&cgroup_mutex);

        /* Pass 1: validate every subsystem before touching anything. */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /* if @ss has non-root csses attached to it, can't move */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;
        } while_each_subsys_mask();

        /* skip creating root files on dfl_root for inhibited subsystems */
        tmp_ss_mask = ss_mask;
        if (dst_root == &cgrp_dfl_root)
                tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask;

        /* Pass 2: create the subsystems' files in the destination root. */
        do_each_subsys_mask(ss, ssid, tmp_ss_mask) {
                struct cgroup *scgrp = &ss->root->cgrp;
                int tssid;

                ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
                if (!ret)
                        continue;

                /*
                 * Rebinding back to the default root is not allowed to
                 * fail.  Using both default and non-default roots should
                 * be rare.  Moving subsystems back and forth even more so.
                 * Just warn about it and continue.
                 */
                if (dst_root == &cgrp_dfl_root) {
                        if (cgrp_dfl_visible) {
                                pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
                                        ret, ss_mask);
                                pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
                        }
                        continue;
                }

                /*
                 * Roll back: remove the files of the subsystems handled
                 * before the failing one.  @ss is reused as the iterator,
                 * which is fine because we return right after.  @scgrp is
                 * still the failing subsystem's source root cgroup; since
                 * @dst_root is not the default root here, pass 1 has
                 * ensured every subsystem's ->root is cgrp_dfl_root, so
                 * they all share that source.
                 */
                do_each_subsys_mask(ss, tssid, tmp_ss_mask) {
                        if (tssid == ssid)
                                break;
                        css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
                } while_each_subsys_mask();
                return ret;
        } while_each_subsys_mask();

        /*
         * Nothing can fail from this point on.  Remove files for the
         * removed subsystems and rebind each subsystem.
         */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset;

                /* the source must have a css and the destination must not */
                WARN_ON(!css || cgroup_css(dcgrp, ss));

                /* remove the files from the source root's directory */
                css_clear_dir(css, NULL);

                /* move the css from the source root cgroup to @dcgrp */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = dcgrp;

                /* every hashed css_set now hangs off @dcgrp for this subsystem */
                spin_lock_bh(&css_set_lock);
                hash_for_each(css_set_table, i, cset, hlist)
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                spin_unlock_bh(&css_set_lock);

                /* the source root no longer hosts or controls @ss */
                src_root->subsys_mask &= ~(1 << ssid);
                scgrp->subtree_control &= ~(1 << ssid);
                cgroup_refresh_subtree_ss_mask(scgrp);

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        cgroup_refresh_subtree_ss_mask(dcgrp);
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* let the subsystem react to the new binding, if it cares */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}
1601
     /*
      * Write the mount options describing the hierarchy behind @kf_root to
      * @seq.  Always returns 0.
      *
      * For any root other than cgrp_dfl_root, the legacy name of every
      * subsystem whose bit is set in root->subsys_mask is emitted first.
      * Then come the "noprefix" and "xattr" root flags, the release agent
      * path (if non-empty), "clone_children" and finally the hierarchy name
      * (if non-empty).
      */
1602 static int cgroup_show_options(struct seq_file *seq,
1603                                struct kernfs_root *kf_root)
1604 {
1605         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1606         struct cgroup_subsys *ss;
1607         int ssid;
1608
             /* subsystem names are only listed for non-default hierarchies */
1609         if (root != &cgrp_dfl_root)
1610                 for_each_subsys(ss, ssid)
1611                         if (root->subsys_mask & (1 << ssid))
1612                                 seq_show_option(seq, ss->legacy_name, NULL);
1613         if (root->flags & CGRP_ROOT_NOPREFIX)
1614                 seq_puts(seq, ",noprefix");
1615         if (root->flags & CGRP_ROOT_XATTR)
1616                 seq_puts(seq, ",xattr");
1617
             /*
              * release_agent_path is written under release_agent_path_lock
              * (see cgroup_remount()), so read it under the same lock.
              */
1618         spin_lock(&release_agent_path_lock);
1619         if (strlen(root->release_agent_path))
1620                 seq_show_option(seq, "release_agent",
1621                                 root->release_agent_path);
1622         spin_unlock(&release_agent_path_lock);
1623
1624         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1625                 seq_puts(seq, ",clone_children");
1626         if (strlen(root->name))
1627                 seq_show_option(seq, "name", root->name);
1628         return 0;
1629 }
1630
     /*
      * Mount options as parsed by parse_cgroupfs_options().  ->release_agent
      * and ->name are kstrndup()'d there; the callers visible in this file
      * (cgroup_remount() and cgroup_mount()) kfree() both when done.
      */
1631 struct cgroup_sb_opts {
             /* one bit per subsystem id requested for the hierarchy */
1632         u16 subsys_mask;
             /* CGRP_ROOT_* flags; the parser sets NOPREFIX and XATTR */
1633         unsigned int flags;
             /* value of "release_agent=", or NULL if not given */
1634         char *release_agent;
             /* "clone_children" was given */
1635         bool cpuset_clone_children;
             /* value of "name=", or NULL if not given */
1636         char *name;
1637         /* User explicitly requested empty subsystem */
1638         bool none;
1639 };
1640
1641 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1642 {
1643         char *token, *o = data;
1644         bool all_ss = false, one_ss = false;
1645         u16 mask = U16_MAX;
1646         struct cgroup_subsys *ss;
1647         int nr_opts = 0;
1648         int i;
1649
1650 #ifdef CONFIG_CPUSETS
1651         mask = ~((u16)1 << cpuset_cgrp_id);
1652 #endif
1653
1654         memset(opts, 0, sizeof(*opts));
1655
1656         while ((token = strsep(&o, ",")) != NULL) {
1657                 nr_opts++;
1658
1659                 if (!*token)
1660                         return -EINVAL;
1661                 if (!strcmp(token, "none")) {
1662                         /* Explicitly have no subsystems */
1663                         opts->none = true;
1664                         continue;
1665                 }
1666                 if (!strcmp(token, "all")) {
1667                         /* Mutually exclusive option 'all' + subsystem name */
1668                         if (one_ss)
1669                                 return -EINVAL;
1670                         all_ss = true;
1671                         continue;
1672                 }
1673                 if (!strcmp(token, "noprefix")) {
1674                         opts->flags |= CGRP_ROOT_NOPREFIX;
1675                         continue;
1676                 }
1677                 if (!strcmp(token, "clone_children")) {
1678                         opts->cpuset_clone_children = true;
1679                         continue;
1680                 }
1681                 if (!strcmp(token, "xattr")) {
1682                         opts->flags |= CGRP_ROOT_XATTR;
1683                         continue;
1684                 }
1685                 if (!strncmp(token, "release_agent=", 14)) {
1686                         /* Specifying two release agents is forbidden */
1687                         if (opts->release_agent)
1688                                 return -EINVAL;
1689                         opts->release_agent =
1690                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1691                         if (!opts->release_agent)
1692                                 return -ENOMEM;
1693                         continue;
1694                 }
1695                 if (!strncmp(token, "name=", 5)) {
1696                         const char *name = token + 5;
1697                         /* Can't specify an empty name */
1698                         if (!strlen(name))
1699                                 return -EINVAL;
1700                         /* Must match [\w.-]+ */
1701                         for (i = 0; i < strlen(name); i++) {
1702                                 char c = name[i];
1703                                 if (isalnum(c))
1704                                         continue;
1705                                 if ((c == '.') || (c == '-') || (c == '_'))
1706                                         continue;
1707                                 return -EINVAL;
1708                         }
1709                         /* Specifying two names is forbidden */
1710                         if (opts->name)
1711                                 return -EINVAL;
1712                         opts->name = kstrndup(name,
1713                                               MAX_CGROUP_ROOT_NAMELEN - 1,
1714                                               GFP_KERNEL);
1715                         if (!opts->name)
1716                                 return -ENOMEM;
1717
1718                         continue;
1719                 }
1720
1721                 for_each_subsys(ss, i) {
1722                         if (strcmp(token, ss->legacy_name))
1723                                 continue;
1724                         if (!cgroup_ssid_enabled(i))
1725                                 continue;
1726                         if (cgroup_ssid_no_v1(i))
1727                                 continue;
1728
1729                         /* Mutually exclusive option 'all' + subsystem name */
1730                         if (all_ss)
1731                                 return -EINVAL;
1732                         opts->subsys_mask |= (1 << i);
1733                         one_ss = true;
1734
1735                         break;
1736                 }
1737                 if (i == CGROUP_SUBSYS_COUNT)
1738                         return -ENOENT;
1739         }
1740
1741         /*
1742          * If the 'all' option was specified select all the subsystems,
1743          * otherwise if 'none', 'name=' and a subsystem name options were
1744          * not specified, let's default to 'all'
1745          */
1746         if (all_ss || (!one_ss && !opts->none && !opts->name))
1747                 for_each_subsys(ss, i)
1748                         if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
1749                                 opts->subsys_mask |= (1 << i);
1750
1751         /*
1752          * We either have to specify by name or by subsystems. (So all
1753          * empty hierarchies must have a name).
1754          */
1755         if (!opts->subsys_mask && !opts->name)
1756                 return -EINVAL;
1757
1758         /*
1759          * Option noprefix was introduced just for backward compatibility
1760          * with the old cpuset, so we allow noprefix only if mounting just
1761          * the cpuset subsystem.
1762          */
1763         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1764                 return -EINVAL;
1765
1766         /* Can't specify "none" and some subsystems */
1767         if (opts->subsys_mask && opts->none)
1768                 return -EINVAL;
1769
1770         return 0;
1771 }
1772
     /*
      * cgroup_remount - handle a remount of an existing cgroup hierarchy
      * @kf_root: kernfs root of the hierarchy being remounted
      * @flags: not used by this function
      * @data: mount option string, parsed by parse_cgroupfs_options()
      *
      * Remounting the default hierarchy is rejected with -EINVAL.  For other
      * hierarchies the options are re-parsed under cgroup_mutex; root flags
      * and name must not change (-EINVAL) and the root cgroup must have no
      * children (-EBUSY).  Subsystems newly requested are bound to @kf_root's
      * hierarchy, subsystems no longer requested are rebound to
      * cgrp_dfl_root, and a new release agent path, if given, is installed.
      *
      * Returns 0 on success or a negative errno.
      */
1773 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1774 {
1775         int ret = 0;
1776         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1777         struct cgroup_sb_opts opts;
1778         u16 added_mask, removed_mask;
1779
1780         if (root == &cgrp_dfl_root) {
1781                 pr_err("remount is not allowed\n");
1782                 return -EINVAL;
1783         }
1784
1785         mutex_lock(&cgroup_mutex);
1786
1787         /* See what subsystems are wanted */
1788         ret = parse_cgroupfs_options(data, &opts);
1789         if (ret)
1790                 goto out_unlock;
1791
1792         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1793                 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1794                         task_tgid_nr(current), current->comm);
1795
1796         added_mask = opts.subsys_mask & ~root->subsys_mask;
1797         removed_mask = root->subsys_mask & ~opts.subsys_mask;
1798
1799         /* Don't allow flags or name to change at remount */
1800         if ((opts.flags ^ root->flags) ||
1801             (opts.name && strcmp(opts.name, root->name))) {
1802                 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1803                        opts.flags, opts.name ?: "", root->flags, root->name);
1804                 ret = -EINVAL;
1805                 goto out_unlock;
1806         }
1807
1808         /* remounting is not allowed for populated hierarchies */
1809         if (!list_empty(&root->cgrp.self.children)) {
1810                 ret = -EBUSY;
1811                 goto out_unlock;
1812         }
1813
1814         ret = rebind_subsystems(root, added_mask);
1815         if (ret)
1816                 goto out_unlock;
1817
             /*
              * The additions above are already in effect, so a failure to
              * hand the removed subsystems back to the default hierarchy
              * is not unwound here.  Don't let it pass silently though.
              */
1818         WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1819
1820         if (opts.release_agent) {
1821                 spin_lock(&release_agent_path_lock);
1822                 strcpy(root->release_agent_path, opts.release_agent);
1823                 spin_unlock(&release_agent_path_lock);
1824         }
1825  out_unlock:
             /* parse_cgroupfs_options() zeroes opts, so these are NULL or owned */
1826         kfree(opts.release_agent);
1827         kfree(opts.name);
1828         mutex_unlock(&cgroup_mutex);
1829         return ret;
1830 }
1831
1832 /*
1833  * To reduce the fork() overhead for systems that are not actually using
1834  * their cgroups capability, we don't maintain the lists running through
1835  * each css_set to its tasks until we see the list actually used - in other
1836  * words after the first mount.
      *
      * The flag is set to true, under css_set_lock, by
      * cgroup_enable_task_cg_lists(); cgroup_mount() calls that function
      * when it finds the flag still clear.
1837  */
1838 static bool use_task_css_set_links __read_mostly;
1839
     /*
      * Turn on the per-css_set task lists and link every existing task.
      *
      * Under css_set_lock: if use_task_css_set_links is already set this is
      * a no-op.  Otherwise the flag is set and all threads are walked under
      * tasklist_lock; each task that is not PF_EXITING is appended to the
      * ->tasks list of its css_set and a reference on that css_set is taken.
      * A css_set that was not yet populated is marked populated first.
      */
1840 static void cgroup_enable_task_cg_lists(void)
1841 {
1842         struct task_struct *p, *g;
1843
1844         spin_lock_bh(&css_set_lock);
1845
             /* somebody else already did the work */
1846         if (use_task_css_set_links)
1847                 goto out_unlock;
1848
1849         use_task_css_set_links = true;
1850
1851         /*
1852          * We need tasklist_lock because RCU is not safe against
1853          * while_each_thread(). Besides, a forking task that has passed
1854          * cgroup_post_fork() without seeing use_task_css_set_links = 1
1855          * is not guaranteed to have its child immediately visible in the
1856          * tasklist if we walk through it with RCU.
1857          */
1858         read_lock(&tasklist_lock);
1859         do_each_thread(g, p) {
                     /* no task is expected to be linked or off init_css_set yet */
1860                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1861                              task_css_set(p) != &init_css_set);
1862
1863                 /*
1864                  * We should check if the process is exiting, otherwise
1865                  * it will race with cgroup_exit() in that the list
1866                  * entry won't be deleted though the process has exited.
1867                  * Do it while holding siglock so that we don't end up
1868                  * racing against cgroup_exit().
1869                  */
1870                 spin_lock_irq(&p->sighand->siglock);
1871                 if (!(p->flags & PF_EXITING)) {
1872                         struct css_set *cset = task_css_set(p);
1873
1874                         if (!css_set_populated(cset))
1875                                 css_set_update_populated(cset, true);
1876                         list_add_tail(&p->cg_list, &cset->tasks);
                             /* the task's list entry holds its own cset reference */
1877                         get_css_set(cset);
1878                 }
1879                 spin_unlock_irq(&p->sighand->siglock);
1880         } while_each_thread(g, p);
1881         read_unlock(&tasklist_lock);
1882 out_unlock:
1883         spin_unlock_bh(&css_set_lock);
1884 }
1885
     /*
      * Initialise the bookkeeping members of @cgrp: its list heads (self
      * sibling/children, cset_links, pidlists and one e_csets list per
      * subsystem), the pidlist mutex, the offline wait queue and the
      * release agent work item.  The cgroup's own css (->self) is pointed
      * back at @cgrp and flagged CSS_ONLINE.
      */
1886 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1887 {
1888         struct cgroup_subsys *ss;
1889         int ssid;
1890
1891         INIT_LIST_HEAD(&cgrp->self.sibling);
1892         INIT_LIST_HEAD(&cgrp->self.children);
1893         INIT_LIST_HEAD(&cgrp->cset_links);
1894         INIT_LIST_HEAD(&cgrp->pidlists);
1895         mutex_init(&cgrp->pidlist_mutex);
1896         cgrp->self.cgroup = cgrp;
1897         cgrp->self.flags |= CSS_ONLINE;
1898
1899         for_each_subsys(ss, ssid)
1900                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1901
1902         init_waitqueue_head(&cgrp->offline_waitq);
1903         INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1904 }
1905
     /*
      * Initialise @root and its embedded root cgroup from the parsed mount
      * options in @opts: flags are copied, the release agent path and name
      * are copied when present, and CGRP_CPUSET_CLONE_CHILDREN is set on the
      * root cgroup when requested.  nr_cgrps starts at 1.
      *
      * The strings in @opts are copied, not taken over; @opts keeps
      * ownership of its allocations.
      */
1906 static void init_cgroup_root(struct cgroup_root *root,
1907                              struct cgroup_sb_opts *opts)
1908 {
1909         struct cgroup *cgrp = &root->cgrp;
1910
1911         INIT_LIST_HEAD(&root->root_list);
1912         atomic_set(&root->nr_cgrps, 1);
1913         cgrp->root = root;
1914         init_cgroup_housekeeping(cgrp);
1915         idr_init(&root->cgroup_idr);
1916
1917         root->flags = opts->flags;
             /*
              * parse_cgroupfs_options() caps these strings at PATH_MAX - 1 and
              * MAX_CGROUP_ROOT_NAMELEN - 1 characters respectively.
              * NOTE(review): the strcpy() calls rely on the destination
              * arrays being at least that large - confirm against the
              * definition of struct cgroup_root.
              */
1918         if (opts->release_agent)
1919                 strcpy(root->release_agent_path, opts->release_agent);
1920         if (opts->name)
1921                 strcpy(root->name, opts->name);
1922         if (opts->cpuset_clone_children)
1923                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1924 }
1925
     /*
      * cgroup_setup_root - bring an initialised cgroup_root into service
      * @root: root to set up
      * @ss_mask: subsystems to bind to @root via rebind_subsystems()
      *
      * Must be called with cgroup_mutex held.  Allocates an id for the root
      * cgroup, initialises its refcount, allocates a hierarchy id, creates
      * the (initially deactivated) kernfs root, populates the root
      * directory, binds the subsystems, adds @root to cgroup_roots and
      * links the root cgroup into every existing css_set before activating
      * the kernfs nodes.
      *
      * Returns 0 on success or the negative error of the step that failed,
      * in which case the later steps are undone through the labels at the
      * bottom.
      */
1926 static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1927 {
1928         LIST_HEAD(tmp_links);
1929         struct cgroup *root_cgrp = &root->cgrp;
1930         struct css_set *cset;
1931         int i, ret;
1932
1933         lockdep_assert_held(&cgroup_mutex);
1934
1935         ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1936         if (ret < 0)
1937                 goto out;
1938         root_cgrp->id = ret;
1939         root_cgrp->ancestor_ids[0] = ret;
1940
1941         ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1942                               GFP_KERNEL);
1943         if (ret)
1944                 goto out;
1945
1946         /*
1947          * We're accessing css_set_count without locking css_set_lock here,
1948          * but that's OK - it can only be increased by someone holding
1949          * cgroup_lock, and that's us. The worst that can happen is that we
1950          * have some link structures left over
1951          */
1952         ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1953         if (ret)
1954                 goto cancel_ref;
1955
1956         ret = cgroup_init_root_id(root);
1957         if (ret)
1958                 goto cancel_ref;
1959
1960         root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1961                                            KERNFS_ROOT_CREATE_DEACTIVATED,
1962                                            root_cgrp);
1963         if (IS_ERR(root->kf_root)) {
1964                 ret = PTR_ERR(root->kf_root);
1965                 goto exit_root_id;
1966         }
1967         root_cgrp->kn = root->kf_root->kn;
1968
1969         ret = css_populate_dir(&root_cgrp->self, NULL);
1970         if (ret)
1971                 goto destroy_root;
1972
1973         ret = rebind_subsystems(root, ss_mask);
1974         if (ret)
1975                 goto destroy_root;
1976
1977         /*
1978          * There must be no failure case after here, since rebinding takes
1979          * care of subsystems' refcounts, which are explicitly dropped in
1980          * the failure exit path.
1981          */
1982         list_add(&root->root_list, &cgroup_roots);
1983         cgroup_root_count++;
1984
1985         /*
1986          * Link the root cgroup in this hierarchy into all the css_set
1987          * objects.
1988          */
1989         spin_lock_bh(&css_set_lock);
1990         hash_for_each(css_set_table, i, cset, hlist) {
1991                 link_css_set(&tmp_links, cset, root_cgrp);
1992                 if (css_set_populated(cset))
1993                         cgroup_update_populated(root_cgrp, true);
1994         }
1995         spin_unlock_bh(&css_set_lock);
1996
1997         BUG_ON(!list_empty(&root_cgrp->self.children));
1998         BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1999
2000         kernfs_activate(root_cgrp->kn);
2001         ret = 0;
             /* success shares only the final link cleanup with the error path */
2002         goto out;
2003
2004 destroy_root:
2005         kernfs_destroy_root(root->kf_root);
2006         root->kf_root = NULL;
2007 exit_root_id:
2008         cgroup_exit_root_id(root);
2009 cancel_ref:
2010         percpu_ref_exit(&root_cgrp->self.refcnt);
2011 out:
             /* frees whatever is still on tmp_links, on success and failure alike */
2012         free_cgrp_cset_links(&tmp_links);
2013         return ret;
2014 }
2015
     /*
      * cgroup_mount - mount callback shared by the "cgroup" and "cgroup2"
      *                filesystem types
      * @fs_type: &cgroup_fs_type or &cgroup2_fs_type
      * @flags: mount flags, passed on to kernfs_mount()
      * @unused_dev_name: ignored
      * @data: mount option string
      *
      * For cgroup2 no options are accepted and the default hierarchy
      * (cgrp_dfl_root) is mounted.  Otherwise the options are parsed and an
      * existing hierarchy matching the requested name and/or subsystems is
      * reused; if there is none, a new root is allocated and set up.
      *
      * While a matching root, or the root currently hosting a requested
      * subsystem, is still being torn down, the function sleeps briefly and
      * restarts the syscall.
      *
      * Returns the dentry from kernfs_mount() or an ERR_PTR().
      */
2016 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2017                          int flags, const char *unused_dev_name,
2018                          void *data)
2019 {
2020         bool is_v2 = fs_type == &cgroup2_fs_type;
2021         struct super_block *pinned_sb = NULL;
2022         struct cgroup_subsys *ss;
2023         struct cgroup_root *root;
2024         struct cgroup_sb_opts opts;
2025         struct dentry *dentry;
2026         int ret;
2027         int i;
2028         bool new_sb;
2029
2030         /*
2031          * The first time anyone tries to mount a cgroup, enable the list
2032          * linking each css_set to its tasks and fix up all existing tasks.
2033          */
2034         if (!use_task_css_set_links)
2035                 cgroup_enable_task_cg_lists();
2036
2037         if (is_v2) {
2038                 if (data) {
2039                         pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
2040                         return ERR_PTR(-EINVAL);
2041                 }
2042                 cgrp_dfl_visible = true;
2043                 root = &cgrp_dfl_root;
                     /* dropped again below unless kernfs_mount() creates a new sb */
2044                 cgroup_get(&root->cgrp);
2045                 goto out_mount;
2046         }
2047
2048         mutex_lock(&cgroup_mutex);
2049
2050         /* First find the desired set of subsystems */
2051         ret = parse_cgroupfs_options(data, &opts);
2052         if (ret)
2053                 goto out_unlock;
2054
2055         /*
2056          * Destruction of cgroup root is asynchronous, so subsystems may
2057          * still be dying after the previous unmount.  Let's drain the
2058          * dying subsystems.  We just need to ensure that the ones
2059          * unmounted previously finish dying and don't care about new ones
2060          * starting.  Testing ref liveliness is good enough.
2061          */
2062         for_each_subsys(ss, i) {
2063                 if (!(opts.subsys_mask & (1 << i)) ||
2064                     ss->root == &cgrp_dfl_root)
2065                         continue;
2066
2067                 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
                             /* cgroup_mutex is released here, hence out_free */
2068                         mutex_unlock(&cgroup_mutex);
2069                         msleep(10);
2070                         ret = restart_syscall();
2071                         goto out_free;
2072                 }
                     /* only liveness was being tested; drop the ref again */
2073                 cgroup_put(&ss->root->cgrp);
2074         }
2075
2076         for_each_root(root) {
2077                 bool name_match = false;
2078
2079                 if (root == &cgrp_dfl_root)
2080                         continue;
2081
2082                 /*
2083                  * If we asked for a name then it must match.  Also, if
2084                  * name matches but subsys_mask doesn't, we should fail.
2085                  * Remember whether name matched.
2086                  */
2087                 if (opts.name) {
2088                         if (strcmp(opts.name, root->name))
2089                                 continue;
2090                         name_match = true;
2091                 }
2092
2093                 /*
2094                  * If we asked for subsystems (or explicitly for no
2095                  * subsystems) then they must match.
2096                  */
2097                 if ((opts.subsys_mask || opts.none) &&
2098                     (opts.subsys_mask != root->subsys_mask)) {
2099                         if (!name_match)
2100                                 continue;
2101                         ret = -EBUSY;
2102                         goto out_unlock;
2103                 }
2104
2105                 if (root->flags ^ opts.flags)
2106                         pr_warn("new mount options do not match the existing superblock, will be ignored\n");
2107
2108                 /*
2109                  * We want to reuse @root whose lifetime is governed by its
2110                  * ->cgrp.  Let's check whether @root is alive and keep it
2111                  * that way.  As cgroup_kill_sb() can happen anytime, we
2112                  * want to block it by pinning the sb so that @root doesn't
2113                  * get killed before mount is complete.
2114                  *
2115                  * With the sb pinned, tryget_live can reliably indicate
2116                  * whether @root can be reused.  If it's being killed,
2117                  * drain it.  We can use wait_queue for the wait but this
2118                  * path is super cold.  Let's just sleep a bit and retry.
2119                  */
2120                 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
2121                 if (IS_ERR(pinned_sb) ||
2122                     !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
2123                         mutex_unlock(&cgroup_mutex);
2124                         if (!IS_ERR_OR_NULL(pinned_sb))
2125                                 deactivate_super(pinned_sb);
2126                         msleep(10);
2127                         ret = restart_syscall();
2128                         goto out_free;
2129                 }
2130
                     /* reuse @root; the ref taken by tryget_live is kept for the mount */
2131                 ret = 0;
2132                 goto out_unlock;
2133         }
2134
2135         /*
2136          * No such thing, create a new one.  name= matching without subsys
2137          * specification is allowed for already existing hierarchies but we
2138          * can't create new one without subsys specification.
2139          */
2140         if (!opts.subsys_mask && !opts.none) {
2141                 ret = -EINVAL;
2142                 goto out_unlock;
2143         }
2144
2145         root = kzalloc(sizeof(*root), GFP_KERNEL);
2146         if (!root) {
2147                 ret = -ENOMEM;
2148                 goto out_unlock;
2149         }
2150
2151         init_cgroup_root(root, &opts);
2152
2153         ret = cgroup_setup_root(root, opts.subsys_mask);
2154         if (ret)
2155                 cgroup_free_root(root);
2156
2157 out_unlock:
2158         mutex_unlock(&cgroup_mutex);
2159 out_free:
2160         kfree(opts.release_agent);
2161         kfree(opts.name);
2162
2163         if (ret)
2164                 return ERR_PTR(ret);
2165 out_mount:
2166         dentry = kernfs_mount(fs_type, flags, root->kf_root,
2167                               is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
2168                               &new_sb);
             /* the root reference is only kept when a new superblock was created */
2169         if (IS_ERR(dentry) || !new_sb)
2170                 cgroup_put(&root->cgrp);
2171
2172         /*
2173          * If @pinned_sb, we're reusing an existing root and holding an
2174          * extra ref on its sb.  Mount is complete.  Put the extra ref.
2175          */
2176         if (pinned_sb) {
2177                 WARN_ON(new_sb);
2178                 deactivate_super(pinned_sb);
2179         }
2180
2181         return dentry;
2182 }
2183
     /*
      * kill_sb callback of both cgroup filesystem types.  Either drops one
      * reference on the root cgroup or starts killing its refcount (see the
      * comment below for the condition), then hands @sb to kernfs_kill_sb().
      */
2184 static void cgroup_kill_sb(struct super_block *sb)
2185 {
2186         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2187         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2188
2189         /*
2190          * If @root doesn't have any mounts or children, start killing it.
2191          * This prevents new mounts by disabling percpu_ref_tryget_live().
2192          * cgroup_mount() may wait for @root's release.
2193          *
2194          * And don't kill the default root.
2195          */
2196         if (!list_empty(&root->cgrp.self.children) ||
2197             root == &cgrp_dfl_root)
2198                 cgroup_put(&root->cgrp);
2199         else
2200                 percpu_ref_kill(&root->cgrp.self.refcnt);
2201
2202         kernfs_kill_sb(sb);
2203 }
2204
     /* The "cgroup" filesystem type; cgroup_mount() treats it as not-v2. */
2205 static struct file_system_type cgroup_fs_type = {
2206         .name = "cgroup",
2207         .mount = cgroup_mount,
2208         .kill_sb = cgroup_kill_sb,
2209 };
2210
     /*
      * The "cgroup2" filesystem type.  cgroup_mount() recognises it by
      * address and mounts the default hierarchy for it.
      */
2211 static struct file_system_type cgroup2_fs_type = {
2212         .name = "cgroup2",
2213         .mount = cgroup_mount,
2214         .kill_sb = cgroup_kill_sb,
2215 };
2216
2217 /**
2218  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
2219  * @task: target task
2220  * @buf: the buffer to write the path into
2221  * @buflen: the length of the buffer
2222  *
2223  * Determine @task's cgroup on the first (the one with the lowest non-zero
2224  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
2225  * function grabs cgroup_mutex and shouldn't be used inside locks used by
2226  * cgroup controller callbacks.
2227  *
2228  * Return value is the same as kernfs_path().
      *
      * If no such hierarchy exists, "/" is copied into @buf instead; @buf is
      * returned when strlcpy() reports a length below @buflen and %NULL
      * otherwise.
2229  */
2230 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2231 {
2232         struct cgroup_root *root;
2233         struct cgroup *cgrp;
             /* the lookup below starts at id 1, skipping hierarchy id 0 */
2234         int hierarchy_id = 1;
2235         char *path = NULL;
2236
2237         mutex_lock(&cgroup_mutex);
2238         spin_lock_bh(&css_set_lock);
2239
2240         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2241
2242         if (root) {
2243                 cgrp = task_cgroup_from_root(task, root);
2244                 path = cgroup_path(cgrp, buf, buflen);
2245         } else {
2246                 /* if no hierarchy exists, everyone is in "/" */
2247                 if (strlcpy(buf, "/", buflen) < buflen)
2248                         path = buf;
2249         }
2250
2251         spin_unlock_bh(&css_set_lock);
2252         mutex_unlock(&cgroup_mutex);
2253         return path;
2254 }
2255 EXPORT_SYMBOL_GPL(task_cgroup_path);
2256
2257 /* used to track tasks and other necessary states during migration */
2258 struct cgroup_taskset {
2259         /* the src and dst cset list running through cset->mg_node */
2260         struct list_head        src_csets;
2261         struct list_head        dst_csets;
2262
2263         /* the subsys currently being processed */
2264         int                     ssid;
2265
2266         /*
2267          * Fields for cgroup_taskset_*() iteration.
2268          *
2269          * Before migration is committed, the target migration tasks are on
2270          * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
2271          * the csets on ->dst_csets.  ->csets point to either ->src_csets
2272          * or ->dst_csets depending on whether migration is committed.
2273          *
2274          * ->cur_cset and ->cur_task point to the current task position
2275          * during iteration.
2276          */
2277         struct list_head        *csets;
2278         struct css_set          *cur_cset;
2279         struct task_struct      *cur_task;
2280 };
2281
     /*
      * Initialiser for a struct cgroup_taskset named @tset: both cset lists
      * start out empty and ->csets points at ->src_csets, i.e. the
      * "migration not yet committed" state.  The remaining members are left
      * to the compound literal's implicit zero initialisation.
      */
2282 #define CGROUP_TASKSET_INIT(tset)       (struct cgroup_taskset){        \
2283         .src_csets              = LIST_HEAD_INIT(tset.src_csets),       \
2284         .dst_csets              = LIST_HEAD_INIT(tset.dst_csets),       \
2285         .csets                  = &tset.src_csets,                      \
2286 }
2287
2288 /**
2289  * cgroup_taskset_add - try to add a migration target task to a taskset
2290  * @task: target task
2291  * @tset: target taskset
2292  *
2293  * Add @task, which is a migration target, to @tset.  This function becomes
2294  * noop if @task doesn't need to be migrated.  @task's css_set should have
2295  * been added as a migration source and @task->cg_list will be moved from
2296  * the css_set's tasks list to mg_tasks one.
      *
      * Must be called with css_set_lock held.
2297  */
2298 static void cgroup_taskset_add(struct task_struct *task,
2299                                struct cgroup_taskset *tset)
2300 {
2301         struct css_set *cset;
2302
2303         lockdep_assert_held(&css_set_lock);
2304
2305         /* @task either already exited or can't exit until the end */
2306         if (task->flags & PF_EXITING)
2307                 return;
2308
2309         /* leave @task alone if post_fork() hasn't linked it yet */
2310         if (list_empty(&task->cg_list))
2311                 return;
2312
2313         cset = task_css_set(task);
             /* @task's css_set was not set up as a migration source */
2314         if (!cset->mg_src_cgrp)
2315                 return;
2316
2317         list_move_tail(&task->cg_list, &cset->mg_tasks);
             /* put the source and destination csets on @tset's lists once each */
2318         if (list_empty(&cset->mg_node))
2319                 list_add_tail(&cset->mg_node, &tset->src_csets);
2320         if (list_empty(&cset->mg_dst_cset->mg_node))
2321                 list_move_tail(&cset->mg_dst_cset->mg_node,
2322                                &tset->dst_csets);
2323 }
2324
2325 /**
2326  * cgroup_taskset_first - reset taskset and return the first task
2327  * @tset: taskset of interest
2328  * @dst_cssp: output variable for the destination css
2329  *
2330  * @tset iteration is initialized and the first task is returned.
      *
      * The iteration position is reset to the first css_set on @tset->csets
      * and the result of cgroup_taskset_next() from there is returned.
2331  */
2332 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2333                                          struct cgroup_subsys_state **dst_cssp)
2334 {
2335         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2336         tset->cur_task = NULL;
2337
2338         return cgroup_taskset_next(tset, dst_cssp);
2339 }
2340
2341 /**
2342  * cgroup_taskset_next - iterate to the next task in taskset
2343  * @tset: taskset of interest
2344  * @dst_cssp: output variable for the destination css
2345  *
2346  * Return the next task in @tset.  Iteration must have been initialized
2347  * with cgroup_taskset_first().
      *
      * Returns %NULL once every css_set on @tset->csets has been walked.
      * *@dst_cssp is written only when a task is returned; it is the css for
      * subsystem @tset->ssid taken from the destination css_set if one is
      * set, otherwise from the css_set the task was found on.
2348  */
2349 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2350                                         struct cgroup_subsys_state **dst_cssp)
2351 {
2352         struct css_set *cset = tset->cur_cset;
2353         struct task_struct *task = tset->cur_task;
2354
             /* stop when the walk over the cset list is back at the list head */
2355         while (&cset->mg_node != tset->csets) {
2356                 if (!task)
2357                         task = list_first_entry(&cset->mg_tasks,
2358                                                 struct task_struct, cg_list);
2359                 else
2360                         task = list_next_entry(task, cg_list);
2361
                     /* a real entry, i.e. not the ->mg_tasks list head itself */
2362                 if (&task->cg_list != &cset->mg_tasks) {
2363                         tset->cur_cset = cset;
2364                         tset->cur_task = task;
2365
2366                         /*
2367                          * This function may be called both before and
2368                          * after cgroup_taskset_migrate().  The two cases
2369                          * can be distinguished by looking at whether @cset
2370                          * has its ->mg_dst_cset set.
2371                          */
2372                         if (cset->mg_dst_cset)
2373                                 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2374                         else
2375                                 *dst_cssp = cset->subsys[tset->ssid];
2376
2377                         return task;
2378                 }
2379
                     /* this cset has no more tasks; move on to the next one */
2380                 cset = list_next_entry(cset, mg_node);
2381                 task = NULL;
2382         }
2383
2384         return NULL;
2385 }
2386
2387 /**
2388  * cgroup_taskset_migrate - migrate a taskset to a cgroup
2389  * @tset: target taskset
2390  * @dst_cgrp: destination cgroup
2391  *
2392  * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
2393  * ->can_attach callbacks fails and guarantees that either all or none of
2394  * the tasks in @tset are migrated.  @tset is consumed regardless of
2395  * success.
      *
      * Returns 0 on success (including when @tset has no source css_sets)
      * or the error returned by the failing ->can_attach callback.
2396  */
2397 static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2398                                   struct cgroup *dst_cgrp)
2399 {
2400         struct cgroup_subsys_state *css, *failed_css = NULL;
2401         struct task_struct *task, *tmp_task;
2402         struct css_set *cset, *tmp_cset;
2403         int i, ret;
2404
2405         /* methods shouldn't be called if no task is actually migrating */
2406         if (list_empty(&tset->src_csets))
2407                 return 0;
2408
2409         /* check that we can legitimately attach to the cgroup */
2410         for_each_e_css(css, i, dst_cgrp) {
2411                 if (css->ss->can_attach) {
                             /* tell the taskset iterators which subsystem is asking */
2412                         tset->ssid = i;
2413                         ret = css->ss->can_attach(tset);
2414                         if (ret) {
2415                                 failed_css = css;
2416                                 goto out_cancel_attach;
2417                         }
2418                 }
2419         }
2420
2421         /*
2422          * Now that we're guaranteed success, proceed to move all tasks to
2423          * the new cgroup.  There are no failure cases after here, so this
2424          * is the commit point.
2425          */
2426         spin_lock_bh(&css_set_lock);
2427         list_for_each_entry(cset, &tset->src_csets, mg_node) {
2428                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2429                         struct css_set *from_cset = task_css_set(task);
2430                         struct css_set *to_cset = cset->mg_dst_cset;
2431
                             /* take the new cset's ref before dropping the old one */
2432                         get_css_set(to_cset);
2433                         css_set_move_task(task, from_cset, to_cset, true);
2434                         put_css_set_locked(from_cset);
2435                 }
2436         }
2437         spin_unlock_bh(&css_set_lock);
2438
2439         /*
2440          * Migration is committed, all target tasks are now on dst_csets.
2441          * Nothing is sensitive to fork() after this point.  Notify
2442          * controllers that migration is complete.
2443          */
2444         tset->csets = &tset->dst_csets;
2445
2446         for_each_e_css(css, i, dst_cgrp) {
2447                 if (css->ss->attach) {
2448                         tset->ssid = i;
2449                         css->ss->attach(tset);
2450                 }
2451         }
2452
2453         ret = 0;
2454         goto out_release_tset;
2455
2456 out_cancel_attach:
             /* only the subsystems visited before the failing one are cancelled */
2457         for_each_e_css(css, i, dst_cgrp) {
2458                 if (css == failed_css)
2459                         break;
2460                 if (css->ss->cancel_attach) {
2461                         tset->ssid = i;
2462                         css->ss->cancel_attach(tset);
2463                 }
2464         }
2465 out_release_tset:
             /*
              * Consume @tset: gather all csets on ->src_csets, move whatever
              * is on each ->mg_tasks back to ->tasks and unlink ->mg_node.
              */
2466         spin_lock_bh(&css_set_lock);
2467         list_splice_init(&tset->dst_csets, &tset->src_csets);
2468         list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2469                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2470                 list_del_init(&cset->mg_node);
2471         }
2472         spin_unlock_bh(&css_set_lock);
2473         return ret;
2474 }
2475
2476 /**
2477  * cgroup_migrate_finish - cleanup after attach
2478  * @preloaded_csets: list of preloaded css_sets
2479  *
2480  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2481  * those functions for details.
2482  */
2483 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2484 {
2485         struct css_set *cset, *tmp_cset;
2486
2487         lockdep_assert_held(&cgroup_mutex);
2488
2489         spin_lock_bh(&css_set_lock);
2490         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2491                 cset->mg_src_cgrp = NULL;
2492                 cset->mg_dst_cset = NULL;
2493                 list_del_init(&cset->mg_preload_node);
2494                 put_css_set_locked(cset);
2495         }
2496         spin_unlock_bh(&css_set_lock);
2497 }
2498
2499 /**
2500  * cgroup_migrate_add_src - add a migration source css_set
2501  * @src_cset: the source css_set to add
2502  * @dst_cgrp: the destination cgroup
2503  * @preloaded_csets: list of preloaded css_sets
2504  *
2505  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2506  * @src_cset and add it to @preloaded_csets, which should later be cleaned
2507  * up by cgroup_migrate_finish().
2508  *
2509  * This function may be called without holding cgroup_threadgroup_rwsem
2510  * even if the target is a process.  Threads may be created and destroyed
2511  * but as long as cgroup_mutex is not dropped, no new css_set can be put
2512  * into play and the preloaded css_sets are guaranteed to cover all
2513  * migrations.
2514  */
2515 static void cgroup_migrate_add_src(struct css_set *src_cset,
2516                                    struct cgroup *dst_cgrp,
2517                                    struct list_head *preloaded_csets)
2518 {
2519         struct cgroup *src_cgrp;
2520
2521         lockdep_assert_held(&cgroup_mutex);
2522         lockdep_assert_held(&css_set_lock);
2523
2524         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2525
2526         if (!list_empty(&src_cset->mg_preload_node))
2527                 return;
2528
2529         WARN_ON(src_cset->mg_src_cgrp);
2530         WARN_ON(!list_empty(&src_cset->mg_tasks));
2531         WARN_ON(!list_empty(&src_cset->mg_node));
2532
2533         src_cset->mg_src_cgrp = src_cgrp;
2534         get_css_set(src_cset);
2535         list_add(&src_cset->mg_preload_node, preloaded_csets);
2536 }
2537
2538 /**
2539  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2540  * @dst_cgrp: the destination cgroup (may be %NULL)
2541  * @preloaded_csets: list of preloaded source css_sets
2542  *
2543  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2544  * have been preloaded to @preloaded_csets.  This function looks up and
2545  * pins all destination css_sets, links each to its source, and append them
2546  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2547  * source css_set is assumed to be its cgroup on the default hierarchy.
2548  *
2549  * This function must be called after cgroup_migrate_add_src() has been
2550  * called on each migration source css_set.  After migration is performed
2551  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2552  * @preloaded_csets.
2553  */
2554 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2555                                       struct list_head *preloaded_csets)
2556 {
2557         LIST_HEAD(csets);
2558         struct css_set *src_cset, *tmp_cset;
2559
2560         lockdep_assert_held(&cgroup_mutex);
2561
2562         /*
2563          * Except for the root, subtree_control must be zero for a cgroup
2564          * with tasks so that child cgroups don't compete against tasks.
2565          */
2566         if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2567             dst_cgrp->subtree_control)
2568                 return -EBUSY;
2569
2570         /* look up the dst cset for each src cset and link it to src */
2571         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2572                 struct css_set *dst_cset;
2573
2574                 dst_cset = find_css_set(src_cset,
2575                                         dst_cgrp ?: src_cset->dfl_cgrp);
2576                 if (!dst_cset)
2577                         goto err;
2578
2579                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2580
2581                 /*
2582                  * If src cset equals dst, it's noop.  Drop the src.
2583                  * cgroup_migrate() will skip the cset too.  Note that we
2584                  * can't handle src == dst as some nodes are used by both.
2585                  */
2586                 if (src_cset == dst_cset) {
2587                         src_cset->mg_src_cgrp = NULL;
2588                         list_del_init(&src_cset->mg_preload_node);
2589                         put_css_set(src_cset);
2590                         put_css_set(dst_cset);
2591                         continue;
2592                 }
2593
2594                 src_cset->mg_dst_cset = dst_cset;
2595
2596                 if (list_empty(&dst_cset->mg_preload_node))
2597                         list_add(&dst_cset->mg_preload_node, &csets);
2598                 else
2599                         put_css_set(dst_cset);
2600         }
2601
2602         list_splice_tail(&csets, preloaded_csets);
2603         return 0;
2604 err:
2605         cgroup_migrate_finish(&csets);
2606         return -ENOMEM;
2607 }
2608
2609 /**
2610  * cgroup_migrate - migrate a process or task to a cgroup
2611  * @leader: the leader of the process or the task to migrate
2612  * @threadgroup: whether @leader points to the whole process or a single task
2613  * @cgrp: the destination cgroup
2614  *
2615  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2616  * process, the caller must be holding cgroup_threadgroup_rwsem.  The
2617  * caller is also responsible for invoking cgroup_migrate_add_src() and
2618  * cgroup_migrate_prepare_dst() on the targets before invoking this
2619  * function and following up with cgroup_migrate_finish().
2620  *
2621  * As long as a controller's ->can_attach() doesn't fail, this function is
2622  * guaranteed to succeed.  This means that, excluding ->can_attach()
2623  * failure, when migrating multiple targets, the success or failure can be
2624  * decided for all targets by invoking group_migrate_prepare_dst() before
2625  * actually starting migrating.
2626  */
2627 static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2628                           struct cgroup *cgrp)
2629 {
2630         struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2631         struct task_struct *task;
2632
2633         /*
2634          * Prevent freeing of tasks while we take a snapshot. Tasks that are
2635          * already PF_EXITING could be freed from underneath us unless we
2636          * take an rcu_read_lock.
2637          */
2638         spin_lock_bh(&css_set_lock);
2639         rcu_read_lock();
2640         task = leader;
2641         do {
2642                 cgroup_taskset_add(task, &tset);
2643                 if (!threadgroup)
2644                         break;
2645         } while_each_thread(leader, task);
2646         rcu_read_unlock();
2647         spin_unlock_bh(&css_set_lock);
2648
2649         return cgroup_taskset_migrate(&tset, cgrp);
2650 }
2651
2652 /**
2653  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2654  * @dst_cgrp: the cgroup to attach to
2655  * @leader: the task or the leader of the threadgroup to be attached
2656  * @threadgroup: attach the whole threadgroup?
2657  *
2658  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2659  */
2660 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2661                               struct task_struct *leader, bool threadgroup)
2662 {
2663         LIST_HEAD(preloaded_csets);
2664         struct task_struct *task;
2665         int ret;
2666
2667         /* look up all src csets */
2668         spin_lock_bh(&css_set_lock);
2669         rcu_read_lock();
2670         task = leader;
2671         do {
2672                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2673                                        &preloaded_csets);
2674                 if (!threadgroup)
2675                         break;
2676         } while_each_thread(leader, task);
2677         rcu_read_unlock();
2678         spin_unlock_bh(&css_set_lock);
2679
2680         /* prepare dst csets and commit */
2681         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2682         if (!ret)
2683                 ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
2684
2685         cgroup_migrate_finish(&preloaded_csets);
2686         return ret;
2687 }
2688
2689 static int cgroup_procs_write_permission(struct task_struct *task,
2690                                          struct cgroup *dst_cgrp,
2691                                          struct kernfs_open_file *of)
2692 {
2693         const struct cred *cred = current_cred();
2694         const struct cred *tcred = get_task_cred(task);
2695         int ret = 0;
2696
2697         /*
2698          * even if we're attaching all tasks in the thread group, we only
2699          * need to check permissions on one of them.
2700          */
2701         if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2702             !uid_eq(cred->euid, tcred->uid) &&
2703             !uid_eq(cred->euid, tcred->suid))
2704                 ret = -EACCES;
2705
2706         if (!ret && cgroup_on_dfl(dst_cgrp)) {
2707                 struct super_block *sb = of->file->f_path.dentry->d_sb;
2708                 struct cgroup *cgrp;
2709                 struct inode *inode;
2710
2711                 spin_lock_bh(&css_set_lock);
2712                 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2713                 spin_unlock_bh(&css_set_lock);
2714
2715                 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2716                         cgrp = cgroup_parent(cgrp);
2717
2718                 ret = -ENOMEM;
2719                 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2720                 if (inode) {
2721                         ret = inode_permission(inode, MAY_WRITE);
2722                         iput(inode);
2723                 }
2724         }
2725
2726         put_cred(tcred);
2727         return ret;
2728 }
2729
2730 /*
2731  * Find the task_struct of the task to attach by vpid and pass it along to the
2732  * function to attach either it or all tasks in its threadgroup. Will lock
2733  * cgroup_mutex and threadgroup.
2734  */
2735 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2736                                     size_t nbytes, loff_t off, bool threadgroup)
2737 {
2738         struct task_struct *tsk;
2739         struct cgroup *cgrp;
2740         pid_t pid;
2741         int ret;
2742
2743         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2744                 return -EINVAL;
2745
2746         cgrp = cgroup_kn_lock_live(of->kn);
2747         if (!cgrp)
2748                 return -ENODEV;
2749
2750         percpu_down_write(&cgroup_threadgroup_rwsem);
2751         rcu_read_lock();
2752         if (pid) {
2753                 tsk = find_task_by_vpid(pid);
2754                 if (!tsk) {
2755                         ret = -ESRCH;
2756                         goto out_unlock_rcu;
2757                 }
2758         } else {
2759                 tsk = current;
2760         }
2761
2762         if (threadgroup)
2763                 tsk = tsk->group_leader;
2764
2765         /*
2766          * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2767          * trapped in a cpuset, or RT worker may be born in a cgroup
2768          * with no rt_runtime allocated.  Just say no.
2769          */
2770         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2771                 ret = -EINVAL;
2772                 goto out_unlock_rcu;
2773         }
2774
2775         get_task_struct(tsk);
2776         rcu_read_unlock();
2777
2778         ret = cgroup_procs_write_permission(tsk, cgrp, of);
2779         if (!ret)
2780                 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2781
2782         put_task_struct(tsk);
2783         goto out_unlock_threadgroup;
2784
2785 out_unlock_rcu:
2786         rcu_read_unlock();
2787 out_unlock_threadgroup:
2788         percpu_up_write(&cgroup_threadgroup_rwsem);
2789         cgroup_kn_unlock(of->kn);
2790         cpuset_post_attach_flush();
2791         return ret ?: nbytes;
2792 }
2793
2794 /**
2795  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2796  * @from: attach to all cgroups of a given task
2797  * @tsk: the task to be attached
2798  */
2799 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2800 {
2801         struct cgroup_root *root;
2802         int retval = 0;
2803
2804         mutex_lock(&cgroup_mutex);
2805         for_each_root(root) {
2806                 struct cgroup *from_cgrp;
2807
2808                 if (root == &cgrp_dfl_root)
2809                         continue;
2810
2811                 spin_lock_bh(&css_set_lock);
2812                 from_cgrp = task_cgroup_from_root(from, root);
2813                 spin_unlock_bh(&css_set_lock);
2814
2815                 retval = cgroup_attach_task(from_cgrp, tsk, false);
2816                 if (retval)
2817                         break;
2818         }
2819         mutex_unlock(&cgroup_mutex);
2820
2821         return retval;
2822 }
2823 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2824
2825 static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2826                                   char *buf, size_t nbytes, loff_t off)
2827 {
2828         return __cgroup_procs_write(of, buf, nbytes, off, false);
2829 }
2830
2831 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2832                                   char *buf, size_t nbytes, loff_t off)
2833 {
2834         return __cgroup_procs_write(of, buf, nbytes, off, true);
2835 }
2836
2837 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2838                                           char *buf, size_t nbytes, loff_t off)
2839 {
2840         struct cgroup *cgrp;
2841
2842         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2843
2844         cgrp = cgroup_kn_lock_live(of->kn);
2845         if (!cgrp)
2846                 return -ENODEV;
2847         spin_lock(&release_agent_path_lock);
2848         strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2849                 sizeof(cgrp->root->release_agent_path));
2850         spin_unlock(&release_agent_path_lock);
2851         cgroup_kn_unlock(of->kn);
2852         return nbytes;
2853 }
2854
2855 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2856 {
2857         struct cgroup *cgrp = seq_css(seq)->cgroup;
2858
2859         spin_lock(&release_agent_path_lock);
2860         seq_puts(seq, cgrp->root->release_agent_path);
2861         spin_unlock(&release_agent_path_lock);
2862         seq_putc(seq, '\n');
2863         return 0;
2864 }
2865
/* this file unconditionally reads back as "0" */
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");

	return 0;
}
2871
2872 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2873 {
2874         struct cgroup_subsys *ss;
2875         bool printed = false;
2876         int ssid;
2877
2878         do_each_subsys_mask(ss, ssid, ss_mask) {
2879                 if (printed)
2880                         seq_putc(seq, ' ');
2881                 seq_printf(seq, "%s", ss->name);
2882                 printed = true;
2883         } while_each_subsys_mask();
2884         if (printed)
2885                 seq_putc(seq, '\n');
2886 }
2887
2888 /* show controllers which are enabled from the parent */
2889 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2890 {
2891         struct cgroup *cgrp = seq_css(seq)->cgroup;
2892
2893         cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2894         return 0;
2895 }
2896
2897 /* show controllers which are enabled for a given cgroup's children */
2898 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2899 {
2900         struct cgroup *cgrp = seq_css(seq)->cgroup;
2901
2902         cgroup_print_ss_mask(seq, cgrp->subtree_control);
2903         return 0;
2904 }
2905
2906 /**
2907  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2908  * @cgrp: root of the subtree to update csses for
2909  *
2910  * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded)
2911  * css associations need to be updated accordingly.  This function looks up
2912  * all css_sets which are attached to the subtree, creates the matching
2913  * updated css_sets and migrates the tasks to the new ones.
2914  */
2915 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2916 {
2917         LIST_HEAD(preloaded_csets);
2918         struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2919         struct cgroup_subsys_state *css;
2920         struct css_set *src_cset;
2921         int ret;
2922
2923         lockdep_assert_held(&cgroup_mutex);
2924
2925         percpu_down_write(&cgroup_threadgroup_rwsem);
2926
2927         /* look up all csses currently attached to @cgrp's subtree */
2928         spin_lock_bh(&css_set_lock);
2929         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2930                 struct cgrp_cset_link *link;
2931
2932                 /* self is not affected by subtree_ss_mask change */
2933                 if (css->cgroup == cgrp)
2934                         continue;
2935
2936                 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2937                         cgroup_migrate_add_src(link->cset, cgrp,
2938                                                &preloaded_csets);
2939         }
2940         spin_unlock_bh(&css_set_lock);
2941
2942         /* NULL dst indicates self on default hierarchy */
2943         ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2944         if (ret)
2945                 goto out_finish;
2946
2947         spin_lock_bh(&css_set_lock);
2948         list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2949                 struct task_struct *task, *ntask;
2950
2951                 /* src_csets precede dst_csets, break on the first dst_cset */
2952                 if (!src_cset->mg_src_cgrp)
2953                         break;
2954
2955                 /* all tasks in src_csets need to be migrated */
2956                 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2957                         cgroup_taskset_add(task, &tset);
2958         }
2959         spin_unlock_bh(&css_set_lock);
2960
2961         ret = cgroup_taskset_migrate(&tset, cgrp);
2962 out_finish:
2963         cgroup_migrate_finish(&preloaded_csets);
2964         percpu_up_write(&cgroup_threadgroup_rwsem);
2965         return ret;
2966 }
2967
2968 /**
2969  * cgroup_drain_offline - wait for previously offlined csses to go away
2970  * @cgrp: parent of the target cgroups
2971  *
2972  * Because css offlining is asynchronous, userland may try to re-enable a
2973  * controller while the previous css is still around.  This function drains
2974  * the previous css instances of @cgrp's children.
2975  *
2976  * Must be called with cgroup_mutex held.  Returns %false if there were no
2977  * dying css instances.  Returns %true if there were one or more and this
2978  * function waited.  On %true return, cgroup_mutex has been dropped and
2979  * re-acquired inbetween which anything could have happened.  The caller
2980  * typically would have to start over.
2981  */
2982 static bool cgroup_drain_offline(struct cgroup *cgrp)
2983 {
2984         struct cgroup *dsct;
2985         struct cgroup_subsys *ss;
2986         int ssid;
2987
2988         lockdep_assert_held(&cgroup_mutex);
2989
2990         cgroup_for_each_live_child(dsct, cgrp) {
2991                 for_each_subsys(ss, ssid) {
2992                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
2993                         DEFINE_WAIT(wait);
2994
2995                         if (!css)
2996                                 continue;
2997
2998                         cgroup_get(dsct);
2999                         prepare_to_wait(&dsct->offline_waitq, &wait,
3000                                         TASK_UNINTERRUPTIBLE);
3001
3002                         mutex_unlock(&cgroup_mutex);
3003                         schedule();
3004                         finish_wait(&dsct->offline_waitq, &wait);
3005                         mutex_lock(&cgroup_mutex);
3006
3007                         cgroup_put(dsct);
3008                         return true;
3009                 }
3010         }
3011
3012         return false;
3013 }
3014
3015 /**
3016  * cgroup_apply_control_disable - kill or hide csses according to control
3017  * @cgrp: parent of the target cgroups
3018  *
3019  * Walk @cgrp's children and kill and hide csses so that they match
3020  * cgroup_ss_mask() and cgroup_visible_mask().
3021  *
3022  * A css is hidden when the userland requests it to be disabled while other
3023  * subsystems are still depending on it.  The css must not actively control
3024  * resources and be in the vanilla state if it's made visible again later.
3025  * Controllers which may be depended upon should provide ->css_reset() for
3026  * this purpose.
3027  */
3028 static void cgroup_apply_control_disable(struct cgroup *cgrp)
3029 {
3030         struct cgroup *dsct;
3031         struct cgroup_subsys *ss;
3032         int ssid;
3033
3034         cgroup_for_each_live_child(dsct, cgrp) {
3035                 for_each_subsys(ss, ssid) {
3036                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3037
3038                         if (!css)
3039                                 continue;
3040
3041                         if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3042                                 kill_css(css);
3043                         } else if (!(cgroup_control(dsct) & (1 << ss->id))) {
3044                                 css_clear_dir(css, NULL);
3045                                 if (ss->css_reset)
3046                                         ss->css_reset(css);
3047                         }
3048                 }
3049         }
3050 }
3051
3052 /* change the enabled child controllers for a cgroup in the default hierarchy */
3053 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3054                                             char *buf, size_t nbytes,
3055                                             loff_t off)
3056 {
3057         u16 enable = 0, disable = 0;
3058         u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
3059         struct cgroup *cgrp, *child;
3060         struct cgroup_subsys *ss;
3061         char *tok;
3062         int ssid, ret;
3063
3064         /*
3065          * Parse input - space separated list of subsystem names prefixed
3066          * with either + or -.
3067          */
3068         buf = strstrip(buf);
3069         while ((tok = strsep(&buf, " "))) {
3070                 if (tok[0] == '\0')
3071                         continue;
3072                 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3073                         if (!cgroup_ssid_enabled(ssid) ||
3074                             strcmp(tok + 1, ss->name))
3075                                 continue;
3076
3077                         if (*tok == '+') {
3078                                 enable |= 1 << ssid;
3079                                 disable &= ~(1 << ssid);
3080                         } else if (*tok == '-') {
3081                                 disable |= 1 << ssid;
3082                                 enable &= ~(1 << ssid);
3083                         } else {
3084                                 return -EINVAL;
3085                         }
3086                         break;
3087                 } while_each_subsys_mask();
3088                 if (ssid == CGROUP_SUBSYS_COUNT)
3089                         return -EINVAL;
3090         }
3091
3092         cgrp = cgroup_kn_lock_live(of->kn);
3093         if (!cgrp)
3094                 return -ENODEV;
3095
3096         for_each_subsys(ss, ssid) {
3097                 if (enable & (1 << ssid)) {
3098                         if (cgrp->subtree_control & (1 << ssid)) {
3099                                 enable &= ~(1 << ssid);
3100                                 continue;
3101                         }
3102
3103                         if (!(cgroup_control(cgrp) & (1 << ssid))) {
3104                                 ret = -ENOENT;
3105                                 goto out_unlock;
3106                         }
3107                 } else if (disable & (1 << ssid)) {
3108                         if (!(cgrp->subtree_control & (1 << ssid))) {
3109                                 disable &= ~(1 << ssid);
3110                                 continue;
3111                         }
3112
3113                         /* a child has it enabled? */
3114                         cgroup_for_each_live_child(child, cgrp) {
3115                                 if (child->subtree_control & (1 << ssid)) {
3116                                         ret = -EBUSY;
3117                                         goto out_unlock;
3118                                 }
3119                         }
3120                 }
3121         }
3122
3123         if (!enable && !disable) {
3124                 ret = 0;
3125                 goto out_unlock;
3126         }
3127
3128         /*
3129          * Except for the root, subtree_control must be zero for a cgroup
3130          * with tasks so that child cgroups don't compete against tasks.
3131          */
3132         if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
3133                 ret = -EBUSY;
3134                 goto out_unlock;
3135         }
3136
3137         if (cgroup_drain_offline(cgrp)) {
3138                 cgroup_kn_unlock(of->kn);
3139                 return restart_syscall();
3140         }
3141
3142         /*
3143          * Update subsys masks and calculate what needs to be done.  More
3144          * subsystems than specified may need to be enabled or disabled
3145          * depending on subsystem dependencies.
3146          */
3147         old_sc = cgrp->subtree_control;
3148         old_ss = cgrp->subtree_ss_mask;
3149         new_sc = (old_sc | enable) & ~disable;
3150         new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc);
3151
3152         css_enable = ~old_ss & new_ss;
3153         css_disable = old_ss & ~new_ss;
3154         enable |= css_enable;
3155         disable |= css_disable;
3156
3157         cgrp->subtree_control = new_sc;
3158         cgrp->subtree_ss_mask = new_ss;
3159
3160         /*
3161          * Create new csses or make the existing ones visible.  A css is
3162          * created invisible if it's being implicitly enabled through
3163          * dependency.  An invisible css is made visible when the userland
3164          * explicitly enables it.
3165          */
3166         do_each_subsys_mask(ss, ssid, enable) {
3167                 cgroup_for_each_live_child(child, cgrp) {
3168                         if (css_enable & (1 << ssid)) {
3169                                 struct cgroup_subsys_state *css;
3170
3171                                 css = css_create(child, ss);
3172                                 if (IS_ERR(css)) {
3173                                         ret = PTR_ERR(css);
3174                                         goto err_undo_css;
3175                                 }
3176
3177                                 if (cgrp->subtree_control & (1 << ssid)) {
3178                                         ret = css_populate_dir(css, NULL);
3179                                         if (ret)
3180                                                 goto err_undo_css;
3181                                 }
3182                         } else {
3183                                 ret = css_populate_dir(cgroup_css(child, ss),
3184                                                        NULL);
3185                                 if (ret)
3186                                         goto err_undo_css;
3187                         }
3188                 }
3189         } while_each_subsys_mask();
3190
3191         /*
3192          * At this point, cgroup_e_css() results reflect the new csses
3193          * making the following cgroup_update_dfl_csses() properly update
3194          * css associations of all tasks in the subtree.
3195          */
3196         ret = cgroup_update_dfl_csses(cgrp);
3197         if (ret)
3198                 goto err_undo_css;
3199
3200         /* all tasks are migrated out of disabled csses, commit disable */
3201         cgroup_apply_control_disable(cgrp);
3202
3203         kernfs_activate(cgrp->kn);
3204         ret = 0;
3205 out_unlock:
3206         cgroup_kn_unlock(of->kn);
3207         return ret ?: nbytes;
3208
3209 err_undo_css:
3210         /* restore masks and shoot down new csses */
3211         cgrp->subtree_control = old_sc;
3212         cgrp->subtree_ss_mask = old_ss;
3213
3214         cgroup_apply_control_disable(cgrp);
3215
3216         goto out_unlock;
3217 }
3218
3219 static int cgroup_events_show(struct seq_file *seq, void *v)
3220 {
3221         seq_printf(seq, "populated %d\n",
3222                    cgroup_is_populated(seq_css(seq)->cgroup));
3223         return 0;
3224 }
3225
3226 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3227                                  size_t nbytes, loff_t off)
3228 {
3229         struct cgroup *cgrp = of->kn->parent->priv;
3230         struct cftype *cft = of->kn->priv;
3231         struct cgroup_subsys_state *css;
3232         int ret;
3233
3234         if (cft->write)
3235                 return cft->write(of, buf, nbytes, off);
3236
3237         /*
3238          * kernfs guarantees that a file isn't deleted with operations in
3239          * flight, which means that the matching css is and stays alive and
3240          * doesn't need to be pinned.  The RCU locking is not necessary
3241          * either.  It's just for the convenience of using cgroup_css().
3242          */
3243         rcu_read_lock();
3244         css = cgroup_css(cgrp, cft->ss);
3245         rcu_read_unlock();
3246
3247         if (cft->write_u64) {
3248                 unsigned long long v;
3249                 ret = kstrtoull(buf, 0, &v);
3250                 if (!ret)
3251                         ret = cft->write_u64(css, cft, v);
3252         } else if (cft->write_s64) {
3253                 long long v;
3254                 ret = kstrtoll(buf, 0, &v);
3255                 if (!ret)
3256                         ret = cft->write_s64(css, cft, v);
3257         } else {
3258                 ret = -EINVAL;
3259         }
3260
3261         return ret ?: nbytes;
3262 }
3263
3264 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3265 {
3266         return seq_cft(seq)->seq_start(seq, ppos);
3267 }
3268
3269 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3270 {
3271         return seq_cft(seq)->seq_next(seq, v, ppos);
3272 }
3273
3274 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3275 {
3276         seq_cft(seq)->seq_stop(seq, v);
3277 }
3278
3279 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3280 {
3281         struct cftype *cft = seq_cft(m);
3282         struct cgroup_subsys_state *css = seq_css(m);
3283
3284         if (cft->seq_show)
3285                 return cft->seq_show(m, arg);
3286
3287         if (cft->read_u64)
3288                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3289         else if (cft->read_s64)
3290                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3291         else
3292                 return -EINVAL;
3293         return 0;
3294 }
3295
3296 static struct kernfs_ops cgroup_kf_single_ops = {
3297         .atomic_write_len       = PAGE_SIZE,
3298         .write                  = cgroup_file_write,
3299         .seq_show               = cgroup_seqfile_show,
3300 };
3301
3302 static struct kernfs_ops cgroup_kf_ops = {
3303         .atomic_write_len       = PAGE_SIZE,
3304         .write                  = cgroup_file_write,
3305         .seq_start              = cgroup_seqfile_start,
3306         .seq_next               = cgroup_seqfile_next,
3307         .seq_stop               = cgroup_seqfile_stop,
3308         .seq_show               = cgroup_seqfile_show,
3309 };
3310
3311 /*
3312  * cgroup_rename - Only allow simple rename of directories in place.
3313  */
3314 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3315                          const char *new_name_str)
3316 {
3317         struct cgroup *cgrp = kn->priv;
3318         int ret;
3319
3320         if (kernfs_type(kn) != KERNFS_DIR)
3321                 return -ENOTDIR;
3322         if (kn->parent != new_parent)
3323                 return -EIO;
3324
3325         /*
3326          * This isn't a proper migration and its usefulness is very
3327          * limited.  Disallow on the default hierarchy.
3328          */
3329         if (cgroup_on_dfl(cgrp))
3330                 return -EPERM;
3331
3332         /*
3333          * We're gonna grab cgroup_mutex which nests outside kernfs
3334          * active_ref.  kernfs_rename() doesn't require active_ref
3335          * protection.  Break them before grabbing cgroup_mutex.
3336          */
3337         kernfs_break_active_protection(new_parent);
3338         kernfs_break_active_protection(kn);
3339
3340         mutex_lock(&cgroup_mutex);
3341
3342         ret = kernfs_rename(kn, new_parent, new_name_str);
3343
3344         mutex_unlock(&cgroup_mutex);
3345
3346         kernfs_unbreak_active_protection(kn);
3347         kernfs_unbreak_active_protection(new_parent);
3348         return ret;
3349 }
3350
3351 /* set uid and gid of cgroup dirs and files to that of the creator */
3352 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3353 {
3354         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3355                                .ia_uid = current_fsuid(),
3356                                .ia_gid = current_fsgid(), };
3357
3358         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3359             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3360                 return 0;
3361
3362         return kernfs_setattr(kn, &iattr);
3363 }
3364
3365 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3366                            struct cftype *cft)
3367 {
3368         char name[CGROUP_FILE_NAME_MAX];
3369         struct kernfs_node *kn;
3370         struct lock_class_key *key = NULL;
3371         int ret;
3372
3373 #ifdef CONFIG_DEBUG_LOCK_ALLOC
3374         key = &cft->lockdep_key;
3375 #endif
3376         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3377                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3378                                   NULL, key);
3379         if (IS_ERR(kn))
3380                 return PTR_ERR(kn);
3381
3382         ret = cgroup_kn_set_ugid(kn);
3383         if (ret) {
3384                 kernfs_remove(kn);
3385                 return ret;
3386         }
3387
3388         if (cft->file_offset) {
3389                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3390
3391                 spin_lock_irq(&cgroup_file_kn_lock);
3392                 cfile->kn = kn;
3393                 spin_unlock_irq(&cgroup_file_kn_lock);
3394         }
3395
3396         return 0;
3397 }
3398
3399 /**
3400  * cgroup_addrm_files - add or remove files to a cgroup directory
3401  * @css: the target css
3402  * @cgrp: the target cgroup (usually css->cgroup)
3403  * @cfts: array of cftypes to be added
3404  * @is_add: whether to add or remove
3405  *
3406  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3407  * For removals, this function never fails.
3408  */
3409 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3410                               struct cgroup *cgrp, struct cftype cfts[],
3411                               bool is_add)
3412 {
3413         struct cftype *cft, *cft_end = NULL;
3414         int ret = 0;
3415
3416         lockdep_assert_held(&cgroup_mutex);
3417
3418 restart:
3419         for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3420                 /* does cft->flags tell us to skip this file on @cgrp? */
3421                 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3422                         continue;
3423                 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3424                         continue;
3425                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3426                         continue;
3427                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3428                         continue;
3429
3430                 if (is_add) {
3431                         ret = cgroup_add_file(css, cgrp, cft);
3432                         if (ret) {
3433                                 pr_warn("%s: failed to add %s, err=%d\n",
3434                                         __func__, cft->name, ret);
3435                                 cft_end = cft;
3436                                 is_add = false;
3437                                 goto restart;
3438                         }
3439                 } else {
3440                         cgroup_rm_file(cgrp, cft);
3441                 }
3442         }
3443         return ret;
3444 }
3445
3446 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3447 {
3448         LIST_HEAD(pending);
3449         struct cgroup_subsys *ss = cfts[0].ss;
3450         struct cgroup *root = &ss->root->cgrp;
3451         struct cgroup_subsys_state *css;
3452         int ret = 0;
3453
3454         lockdep_assert_held(&cgroup_mutex);
3455
3456         /* add/rm files for all cgroups created before */
3457         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3458                 struct cgroup *cgrp = css->cgroup;
3459
3460                 if (!(css->flags & CSS_VISIBLE))
3461                         continue;
3462
3463                 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3464                 if (ret)
3465                         break;
3466         }
3467
3468         if (is_add && !ret)
3469                 kernfs_activate(root->kn);
3470         return ret;
3471 }
3472
3473 static void cgroup_exit_cftypes(struct cftype *cfts)
3474 {
3475         struct cftype *cft;
3476
3477         for (cft = cfts; cft->name[0] != '\0'; cft++) {
3478                 /* free copy for custom atomic_write_len, see init_cftypes() */
3479                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3480                         kfree(cft->kf_ops);
3481                 cft->kf_ops = NULL;
3482                 cft->ss = NULL;
3483
3484                 /* revert flags set by cgroup core while adding @cfts */
3485                 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3486         }
3487 }
3488
3489 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3490 {
3491         struct cftype *cft;
3492
3493         for (cft = cfts; cft->name[0] != '\0'; cft++) {
3494                 struct kernfs_ops *kf_ops;
3495
3496                 WARN_ON(cft->ss || cft->kf_ops);
3497
3498                 if (cft->seq_start)
3499                         kf_ops = &cgroup_kf_ops;
3500                 else
3501                         kf_ops = &cgroup_kf_single_ops;
3502
3503                 /*
3504                  * Ugh... if @cft wants a custom max_write_len, we need to
3505                  * make a copy of kf_ops to set its atomic_write_len.
3506                  */
3507                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3508                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3509                         if (!kf_ops) {
3510                                 cgroup_exit_cftypes(cfts);
3511                                 return -ENOMEM;
3512                         }
3513                         kf_ops->atomic_write_len = cft->max_write_len;
3514                 }
3515
3516                 cft->kf_ops = kf_ops;
3517                 cft->ss = ss;
3518         }
3519
3520         return 0;
3521 }
3522
3523 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3524 {
3525         lockdep_assert_held(&cgroup_mutex);
3526
3527         if (!cfts || !cfts[0].ss)
3528                 return -ENOENT;
3529
3530         list_del(&cfts->node);
3531         cgroup_apply_cftypes(cfts, false);
3532         cgroup_exit_cftypes(cfts);
3533         return 0;
3534 }
3535
3536 /**
3537  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3538  * @cfts: zero-length name terminated array of cftypes
3539  *
3540  * Unregister @cfts.  Files described by @cfts are removed from all
3541  * existing cgroups and all future cgroups won't have them either.  This
3542  * function can be called anytime whether @cfts' subsys is attached or not.
3543  *
3544  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3545  * registered.
3546  */
3547 int cgroup_rm_cftypes(struct cftype *cfts)
3548 {
3549         int ret;
3550
3551         mutex_lock(&cgroup_mutex);
3552         ret = cgroup_rm_cftypes_locked(cfts);
3553         mutex_unlock(&cgroup_mutex);
3554         return ret;
3555 }
3556
3557 /**
3558  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3559  * @ss: target cgroup subsystem
3560  * @cfts: zero-length name terminated array of cftypes
3561  *
3562  * Register @cfts to @ss.  Files described by @cfts are created for all
3563  * existing cgroups to which @ss is attached and all future cgroups will
3564  * have them too.  This function can be called anytime whether @ss is
3565  * attached or not.
3566  *
3567  * Returns 0 on successful registration, -errno on failure.  Note that this
3568  * function currently returns 0 as long as @cfts registration is successful
3569  * even if some file creation attempts on existing cgroups fail.
3570  */
3571 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3572 {
3573         int ret;
3574
3575         if (!cgroup_ssid_enabled(ss->id))
3576                 return 0;
3577
3578         if (!cfts || cfts[0].name[0] == '\0')
3579                 return 0;
3580
3581         ret = cgroup_init_cftypes(ss, cfts);
3582         if (ret)
3583                 return ret;
3584
3585         mutex_lock(&cgroup_mutex);
3586
3587         list_add_tail(&cfts->node, &ss->cfts);
3588         ret = cgroup_apply_cftypes(cfts, true);
3589         if (ret)
3590                 cgroup_rm_cftypes_locked(cfts);
3591
3592         mutex_unlock(&cgroup_mutex);
3593         return ret;
3594 }
3595
3596 /**
3597  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3598  * @ss: target cgroup subsystem
3599  * @cfts: zero-length name terminated array of cftypes
3600  *
3601  * Similar to cgroup_add_cftypes() but the added files are only used for
3602  * the default hierarchy.
3603  */
3604 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3605 {
3606         struct cftype *cft;
3607
3608         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3609                 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3610         return cgroup_add_cftypes(ss, cfts);
3611 }
3612
3613 /**
3614  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3615  * @ss: target cgroup subsystem
3616  * @cfts: zero-length name terminated array of cftypes
3617  *
3618  * Similar to cgroup_add_cftypes() but the added files are only used for
3619  * the legacy hierarchies.
3620  */
3621 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3622 {
3623         struct cftype *cft;
3624
3625         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3626                 cft->flags |= __CFTYPE_NOT_ON_DFL;
3627         return cgroup_add_cftypes(ss, cfts);
3628 }
3629
3630 /**
3631  * cgroup_file_notify - generate a file modified event for a cgroup_file
3632  * @cfile: target cgroup_file
3633  *
3634  * @cfile must have been obtained by setting cftype->file_offset.
3635  */
3636 void cgroup_file_notify(struct cgroup_file *cfile)
3637 {
3638         unsigned long flags;
3639
3640         spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3641         if (cfile->kn)
3642                 kernfs_notify(cfile->kn);
3643         spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3644 }
3645
3646 /**
3647  * cgroup_task_count - count the number of tasks in a cgroup.
3648  * @cgrp: the cgroup in question
3649  *
3650  * Return the number of tasks in the cgroup.
3651  */
3652 static int cgroup_task_count(const struct cgroup *cgrp)
3653 {
3654         int count = 0;
3655         struct cgrp_cset_link *link;
3656
3657         spin_lock_bh(&css_set_lock);
3658         list_for_each_entry(link, &cgrp->cset_links, cset_link)
3659                 count += atomic_read(&link->cset->refcount);
3660         spin_unlock_bh(&css_set_lock);
3661         return count;
3662 }
3663
3664 /**
3665  * css_next_child - find the next child of a given css
3666  * @pos: the current position (%NULL to initiate traversal)
3667  * @parent: css whose children to walk
3668  *
3669  * This function returns the next child of @parent and should be called
3670  * under either cgroup_mutex or RCU read lock.  The only requirement is
3671  * that @parent and @pos are accessible.  The next sibling is guaranteed to
3672  * be returned regardless of their states.
3673  *
3674  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3675  * css which finished ->css_online() is guaranteed to be visible in the
3676  * future iterations and will stay visible until the last reference is put.
3677  * A css which hasn't finished ->css_online() or already finished
3678  * ->css_offline() may show up during traversal.  It's each subsystem's
3679  * responsibility to synchronize against on/offlining.
3680  */
3681 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3682                                            struct cgroup_subsys_state *parent)
3683 {
3684         struct cgroup_subsys_state *next;
3685
3686         cgroup_assert_mutex_or_rcu_locked();
3687
3688         /*
3689          * @pos could already have been unlinked from the sibling list.
3690          * Once a cgroup is removed, its ->sibling.next is no longer
3691          * updated when its next sibling changes.  CSS_RELEASED is set when
3692          * @pos is taken off list, at which time its next pointer is valid,
3693          * and, as releases are serialized, the one pointed to by the next
3694          * pointer is guaranteed to not have started release yet.  This
3695          * implies that if we observe !CSS_RELEASED on @pos in this RCU
3696          * critical section, the one pointed to by its next pointer is
3697          * guaranteed to not have finished its RCU grace period even if we
3698          * have dropped rcu_read_lock() inbetween iterations.
3699          *
3700          * If @pos has CSS_RELEASED set, its next pointer can't be
3701          * dereferenced; however, as each css is given a monotonically
3702          * increasing unique serial number and always appended to the
3703          * sibling list, the next one can be found by walking the parent's
3704          * children until the first css with higher serial number than
3705          * @pos's.  While this path can be slower, it happens iff iteration
3706          * races against release and the race window is very small.
3707          */
3708         if (!pos) {
3709                 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3710         } else if (likely(!(pos->flags & CSS_RELEASED))) {
3711                 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3712         } else {
3713                 list_for_each_entry_rcu(next, &parent->children, sibling)
3714                         if (next->serial_nr > pos->serial_nr)
3715                                 break;
3716         }
3717
3718         /*
3719          * @next, if not pointing to the head, can be dereferenced and is
3720          * the next sibling.
3721          */
3722         if (&next->sibling != &parent->children)
3723                 return next;
3724         return NULL;
3725 }
3726
3727 /**
3728  * css_next_descendant_pre - find the next descendant for pre-order walk
3729  * @pos: the current position (%NULL to initiate traversal)
3730  * @root: css whose descendants to walk
3731  *
3732  * To be used by css_for_each_descendant_pre().  Find the next descendant
3733  * to visit for pre-order traversal of @root's descendants.  @root is
3734  * included in the iteration and the first node to be visited.
3735  *
3736  * While this function requires cgroup_mutex or RCU read locking, it
3737  * doesn't require the whole traversal to be contained in a single critical
3738  * section.  This function will return the correct next descendant as long
3739  * as both @pos and @root are accessible and @pos is a descendant of @root.
3740  *
3741  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3742  * css which finished ->css_online() is guaranteed to be visible in the
3743  * future iterations and will stay visible until the last reference is put.
3744  * A css which hasn't finished ->css_online() or already finished
3745  * ->css_offline() may show up during traversal.  It's each subsystem's
3746  * responsibility to synchronize against on/offlining.
3747  */
3748 struct cgroup_subsys_state *
3749 css_next_descendant_pre(struct cgroup_subsys_state *pos,
3750                         struct cgroup_subsys_state *root)
3751 {
3752         struct cgroup_subsys_state *next;
3753
3754         cgroup_assert_mutex_or_rcu_locked();
3755
3756         /* if first iteration, visit @root */
3757         if (!pos)
3758                 return root;
3759
3760         /* visit the first child if exists */
3761         next = css_next_child(NULL, pos);
3762         if (next)
3763                 return next;
3764
3765         /* no child, visit my or the closest ancestor's next sibling */
3766         while (pos != root) {
3767                 next = css_next_child(pos, pos->parent);
3768                 if (next)
3769                         return next;
3770                 pos = pos->parent;
3771         }
3772
3773         return NULL;
3774 }
3775
3776 /**
3777  * css_rightmost_descendant - return the rightmost descendant of a css
3778  * @pos: css of interest
3779  *
3780  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3781  * is returned.  This can be used during pre-order traversal to skip
3782  * subtree of @pos.
3783  *
3784  * While this function requires cgroup_mutex or RCU read locking, it
3785  * doesn't require the whole traversal to be contained in a single critical
3786  * section.  This function will return the correct rightmost descendant as
3787  * long as @pos is accessible.
3788  */
3789 struct cgroup_subsys_state *
3790 css_rightmost_descendant(struct cgroup_subsys_state *pos)
3791 {
3792         struct cgroup_subsys_state *last, *tmp;
3793
3794         cgroup_assert_mutex_or_rcu_locked();
3795
3796         do {
3797                 last = pos;
3798                 /* ->prev isn't RCU safe, walk ->next till the end */
3799                 pos = NULL;
3800                 css_for_each_child(tmp, last)
3801                         pos = tmp;
3802         } while (pos);
3803
3804         return last;
3805 }
3806
3807 static struct cgroup_subsys_state *
3808 css_leftmost_descendant(struct cgroup_subsys_state *pos)
3809 {
3810         struct cgroup_subsys_state *last;
3811
3812         do {
3813                 last = pos;
3814                 pos = css_next_child(NULL, pos);
3815         } while (pos);
3816
3817         return last;
3818 }
3819
3820 /**
3821  * css_next_descendant_post - find the next descendant for post-order walk
3822  * @pos: the current position (%NULL to initiate traversal)
3823  * @root: css whose descendants to walk
3824  *
3825  * To be used by css_for_each_descendant_post().  Find the next descendant
3826  * to visit for post-order traversal of @root's descendants.  @root is
3827  * included in the iteration and the last node to be visited.
3828  *
3829  * While this function requires cgroup_mutex or RCU read locking, it
3830  * doesn't require the whole traversal to be contained in a single critical
3831  * section.  This function will return the correct next descendant as long
3832  * as both @pos and @cgroup are accessible and @pos is a descendant of
3833  * @cgroup.
3834  *
3835  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3836  * css which finished ->css_online() is guaranteed to be visible in the
3837  * future iterations and will stay visible until the last reference is put.
3838  * A css which hasn't finished ->css_online() or already finished
3839  * ->css_offline() may show up during traversal.  It's each subsystem's
3840  * responsibility to synchronize against on/offlining.
3841  */
3842 struct cgroup_subsys_state *
3843 css_next_descendant_post(struct cgroup_subsys_state *pos,
3844                          struct cgroup_subsys_state *root)
3845 {
3846         struct cgroup_subsys_state *next;
3847
3848         cgroup_assert_mutex_or_rcu_locked();
3849
3850         /* if first iteration, visit leftmost descendant which may be @root */
3851         if (!pos)
3852                 return css_leftmost_descendant(root);
3853
3854         /* if we visited @root, we're done */
3855         if (pos == root)
3856                 return NULL;
3857
3858         /* if there's an unvisited sibling, visit its leftmost descendant */
3859         next = css_next_child(pos, pos->parent);
3860         if (next)
3861                 return css_leftmost_descendant(next);
3862
3863         /* no sibling left, visit parent */
3864         return pos->parent;
3865 }
3866
3867 /**
3868  * css_has_online_children - does a css have online children
3869  * @css: the target css
3870  *
3871  * Returns %true if @css has any online children; otherwise, %false.  This
3872  * function can be called from any context but the caller is responsible
3873  * for synchronizing against on/offlining as necessary.
3874  */
3875 bool css_has_online_children(struct cgroup_subsys_state *css)
3876 {
3877         struct cgroup_subsys_state *child;
3878         bool ret = false;
3879
3880         rcu_read_lock();
3881         css_for_each_child(child, css) {
3882                 if (child->flags & CSS_ONLINE) {
3883                         ret = true;
3884                         break;
3885                 }
3886         }
3887         rcu_read_unlock();
3888         return ret;
3889 }
3890
3891 /**
3892  * css_task_iter_advance_css_set - advance a task itererator to the next css_set
3893  * @it: the iterator to advance
3894  *
3895  * Advance @it to the next css_set to walk.
3896  */
3897 static void css_task_iter_advance_css_set(struct css_task_iter *it)
3898 {
3899         struct list_head *l = it->cset_pos;
3900         struct cgrp_cset_link *link;
3901         struct css_set *cset;
3902
3903         lockdep_assert_held(&css_set_lock);
3904
3905         /* Advance to the next non-empty css_set */
3906         do {
3907                 l = l->next;
3908                 if (l == it->cset_head) {
3909                         it->cset_pos = NULL;
3910                         it->task_pos = NULL;
3911                         return;
3912                 }
3913
3914                 if (it->ss) {
3915                         cset = container_of(l, struct css_set,
3916                                             e_cset_node[it->ss->id]);
3917                 } else {
3918                         link = list_entry(l, struct cgrp_cset_link, cset_link);
3919                         cset = link->cset;
3920                 }
3921         } while (!css_set_populated(cset));
3922
3923         it->cset_pos = l;
3924
3925         if (!list_empty(&cset->tasks))
3926                 it->task_pos = cset->tasks.next;
3927         else
3928                 it->task_pos = cset->mg_tasks.next;
3929
3930         it->tasks_head = &cset->tasks;
3931         it->mg_tasks_head = &cset->mg_tasks;
3932
3933         /*
3934          * We don't keep css_sets locked across iteration steps and thus
3935          * need to take steps to ensure that iteration can be resumed after
3936          * the lock is re-acquired.  Iteration is performed at two levels -
3937          * css_sets and tasks in them.
3938          *
3939          * Once created, a css_set never leaves its cgroup lists, so a
3940          * pinned css_set is guaranteed to stay put and we can resume
3941          * iteration afterwards.
3942          *
3943          * Tasks may leave @cset across iteration steps.  This is resolved
3944          * by registering each iterator with the css_set currently being
3945          * walked and making css_set_move_task() advance iterators whose
3946          * next task is leaving.
3947          */
3948         if (it->cur_cset) {
3949                 list_del(&it->iters_node);
3950                 put_css_set_locked(it->cur_cset);
3951         }
3952         get_css_set(cset);
3953         it->cur_cset = cset;
3954         list_add(&it->iters_node, &cset->task_iters);
3955 }
3956
3957 static void css_task_iter_advance(struct css_task_iter *it)
3958 {
3959         struct list_head *l = it->task_pos;
3960
3961         lockdep_assert_held(&css_set_lock);
3962         WARN_ON_ONCE(!l);
3963
3964         /*
3965          * Advance iterator to find next entry.  cset->tasks is consumed
3966          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3967          * next cset.
3968          */
3969         l = l->next;
3970
3971         if (l == it->tasks_head)
3972                 l = it->mg_tasks_head->next;
3973
3974         if (l == it->mg_tasks_head)
3975                 css_task_iter_advance_css_set(it);
3976         else
3977                 it->task_pos = l;
3978 }
3979
3980 /**
3981  * css_task_iter_start - initiate task iteration
3982  * @css: the css to walk tasks of
3983  * @it: the task iterator to use
3984  *
3985  * Initiate iteration through the tasks of @css.  The caller can call
3986  * css_task_iter_next() to walk through the tasks until the function
3987  * returns NULL.  On completion of iteration, css_task_iter_end() must be
3988  * called.
3989  */
3990 void css_task_iter_start(struct cgroup_subsys_state *css,
3991                          struct css_task_iter *it)
3992 {
3993         /* no one should try to iterate before mounting cgroups */
3994         WARN_ON_ONCE(!use_task_css_set_links);
3995
3996         memset(it, 0, sizeof(*it));
3997
3998         spin_lock_bh(&css_set_lock);
3999
4000         it->ss = css->ss;
4001
4002         if (it->ss)
4003                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4004         else
4005                 it->cset_pos = &css->cgroup->cset_links;
4006
4007         it->cset_head = it->cset_pos;
4008
4009         css_task_iter_advance_css_set(it);
4010
4011         spin_unlock_bh(&css_set_lock);
4012 }
4013
4014 /**
4015  * css_task_iter_next - return the next task for the iterator
4016  * @it: the task iterator being iterated
4017  *
4018  * The "next" function for task iteration.  @it should have been
4019  * initialized via css_task_iter_start().  Returns NULL when the iteration
4020  * reaches the end.
4021  */
4022 struct task_struct *css_task_iter_next(struct css_task_iter *it)
4023 {
4024         if (it->cur_task) {
4025                 put_task_struct(it->cur_task);
4026                 it->cur_task = NULL;
4027         }
4028
4029         spin_lock_bh(&css_set_lock);
4030
4031         if (it->task_pos) {
4032                 it->cur_task = list_entry(it->task_pos, struct task_struct,
4033                                           cg_list);
4034                 get_task_struct(it->cur_task);
4035                 css_task_iter_advance(it);
4036         }
4037
4038         spin_unlock_bh(&css_set_lock);
4039
4040         return it->cur_task;
4041 }
4042
4043 /**
4044  * css_task_iter_end - finish task iteration
4045  * @it: the task iterator to finish
4046  *
4047  * Finish task iteration started by css_task_iter_start().
4048  */
4049 void css_task_iter_end(struct css_task_iter *it)
4050 {
4051         if (it->cur_cset) {
4052                 spin_lock_bh(&css_set_lock);
4053                 list_del(&it->iters_node);
4054                 put_css_set_locked(it->cur_cset);
4055                 spin_unlock_bh(&css_set_lock);
4056         }
4057
4058         if (it->cur_task)
4059                 put_task_struct(it->cur_task);
4060 }
4061
4062 /**
4063  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
4064  * @to: cgroup to which the tasks will be moved
4065  * @from: cgroup in which the tasks currently reside
4066  *
4067  * Locking rules between cgroup_post_fork() and the migration path
4068  * guarantee that, if a task is forking while being migrated, the new child
4069  * is guaranteed to be either visible in the source cgroup after the
4070  * parent's migration is complete or put into the target cgroup.  No task
4071  * can slip out of migration through forking.
4072  */
4073 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4074 {
4075         LIST_HEAD(preloaded_csets);
4076         struct cgrp_cset_link *link;
4077         struct css_task_iter it;
4078         struct task_struct *task;
4079         int ret;
4080
4081         mutex_lock(&cgroup_mutex);
4082
4083         /* all tasks in @from are being moved, all csets are source */
4084         spin_lock_bh(&css_set_lock);
4085         list_for_each_entry(link, &from->cset_links, cset_link)
4086                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4087         spin_unlock_bh(&css_set_lock);
4088
4089         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
4090         if (ret)
4091                 goto out_err;
4092
4093         /*
4094          * Migrate tasks one-by-one until @from is empty.  This fails iff
4095          * ->can_attach() fails.
4096          */
4097         do {
4098                 css_task_iter_start(&from->self, &it);
4099                 task = css_task_iter_next(&it);
4100                 if (task)
4101                         get_task_struct(task);
4102                 css_task_iter_end(&it);
4103
4104                 if (task) {
4105                         ret = cgroup_migrate(task, false, to);
4106                         put_task_struct(task);
4107                 }
4108         } while (task && !ret);
4109 out_err:
4110         cgroup_migrate_finish(&preloaded_csets);
4111         mutex_unlock(&cgroup_mutex);
4112         return ret;
4113 }
4114
4115 /*
4116  * Stuff for reading the 'tasks'/'procs' files.
4117  *
4118  * Reading this file can return large amounts of data if a cgroup has
4119  * *lots* of attached tasks. So it may need several calls to read(),
4120  * but we cannot guarantee that the information we produce is correct
4121  * unless we produce it entirely atomically.
4122  *
4123  */
4124
/* selects which of the pid-listing files a pidlist is built for */
enum cgroup_filetype {
	CGROUP_FILE_PROCS = 0,	/* list holds thread group ids */
	CGROUP_FILE_TASKS = 1,	/* list holds individual task pids */
};
4130
4131 /*
4132  * A pidlist is a list of pids that virtually represents the contents of one
4133  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
4134  * a pair (one each for procs, tasks) for each pid namespace that's relevant
4135  * to the cgroup.
4136  */
4137 struct cgroup_pidlist {
4138         /*
4139          * used to find which pidlist is wanted. doesn't change as long as
4140          * this particular list stays in the list.
4141         */
4142         struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
4143         /* array of xids */
4144         pid_t *list;
4145         /* how many elements the above list has */
4146         int length;
4147         /* each of these stored in a list by its cgroup */
4148         struct list_head links;
4149         /* pointer to the cgroup we belong to, for list removal purposes */
4150         struct cgroup *owner;
4151         /* for delayed destruction */
4152         struct delayed_work destroy_dwork;
4153 };
4154
4155 /*
4156  * The following two functions "fix" the issue where there are more pids
4157  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
4158  * TODO: replace with a kernel-wide solution to this problem
4159  */
4160 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
4161 static void *pidlist_allocate(int count)
4162 {
4163         if (PIDLIST_TOO_LARGE(count))
4164                 return vmalloc(count * sizeof(pid_t));
4165         else
4166                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
4167 }
4168
4169 static void pidlist_free(void *p)
4170 {
4171         kvfree(p);
4172 }
4173
4174 /*
4175  * Used to destroy all pidlists lingering waiting for destroy timer.  None
4176  * should be left afterwards.
4177  */
4178 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
4179 {
4180         struct cgroup_pidlist *l, *tmp_l;
4181
4182         mutex_lock(&cgrp->pidlist_mutex);
4183         list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
4184                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
4185         mutex_unlock(&cgrp->pidlist_mutex);
4186
4187         flush_workqueue(cgroup_pidlist_destroy_wq);
4188         BUG_ON(!list_empty(&cgrp->pidlists));
4189 }
4190
4191 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
4192 {
4193         struct delayed_work *dwork = to_delayed_work(work);
4194         struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
4195                                                 destroy_dwork);
4196         struct cgroup_pidlist *tofree = NULL;
4197
4198         mutex_lock(&l->owner->pidlist_mutex);
4199
4200         /*
4201          * Destroy iff we didn't get queued again.  The state won't change
4202          * as destroy_dwork can only be queued while locked.
4203          */
4204         if (!delayed_work_pending(dwork)) {
4205                 list_del(&l->links);
4206                 pidlist_free(l->list);
4207                 put_pid_ns(l->key.ns);
4208                 tofree = l;
4209         }
4210
4211         mutex_unlock(&l->owner->pidlist_mutex);
4212         kfree(tofree);
4213 }
4214
/*
 * pidlist_uniq - strip duplicate entries from a pid array in place
 *
 * Only adjacent entries are compared, so duplicates are removed when equal
 * values sit next to each other (e.g. in a sorted array).  Surviving
 * entries are compacted to the front of @list.
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int rd, wr = 1;

	/* nothing to compact in an empty or single-element array */
	if (length == 0 || length == 1)
		return length;

	/*
	 * Element 0 is always kept.  @rd scans the rest of the array while
	 * @wr marks the slot for the next kept element and thereby counts
	 * the unique ones.
	 */
	for (rd = 1; rd < length; rd++) {
		if (list[rd] == list[rd - 1])
			continue;
		list[wr++] = list[rd];
	}

	return wr;
}
4244
/*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member
 * tasks sorted by task pointer.  As pidlists can be fairly large,
 * allocating one per open file is dangerous, so cgroup had to implement a
 * shared pool of pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if on the
 * default hierarchy so that no such expectation exists in the new
 * interface.
 *
 * Scrambling is done by swapping every two consecutive bits, which is a
 * non-identity one-to-one mapping which disturbs sort order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
	unsigned v = pid;

	/* even-numbered bits move up one position, odd-numbered ones down */
	return ((v & 0x55555555) << 1) | ((v & 0xAAAAAAAA) >> 1);
}
4270
/* scramble @pid with pid_fry() iff cgroup_on_dfl() holds for @cgrp */
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
	return cgroup_on_dfl(cgrp) ? pid_fry(pid) : pid;
}
4278
/* sort() comparison callback: order pids by plain numeric value */
static int cmppid(const void *a, const void *b)
{
	const pid_t *lhs = a;
	const pid_t *rhs = b;

	return *lhs - *rhs;
}
4283
/* sort() comparison callback: order pids by their pid_fry()ed value */
static int fried_cmppid(const void *a, const void *b)
{
	pid_t fried_a = pid_fry(*(pid_t *)a);
	pid_t fried_b = pid_fry(*(pid_t *)b);

	return fried_a - fried_b;
}
4288
4289 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
4290                                                   enum cgroup_filetype type)
4291 {
4292         struct cgroup_pidlist *l;
4293         /* don't need task_nsproxy() if we're looking at ourself */
4294         struct pid_namespace *ns = task_active_pid_ns(current);
4295
4296         lockdep_assert_held(&cgrp->pidlist_mutex);
4297
4298         list_for_each_entry(l, &cgrp->pidlists, links)
4299                 if (l->key.type == type && l->key.ns == ns)
4300                         return l;
4301         return NULL;
4302 }
4303
4304 /*
4305  * find the appropriate pidlist for our purpose (given procs vs tasks)
4306  * returns with the lock on that pidlist already held, and takes care
4307  * of the use count, or returns NULL with no locks held if we're out of
4308  * memory.
4309  */
4310 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
4311                                                 enum cgroup_filetype type)
4312 {
4313         struct cgroup_pidlist *l;
4314
4315         lockdep_assert_held(&cgrp->pidlist_mutex);
4316
4317         l = cgroup_pidlist_find(cgrp, type);
4318         if (l)
4319                 return l;
4320
4321         /* entry not found; create a new one */
4322         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
4323         if (!l)
4324                 return l;
4325
4326         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
4327         l->key.type = type;
4328         /* don't need task_nsproxy() if we're looking at ourself */
4329         l->key.ns = get_pid_ns(task_active_pid_ns(current));
4330         l->owner = cgrp;
4331         list_add(&l->links, &cgrp->pidlists);
4332         return l;
4333 }
4334
4335 /*
4336  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
4337  */
4338 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
4339                               struct cgroup_pidlist **lp)
4340 {
4341         pid_t *array;
4342         int length;
4343         int pid, n = 0; /* used for populating the array */
4344         struct css_task_iter it;
4345         struct task_struct *tsk;
4346         struct cgroup_pidlist *l;
4347
4348         lockdep_assert_held(&cgrp->pidlist_mutex);
4349
4350         /*
4351          * If cgroup gets more users after we read count, we won't have
4352          * enough space - tough.  This race is indistinguishable to the
4353          * caller from the case that the additional cgroup users didn't
4354          * show up until sometime later on.
4355          */
4356         length = cgroup_task_count(cgrp);
4357         array = pidlist_allocate(length);
4358         if (!array)
4359                 return -ENOMEM;
4360         /* now, populate the array */
4361         css_task_iter_start(&cgrp->self, &it);
4362         while ((tsk = css_task_iter_next(&it))) {
4363                 if (unlikely(n == length))
4364                         break;
4365                 /* get tgid or pid for procs or tasks file respectively */
4366                 if (type == CGROUP_FILE_PROCS)
4367                         pid = task_tgid_vnr(tsk);
4368                 else
4369                         pid = task_pid_vnr(tsk);
4370                 if (pid > 0) /* make sure to only use valid results */
4371                         array[n++] = pid;
4372         }
4373         css_task_iter_end(&it);
4374         length = n;
4375         /* now sort & (if procs) strip out duplicates */
4376         if (cgroup_on_dfl(cgrp))
4377                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
4378         else
4379                 sort(array, length, sizeof(pid_t), cmppid, NULL);
4380         if (type == CGROUP_FILE_PROCS)
4381                 length = pidlist_uniq(array, length);
4382
4383         l = cgroup_pidlist_find_create(cgrp, type);
4384         if (!l) {
4385                 pidlist_free(array);
4386                 return -ENOMEM;
4387         }
4388
4389         /* store array, freeing old if necessary */
4390         pidlist_free(l->list);
4391         l->list = array;
4392         l->length = length;
4393         *lp = l;
4394         return 0;
4395 }
4396
4397 /**
4398  * cgroupstats_build - build and fill cgroupstats
4399  * @stats: cgroupstats to fill information into
4400  * @dentry: A dentry entry belonging to the cgroup for which stats have
4401  * been requested.
4402  *
4403  * Build and fill cgroupstats so that taskstats can export it to user
4404  * space.
4405  */
4406 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
4407 {
4408         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4409         struct cgroup *cgrp;
4410         struct css_task_iter it;
4411         struct task_struct *tsk;
4412
4413         /* it should be kernfs_node belonging to cgroupfs and is a directory */
4414         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4415             kernfs_type(kn) != KERNFS_DIR)
4416                 return -EINVAL;
4417
4418         mutex_lock(&cgroup_mutex);
4419
4420         /*
4421          * We aren't being called from kernfs and there's no guarantee on
4422          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
4423          * @kn->priv is RCU safe.  Let's do the RCU dancing.
4424          */
4425         rcu_read_lock();
4426         cgrp = rcu_dereference(kn->priv);
4427         if (!cgrp || cgroup_is_dead(cgrp)) {
4428                 rcu_read_unlock();
4429                 mutex_unlock(&cgroup_mutex);
4430                 return -ENOENT;
4431         }
4432         rcu_read_unlock();
4433
4434         css_task_iter_start(&cgrp->self, &it);
4435         while ((tsk = css_task_iter_next(&it))) {
4436                 switch (tsk->state) {
4437                 case TASK_RUNNING:
4438                         stats->nr_running++;
4439                         break;
4440                 case TASK_INTERRUPTIBLE:
4441                         stats->nr_sleeping++;
4442                         break;
4443                 case TASK_UNINTERRUPTIBLE:
4444                         stats->nr_uninterruptible++;
4445                         break;
4446                 case TASK_STOPPED:
4447                         stats->nr_stopped++;
4448                         break;
4449                 default:
4450                         if (delayacct_is_task_waiting_on_io(tsk))
4451                                 stats->nr_io_wait++;
4452                         break;
4453                 }
4454         }
4455         css_task_iter_end(&it);
4456
4457         mutex_unlock(&cgroup_mutex);
4458         return 0;
4459 }
4460
4461
4462 /*
4463  * seq_file methods for the tasks/procs files. The seq_file position is the
4464  * next pid to display; the seq_file iterator is a pointer to the pid
4465  * in the cgroup->l->list array.
4466  */
4467
4468 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4469 {
4470         /*
4471          * Initially we receive a position value that corresponds to
4472          * one more than the last pid shown (or 0 on the first call or
4473          * after a seek to the start). Use a binary-search to find the
4474          * next pid to display, if any
4475          */
4476         struct kernfs_open_file *of = s->private;
4477         struct cgroup *cgrp = seq_css(s)->cgroup;
4478         struct cgroup_pidlist *l;
4479         enum cgroup_filetype type = seq_cft(s)->private;
4480         int index = 0, pid = *pos;
4481         int *iter, ret;
4482
4483         mutex_lock(&cgrp->pidlist_mutex);
4484
4485         /*
4486          * !NULL @of->priv indicates that this isn't the first start()
4487          * after open.  If the matching pidlist is around, we can use that.
4488          * Look for it.  Note that @of->priv can't be used directly.  It
4489          * could already have been destroyed.
4490          */
4491         if (of->priv)
4492                 of->priv = cgroup_pidlist_find(cgrp, type);
4493
4494         /*
4495          * Either this is the first start() after open or the matching
4496          * pidlist has been destroyed inbetween.  Create a new one.
4497          */
4498         if (!of->priv) {
4499                 ret = pidlist_array_load(cgrp, type,
4500                                          (struct cgroup_pidlist **)&of->priv);
4501                 if (ret)
4502                         return ERR_PTR(ret);
4503         }
4504         l = of->priv;
4505
4506         if (pid) {
4507                 int end = l->length;
4508
4509                 while (index < end) {
4510                         int mid = (index + end) / 2;
4511                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4512                                 index = mid;
4513                                 break;
4514                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4515                                 index = mid + 1;
4516                         else
4517                                 end = mid;
4518                 }
4519         }
4520         /* If we're off the end of the array, we're done */
4521         if (index >= l->length)
4522                 return NULL;
4523         /* Update the abstract position to be the actual pid that we found */
4524         iter = l->list + index;
4525         *pos = cgroup_pid_fry(cgrp, *iter);
4526         return iter;
4527 }
4528
4529 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4530 {
4531         struct kernfs_open_file *of = s->private;
4532         struct cgroup_pidlist *l = of->priv;
4533
4534         if (l)
4535                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4536                                  CGROUP_PIDLIST_DESTROY_DELAY);
4537         mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4538 }
4539
4540 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
4541 {
4542         struct kernfs_open_file *of = s->private;
4543         struct cgroup_pidlist *l = of->priv;
4544         pid_t *p = v;
4545         pid_t *end = l->list + l->length;
4546         /*
4547          * Advance to the next pid in the array. If this goes off the
4548          * end, we're done
4549          */
4550         p++;
4551         if (p >= end) {
4552                 return NULL;
4553         } else {
4554                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4555                 return p;
4556         }
4557 }
4558
/* seq_file ->show(): emit the pid @v points at, one per line */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	int *pid = v;

	seq_printf(s, "%d\n", *pid);
	return 0;
}
4565
4566 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4567                                          struct cftype *cft)
4568 {
4569         return notify_on_release(css->cgroup);
4570 }
4571
4572 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4573                                           struct cftype *cft, u64 val)
4574 {
4575         if (val)
4576                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4577         else
4578                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4579         return 0;
4580 }
4581
4582 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4583                                       struct cftype *cft)
4584 {
4585         return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4586 }
4587
4588 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4589                                        struct cftype *cft, u64 val)
4590 {
4591         if (val)
4592                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4593         else
4594                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4595         return 0;
4596 }
4597
4598 /* cgroup core interface files for the default hierarchy */
4599 static struct cftype cgroup_dfl_base_files[] = {
4600         {
4601                 .name = "cgroup.procs",
4602                 .file_offset = offsetof(struct cgroup, procs_file),
4603                 .seq_start = cgroup_pidlist_start,
4604                 .seq_next = cgroup_pidlist_next,
4605                 .seq_stop = cgroup_pidlist_stop,
4606                 .seq_show = cgroup_pidlist_show,
4607                 .private = CGROUP_FILE_PROCS,
4608                 .write = cgroup_procs_write,
4609         },
4610         {
4611                 .name = "cgroup.controllers",
4612                 .seq_show = cgroup_controllers_show,
4613         },
4614         {
4615                 .name = "cgroup.subtree_control",
4616                 .seq_show = cgroup_subtree_control_show,
4617                 .write = cgroup_subtree_control_write,
4618         },
4619         {
4620                 .name = "cgroup.events",
4621                 .flags = CFTYPE_NOT_ON_ROOT,
4622                 .file_offset = offsetof(struct cgroup, events_file),
4623                 .seq_show = cgroup_events_show,
4624         },
4625         { }     /* terminate */
4626 };
4627
4628 /* cgroup core interface files for the legacy hierarchies */
4629 static struct cftype cgroup_legacy_base_files[] = {
4630         {
4631                 .name = "cgroup.procs",
4632                 .seq_start = cgroup_pidlist_start,
4633                 .seq_next = cgroup_pidlist_next,
4634                 .seq_stop = cgroup_pidlist_stop,
4635                 .seq_show = cgroup_pidlist_show,
4636                 .private = CGROUP_FILE_PROCS,
4637                 .write = cgroup_procs_write,
4638         },
4639         {
4640                 .name = "cgroup.clone_children",
4641                 .read_u64 = cgroup_clone_children_read,
4642                 .write_u64 = cgroup_clone_children_write,
4643         },
4644         {
4645                 .name = "cgroup.sane_behavior",
4646                 .flags = CFTYPE_ONLY_ON_ROOT,
4647                 .seq_show = cgroup_sane_behavior_show,
4648         },
4649         {
4650                 .name = "tasks",
4651                 .seq_start = cgroup_pidlist_start,
4652                 .seq_next = cgroup_pidlist_next,
4653                 .seq_stop = cgroup_pidlist_stop,
4654                 .seq_show = cgroup_pidlist_show,
4655                 .private = CGROUP_FILE_TASKS,
4656                 .write = cgroup_tasks_write,
4657         },
4658         {
4659                 .name = "notify_on_release",
4660                 .read_u64 = cgroup_read_notify_on_release,
4661                 .write_u64 = cgroup_write_notify_on_release,
4662         },
4663         {
4664                 .name = "release_agent",
4665                 .flags = CFTYPE_ONLY_ON_ROOT,
4666                 .seq_show = cgroup_release_agent_show,
4667                 .write = cgroup_release_agent_write,
4668                 .max_write_len = PATH_MAX - 1,
4669         },
4670         { }     /* terminate */
4671 };
4672
4673 /*
4674  * css destruction is four-stage process.
4675  *
4676  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4677  *    Implemented in kill_css().
4678  *
4679  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4680  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4681  *    offlined by invoking offline_css().  After offlining, the base ref is
4682  *    put.  Implemented in css_killed_work_fn().
4683  *
4684  * 3. When the percpu_ref reaches zero, the only possible remaining
4685  *    accessors are inside RCU read sections.  css_release() schedules the
4686  *    RCU callback.
4687  *
4688  * 4. After the grace period, the css can be freed.  Implemented in
4689  *    css_free_work_fn().
4690  *
4691  * It is actually hairier because both step 2 and 4 require process context
4692  * and thus involve punting to css->destroy_work adding two additional
4693  * steps to the already complex sequence.
4694  */
4695 static void css_free_work_fn(struct work_struct *work)
4696 {
4697         struct cgroup_subsys_state *css =
4698                 container_of(work, struct cgroup_subsys_state, destroy_work);
4699         struct cgroup_subsys *ss = css->ss;
4700         struct cgroup *cgrp = css->cgroup;
4701
4702         percpu_ref_exit(&css->refcnt);
4703
4704         if (ss) {
4705                 /* css free path */
4706                 struct cgroup_subsys_state *parent = css->parent;
4707                 int id = css->id;
4708
4709                 ss->css_free(css);
4710                 cgroup_idr_remove(&ss->css_idr, id);
4711                 cgroup_put(cgrp);
4712
4713                 if (parent)
4714                         css_put(parent);
4715         } else {
4716                 /* cgroup free path */
4717                 atomic_dec(&cgrp->root->nr_cgrps);
4718                 cgroup_pidlist_destroy_all(cgrp);
4719                 cancel_work_sync(&cgrp->release_agent_work);
4720
4721                 if (cgroup_parent(cgrp)) {
4722                         /*
4723                          * We get a ref to the parent, and put the ref when
4724                          * this cgroup is being freed, so it's guaranteed
4725                          * that the parent won't be destroyed before its
4726                          * children.
4727                          */
4728                         cgroup_put(cgroup_parent(cgrp));
4729                         kernfs_put(cgrp->kn);
4730                         kfree(cgrp);
4731                 } else {
4732                         /*
4733                          * This is root cgroup's refcnt reaching zero,
4734                          * which indicates that the root should be
4735                          * released.
4736                          */
4737                         cgroup_destroy_root(cgrp->root);
4738                 }
4739         }
4740 }
4741
4742 static void css_free_rcu_fn(struct rcu_head *rcu_head)
4743 {
4744         struct cgroup_subsys_state *css =
4745                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4746
4747         INIT_WORK(&css->destroy_work, css_free_work_fn);
4748         queue_work(cgroup_destroy_wq, &css->destroy_work);
4749 }
4750
4751 static void css_release_work_fn(struct work_struct *work)
4752 {
4753         struct cgroup_subsys_state *css =
4754                 container_of(work, struct cgroup_subsys_state, destroy_work);
4755         struct cgroup_subsys *ss = css->ss;
4756         struct cgroup *cgrp = css->cgroup;
4757
4758         mutex_lock(&cgroup_mutex);
4759
4760         css->flags |= CSS_RELEASED;
4761         list_del_rcu(&css->sibling);
4762
4763         if (ss) {
4764                 /* css release path */
4765                 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
4766                 if (ss->css_released)
4767                         ss->css_released(css);
4768         } else {
4769                 /* cgroup release path */
4770                 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4771                 cgrp->id = -1;
4772
4773                 /*
4774                  * There are two control paths which try to determine
4775                  * cgroup from dentry without going through kernfs -
4776                  * cgroupstats_build() and css_tryget_online_from_dir().
4777                  * Those are supported by RCU protecting clearing of
4778                  * cgrp->kn->priv backpointer.
4779                  */
4780                 if (cgrp->kn)
4781                         RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4782                                          NULL);
4783         }
4784
4785         mutex_unlock(&cgroup_mutex);
4786
4787         call_rcu(&css->rcu_head, css_free_rcu_fn);
4788 }
4789
4790 static void css_release(struct percpu_ref *ref)
4791 {
4792         struct cgroup_subsys_state *css =
4793                 container_of(ref, struct cgroup_subsys_state, refcnt);
4794
4795         INIT_WORK(&css->destroy_work, css_release_work_fn);
4796         queue_work(cgroup_destroy_wq, &css->destroy_work);
4797 }
4798
4799 static void init_and_link_css(struct cgroup_subsys_state *css,
4800                               struct cgroup_subsys *ss, struct cgroup *cgrp)
4801 {
4802         lockdep_assert_held(&cgroup_mutex);
4803
4804         cgroup_get(cgrp);
4805
4806         memset(css, 0, sizeof(*css));
4807         css->cgroup = cgrp;
4808         css->ss = ss;
4809         INIT_LIST_HEAD(&css->sibling);
4810         INIT_LIST_HEAD(&css->children);
4811         css->serial_nr = css_serial_nr_next++;
4812         atomic_set(&css->online_cnt, 0);
4813
4814         if (cgroup_parent(cgrp)) {
4815                 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
4816                 css_get(css->parent);
4817         }
4818
4819         BUG_ON(cgroup_css(cgrp, ss));
4820 }
4821
4822 /* invoke ->css_online() on a new CSS and mark it online if successful */
4823 static int online_css(struct cgroup_subsys_state *css)
4824 {
4825         struct cgroup_subsys *ss = css->ss;
4826         int ret = 0;
4827
4828         lockdep_assert_held(&cgroup_mutex);
4829
4830         if (ss->css_online)
4831                 ret = ss->css_online(css);
4832         if (!ret) {
4833                 css->flags |= CSS_ONLINE;
4834                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4835
4836                 atomic_inc(&css->online_cnt);
4837                 if (css->parent)
4838                         atomic_inc(&css->parent->online_cnt);
4839         }
4840         return ret;
4841 }
4842
4843 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4844 static void offline_css(struct cgroup_subsys_state *css)
4845 {
4846         struct cgroup_subsys *ss = css->ss;
4847
4848         lockdep_assert_held(&cgroup_mutex);
4849
4850         if (!(css->flags & CSS_ONLINE))
4851                 return;
4852
4853         if (ss->css_reset)
4854                 ss->css_reset(css);
4855
4856         if (ss->css_offline)
4857                 ss->css_offline(css);
4858
4859         css->flags &= ~CSS_ONLINE;
4860         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4861
4862         wake_up_all(&css->cgroup->offline_waitq);
4863 }
4864
4865 /**
4866  * css_create - create a cgroup_subsys_state
4867  * @cgrp: the cgroup new css will be associated with
4868  * @ss: the subsys of new css
4869  *
4870  * Create a new css associated with @cgrp - @ss pair.  On success, the new
4871  * css is online and installed in @cgrp.  This function doesn't create the
4872  * interface files.  Returns the new css on success, ERR_PTR(-errno) on failure.
      *
      * Must be called with cgroup_mutex held (asserted below).
4873  */
4874 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
4875                                               struct cgroup_subsys *ss)
4876 {
4877         struct cgroup *parent = cgroup_parent(cgrp);
4878         struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
4879         struct cgroup_subsys_state *css;
4880         int err;
4881
4882         lockdep_assert_held(&cgroup_mutex);
4883
             /* ->css_alloc() reports failure as an ERR_PTR; hand it back as-is */
4884         css = ss->css_alloc(parent_css);
4885         if (IS_ERR(css))
4886                 return css;
4887
4888         init_and_link_css(css, ss, cgrp);
4889
4890         err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4891         if (err)
4892                 goto err_free_css;
4893
             /* reserve an id with a NULL entry; the css pointer is installed below */
4894         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4895         if (err < 0)
4896                 goto err_free_percpu_ref;
4897         css->id = err;
4898
4899         /* @css is ready to be brought online now, make it visible */
4900         list_add_tail_rcu(&css->sibling, &parent_css->children);
4901         cgroup_idr_replace(&ss->css_idr, css, css->id);
4902
4903         err = online_css(css);
4904         if (err)
4905                 goto err_list_del;
4906
             /*
              * Warn at most once per subsystem (warned_broken_hierarchy) when
              * @cgrp's parent itself has a parent, i.e. the new cgroup is nested.
              */
4907         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4908             cgroup_parent(parent)) {
4909                 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4910                         current->comm, current->pid, ss->name);
4911                 if (!strcmp(ss->name, "memory"))
4912                         pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4913                 ss->warned_broken_hierarchy = true;
4914         }
4915
4916         return css;
4917
             /* error labels unwind in the reverse order of the setup steps above */
4918 err_list_del:
4919         list_del_rcu(&css->sibling);
4920         cgroup_idr_remove(&ss->css_idr, css->id);
4921 err_free_percpu_ref:
4922         percpu_ref_exit(&css->refcnt);
4923 err_free_css:
             /* freeing is deferred through call_rcu() rather than done inline */
4924         call_rcu(&css->rcu_head, css_free_rcu_fn);
4925         return ERR_PTR(err);
4926 }
4927
/*
 * cgroup_create - allocate a child cgroup of @parent and create its csses
 * @parent: the cgroup the new cgroup is created under
 *
 * Allocates the cgroup (including room for level + 1 ancestor ids), links
 * it to @parent and its root, then calls css_create() for every subsystem
 * in cgroup_ss_mask().  css_create() asserts cgroup_mutex, so the caller
 * must hold it.  Returns the new cgroup or ERR_PTR(-errno).  Failures
 * before the cgroup is linked are unwound by hand; a css_create() failure
 * goes through cgroup_destroy_locked().
 */
4928 static struct cgroup *cgroup_create(struct cgroup *parent)
4929 {
4930         struct cgroup_root *root = parent->root;
4931         struct cgroup_subsys *ss;
4932         struct cgroup *cgrp, *tcgrp;
4933         int level = parent->level + 1;
4934         int ssid, ret;
4935
4936         /* allocate the cgroup and its ID, 0 is reserved for the root */
4937         cgrp = kzalloc(sizeof(*cgrp) +
4938                        sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
4939         if (!cgrp)
4940                 return ERR_PTR(-ENOMEM);
4941
4942         ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4943         if (ret)
4944                 goto out_free_cgrp;
4945
4946         /*
4947          * Temporarily set the pointer to NULL, so idr_find() won't return
4948          * a half-baked cgroup.
4949          */
4950         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4951         if (cgrp->id < 0) {
                     /* NOTE(review): the errno from cgroup_idr_alloc() is replaced by -ENOMEM */
4952                 ret = -ENOMEM;
4953                 goto out_cancel_ref;
4954         }
4955
4956         init_cgroup_housekeeping(cgrp);
4957
4958         cgrp->self.parent = &parent->self;
4959         cgrp->root = root;
4960         cgrp->level = level;
4961
             /* record the id of @cgrp and of each ancestor, indexed by level */
4962         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
4963                 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
4964
             /* these two flags are copied from @parent when set there */
4965         if (notify_on_release(parent))
4966                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4967
4968         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4969                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4970
4971         cgrp->self.serial_nr = css_serial_nr_next++;
4972
4973         /* allocation complete, commit to creation */
4974         list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
4975         atomic_inc(&root->nr_cgrps);
4976         cgroup_get(parent);
4977
4978         /*
4979          * @cgrp is now fully operational.  If something fails after this
4980          * point, it'll be released via the normal destruction path.
4981          */
4982         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4983
4984         /* create the csses */
4985         do_each_subsys_mask(ss, ssid, cgroup_ss_mask(cgrp)) {
4986                 struct cgroup_subsys_state *css;
4987
4988                 css = css_create(cgrp, ss);
4989                 if (IS_ERR(css)) {
4990                         ret = PTR_ERR(css);
4991                         goto out_destroy;
4992                 }
4993         } while_each_subsys_mask();
4994
4995         /*
4996          * On the default hierarchy, a child doesn't automatically inherit
4997          * subtree_control from the parent.  Each is configured manually.
4998          */
4999         if (!cgroup_on_dfl(cgrp)) {
5000                 cgrp->subtree_control = cgroup_control(cgrp);
5001                 cgroup_refresh_subtree_ss_mask(cgrp);
5002         }
5003
5004         return cgrp;
5005
             /* manual unwind for failures that happen before @cgrp is linked */
5006 out_cancel_ref:
5007         percpu_ref_exit(&cgrp->self.refcnt);
5008 out_free_cgrp:
5009         kfree(cgrp);
5010         return ERR_PTR(ret);
             /* @cgrp is already linked and visible; use the regular destruction path */
5011 out_destroy:
5012         cgroup_destroy_locked(cgrp);
5013         return ERR_PTR(ret);
5014 }
5015
/*
 * cgroup_mkdir - ->mkdir callback installed in cgroup_kf_syscall_ops
 * @parent_kn: kernfs node of the directory the new cgroup is created in
 * @name: name of the new directory; rejected if it contains a newline
 * @mode: mode handed through to kernfs_create_dir()
 *
 * Creates the cgroup with cgroup_create(), then its kernfs directory and
 * interface files, and finally activates the kernfs node.  Any failure
 * after cgroup_create() succeeded tears the cgroup down again with
 * cgroup_destroy_locked().  Returns 0 on success, -EINVAL for a bad name,
 * -ENODEV if cgroup_kn_lock_live() returns NULL, otherwise the -errno of
 * the step that failed.
 */
5016 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
5017                         umode_t mode)
5018 {
5019         struct cgroup *parent, *cgrp;
5020         struct cgroup_subsys *ss;
5021         struct kernfs_node *kn;
5022         int ssid, ret;
5023
5024         /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
5025         if (strchr(name, '\n'))
5026                 return -EINVAL;
5027
             /* paired with cgroup_kn_unlock() at out_unlock */
5028         parent = cgroup_kn_lock_live(parent_kn);
5029         if (!parent)
5030                 return -ENODEV;
5031
5032         cgrp = cgroup_create(parent);
5033         if (IS_ERR(cgrp)) {
5034                 ret = PTR_ERR(cgrp);
5035                 goto out_unlock;
5036         }
5037
5038         /* create the directory */
5039         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5040         if (IS_ERR(kn)) {
5041                 ret = PTR_ERR(kn);
5042                 goto out_destroy;
5043         }
5044         cgrp->kn = kn;
5045
5046         /*
5047          * This extra ref will be put in cgroup_free_fn() and guarantees
5048          * that @cgrp->kn is always accessible.
5049          */
5050         kernfs_get(kn);
5051
5052         ret = cgroup_kn_set_ugid(kn);
5053         if (ret)
5054                 goto out_destroy;
5055
             /* populate files for the cgroup itself, then for each css in cgroup_control() */
5056         ret = css_populate_dir(&cgrp->self, NULL);
5057         if (ret)
5058                 goto out_destroy;
5059
5060         do_each_subsys_mask(ss, ssid, cgroup_control(cgrp)) {
5061                 ret = css_populate_dir(cgroup_css(cgrp, ss), NULL);
5062                 if (ret)
5063                         goto out_destroy;
5064         } while_each_subsys_mask();
5065
5066         /* csses were already created by cgroup_create(); activate the new directory */
5067         kernfs_activate(kn);
5068
5069         ret = 0;
5070         goto out_unlock;
5071
5072 out_destroy:
5073         cgroup_destroy_locked(cgrp);
5074 out_unlock:
5075         cgroup_kn_unlock(parent_kn);
5076         return ret;
5077 }
5078
5079 /*
5080  * This is called when the refcnt of a css is confirmed to be killed.
5081  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
5082  * initiate destruction and put the css ref from kill_css().
      *
      * Runs as a work item queued by css_killed_ref_fn() and takes
      * cgroup_mutex itself.
5083  */
5084 static void css_killed_work_fn(struct work_struct *work)
5085 {
5086         struct cgroup_subsys_state *css =
5087                 container_of(work, struct cgroup_subsys_state, destroy_work);
5088
5089         mutex_lock(&cgroup_mutex);
5090
             /*
              * Offline @css, then walk up: each parent whose online_cnt drops
              * to zero because of this is offlined (and put) in the same pass.
              */
5091         do {
5092                 offline_css(css);
5093                 css_put(css);
5094                 /* @css can't go away while we're holding cgroup_mutex */
5095                 css = css->parent;
5096         } while (css && atomic_dec_and_test(&css->online_cnt));
5097
5098         mutex_unlock(&cgroup_mutex);
5099 }
5100
5101 /* css kill confirmation processing requires process context, bounce */
5102 static void css_killed_ref_fn(struct percpu_ref *ref)
5103 {
5104         struct cgroup_subsys_state *css =
5105                 container_of(ref, struct cgroup_subsys_state, refcnt);
5106
             /*
              * Only the decrement that takes online_cnt to zero queues
              * css_killed_work_fn() on cgroup_destroy_wq; otherwise nothing
              * is done here.
              */
5107         if (atomic_dec_and_test(&css->online_cnt)) {
5108                 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5109                 queue_work(cgroup_destroy_wq, &css->destroy_work);
5110         }
5111 }
5112
5113 /**
5114  * kill_css - destroy a css
5115  * @css: css to destroy
5116  *
5117  * This function initiates destruction of @css by removing cgroup interface
5118  * files and putting its base reference.  ->css_offline() will be invoked
5119  * asynchronously once css_tryget_online() is guaranteed to fail and when
5120  * the reference count reaches zero, @css will be released.
      *
      * Must be called with cgroup_mutex held (asserted below).
5121  */
5122 static void kill_css(struct cgroup_subsys_state *css)
5123 {
5124         lockdep_assert_held(&cgroup_mutex);
5125
5126         /*
5127          * This must happen before css is disassociated with its cgroup.
5128          * See seq_css() for details.
5129          */
5130         css_clear_dir(css, NULL);
5131
5132         /*
5133          * Killing would put the base ref, but we need to keep it alive
5134          * until after ->css_offline(); css_killed_work_fn() puts this ref.
5135          */
5136         css_get(css);
5137
5138         /*
5139          * cgroup core guarantees that, by the time ->css_offline() is
5140          * invoked, no new css reference will be given out via
5141          * css_tryget_online().  We can't simply call percpu_ref_kill() and
5142          * proceed to offlining css's because percpu_ref_kill() doesn't
5143          * guarantee that the ref is seen as killed on all CPUs on return.
5144          *
5145          * Use percpu_ref_kill_and_confirm() to get notifications as each
5146          * css is confirmed to be seen as killed on all CPUs.
5147          */
5148         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5149 }
5150
5151 /**
5152  * cgroup_destroy_locked - the first stage of cgroup destruction
5153  * @cgrp: cgroup to be destroyed
5154  *
5155  * css's make use of percpu refcnts whose killing latency shouldn't be
5156  * exposed to userland and are RCU protected.  Also, cgroup core needs to
5157  * guarantee that css_tryget_online() won't succeed by the time
5158  * ->css_offline() is invoked.  To satisfy all the requirements,
5159  * destruction is implemented in the following two steps.
5160  *
5161  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
5162  *     userland visible parts and start killing the percpu refcnts of
5163  *     css's.  Set up so that the next stage will be kicked off once all
5164  *     the percpu refcnts are confirmed to be killed.
5165  *
5166  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
5167  *     rest of destruction.  Once all cgroup references are gone, the
5168  *     cgroup is RCU-freed.
5169  *
5170  * This function implements s1.  After this step, @cgrp is gone as far as
5171  * the userland is concerned and a new cgroup with the same name may be
5172  * created.  As cgroup doesn't care about the names internally, this
5173  * doesn't cause any problem.
      *
      * Must be called with cgroup_mutex held (asserted below).  Returns 0 on
      * success, -EBUSY if @cgrp is populated or still has online children;
      * in the -EBUSY cases nothing has been modified.
5174  */
5175 static int cgroup_destroy_locked(struct cgroup *cgrp)
5176         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5177 {
5178         struct cgroup_subsys_state *css;
5179         int ssid;
5180
5181         lockdep_assert_held(&cgroup_mutex);
5182
5183         /*
5184          * Only migration can raise populated from zero and we're already
5185          * holding cgroup_mutex.
5186          */
5187         if (cgroup_is_populated(cgrp))
5188                 return -EBUSY;
5189
5190         /*
5191          * Make sure there's no live children.  We can't test emptiness of
5192          * ->self.children as dead children linger on it while being
5193          * drained; otherwise, "rmdir parent/child parent" may fail.
5194          */
5195         if (css_has_online_children(&cgrp->self))
5196                 return -EBUSY;
5197
5198         /*
5199          * Mark @cgrp dead.  This prevents further task migration and child
5200          * creation by disabling cgroup_lock_live_group().
5201          */
5202         cgrp->self.flags &= ~CSS_ONLINE;
5203
5204         /* initiate massacre of all css's */
5205         for_each_css(css, ssid, cgrp)
5206                 kill_css(css);
5207
5208         /*
5209          * Remove @cgrp directory along with the base files.  @cgrp has an
5210          * extra ref on its kn.
5211          */
5212         kernfs_remove(cgrp->kn);
5213
             /* losing this child may leave the parent eligible for release */
5214         check_for_release(cgroup_parent(cgrp));
5215
5216         /* put the base reference */
5217         percpu_ref_kill(&cgrp->self.refcnt);
5218
5219         return 0;
5220 }
5221
/*
 * cgroup_rmdir - ->rmdir callback installed in cgroup_kf_syscall_ops
 * @kn: kernfs node of the directory being removed
 *
 * Returns 0 when cgroup_kn_lock_live() yields no cgroup for @kn,
 * otherwise whatever cgroup_destroy_locked() returns.
 */
5222 static int cgroup_rmdir(struct kernfs_node *kn)
5223 {
5224         struct cgroup *cgrp;
5225         int ret = 0;
5226
             /* no live cgroup behind @kn is reported as success, not as an error */
5227         cgrp = cgroup_kn_lock_live(kn);
5228         if (!cgrp)
5229                 return 0;
5230
5231         ret = cgroup_destroy_locked(cgrp);
5232
5233         cgroup_kn_unlock(kn);
5234         return ret;
5235 }
5236
/* kernfs syscall callbacks provided by cgroup: remount, show_options, mkdir, rmdir and rename */
5237 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5238         .remount_fs             = cgroup_remount,
5239         .show_options           = cgroup_show_options,
5240         .mkdir                  = cgroup_mkdir,
5241         .rmdir                  = cgroup_rmdir,
5242         .rename                 = cgroup_rename,
5243 };
5244
/*
 * cgroup_init_subsys - boot-time setup of one subsystem
 * @ss: the subsystem to initialize
 * @early: true when called from cgroup_init_early(); the root css then
 *         gets id 1 assigned directly instead of through the idr
 *
 * Allocates the subsystem's root css on cgrp_dfl_root, installs it in
 * init_css_set, records which fork/exit/free/can_fork callbacks exist and
 * brings the css online.  Takes cgroup_mutex itself.  Every failure is
 * treated as fatal via BUG_ON().
 */
5245 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5246 {
5247         struct cgroup_subsys_state *css;
5248
5249         pr_debug("Initializing cgroup subsys %s\n", ss->name);
5250
5251         mutex_lock(&cgroup_mutex);
5252
5253         idr_init(&ss->css_idr);
5254         INIT_LIST_HEAD(&ss->cfts);
5255
5256         /* Create the root cgroup state for this subsystem */
5257         ss->root = &cgrp_dfl_root;
5258         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5259         /* We don't handle early failures gracefully */
5260         BUG_ON(IS_ERR(css));
5261         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5262
5263         /*
5264          * Root csses are never destroyed and we can't initialize
5265          * percpu_ref during early init.  Disable refcnting.
5266          */
5267         css->flags |= CSS_NO_REF;
5268
5269         if (early) {
5270                 /* allocation can't be done safely during early init */
5271                 css->id = 1;
5272         } else {
                     /* range [1, 2) only admits id 1, matching the early case */
5273                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5274                 BUG_ON(css->id < 0);
5275         }
5276
5277         /* Update the init_css_set to contain a subsys
5278          * pointer to this state - since the subsystem is
5279          * newly registered, all tasks and hence the
5280          * init_css_set is in the subsystem's root cgroup. */
5281         init_css_set.subsys[ss->id] = css;
5282
             /* set bit ss->id in each mask whose callback @ss implements */
5283         have_fork_callback |= (bool)ss->fork << ss->id;
5284         have_exit_callback |= (bool)ss->exit << ss->id;
5285         have_free_callback |= (bool)ss->free << ss->id;
5286         have_canfork_callback |= (bool)ss->can_fork << ss->id;
5287
5288         /* At system boot, before all subsystems have been
5289          * registered, no tasks have been forked, so we don't
5290          * need to invoke fork callbacks here. */
5291         BUG_ON(!list_empty(&init_task.tasks));
5292
5293         BUG_ON(online_css(css));
5294
5295         mutex_unlock(&cgroup_mutex);
5296 }
5297
5298 /**
5299  * cgroup_init_early - cgroup initialization at system boot
5300  *
5301  * Initialize cgroups at system boot, and initialize any
5302  * subsystems that request early init.  Always returns 0.
5303  */
5304 int __init cgroup_init_early(void)
5305 {
5306         static struct cgroup_sb_opts __initdata opts;
5307         struct cgroup_subsys *ss;
5308         int i;
5309
5310         init_cgroup_root(&cgrp_dfl_root, &opts);
5311         cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5312
5313         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5314
5315         for_each_subsys(ss, i) {
                     /*
                      * css_alloc and css_free are mandatory; name and id are
                      * expected to be unset here since they are assigned
                      * just below.
                      */
5316                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5317                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5318                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5319                      ss->id, ss->name);
5320                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5321                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5322
5323                 ss->id = i;
5324                 ss->name = cgroup_subsys_name[i];
                     /* legacy_name falls back to the regular name when not provided */
5325                 if (!ss->legacy_name)
5326                         ss->legacy_name = cgroup_subsys_name[i];
5327
5328                 if (ss->early_init)
5329                         cgroup_init_subsys(ss, true);
5330         }
5331         return 0;
5332 }
5333
5334 static u16 cgroup_disable_mask __initdata; /* bit N set: cgroup_init() disables the subsystem with id N */
5335
5336 /**
5337  * cgroup_init - cgroup initialization
5338  *
5339  * Register cgroup filesystem and /proc file, and initialize
5340  * any subsystems that didn't request early init.
      *
      * Always returns 0; setup failures are reported through BUG_ON() or
      * WARN_ON() instead of an error code.
5341  */
5342 int __init cgroup_init(void)
5343 {
5344         struct cgroup_subsys *ss;
5345         int ssid;
5346
             /* the per-subsystem masks used in this file are 16 bits wide */
5347         BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5348         BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5349         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5350         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5351
5352         mutex_lock(&cgroup_mutex);
5353
5354         /*
5355          * Add init_css_set to the hash table so that dfl_root can link to
5356          * it during init.
5357          */
5358         hash_add(css_set_table, &init_css_set.hlist,
5359                  css_set_hash(init_css_set.subsys));
5360
5361         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5362
5363         mutex_unlock(&cgroup_mutex);
5364
5365         for_each_subsys(ss, ssid) {
5366                 if (ss->early_init) {
                             /*
                              * Already initialized by cgroup_init_early() with
                              * id 1 set directly; register that id in the idr
                              * now.
                              */
5367                         struct cgroup_subsys_state *css =
5368                                 init_css_set.subsys[ss->id];
5369
5370                         css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5371                                                    GFP_KERNEL);
5372                         BUG_ON(css->id < 0);
5373                 } else {
5374                         cgroup_init_subsys(ss, false);
5375                 }
5376
5377                 list_add_tail(&init_css_set.e_cset_node[ssid],
5378                               &cgrp_dfl_root.cgrp.e_csets[ssid]);
5379
5380                 /*
5381                  * Setting dfl_root subsys_mask needs to consider the
5382                  * disabled flag and cftype registration needs kmalloc,
5383                  * both of which aren't available during early_init.
5384                  */
5385                 if (cgroup_disable_mask & (1 << ssid)) {
5386                         static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5387                         printk(KERN_INFO "Disabling %s control group subsystem\n",
5388                                ss->name);
                             /* skipped: no subsys_mask bit, no cftypes, no ->bind() */
5389                         continue;
5390                 }
5391
5392                 if (cgroup_ssid_no_v1(ssid))
5393                         printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5394                                ss->name);
5395
5396                 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5397
5398                 if (!ss->dfl_cftypes)
5399                         cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5400
                     /* a shared cftype array is registered once; distinct arrays separately */
5401                 if (ss->dfl_cftypes == ss->legacy_cftypes) {
5402                         WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5403                 } else {
5404                         WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5405                         WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5406                 }
5407
5408                 if (ss->bind)
5409                         ss->bind(init_css_set.subsys[ssid]);
5410         }
5411
5412         /* init_css_set.subsys[] has been updated, re-hash */
5413         hash_del(&init_css_set.hlist);
5414         hash_add(css_set_table, &init_css_set.hlist,
5415                  css_set_hash(init_css_set.subsys));
5416
5417         WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5418         WARN_ON(register_filesystem(&cgroup_fs_type));
5419         WARN_ON(register_filesystem(&cgroup2_fs_type));
5420         WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5421
5422         return 0;
5423 }
5424
/*
 * cgroup_wq_init - allocate cgroup_destroy_wq and cgroup_pidlist_destroy_wq
 *
 * Registered as a core_initcall.  Both workqueues are created with
 * max_active of 1; an allocation failure is fatal (BUG_ON).  Returns 0.
 */
5425 static int __init cgroup_wq_init(void)
5426 {
5427         /*
5428          * There isn't much point in executing destruction path in
5429          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
5430          * Use 1 for @max_active.
5431          *
5432          * We would prefer to do this in cgroup_init() above, but that
5433          * is called before init_workqueues(): so leave this until after.
5434          */
5435         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5436         BUG_ON(!cgroup_destroy_wq);
5437
5438         /*
5439          * Used to destroy pidlists and separate to serve as flush domain.
5440          * Cap @max_active to 1 too.
5441          */
5442         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
5443                                                     0, 1);
5444         BUG_ON(!cgroup_pidlist_destroy_wq);
5445
5446         return 0;
5447 }
5448 core_initcall(cgroup_wq_init);
5449
5450 /*
5451  * proc_cgroup_show()
5452  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
5453  *  - Used for /proc/<pid>/cgroup.
      *  - Each line has the form "<hierarchy id>:<controllers/name>:<path>".
      *  - Returns 0 on success, -ENOMEM if the path buffer can't be
      *    allocated, -ENAMETOOLONG if cgroup_path() returns NULL.
      *  - @ns and @pid are not used by this function.
5454  */
5455 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5456                      struct pid *pid, struct task_struct *tsk)
5457 {
5458         char *buf, *path;
5459         int retval;
5460         struct cgroup_root *root;
5461
5462         retval = -ENOMEM;
5463         buf = kmalloc(PATH_MAX, GFP_KERNEL);
5464         if (!buf)
5465                 goto out;
5466
             /* both locks stay held for the whole walk over the roots */
5467         mutex_lock(&cgroup_mutex);
5468         spin_lock_bh(&css_set_lock);
5469
5470         for_each_root(root) {
5471                 struct cgroup_subsys *ss;
5472                 struct cgroup *cgrp;
5473                 int ssid, count = 0;
5474
5475                 if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5476                         continue;
5477
5478                 seq_printf(m, "%d:", root->hierarchy_id);
                     /* the controller list is left empty for the default hierarchy */
5479                 if (root != &cgrp_dfl_root)
5480                         for_each_subsys(ss, ssid)
5481                                 if (root->subsys_mask & (1 << ssid))
5482                                         seq_printf(m, "%s%s", count++ ? "," : "",
5483                                                    ss->legacy_name);
5484                 if (strlen(root->name))
5485                         seq_printf(m, "%sname=%s", count ? "," : "",
5486                                    root->name);
5487                 seq_putc(m, ':');
5488
5489                 cgrp = task_cgroup_from_root(tsk, root);
5490
5491                 /*
5492                  * On traditional hierarchies, all zombie tasks show up as
5493                  * belonging to the root cgroup.  On the default hierarchy,
5494                  * while a zombie doesn't show up in "cgroup.procs" and
5495                  * thus can't be migrated, its /proc/PID/cgroup keeps
5496                  * reporting the cgroup it belonged to before exiting.  If
5497                  * the cgroup is removed before the zombie is reaped,
5498                  * " (deleted)" is appended to the cgroup path.
5499                  */
5500                 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5501                         path = cgroup_path(cgrp, buf, PATH_MAX);
5502                         if (!path) {
5503                                 retval = -ENAMETOOLONG;
5504                                 goto out_unlock;
5505                         }
5506                 } else {
5507                         path = "/";
5508                 }
5509
5510                 seq_puts(m, path);
5511
5512                 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5513                         seq_puts(m, " (deleted)\n");
5514                 else
5515                         seq_putc(m, '\n');
5516         }
5517
5518         retval = 0;
             /* unlock in reverse order of acquisition, then free the buffer */
5519 out_unlock:
5520         spin_unlock_bh(&css_set_lock);
5521         mutex_unlock(&cgroup_mutex);
5522         kfree(buf);
5523 out:
5524         return retval;
5525 }
5526
5527 /* Display information about each subsystem and each hierarchy: a header line, then one tab-separated line per subsystem.  Always returns 0; @v is unused. */
5528 static int proc_cgroupstats_show(struct seq_file *m, void *v)
5529 {
5530         struct cgroup_subsys *ss;
5531         int i;
5532
5533         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5534         /*
5535          * ideally we don't want subsystems moving around while we do this.
5536          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5537          * subsys/hierarchy state.
5538          */
5539         mutex_lock(&cgroup_mutex);
5540
             /* columns follow the header: legacy name, hierarchy id, nr_cgrps of the root, enabled */
5541         for_each_subsys(ss, i)
5542                 seq_printf(m, "%s\t%d\t%d\t%d\n",
5543                            ss->legacy_name, ss->root->hierarchy_id,
5544                            atomic_read(&ss->root->nr_cgrps),
5545                            cgroup_ssid_enabled(i));
5546
5547         mutex_unlock(&cgroup_mutex);
5548         return 0;
5549 }
5550
/* ->open for proc_cgroupstats_operations: single_open() with proc_cgroupstats_show() and no private data; @inode is unused */
5551 static int cgroupstats_open(struct inode *inode, struct file *file)
5552 {
5553         return single_open(file, proc_cgroupstats_show, NULL);
5554 }
5555
/* file operations for the "cgroups" proc entry created in cgroup_init(); seq_file based, paired with single_release() */
5556 static const struct file_operations proc_cgroupstats_operations = {
5557         .open = cgroupstats_open,
5558         .read = seq_read,
5559         .llseek = seq_lseek,
5560         .release = single_release,
5561 };
5562
5563 /**
5564  * cgroup_fork - initialize cgroup related fields during copy_process()
5565  * @child: pointer to task_struct whose cgroup fields are initialized here
5566  *
5567  * A task is associated with the init_css_set until cgroup_post_fork()
5568  * attaches it to the parent's css_set.  Empty cg_list indicates that
5569  * @child isn't holding reference to its css_set.
5570  */
5571 void cgroup_fork(struct task_struct *child)
5572 {
             /* start out on init_css_set with an empty (unlinked) cg_list */
5573         RCU_INIT_POINTER(child->cgroups, &init_css_set);
5574         INIT_LIST_HEAD(&child->cg_list);
5575 }
5576
5577 /**
5578  * cgroup_can_fork - called on a new task before the process is exposed
5579  * @child: the task in question.
5580  *
5581  * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5582  * returns an error, the fork aborts with that error code. This allows for
5583  * a cgroup subsystem to conditionally allow or deny new forks.
      *
      * Returns 0 if every callback succeeded, otherwise the first non-zero
      * value returned by a can_fork() callback.
5584  */
5585 int cgroup_can_fork(struct task_struct *child)
5586 {
5587         struct cgroup_subsys *ss;
5588         int i, j, ret;
5589
             /* only subsystems flagged in have_canfork_callback are visited */
5590         do_each_subsys_mask(ss, i, have_canfork_callback) {
5591                 ret = ss->can_fork(child);
5592                 if (ret)
5593                         goto out_revert;
5594         } while_each_subsys_mask();
5595
5596         return 0;
5597
             /*
              * Undo: @i is the id of the subsystem that refused.  Call
              * ->cancel_fork() for every subsystem with a lower id that
              * provides one; the failing subsystem itself is not cancelled.
              */
5598 out_revert:
5599         for_each_subsys(ss, j) {
5600                 if (j >= i)
5601                         break;
5602                 if (ss->cancel_fork)
5603                         ss->cancel_fork(child);
5604         }
5605
5606         return ret;
5607 }
5608
5609 /**
5610  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5611  * @child: the task in question
5612  *
5613  * This calls the cancel_fork() callbacks if a fork failed *after*
5614  * cgroup_can_fork() succeeded.
5615  */
5616 void cgroup_cancel_fork(struct task_struct *child)
5617 {
5618         struct cgroup_subsys *ss;
5619         int i;
5620
             /* walks every subsystem and tests ->cancel_fork directly, no callback mask is used */
5621         for_each_subsys(ss, i)
5622                 if (ss->cancel_fork)
5623                         ss->cancel_fork(child);
5624 }
5625
5626 /**
5627  * cgroup_post_fork - called on a new task after adding it to the task list
5628  * @child: the task in question
5629  *
5630  * Adds the task to the list running through its css_set if necessary and
5631  * call the subsystem fork() callbacks.  Has to be after the task is
5632  * visible on the task list in case we race with the first call to
5633  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5634  * list.
5635  */
5636 void cgroup_post_fork(struct task_struct *child)
5637 {
5638         struct cgroup_subsys *ss;
5639         int i;
5640
5641         /*
5642          * This may race against cgroup_enable_task_cg_lists().  As that
5643          * function sets use_task_css_set_links before grabbing
5644          * tasklist_lock and we just went through tasklist_lock to add
5645          * @child, it's guaranteed that either we see the set
5646          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5647          * @child during its iteration.
5648          *
5649          * If we won the race, @child is associated with %current's
5650          * css_set.  Grabbing css_set_lock guarantees both that the
5651          * association is stable, and, on completion of the parent's
5652          * migration, @child is visible in the source of migration or
5653          * already in the destination cgroup.  This guarantee is necessary
5654          * when implementing operations which need to migrate all tasks of
5655          * a cgroup to another.
5656          *
5657          * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5658          * will remain in init_css_set.  This is safe because all tasks are
5659          * in the init_css_set before cg_links is enabled and there's no
5660          * operation which transfers all tasks out of init_css_set.
5661          */
5662         if (use_task_css_set_links) {
5663                 struct css_set *cset;
5664
5665                 spin_lock_bh(&css_set_lock);
5666                 cset = task_css_set(current);
                     /*
                      * cg_list is still empty only if @child hasn't been
                      * linked yet; take a css_set ref for it before linking.
                      */
5667                 if (list_empty(&child->cg_list)) {
5668                         get_css_set(cset);
5669                         css_set_move_task(child, NULL, cset, false);
5670                 }
5671                 spin_unlock_bh(&css_set_lock);
5672         }
5673
5674         /*
5675          * Call ss->fork().  This must happen after @child is linked on
5676          * css_set; otherwise, @child might change state between ->fork()
5677          * and addition to css_set.
5678          */
5679         do_each_subsys_mask(ss, i, have_fork_callback) {
5680                 ss->fork(child);
5681         } while_each_subsys_mask();
5682 }
5683
5684 /**
5685  * cgroup_exit - detach cgroup from exiting task
5686  * @tsk: pointer to task_struct of exiting process
5687  *
5688  * Description: Detach cgroup from @tsk and release it.
5689  *
5690  * Note that cgroups marked notify_on_release force every task in
5691  * them to take the global cgroup_mutex mutex when exiting.
5692  * This could impact scaling on very large systems.  Be reluctant to
5693  * use notify_on_release cgroups where very high task exit scaling
5694  * is required on large systems.
5695  *
5696  * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
5697  * call cgroup_exit() while the task is still competent to handle
5698  * notify_on_release(), then leave the task attached to the root cgroup in
5699  * each hierarchy for the remainder of its exit.  No need to bother with
5700  * init_css_set refcnting.  init_css_set never goes away and we can't race
5701  * with migration path - PF_EXITING is visible to migration path.
      *
      * NOTE(review): the code below neither takes cgroup_mutex nor repoints
      * @tsk at init_css_set; it only unlinks @tsk from its css_set's task
      * list (or takes a css_set ref), and cgroup_free() later puts the
      * css_set.  The two paragraphs above look stale - confirm.
5702  */
5703 void cgroup_exit(struct task_struct *tsk)
5704 {
5705         struct cgroup_subsys *ss;
5706         struct css_set *cset;
5707         int i;
5708
5709         /*
5710          * Unlink @tsk from its css_set.  As migration path can't race
5711          * with us, we can check css_set and cg_list without synchronization.
5712          */
5713         cset = task_css_set(tsk);
5714
5715         if (!list_empty(&tsk->cg_list)) {
5716                 spin_lock_bh(&css_set_lock);
5717                 css_set_move_task(tsk, cset, NULL, false);
5718                 spin_unlock_bh(&css_set_lock);
5719         } else {
                     /* never linked via cg_list: take a ref so the later put in cgroup_free() balances */
5720                 get_css_set(cset);
5721         }
5722
5723         /* see cgroup_post_fork() for details */
5724         do_each_subsys_mask(ss, i, have_exit_callback) {
5725                 ss->exit(tsk);
5726         } while_each_subsys_mask();
5727 }
5728
5729 void cgroup_free(struct task_struct *task)
5730 {
5731         struct css_set *cset = task_css_set(task);
5732         struct cgroup_subsys *ss;
5733         int ssid;
5734
5735         do_each_subsys_mask(ss, ssid, have_free_callback) {
5736                 ss->free(task);
5737         } while_each_subsys_mask();
5738
5739         put_css_set(cset);
5740 }
5741
5742 static void check_for_release(struct cgroup *cgrp)
5743 {
5744         if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
5745             !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5746                 schedule_work(&cgrp->release_agent_work);
5747 }
5748
5749 /*
5750  * Notify userspace when a cgroup is released, by running the
5751  * configured release agent with the name of the cgroup (path
5752  * relative to the root of cgroup file system) as the argument.
5753  *
5754  * Most likely, this user command will try to rmdir this cgroup.
5755  *
5756  * This races with the possibility that some other task will be
5757  * attached to this cgroup before it is removed, or that some other
5758  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5759  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5760  * unused, and this cgroup will be reprieved from its death sentence,
5761  * to continue to serve a useful existence.  Next time it's released,
5762  * we will get notified again, if it still has 'notify_on_release' set.
5763  *
5764  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5765  * means only wait until the task is successfully execve()'d.  The
5766  * separate release agent task is forked by call_usermodehelper(),
5767  * then control in this thread returns here, without waiting for the
5768  * release agent task.  We don't bother to wait because the caller of
5769  * this routine has no use for the exit status of the release agent
5770  * task, so no sense holding our caller up for that.
5771  */
5772 static void cgroup_release_agent(struct work_struct *work)
5773 {
5774         struct cgroup *cgrp =
5775                 container_of(work, struct cgroup, release_agent_work);
5776         char *pathbuf = NULL, *agentbuf = NULL, *path;
5777         char *argv[3], *envp[3];
5778
5779         mutex_lock(&cgroup_mutex);
5780
5781         pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5782         agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5783         if (!pathbuf || !agentbuf)
5784                 goto out;
5785
5786         path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5787         if (!path)
5788                 goto out;
5789
5790         argv[0] = agentbuf;
5791         argv[1] = path;
5792         argv[2] = NULL;
5793
5794         /* minimal command environment */
5795         envp[0] = "HOME=/";
5796         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5797         envp[2] = NULL;
5798
5799         mutex_unlock(&cgroup_mutex);
5800         call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5801         goto out_free;
5802 out:
5803         mutex_unlock(&cgroup_mutex);
5804 out_free:
5805         kfree(agentbuf);
5806         kfree(pathbuf);
5807 }
5808
5809 static int __init cgroup_disable(char *str)
5810 {
5811         struct cgroup_subsys *ss;
5812         char *token;
5813         int i;
5814
5815         while ((token = strsep(&str, ",")) != NULL) {
5816                 if (!*token)
5817                         continue;
5818
5819                 for_each_subsys(ss, i) {
5820                         if (strcmp(token, ss->name) &&
5821                             strcmp(token, ss->legacy_name))
5822                                 continue;
5823                         cgroup_disable_mask |= 1 << i;
5824                 }
5825         }
5826         return 1;
5827 }
5828 __setup("cgroup_disable=", cgroup_disable);
5829
5830 static int __init cgroup_no_v1(char *str)
5831 {
5832         struct cgroup_subsys *ss;
5833         char *token;
5834         int i;
5835
5836         while ((token = strsep(&str, ",")) != NULL) {
5837                 if (!*token)
5838                         continue;
5839
5840                 if (!strcmp(token, "all")) {
5841                         cgroup_no_v1_mask = U16_MAX;
5842                         break;
5843                 }
5844
5845                 for_each_subsys(ss, i) {
5846                         if (strcmp(token, ss->name) &&
5847                             strcmp(token, ss->legacy_name))
5848                                 continue;
5849
5850                         cgroup_no_v1_mask |= 1 << i;
5851                 }
5852         }
5853         return 1;
5854 }
5855 __setup("cgroup_no_v1=", cgroup_no_v1);
5856
5857 /**
5858  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5859  * @dentry: directory dentry of interest
5860  * @ss: subsystem of interest
5861  *
5862  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5863  * to get the corresponding css and return it.  If such css doesn't exist
5864  * or can't be pinned, an ERR_PTR value is returned.
5865  */
5866 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5867                                                        struct cgroup_subsys *ss)
5868 {
5869         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5870         struct file_system_type *s_type = dentry->d_sb->s_type;
5871         struct cgroup_subsys_state *css = NULL;
5872         struct cgroup *cgrp;
5873
5874         /* is @dentry a cgroup dir? */
5875         if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
5876             !kn || kernfs_type(kn) != KERNFS_DIR)
5877                 return ERR_PTR(-EBADF);
5878
5879         rcu_read_lock();
5880
5881         /*
5882          * This path doesn't originate from kernfs and @kn could already
5883          * have been or be removed at any point.  @kn->priv is RCU
5884          * protected for this access.  See css_release_work_fn() for details.
5885          */
5886         cgrp = rcu_dereference(kn->priv);
5887         if (cgrp)
5888                 css = cgroup_css(cgrp, ss);
5889
5890         if (!css || !css_tryget_online(css))
5891                 css = ERR_PTR(-ENOENT);
5892
5893         rcu_read_unlock();
5894         return css;
5895 }
5896
5897 /**
5898  * css_from_id - lookup css by id
5899  * @id: the cgroup id
5900  * @ss: cgroup subsys to be looked into
5901  *
5902  * Returns the css if there's valid one with @id, otherwise returns NULL.
5903  * Should be called under rcu_read_lock().
5904  */
5905 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5906 {
5907         WARN_ON_ONCE(!rcu_read_lock_held());
5908         return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
5909 }
5910
5911 /**
5912  * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
5913  * @path: path on the default hierarchy
5914  *
5915  * Find the cgroup at @path on the default hierarchy, increment its
5916  * reference count and return it.  Returns pointer to the found cgroup on
5917  * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
5918  * if @path points to a non-directory.
5919  */
5920 struct cgroup *cgroup_get_from_path(const char *path)
5921 {
5922         struct kernfs_node *kn;
5923         struct cgroup *cgrp;
5924
5925         mutex_lock(&cgroup_mutex);
5926
5927         kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
5928         if (kn) {
5929                 if (kernfs_type(kn) == KERNFS_DIR) {
5930                         cgrp = kn->priv;
5931                         cgroup_get(cgrp);
5932                 } else {
5933                         cgrp = ERR_PTR(-ENOTDIR);
5934                 }
5935                 kernfs_put(kn);
5936         } else {
5937                 cgrp = ERR_PTR(-ENOENT);
5938         }
5939
5940         mutex_unlock(&cgroup_mutex);
5941         return cgrp;
5942 }
5943 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
5944
5945 /*
5946  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
5947  * definition in cgroup-defs.h.
5948  */
5949 #ifdef CONFIG_SOCK_CGROUP_DATA
5950
5951 #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
5952
5953 DEFINE_SPINLOCK(cgroup_sk_update_lock);
5954 static bool cgroup_sk_alloc_disabled __read_mostly;
5955
5956 void cgroup_sk_alloc_disable(void)
5957 {
5958         if (cgroup_sk_alloc_disabled)
5959                 return;
5960         pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
5961         cgroup_sk_alloc_disabled = true;
5962 }
5963
5964 #else
5965
5966 #define cgroup_sk_alloc_disabled        false
5967
5968 #endif
5969
5970 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
5971 {
5972         if (cgroup_sk_alloc_disabled)
5973                 return;
5974
5975         rcu_read_lock();
5976
5977         while (true) {
5978                 struct css_set *cset;
5979
5980                 cset = task_css_set(current);
5981                 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
5982                         skcd->val = (unsigned long)cset->dfl_cgrp;
5983                         break;
5984                 }
5985                 cpu_relax();
5986         }
5987
5988         rcu_read_unlock();
5989 }
5990
/* Put the cgroup that @skcd refers to. */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *cgrp = sock_cgroup_ptr(skcd);

	cgroup_put(cgrp);
}
5995
5996 #endif  /* CONFIG_SOCK_CGROUP_DATA */
5997
5998 #ifdef CONFIG_CGROUP_DEBUG
5999 static struct cgroup_subsys_state *
6000 debug_css_alloc(struct cgroup_subsys_state *parent_css)
6001 {
6002         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
6003
6004         if (!css)
6005                 return ERR_PTR(-ENOMEM);
6006
6007         return css;
6008 }
6009
/* Free a css of the debug subsystem; it is a plain kmalloc-family object. */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
6014
6015 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
6016                                 struct cftype *cft)
6017 {
6018         return cgroup_task_count(css->cgroup);
6019 }
6020
6021 static u64 current_css_set_read(struct cgroup_subsys_state *css,
6022                                 struct cftype *cft)
6023 {
6024         return (u64)(unsigned long)current->cgroups;
6025 }
6026
6027 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
6028                                          struct cftype *cft)
6029 {
6030         u64 count;
6031
6032         rcu_read_lock();
6033         count = atomic_read(&task_css_set(current)->refcount);
6034         rcu_read_unlock();
6035         return count;
6036 }
6037
6038 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6039 {
6040         struct cgrp_cset_link *link;
6041         struct css_set *cset;
6042         char *name_buf;
6043
6044         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
6045         if (!name_buf)
6046                 return -ENOMEM;
6047
6048         spin_lock_bh(&css_set_lock);
6049         rcu_read_lock();
6050         cset = rcu_dereference(current->cgroups);
6051         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
6052                 struct cgroup *c = link->cgrp;
6053
6054                 cgroup_name(c, name_buf, NAME_MAX + 1);
6055                 seq_printf(seq, "Root %d group %s\n",
6056                            c->root->hierarchy_id, name_buf);
6057         }
6058         rcu_read_unlock();
6059         spin_unlock_bh(&css_set_lock);
6060         kfree(name_buf);
6061         return 0;
6062 }
6063
6064 #define MAX_TASKS_SHOWN_PER_CSS 25
6065 static int cgroup_css_links_read(struct seq_file *seq, void *v)
6066 {
6067         struct cgroup_subsys_state *css = seq_css(seq);
6068         struct cgrp_cset_link *link;
6069
6070         spin_lock_bh(&css_set_lock);
6071         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6072                 struct css_set *cset = link->cset;
6073                 struct task_struct *task;
6074                 int count = 0;
6075
6076                 seq_printf(seq, "css_set %p\n", cset);
6077
6078                 list_for_each_entry(task, &cset->tasks, cg_list) {
6079                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6080                                 goto overflow;
6081                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
6082                 }
6083
6084                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
6085                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6086                                 goto overflow;
6087                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
6088                 }
6089                 continue;
6090         overflow:
6091                 seq_puts(seq, "  ...\n");
6092         }
6093         spin_unlock_bh(&css_set_lock);
6094         return 0;
6095 }
6096
6097 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
6098 {
6099         return (!cgroup_is_populated(css->cgroup) &&
6100                 !css_has_online_children(&css->cgroup->self));
6101 }
6102
6103 static struct cftype debug_files[] =  {
6104         {
6105                 .name = "taskcount",
6106                 .read_u64 = debug_taskcount_read,
6107         },
6108
6109         {
6110                 .name = "current_css_set",
6111                 .read_u64 = current_css_set_read,
6112         },
6113
6114         {
6115                 .name = "current_css_set_refcount",
6116                 .read_u64 = current_css_set_refcount_read,
6117         },
6118
6119         {
6120                 .name = "current_css_set_cg_links",
6121                 .seq_show = current_css_set_cg_links_read,
6122         },
6123
6124         {
6125                 .name = "cgroup_css_links",
6126                 .seq_show = cgroup_css_links_read,
6127         },
6128
6129         {
6130                 .name = "releasable",
6131                 .read_u64 = releasable_read,
6132         },
6133
6134         { }     /* terminate */
6135 };
6136
6137 struct cgroup_subsys debug_cgrp_subsys = {
6138         .css_alloc = debug_css_alloc,
6139         .css_free = debug_css_free,
6140         .legacy_cftypes = debug_files,
6141 };
6142 #endif /* CONFIG_CGROUP_DEBUG */