From: Linus Torvalds
Date: Fri, 14 Oct 2016 19:18:50 +0000 (-0700)
Subject: Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
X-Git-Tag: v4.9-rc1~20
X-Git-Url: http://git.cascardo.eti.br/?a=commitdiff_plain;h=f34d3606f76a8121b9d4940d2dd436bebeb2f9d7;hp=-c;p=cascardo%2Flinux.git

Merge branch 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - tracepoints for basic cgroup management operations added

 - kernfs and cgroup path formatting functions updated to behave in the
   style of strlcpy()

 - non-critical bug fixes

* 'for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  blkcg: Unlock blkcg_pol_mutex only once when cpd == NULL
  cgroup: fix error handling regressions in proc_cgroup_show() and cgroup_release_agent()
  cpuset: fix error handling regression in proc_cpuset_show()
  cgroup: add tracepoints for basic operations
  cgroup: make cgroup_path() and friends behave in the style of strlcpy()
  kernfs: remove kernfs_path_len()
  kernfs: make kernfs_path*() behave in the style of strlcpy()
  kernfs: add dummy implementation of kernfs_path_from_node()
---

f34d3606f76a8121b9d4940d2dd436bebeb2f9d7
diff --combined fs/kernfs/dir.c
index dcd96aac02f5,6e7fd37615f8..cf4c636ff4da
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@@ -110,8 -110,9 +110,9 @@@ static struct kernfs_node *kernfs_commo
  * kn_to:   /n1/n2/n3  [depth=3]
  * result:  /../..
  *
- * return value: length of the string.  If greater than buflen,
- * then contents of buf are undefined.  On error, -1 is returned.
+ * Returns the length of the full path.  If the full length is equal to or
+ * greater than @buflen, @buf contains the truncated path with the trailing
+ * '\0'.  On error, -errno is returned.
  */
 static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
					 struct kernfs_node *kn_from,
@@@ -119,9 -120,8 +120,8 @@@
 {
	struct kernfs_node *kn, *common;
	const char parent_str[] = "/..";
-	size_t depth_from, depth_to, len = 0, nlen = 0;
-	char *p;
-	int i;
+	size_t depth_from, depth_to, len = 0;
+	int i, j;
 
	if (!kn_from)
		kn_from = kernfs_root(kn_to)->kn;
@@@ -131,7 -131,7 +131,7 @@@
 
	common = kernfs_common_ancestor(kn_from, kn_to);
	if (WARN_ON(!common))
-		return -1;
+		return -EINVAL;
 
	depth_to = kernfs_depth(common, kn_to);
	depth_from = kernfs_depth(common, kn_from);
@@@ -144,22 -144,16 +144,16 @@@
			       len < buflen ? buflen - len : 0);
 
	/* Calculate how many bytes we need for the rest */
-	for (kn = kn_to; kn != common; kn = kn->parent)
-		nlen += strlen(kn->name) + 1;
-
-	if (len + nlen >= buflen)
-		return len + nlen;
-
-	p = buf + len + nlen;
-	*p = '\0';
-	for (kn = kn_to; kn != common; kn = kn->parent) {
-		size_t tmp = strlen(kn->name);
-		p -= tmp;
-		memcpy(p, kn->name, tmp);
-		*(--p) = '/';
+	for (i = depth_to - 1; i >= 0; i--) {
+		for (kn = kn_to, j = 0; j < i; j++)
+			kn = kn->parent;
+		len += strlcpy(buf + len, "/",
+			       len < buflen ? buflen - len : 0);
+		len += strlcpy(buf + len, kn->name,
+			       len < buflen ? buflen - len : 0);
	}
 
-	return len + nlen;
+	return len;
 }
 
 /**
@@@ -185,29 -179,6 +179,6 @@@ int kernfs_name(struct kernfs_node *kn
	return ret;
 }
 
- /**
-  * kernfs_path_len - determine the length of the full path of a given node
-  * @kn: kernfs_node of interest
-  *
-  * The returned length doesn't include the space for the terminating '\0'.
-  */
- size_t kernfs_path_len(struct kernfs_node *kn)
- {
-	size_t len = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
-
-	do {
-		len += strlen(kn->name) + 1;
-		kn = kn->parent;
-	} while (kn && kn->parent);
-
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
-
-	return len;
- }
-
 /**
  * kernfs_path_from_node - build path of node @to relative to @from.
  * @from: parent kernfs_node relative to which we need to build the path
@@@ -220,8 -191,9 +191,9 @@@
  * path (which includes '..'s) as needed to reach from @from to @to is
  * returned.
  *
- * If @buf isn't long enough, the return value will be greater than @buflen
- * and @buf contents are undefined.
+ * Returns the length of the full path.  If the full length is equal to or
+ * greater than @buflen, @buf contains the truncated path with the trailing
+ * '\0'.  On error, -errno is returned.
  */
 int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
			   char *buf, size_t buflen)
@@@ -236,28 -208,6 +208,6 @@@
 }
 EXPORT_SYMBOL_GPL(kernfs_path_from_node);
 
- /**
-  * kernfs_path - build full path of a given node
-  * @kn: kernfs_node of interest
-  * @buf: buffer to copy @kn's name into
-  * @buflen: size of @buf
-  *
-  * Builds and returns the full path of @kn in @buf of @buflen bytes.  The
-  * path is built from the end of @buf so the returned pointer usually
-  * doesn't match @buf.  If @buf isn't long enough, @buf is nul terminated
-  * and %NULL is returned.
-  */
- char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
- {
-	int ret;
-
-	ret = kernfs_path_from_node(kn, NULL, buf, buflen);
-	if (ret < 0 || ret >= buflen)
-		return NULL;
-	return buf;
- }
- EXPORT_SYMBOL_GPL(kernfs_path);
-
 /**
  * pr_cont_kernfs_name - pr_cont name of a kernfs_node
  * @kn: kernfs_node of interest
@@@ -1096,17 -1046,13 +1046,17 @@@ static int kernfs_iop_rmdir(struct inod
 }
 
 static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
-			     struct inode *new_dir, struct dentry *new_dentry)
+			     struct inode *new_dir, struct dentry *new_dentry,
+			     unsigned int flags)
 {
	struct kernfs_node *kn = old_dentry->d_fsdata;
	struct kernfs_node *new_parent = new_dir->i_private;
	struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
	int ret;
 
+	if (flags)
+		return -EINVAL;
+
	if (!scops || !scops->rename)
		return -EPERM;
 
@@@ -1130,6 -1076,9 +1080,6 @@@ const struct inode_operations kernfs_di
	.permission = kernfs_iop_permission,
	.setattr = kernfs_iop_setattr,
	.getattr = kernfs_iop_getattr,
-	.setxattr = kernfs_iop_setxattr,
-	.removexattr = kernfs_iop_removexattr,
-	.getxattr = kernfs_iop_getxattr,
	.listxattr = kernfs_iop_listxattr,
 
	.mkdir = kernfs_iop_mkdir,
diff --combined include/linux/blk-cgroup.h
index cbdbf34de5b6,4e8c215e185c..3bf5d33800ab
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@@ -45,7 -45,7 +45,7 @@@ struct blkcg
	spinlock_t lock;
 
	struct radix_tree_root blkg_tree;
-	struct blkcg_gq *blkg_hint;
+	struct blkcg_gq __rcu *blkg_hint;
	struct hlist_head blkg_list;
 
	struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
@@@ -343,16 -343,7 +343,7 @@@ static inline struct blkcg *cpd_to_blkc
  */
 static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
-	char *p;
-
-	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	if (!p) {
-		strncpy(buf, "", buflen);
-		return -ENAMETOOLONG;
-	}
-
-	memmove(buf, p, buf + buflen - p);
-	return 0;
+	return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
 }
 
 /**
diff --combined include/linux/cgroup.h
index 440a72164a11,6df36361a492..c83c23f0577b
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -97,7 -97,7 +97,7 @@@ int cgroup_add_legacy_cftypes(struct cg
 int cgroup_rm_cftypes(struct cftype *cfts);
 void cgroup_file_notify(struct cgroup_file *cfile);
 
- char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		      struct pid *pid, struct task_struct *tsk);
@@@ -497,23 -497,6 +497,23 @@@ static inline bool cgroup_is_descendant
	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+					       struct cgroup *ancestor)
+{
+	struct css_set *cset = task_css_set(task);
+
+	return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
+}
+
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
@@@ -555,8 -538,7 +555,7 @@@ static inline int cgroup_name(struct cg
	return kernfs_name(cgrp->kn, buf, buflen);
 }
 
- static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
-					       size_t buflen)
+ static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
 {
	return kernfs_path(cgrp->kn, buf, buflen);
 }
@@@ -574,7 -556,6 +573,7 @@@ static inline void pr_cont_cgroup_path(
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
+struct cgroup;
 
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
@@@ -592,11 -573,6 +591,11 @@@ static inline void cgroup_free(struct t
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+					       struct cgroup *ancestor)
+{
+	return true;
+}
 #endif /* !CONFIG_CGROUPS */
 
 /*
@@@ -644,7 -620,6 +643,7 @@@ struct cgroup_namespace
	atomic_t count;
	struct ns_common ns;
	struct user_namespace *user_ns;
+	struct ucounts *ucounts;
	struct css_set *root_cset;
 };
 
@@@ -658,8 -633,8 +657,8 @@@ struct cgroup_namespace *copy_cgroup_ns
					struct user_namespace *user_ns,
					struct cgroup_namespace *old_ns);
 
- char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
-		      struct cgroup_namespace *ns);
+ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+		    struct cgroup_namespace *ns);
 
 #else /* !CONFIG_CGROUPS */
 
diff --combined kernel/cgroup.c
index 44066158f0d1,a7f9fb4e1fc7..85bc9beb046d
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -64,6 -64,9 +64,9 @@@
 #include
 #include
 
+ #define CREATE_TRACE_POINTS
+ #include
+
 /*
  * pidlists linger the following amount before being destroyed.  The goal
  * is avoiding frequent destruction in the middle of consecutive read calls
@@@ -1176,6 -1179,8 +1179,8 @@@ static void cgroup_destroy_root(struct
	struct cgroup *cgrp = &root->cgrp;
	struct cgrp_cset_link *link, *tmp_link;
 
+	trace_cgroup_destroy_root(root);
+
	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
	BUG_ON(atomic_read(&root->nr_cgrps));
@@@ -1874,6 -1879,9 +1879,9 @@@ static int cgroup_remount(struct kernfs
		strcpy(root->release_agent_path, opts.release_agent);
		spin_unlock(&release_agent_path_lock);
	}
+
+	trace_cgroup_remount(root);
+
  out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
@@@ -2031,6 -2039,8 +2039,8 @@@ static int cgroup_setup_root(struct cgr
	if (ret)
		goto destroy_root;
 
+	trace_cgroup_setup_root(root);
+
	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
@@@ -2315,22 -2325,18 +2325,18 @@@ static struct file_system_type cgroup2_
	.fs_flags = FS_USERNS_MOUNT,
 };
 
- static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-				    struct cgroup_namespace *ns)
+ static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+				  struct cgroup_namespace *ns)
 {
	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
-	int ret;
 
-	ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
-	if (ret < 0 || ret >= buflen)
-		return NULL;
-	return buf;
+	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
 }
 
- char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
-		      struct cgroup_namespace *ns)
+ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+		    struct cgroup_namespace *ns)
 {
-	char *ret;
+	int ret;
 
	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);
@@@ -2357,12 -2363,12 +2363,12 @@@ EXPORT_SYMBOL_GPL(cgroup_path_ns)
  *
  * Return value is the same as kernfs_path().
  */
- char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 {
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
-	char *path = NULL;
+	int ret;
 
	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);
@@@ -2371,16 -2377,15 +2377,15 @@@
 
	if (root) {
		cgrp = task_cgroup_from_root(task, root);
-		path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
+		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
-		if (strlcpy(buf, "/", buflen) < buflen)
-			path = buf;
+		ret = strlcpy(buf, "/", buflen);
	}
 
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
-	return path;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
@@@ -2830,6 -2835,10 +2835,10 @@@ static int cgroup_attach_task(struct cg
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
 
	cgroup_migrate_finish(&preloaded_csets);
+
+	if (!ret)
+		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
+
	return ret;
 }
 
@@@ -3446,28 -3455,9 +3455,28 @@@ static ssize_t cgroup_subtree_control_w
	 * Except for the root, subtree_control must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
-	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
-		ret = -EBUSY;
-		goto out_unlock;
+	if (enable && cgroup_parent(cgrp)) {
+		struct cgrp_cset_link *link;
+
+		/*
+		 * Because namespaces pin csets too, @cgrp->cset_links
+		 * might not be empty even when @cgrp is empty.  Walk and
+		 * verify each cset.
+		 */
+		spin_lock_irq(&css_set_lock);
+
+		ret = 0;
+		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+			if (css_set_populated(link->cset)) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+
+		spin_unlock_irq(&css_set_lock);
+
+		if (ret)
+			goto out_unlock;
	}
 
	/* save and update control masks and prepare csses */
@@@ -3611,6 -3601,8 +3620,8 @@@ static int cgroup_rename(struct kernfs_
	mutex_lock(&cgroup_mutex);
 
	ret = kernfs_rename(kn, new_parent, new_name_str);
+	if (!ret)
+		trace_cgroup_rename(cgrp);
 
	mutex_unlock(&cgroup_mutex);
 
@@@ -3918,9 -3910,7 +3929,9 @@@ void cgroup_file_notify(struct cgroup_f
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  *
- * Return the number of tasks in the cgroup.
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
@@@ -4381,6 -4371,8 +4392,8 @@@ int cgroup_transfer_tasks(struct cgrou
 
		if (task) {
			ret = cgroup_migrate(task, false, to->root);
+			if (!ret)
+				trace_cgroup_transfer_tasks(to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
@@@ -5046,6 -5038,8 +5059,8 @@@ static void css_release_work_fn(struct
			ss->css_released(css);
	} else {
		/* cgroup release path */
+		trace_cgroup_release(cgrp);
+
		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
		cgrp->id = -1;
 
@@@ -5332,6 -5326,8 +5347,8 @@@ static int cgroup_mkdir(struct kernfs_n
	if (ret)
		goto out_destroy;
 
+	trace_cgroup_mkdir(cgrp);
+
	/* let's create and online css's */
	kernfs_activate(kn);
 
@@@ -5507,6 -5503,9 +5524,9 @@@ static int cgroup_rmdir(struct kernfs_n
 
	ret = cgroup_destroy_locked(cgrp);
 
+	if (!ret)
+		trace_cgroup_rmdir(cgrp);
+
	cgroup_kn_unlock(kn);
	return ret;
 }
@@@ -5627,12 -5626,6 +5647,12 @@@ int __init cgroup_init(void
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	/*
+	 * The latency of the synchronize_sched() is too high for cgroups,
+	 * avoid it at the cost of forcing all readers into the slow path.
+	 */
+	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
	get_user_ns(init_cgroup_ns.user_ns);
 
	mutex_lock(&cgroup_mutex);
@@@ -5743,7 -5736,7 +5763,7 @@@ core_initcall(cgroup_wq_init)
 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
 {
-	char *buf, *path;
+	char *buf;
	int retval;
	struct cgroup_root *root;
 
@@@ -5786,18 -5779,18 +5806,18 @@@
		 * " (deleted)" is appended to the cgroup path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-			path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
-			if (!path) {
+			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
+			if (retval < 0)
				goto out_unlock;
-			}
+
+			seq_puts(m, buf);
		} else {
-			path = "/";
+			seq_puts(m, "/");
		}
 
-		seq_puts(m, path);
-
		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
@@@ -6062,8 -6055,9 +6082,9 @@@ static void cgroup_release_agent(struc
 {
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL, *path;
+	char *pathbuf = NULL, *agentbuf = NULL;
	char *argv[3], *envp[3];
+	int ret;
 
	mutex_lock(&cgroup_mutex);
 
@@@ -6073,13 -6067,13 +6094,13 @@@
		goto out;
 
	spin_lock_irq(&css_set_lock);
-	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
-	if (!path)
+	if (ret < 0 || ret >= PATH_MAX)
		goto out;
 
	argv[0] = agentbuf;
-	argv[1] = path;
+	argv[1] = pathbuf;
	argv[2] = NULL;
 
	/* minimal command environment */
@@@ -6297,12 -6291,6 +6318,12 @@@ void cgroup_sk_alloc(struct sock_cgroup
	if (cgroup_sk_alloc_disabled)
		return;
 
+	/* Socket clone path */
+	if (skcd->val) {
+		cgroup_get(sock_cgroup_ptr(skcd));
+		return;
+	}
+
	rcu_read_lock();
 
	while (true) {
@@@ -6328,16 -6316,6 +6349,16 @@@ void cgroup_sk_free(struct sock_cgroup_
 
 /* cgroup namespaces */
 
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
 static struct cgroup_namespace *alloc_cgroup_ns(void)
 {
	struct cgroup_namespace *new_ns;
@@@ -6359,7 -6337,6 +6380,7 @@@
 void free_cgroup_ns(struct cgroup_namespace *ns)
 {
	put_css_set(ns->root_cset);
+	dec_cgroup_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);
	ns_free_inum(&ns->ns);
	kfree(ns);
@@@ -6371,7 -6348,6 +6392,7 @@@ struct cgroup_namespace *copy_cgroup_ns
					struct cgroup_namespace *old_ns)
 {
	struct cgroup_namespace *new_ns;
+	struct ucounts *ucounts;
	struct css_set *cset;
 
	BUG_ON(!old_ns);
@@@ -6385,10 -6361,6 +6406,10 @@@
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);
 
+	ucounts = inc_cgroup_namespaces(user_ns);
+	if (!ucounts)
+		return ERR_PTR(-ENOSPC);
+
	/* It is not safe to take cgroup_mutex here */
	spin_lock_irq(&css_set_lock);
	cset = task_css_set(current);
@@@ -6398,12 -6370,10 +6419,12 @@@
	new_ns = alloc_cgroup_ns();
	if (IS_ERR(new_ns)) {
		put_css_set(cset);
+		dec_cgroup_namespaces(ucounts);
		return new_ns;
	}
 
	new_ns->user_ns = get_user_ns(user_ns);
+	new_ns->ucounts = ucounts;
	new_ns->root_cset = cset;
 
	return new_ns;
@@@ -6454,18 -6424,12 +6475,18 @@@ static void cgroupns_put(struct ns_comm
	put_cgroup_ns(to_cg_ns(ns));
 }
 
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+	return to_cg_ns(ns)->user_ns;
+}
+
 const struct proc_ns_operations cgroupns_operations = {
	.name = "cgroup",
	.type = CLONE_NEWCGROUP,
	.get = cgroupns_get,
	.put = cgroupns_put,
	.install = cgroupns_install,
+	.owner = cgroupns_owner,
 };
 
 static __init int cgroup_namespaces_init(void)
diff --combined kernel/cpuset.c
index 2b4c20ab5bbe,97dd8e178786..29f815d2ef7e
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@@ -325,7 -325,8 +325,7 @@@ static struct file_system_type cpuset_f
 /*
  * Return in pmask the portion of a cpusets's cpus_allowed that
  * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.  The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
@@@ -334,20 -335,8 +334,20 @@@
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
		cs = parent_cs(cs);
+		if (unlikely(!cs)) {
+			/*
+			 * The top cpuset doesn't have any online cpu as a
+			 * consequence of a race between cpuset_hotplug_work
+			 * and cpu hotplug notifier.  But we know the top
+			 * cpuset's effective_cpus is on its way to to be
+			 * identical to cpu_online_mask.
+			 */
+			cpumask_copy(pmask, cpu_online_mask);
+			return;
+		}
+	}
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
@@@ -2080,20 -2069,6 +2080,20 @@@ static void cpuset_bind(struct cgroup_s
	mutex_unlock(&cpuset_mutex);
 }
 
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+static void cpuset_fork(struct task_struct *task)
+{
+	if (task_css_is_root(task, cpuset_cgrp_id))
+		return;
+
+	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+	task->mems_allowed = current->mems_allowed;
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc = cpuset_css_alloc,
	.css_online = cpuset_css_online,
@@@ -2104,7 -2079,6 +2104,7 @@@
	.attach = cpuset_attach,
	.post_attach = cpuset_post_attach,
	.bind = cpuset_bind,
+	.fork = cpuset_fork,
	.legacy_cftypes = files,
	.early_init = true,
 };
@@@ -2715,7 -2689,7 +2715,7 @@@ void __cpuset_memory_pressure_bump(void
 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
 {
-	char *buf, *p;
+	char *buf;
	struct cgroup_subsys_state *css;
	int retval;
 
@@@ -2724,14 -2698,15 +2724,15 @@@
	if (!buf)
		goto out;
 
-	retval = -ENAMETOOLONG;
	css = task_get_css(tsk, cpuset_cgrp_id);
-	p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-			   current->nsproxy->cgroup_ns);
+	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+				current->nsproxy->cgroup_ns);
	css_put(css);
-	if (!p)
+	if (retval >= PATH_MAX)
+		retval = -ENAMETOOLONG;
+	if (retval < 0)
		goto out_free;
-	seq_puts(m, p);
+	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
 out_free:
diff --combined kernel/sched/debug.c
index 13935886a471,23cb609ba4eb..fa178b62ea79
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@@ -369,12 -369,8 +369,12 @@@ static void print_cfs_group_stats(struc
 #define P(F) \
	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
 #define PN(F) \
	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
 
	if (!se)
		return;
@@@ -382,27 -378,26 +382,27 @@@
 
	PN(se->exec_start);
	PN(se->vruntime);
	PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled()) {
-		PN(se->statistics.wait_start);
-		PN(se->statistics.sleep_start);
-		PN(se->statistics.block_start);
-		PN(se->statistics.sleep_max);
-		PN(se->statistics.block_max);
-		PN(se->statistics.exec_max);
-		PN(se->statistics.slice_max);
-		PN(se->statistics.wait_max);
-		PN(se->statistics.wait_sum);
-		P(se->statistics.wait_count);
+		PN_SCHEDSTAT(se->statistics.wait_start);
+		PN_SCHEDSTAT(se->statistics.sleep_start);
+		PN_SCHEDSTAT(se->statistics.block_start);
+		PN_SCHEDSTAT(se->statistics.sleep_max);
+		PN_SCHEDSTAT(se->statistics.block_max);
+		PN_SCHEDSTAT(se->statistics.exec_max);
+		PN_SCHEDSTAT(se->statistics.slice_max);
+		PN_SCHEDSTAT(se->statistics.wait_max);
+		PN_SCHEDSTAT(se->statistics.wait_sum);
+		P_SCHEDSTAT(se->statistics.wait_count);
	}
-#endif
	P(se->load.weight);
 #ifdef CONFIG_SMP
	P(se->avg.load_avg);
	P(se->avg.util_avg);
 #endif
+
+#undef PN_SCHEDSTAT
 #undef PN
+#undef P_SCHEDSTAT
 #undef P
 }
 #endif
@@@ -415,7 -410,8 +415,8 @@@ static char *task_group_path(struct tas
	if (autogroup_path(tg, group_path, PATH_MAX))
		return group_path;
 
-	return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+	return group_path;
 }
 #endif
 
@@@ -434,9 -430,9 +435,9 @@@ print_task(struct seq_file *m, struct r
		p->prio);
 
	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
		SPLIT_NS(p->se.sum_exec_runtime),
-		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
 
 #ifdef CONFIG_NUMA_BALANCING
	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@@ -631,7 -627,9 +632,7 @@@ do {
 #undef P64
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
	if (schedstat_enabled()) {
		P(yld_count);
		P(sched_count);
@@@ -639,8 -637,9 +640,8 @@@
		P(ttwu_count);
		P(ttwu_local);
	}
-
 #undef P
-#endif
+
	spin_lock_irqsave(&sched_debug_lock, flags);
	print_cfs_stats(m, cpu);
	print_rt_stats(m, cpu);
@@@ -870,14 -869,10 +871,14 @@@ void proc_sched_show_task(struct task_s
	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
 #define P(F) \
	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
 #define __PN(F) \
	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
 
	PN(se.exec_start);
	PN(se.vruntime);
@@@ -887,36 -882,37 +888,36 @@@
 
	P(se.nr_migrations);
 
-#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled()) {
		u64 avg_atom, avg_per_cpu;
 
-		PN(se.statistics.sum_sleep_runtime);
-		PN(se.statistics.wait_start);
-		PN(se.statistics.sleep_start);
-		PN(se.statistics.block_start);
-		PN(se.statistics.sleep_max);
-		PN(se.statistics.block_max);
-		PN(se.statistics.exec_max);
-		PN(se.statistics.slice_max);
-		PN(se.statistics.wait_max);
-		PN(se.statistics.wait_sum);
-		P(se.statistics.wait_count);
-		PN(se.statistics.iowait_sum);
-		P(se.statistics.iowait_count);
-		P(se.statistics.nr_migrations_cold);
-		P(se.statistics.nr_failed_migrations_affine);
-		P(se.statistics.nr_failed_migrations_running);
-		P(se.statistics.nr_failed_migrations_hot);
-		P(se.statistics.nr_forced_migrations);
-		P(se.statistics.nr_wakeups);
-		P(se.statistics.nr_wakeups_sync);
-		P(se.statistics.nr_wakeups_migrate);
-		P(se.statistics.nr_wakeups_local);
-		P(se.statistics.nr_wakeups_remote);
-		P(se.statistics.nr_wakeups_affine);
-		P(se.statistics.nr_wakeups_affine_attempts);
-		P(se.statistics.nr_wakeups_passive);
-		P(se.statistics.nr_wakeups_idle);
+		PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+		PN_SCHEDSTAT(se.statistics.wait_start);
+		PN_SCHEDSTAT(se.statistics.sleep_start);
+		PN_SCHEDSTAT(se.statistics.block_start);
+		PN_SCHEDSTAT(se.statistics.sleep_max);
+		PN_SCHEDSTAT(se.statistics.block_max);
+		PN_SCHEDSTAT(se.statistics.exec_max);
+		PN_SCHEDSTAT(se.statistics.slice_max);
+		PN_SCHEDSTAT(se.statistics.wait_max);
+		PN_SCHEDSTAT(se.statistics.wait_sum);
+		P_SCHEDSTAT(se.statistics.wait_count);
+		PN_SCHEDSTAT(se.statistics.iowait_sum);
+		P_SCHEDSTAT(se.statistics.iowait_count);
+		P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+		P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+		P_SCHEDSTAT(se.statistics.nr_wakeups);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
 
		avg_atom = p->se.sum_exec_runtime;
		if (nr_switches)
@@@ -935,7 -931,7 +936,7 @@@
		__PN(avg_atom);
		__PN(avg_per_cpu);
	}
-#endif
+
	__P(nr_switches);
	SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw);
@@@ -952,10 -948,8 +953,10 @@@
 #endif
	P(policy);
	P(prio);
+#undef PN_SCHEDSTAT
 #undef PN
 #undef __PN
+#undef P_SCHEDSTAT
 #undef P
 #undef __P
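
For reference, the caller convention this series converges on (visible above in the updated
proc_cgroup_show(), proc_cpuset_show() and cgroup_release_agent()) is the usual strlcpy() one:
cgroup_path(), cgroup_path_ns(), task_cgroup_path() and kernfs_path_from_node() now return the
length of the full path (or -errno on failure) and leave a nul-terminated, possibly truncated
string in the buffer, so truncation is detected by comparing the return value against the buffer
size.  Below is a minimal sketch of such a caller; show_cgrp_path() and its seq_file context are
hypothetical and only illustrate the return-value handling, they are not part of this series.

	/* Hypothetical caller following the strlcpy-style convention above. */
	static int show_cgrp_path(struct seq_file *m, struct cgroup *cgrp)
	{
		char *buf;
		int ret;

		buf = kmalloc(PATH_MAX, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		ret = cgroup_path(cgrp, buf, PATH_MAX);	/* full length or -errno */
		if (ret >= PATH_MAX)			/* full path didn't fit */
			ret = -ENAMETOOLONG;
		if (ret < 0)
			goto out_free;

		seq_puts(m, buf);			/* buf is nul-terminated */
		ret = 0;
	out_free:
		kfree(buf);
		return ret;
	}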