*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
*
* GPL HEADER END
*/
return -EPROTO;
}
- if (body->valid & OBD_MD_FLRMTPERM) {
- struct mdt_remote_perm *perm;
-
- LASSERT(client_is_remote(exp));
- perm = req_capsule_server_swab_get(pill, &RMF_ACL,
- lustre_swab_mdt_remote_perm);
- if (!perm)
- return -EPROTO;
- }
-
return 0;
}
req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
op_data->op_mode);
- if (op_data->op_valid & OBD_MD_FLRMTPERM) {
- LASSERT(client_is_remote(exp));
- req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
- sizeof(struct mdt_remote_perm));
- }
ptlrpc_request_set_replen(req);
rc = mdc_getattr_common(exp, req);
return rc;
}
- rc = posix_acl_valid(acl);
+ rc = posix_acl_valid(&init_user_ns, acl);
if (rc) {
CERROR("validate acl: %d\n", rc);
posix_acl_release(acl);
}
rc = 0;
- if (md->body->valid & OBD_MD_FLRMTPERM) {
- /* remote permission */
- LASSERT(client_is_remote(exp));
- md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
- lustre_swab_mdt_remote_perm);
- if (!md->remote_perm) {
- rc = -EPROTO;
- goto out;
- }
- } else if (md->body->valid & OBD_MD_FLACL) {
+ if (md->body->valid & OBD_MD_FLACL) {
/* for ACL, it's possible that FLACL is set but aclsize is zero.
* only when aclsize != 0 there's an actual segment for ACL
* in reply buffer.
struct md_open_data *mod;
struct mdt_rec_create *rec;
struct mdt_body *body;
- struct ptlrpc_request *open_req = it->d.lustre.it_data;
+ struct ptlrpc_request *open_req = it->it_request;
struct obd_import *imp = open_req->rq_import;
if (!open_req->rq_replay)
goto out;
}
- mdc_pack_body(req, NULL, OBD_MD_FLRMTPERM, 0, -1, 0);
+ mdc_pack_body(req, NULL, 0, 0, -1, 0);
/* Copy hsm_progress struct */
req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS);
goto out;
}
- mdc_pack_body(req, NULL, OBD_MD_FLRMTPERM, 0, -1, 0);
+ mdc_pack_body(req, NULL, 0, 0, -1, 0);
/* Copy hsm_progress struct */
archive_mask = req_capsule_client_get(&req->rq_pill,
return rc;
}
- mdc_pack_body(req, &op_data->op_fid1, OBD_MD_FLRMTPERM, 0,
+ mdc_pack_body(req, &op_data->op_fid1, 0, 0,
op_data->op_suppgids[0], 0);
ptlrpc_request_set_replen(req);
goto out;
}
- mdc_pack_body(req, NULL, OBD_MD_FLRMTPERM, 0, -1, 0);
+ mdc_pack_body(req, NULL, 0, 0, -1, 0);
ptlrpc_request_set_replen(req);
return rc;
}
- mdc_pack_body(req, &op_data->op_fid1, OBD_MD_FLRMTPERM, 0,
+ mdc_pack_body(req, &op_data->op_fid1, 0, 0,
op_data->op_suppgids[0], 0);
ptlrpc_request_set_replen(req);
return rc;
}
- mdc_pack_body(req, &op_data->op_fid1, OBD_MD_FLRMTPERM, 0,
+ mdc_pack_body(req, &op_data->op_fid1, 0, 0,
op_data->op_suppgids[0], 0);
/* Copy states */
return rc;
}
- mdc_pack_body(req, NULL, OBD_MD_FLRMTPERM, 0, -1, 0);
+ mdc_pack_body(req, NULL, 0, 0, -1, 0);
/* Copy hsm_request struct */
req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
case IOC_OBD_STATFS: {
struct obd_statfs stat_buf = {0};
- if (*((__u32 *) data->ioc_inlbuf2) != 0) {
+ if (*((__u32 *)data->ioc_inlbuf2) != 0) {
rc = -ENODEV;
goto out;
}
if (len < sizeof(*lh) + sizeof(*hal)) {
CERROR("Short HSM message %d < %d\n", len,
- (int) (sizeof(*lh) + sizeof(*hal)));
+ (int)(sizeof(*lh) + sizeof(*hal)));
return -EPROTO;
}
if (lh->kuc_magic == __swab16(KUC_MAGIC)) {
return rc;
}
-/* get remote permission for current user on fid */
-static int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
- __u32 suppgid, struct ptlrpc_request **request)
-{
- struct ptlrpc_request *req;
- int rc;
-
- LASSERT(client_is_remote(exp));
-
- *request = NULL;
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- mdc_pack_body(req, fid, OBD_MD_FLRMTPERM, 0, suppgid, 0);
-
- req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
- sizeof(struct mdt_remote_perm));
-
- ptlrpc_request_set_replen(req);
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- ptlrpc_req_finished(req);
- else
- *request = req;
- return rc;
-}
-
static struct obd_ops mdc_obd_ops = {
.owner = THIS_MODULE,
.setup = mdc_setup,
.free_lustre_md = mdc_free_lustre_md,
.set_open_replay_data = mdc_set_open_replay_data,
.clear_open_replay_data = mdc_clear_open_replay_data,
- .get_remote_perm = mdc_get_remote_perm,
.intent_getattr_async = mdc_intent_getattr_async,
.revalidate_lock = mdc_revalidate_lock
};
if (size < 0)
return size;
- if (!ops->direct_access)
+ if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
return -EOPNOTSUPP;
if ((sector + DIV_ROUND_UP(size, 512)) >
part_nr_sects_read(bdev->bd_part))
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
mutex_init(&bdev->bd_fsfreeze_mutex);
}
-static inline void __bd_forget(struct inode *inode)
-{
- list_del_init(&inode->i_devices);
- inode->i_bdev = NULL;
- inode->i_mapping = &inode->i_data;
-}
-
static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
- struct list_head *p;
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
- while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
- __bd_forget(list_entry(p, struct inode, i_devices));
- }
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
}
bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
- list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
spin_lock(&bdev_lock);
if (!sb_is_blkdev_sb(inode->i_sb))
bdev = inode->i_bdev;
- __bd_forget(inode);
+ inode->i_bdev = NULL;
+ inode->i_mapping = &inode->i_data;
spin_unlock(&bdev_lock);
if (bdev)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+ if (IS_ENABLED(CONFIG_BLK_DEV_DAX) &&
+ blk_queue_dax(disk->queue))
bdev->bd_inode->i_flags = S_DAX;
else
bdev->bd_inode->i_flags = 0;
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (path.mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(&path))
goto fail;
error = -ENOMEM;
bdev = bd_acquire(inode);
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_io_list);
+ INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
+ BUG_ON(!list_empty(&inode->i_wb_list));
/* don't need i_lock here, no concurrent mods to i_state */
inode->i_state = I_FREEING | I_CLEAR;
}
if (inode->i_flags & S_NOATIME)
return false;
+
+ /* Atime updates will likely cause i_uid and i_gid to be written
+ * back improperly if their true value is unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return false;
+
if (IS_NOATIME(inode))
return false;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
+ #include <linux/init_task.h>
#include <asm/uaccess.h>
#include "internal.h"
*/
if (IS_IMMUTABLE(inode))
return -EACCES;
+
+ /*
+ * Updating mtime will likely cause i_uid and i_gid to be
+ * written back improperly if their true value is unknown
+ * to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EACCES;
}
retval = do_inode_permission(inode, mask);
{
const struct inode *inode;
const struct inode *parent;
+ kuid_t puid;
if (!sysctl_protected_symlinks)
return 0;
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_eq(parent->i_uid, inode->i_uid))
+ puid = parent->i_uid;
+ if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
bool *need_mntput)
{
struct vfsmount *mnt;
+ const struct cred *old_cred;
int err;
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
path->dentry->d_inode)
return -EISDIR;
+ if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+ return -EACCES;
+
nd->total_link_count++;
if (nd->total_link_count >= 40)
return -ELOOP;
+ old_cred = override_creds(&init_cred);
mnt = path->dentry->d_op->d_automount(path);
+ revert_creds(old_cred);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
}
/*
- * This looks up the name in dcache, possibly revalidates the old dentry and
- * allocates a new one if not found or not valid. In the need_lookup argument
- * returns whether i_op->lookup is necessary.
+ * This looks up the name in dcache and possibly revalidates the found dentry.
+ * NULL is returned if the dentry does not exist in the cache.
*/
static struct dentry *lookup_dcache(const struct qstr *name,
struct dentry *dir,
* payload bytes, to match the way that hash_name() iterates until it
* finds the delimiter after the name.
*/
-unsigned int full_name_hash(const char *name, unsigned int len)
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long a, x = 0, y = 0;
+ unsigned long a, x = 0, y = (unsigned long)salt;
for (;;) {
if (!len)
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
-u64 hashlen_string(const char *name)
+u64 hashlen_string(const void *salt, const char *name)
{
- unsigned long a = 0, x = 0, y = 0, adata, mask, len;
+ unsigned long a = 0, x = 0, y = (unsigned long)salt;
+ unsigned long adata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = -sizeof(unsigned long);
+ len = 0;
+ goto inside;
+
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
+inside:
a = load_unaligned_zeropad(name+len);
} while (!has_zero(a, &adata, &constants));
* Calculate the length and hash of the path component, and
* return the "hash_len" as the result.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len;
+ unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = -sizeof(unsigned long);
+ len = 0;
+ goto inside;
+
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
+inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
-unsigned int full_name_hash(const char *name, unsigned int len)
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
while (len--)
hash = partial_name_hash((unsigned char)*name++, hash);
return end_name_hash(hash);
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
-u64 hashlen_string(const char *name)
+u64 hashlen_string(const void *salt, const char *name)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
* We know there's a real path component here of at least
* one character.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
if (err)
return err;
- hash_len = hash_name(name);
+ hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
}
EXPORT_SYMBOL(vfs_path_lookup);
-/**
- * lookup_hash - lookup single pathname component on already hashed name
- * @name: name and hash to lookup
- * @base: base directory to lookup from
- *
- * The name must have been verified and hashed (see lookup_one_len()). Using
- * this after just full_name_hash() is unsafe.
- *
- * This function also doesn't check for search permission on base directory.
- *
- * Use lookup_one_len_unlocked() instead, unless you really know what you are
- * doing.
- *
- * Do not hold i_mutex; this helper takes i_mutex if necessary.
- */
-struct dentry *lookup_hash(const struct qstr *name, struct dentry *base)
-{
- struct dentry *ret;
-
- ret = lookup_dcache(name, base, 0);
- if (!ret)
- ret = lookup_slow(name, base, 0);
-
- return ret;
-}
-EXPORT_SYMBOL(lookup_hash);
-
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
struct qstr this;
unsigned int c;
int err;
+ struct dentry *ret;
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
if (err)
return ERR_PTR(err);
- return lookup_hash(&this, base);
+ ret = lookup_dcache(&this, base, 0);
+ if (!ret)
+ ret = lookup_slow(&this, base, 0);
+ return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do antyhing with
* links pointing to it.
- * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- * 9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * 7. If the victim has an unknown uid or gid we can't change the inode.
+ * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
return -EPERM;
if (check_sticky(dir, inode) || IS_APPEND(inode) ||
- IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
* 1. We can't do it if child already exists (open has special treatment for
* this case, but since we are inlined it's OK)
* 2. We can't do it if dir is read-only (done in permission())
- * 3. We should have write and exec permissions on dir
- * 4. We can't do it if dir is immutable (done in permission())
+ * 3. We can't do it if the fs can't represent the fsuid or fsgid.
+ * 4. We should have write and exec permissions on dir
+ * 5. We can't do it if dir is immutable (done in permission())
*/
static inline int may_create(struct inode *dir, struct dentry *child)
{
+ struct user_namespace *s_user_ns; /* user namespace recorded on dir's superblock */
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) || /* caller's fsuid must map into s_user_ns */
+ !kgid_has_mapping(s_user_ns, current_fsgid())) /* ...and so must the caller's fsgid */
+ return -EOVERFLOW; /* checked before the directory permission test below */
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
}
EXPORT_SYMBOL(vfs_create);
+ bool may_open_dev(const struct path *path)
+ { /* True only when neither the mount nor its superblock carries a nodev flag. */
+ return !(path->mnt->mnt_flags & MNT_NODEV) && /* per-mount MNT_NODEV flag */
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); /* superblock-wide SB_I_NODEV flag */
+ }
+
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
break;
case S_IFBLK:
case S_IFCHR:
- if (path->mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(path))
return -EACCES;
/*FALLTHRU*/
case S_IFIFO:
}
if (*opened & FILE_CREATED)
fsnotify_create(dir, dentry);
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 1;
+ if (unlikely(d_is_negative(dentry))) {
+ error = -ENOENT;
+ } else {
+ path->dentry = dentry;
+ path->mnt = nd->path.mnt;
+ return 1;
+ }
}
}
dput(dentry);
int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
- struct path save_parent = { .dentry = NULL, .mnt = NULL };
struct path path;
- bool retried = false;
int error;
nd->flags &= ~LOOKUP_PARENT;
return -EISDIR;
}
-retry_lookup:
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = false;
}
+ error = follow_managed(&path, nd);
+ if (unlikely(error < 0))
+ return error;
+
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
return -EEXIST;
}
- error = follow_managed(&path, nd);
- if (unlikely(error < 0))
- return error;
-
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
if (unlikely(error))
return error;
- if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
- path_to_nameidata(&path, nd);
- } else {
- save_parent.dentry = nd->path.dentry;
- save_parent.mnt = mntget(path.mnt);
- nd->path.dentry = path.dentry;
-
- }
+ path_to_nameidata(&path, nd);
nd->inode = inode;
nd->seq = seq;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
error = complete_walk(nd);
- if (error) {
- path_put(&save_parent);
+ if (error)
return error;
- }
audit_inode(nd->name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
- if (!error) {
- *opened |= FILE_OPENED;
- } else {
- if (error == -EOPENSTALE)
- goto stale_open;
+ if (error)
goto out;
- }
+ *opened |= FILE_OPENED;
opened:
error = open_check_o_direct(file);
if (!error)
}
if (got_write)
mnt_drop_write(nd->path.mnt);
- path_put(&save_parent);
return error;
-
-stale_open:
- /* If no saved parent or already retried then can't retry */
- if (!save_parent.dentry || retried)
- goto out;
-
- BUG_ON(save_parent.dentry != dir);
- path_put(&nd->path);
- nd->path = save_parent;
- nd->inode = dir->d_inode;
- save_parent.mnt = NULL;
- save_parent.dentry = NULL;
- if (got_write) {
- mnt_drop_write(nd->path.mnt);
- got_write = false;
- }
- retried = true;
- goto retry_lookup;
}
static int do_tmpfile(struct nameidata *nd, unsigned flags,
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
+ /*
+ * Updating the link count will likely cause i_uid and i_gid to
+ * be written back improperly if their true value is unknown to
+ * the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
* Check source == target.
* On overlayfs need to look at underlying inodes.
*/
- if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
+ if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
goto out_unlock;
lock_mount_hash();
+ event++;
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- /* Was the nodev implicitly added in mount? */
- if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
- !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- mnt_flags |= MNT_NODEV;
- } else {
- return -EPERM;
- }
+ return -EPERM;
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
return err;
}
- static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
/*
* create a new mount for userspace and request it to be added into the
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
if (!type)
return -ENODEV;
- if (user_ns != &init_user_ns) {
- if (!(type->fs_flags & FS_USERNS_MOUNT)) {
- put_filesystem(type);
- return -EPERM;
- }
- /* Only in special cases allow devices from mounts
- * created outside the initial user namespace.
- */
- if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- flags |= MS_NODEV;
- mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
- }
- if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags)) {
- put_filesystem(type);
- return -EPERM;
- }
- }
- }
-
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
if (IS_ERR(mnt))
return PTR_ERR(mnt);
+ if (mount_too_revealing(mnt, &mnt_flags)) {
+ mntput(mnt);
+ return -EPERM;
+ }
+
err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
return chrooted;
}
- static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+ int *new_mnt_flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
- if (unlikely(!ns))
- return false;
-
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != type)
+ if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
- /* Read the mount flags and filter out flags that
- * may safely be ignored.
- */
+ /* A local view of the mount flags */
mnt_flags = mnt->mnt.mnt_flags;
- if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
- mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
/* Don't miss readonly hidden in the superblock flags */
if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
- if ((mnt_flags & MNT_LOCK_NODEV) &&
- !(new_flags & MNT_NODEV))
- continue;
- if ((mnt_flags & MNT_LOCK_NOSUID) &&
- !(new_flags & MNT_NOSUID))
- continue;
- if ((mnt_flags & MNT_LOCK_NOEXEC) &&
- !(new_flags & MNT_NOEXEC))
- continue;
if ((mnt_flags & MNT_LOCK_ATIME) &&
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
- MNT_LOCK_NODEV | \
- MNT_LOCK_NOSUID | \
- MNT_LOCK_NOEXEC | \
MNT_LOCK_ATIME);
visible = true;
goto found;
return visible;
}
+ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+ { /* Returns true when the new mount @mnt should be refused, false when it may proceed. */
+ const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ unsigned long s_iflags;
+ /* Nothing to check when the current mount namespace belongs to init_user_ns. */
+ if (ns->user_ns == &init_user_ns)
+ return false;
+
+ /* Can this filesystem be too revealing? */
+ s_iflags = mnt->mnt_sb->s_iflags;
+ if (!(s_iflags & SB_I_USERNS_VISIBLE))
+ return false;
+ /*
+ * A superblock flagged SB_I_USERNS_VISIBLE is expected to carry both
+ * SB_I_NOEXEC and SB_I_NODEV; warn once and refuse the mount if not.
+ */
+ if ((s_iflags & required_iflags) != required_iflags) {
+ WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+ required_iflags);
+ return true;
+ }
+ /* Too revealing unless mnt_already_visible() reports a match in @ns. */
+ return !mnt_already_visible(ns, mnt, new_mnt_flags);
+ }
+
+ bool mnt_may_suid(struct vfsmount *mnt)
+ { /* True only when all three conditions in the return expression hold. */
+ /*
+ * Foreign mounts (accessed via fchdir or through /proc
+ * symlinks) are always treated as if they are nosuid. This
+ * prevents namespaces from trusting potentially unsafe
+ * suid/sgid bits, file caps, or security labels that originate
+ * in other namespaces.
+ */
+ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && /* not nosuid, and check_mnt() accepts the mount */
+ current_in_userns(mnt->mnt_sb->s_user_ns); /* superblock's s_user_ns must pass current_in_userns() */
+ }
+
static struct ns_common *mntns_get(struct task_struct *task)
{
struct ns_common *ns = NULL;
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int exports_nfsd_open(struct inode *inode, struct file *file)
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int export_features_show(struct seq_file *m, void *v)
.read = seq_read,
.llseek = seq_lseek,
.release = nfsd_pool_stats_release,
- .owner = THIS_MODULE,
};
static struct file_operations reply_cache_stats_operations = {
#endif
/* last one */ {""}
};
- struct net *net = data;
- int ret;
-
- ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
- if (ret)
- return ret;
- sb->s_fs_info = get_net(net);
- return 0;
+ get_net(sb->s_fs_info);
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
}
static void nfsd_umount(struct super_block *sb)
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
*/
int
- posix_acl_valid(const struct posix_acl *acl)
+ posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
case ACL_USER:
if (state != ACL_USER)
return -EINVAL;
- if (!uid_valid(pa->e_uid))
+ if (!kuid_has_mapping(user_ns, pa->e_uid))
return -EINVAL;
needs_mask = 1;
break;
case ACL_GROUP:
if (state != ACL_GROUP)
return -EINVAL;
- if (!gid_valid(pa->e_gid))
+ if (!kgid_has_mapping(user_ns, pa->e_gid))
return -EINVAL;
needs_mask = 1;
break;
return error;
}
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
- struct dentry *unused, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
+int
+set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
{
- struct posix_acl *acl = NULL;
- int ret;
-
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
- return value ? -EACCES : 0;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
- int ret = posix_acl_valid(acl);
+ if (acl) {
++ int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
+ if (ret)
+ return ret;
+ }
+ return inode->i_op->set_acl(inode, acl, type);
+}
+EXPORT_SYMBOL(set_posix_acl);
+
+static int
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct posix_acl *acl = NULL;
+ int ret;
+
if (value) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
-
- if (acl) {
- ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
- if (ret)
- goto out;
- }
}
-
- ret = inode->i_op->set_acl(inode, acl, handler->flags);
-out:
+ ret = set_posix_acl(inode, handler->flags, acl);
posix_acl_release(acl);
return ret;
}
return inode;
}
- int proc_fill_super(struct super_block *s)
+ int proc_fill_super(struct super_block *s, void *data, int silent)
{
+ struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
+ if (!proc_parse_options(data, ns))
+ return -EINVAL;
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
++
++ /*
++ * procfs isn't actually a stacking filesystem; however, there is
++ * too much magic going on inside it to permit stacking things on
++ * top of it
++ */
++ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
unsigned int hashent = hashfn(sb, qid);
struct dquot *dquot, *empty = NULL;
+ if (!qid_has_mapping(sb->s_user_ns, qid))
+ return ERR_PTR(-EINVAL);
+
if (!sb_has_quota_active(sb, qid.type))
return ERR_PTR(-ESRCH);
we_slept:
else
dquot->dq_dqb.dqb_curinodes = 0;
if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
- dquot->dq_dqb.dqb_itime = (time_t) 0;
+ dquot->dq_dqb.dqb_itime = (time64_t) 0;
clear_bit(DQ_INODES_B, &dquot->dq_flags);
}
else
dquot->dq_dqb.dqb_curspace = 0;
if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
- dquot->dq_dqb.dqb_btime = (time_t) 0;
+ dquot->dq_dqb.dqb_btime = (time64_t) 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
if (dquot->dq_dqb.dqb_isoftlimit &&
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime &&
- get_seconds() >= dquot->dq_dqb.dqb_itime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
return -EDQUOT;
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime == 0) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
- dquot->dq_dqb.dqb_itime = get_seconds() +
+ dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() +
sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
tspace > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime &&
- get_seconds() >= dquot->dq_dqb.dqb_btime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
- dquot->dq_dqb.dqb_btime = get_seconds() +
+ dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
}
else
error = -EINVAL;
goto out_fmt;
}
+ /* Filesystems outside of init_user_ns not yet supported */
+ if (sb->s_user_ns != &init_user_ns) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
/* Usage always has to be set... */
if (!(flags & DQUOT_USAGE_ENABLED)) {
error = -EINVAL;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_SPC_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
}
if (check_ilim) {
if (!dm->dqb_isoftlimit ||
clear_bit(DQ_INODES_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_INO_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
}
if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
dm->dqb_isoftlimit)
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
+ #include <linux/user_namespace.h>
#include "internal.h"
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
+ put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
kfree(s->s_options);
call_rcu(&s->rcu, destroy_super_rcu);
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
* @flags: the mount flags
+ * @user_ns: User namespace for the super_block
*
* Allocates and initializes a new &struct super_block. alloc_super()
* returns a pointer new superblock or %NULL if allocation had failed.
*/
- static struct super_block *alloc_super(struct file_system_type *type, int flags)
+ static struct super_block *alloc_super(struct file_system_type *type, int flags,
+ struct user_namespace *user_ns)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
return NULL;
INIT_LIST_HEAD(&s->s_mounts);
+ s->s_user_ns = get_user_ns(user_ns);
if (security_sb_alloc(s))
goto fail;
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
+ if (s->s_user_ns != &init_user_ns)
+ s->s_iflags |= SB_I_NODEV;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
mutex_init(&s->s_sync_lock);
INIT_LIST_HEAD(&s->s_inodes);
spin_lock_init(&s->s_inode_list_lock);
+ INIT_LIST_HEAD(&s->s_inodes_wb);
+ spin_lock_init(&s->s_inode_wblist_lock);
if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
EXPORT_SYMBOL(generic_shutdown_super);
/**
- * sget - find or create a superblock
+ * sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @flags: mount flags
+ * @user_ns: User namespace for the super_block
* @data: argument to each of them
*/
- struct super_block *sget(struct file_system_type *type,
+ struct super_block *sget_userns(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
- int flags,
+ int flags, struct user_namespace *user_ns,
void *data)
{
struct super_block *s = NULL;
struct super_block *old;
int err;
+ if (!(flags & MS_KERNMOUNT) &&
+ !(type->fs_flags & FS_USERNS_MOUNT) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ if (s) {
+ up_write(&s->s_umount);
+ destroy_super(s);
+ }
+ return ERR_PTR(-EBUSY);
+ }
if (!grab_super(old))
goto retry;
if (s) {
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, flags);
+ s = alloc_super(type, flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
return s;
}
+ EXPORT_SYMBOL(sget_userns);
+
+ /**
+ * sget - find or create a superblock
+ * @type: filesystem type superblock should belong to
+ * @test: comparison callback
+ * @set: setup callback
+ * @flags: mount flags
+ * @data: argument to each of them
+ *
+ * Convenience wrapper around sget_userns() that passes the caller's
+ * user namespace (current_user_ns()) as the namespace for the
+ * super_block. Unless @flags contains MS_KERNMOUNT, the caller must
+ * have CAP_SYS_ADMIN in that namespace; if it does not,
+ * ERR_PTR(-EPERM) is returned and sget_userns() is never called.
+ * Otherwise the result of sget_userns() is returned unchanged.
+ */
+ struct super_block *sget(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags,
+ void *data)
+ {
+ struct user_namespace *user_ns = current_user_ns();
+
+ /* Ensure the requestor has permissions over the target filesystem */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return sget_userns(type, test, set, flags, user_ns, data);
+ }
+
EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
return set_anon_super(sb, NULL);
}
- struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int))
+ struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int))
{
struct super_block *sb;
- sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+ user_ns, ns);
if (IS_ERR(sb))
return ERR_CAST(sb);
#define CHECK_IOVEC_ONLY -1
/*
- * The below are the various read and write types that we support. Some of
+ * The below are the various read and write flags that we support. Some of
* them include behavioral modifiers that send information down to the
- * block layer and IO scheduler. Terminology:
+ * block layer and IO scheduler. They should be used along with a req_op.
+ * Terminology:
*
* The block layer uses device plugging to defer IO a little bit, in
* the hope that we will see more IO very shortly. This increases
* READ_SYNC A synchronous read. Device is not plugged, caller can
* immediately wait on this read without caring about
* unplugging.
- * READA Used for read-ahead operations. Lower priority, and the
- * block layer could (in theory) choose to ignore this
- * request if it runs into resource problems.
* WRITE A normal async write. Device will be plugged.
* WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
* the hint that someone will be waiting on this IO
* non-volatile media on completion.
*
*/
-#define RW_MASK REQ_WRITE
-#define RWA_MASK REQ_RAHEAD
+#define RW_MASK REQ_OP_WRITE
-#define READ 0
-#define WRITE RW_MASK
-#define READA RWA_MASK
+#define READ REQ_OP_READ
+#define WRITE REQ_OP_WRITE
-#define READ_SYNC (READ | REQ_SYNC)
-#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE)
-#define WRITE_ODIRECT (WRITE | REQ_SYNC)
-#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
-#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
-#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+#define READ_SYNC REQ_SYNC
+#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE)
+#define WRITE_ODIRECT REQ_SYNC
+#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
+#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
+#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
/*
* Attribute flags. These should be or-ed together to figure out what
*/
int (*migratepage) (struct address_space *,
struct page *, struct page *, enum migrate_mode);
+ bool (*isolate_page)(struct page *, isolate_mode_t);
+ void (*putback_page)(struct page *);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, unsigned long,
unsigned long);
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
struct mutex bd_mutex; /* open/close mutex */
- struct list_head bd_inodes;
void * bd_claiming;
void * bd_holder;
int bd_holders;
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
+ struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
struct rcu_head i_rcu;
#endif
}
- /* Helper functions so that in most cases filesystems will
- * not need to deal directly with kuid_t and kgid_t and can
- * instead deal with the raw numeric values that are stored
- * in the filesystem.
- */
- static inline uid_t i_uid_read(const struct inode *inode)
- {
- return from_kuid(&init_user_ns, inode->i_uid);
- }
-
- static inline gid_t i_gid_read(const struct inode *inode)
- {
- return from_kgid(&init_user_ns, inode->i_gid);
- }
-
- static inline void i_uid_write(struct inode *inode, uid_t uid)
- {
- inode->i_uid = make_kuid(&init_user_ns, uid);
- }
-
- static inline void i_gid_write(struct inode *inode, gid_t gid)
- {
- inode->i_gid = make_kgid(&init_user_ns, gid);
- }
-
static inline unsigned iminor(const struct inode *inode)
{
return MINOR(inode->i_rdev);
static inline struct dentry *file_dentry(const struct file *file)
{
- struct dentry *dentry = file->f_path.dentry;
-
- if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
- return dentry->d_op->d_real(dentry, file_inode(file));
- else
- return dentry;
+ return d_real(file->f_path.dentry, file_inode(file), 0);
}
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
/* sb->s_iflags */
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
+ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
+
+ /* sb->s_iflags to limit user namespace mounts */
+ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
/* Possible states of 'frozen' field */
enum {
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
+ /*
+ * Owning user namespace and default context in which to
+ * interpret filesystem uids, gids, quotas, device nodes,
+ * xattrs and security labels.
+ */
+ struct user_namespace *s_user_ns;
+
/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.
/* s_inode_list_lock protects s_inodes */
spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
struct list_head s_inodes; /* all inodes */
+
+ spinlock_t s_inode_wblist_lock;
+ struct list_head s_inodes_wb; /* writeback inodes */
};
+ /* Helper functions so that in most cases filesystems will
+ * not need to deal directly with kuid_t and kgid_t and can
+ * instead deal with the raw numeric values that are stored
+ * in the filesystem.
+ *
+ * Every helper converts relative to the user namespace recorded
+ * on the inode's superblock (inode->i_sb->s_user_ns): the read
+ * helpers go through from_kuid()/from_kgid(), the write helpers
+ * store the result of make_kuid()/make_kgid() into the inode.
+ */
+ static inline uid_t i_uid_read(const struct inode *inode)
+ {
+ return from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+ }
+
+ static inline gid_t i_gid_read(const struct inode *inode)
+ {
+ return from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+ }
+
+ static inline void i_uid_write(struct inode *inode, uid_t uid)
+ {
+ inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid);
+ }
+
+ static inline void i_gid_write(struct inode *inode, gid_t gid)
+ {
+ inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
+ }
+
extern struct timespec current_fs_time(struct super_block *sb);
/*
*/
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
umode_t mode);
+ extern bool may_open_dev(const struct path *path);
/*
* VFS FS_IOC_FIEMAP helper definitions.
*/
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+ static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+ { /* true when i_uid fails uid_valid() or i_gid fails gid_valid() */
+ return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+ }
+
/*
* Inode state bits. Protected by inode->i_lock
*
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
- #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
- #define FS_USERNS_VISIBLE 32 /* FS must already be visible */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
- extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int));
+ extern struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
int set_anon_super(struct super_block *s, void *data);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
+ struct super_block *sget_userns(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags, struct user_namespace *user_ns,
+ void *data);
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
extern bool is_bad_inode(struct inode *);
#ifdef CONFIG_BLOCK
-/*
- * return READ, READA, or WRITE
- */
-#define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK))
+static inline bool op_is_write(unsigned int op)
+{ /* every op value other than REQ_OP_READ is reported as a write */
+ return op == REQ_OP_READ ? false : true;
+}
/*
* return data direction, READ or WRITE
*/
-#define bio_data_dir(bio) ((bio)->bi_rw & 1)
+static inline int bio_data_dir(struct bio *bio)
+{ /* direction is derived from bio_op(bio) via op_is_write() */
+ return op_is_write(bio_op(bio)) ? WRITE : READ;
+}
extern void check_disk_size_change(struct gendisk *disk,
struct block_device *bdev);
loff_t start, loff_t end, int sync_mode);
extern int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
+extern int filemap_check_errors(struct address_space *mapping);
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern void inode_sb_list_add(struct inode *inode);
#ifdef CONFIG_BLOCK
-extern blk_qc_t submit_bio(int, struct bio *);
+extern blk_qc_t submit_bio(struct bio *);
extern int bdev_read_only(struct block_device *);
#endif
extern int set_blocksize(struct block_device *, int);
extern int nonseekable_open(struct inode * inode, struct file * filp);
#ifdef CONFIG_BLOCK
-typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
+typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
loff_t file_offset);
enum {
};
struct posix_acl {
- union {
- atomic_t a_refcount;
- struct rcu_head a_rcu;
- };
+ atomic_t a_refcount;
+ struct rcu_head a_rcu;
unsigned int a_count;
struct posix_acl_entry a_entries[0];
};
extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
- extern int posix_acl_valid(const struct posix_acl *);
+ extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
return kqid;
}
+ /**
+ * qid_has_mapping - Report if a qid maps into a user namespace.
+ * @ns: The user namespace to see if a value maps into.
+ * @qid: The kernel internal quota identifier to test.
+ *
+ * Returns true when from_kqid(@ns, @qid) yields anything other than
+ * (qid_t) -1, and false otherwise.
+ */
+ static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid)
+ {
+ return from_kqid(ns, qid) != (qid_t) -1;
+ }
+
extern spinlock_t dq_data_lock;
qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */
qsize_t dqb_isoftlimit; /* preferred inode limit */
qsize_t dqb_curinodes; /* current # allocated inodes */
- time_t dqb_btime; /* time limit for excessive disk use */
- time_t dqb_itime; /* time limit for excessive inode use */
+ time64_t dqb_btime; /* time limit for excessive disk use */
+ time64_t dqb_itime; /* time limit for excessive inode use */
};
/*