diff options
Diffstat (limited to 'fs')
347 files changed, 10414 insertions, 9378 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 1a940ec7af6..91fba025fcb 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o vfs_dir.o \ vfs_dentry.o \ v9fs.o \ - fid.o + fid.o \ + xattr.o \ + xattr_user.o 9p-$(CONFIG_9P_FSCACHE) += cache.o diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 7317b39b281..35856368906 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) return ret; } +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. + */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + /** * v9fs_fid_lookup - lookup for a fid, try to walk if not found * @dentry: dentry to look for fid in @@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) int i, n, l, clone, any, access; u32 uid; struct p9_fid *fid, *old_fid = NULL; - struct dentry *d, *ds; + struct dentry *ds; struct v9fs_session_info *v9ses; char **wnames, *uname; @@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) fid = v9fs_fid_find(dentry, uid, any); if (fid) return fid; - + /* + * we don't have a matching fid. To do a TWALK we need + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); ds = dentry->d_parent; fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* walk from the root */ - n = 0; - for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) - n++; + if (fid) { + /* Found the parent fid do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); - fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* the user is not attached to the fs yet */ - if (access == V9FS_ACCESS_SINGLE) - return ERR_PTR(-EPERM); + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); - if (v9fs_proto_dotu(v9ses)) + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) uname = NULL; - else - uname = v9ses->uname; + else + uname = v9ses->uname; - fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, - v9ses->aname); - - if (IS_ERR(fid)) - return fid; - - v9fs_fid_add(ds, fid); - } - } else /* walk from the parent */ - n = 1; + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; - if (ds == dentry) + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are root ourself just return that */ + if (dentry->d_sb->s_root == dentry) return fid; - - wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); - if (!wnames) - return ERR_PTR(-ENOMEM); - - for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent) - wnames[i] = (char *) d->d_name.name; - + /* + * Do a multipath walk with attached root. + * When walking parent we need to make sure we + * don't have a parallel rename happening + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } clone = 1; i = 0; while (i < n) { l = min(n - i, P9_MAXWELEM); + /* + * We need to hold rename lock when doing a multipath + * walk to ensure none of the patch component change + */ fid = p9_client_walk(fid, l, &wnames[i], clone); if (IS_ERR(fid)) { if (old_fid) { @@ -193,15 +234,17 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) p9_client_clunk(old_fid); } kfree(wnames); - return fid; + goto err_out; } old_fid = fid; i += l; clone = 0; } - kfree(wnames); +fid_out: v9fs_fid_add(dentry, fid); +err_out: + up_read(&v9ses->rename_sem); return fid; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index f8b86e92cd6..38dc0e06759 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, __putname(v9ses->uname); return ERR_PTR(-ENOMEM); } + init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); if (rc) { @@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_proto_dotu(v9ses) && + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index bec4d0bcb45..4c963c9fc41 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -104,6 +104,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; + struct rw_semaphore rename_sem; }; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 32ef4009d03..f47c6bbb01b 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -55,6 +55,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode); void v9fs_clear_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index d61e3b28ce3..16c8a2a98c1 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf) } /** - * v9fs_dir_readdir - read a directory + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure - * @dirent: directory structure ??? - * @filldir: function to populate directory structure ??? + * @buflen: Length in bytes of buffer to allocate * */ -static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - int over; - struct p9_wstat st; - int err = 0; - struct p9_fid *fid; - int buflen; - int reclen = 0; struct p9_rdir *rdir; + struct p9_fid *fid; + int err = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; - - buflen = fid->clnt->msize - P9_IOHDRSZ; - - /* allocate rdir on demand */ if (!fid->rdir) { rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); @@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_unlock(&filp->f_dentry->d_lock); kfree(rdir); } +exit: + return err; +} + +/** + * v9fs_dir_readdir - read a directory + * @filp: opened file structure + * @dirent: directory structure ??? + * @filldir: function to populate directory structure ??? + * + */ + +static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + int over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; rdir = (struct p9_rdir *) fid->rdir; err = mutex_lock_interruptible(&rdir->mutex); @@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, - buflen - rdir->head, &st, + rdir->tail - rdir->head, &st, fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); @@ -176,6 +196,88 @@ exit: return err; } +/** + * v9fs_dir_readdir_dotl - read a directory + * @filp: opened file structure + * @dirent: buffer to fill dirent structures + * @filldir: function to populate dirent structures + * + */ +static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, + filldir_t filldir) +{ + int over; + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + u64 oldoffset = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + + while (err == 0) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(rdir->buf + rdir->head, + buflen - rdir->head, &curdirent, + fid->clnt->proto_version); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + goto unlock_and_exit; + } + + /* d_off in dirent structure tracks the offset into + * the next dirent in the dir. However, filldir() + * expects offset into the current dirent. Hence + * while calling filldir send the offset from the + * previous dirent structure. + */ + over = filldir(dirent, curdirent.d_name, + strlen(curdirent.d_name), + oldoffset, v9fs_qid2ino(&curdirent.qid), + curdirent.d_type); + oldoffset = curdirent.d_off; + + if (over) { + err = 0; + goto unlock_and_exit; + } + + filp->f_pos = curdirent.d_off; + rdir->head += err; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + /** * v9fs_dir_release - close a directory @@ -207,7 +309,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir, + .readdir = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 2bedc6c94fc..e97c92bd6f1 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); + if (v9fs_proto_dotl(v9ses)) + omode = file->f_flags; + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (omode & P9_OTRUNC) { + if (file->f_flags & O_TRUNC) { i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); } @@ -139,7 +144,7 @@ ssize_t v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, u64 offset) { - int n, total; + int n, total, size; struct p9_fid *fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, @@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, n = 0; total = 0; + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; do { n = p9_client_read(fid, data, udata, offset, count); if (n <= 0) @@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, offset += n; count -= n; total += n; - } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ)); + } while (count > 0 && n == size); if (n < 0) total = n; @@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, { int ret; struct p9_fid *fid; + size_t size; P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; - if (count > (fid->clnt->msize - P9_IOHDRSZ)) + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + if (count > size) ret = v9fs_file_readn(filp, NULL, udata, count, *offset); else ret = p9_client_read(fid, NULL, udata, *offset, count); @@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data, fid = filp->private_data; clnt = fid->clnt; - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; + rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; do { if (count < rsize) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4331b3b5ee1..6e94f3247ce 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -35,6 +35,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -42,6 +43,7 @@ #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" +#include "xattr.h" static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; @@ -236,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode) #endif /** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. + * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&dcache_lock); + return dentry; +} + +/** * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with @@ -267,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_proto_dotu(v9ses)) { + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -396,23 +439,14 @@ void v9fs_clear_inode(struct inode *inode) #endif } -/** - * v9fs_inode_from_fid - populate an inode by issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ - static struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, +v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { int err, umode; - struct inode *ret; + struct inode *ret = NULL; struct p9_wstat *st; - ret = NULL; st = p9_client_stat(fid); if (IS_ERR(st)) return ERR_CAST(st); @@ -433,15 +467,62 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, #endif p9stat_free(st); kfree(st); - return ret; - error: p9stat_free(st); kfree(st); return ERR_PTR(err); } +static struct inode * +v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct inode *ret = NULL; + int err; + struct p9_stat_dotl *st; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return ERR_CAST(st); + + ret = v9fs_get_inode(sb, st->st_mode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + goto error; + } + + v9fs_stat2inode_dotl(st, ret); + ret->i_ino = v9fs_qid2ino(&st->qid); +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif + kfree(st); + return ret; +error: + kfree(st); + return ERR_PTR(err); +} + +/** + * v9fs_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_dotl(v9ses, fid, sb); + else + return v9fs_inode(v9ses, fid, sb); +} + /** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -563,6 +644,118 @@ error: } /** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err = 0; + char *name = NULL; + gid_t gid; + int flags; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL; + struct p9_fid *dfid, *ofid; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, mode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + + /* No need to populate the inode if we are not opening the file AND + * not in cached mode. + */ + if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) { + /* Not in cached mode. No need to populate inode with stat */ + dentry->d_op = &v9fs_dentry_operations; + p9_client_clunk(ofid); + d_instantiate(dentry, NULL); + return 0; + } + + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { + filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + if (IS_ERR(filp)) { + p9_client_clunk(ofid); + return PTR_ERR(filp); + } + filp->private_data = ofid; + } else + p9_client_clunk(ofid); + + return 0; + +error: + if (ofid) + p9_client_clunk(ofid); + if (fid) + p9_client_clunk(fid); + return err; +} + +/** * v9fs_vfs_create - VFS hook to create files * @dir: directory inode that is being created * @dentry: dentry that is being deleted @@ -652,6 +845,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return err; } + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, + int mode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + mode |= S_IFDIR; + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } +error: + if (fid) + p9_client_clunk(fid); + return err; +} + /** * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from @@ -678,6 +948,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) return ERR_CAST(dfid); @@ -785,27 +1056,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto clunk_olddir; } + down_write(&v9ses->rename_sem); if (v9fs_proto_dotl(v9ses)) { retval = p9_client_rename(oldfid, newdirfid, (char *) new_dentry->d_name.name); if (retval != -ENOSYS) goto clunk_newdir; } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ - /* 9P can only handle file rename in the same directory */ - if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } - v9fs_blank_wstat(&wstat); wstat.muid = v9ses->uname; wstat.name = (char *) new_dentry->d_name.name; retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: + if (!retval) + /* successful rename */ + d_move(old_dentry, new_dentry); + up_write(&v9ses->rename_sem); p9_client_clunk(newdirfid); clunk_olddir: @@ -853,6 +1130,42 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return simple_getattr(mnt, dentry, stat); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + /** * v9fs_vfs_setattr - set file metadata * @dentry: file whose metadata to set @@ -903,6 +1216,49 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_setattr(fid, &p9attr); + if (retval >= 0) + retval = inode_setattr(dentry->d_inode, iattr); + + return retval; +} + +/** * v9fs_stat2inode - populate an inode structure with mistat info * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate @@ -980,6 +1336,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + inode->i_mode = stat->st_mode; + inode->i_rdev = new_decode_dev(stat->st_rdev); + + if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ +} + +/** * v9fs_qid2ino - convert qid into inode number * @qid: qid to hash * @@ -1022,7 +1449,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_proto_dotu(v9ses)) + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1128,6 +1555,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, } /** + * v9fs_vfs_symlink_dotl - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.L RFC for more information + * + */ + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct inode *inode; + struct p9_qid qid; + char *name; + int err; + gid_t gid; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid); + goto error; + } + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* Not in cached mode. No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** * v9fs_vfs_symlink - helper function to create symlinks * @dir: directory inode containing symlink * @dentry: dentry for symlink @@ -1186,6 +1706,76 @@ clunk_fid: } /** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct p9_fid *dfid, *oldfid; + char *name; + struct v9fs_session_info *v9ses; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + struct p9_stat_dotl *st; + + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, old_dentry->d_inode); + + kfree(st); + } else { + /* Caching disabled. No need to get upto date stat info. + * This dentry will be released immediately. So, just i_count++ + */ + atomic_inc(&old_dentry->d_inode->i_count); + } + + dentry->d_op = old_dentry->d_op; + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** * v9fs_vfs_mknod - create a special file * @dir: inode destination for new link * @dentry: dentry for file @@ -1230,6 +1820,100 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int err; + char *name; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + gid_t gid; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + return err; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -1238,24 +1922,29 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; static const struct inode_operations v9fs_dir_inode_operations_dotl = { - .create = v9fs_vfs_create, + .create = v9fs_vfs_create_dotl, .lookup = v9fs_vfs_lookup, - .symlink = v9fs_vfs_symlink, - .link = v9fs_vfs_link, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, - .mkdir = v9fs_vfs_mkdir, + .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + }; static const struct inode_operations v9fs_dir_inode_operations = { @@ -1276,8 +1965,12 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_file_inode_operations_dotl = { - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; static const struct inode_operations v9fs_symlink_inode_operations = { @@ -1292,6 +1985,10 @@ static const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index be74d020436..4b9ede0b41b 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -45,6 +45,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" +#include "xattr.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; @@ -77,9 +78,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - if (v9fs_proto_dotl(v9ses)) + if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; - else + sb->s_xattr = v9fs_xattr_handlers; + } else sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; @@ -107,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; @@ -124,16 +125,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto close_session; } - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto clunk_fid; - } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); - goto free_stat; + goto clunk_fid; } v9fs_fill_super(sb, v9ses, flags, data); @@ -151,22 +146,38 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, } sb->s_root = root; - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + v9fs_stat2inode_dotl(st, root->d_inode); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + + p9stat_free(st); + kfree(st); + } v9fs_fid_add(root, fid); - p9stat_free(st); - kfree(st); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -free_stat: - p9stat_free(st); - kfree(st); - clunk_fid: p9_client_clunk(fid); @@ -176,8 +187,6 @@ close_session: return retval; release_sb: - p9stat_free(st); - kfree(st); deactivate_locked_super(sb); return retval; } @@ -278,4 +287,5 @@ struct file_system_type v9fs_fs_type = { .get_sb = v9fs_get_sb, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c new file mode 100644 index 00000000000..f88e5c2dc87 --- /dev/null +++ b/fs/9p/xattr.c @@ -0,0 +1,160 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> + +#include "fid.h" +#include "xattr.h" + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + int msize, read_count; + u64 offset = 0, attr_size; + struct p9_fid *fid, *attr_fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_attrwalk failed %zd\n", retval); + attr_fid = NULL; + goto error; + } + if (!buffer_size) { + /* request to get the attr_size */ + retval = attr_size; + goto error; + } + if (attr_size > buffer_size) { + retval = -ERANGE; + goto error; + } + msize = attr_fid->clnt->msize; + while (attr_size) { + if (attr_size > (msize - P9_IOHDRSZ)) + read_count = msize - P9_IOHDRSZ; + else + read_count = attr_size; + read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, + NULL, offset, read_count); + if (read_count < 0) { + /* error in xattr read */ + retval = read_count; + goto error; + } + offset += read_count; + attr_size -= read_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (attr_fid) + p9_client_clunk(attr_fid); + return retval; + +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + u64 offset = 0; + int retval, msize, write_count; + struct p9_fid *fid = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", + __func__, name, value_len, flags); + + fid = v9fs_fid_clone(dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + goto error; + } + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_xattrcreate failed %d\n", retval); + goto error; + } + msize = fid->clnt->msize;; + while (value_len) { + if (value_len > (msize - P9_IOHDRSZ)) + write_count = msize - P9_IOHDRSZ; + else + write_count = value_len; + write_count = p9_client_write(fid, ((char *)value)+offset, + NULL, offset, write_count); + if (write_count < 0) { + /* error in xattr write */ + retval = write_count; + goto error; + } + offset += write_count; + value_len -= write_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (fid) + retval = p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, + NULL +}; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h new file mode 100644 index 00000000000..9ddf672ae5c --- /dev/null +++ b/fs/9p/xattr.h @@ -0,0 +1,27 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; + +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c new file mode 100644 index 00000000000..d0b701b7208 --- /dev/null +++ b/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; diff --git a/fs/Kconfig b/fs/Kconfig index 5f85b594761..3d185308ec8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -64,7 +64,7 @@ source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" config CUSE - tristate "Character device in Userpace support" + tristate "Character device in Userspace support" depends on FUSE_FS help This FUSE extension allows character devices to be diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 5c4e61d3c77..8f975f25b48 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -2,6 +2,7 @@ config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select AF_RXRPC + select DNS_RESOLVER help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/cell.c b/fs/afs/cell.c index e19c13f059e..ffea35c6387 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> +#include <linux/dns_resolver.h> #include <linux/sched.h> #include <keys/rxrpc-type.h> #include "internal.h" @@ -36,6 +37,8 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) struct key *key; size_t namelen; char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; int ret; _enter("%s,%s", name, vllist); @@ -43,8 +46,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ namelen = strlen(name); - if (namelen > AFS_MAXCELLNAME) + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); + } /* allocate and initialise a cell record */ cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); @@ -64,15 +69,31 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) INIT_LIST_HEAD(&cell->vl_list); spin_lock_init(&cell->vl_lock); + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + /* fill in the VL server list from the rest of the string */ do { unsigned a, b, c, d; - next = strchr(vllist, ':'); + next = strchr(_vllist, delimiter); if (next) *next++ = 0; - if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) goto bad_address; if (a > 255 || b > 255 || c > 255 || d > 255) @@ -81,7 +102,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) cell->vl_addrs[cell->vl_naddrs++].s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); /* create a key to represent an anonymous user */ memcpy(keyname, "afs@", 4); @@ -110,6 +131,7 @@ bad_address: ret = -EINVAL; error: key_put(cell->anonymous_key); + kfree(dvllist); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -201,14 +223,12 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) { - printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); - _leave(" = -EINVAL"); - return -EINVAL; - } + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; /* allocate a cell record for the root cell */ - *cp++ = 0; new_root = afs_cell_create(rootcell, cp); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); diff --git a/fs/afs/main.c b/fs/afs/main.c index 66d54d348c5..cfd1cbe25b2 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -111,6 +111,8 @@ static int __init afs_init(void) /* initialise the callback update process */ ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; /* create the RxRPC transport */ ret = afs_open_socket(); @@ -127,15 +129,16 @@ static int __init afs_init(void) error_fs: afs_close_socket(); error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); error_vl_update_init: + afs_cell_purge(); error_cell_init: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif - afs_callback_update_kill(); - afs_vlocation_purge(); - afs_cell_purge(); afs_proc_cleanup(); rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); diff --git a/fs/afs/server.c b/fs/afs/server.c index f4909951667..9fdc7fe3a7b 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, memcpy(&server->addr, addr, sizeof(struct in_addr)); server->addr.s_addr = addr->s_addr; + _leave(" = %p{%d}", server, atomic_read(&server->usage)); + } else { + _leave(" = NULL [nomem]"); } - - _leave(" = %p{%d}", server, atomic_read(&server->usage)); return server; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 3dab9e9948d..722743b152d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode) { struct address_space *mapping = vnode->vfs_inode.i_mapping; struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_cyclic = 1, @@ -1277,7 +1277,7 @@ out: /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not - * implemented. May fail with -EFAULT if the context pointed to + * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) @@ -1795,15 +1795,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, /* io_getevents: * Attempts to read at least min_nr events and up to nr events from - * the completion queue for the aio_context specified by ctx_id. May - * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, - * if nr is out of range, if when is out of range. May fail with - * -EFAULT if any of the memory specified to is invalid. May return - * 0 or < min_nr if no events are available and the timeout specified - * by when has elapsed, where when == NULL specifies an infinite - * timeout. Note that the timeout pointed to by when is relative and - * will be updated if not NULL and the operation blocks. Will fail - * with -ENOSYS if not implemented. + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative and will be updated if not NULL and the + * operation blocks. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 34ddda888e6..dc39d282488 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -436,7 +436,7 @@ befs_init_inodecache(void) init_once); if (befs_inode_cachep == NULL) { printk(KERN_ERR "befs_init_inodecache: " - "Couldn't initalize inode slabcache\n"); + "Couldn't initialize inode slabcache\n"); return -ENOMEM; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c5f9a0e5d7..63039ed9576 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( /* clear any space allocated but not loaded */ if (phdr->p_filesz < phdr->p_memsz) { - ret = clear_user((void *) (seg->addr + phdr->p_filesz), - phdr->p_memsz - phdr->p_filesz); - if (ret) - return ret; + if (clear_user((void *) (seg->addr + phdr->p_filesz), + phdr->p_memsz - phdr->p_filesz)) + return -EFAULT; } if (mm) { @@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, delta_vaddr; - int loop, dvset, ret; + int loop, dvset; load_addr = params->load_addr; delta_vaddr = 0; @@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * PT_LOAD */ if (prot & PROT_WRITE && disp > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); - ret = clear_user((void __user *) maddr, disp); - if (ret) - return ret; + if (clear_user((void __user *) maddr, disp)) + return -EFAULT; maddr += disp; } @@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (prot & PROT_WRITE && excess1 > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess1); - ret = clear_user((void __user *) maddr + phdr->p_filesz, - excess1); - if (ret) - return ret; + if (clear_user((void __user *) maddr + phdr->p_filesz, + excess1)) + return -EFAULT; } #else if (excess > 0) { kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr + phdr->p_filesz, excess); - ret = clear_user((void *) maddr + phdr->p_filesz, excess); - if (ret) - return ret; + if (clear_user((void *) maddr + phdr->p_filesz, excess)) + return -EFAULT; } #endif diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 49566c1687d..811384bec8d 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -56,16 +56,19 @@ #endif /* - * User data (stack, data section and bss) needs to be aligned - * for the same reasons as SLAB memory is, and to the same amount. - * Avoid duplicating architecture specific code by using the same - * macro as with SLAB allocation: + * User data (data section and bss) needs to be aligned. + * We pick 0x20 here because it is the max value elf2flt has always + * used in producing FLAT files, and because it seems to be large + * enough to make all the gcc alignment related tests happy. */ -#ifdef ARCH_SLAB_MINALIGN -#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) -#else -#define FLAT_DATA_ALIGN (sizeof(void *)) -#endif +#define FLAT_DATA_ALIGN (0x20) + +/* + * User data (stack) also needs to be aligned. + * Here we can be a bit looser than the data sections since this + * needs to only meet arch ABI requirements. + */ +#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN) #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ @@ -129,7 +132,7 @@ static unsigned long create_flat_tables( sp = (unsigned long *)p; sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); envp = argv + (argc + 1); @@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm, if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); do_munmap(current->mm, textpos, text_len); - do_munmap(current->mm, realdatastart, data_len + extra); + do_munmap(current->mm, realdatastart, len); ret = result; goto err; } @@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ + stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (IS_ERR_VALUE(res)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 7346c96308a..b3171fb0dc9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -681,8 +681,8 @@ retry: if (!bd_may_claim(bdev, whole, holder)) return -EBUSY; - /* if someone else is claiming, wait for it to finish */ - if (whole->bd_claiming && whole->bd_claiming != holder) { + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); @@ -706,8 +706,13 @@ retry: * @bdev is about to be opened exclusively. Check @bdev can be opened * exclusively and mark that an exclusive open is in progress. Each * successful call to this function must be matched with a call to - * either bd_claim() or bd_abort_claiming(). If this function - * succeeds, the matching bd_claim() is guaranteed to succeed. + * either bd_finish_claiming() or bd_abort_claiming() (which do not + * fail). + * + * This function is used to gain exclusive access to the block device + * without actually causing other exclusive open attempts to fail. It + * should be used when the open sequence itself requires exclusive + * access but may subsequently fail. * * CONTEXT: * Might sleep. @@ -734,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, return ERR_PTR(-ENXIO); whole = bdget_disk(disk, 0); + module_put(disk->fops->owner); put_disk(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -782,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder) __bd_abort_claiming(whole, holder); /* releases bdev_lock */ } +/* increment holders when we have a legitimate claim. requires bdev_lock */ +static void __bd_claim(struct block_device *bdev, struct block_device *whole, + void *holder) +{ + /* note that for a whole device bd_holders + * will be incremented twice, and bd_holder will + * be set to bd_claim before being set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_claim; + bdev->bd_holders++; + bdev->bd_holder = holder; +} + +/** + * bd_finish_claiming - finish claiming a block device + * @bdev: block device of interest (passed to bd_start_claiming()) + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Finish a claiming block started by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_finish_claiming(struct block_device *bdev, + struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, whole, holder)); + __bd_claim(bdev, whole, holder); + __bd_abort_claiming(whole, holder); /* not actually an abort */ +} + /** * bd_claim - claim a block device * @bdev: block device to claim * @holder: holder trying to claim @bdev * - * Try to claim @bdev which must have been opened successfully. This - * function may be called with or without preceding - * blk_start_claiming(). In the former case, this function is always - * successful and terminates the claiming block. + * Try to claim @bdev which must have been opened successfully. * * CONTEXT: * Might sleep. @@ -806,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder) might_sleep(); spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) { - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - } - - if (whole->bd_claiming) - __bd_abort_claiming(whole, holder); /* releases bdev_lock */ - else - spin_unlock(&bdev_lock); + if (res == 0) + __bd_claim(bdev, whole, holder); + spin_unlock(&bdev_lock); return res; } @@ -1476,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (whole) { if (res == 0) - BUG_ON(bd_claim(bdev, filp) != 0); + bd_finish_claiming(bdev, whole, filp); else bd_abort_claiming(whole, filp); } @@ -1712,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) goto out_blkdev_put; - BUG_ON(bd_claim(bdev, holder) != 0); + bd_finish_claiming(bdev, whole, holder); return bdev; out_blkdev_put: diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 8d432cd9d58..2222d161c7b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return acl; set_cached_acl(inode, type, acl); } kfree(value); @@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, int ret; struct posix_acl *acl = NULL; + if (!is_owner_or_cap(dentry->d_inode)) + return -EPERM; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (value) { acl = posix_acl_from_xattr(value, size); if (acl == NULL) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0d1d966b0fe..c3df14ce2cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, - int free_space, u32 left_nritems) + int free_space, u32 left_nritems, + u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; @@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (empty) nr = 0; else - nr = 1; + nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems) push_space += data_size; @@ -2469,10 +2474,14 @@ out_unlock: * * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot */ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -2525,12 +2534,17 @@ out_unlock: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, - int free_space, int right_nritems) + int free_space, u32 right_nritems, + u32 max_slot) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; @@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, slot = path->slots[1]; if (empty) - nr = right_nritems; + nr = min(right_nritems, max_slot); else - nr = right_nritems - 1; + nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); @@ -2712,10 +2726,14 @@ out: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } /* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); if (wret < 0) return wret; } @@ -2923,6 +3003,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2939,6 +3021,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2 ; } } @@ -3019,6 +3103,13 @@ again: } return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f3b287c22ca..34f7c375567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1941,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!log_tree_root) { + err = -ENOMEM; + goto fail_trans_kthread; + } __setup_root(nodesize, leafsize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); @@ -1982,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) goto fail_trans_kthread; + if (IS_ERR(fs_info->fs_root)) { + err = PTR_ERR(fs_info->fs_root); + goto fail_trans_kthread; + } if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b9080d71991..32d094002a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4360,7 +4360,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - BUG_ON(block_rsv->space_info != cache->space_info); + if (block_rsv->space_info != cache->space_info) + goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a4080c21ec5..d74e6af9b53 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2594,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, @@ -2628,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = inode->i_mapping->backing_dev_info, .sync_mode = mode, .older_than_this = NULL, .nr_to_write = nr_pages * 2, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 787b50a16a1..e354c33df08 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1140,7 +1140,7 @@ int btrfs_sync_file(struct file *file, int datasync) /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file && file->private_data) + if (file->private_data) btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 0); @@ -1190,14 +1190,22 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { - vma->vm_ops = &btrfs_file_vm_ops; + struct address_space *mapping = filp->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(filp); + vma->vm_ops = &btrfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; } const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, + .write = do_sync_write, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, .aio_write = btrfs_file_aio_write, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa6ccc1bfe2..1bff92ad474 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2673,7 +2673,7 @@ static int check_path_shared(struct btrfs_root *root, struct extent_buffer *eb; int level; int ret; - u64 refs; + u64 refs = 1; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) @@ -6884,7 +6884,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, 0, cur_offset, + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4cdb98cf26d..9254b3d58db 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1280,7 +1280,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out; + goto out_up_write; } trans->block_rsv = &root->fs_info->global_block_rsv; @@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, */ /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; ret = mnt_want_write(file->f_path.mnt); @@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* determine range to clone */ ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) + if (off + len > src->i_size || off + len < off) goto out_unlock; if (len == 0) olen = len = src->i_size - off; @@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; + u64 endoff; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, @@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(root, path); inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (new_key.offset + datal > inode->i_size) - btrfs_i_size_write(inode, - new_key.offset + datal); + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > off+olen) + endoff = off+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -1845,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); - if (!di) { + if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); btrfs_end_transaction(trans, root); printk(KERN_ERR "Umm, you don't have the default dir item, " diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 05d41e56923..b37d723b9d4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -784,16 +784,17 @@ again: struct btrfs_extent_ref_v0 *ref0; ref0 = btrfs_item_ptr(eb, path1->slots[0], struct btrfs_extent_ref_v0); - root = find_tree_root(rc, eb, ref0); - if (!root->ref_cows) - cur->cowonly = 1; if (key.objectid == key.offset) { + root = find_tree_root(rc, eb, ref0); if (root && !should_ignore_root(root)) cur->root = root; else list_add(&cur->list, &useless); break; } + if (is_cowonly_root(btrfs_ref_root_v0(eb, + ref0))) + cur->cowonly = 1; } #else BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b91ccd97264..2d958be761c 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -330,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - u32 refs; struct btrfs_root_item *ri; struct extent_buffer *leaf; @@ -344,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, leaf = path->nodes[0]; ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); - refs = btrfs_disk_root_refs(leaf, ri); - BUG_ON(refs != 0); ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d34b2dfc962..f2393b39031 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb, */ dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (IS_ERR(di)) + return ERR_CAST(di); if (!di) { /* * Ok the default dir item isn't there. This is weird since @@ -390,8 +392,8 @@ setup_root: location.offset = 0; inode = btrfs_iget(sb, &location, new_root, &new); - if (!inode) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* * If we're just mounting the root most subvol put the inode and return diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a7840bf42..42c7fafc8bf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], - object->fscache.flags, object->fscache.work.flags, + object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -212,7 +212,7 @@ wait_for_old_object: /* if the object we're waiting for is queued for processing, * then just put ourselves on the queue behind it */ - if (slow_work_is_queued(&xobject->fscache.work)) { + if (work_pending(&xobject->fscache.work)) { _debug("queue OBJ%x behind OBJ%x immediately", object->fscache.debug_id, xobject->fscache.debug_id); @@ -220,8 +220,7 @@ wait_for_old_object: } /* otherwise we sleep until either the object we're waiting for - * is done, or the slow-work facility wants the thread back to - * do other work */ + * is done, or the fscache_object is congested */ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); init_wait(&wait); requeue = false; @@ -229,8 +228,8 @@ wait_for_old_object: prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) break; - requeue = slow_work_sleep_till_thread_needed( - &object->fscache.work, &timeout); + + requeue = fscache_object_sleep_till_congested(&timeout); } while (timeout > 0 && !requeue); finish_wait(wq, &wait); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 0f0d41fbb03..0e3c0924cc3 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280582a..bc87b9c1d27 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 83d4d2785ff..6d44053ecff 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -493,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le32_to_cpu(head->op); + op = le16_to_cpu(head->op); result = le32_to_cpu(head->result); dout("handle_reply op %d result %d\n", op, result); switch (op) { @@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ae3e3a30644..b81be9a5648 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ - if (!ctx) - return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + caps_use_count++; + caps_total_count++; + } + return cap; + } spin_lock(&caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", @@ -621,7 +627,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } @@ -981,6 +987,46 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } +static void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %llx release to mds%d msg %p (%d left)\n", + ino, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ino); + item->cap_id = cpu_to_le64(cap_id); + item->migrate_seq = cpu_to_le32(migrate_seq); + item->seq = cpu_to_le32(issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); +} + /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_lock. @@ -994,41 +1040,9 @@ void ceph_queue_caps_release(struct inode *inode) while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); struct ceph_mds_session *session = cap->session; - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %p release to mds%d msg %p (%d left)\n", - inode, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ceph_ino(inode)); - item->cap_id = cpu_to_le64(cap->cap_id); - item->migrate_seq = cpu_to_le32(cap->mseq); - item->seq = cpu_to_le32(cap->issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, - &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } - spin_unlock(&session->s_cap_lock); + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); p = rb_next(p); __ceph_remove_cap(cap); } @@ -1167,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -2139,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2215,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); @@ -2391,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (queue_invalidate) ceph_queue_invalidate(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2446,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { @@ -2458,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: spin_unlock(&inode->i_lock); @@ -2655,7 +2669,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_caps *h; int mds = session->s_mds; int op; - u32 seq; + u32 seq, mseq; struct ceph_vino vino; u64 cap_id; u64 size, max_size; @@ -2675,6 +2689,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap = CEPH_NOSNAP; cap_id = le64_to_cpu(h->cap_id); seq = le32_to_cpu(h->seq); + mseq = le32_to_cpu(h->migrate_seq); size = le64_to_cpu(h->size); max_size = le64_to_cpu(h->max_size); @@ -2689,6 +2704,18 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); if (!inode) { dout(" i don't have ino %llx\n", vino.ino); + + if (op == CEPH_CAP_OP_IMPORT) + __queue_cap_release(session, vino.ino, cap_id, + mseq, seq); + + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session, -1); + ceph_send_cap_releases(mdsc, session); goto done; } @@ -2714,7 +2741,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, spin_lock(&inode->i_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { - dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", + dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); spin_unlock(&inode->i_lock); goto done; @@ -2865,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; + int used, dirty; int ret = 0; - int used = 0; spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); - dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, - mds, ceph_cap_string(used), ceph_cap_string(drop), + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); - /* only drop unused caps */ - drop &= ~used; + /* only drop unused, clean caps */ + drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { @@ -2956,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c index 9ba54efb654..a4eec133258 100644 --- a/fs/ceph/crush/mapper.c +++ b/fs/ceph/crush/mapper.c @@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { - dprintk("choose %d x=%d r=%d\n", in->id, x, r); + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) */ static int is_out(struct crush_map *map, __u32 *weight, int item, int x) { - if (weight[item] >= 0x1000) + if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) return 1; @@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map, int itemtype; int collide, reject; const int orig_tries = 5; /* attempts before we fall back to search */ - dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map, BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); in = map->buckets[-1-item]; + retry_bucket = 1; continue; } @@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map, } } - if (recurse_to_leaf && - item < 0 && - crush_choose(map, map->buckets[-1-item], - weight, - x, outpos+1, 0, - out2, outpos, - firstn, 0, NULL) <= outpos) { - reject = 1; - } else { + reject = 0; + if (recurse_to_leaf) { + if (item < 0) { + if (crush_choose(map, + map->buckets[-1-item], + weight, + x, outpos+1, 0, + out2, outpos, + firstn, 0, + NULL) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { /* out? */ if (itemtype == 0) reject = is_out(map, weight, @@ -424,12 +437,12 @@ reject: continue; } - dprintk("choose got %d\n", item); + dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; } - dprintk("choose returns %d\n", outpos); + dprintk("CHOOSE returns %d\n", outpos); return outpos; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3be33fb066c..f2f5332ddbb 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp) static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = p; + struct ceph_client *client = s->private; int total, avail, used, reserved, min; ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f85719310db..f94ed3c7f6a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_opt(client, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { err = __dcache_readdir(filp, dirent, filldir); @@ -1013,18 +1014,22 @@ out_touch: /* * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * of the current dir gen or if this is in the snapshot namespace. */ static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *parent_inode = dentry->d_parent->d_inode; + u64 snapid = ceph_snap(parent_inode); - if (parent_inode) { + dout("dentry_release %p parent %p\n", dentry, parent_inode); + + if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { + if (ci->i_shared_gen == di->lease_shared_gen || + snapid <= CEPH_MAXSNAP) { dout(" clearing %p complete (d_release)\n", parent_inode); ci->i_ceph_flags &= ~CEPH_I_COMPLETE; @@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = { struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_dentry_release, }; struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6251a1574b9..7c08698fad3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 226f5a50d36..389f9dbd994 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) spin_lock(&dcache_lock); spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + list_move(&dn->d_u.d_child, &dir->d_subdirs); dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, dn->d_u.d_child.prev, dn->d_u.d_child.next); spin_unlock(&dn->d_lock); @@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, d_drop(dn); realdn = d_materialise_unique(dn, in); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", - dn, in, ceph_vinop(in)); + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); if (prehash) *prehash = false; /* don't rehash on error */ dn = realdn; /* note realdn contains the error */ @@ -1199,8 +1199,10 @@ retry_lookup: goto out; } err = ceph_init_dentry(dn); - if (err < 0) + if (err < 0) { + dput(dn); goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { @@ -1234,18 +1236,23 @@ retry_lookup: goto out; } dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) + dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); - dput(dn); - continue; + goto next_item; } - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, req->r_request_started); - dput(dn); + if (dn) + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); } req->r_did_prepopulate = true; @@ -1494,7 +1501,7 @@ retry: if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b49f12822cb..dd440bd438a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&inode->i_lock); ci->i_wanted_max_size = 0; @@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, * * Called under s_mutex. */ -static int add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra) +int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) /* * called under s_mutex */ -static void send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { struct ceph_msg *msg; @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } @@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); + complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -1980,7 +2005,7 @@ out_err: } mutex_unlock(&mdsc->mutex); - add_cap_releases(mdsc, req->r_session, -1); + ceph_add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + complete_all(&mdsc->session_close_waiters); kick_requests(mdsc, mds); break; @@ -2433,6 +2458,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_dentry_info *di; int mds = session->s_mds; struct ceph_mds_lease *h = msg->front.iov_base; + u32 seq; struct ceph_vino vino; int mask; struct qstr dname; @@ -2446,6 +2472,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; mask = le16_to_cpu(h->mask); + seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); if (dname.len != get_unaligned_le32(h+1)) @@ -2456,8 +2483,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease '%s', mask %d, ino %llx %p\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode); + dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode, + dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); goto release; @@ -2482,7 +2510,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, switch (h->action) { case CEPH_MDS_LEASE_REVOKE: if (di && di->lease_session == session) { - h->seq = cpu_to_le32(di->lease_seq); + if (ceph_seq_cmp(di->lease_seq, seq) > 0) + h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); } release = 1; @@ -2496,7 +2525,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, unsigned long duration = le32_to_cpu(h->duration_ms) * HZ / 1000; - di->lease_seq = le32_to_cpu(h->seq); + di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); @@ -2686,10 +2715,10 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - add_cap_releases(mdsc, s, -1); + ceph_add_cap_releases(mdsc, s, -1); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) - send_cap_releases(mdsc, s); + ceph_send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2779,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) drop_leases(mdsc); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d9936c4f121..952410c60d0 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; @@ -322,6 +323,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } +extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra); +extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 64b8b1f7863..15167b2daa5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. */ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: @@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), @@ -657,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; + con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? */ if (p < end && *p == ':') { port = 0; @@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } @@ -1396,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); + skip = 0; con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); + BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -2013,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 21c62e9b7d1..54fe01c5070 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); + wake_up_all(&client->auth_wq); } /* @@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref) ceph_msg_put(req->reply); if (req->request) ceph_msg_put(req->request); + + kfree(req); } static void put_generic_request(struct ceph_mon_generic_request *req) @@ -460,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, } mutex_unlock(&monc->mutex); if (req) { - complete(&req->completion); + complete_all(&req->completion); put_generic_request(req); } return; @@ -716,14 +718,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->m_auth->front_max); if (ret < 0) { monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); + wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = monc->auth->global_id; + monc->client->msgr->inst.name.num = + cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); __resend_generic_request(monc); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index d25b4add85b..e3852234789 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_callback) req->r_callback(req, msg); else - complete(&req->r_completion); + complete_all(&req->r_completion); if (flags & CEPH_OSD_FLAG_ONDISK) { if (req->r_safe_callback) req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ + complete_all(&req->r_safe_completion); /* fsync waiter */ } done: @@ -1083,7 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); + wake_up_all(&osdc->client->auth_wq); return; bad: @@ -1344,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) int type = le16_to_cpu(msg->hdr.type); if (!osd) - return; + goto out; osdc = osd->o_osdc; switch (type) { @@ -1359,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } +out: ceph_msg_put(msg); } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index ddc656fb5c0..416d46adbf8 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); @@ -707,6 +708,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = crush_decode(*p, min(*p+len, end)); if (IS_ERR(newcrush)) return ERR_CAST(newcrush); + *p += len; } /* new flags? */ @@ -829,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* remove any? */ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } if (pglen) { @@ -850,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) + if (err) { + kfree(pg); goto bad; + } dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, pglen); } } while (rbp) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } /* ignore the rest */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e0bee240b9..fa87f51e38e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; - buf->f_namelen = PATH_MAX; + buf->f_namelen = NAME_MAX; buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ @@ -926,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. */ -static atomic_long_t bdi_seq = ATOMIC_INIT(0); +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933df2b..f80a4f25123 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/tty.h> #include "internal.h" diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 80f35259680..917b7d449bb 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS - select SLOW_WORK help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH If unsure, say N. config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. config CIFS_XATTR bool "CIFS extended attributes" @@ -122,6 +121,7 @@ config CIFS_DEBUG2 config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS + select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share @@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL IP addresses) which is needed for implicit mounts of DFS junction points. If unsure, say N. +config CIFS_FSCACHE + bool "Provide CIFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 9948c0030e8..adefa60a9bd 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/README b/fs/cifs/README index a727b7cb075..a7081eeeb85 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -568,8 +568,9 @@ module can be displayed via modinfo. Misc /proc/fs/cifs Flags and Debug Info ======================================= Informational pseudo-files: -DebugData Displays information about active CIFS sessions - and shares, as well as the cifs.ko version. +DebugData Displays information about active CIFS sessions and + shares, features enabled as well as the cifs.ko + version. Stats Lists summary resource usage information as well as per share statistics, if CONFIG_CIFS_STATS in enabled in the kernel configuration. diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c new file mode 100644 index 00000000000..224d7bbd1fc --- /dev/null +++ b/fs/cifs/cache.c @@ -0,0 +1,331 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = server->addr.sockAddr.sin_family; + key->port = server->addr.sockAddr.sin_port; + key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = server->addr.sockAddr6.sin6_family; + key->port = server->addr.sockAddr6.sin6_port; + key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifsTconInfo *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cFYI(1, "CIFS: couldn't extract sharename\n"); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cFYI(1, "cifs inode 0x%p now uncached", cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 4fce6e61b34..eb1ba493489 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features: "); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, "dfs"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, "fscache"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, "lanman"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, "posix"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, "spnego"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, "xattr"); +#endif + seq_putc(m, '\n'); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ac19a6f3dae..d6ced7aa23c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -141,7 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, } rc = dns_resolve_server_name_to_ip(*devname, &srvIP); - if (rc != 0) { + if (rc < 0) { cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", __func__, *devname, rc); goto compose_mount_options_err; @@ -150,8 +150,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata */ - md_len = strlen(sb_mountdata) + strlen(srvIP) + - strlen(ref->node_name) + 12; + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; mountdata = kzalloc(md_len+1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; @@ -230,28 +229,22 @@ compose_mount_options_err: goto compose_mount_options_out; } - -static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, - struct dentry *dentry, const struct dfs_info3_param *ref) +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) { - struct cifs_sb_info *cifs_sb; struct vfsmount *mnt; char *mountdata; char *devname = NULL; - char *fullpath; - - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); - /* - * this function gives us a path with a double backslash prefix. We - * require a single backslash for DFS. - */ - fullpath = build_path_from_dentry(dentry); - if (!fullpath) - return ERR_PTR(-ENOMEM); + /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, fullpath + 1, ref, &devname); - kfree(fullpath); if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; @@ -357,8 +350,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) rc = -EINVAL; goto out_err; } - mnt = cifs_dfs_do_refmount(nd->path.mnt, - nd->path.dentry, referrals + i); + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 246a167cb91..9e771450c3b 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -35,6 +35,7 @@ #define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ #define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ #define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ struct cifs_sb_info { struct cifsTconInfo *tcon; /* primary mount */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 379bd7d9c05..87044906cd1 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = { /* strlen of ";uid=0x" */ #define UID_KEY_LEN 7 +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + /* strlen of ";user=" */ #define USER_KEY_LEN 6 @@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + USER_KEY_LEN + strlen(sesInfo->userName) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; @@ -144,6 +148,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + + dp = description + strlen(description); sprintf(dp, ";user=%s", sesInfo->userName); dp = description + strlen(description); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 78c02eb4cb1..a5ed10c9afe 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -45,8 +45,8 @@ #include "cifs_fs_sb.h" #include <linux/mm.h> #include <linux/key-type.h> -#include "dns_resolve.h" #include "cifs_spnego.h" +#include "fscache.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ int cifsFYI = 0; @@ -329,6 +329,12 @@ cifs_destroy_inode(struct inode *inode) } static void +cifs_clear_inode(struct inode *inode) +{ + cifs_fscache_release_inode_cookie(inode); +} + +static void cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) { seq_printf(s, ",addr="); @@ -473,14 +479,25 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data) return 0; } +void cifs_drop_inode(struct inode *inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + return generic_drop_inode(inode); + + return generic_delete_inode(inode); +} + static const struct super_operations cifs_super_ops = { .put_super = cifs_put_super, .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, -/* .drop_inode = generic_delete_inode, - .delete_inode = cifs_delete_inode, */ /* Do not need above two - functions unless later we add lazy close of inodes or unless the + .drop_inode = cifs_drop_inode, + .clear_inode = cifs_clear_inode, +/* .delete_inode = cifs_delete_inode, */ /* Do not need above + function unless later we add lazy close of inodes or unless the kernel forgets to call us with the same number of releases (closes) as opens */ .show_options = cifs_show_options, @@ -892,6 +909,10 @@ init_cifs(void) cFYI(1, "cifs_max_pending set to max of 256"); } + rc = cifs_fscache_register(); + if (rc) + goto out; + rc = cifs_init_inodecache(); if (rc) goto out_clean_proc; @@ -912,27 +933,13 @@ init_cifs(void) if (rc) goto out_unregister_filesystem; #endif -#ifdef CONFIG_CIFS_DFS_UPCALL - rc = register_key_type(&key_type_dns_resolver); - if (rc) - goto out_unregister_key_type; -#endif - rc = slow_work_register_user(THIS_MODULE); - if (rc) - goto out_unregister_resolver_key; return 0; - out_unregister_resolver_key: -#ifdef CONFIG_CIFS_DFS_UPCALL - unregister_key_type(&key_type_dns_resolver); - out_unregister_key_type: -#endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); out_unregister_filesystem: -#endif unregister_filesystem(&cifs_fs_type); +#endif out_destroy_request_bufs: cifs_destroy_request_bufs(); out_destroy_mids: @@ -941,6 +948,8 @@ init_cifs(void) cifs_destroy_inodecache(); out_clean_proc: cifs_proc_clean(); + cifs_fscache_unregister(); + out: return rc; } @@ -949,9 +958,9 @@ exit_cifs(void) { cFYI(DBG2, "exit_cifs"); cifs_proc_clean(); + cifs_fscache_unregister(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - unregister_key_type(&key_type_dns_resolver); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a7eb65c84b1..d82f5fb4761 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.64" +#define CIFS_VERSION "1.65" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a88479ceaad..0cdfb8c32ac 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,10 +16,13 @@ * the GNU Lesser General Public License for more details. * */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> -#include <linux/slow-work.h> +#include <linux/workqueue.h> #include "cifs_fs_sb.h" #include "cifsacl.h" /* @@ -34,7 +37,7 @@ #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null termination then *2 for unicode versions */ -#define MAX_PASSWORD_SIZE 16 +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ #define CIFS_MIN_RCV_POOL 4 @@ -80,8 +83,7 @@ enum statusEnum { }; enum securityEnum { - PLAINTXT = 0, /* Legacy with Plaintext passwords */ - LANMAN, /* Legacy LANMAN auth */ + LANMAN = 0, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ @@ -142,7 +144,6 @@ struct TCP_Server_Info { struct list_head pending_mid_q; void *Server_NlsInfo; /* BB - placeholder for future NLS info */ unsigned short server_codepage; /* codepage for the server */ - unsigned long ip_address; /* IP addr for the server if known */ enum protocolEnum protocolType; char versionMajor; char versionMinor; @@ -190,19 +191,9 @@ struct TCP_Server_Info { bool sec_mskerberos; /* supports legacy MS Kerberos */ bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_ntlmssp; /* supports NTLMSSP */ -}; - -/* - * The following is our shortcut to user information. We surface the uid, - * and name. We always get the password on the fly in case it - * has changed. We also hang a list of sessions owned by this user off here. - */ -struct cifsUidInfo { - struct list_head userList; - struct list_head sessionList; /* SMB sessions for this user */ - uid_t linux_uid; - char user[MAX_USERNAME_SIZE + 1]; /* ascii name of user */ - /* BB may need ptr or callback for PAM or WinBind info */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif }; /* @@ -212,9 +203,6 @@ struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; struct mutex session_mutex; -#if 0 - struct cifsUidInfo *uidInfo; /* pointer to user info */ -#endif struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum statusEnum status; @@ -226,7 +214,8 @@ struct cifsSesInfo { char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ int Suid; /* remote smb uid */ - uid_t linux_uid; /* local Linux uid */ + uid_t linux_uid; /* overriding owner of files on the mount */ + uid_t cred_uid; /* owner of credentials */ int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ @@ -311,6 +300,10 @@ struct cifsTconInfo { bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif /* BB add field for back pointer to sb struct(s)? */ }; @@ -363,7 +356,7 @@ struct cifsFileInfo { atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; - struct slow_work oplock_break; /* slow_work job for oplock breaks */ + struct work_struct oplock_break; /* work for oplock breaks */ }; /* Take a reference on the file private data */ @@ -398,6 +391,9 @@ struct cifsInodeInfo { bool invalid_mapping:1; /* pagecache is invalid */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif struct inode vfs_inode; }; @@ -732,4 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +void cifs_oplock_break(struct work_struct *work); +void cifs_oplock_break_get(struct cifsFileInfo *cfile); +void cifs_oplock_break_put(struct cifsFileInfo *cfile); + extern const struct slow_work_ops cifs_oplock_break_ops; + +#endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fb1657e0fdb..1f545081408 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -86,7 +86,9 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); -extern int cifs_convert_address(char *src, void *dst); +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of @@ -106,7 +108,6 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, struct vfsmount *mnt, unsigned int oflags); extern int cifs_posix_open(char *full_path, struct inode **pinode, - struct vfsmount *mnt, struct super_block *sb, int mode, int oflags, __u32 *poplock, __u16 *pnetfid, int xid); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2208f06e4c4..95c2ea67edf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -48,6 +48,7 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "cn_cifs.h" +#include "fscache.h" #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -66,6 +67,7 @@ struct smb_vol { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ + uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; mode_t file_mode; @@ -97,6 +99,7 @@ struct smb_vol { bool noblocksnd:1; bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -830,7 +833,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ vol->target_rfc1001_name[0] = 0; - vol->linux_uid = current_uid(); /* use current_euid() instead? */ + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); /* default to only allowing write access to owner of the mount */ @@ -1257,6 +1261,12 @@ cifs_parse_mount_options(char *options, const char *devname, } else if ((strnicmp(data, "nocase", 6) == 0) || (strnicmp(data, "ignorecase", 10) == 0)) { vol->nocase = 1; + } else if (strnicmp(data, "mand", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "nomand", 6) == 0) { + /* ignore */ + } else if (strnicmp(data, "_netdev", 7) == 0) { + /* ignore */ } else if (strnicmp(data, "brl", 3) == 0) { vol->nobrl = 0; } else if ((strnicmp(data, "nobrl", 5) == 0) || @@ -1331,6 +1341,8 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + } else if (strnicmp(data, "fsc", 3) == 0) { + vol->fsc = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -1380,18 +1392,92 @@ cifs_parse_mount_options(char *options, const char *devname, return 0; } +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + + switch (addr->sa_family) { + case AF_INET: + if (addr4->sin_addr.s_addr != + server->addr.sockAddr.sin_addr.s_addr) + return false; + if (addr4->sin_port && + addr4->sin_port != server->addr.sockAddr.sin_port) + return false; + break; + case AF_INET6: + if (!ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr)) + return false; + if (addr6->sin6_scope_id != + server->addr.sockAddr6.sin6_scope_id) + return false; + if (addr6->sin6_port && + addr6->sin6_port != server->addr.sockAddr6.sin6_port) + return false; + break; + } + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + unsigned int secFlags; + + if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = vol->secFlg; + else + secFlags = global_secflags | vol->secFlg; + + switch (server->secType) { + case LANMAN: + if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) + return false; + break; + case NTLMv2: + if (!(secFlags & CIFSSEC_MAY_NTLMV2)) + return false; + break; + case NTLM: + if (!(secFlags & CIFSSEC_MAY_NTLM)) + return false; + break; + case Kerberos: + if (!(secFlags & CIFSSEC_MAY_KRB5)) + return false; + break; + case RawNTLMSSP: + if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) + return false; + break; + default: + /* shouldn't happen */ + return false; + } + + /* now check if signing mode is acceptible */ + if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && + (server->secMode & SECMODE_SIGN_REQUIRED)) + return false; + else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && + (server->secMode & + (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) + return false; + + return true; +} + static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) +cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) { - struct list_head *tmp; struct TCP_Server_Info *server; - struct sockaddr_in *addr4 = (struct sockaddr_in *) addr; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { /* * the demux thread can exit on its own while still in CifsNew * so don't accept any sockets in that state. Since the @@ -1401,37 +1487,11 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) if (server->tcpStatus == CifsNew) continue; - switch (addr->ss_family) { - case AF_INET: - if (addr4->sin_addr.s_addr == - server->addr.sockAddr.sin_addr.s_addr) { - addr4->sin_port = htons(port); - /* user overrode default port? */ - if (addr4->sin_port) { - if (addr4->sin_port != - server->addr.sockAddr.sin_port) - continue; - } - break; - } else - continue; + if (!match_address(server, addr)) + continue; - case AF_INET6: - if (ipv6_addr_equal(&addr6->sin6_addr, - &server->addr.sockAddr6.sin6_addr) && - (addr6->sin6_scope_id == - server->addr.sockAddr6.sin6_scope_id)) { - addr6->sin6_port = htons(port); - /* user overrode default port? */ - if (addr6->sin6_port) { - if (addr6->sin6_port != - server->addr.sockAddr6.sin6_port) - continue; - } - break; - } else - continue; - } + if (!match_security(server, vol)) + continue; ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1460,6 +1520,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + cifs_fscache_release_client_cookie(server); + task = xchg(&server->tsk, NULL); if (task) force_sig(SIGKILL, task); @@ -1479,7 +1541,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); if (volume_info->UNCip && volume_info->UNC) { - rc = cifs_convert_address(volume_info->UNCip, &addr); + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); if (!rc) { /* we failed translating address */ rc = -EINVAL; @@ -1499,7 +1564,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); + tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); if (tcp_ses) return tcp_ses; @@ -1543,12 +1608,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "attempting ipv6 connect"); /* BB should we allow ipv6 on port 139? */ /* other OS never observed in Wild doing 139 with v6 */ - sin_server6->sin6_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr6, sin_server6, sizeof(struct sockaddr_in6)); rc = ipv6_connect(tcp_ses); } else { - sin_server->sin_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr, sin_server, sizeof(struct sockaddr_in)); rc = ipv4_connect(tcp_ses); @@ -1577,6 +1640,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_client_cookie(tcp_ses); + return tcp_ses; out_err: @@ -1591,17 +1656,27 @@ out_err: } static struct cifsSesInfo * -cifs_find_smb_ses(struct TCP_Server_Info *server, char *username) +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) { - struct list_head *tmp; struct cifsSesInfo *ses; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); - if (strncmp(ses->userName, username, MAX_USERNAME_SIZE)) - continue; - + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + switch (server->secType) { + case Kerberos: + if (vol->cred_uid != ses->cred_uid) + continue; + break; + default: + /* anything else takes username/password */ + if (strncmp(ses->userName, vol->username, + MAX_USERNAME_SIZE)) + continue; + if (strlen(vol->username) != 0 && + strncmp(ses->password, vol->password, + MAX_PASSWORD_SIZE)) + continue; + } ++ses->ses_count; write_unlock(&cifs_tcp_ses_lock); return ses; @@ -1643,7 +1718,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) xid = GetXid(); - ses = cifs_find_smb_ses(server, volume_info->username); + ses = cifs_find_smb_ses(server, volume_info); if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); @@ -1706,6 +1781,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses->domainName) strcpy(ses->domainName, volume_info->domainname); } + ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; ses->overrideSecFlg = volume_info->secFlg; @@ -1773,6 +1849,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon) CIFSSMBTDis(xid, tcon); _FreeXid(xid); + cifs_fscache_release_super_cookie(tcon); tconInfoFree(tcon); cifs_put_smb_ses(ses); } @@ -1843,6 +1920,8 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info) list_add(&tcon->tcon_list, &ses->tcon_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_super_cookie(tcon); + return tcon; out_fail: @@ -2397,6 +2476,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; if (pvolume_info->dynperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; if (pvolume_info->direct_io) { cFYI(1, "mounting share using direct i/o"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 391816b461c..578d88c5b46 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/file.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" @@ -129,12 +130,6 @@ cifs_bp_rename_retry: return full_path; } -/* - * When called with struct file pointer set to NULL, there is no way we could - * update file->private_data, but getting it stuck on openFileList provides a - * way to access it from cifs_fill_filedata and thereby set file->private_data - * from cifs_open. - */ struct cifsFileInfo * cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, struct vfsmount *mnt, unsigned int oflags) @@ -162,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); atomic_set(&pCifsFile->count, 1); - slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); + INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); @@ -184,12 +179,13 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, } write_unlock(&GlobalSMBSeslock); + file->private_data = pCifsFile; + return pCifsFile; } int cifs_posix_open(char *full_path, struct inode **pinode, - struct vfsmount *mnt, struct super_block *sb, - int mode, int oflags, + struct super_block *sb, int mode, int oflags, __u32 *poplock, __u16 *pnetfid, int xid) { int rc; @@ -258,19 +254,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode, cifs_fattr_to_inode(*pinode, &fattr); } - /* - * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to - * file->private_data. - */ - if (mnt) { - struct cifsFileInfo *pfile_info; - - pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, - oflags); - if (pfile_info == NULL) - rc = -ENOMEM; - } - posix_open_ret: kfree(presp_data); return rc; @@ -298,7 +281,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int create_options = CREATE_NOT_DIR; __u32 oplock = 0; int oflags; - bool posix_create = false; /* * BB below access is probably too much for mknod to request * but we have to do query and setpathinfo so requesting @@ -339,7 +321,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_open(full_path, &newinode, - nd ? nd->path.mnt : NULL, inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); /* EIO could indicate that (posix open) operation is not supported, despite what server claimed in capability @@ -347,7 +328,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, handled in posix open */ if (rc == 0) { - posix_create = true; if (newinode == NULL) /* query inode info */ goto cifs_create_get_file_info; else /* success, no need to query */ @@ -478,21 +458,28 @@ cifs_create_set_dentry: else cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); - /* nfsd case - nfs srv does not set nd */ - if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { - /* mknod case - do not leave file open */ - CIFSSMBClose(xid, tcon, fileHandle); - } else if (!(posix_create) && (newinode)) { + if (newinode && nd && (nd->flags & LOOKUP_OPEN)) { struct cifsFileInfo *pfile_info; - /* - * cifs_fill_filedata() takes care of setting cifsFileInfo - * pointer to file->private_data. - */ - pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL, + struct file *filp; + + filp = lookup_instantiate_filp(nd, direntry, generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, tcon, fileHandle); + goto cifs_create_out; + } + + pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, nd->path.mnt, oflags); - if (pfile_info == NULL) + if (pfile_info == NULL) { + fput(filp); + CIFSSMBClose(xid, tcon, fileHandle); rc = -ENOMEM; + } + } else { + CIFSSMBClose(xid, tcon, fileHandle); } + cifs_create_out: kfree(buf); kfree(full_path); @@ -636,6 +623,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, bool posix_open = false; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; + struct cifsFileInfo *cfile; struct inode *newInode = NULL; char *full_path = NULL; struct file *filp; @@ -703,7 +691,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.flags & O_CREAT)) { - rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, + rc = cifs_posix_open(full_path, &newInode, parent_dir_inode->i_sb, nd->intent.open.create_mode, nd->intent.open.flags, &oplock, @@ -733,8 +721,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, else direntry->d_op = &cifs_dentry_ops; d_add(direntry, newInode); - if (posix_open) - filp = lookup_instantiate_filp(nd, direntry, NULL); + if (posix_open) { + filp = lookup_instantiate_filp(nd, direntry, + generic_file_open); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + goto lookup_out; + } + + cfile = cifs_new_fileinfo(newInode, fileHandle, filp, + nd->path.mnt, + nd->intent.open.flags); + if (cfile == NULL) { + fput(filp); + CIFSSMBClose(xid, pTcon, fileHandle); + rc = -ENOMEM; + goto lookup_out; + } + } /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); @@ -755,6 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, is a common return code */ } +lookup_out: kfree(full_path); FreeXid(xid); return ERR_PTR(rc); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 4db2c5e7283..0eb87026cad 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -4,6 +4,8 @@ * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) * * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. @@ -24,145 +26,73 @@ */ #include <linux/slab.h> -#include <keys/user-type.h> +#include <linux/dns_resolver.h> #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" -/* Checks if supplied name is IP address - * returns: - * 1 - name is IP - * 0 - name is not IP - */ -static int -is_ip(char *name) -{ - struct sockaddr_storage ss; - - return cifs_convert_address(name, &ss); -} - -static int -dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - ip = kmalloc(datalen + 1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - /* make sure this looks like an address */ - if (!is_ip(ip)) { - kfree(ip); - return -EINVAL; - } - - key->type_data.x[0] = datalen; - key->payload.data = ip; - - return rc; -} - -static void -dns_resolver_destroy(struct key *key) -{ - kfree(key->payload.data); -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .destroy = dns_resolver_destroy, - .match = user_match, -}; - -/* Resolves server name to ip address. - * input: - * unc - server UNC - * output: - * *ip_addr - pointer to server ip, caller responcible for freeing it. - * return 0 on success +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. */ int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { - int rc = -EAGAIN; - struct key *rkey = ERR_PTR(-EAGAIN); + struct sockaddr_storage ss; + const char *hostname, *sep; char *name; - char *data = NULL; - int len; + int len, rc; if (!ip_addr || !unc) return -EINVAL; - /* search for server name delimiter */ len = strlen(unc); if (len < 3) { cFYI(1, "%s: unc is too short: %s", __func__, unc); return -EINVAL; } + + /* Discount leading slashes for cifs */ len -= 2; - name = memchr(unc+2, '\\', len); - if (!name) { + hostname = unc + 2; + + /* Search for server name delimiter */ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - unc; + else cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); - } else { - len = (name - unc) - 2/* leading // */; - } + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); + return rc; - name = kmalloc(len+1, GFP_KERNEL); - if (!name) { - rc = -ENOMEM; - return rc; - } - memcpy(name, unc+2, len); +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + memcpy(name, hostname, len); name[len] = 0; - - if (is_ip(name)) { - cFYI(1, "%s: it is IP, skipping dns upcall: %s", - __func__, name); - data = name; - goto skip_upcall; - } - - rkey = request_key(&key_type_dns_resolver, name, ""); - if (!IS_ERR(rkey)) { - len = rkey->type_data.x[0]; - data = rkey->payload.data; - } else { - cERROR(1, "%s: unable to resolve: %s", __func__, name); - goto out; - } - -skip_upcall: - if (data) { - *ip_addr = kmalloc(len + 1, GFP_KERNEL); - if (*ip_addr) { - memcpy(*ip_addr, data, len + 1); - if (!IS_ERR(rkey)) - cFYI(1, "%s: resolved: %s to %s", __func__, - name, - *ip_addr - ); - rc = 0; - } else { - rc = -ENOMEM; - } - if (!IS_ERR(rkey)) - key_put(rkey); - } - -out: - kfree(name); - return rc; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; + return 0; } - - diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 966e9288930..d3f5d27f4d0 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,6 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -#include <linux/key-type.h> -extern struct key_type key_type_dns_resolver; extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index f1ff785b229..db11fdef0e9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -40,6 +40,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -162,44 +163,12 @@ psx_client_can_cache: return 0; } -static struct cifsFileInfo * -cifs_fill_filedata(struct file *file) -{ - struct list_head *tmp; - struct cifsFileInfo *pCifsFile = NULL; - struct cifsInodeInfo *pCifsInode = NULL; - - /* search inode for this file and fill in file->private_data */ - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, flist); - if ((pCifsFile->pfile == NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - - if (file->private_data != NULL) { - return pCifsFile; - } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) - cERROR(1, "could not find file instance for " - "new file %p", file); - return NULL; -} - /* all arguments to this function must be checked for validity in caller */ -static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, - struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, +static inline int cifs_open_inode_helper(struct inode *inode, struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, char *full_path, int xid) { + struct cifsInodeInfo *pCifsInode = CIFS_I(inode); struct timespec temp; int rc; @@ -213,36 +182,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* if not oplocked, invalidate inode pages if mtime or file size changed */ temp = cifs_NTtimeToUnix(buf->LastWriteTime); - if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && - (file->f_path.dentry->d_inode->i_size == + if (timespec_equal(&inode->i_mtime, &temp) && + (inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { cFYI(1, "inode unchanged on server"); } else { - if (file->f_path.dentry->d_inode->i_mapping) { + if (inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); + rc = filemap_write_and_wait(inode->i_mapping); if (rc != 0) - CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; + pCifsInode->write_behind_rc = rc; } cFYI(1, "invalidating remote inode since open detected it " "changed"); - invalidate_remote_inode(file->f_path.dentry->d_inode); + invalidate_remote_inode(inode); } client_can_cache: if (pTcon->unix_ext) - rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode, - full_path, inode->i_sb, xid); + rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, + xid); else - rc = cifs_get_inode_info(&file->f_path.dentry->d_inode, - full_path, buf, inode->i_sb, xid, NULL); + rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, + xid, NULL); if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { pCifsInode->clientCanCacheAll = true; pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - file->f_path.dentry->d_inode); + cFYI(1, "Exclusive Oplock granted on inode %p", inode); } else if ((*oplock & 0xF) == OPLOCK_READ) pCifsInode->clientCanCacheRead = true; @@ -256,7 +224,7 @@ int cifs_open(struct inode *inode, struct file *file) __u32 oplock; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; - struct cifsFileInfo *pCifsFile; + struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; char *full_path = NULL; int desiredAccess; @@ -270,12 +238,6 @@ int cifs_open(struct inode *inode, struct file *file) tcon = cifs_sb->tcon; pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - pCifsFile = cifs_fill_filedata(file); - if (pCifsFile) { - rc = 0; - FreeXid(xid); - return rc; - } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -299,8 +261,7 @@ int cifs_open(struct inode *inode, struct file *file) int oflags = (int) cifs_posix_convert_flags(file->f_flags); oflags |= SMB_O_CREAT; /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, - inode->i_sb, + rc = cifs_posix_open(full_path, &inode, inode->i_sb, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -308,9 +269,23 @@ int cifs_open(struct inode *inode, struct file *file) /* no need for special case handling of setting mode on read only files needed here */ - pCifsFile = cifs_fill_filedata(file); - cifs_posix_open_inode_helper(inode, file, pCifsInode, - oplock, netfid); + rc = cifs_posix_open_inode_helper(inode, file, + pCifsInode, oplock, netfid); + if (rc != 0) { + CIFSSMBClose(xid, tcon, netfid); + goto out; + } + + pCifsFile = cifs_new_fileinfo(inode, netfid, file, + file->f_path.mnt, + oflags); + if (pCifsFile == NULL) { + CIFSSMBClose(xid, tcon, netfid); + rc = -ENOMEM; + } + + cifs_fscache_set_inode_cookie(inode, file); + goto out; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) @@ -391,16 +366,18 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } + rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid); + if (rc != 0) + goto out; + pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, file->f_flags); - file->private_data = pCifsFile; - if (file->private_data == NULL) { + if (pCifsFile == NULL) { rc = -ENOMEM; goto out; } - rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon, - &oplock, buf, full_path, xid); + cifs_fscache_set_inode_cookie(inode, file); if (oplock & CIFS_CREATE_ACTION) { /* time to set mode which we can not set earlier due to @@ -456,7 +433,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush) __u16 netfid; if (file->private_data) - pCifsFile = (struct cifsFileInfo *)file->private_data; + pCifsFile = file->private_data; else return -EBADF; @@ -513,8 +490,7 @@ reopen_error_exit: le64_to_cpu(tcon->fsUnixInfo.Capability))) { int oflags = (int) cifs_posix_convert_flags(file->f_flags); /* can not refresh inode info since size could be stale */ - rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, - inode->i_sb, + rc = cifs_posix_open(full_path, NULL, inode->i_sb, cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &netfid, xid); if (rc == 0) { @@ -595,8 +571,7 @@ int cifs_close(struct inode *inode, struct file *file) int xid, timeout; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pSMBFile = file->private_data; xid = GetXid(); @@ -671,8 +646,7 @@ int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; int xid; - struct cifsFileInfo *pCFileStruct = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pCFileStruct = file->private_data; char *ptmp; cFYI(1, "Closedir inode = 0x%p", inode); @@ -893,8 +867,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) length, pfLock, posix_lock_type, wait_flag); } else { - struct cifsFileInfo *fid = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *fid = file->private_data; if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, @@ -995,7 +968,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *) file->private_data; + open_file = file->private_data; rc = generic_write_checks(file, poffset, &write_size, 0); if (rc) @@ -1097,7 +1070,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; xid = GetXid(); @@ -1681,8 +1654,7 @@ int cifs_fsync(struct file *file, int datasync) int xid; int rc = 0; struct cifsTconInfo *tcon; - struct cifsFileInfo *smbfile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; xid = GetXid(); @@ -1786,7 +1758,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1867,7 +1839,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1952,6 +1924,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, bytes_read -= PAGE_CACHE_SIZE; continue; } + page_cache_release(page); target = kmap_atomic(page, KM_USER0); @@ -1971,6 +1944,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); data += PAGE_CACHE_SIZE; + + /* add page to FS-Cache */ + cifs_readpage_to_fscache(mapping->host, page); } return; } @@ -1997,10 +1973,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; + /* + * Reads as many pages as possible from fscache. Returns -ENOBUFS + * immediately if the cookie is negative + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + goto read_complete; + cFYI(DBG2, "rpages: num pages %d", num_pages); for (i = 0; i < num_pages; ) { unsigned contig_pages; @@ -2111,6 +2096,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, smb_read_data = NULL; } +read_complete: FreeXid(xid); return rc; } @@ -2121,6 +2107,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page, char *read_data; int rc; + /* Is the page cached? */ + rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + if (rc == 0) + goto read_complete; + page_cache_get(page); read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ @@ -2140,11 +2131,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + rc = 0; io_error: kunmap(page); page_cache_release(page); + +read_complete: return rc; } @@ -2294,8 +2291,23 @@ out: return rc; } -static void -cifs_oplock_break(struct slow_work *work) +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned long offset) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + +void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); @@ -2332,33 +2344,30 @@ cifs_oplock_break(struct slow_work *work) LOCKING_ANDX_OPLOCK_RELEASE, false); cFYI(1, "Oplock release rc = %d", rc); } + + /* + * We might have kicked in before is_valid_oplock_break() + * finished grabbing reference for us. Make sure it's done by + * waiting for GlobalSMSSeslock. + */ + write_lock(&GlobalSMBSeslock); + write_unlock(&GlobalSMBSeslock); + + cifs_oplock_break_put(cfile); } -static int -cifs_oplock_break_get(struct slow_work *work) +void cifs_oplock_break_get(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntget(cfile->mnt); cifsFileInfo_get(cfile); - return 0; } -static void -cifs_oplock_break_put(struct slow_work *work) +void cifs_oplock_break_put(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntput(cfile->mnt); cifsFileInfo_put(cfile); } -const struct slow_work_ops cifs_oplock_break_ops = { - .get_ref = cifs_oplock_break_get, - .put_ref = cifs_oplock_break_put, - .execute = cifs_oplock_break, -}; - const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, @@ -2367,6 +2376,8 @@ const struct address_space_operations cifs_addr_ops = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; @@ -2383,6 +2394,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c new file mode 100644 index 00000000000..9f3f5c4be16 --- /dev/null +++ b/fs/cifs/fscache.c @@ -0,0 +1,236 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server); + cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, + server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, + server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon); + cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", + server->fscache, tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) +{ + cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifsi->fscache) + return; + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", + cifs_sb->tcon->fscache, cifsi->fscache); +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS releasing inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS disabling inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else { + cifs_fscache_enable_inode_cookie(inode); + cFYI(1, "CIFS: fscache inode cookie set"); + } +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", + cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", + page, cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; +} + +static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", + page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cFYI(1, "CIFS: readpage_from_fscache: submitted"); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cFYI(1, "CIFS: readpage_from_fscache %d", ret); + return 1; + + default: + cERROR(1, "unknown error ret = %d", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cFYI(1, "CIFS: readpages_from_fscache: submitted"); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cFYI(1, "CIFS: readpages_from_fscache: no page"); + return 1; + + default: + cFYI(1, "unknown error ret = %d", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h new file mode 100644 index 00000000000..31b88ec2341 --- /dev/null +++ b/fs/cifs/fscache.h @@ -0,0 +1,136 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include <linux/fscache.h> + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *); +extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 62b324f26a5..dc4c47ab958 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -29,6 +29,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) @@ -288,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -515,7 +516,7 @@ int cifs_get_file_info(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -723,18 +724,17 @@ cifs_find_inode(struct inode *inode, void *opaque) { struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; - /* - * uh oh -- it's a directory. We can't use it since hardlinked dirs are - * verboten. Disable serverino and return it as if it were found, the - * caller can discard it, generate a uniqueid and retry the find - */ - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + + /* if it's not a directory or has no dentries, then flag it */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; - cifs_autodisable_serverino(CIFS_SB(inode->i_sb)); - } return 1; } @@ -748,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque) return 0; } +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. + */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&dcache_lock); + return true; + } + } + spin_unlock(&dcache_lock); + return false; +} + /* Given fattrs, get a corresponding inode */ struct inode * cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) @@ -763,12 +784,16 @@ retry_iget5_locked: inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); if (inode) { - /* was there a problematic inode number collision? */ + /* was there a potentially problematic inode collision? */ if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { - iput(inode); - fattr->cf_uniqueid = iunique(sb, ROOT_I); fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; - goto retry_iget5_locked; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } } cifs_fattr_to_inode(inode, fattr); @@ -776,6 +801,10 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif unlock_new_inode(inode); } } @@ -807,6 +836,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + if (rc && cifs_sb->tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; @@ -1401,6 +1435,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, if (rc == 0 || rc != -ETXTBSY) return rc; + /* open-file renames don't work across directories */ + if (to_dentry->d_parent != from_dentry->d_parent) + return rc; + /* open the file to be renamed -- we need DELETE perms */ rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, CREATE_NOT_DIR, &srcfid, &oplock, NULL, @@ -1564,6 +1602,7 @@ cifs_invalidate_mapping(struct inode *inode) cifs_i->write_behind_rc = rc; } invalidate_remote_inode(inode); + cifs_fscache_reset_inode_cookie(inode); } int cifs_revalidate_file(struct file *filp) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 505926f1ee6..9d38a71c8e1 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -41,8 +41,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) __u64 ExtAttrMask = 0; __u64 caps; struct cifsTconInfo *tcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)filep->private_data; + struct cifsFileInfo *pSMBFile = filep->private_data; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1394aa37f26..3ccadc1326d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) struct cifsTconInfo *tcon; struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; - int rc; cFYI(1, "Checking for oplock break or dnotify response"); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && @@ -583,13 +582,18 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead = false; - rc = slow_work_enqueue(&netfile->oplock_break); - if (rc) { - cERROR(1, "failed to enqueue oplock " - "break: %d\n", rc); - } else { - netfile->oplock_break_cancelled = false; - } + + /* + * cifs_oplock_break_put() can't be called + * from here. Get reference after queueing + * succeeded. cifs_oplock_break() will + * synchronize using GlobalSMSSeslock. + */ + if (queue_work(system_nrt_wq, + &netfile->oplock_break)) + cifs_oplock_break_get(netfile); + netfile->oplock_break_cancelled = false; + read_unlock(&GlobalSMBSeslock); read_unlock(&cifs_tcp_ses_lock); return true; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d35d52889cb..f97851119e6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRremcd, -EACCES}, {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, {ERRbadshare, -ETXTBSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, @@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = { * Returns 0 on failure. */ static int -cifs_inet_pton(const int address_family, const char *cp, void *dst) +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) - ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); + ret = in4_pton(cp, len, dst, '\\', NULL); else if (address_family == AF_INET6) - ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); + ret = in6_pton(cp, len, dst , '\\', NULL); - cFYI(DBG2, "address conversion returned %d for %s", ret, cp); + cFYI(DBG2, "address conversion returned %d for %*.*s", + ret, len, len, cp); if (ret > 0) ret = 1; return ret; @@ -164,43 +166,66 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst) * Returns 0 on failure. */ int -cifs_convert_address(char *src, void *dst) +cifs_convert_address(struct sockaddr *dst, const char *src, int len) { - int rc; - char *pct, *endp; + int rc, alen, slen; + const char *pct; + char *endp, scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; /* IPv4 address */ - if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { s4->sin_family = AF_INET; return 1; } - /* temporarily terminate string */ - pct = strchr(src, '%'); - if (pct) - *pct = '\0'; - - rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr); - - /* repair temp termination (if any) and make pct point to scopeid */ - if (pct) - *pct++ = '%'; + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); if (!rc) return rc; s6->sin6_family = AF_INET6; if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); - if (!*pct || *endp) + if (endp != scope_id + slen) return 0; } return rc; } +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + + switch (dst->sa_family) { + case AF_INET: + ((struct sockaddr_in *)dst)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); + break; + default: + return 0; + } + + return 1; +} + /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index daf1753af67..d5e591fab47 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + break; + } + for (i = 0; (i < num_to_fill) && (rc == 0); i++) { if (current_entry == NULL) { /* evaluate whether this case is an error */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7707389bdf2..0a57cb7db5d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -730,15 +730,7 @@ ssetup_ntlmssp_authenticate: /* calculate session key */ setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); - if (first_time) /* should this be moved into common code - with similar ntlmv2 path? */ - /* cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key, - response BB FIXME, v2_sess_key); */ - - /* copy session key */ - - /* memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE); - bcc_ptr += LM2_SESS_KEY_SIZE; */ + /* FIXME: calculate MAC key */ memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp)); bcc_ptr += sizeof(struct ntlmv2_resp); diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index c5084d27db7..7f16cb825fe 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -76,6 +76,7 @@ #define ERRnofiles 18 /* A File Search command can find no more files matching the specified criteria. */ +#define ERRwriteprot 19 /* media is write protected */ #define ERRgeneral 31 #define ERRbadshare 32 /* The sharing mode specified for an Open conflicts with existing FIDs on diff --git a/fs/compat.c b/fs/compat.c index f0b391c5055..c6fda9aeb86 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -8,7 +8,7 @@ * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -626,7 +626,7 @@ ssize_t compat_rw_copy_check_uvector(int type, tot_len += len; if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ goto out; - if (!access_ok(vrfy_dir(type), buf, len)) { + if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 641640dc7ae..63ae8583146 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -4,7 +4,7 @@ * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) * * These routines maintain argument size conversion between 32bit and 64bit * ioctls. @@ -601,8 +601,11 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, } /* Bluetooth ioctls */ -#define HCIUARTSETPROTO _IOW('U', 200, int) -#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) #define BNEPCONNADD _IOW('B', 200, int) #define BNEPCONNDEL _IOW('B', 201, int) @@ -1328,6 +1331,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL) COMPATIBLE_IOCTL(HCISETLINKMODE) COMPATIBLE_IOCTL(HCISETACLMTU) COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 41645142b88..cf78d44a8d6 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -72,10 +72,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (!sd) return -EINVAL; - error = simple_setattr(dentry, iattr); - if (error) - return error; - sd_iattr = sd->s_iattr; if (!sd_iattr) { /* setting attributes for the first time, allocate now */ @@ -89,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; sd->s_iattr = sd_iattr; } - /* attributes were changed atleast once in past */ + error = simple_setattr(dentry, iattr); + if (error) + return error; + if (ia_valid & ATTR_UID) sd_iattr->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) diff --git a/fs/dcache.c b/fs/dcache.c index d96047b4a63..86d4db15473 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -590,6 +590,8 @@ static void prune_dcache(int count) up_read(&sb->s_umount); } spin_lock(&sb_lock); + /* lock was dropped, must reset next */ + list_safe_reset_next(sb, n, s_list); count -= pruned; __put_super(sb); /* more work left to do? */ @@ -894,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aacf531..a10cb91cade 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c0d35c62052..37a34c2c622 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id) for (i = 0 ; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry(con, h, &connection_hash[i], list) { - if (con && con->sctp_assoc == assoc_id) { + if (con->sctp_assoc == assoc_id) { mutex_unlock(&connections_lock); return con; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 2c6ad518100..ef17e0169da 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = { int __init dlm_netlink_init(void) { - int rv; - - rv = genl_register_family(&family); - if (rv) - return rv; - - rv = genl_register_ops(&family, &dlm_nl_ops); - if (rv < 0) - goto err; - return 0; - err: - genl_unregister_family(&family); - return rv; + return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); } void dlm_netlink_exit(void) { - genl_unregister_ops(&family, &dlm_nl_ops); genl_unregister_family(&family); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 1cc087635a5..a2e3b562e65 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, /** * ecryptfs_init_crypt_ctx - * @crypt_stat: Uninitilized crypt stats structure + * @crypt_stat: Uninitialized crypt stats structure * * Initialize the crypto context. * diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 2d8dbce9d48..46c4dd8dfcc 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; -static int ecryptfs_hash_buckets; +static int ecryptfs_hash_bits; #define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_buckets) + hash_long((unsigned long)uid, ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void) } mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); - ecryptfs_hash_buckets = 1; - while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) - ecryptfs_hash_buckets++; + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) - * ecryptfs_hash_buckets), GFP_KERNEL); + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } - for (i = 0; i < ecryptfs_hash_buckets; i++) + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) @@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void) int i; mutex_lock(&ecryptfs_daemon_hash_mux); - for (i = 0; i < ecryptfs_hash_buckets; i++) { + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; hlist_for_each_entry(daemon, elem, diff --git a/fs/exec.c b/fs/exec.c index e19de6a8033..97d91a03fb1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -653,6 +653,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ca7e2a0ed98..2bcc0431bad 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; else { inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); if (error == 0) acl = NULL; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 19214435b75..3675088cb88 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1552,7 +1552,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; } - if (iattr->ia_valid & ATTR_SIZE) { + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { error = ext2_setsize(inode, iattr->ia_size); if (error) return error; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 522b15498f4..e8c6ba0e4a3 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -31,6 +31,7 @@ config EXT3_FS config EXT3_DEFAULTS_TO_ORDERED bool "Default to 'data=ordered' in ext3" depends on EXT3_FS + default y help The journal mode options for ext3 have different tradeoffs between when data is guaranteed to be on disk and diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 01552abbca3..8a11fe21218 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; else { inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 735f0190ec2..001eb0e2d48 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1149,9 +1149,25 @@ static int walk_page_buffers( handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext3_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; } /* @@ -1625,10 +1641,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1922,17 +1935,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -2284,27 +2286,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -2327,6 +2308,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, truncate_restart_transaction(handle, inode); } + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + ext3_free_blocks(handle, inode, nr, 1); if (parent_bh) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index ee184084ca4..2b35ddb70d6 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, ext3_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext3_bread(handle, dir, block, 0, &retval); if(!bh) return retval; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 54351ac7cef..0ccd7b12b73 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, ext3_fsblk_t n_blocks_count) { ext3_fsblk_t o_blocks_count; - unsigned long o_groups_count; ext3_grpblk_t last; ext3_grpblk_t add; struct buffer_head * bh; @@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); - o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6c953bb255e..9650a956fd0 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) */ seq_puts(seq, ",barrier="); seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1255,10 +1252,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); break; default: ext3_msg(sb, KERN_ERR, @@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) break; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - ext3_msg(sb, KERN_WARNING, - "warning: ignoring nobh option - " - "it is supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - } /* * The journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index feaf498feaa..5e2ed4504ea 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; else { inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 95b7594c76f..bd30799a43e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_grpblk_t bit; unsigned int i; struct ext4_group_desc *desc; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; struct ext4_group_info *grp; - sbi = EXT4_SB(sb); - es = sbi->s_es; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; - sb->s_dirt = 1; error_return: brelse(bitmap_bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5b6973fbf1b..3db5084db9b 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; - else + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } } return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ea5e6cb7e2a..374510f72ba 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) } -int ext4_check_dir_entry(const char *function, struct inode *dir, - struct ext4_dir_entry_2 *de, - struct buffer_head *bh, - unsigned int offset) +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, @@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error_inode(function, dir, - "bad entry in directory: %s - block=%llu" + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - " "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned long long) bh->b_blocknr, - (unsigned) (offset%bh->b_size), offset, + error_msg, (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; @@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + EXT4_INODE_INDEX); } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); @@ -193,7 +194,7 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry("ext4_readdir", inode, de, + if (!ext4_check_dir_entry(inode, de, bh, offset)) { /* * On error, skip the f_pos to the next block @@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct dir_private_info *info; int len; - info = (struct dir_private_info *) dir_file->private_data; + info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 19a4de57128..e03841d9f30 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,10 +57,13 @@ #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode(__func__, (inode), (fmt), ## a) + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) #define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, (file), (fmt), ## a) + ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -167,13 +170,15 @@ struct mpage_da_data { }; #define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { - struct list_head list; /* per-file finished AIO list */ + struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ } ext4_io_end_t; /* @@ -460,7 +465,7 @@ struct ext4_new_group_data { }; /* - * Flags used by ext4_get_blocks() + * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ @@ -873,7 +878,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ @@ -982,7 +986,7 @@ struct ext4_super_block { __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_reserved_char_pad; + __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ @@ -1000,12 +1004,34 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad2; + __u8 s_reserved_char_pad; __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __u32 s_reserved[160]; /* Padding to the end of the block */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_reserved[112]; /* Padding to the end of the block */ }; +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + #ifdef __KERNEL__ /* @@ -1143,6 +1169,9 @@ struct ext4_sb_info { /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1313,6 +1342,10 @@ EXT4_INODE_BIT_FNS(state, state_flags) #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times @@ -1379,6 +1412,43 @@ struct ext4_dir_entry_2 { #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ @@ -1510,9 +1580,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ -extern int ext4_check_dir_entry(const char *, struct inode *, - struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, de, bh, offset) \ + __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1601,8 +1673,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); extern int ext4_ext_migrate(struct inode *); /* namei.c */ -extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); -extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, @@ -1616,25 +1686,38 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void __ext4_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) -extern void ext4_error_inode(const char *, struct inode *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext4_error_file(const char *, struct file *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_std_error(struct super_block *, const char *, int); -extern void ext4_abort(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_warning(struct super_block *, const char *, +extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern void ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); +extern void ext4_error_file(struct file *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) + __attribute__ ((format (printf, 4, 5))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, - const char *, const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern void __ext4_grp_locked_error(const char *, unsigned int, \ + struct super_block *, ext4_group_t, \ + unsigned long, ext4_fsblk_t, \ + const char *, ...) + __attribute__ ((format (printf, 7, 8))); +#define ext4_grp_locked_error(sb, grp, message...) \ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1768,7 +1851,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __func__, (errno)); \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP @@ -1860,6 +1943,12 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt =1; +} + /* * Inodes and files operations */ @@ -1905,9 +1994,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_get_blocks(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); /* move_extent.c */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 53d2764d71c..6e272ef6ba9 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,29 +6,29 @@ #include <trace/events/ext4.h> -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_undo_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; } -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; @@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, * If the handle isn't valid we're not journaling, but we still need to * call into ext4_journal_revoke() to put the buffer head. */ -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr) +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; @@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); return err; } return 0; @@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { - ext4_journal_abort_handle(where, __func__, bh, handle, err); - ext4_abort(inode->i_sb, __func__, + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { int err = 0; @@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_create_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } return err; } -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } else { if (inode) mark_buffer_dirty_inode(bh, inode); @@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long) bh->b_blocknr); + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); err = -EIO; } } } return err; } + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dade0c02479..b0bd792c58c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, const char *err_fn, +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr); +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__func__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ - __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ - (block_nr)) + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ - __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); -int __ext4_journal_stop(const char *where, handle_t *handle); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__func__, (handle)) + __ext4_journal_stop(__func__, __LINE__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { @@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode) * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_mutex for direct I/O reads. This only works for extent-based - * files, and it doesn't work for nobh or if data journaling is - * enabled, since the dioread_nolock code uses b_private to pass - * information back to the I/O completion handler, and this conflicts - * with the jbd's use of b_private. + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; - if (test_opt(inode->i_sb, NOBH)) - return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 377309c1af6..06328d3e571 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -401,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode, return 1; } -static int __ext4_ext_check(const char *function, struct inode *inode, - struct ext4_extent_header *eh, - int depth) +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) { const char *error_msg; int max = 0; @@ -436,7 +436,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error_inode(function, inode, + ext4_error_inode(inode, function, line, 0, "bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", error_msg, le16_to_cpu(eh->eh_magic), @@ -447,7 +447,7 @@ corrupted: } #define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, inode, eh, depth) + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) int ext4_ext_check_inode(struct inode *inode) { @@ -1083,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; - struct ext4_extent_idx *fidx; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; @@ -1144,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(curp->p_idx, newblock); neh = ext_inode_hdr(inode); - fidx = EXT_FIRST_INDEX(neh); ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), - le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + idx_pblock(EXT_FIRST_INDEX(neh))); neh->eh_depth = cpu_to_le16(path->p_depth + 1); err = ext4_ext_dirty(handle, inode, curp); @@ -2937,7 +2936,7 @@ fix_extent_len: * One of more index blocks maybe needed if the extent tree grow after * the unintialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitilized extent called at this time will be split + * the IO. The uninitialized extent called at this time will be split * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). @@ -2954,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; @@ -2971,7 +2969,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); @@ -3058,7 +3055,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, err = PTR_ERR(path); goto out; } - eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5313ae4cda2..ee92b66d455 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); size_t length = iov_length(iov, nr_segs); - if (pos > sbi->s_bitmap_maxbytes) + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) return -EFBIG; if (pos + length > sbi->s_bitmap_maxbytes) { @@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (!IS_ERR(cp)) { memcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } } return dquot_file_open(inode, filp); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25c4b3173fd..ac377505ed5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -279,7 +279,7 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } else ext4_error(sb, "bit already cleared for inode %lu", ino); @@ -965,7 +965,7 @@ got: percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 19df61c321f..a0ab3754d0d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -221,6 +221,7 @@ void ext4_delete_inode(struct inode *inode) "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); goto no_delete; } } @@ -337,9 +338,11 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, struct inode *inode, +static int __ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) { + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; __le32 *bref = p; unsigned int blk; @@ -348,8 +351,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error_inode(function, inode, - "invalid block reference %u", blk); + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); return -EIO; } } @@ -358,11 +362,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, #define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /** @@ -1128,20 +1134,24 @@ void ext4_da_update_reserve_space(struct inode *inode, ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, const char *func, +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, struct ext4_map_blocks *map) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, map->m_len)) { - ext4_error_inode(func, inode, - "lblock %lu mapped to illegal pblock %llu " - "(length %d)", (unsigned long) map->m_lblk, - map->m_pblk, map->m_len); + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); return -EIO; } return 0; } +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + /* * Return the number of contiguous dirty pages in a given inode * starting at page frame idx. @@ -1244,7 +1254,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, __func__, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1324,9 +1334,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, - "ext4_map_blocks_after_alloc", - map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1519,9 +1527,25 @@ static int walk_page_buffers(handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext4_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; } /* @@ -2194,7 +2218,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * Call ext4_get_blocks() to allocate any delayed allocation + * Call ext4_map_blocks() to allocate any delayed allocation * blocks, or to convert an uninitialized extent to be * initialized (in the case where we have written into * one or more preallocated blocks). @@ -2203,7 +2227,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * indicate that we are on the delayed allocation path. This * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_get_blocks() + * want to change *many* call functions, so ext4_map_blocks() * will set the magic i_delalloc_reserved_flag once the * inode's allocation semaphore is taken. * @@ -2221,6 +2245,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + err = blks; /* * If get block returns with error we simply @@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(sb)) { mpd->retval = err; return 0; } @@ -2243,16 +2269,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * writepage and writepages will again try to write * the same. */ - ext4_msg(mpd->inode->i_sb, KERN_CRIT, - "delayed block allocation failed for inode %lu at " - "logical offset %llu with max blocks %zd with " - "error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - printk(KERN_CRIT "This should not happen!! " - "Data will be lost\n"); - if (err == -ENOSPC) { - ext4_print_free_blocks(mpd->inode); + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); } /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, @@ -2320,7 +2347,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * XXX Don't go larger than mballoc is willing to allocate * This is a stopgap solution. We eventually need to fold * mpage_da_submit_io() into this function and then call - * ext4_get_blocks() multiple times in a loop + * ext4_map_blocks() multiple times in a loop */ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) goto flush_it; @@ -2553,18 +2580,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write(), nobh_writepage(), and - * block_write_full_page(). These functions should only try to map a - * single block at a time. + * callback function for block_prepare_write() and block_write_full_page(). + * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller * requests it by passing in create=1, it is critically important that * any caller checks to make sure that any buffer heads are returned * by this function are either all already mapped or marked for - * delayed allocation before calling nobh_writepage() or - * block_write_full_page(). Otherwise, b_blocknr could be left - * unitialized, and the page write functions will be taken by - * surprise. + * delayed allocation before calling block_write_full_page(). Otherwise, + * b_blocknr could be left unitialized, and the page write functions will + * be taken by surprise. */ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2749,9 +2774,7 @@ static int ext4_writepage(struct page *page, return __ext4_journalled_writepage(page, len); } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else if (page_bufs && buffer_uninit(page_bufs)) { + if (page_bufs && buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); @@ -3146,13 +3169,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, int ret, retries = 0; struct page *page; pgoff_t index; - unsigned from, to; struct inode *inode = mapping->host; handle_t *handle; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; if (ext4_nonda_switch(inode->i_sb)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; @@ -3668,6 +3688,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } + if (io->iocb) + aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ io->flag = 0; return ret; @@ -3767,6 +3789,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) io->offset = 0; io->size = 0; io->page = NULL; + io->iocb = NULL; + io->result = 0; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3775,7 +3799,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3809,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,12 +3820,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); return; } io_end->offset = offset; io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ @@ -3937,7 +3968,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, return -ENOMEM; /* * we save the io structure for current async - * direct IO, so that later ext4_get_blocks() + * direct IO, so that later ext4_map_blocks() * could flag the io structure whether there * is a unwritten extents needs to be converted * when IO is completed. @@ -4128,17 +4159,6 @@ int ext4_block_truncate_page(handle_t *handle, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -4488,9 +4508,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - EXT4_ERROR_INODE(inode, - "Read failure block=%llu", - (unsigned long long) nr); + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); continue; } @@ -4502,27 +4521,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * jbd2_journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then jbd2_journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext4_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -4546,8 +4544,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ ext4_free_blocks(handle, inode, 0, nr, 1, - EXT4_FREE_BLOCKS_METADATA); + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* @@ -4805,8 +4815,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - EXT4_ERROR_INODE(inode, "unable to read inode block - " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4904,8 +4914,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - EXT4_ERROR_INODE(inode, "unable to read inode " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); brelse(bh); return -EIO; } @@ -4942,20 +4952,26 @@ void ext4_set_inode_flags(struct inode *inode) /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ void ext4_get_inode_flags(struct ext4_inode_info *ei) { - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT4_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT4_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT4_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT4_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT4_DIRSYNC_FL; + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -4970,7 +4986,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); - if (ei->i_flags & EXT4_HUGE_FILE_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { @@ -5066,7 +5082,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else @@ -5075,7 +5091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) tid = transaction->t_tid; else tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } @@ -5120,7 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ei->i_flags & EXT4_EXTENTS_FL) { + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) @@ -5191,7 +5207,7 @@ static int ext4_inode_blocks_set(handle_t *handle, */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; - ei->i_flags &= ~EXT4_HUGE_FILE_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) @@ -5204,9 +5220,9 @@ static int ext4_inode_blocks_set(handle_t *handle, */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ei->i_flags &= ~EXT4_HUGE_FILE_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { - ei->i_flags |= EXT4_HUGE_FILE_FL; + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); @@ -5400,9 +5416,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - EXT4_ERROR_INODE(inode, - "IO error syncing inode (block=%llu)", - (unsigned long long) iloc.bh->b_blocknr); + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); @@ -5477,10 +5492,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (attr->ia_size > sbi->s_bitmap_maxbytes) { - error = -EFBIG; - goto err_out; - } + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; } } @@ -5682,7 +5695,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. @@ -5748,7 +5761,6 @@ static int ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; @@ -5756,7 +5768,6 @@ static int ext4_expand_extra_isize(struct inode *inode, raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b3bc026a6..4b4ad4b7ce5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -712,9 +713,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_grp_locked_error(sb, group, __func__, - "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "%u blocks in bitmap, %u in gd", + free, grp->bb_free); /* * If we intent to continue, we consider group descritor * corrupt and update bb_free using bitmap value @@ -1296,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1788,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But bitmap says 0", free); break; @@ -1798,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1821,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, @@ -1999,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2041,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ @@ -2094,8 +2091,8 @@ repeat: ac->ac_groups_scanned++; if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -2221,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; @@ -2560,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb) return 0; } +static inline void ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t block, int count) +{ + int ret; + ext4_fsblk_t discard_block; + + discard_block = block + ext4_group_first_block_no(sb, block_group); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + ret = sb_issue_discard(sb, discard_block, count); + if (ret == EOPNOTSUPP) { + ext4_warning(sb, "discard not supported, disabling"); + clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + } +} + /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. @@ -2579,22 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) { - int ret; - ext4_fsblk_t discard_block; - - discard_block = entry->start_blk + - ext4_group_first_block_no(sb, entry->group); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - ret = sb_issue_discard(sb, discard_block, entry->count); - if (ret == EOPNOTSUPP) { - ext4_warning(sb, - "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2704,7 +2704,7 @@ void exit_ext4_mballoc(void) /* - * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int @@ -2712,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; @@ -2725,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); @@ -2812,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); brelse(bitmap_bh); return err; } @@ -2850,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -2881,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; /* max size of free chunks */ max = 2 << bsbits; @@ -2922,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -3547,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; - sector_t start; int err = 0; int free = 0; @@ -3567,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; if (ac) { @@ -3581,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, trace_ext4_mballoc_discard(ac); } - trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3591,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_grp_locked_error(sb, group, - __func__, "free %u, pa_free %u", + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3613,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(ac, pa); + trace_ext4_mb_release_group_pa(sb, ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return; + printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); printk(KERN_ERR "EXT4-fs: status %d flags %d\n", @@ -4255,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; @@ -4299,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; - goto out3; + goto out; } } @@ -4307,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (!ac) { ar->len = 0; *errp = -ENOMEM; - goto out1; + goto out; } *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out2; + goto out; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; @@ -4322,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,7 +4333,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); - if (*errp == -EAGAIN) { + if (*errp == -EAGAIN) { /* * drop the reference that we took * in ext4_mb_use_best_found @@ -4344,12 +4344,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) { + } else if (*errp) + errout: ext4_discard_allocated_blocks(ac); - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); - } else { + else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4358,19 +4356,19 @@ repeat: if (freed) goto repeat; *errp = -ENOSPC; + } + + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - -out2: - kmem_cache_free(ext4_ac_cachep, ac); -out1: +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); -out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) /* release all the reserved blocks if non delalloc */ @@ -4402,6 +4400,7 @@ static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { + ext4_group_t group = e4b->bd_group; ext4_grpblk_t block; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; @@ -4434,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_grp_locked_error(sb, e4b->bd_group, __func__, - "Double free of blocks %d (%d %d)", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + block, + "Block already on to-be-freed list"); return 0; } } @@ -4494,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; @@ -4513,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " @@ -4647,6 +4644,8 @@ do_more: mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4680,7 +4679,7 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); error_return: if (freed) dquot_free_block(inode, freed); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6f3a27ec30b..1765c2c50a9 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ei->i_flags |= EXT4_EXTENTS_FL; + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3a6c92ac131..5f1ed9fc913 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, */ static int mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function) + const char *function, unsigned int line) { int ret = 0; if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, line, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, line, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -960,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " @@ -1081,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; @@ -1118,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a43e6617b35..314c0d3b3fa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); -unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -} - -__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -} - /* * p is at least 6 bytes before the end of page */ @@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + if (!ext4_check_dir_entry(dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ @@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); - if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + if (!ext4_check_dir_entry(dir, de, bh, off)) { brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; @@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - struct inode *inode; static const struct qstr dotdot = { .name = "..", .len = 2, @@ -1097,7 +1071,6 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de); - inode = NULL; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) + if (!ext4_check_dir_entry(dir, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -1956,7 +1928,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { + if (!ext4_check_dir_entry(inode, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6df797eb9ae..ca5c8aa00a2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -921,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) &sbi->s_flex_groups[flex_group].free_inodes); } - ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - sb->s_dirt = 1; + ext4_handle_dirty_super(handle, sb); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -953,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; - ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; @@ -965,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); - o_groups_count = EXT4_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", @@ -1045,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - sb->s_dirt = 1; mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e8983a9811..8d65575f8c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,14 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; if (journal) { if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, "Detected aborted journal"); + ext4_abort(sb, "Detected aborted journal"); return ERR_PTR(-EROFS); } return jbd2_journal_start(journal, nblocks); @@ -262,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * that sync() will call the filesystem's write_super callback if * appropriate. */ -int __ext4_journal_stop(const char *where, handle_t *handle) +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; @@ -279,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle) if (!err) err = rc; if (err) - __ext4_std_error(sb, where, err); + __ext4_std_error(sb, where, line, err); return err; } -void ext4_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); @@ -300,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -323,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, static void ext4_handle_error(struct super_block *sb) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - if (sb->s_flags & MS_RDONLY) return; @@ -342,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb) ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } void __ext4_error(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", + sb->s_id, function, line, current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -362,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } -void ext4_error_inode(const char *function, struct inode *inode, +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", - inode->i_sb->s_id, function, inode->i_ino, current->comm); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk("block %llu: ", block); + printk("comm %s: ", current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -377,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode, ext4_handle_error(inode->i_sb); } -void ext4_error_file(const char *function, struct file *file, - const char *fmt, ...) +void ext4_error_file(struct file *file, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (!path) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", - inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + "EXT4-fs error (device %s): %s:%d: inode #%lu " + "(comm %s path %s): ", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path); vprintk(fmt, args); printk("\n"); va_end(args); @@ -435,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ -void __ext4_std_error(struct super_block *sb, const char *function, int errno) +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) { char nbuf[16]; const char *errstr; @@ -448,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) return; errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -464,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) * case we take the easy way out and panic immediately. */ -void ext4_abort(struct super_block *sb, const char *function, - const char *fmt, ...) +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); vprintk(fmt, args); printk("\n"); va_end(args); + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs panic from previous error\n"); - - if (sb->s_flags & MS_RDONLY) - return; - - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } void ext4_msg (struct super_block * sb, const char *prefix, @@ -502,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix, } void __ext4_warning(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", - sb->s_id, function); + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", + sb->s_id, function, line); vprintk(fmt, args); printk("\n"); va_end(args); } -void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, - const char *function, const char *fmt, ...) +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", + sb->s_id, function, line, grp); + if (ino) + printk("inode %lu: ", ino); + if (block) + printk("block %llu:", (unsigned long long) block); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_CONT)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, 0); return; } + ext4_unlock_group(sb, grp); ext4_handle_error(sb); /* @@ -660,8 +716,7 @@ static void ext4_put_super(struct super_block *sb) err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if (err < 0) - ext4_abort(sb, __func__, - "Couldn't clean up the journal"); + ext4_abort(sb, "Couldn't clean up the journal"); } ext4_release_system_zone(sb); @@ -946,14 +1001,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); else if (test_opt(sb, JOURNAL_CHECKSUM)) seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC)) + if (!test_opt(sb, DELALLOC) && + !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); - if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -977,7 +1030,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); - if (test_opt(sb, DISCARD)) + if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sb, NOLOAD)) @@ -986,6 +1039,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DIOREAD_NOLOCK)) seq_puts(seq, ",dioread_nolock"); + if (test_opt(sb, BLOCK_VALIDITY) && + !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) + seq_puts(seq, ",block_validity"); + ext4_show_quota_options(seq, sb); return 0; @@ -1065,6 +1122,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1086,7 +1144,7 @@ static const struct dquot_operations ext4_quota_operations = { static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = dquot_quota_off, + .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, .set_info = dquot_set_dqinfo, @@ -1624,10 +1682,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated bh option"); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); @@ -2249,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - sbi->s_sectors_written_start) >> 1); @@ -2259,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -2431,6 +2495,53 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 1; } +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2448,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; char *cp; const char *descr; - int ret = -EINVAL; + int ret = -ENOMEM; int blocksize; unsigned int db_count; unsigned int i; @@ -2459,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out_free_orig; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - return -ENOMEM; + goto out_free_orig; } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; @@ -2473,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, - sectors[1]); + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); unlock_kernel(); @@ -2482,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); @@ -2546,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, ERRORS_CONT); else set_opt(sbi->s_mount_opt, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sbi->s_mount_opt, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -2553,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, BARRIER); + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sbi->s_mount_opt, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb)) + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sbi->s_mount_opt, DELALLOC); + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, NULL, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) goto failed_mount; @@ -2912,18 +3037,7 @@ no_journal: ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount_wq; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " - "its supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " - "not supported with nobh mode"); - goto failed_mount_wq; - } - } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3010,7 +3124,7 @@ no_journal: ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { - ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount4; } @@ -3043,7 +3157,14 @@ no_journal: descr = "out journal"; ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s", descr, orig_data); + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ lock_kernel(); kfree(orig_data); @@ -3093,6 +3214,7 @@ out_fail: kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); +out_free_orig: kfree(orig_data); return ret; } @@ -3110,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else @@ -3119,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, @@ -3327,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb, if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); - if (!err) + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); @@ -3384,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & MS_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3491,7 +3626,7 @@ int ext4_force_commit(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; if (journal) { - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); ret = ext4_journal_force_commit(journal); } @@ -3616,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, __func__, "Abort forced by user"); + ext4_abort(sb, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3981,6 +4116,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static int ext4_quota_off(struct super_block *sb, int type) +{ + /* Force all delayed allocation blocks to be allocated */ + if (test_opt(sb, DELALLOC)) { + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + return dquot_quota_off(sb, type); +} + /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and noone else should touch the files) @@ -4030,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -4055,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } + err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (err) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 04338009793..a6f31424957 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_super(handle, sb); } } diff --git a/fs/fcntl.c b/fs/fcntl.c index f74d270ba15..9d175d623aa 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) - return ret; + return -EFAULT; switch (owner.type) { case F_OWNER_TID: @@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg) } read_unlock(&filp->f_owner.lock); - if (!ret) + if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); + if (ret) + ret = -EFAULT; + } return ret; } @@ -730,12 +733,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; + if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - spin_lock(&fa->fa_lock); + spin_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -744,7 +749,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - spin_unlock(&fa->fa_lock); + spin_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/file.c b/fs/file.c index 34bb7f71d99..cccaead962c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -178,7 +178,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = (fd_set *)data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = (fd_set *)data; - INIT_RCU_HEAD(&fdt->rcu); fdt->next = NULL; return fdt; @@ -312,7 +311,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; new_fdt->open_fds = (fd_set *)&newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&new_fdt->rcu); new_fdt->next = NULL; spin_lock(&oldf->file_lock); @@ -430,7 +428,6 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = (fd_set *)&init_files.close_on_exec_init, .open_fds = (fd_set *)&init_files.open_fds_init, - .rcu = RCU_HEAD_INIT, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 1e8af939b3e..5132c99b1ca 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data) } /** - * vxfs_read_super - read superblock into memory and initalize filesystem + * vxfs_read_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ea8592b9069..d5be1693ac9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -38,52 +38,18 @@ int nr_pdflush_threads; /* * Passed into wb_writeback(), essentially a subset of writeback_control */ -struct wb_writeback_args { +struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; - unsigned int sb_pinned:1; -}; -/* - * Work items for the bdi_writeback threads - */ -struct bdi_work { struct list_head list; /* pending work list */ - struct rcu_head rcu_head; /* for RCU free/clear of work */ - - unsigned long seen; /* threads that have seen this work */ - atomic_t pending; /* number of threads still to do work */ - - struct wb_writeback_args args; /* writeback arguments */ - - unsigned long state; /* flag bits, see WS_* */ + struct completion *done; /* set if the caller waits */ }; -enum { - WS_USED_B = 0, - WS_ONSTACK_B, -}; - -#define WS_USED (1 << WS_USED_B) -#define WS_ONSTACK (1 << WS_ONSTACK_B) - -static inline bool bdi_work_on_stack(struct bdi_work *work) -{ - return test_bit(WS_ONSTACK_B, &work->state); -} - -static inline void bdi_work_init(struct bdi_work *work, - struct wb_writeback_args *args) -{ - INIT_RCU_HEAD(&work->rcu_head); - work->args = *args; - work->state = WS_USED; -} - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -96,76 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) return !list_empty(&bdi->work_list); } -static void bdi_work_clear(struct bdi_work *work) -{ - clear_bit(WS_USED_B, &work->state); - smp_mb__after_clear_bit(); - /* - * work can have disappeared at this point. bit waitq functions - * should be able to tolerate this, provided bdi_sched_wait does - * not dereference it's pointer argument. - */ - wake_up_bit(&work->state, WS_USED_B); -} - -static void bdi_work_free(struct rcu_head *head) -{ - struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - - if (!bdi_work_on_stack(work)) - kfree(work); - else - bdi_work_clear(work); -} - -static void wb_work_complete(struct bdi_work *work) -{ - const enum writeback_sync_modes sync_mode = work->args.sync_mode; - int onstack = bdi_work_on_stack(work); - - /* - * For allocated work, we can clear the done/seen bit right here. - * For on-stack work, we need to postpone both the clear and free - * to after the RCU grace period, since the stack could be invalidated - * as soon as bdi_work_clear() has done the wakeup. - */ - if (!onstack) - bdi_work_clear(work); - if (sync_mode == WB_SYNC_NONE || onstack) - call_rcu(&work->rcu_head, bdi_work_free); -} - -static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) -{ - /* - * The caller has retrieved the work arguments from this work, - * drop our reference. If this is the last ref, delete and free it - */ - if (atomic_dec_and_test(&work->pending)) { - struct backing_dev_info *bdi = wb->bdi; - - spin_lock(&bdi->wb_lock); - list_del_rcu(&work->list); - spin_unlock(&bdi->wb_lock); - - wb_work_complete(work); - } -} - -static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) { - work->seen = bdi->wb_mask; - BUG_ON(!work->seen); - atomic_set(&work->pending, bdi->wb_cnt); - BUG_ON(!bdi->wb_cnt); - - /* - * list_add_tail_rcu() contains the necessary barriers to - * make sure the above stores are seen before the item is - * noticed on the list - */ spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&work->list, &bdi->work_list); + list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); /* @@ -182,107 +83,59 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) } } -/* - * Used for on-stack allocated work items. The caller needs to wait until - * the wb threads have acked the work before it's safe to continue. - */ -static void bdi_wait_on_work_clear(struct bdi_work *work) -{ - wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); -} - -static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args, - int wait) +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic, bool for_background) { - struct bdi_work *work; + struct wb_writeback_work *work; /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - bdi_work_init(work, args); - bdi_queue_work(bdi, work); - if (wait) - bdi_wait_on_work_clear(work); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + return; } + + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + work->for_background = for_background; + + bdi_queue_work(bdi, work); } /** - * bdi_sync_writeback - start and wait for writeback + * bdi_start_writeback - start writeback * @bdi: the backing device to write from - * @sb: write inodes from this super_block + * @nr_pages: the number of pages to write * * Description: - * This does WB_SYNC_ALL data integrity writeback and waits for the - * IO to complete. Callers must hold the sb s_umount semaphore for - * reading, to avoid having the super disappear before we are done. + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore. + * */ -static void bdi_sync_writeback(struct backing_dev_info *bdi, - struct super_block *sb) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - struct wb_writeback_args args = { - .sb = sb, - .sync_mode = WB_SYNC_ALL, - .nr_pages = LONG_MAX, - .range_cyclic = 0, - /* - * Setting sb_pinned is not necessary for WB_SYNC_ALL, but - * lets make it explicitly clear. - */ - .sb_pinned = 1, - }; - struct bdi_work work; - - bdi_work_init(&work, &args); - work.state |= WS_ONSTACK; - - bdi_queue_work(bdi, &work); - bdi_wait_on_work_clear(&work); + __bdi_start_writeback(bdi, nr_pages, true, false); } /** - * bdi_start_writeback - start writeback + * bdi_start_background_writeback - start background writeback * @bdi: the backing device to write from - * @sb: write inodes from this super_block - * @nr_pages: the number of pages to write - * @sb_locked: caller already holds sb umount sem. * * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * This does WB_SYNC_NONE background writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller specifies whether sb umount sem is held already or not. - * + * completion. Caller need not hold sb s_umount semaphore. */ -void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages, int sb_locked) +void bdi_start_background_writeback(struct backing_dev_info *bdi) { - struct wb_writeback_args args = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .nr_pages = nr_pages, - .range_cyclic = 1, - .sb_pinned = sb_locked, - }; - - /* - * We treat @nr_pages=0 as the special case to do background writeback, - * ie. to sync pages until the background dirty threshold is reached. - */ - if (!nr_pages) { - args.nr_pages = LONG_MAX; - args.for_background = 1; - } - - bdi_alloc_queue_work(bdi, &args, sb_locked); + __bdi_start_writeback(bdi, LONG_MAX, true, true); } /* @@ -572,75 +425,69 @@ select_queue: return ret; } -static void unpin_sb_for_writeback(struct super_block *sb) -{ - up_read(&sb->s_umount); - put_super(sb); -} - -enum sb_pin_state { - SB_PINNED, - SB_NOT_PINNED, - SB_PIN_FAILED -}; - /* - * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * For background writeback the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't * go away while we are writing inodes from it. */ -static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, - struct super_block *sb) +static bool pin_sb_for_writeback(struct super_block *sb) { - /* - * Caller must already hold the ref for this - */ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - return SB_NOT_PINNED; - } spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + sb->s_count++; + spin_unlock(&sb_lock); + if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_unlock(&sb_lock); - return SB_PINNED; - } - /* - * umounted, drop rwsem again and fall through to failure - */ + if (sb->s_root) + return true; up_read(&sb->s_umount); } - sb->s_count--; - spin_unlock(&sb_lock); - return SB_PIN_FAILED; + + put_super(sb); + return false; } /* * Write a portion of b_io inodes which belong to @sb. - * If @wbc->sb != NULL, then find and write all such + * + * If @only_this_sb is true, then find and write all such * inodes. Otherwise write only ones which go sequentially * in reverse order. + * * Return 1, if the caller writeback routine should be * interrupted. Otherwise return 0. */ -static int writeback_sb_inodes(struct super_block *sb, - struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, + struct writeback_control *wbc, bool only_this_sb) { while (!list_empty(&wb->b_io)) { long pages_skipped; struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - if (wbc->sb && sb != inode->i_sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - if (sb != inode->i_sb) - /* finish with this superblock */ + + if (inode->i_sb != sb) { + if (only_this_sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ return 0; + } + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; @@ -678,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb, return 1; } -static void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { int ret = 0; @@ -692,24 +539,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); struct super_block *sb = inode->i_sb; - enum sb_pin_state state; - - if (wbc->sb && sb != wbc->sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - state = pin_sb_for_writeback(wbc, sb); - if (state == SB_PIN_FAILED) { + if (!pin_sb_for_writeback(sb)) { requeue_io(inode); continue; } - ret = writeback_sb_inodes(sb, wb, wbc); + ret = writeback_sb_inodes(sb, wb, wbc, false); + drop_super(sb); - if (state == SB_PINNED) - unpin_sb_for_writeback(sb); if (ret) break; } @@ -717,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, /* Leave any unwritten inodes on b_io */ } -void writeback_inodes_wbc(struct writeback_control *wbc) +static void __writeback_inodes_sb(struct super_block *sb, + struct bdi_writeback *wb, struct writeback_control *wbc) { - struct backing_dev_info *bdi = wbc->bdi; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); - writeback_inodes_wb(&bdi->wb, wbc); + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + writeback_sb_inodes(sb, wb, wbc, true); + spin_unlock(&inode_lock); } /* @@ -759,17 +602,14 @@ static inline bool over_bground_thresh(void) * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, - struct wb_writeback_args *args) + struct wb_writeback_work *work) { struct writeback_control wbc = { - .bdi = wb->bdi, - .sb = args->sb, - .sync_mode = args->sync_mode, + .sync_mode = work->sync_mode, .older_than_this = NULL, - .for_kupdate = args->for_kupdate, - .for_background = args->for_background, - .range_cyclic = args->range_cyclic, - .sb_pinned = args->sb_pinned, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, }; unsigned long oldest_jif; long wrote = 0; @@ -789,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Stop writeback when nr_pages has been consumed */ - if (args->nr_pages <= 0) + if (work->nr_pages <= 0) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (args->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh()) break; wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; - writeback_inodes_wb(wb, &wbc); - args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (work->sb) + __writeback_inodes_sb(work->sb, wb, &wbc); + else + writeback_inodes_wb(wb, &wbc); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; /* @@ -839,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb, } /* - * Return the next bdi_work struct that hasn't been processed by this - * wb thread yet. ->seen is initially set for each thread that exists - * for this device, when a thread first notices a piece of work it - * clears its bit. Depending on writeback type, the thread will notify - * completion on either receiving the work (WB_SYNC_NONE) or after - * it is done (WB_SYNC_ALL). + * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, - struct bdi_writeback *wb) +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) { - struct bdi_work *work, *ret = NULL; + struct wb_writeback_work *work = NULL; - rcu_read_lock(); - - list_for_each_entry_rcu(work, &bdi->work_list, list) { - if (!test_bit(wb->nr, &work->seen)) - continue; - clear_bit(wb->nr, &work->seen); - - ret = work; - break; + spin_lock(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); } - - rcu_read_unlock(); - return ret; + spin_unlock(&bdi->wb_lock); + return work; } static long wb_check_old_data_flush(struct bdi_writeback *wb) @@ -888,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) (inodes_stat.nr_inodes - inodes_stat.nr_unused); if (nr_pages) { - struct wb_writeback_args args = { + struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, }; - return wb_writeback(wb, &args); + return wb_writeback(wb, &work); } return 0; @@ -907,36 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; - struct bdi_work *work; + struct wb_writeback_work *work; long wrote = 0; while ((work = get_next_work_item(bdi, wb)) != NULL) { - struct wb_writeback_args args = work->args; - int post_clear; - /* * Override sync mode, in case we must wait for completion + * because this thread is exiting now. */ if (force_wait) - work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; - - post_clear = WB_SYNC_ALL || args.sb_pinned; - - /* - * If this isn't a data integrity operation, just notify - * that we have seen this work and we are now starting it. - */ - if (!post_clear) - wb_clear_pending(wb, work); + work->sync_mode = WB_SYNC_ALL; - wrote += wb_writeback(wb, &args); + wrote += wb_writeback(wb, work); /* - * This is a data integrity writeback, so only do the - * notification when we have completed the work. + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. */ - if (post_clear) - wb_clear_pending(wb, work); + if (work->done) + complete(work->done); + else + kfree(work); } /* @@ -993,42 +817,27 @@ int bdi_writeback_task(struct bdi_writeback *wb) } /* - * Schedule writeback for all backing devices. This does WB_SYNC_NONE - * writeback, for integrity writeback see bdi_sync_writeback(). + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. */ -static void bdi_writeback_all(struct super_block *sb, long nr_pages) +void wakeup_flusher_threads(long nr_pages) { - struct wb_writeback_args args = { - .sb = sb, - .nr_pages = nr_pages, - .sync_mode = WB_SYNC_NONE, - }; struct backing_dev_info *bdi; - rcu_read_lock(); + if (!nr_pages) { + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + } + rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - - bdi_alloc_queue_work(bdi, &args, 0); + __bdi_start_writeback(bdi, nr_pages, false, false); } - rcu_read_unlock(); } -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. - */ -void wakeup_flusher_threads(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - bdi_writeback_all(NULL, nr_pages); -} - static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1220,18 +1029,6 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } -static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) -{ - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); -} - /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1243,21 +1040,24 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) */ void writeback_inodes_sb(struct super_block *sb) { - __writeback_inodes_sb(sb, 0); -} -EXPORT_SYMBOL(writeback_inodes_sb); + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .done = &done, + }; -/** - * writeback_inodes_sb_locked - writeback dirty inodes from given super_block - * @sb: the superblock - * - * Like writeback_inodes_sb(), except the caller already holds the - * sb umount sem. - */ -void writeback_inodes_sb_locked(struct super_block *sb) -{ - __writeback_inodes_sb(sb, 1); + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + work.nr_pages = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); } +EXPORT_SYMBOL(writeback_inodes_sb); /** * writeback_inodes_sb_if_idle - start writeback if none underway @@ -1269,7 +1069,9 @@ void writeback_inodes_sb_locked(struct super_block *sb) int writeback_inodes_sb_if_idle(struct super_block *sb) { if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); writeback_inodes_sb(sb); + up_read(&sb->s_umount); return 1; } else return 0; @@ -1285,7 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); */ void sync_inodes_sb(struct super_block *sb) { - bdi_sync_writeback(sb->s_bdi, sb); + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + .done = &done, + }; + + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index cc94bb9563f..3f6dfa98988 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -1,7 +1,6 @@ config FSCACHE tristate "General filesystem local caching manager" - select SLOW_WORK help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index edd7434ab6e..6a026441c5a 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index add6bdb53f0..f9d856773f7 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/completion.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); @@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif /* * initialise the fs caching module */ static int __init fscache_init(void) { + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; int ret; - ret = slow_work_register_user(THIS_MODULE); - if (ret < 0) - goto error_slow_work; + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); ret = fscache_proc_init(); if (ret < 0) goto error_proc; +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, @@ -78,10 +162,16 @@ static int __init fscache_init(void) error_kobj: kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif fscache_proc_cleanup(); error_proc: - slow_work_unregister_user(THIS_MODULE); -error_slow_work: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: return ret; } @@ -96,8 +186,12 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif fscache_proc_cleanup(); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 4a8eb31c533..ebe29c58138 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -34,8 +34,8 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ #define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ u8 buf[512]; /* key and aux data buffer */ }; @@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) READS, NOREADS); FILTER(obj->events & obj->event_mask, EVENTS, NOEVENTS); - FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), - WORK, NOWORK); + FILTER(work_busy(&obj->work), WORK, NOWORK); } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", obj->debug_id, obj->parent ? obj->parent->debug_id : -1, fscache_object_states_short[obj->state], @@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, obj->events, obj->flags, - obj->work.flags); + work_busy(&obj->work)); no_cookie = true; keylen = auxlen = 0; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0b589a9b4ff..b6b897c550a 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,7 +14,6 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> -#include <linux/seq_file.h> #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_DEAD] = "DEAD", }; -static void fscache_object_slow_work_put_ref(struct slow_work *); -static int fscache_object_slow_work_get_ref(struct slow_work *); -static void fscache_object_slow_work_execute(struct slow_work *); -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); -#endif +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); -const struct slow_work_ops fscache_object_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_object_slow_work_get_ref, - .put_ref = fscache_object_slow_work_put_ref, - .execute = fscache_object_slow_work_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_object_slow_work_desc, -#endif -}; -EXPORT_SYMBOL(fscache_object_slow_work_ops); - /* * we need to notify the parent when an op completes that we had outstanding * upon it @@ -345,7 +329,7 @@ unsupported_event: /* * execute an object */ -static void fscache_object_slow_work_execute(struct slow_work *work) +void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work) if (object->events & object->event_mask) fscache_enqueue_object(object); clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); } - -/* - * describe an object for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *work, - struct seq_file *m) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - seq_printf(m, "FSC: OBJ%x: %s", - object->debug_id, - fscache_object_states_short[object->state]); -} -#endif +EXPORT_SYMBOL(fscache_object_work_func); /* * initialise an object @@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object) _enter(""); ASSERT(object->cookie != NULL); ASSERT(object->cookie->parent != NULL); - ASSERT(list_empty(&object->work.link)); if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | (1 << FSCACHE_OBJECT_EV_RELEASE) | @@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object) object->parent = NULL; } - /* this just shifts the object release to the slow work processor */ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); - fscache_stat_d(&fscache_n_cop_put_object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); _leave(""); } @@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache, } /* - * allow the slow work item processor to get a ref on an object + * get a ref on an object */ -static int fscache_object_slow_work_get_ref(struct slow_work *work) +static int fscache_get_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); int ret; fscache_stat(&fscache_n_cop_grab_object); @@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) } /* - * allow the slow work item processor to discard a ref on a work item + * discard a ref on a work item */ -static void fscache_object_slow_work_put_ref(struct slow_work *work) +static void fscache_put_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); - fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); @@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - slow_work_enqueue(&object->work); + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. + */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); } +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* * enqueue the dependents of an object for metadata-type processing @@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); - fscache_stat(&fscache_n_cop_put_object); - dep->cache->ops->put_object(dep); - fscache_stat_d(&fscache_n_cop_put_object); + fscache_put_object(dep); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f17cecafae4..b9f34eaede0 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); + case FSCACHE_OP_ASYNC: + _debug("queue async"); atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) + if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; case FSCACHE_OP_MYTHREAD: _debug("queue for caller's attention"); break; @@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work) } /* - * allow the slow work item processor to get a ref on an operation - */ -static int fscache_op_get_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - atomic_inc(&op->usage); - return 0; -} - -/* - * allow the slow work item processor to discard a ref on an operation - */ -static void fscache_op_put_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - fscache_put_operation(op); -} - -/* - * execute an operation using the slow thread pool to provide processing context - * - the caller holds a ref to this object, so we don't need to hold one + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one */ -static void fscache_op_execute(struct slow_work *work) +void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); + container_of(work, struct fscache_operation, work); unsigned long start; _enter("{OBJ%x OP%x,%d}", @@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work) start = jiffies; op->processor(op); fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); _leave(""); } - -/* - * describe an operation for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_op_desc(struct slow_work *work, struct seq_file *m) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", - op->object->debug_id, op->debug_id, - op->name, op->state, op->flags); -} -#endif - -const struct slow_work_ops fscache_op_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_op_get_ref, - .put_ref = fscache_op_put_ref, - .execute = fscache_op_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_op_desc, -#endif -}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 47aefd376e5..41c441c2058 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, page_busy: /* we might want to wait here, but that could deadlock the allocator as - * the slow-work threads writing to the cache may all end up sleeping + * the work threads writing to the cache may all end up sleeping * on memory allocation */ fscache_stat(&fscache_n_store_vmscan_busy); return false; @@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, NULL); - fscache_operation_init_slow(op, fscache_attr_changed_op); - op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -218,24 +217,6 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* - * handle secondary execution given to a retrieval op on behalf of the - * cache - */ -static void fscache_retrieval_work(struct work_struct *work) -{ - struct fscache_retrieval *op = - container_of(work, struct fscache_retrieval, op.fast_work); - unsigned long start; - - _enter("{OP%x}", op->op.debug_id); - - start = jiffies; - op->op.processor(&op->op); - fscache_hist(fscache_ops_histogram, start); - fscache_put_operation(&op->op); -} - -/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; - INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); fscache_set_op_name(&op->op, "Retr"); return op; @@ -710,30 +690,26 @@ static void fscache_write_op(struct fscache_operation *_op) goto superseded; } - if (page) { - radix_tree_tag_set(&cookie->stores, page->index, - FSCACHE_COOKIE_STORING_TAG); - radix_tree_tag_clear(&cookie->stores, page->index, - FSCACHE_COOKIE_PENDING_TAG); - } + radix_tree_tag_set(&cookie->stores, page->index, + FSCACHE_COOKIE_STORING_TAG); + radix_tree_tag_clear(&cookie->stores, page->index, + FSCACHE_COOKIE_PENDING_TAG); spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - if (page) { - fscache_set_op_state(&op->op, "Store"); - fscache_stat(&fscache_n_store_pages); - fscache_stat(&fscache_n_cop_write_page); - ret = object->cache->ops->write_page(op, page); - fscache_stat_d(&fscache_n_cop_write_page); - fscache_set_op_state(&op->op, "EndWrite"); - fscache_end_page_write(object, page); - if (ret < 0) { - fscache_set_op_state(&op->op, "Abort"); - fscache_abort_object(object); - } else { - fscache_enqueue_operation(&op->op); - } + fscache_set_op_state(&op->op, "Store"); + fscache_stat(&fscache_n_store_pages); + fscache_stat(&fscache_n_cop_write_page); + ret = object->cache->ops->write_page(op, page); + fscache_stat_d(&fscache_n_cop_write_page); + fscache_set_op_state(&op->op, "EndWrite"); + fscache_end_page_write(object, page); + if (ret < 0) { + fscache_set_op_state(&op->op, "Abort"); + fscache_abort_object(object); + } else { + fscache_enqueue_operation(&op->op); } _leave(""); @@ -799,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_release_write_op); - fscache_operation_init_slow(&op->op, fscache_write_op); - op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); @@ -856,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_store_ops); fscache_stat(&fscache_n_stores_ok); - /* the slow work queue now carries its own ref on the object */ + /* the work queue now carries its own ref on the object */ fscache_put_operation(&op->op); _leave(" = 0"); return 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9424796d663..69ad053ffd7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc) static void queue_request(struct fuse_conn *fc, struct fuse_req *req) { - req->in.h.unique = fuse_get_unique(fc); req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); @@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); } } @@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) else if (fc->conn_error) req->out.h.error = -ECONNREFUSED; else { + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); /* acquire extra reference, since request is still needed after request_end() */ @@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + /* * Called under fc->lock * @@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (!cs->write) { buf->ops->unmap(cs->pipe, buf, cs->mapaddr); } else { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(buf->page); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(cs->pg); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->mapaddr = kmap(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->mapaddr = kmap(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -1231,6 +1249,199 @@ err: return err; } +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + page_cache_release(page); + } +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1244,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INVAL_ENTRY: return fuse_notify_inval_entry(fc, size, cs); + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3cdc5f78a40..431be0795b6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask) exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (mask & MAY_ACCESS) { + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ada0adeb3bb..147c1f71bdb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } -static void fuse_write_update_size(struct inode *inode, loff_t pos) +void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f309f04064..57d4a3a0f10 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,7 @@ struct fuse_req { struct fuse_write_in in; struct fuse_write_out out; } write; + struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; } misc; @@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); +void fuse_write_update_size(struct inode *inode, loff_t pos); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index a47b4310711..cc966552214 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,7 +7,6 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 - select SLOW_WORK select QUOTACTL help A cluster filesystem. diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9f8b52500d6..5e96cbd8a45 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page, if (ret <= 0) return ret; - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } /** @@ -637,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } } - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; + alloc_required = gfs2_write_alloc_required(ip, pos, len); if (alloc_required || gfs2_is_jdata(ip)) gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4a48c0f4b40..6f482809d1a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1040,7 +1040,8 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - u64 dsize = size + sizeof(struct gfs2_inode); + u64 dsize = size + sizeof(struct gfs2_dinode); + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1243,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written - * @alloc_required: set to 1 if an alloc is required, 0 otherwise * - * Returns: errno + * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required) + unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head bh; @@ -1257,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, u64 lblock, lblock_stop, size; u64 end_of_file; - *alloc_required = 0; - if (!len) return 0; if (gfs2_is_stuffed(ip)) { if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) - *alloc_required = 1; + return 1; return 0; } - *alloc_required = 1; shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file) - return 0; + return 1; size = (lblock_stop - lblock) << shift; do { @@ -1284,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, bh.b_size = size; gfs2_block_map(&ip->i_inode, lblock, &bh, 0); if (!buffer_mapped(&bh)) - return 0; + return 1; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - *alloc_required = 0; return 0; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index c983177e05a..a20a5213135 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size); int gfs2_truncatei_resume(struct gfs2_inode *ip); int gfs2_file_dealloc(struct gfs2_inode *ip); int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required); + unsigned int len); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8295c5b5d4a..b9dd88a78dd 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, unsigned totlen = be16_to_cpu(dent->de_rec_len); if (gfs2_dirent_sentinel(dent)) - actual = GFS2_DIRENT_SIZE(0); + actual = 0; if (totlen - actual >= required) return 1; return 0; @@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) /* Change the pointers. Don't bother distinguishing stuffed from non-stuffed. This code is complicated enough already. */ - lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + /* Change the pointers */ for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); @@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip) /* Allocate both the "from" and "to" buffers in one big chunk */ - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); + if (!buf) + return -ENOMEM; for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, @@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, return 0; } +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir, int *copied, unsigned *depth, u64 leaf_no) @@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, BUG_ON(entries2 != entries); error = do_filldir_main(ip, offset, opaque, filldir, darr, entries, copied); -out_kfree: +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ed9a94f0ef1..4edd662c823 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; struct gfs2_holder gh; struct gfs2_alloc *al; int ret; @@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) goto out_unlock; ret = -ENOMEM; al = gfs2_alloc_get(ip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ddcdbf49353..9adf8f924e0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) } /** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * @@ -375,36 +399,13 @@ restart: } if (gh->gh_list.prev == &gl->gl_holders) return 1; + do_error(gl, 0); break; } return 0; } /** - * do_error - Something unexpected has happened during a lock request - * - */ - -static inline void do_error(struct gfs2_glock *gl, const int ret) -{ - struct gfs2_holder *gh, *tmp; - - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - continue; - if (ret & LM_OUT_ERROR) - gh->gh_error = -EIO; - else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - gh->gh_error = GLR_TRYFAILED; - else - continue; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - } -} - -/** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ @@ -1062,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); add_to_queue(gh); + if ((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); run_queue(gl, 1); spin_unlock(&gl->gl_spin); @@ -1319,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } /** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm @@ -1328,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + gl->gl_reply = ret; + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - struct gfs2_holder *gh; spin_lock(&gl->gl_spin); - gh = find_first_waiter(gl); - if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) && - (gl->gl_target != LM_ST_UNLOCKED)) || - ((ret & ~LM_OUT_ST_MASK) != 0)) + if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - if (test_bit(GLF_FROZEN, &gl->gl_flags)) + spin_unlock(&gl->gl_spin); return; + } + spin_unlock(&gl->gl_spin); } set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_hold(gl); @@ -1348,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d7363b22d..fdbf4b366fa 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/workqueue.h> -#include <linux/slow-work.h> #include <linux/dlm.h> #include <linux/buffer_head.h> @@ -383,7 +382,7 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - struct slow_work jd_work; + struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; #define JDF_RECOVERY 1 @@ -460,6 +459,7 @@ enum { SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b5612cbb62a..f03afd9c44b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; inode = gfs2_iget(sb, no_addr); @@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) goto gfs2_nfsbypass; @@ -228,7 +229,8 @@ gfs2_nfsbypass: fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: if (inode->i_state & I_NEW) ip->i_gl->gl_object = NULL; @@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; struct gfs2_holder gh; struct inode *inode; @@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; inode->i_mode = DT2IF(DT_UNKNOWN); @@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index fb2a5f93b7c..b1e9630eb46 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,7 +15,6 @@ #include <linux/init.h> #include <linux/gfs2_ondisk.h> #include <asm/atomic.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -24,6 +23,7 @@ #include "util.h" #include "glock.h" #include "quota.h" +#include "recovery.h" static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, @@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; - error = slow_work_register_user(THIS_MODULE); - if (error) - goto fail_slow; + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_NON_REENTRANT | WQ_RESCUER, 0); + if (!gfs_recovery_wq) + goto fail_wq; gfs2_register_debugfs(); @@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void) return 0; -fail_slow: +fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); @@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(gfs_recovery_wq); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3593b3a7290..4f44bdeb2f0 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> -#include <linux/slow-work.h> #include <linux/quotaops.h> #include "gfs2.h" @@ -76,7 +75,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) sb->s_fs_info = sdp; sdp->sd_vfs = sb; - + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); init_waitqueue_head(&sdp->sd_glock_wait); @@ -673,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); - slow_work_init(&jd->jd_work, &gfs2_recover_ops); + INIT_WORK(&jd->jd_work, gfs2_recover_func); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -782,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { - error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), + true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); @@ -792,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { - error = gfs2_recover_journal(sdp->sd_jdesc); + error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); goto fail_jinode_gh; @@ -1050,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ret = match_int(&tmp[0], &option); if (ret || option < 0) goto hostdata_error; - ls->ls_jid = option; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; break; case Opt_id: /* Obsolete, but left for backward compat purposes */ @@ -1102,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + schedule(); + return 0; +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_args.ar_spectator) + return 0; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + void gfs2_online_uevent(struct gfs2_sbd *sdp) { struct super_block *sb = sdp->sd_vfs; @@ -1194,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail_locking; + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 49667d68769..1bc6b5695e6 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; @@ -694,10 +694,8 @@ get_a_page: if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) { - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - } + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); } if (PageUptodate(page)) @@ -723,7 +721,7 @@ get_a_page: /* If quota straddles page boundary, we need to update the rest of the * quota at the beginning of the next page */ - if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { ptr = ptr + nbytes; nbytes = sizeof(struct gfs2_quota) - nbytes; offset = 0; @@ -789,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out; for (x = 0; x < num_qd; x++) { - int alloc_required; - offset = qd2offset(qda[x]); - error = gfs2_write_alloc_required(ip, offset, - sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_gunlock; - if (alloc_required) + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) nalloc++; } @@ -1457,10 +1449,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb, switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_ON: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); /*FALLTHRU*/ case GFS2_QUOTA_ACCOUNT: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); break; case GFS2_QUOTA_OFF: break; @@ -1506,7 +1498,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; + fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); @@ -1541,12 +1533,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, switch(type) { case USRQUOTA: type = QUOTA_USER; - if (fdq->d_flags != XFS_USER_QUOTA) + if (fdq->d_flags != FS_USER_QUOTA) return -EINVAL; break; case GRPQUOTA: type = QUOTA_GROUP; - if (fdq->d_flags != XFS_GROUP_QUOTA) + if (fdq->d_flags != FS_GROUP_QUOTA) return -EINVAL; break; default: @@ -1586,10 +1578,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, goto out_i; offset = qd2offset(qd); - error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_i; + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); if (alloc_required) { al = gfs2_alloc_get(ip); if (al == NULL) diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c8bd1..e7d236ca48b 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 4b9bece3d43..f7f89a94a5a 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,7 +14,6 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -28,6 +27,8 @@ #include "util.h" #include "dir.h" +struct workqueue_struct *gfs_recovery_wq; + int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { @@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -static int gfs2_recover_get_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) - return -EBUSY; - return 0; -} - -static void gfs2_recover_put_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&jd->jd_flags, JDF_RECOVERY); -} - -static void gfs2_recover_work(struct slow_work *work) +void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return; + goto done; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -590,32 +575,35 @@ fail_gunlock_j: } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); - fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -struct slow_work_ops gfs2_recover_ops = { - .owner = THIS_MODULE, - .get_ref = gfs2_recover_get_ref, - .put_ref = gfs2_recover_put_ref, - .execute = gfs2_recover_work, -}; - - static int gfs2_recovery_wait(void *word) { schedule(); return 0; } -int gfs2_recover_journal(struct gfs2_jdesc *jd) +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; - rv = slow_work_enqueue(&jd->jd_work); - if (rv) - return rv; - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 1616ac22569..2226136c764 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -12,6 +12,8 @@ #include "incore.h" +extern struct workqueue_struct *gfs_recovery_wq; + static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) { if (++*blk == sdp->sd_jdesc->jd_blocks) @@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern struct slow_work_ops gfs2_recover_ops; +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4d1aad38f1b..4140811a921 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - int ar; - int error; if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { @@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) } jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); - if (!error && ar) { + if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { gfs2_consist_inode(ip); - error = -EIO; + return -EIO; } - return error; + return 0; } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 37f5393e68e..ccacffd2faa 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -25,6 +25,7 @@ #include "quota.h" #include "util.h" #include "glops.h" +#include "recovery.h" struct gfs2_attr { struct attribute attr; @@ -325,6 +326,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first); } +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -352,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - rv = slow_work_enqueue(&jd->jd_work); + rv = gfs2_recover_journal(jd, false); break; } out: @@ -377,14 +402,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); } +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + sdp->sd_lockstruct.ls_jid = jid; + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0600, NULL, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); @@ -564,7 +616,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); - if (!sdp->sd_args.ar_spectator) + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) add_uevent_var(env, "UUID=%pUB", uuid); diff --git a/fs/inode.c b/fs/inode.c index 2bee20ae3d6..722860b323a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 93d1e47647b..f19ce94693d 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1281,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1481,7 +1477,6 @@ int journal_flush(journal_t *journal) int journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JFS_LOADED)); @@ -1490,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 54c9bc9e1b1..81051dafebf 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -283,12 +283,9 @@ int journal_recover(journal_t *journal) int journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; - struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 076d1cc44f9..1c23a0f4e8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh) void __jbd2_log_wait_for_space(journal_t *journal) { int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ nblocks = jbd_space_needed(journal); while (__jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); /* @@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); space_left = __jbd2_log_space_left(journal); @@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) if (journal->j_committing_transaction) tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) == 0) { @@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } @@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * next transaction ID we will write, and where it will * start. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { @@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* If the oldest pinned transaction is at the tail of the log already then there's not much we can do right now. */ if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 1; } @@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * If there is an external journal, we need to make sure that @@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 75716d3d2be..f52e5e8049f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -150,11 +150,11 @@ static int journal_submit_commit_record(journal_t *journal, */ if (ret == -EOPNOTSUPP && barrier_done) { printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: Disabling barriers on %s, " + "not supported by device\n", journal->j_devname); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* And try again, without the barrier */ lock_buffer(bh); @@ -180,11 +180,11 @@ retry: wait_on_buffer(bh); if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { printk(KERN_WARNING - "JBD2: wait_on_commit_record: sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: %s: disabling barries on %s - not supported " + "by device\n", __func__, journal->j_devname); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); lock_buffer(bh); clear_buffer_dirty(bh); @@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; /* @@ -417,23 +417,23 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { + while (atomic_read(&commit_transaction->t_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { + if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); - J_ASSERT (commit_transaction->t_outstanding_credits <= + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); /* @@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); @@ -519,19 +519,20 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); + atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; descriptor = NULL; @@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ - commit_transaction->t_outstanding_credits--; + atomic_dec(&commit_transaction->t_outstanding_credits); /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get @@ -977,7 +978,7 @@ restart_loop: * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction @@ -985,7 +986,7 @@ restart_loop: */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); goto restart_loop; } @@ -1003,7 +1004,8 @@ restart_loop: * File the transaction statistics */ stats.ts_tid = commit_transaction->t_tid; - stats.run.rs_handle_count = commit_transaction->t_handle_count; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); @@ -1037,7 +1039,7 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc2ff593276..ad5866aaf0f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -41,6 +41,7 @@ #include <linux/hash.h> #include <linux/log2.h> #include <linux/vmalloc.h> +#include <linux/backing-dev.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -48,8 +49,6 @@ #include <asm/uaccess.h> #include <asm/page.h> -EXPORT_SYMBOL(jbd2_journal_start); -EXPORT_SYMBOL(jbd2_journal_restart); EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -143,7 +142,7 @@ static int kjournald2(void *arg) /* * And now, wait forever for commit wakeup events. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) @@ -154,10 +153,10 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); goto loop; } @@ -169,9 +168,9 @@ loop: * be already stopped. */ jbd_debug(1, "Now suspending kjournald2\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); refrigerator(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, @@ -191,9 +190,9 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } @@ -211,7 +210,7 @@ loop: goto loop; end_loop: - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); @@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal) static void journal_kill_thread(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -297,7 +296,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); - struct jbd2_buffer_trigger_type *triggers; journal_t *journal = transaction->t_journal; /* @@ -311,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); @@ -328,21 +336,21 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); - triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); - triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* - * Fire any commit trigger. Do this before checking for escaping, - * as the trigger may modify the magic offset. If a copy-out - * happens afterwards, it will have the correct data in the buffer. + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. */ - jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, - triggers); + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); /* * Check for escaping @@ -443,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal) { int left = journal->j_free; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ /* * Be pessimistic here about the number of those free blocks which @@ -488,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -509,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction_t *transaction = NULL; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); @@ -517,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return 0; /* Nothing to retry */ } tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); return 1; } @@ -536,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; @@ -555,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -567,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG - spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } - spin_unlock(&journal->j_state_lock); #endif - spin_lock(&journal->j_state_lock); while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { printk(KERN_EMERG "journal commit I/O error\n"); @@ -603,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; @@ -611,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } @@ -831,7 +837,7 @@ static journal_t * journal_init_common (void) mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); + rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; @@ -1097,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); @@ -1125,12 +1131,12 @@ out: * any future commit will have to be careful to update the * superblock again to re-record the true start of the log. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (sb->s_start) journal->j_flags &= ~JBD2_FLUSHED; else journal->j_flags |= JBD2_FLUSHED; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -1392,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1546,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal) transaction_t *transaction = NULL; unsigned long old_tail; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { @@ -1559,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal) if (transaction) { tid_t tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ @@ -1586,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); old_tail = journal->j_tail; journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_tail = old_tail; J_ASSERT(!journal->j_running_transaction); @@ -1599,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } @@ -1618,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal) int jbd2_journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); @@ -1627,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; @@ -1666,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal) printk(KERN_ERR "Aborting journal on device %s.\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* Soft abort: record the abort error status in the journal superblock, @@ -1756,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal) { int err; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return err; } @@ -1776,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return err; } @@ -1794,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal) */ void jbd2_journal_ack_err(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) @@ -2202,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode) { - int writeout = 0; - if (!journal) return; restart: @@ -2220,9 +2217,6 @@ restart: goto restart; } - /* Do we need to wait for data writeback? */ - if (journal->j_committing_transaction == jinode->i_transaction) - writeout = 1; if (jinode->i_transaction) { list_del(&jinode->i_list); jinode->i_transaction = NULL; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 049281b7cb8..2bc4d5f116f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal) int jbd2_journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal, int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / tag_bytes); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68620a..d95cc9d0401 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> +#include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -83,65 +88,75 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * transaction's buffer credits. */ -static int start_this_handle(journal_t *journal, handle_t *handle) +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) { transaction_t *transaction; int needed; int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; - int ret = 0; unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; + return -ENOSPC; } alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); if (!new_transaction) { - ret = -ENOMEM; - goto out; + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. + */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; } } jbd_debug(3, "New handle %p going live.\n", handle); -repeat: - /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ - spin_lock(&journal->j_state_lock); -repeat_locked: +repeat: + read_lock(&journal->j_state_lock); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; + read_unlock(&journal->j_state_lock); + kfree(new_transaction); + return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); + if (!new_transaction) goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; } - jbd2_get_transaction(journal, new_transaction); - new_transaction = NULL; + write_unlock(&journal->j_state_lock); + goto repeat; } transaction = journal->j_running_transaction; @@ -155,7 +170,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -166,8 +181,8 @@ repeat_locked: * buffers requested by this operation, we need to stall pending a log * checkpoint to free some more log space. */ - spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* @@ -178,11 +193,11 @@ repeat_locked: DEFINE_WAIT(wait); jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -215,35 +230,48 @@ repeat_locked: */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); - __jbd2_log_wait_for_space(journal); - goto repeat_locked; + atomic_sub(nblocks, &transaction->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - if (time_after(transaction->t_start, ts)) { + * use and add the handle to the running transaction. + * + * In order for t_max_wait to be reliable, it must be + * protected by a lock. But doing so will mean that + * start_this_handle() can not be run in parallel on SMP + * systems, which limits our scalability. So we only enable + * it when debugging is enabled. We may want to use a + * separate flag, eventually, so we can enable this + * independently of debugging. + */ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); if (ts > transaction->t_max_wait) transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); } - +#endif handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; - transaction->t_handle_count++; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; + kfree(new_transaction); + return 0; } static struct lock_class_key jbd2_handle_key; @@ -278,7 +306,7 @@ static handle_t *new_handle(int nblocks) * * Return a pointer to a newly allocated handle, or NULL on failure */ -handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -298,7 +326,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) current->journal_info = handle; - err = start_this_handle(journal, handle); + err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; @@ -308,6 +336,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) out: return handle; } +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + /** * int jbd2_journal_extend() - extend buffer credits. @@ -342,7 +379,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) result = 1; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (handle->h_transaction->t_state != T_RUNNING) { @@ -352,7 +389,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " @@ -367,14 +404,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); unlock: spin_unlock(&transaction->t_handle_lock); error_out: - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); out: return result; } @@ -394,8 +431,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ - -int jbd2_journal_restart(handle_t *handle, int nblocks) +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -410,29 +446,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "restarting handle %p\n", handle); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); + ret = start_this_handle(journal, handle, gfp_mask); return ret; } +EXPORT_SYMBOL(jbd2__journal_restart); +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + /** * void jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. @@ -447,7 +489,7 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -458,19 +500,19 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { + if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); break; } prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but @@ -494,9 +536,9 @@ void jbd2_journal_unlock_updates (journal_t *journal) J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } @@ -725,6 +767,9 @@ done: page = jh2bh(jh)->b_page; offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); @@ -963,15 +1008,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, jh->b_triggers = type; } -void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); - if (!triggers || !triggers->t_commit) + if (!triggers || !triggers->t_frozen) return; - triggers->t_commit(triggers, bh, mapped_data, bh->b_size); + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, @@ -1235,7 +1280,8 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int err; + int err, wait_for_commit = 0; + tid_t tid; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1243,7 +1289,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; else { - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); err = 0; } @@ -1288,9 +1334,9 @@ int jbd2_journal_stop(handle_t *handle) journal->j_last_sync_writer = pid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); @@ -1311,14 +1357,8 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit @@ -1327,15 +1367,13 @@ int jbd2_journal_stop(handle_t *handle) * transaction is too old now. */ if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ - tid_t tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ @@ -1346,11 +1384,25 @@ int jbd2_journal_stop(handle_t *handle) * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = jbd2_log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); + wait_for_commit = 1; } + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start commiting on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); @@ -1716,7 +1768,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1769,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* There is no currently-running transaction. So the @@ -1783,7 +1835,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* The orphan record's transaction has @@ -1807,7 +1859,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -1826,7 +1878,7 @@ zap_buffer: zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2133,9 +2185,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index a33aab6b5e6..54a92fd02bb 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (inode->i_mode != mode) { struct iattr attr; - attr.ia_valid = ATTR_MODE; + attr.ia_valid = ATTR_MODE | ATTR_CTIME; attr.ia_mode = mode; + attr.ia_ctime = CURRENT_TIME_SEC; rc = jffs2_do_setattr(inode, &attr); if (rc < 0) return rc; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 7aa4417e085..166062a6823 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); jffs2_free_raw_inode(ri); - d_instantiate(dentry, inode); D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages)); + + d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; fail: make_bad_inode(inode); + unlock_new_inode(inode); iput(inode); jffs2_free_raw_inode(ri); return ret; @@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* We use f->target field to store the target path. */ @@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } memcpy(f->target, target, targetlen + 1); @@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } @@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* No data here. Only a metadata node, which will be obsoleted by the first data write @@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) @@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de /* Eeek. Wave bye bye */ mutex_unlock(&f->sem); jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return PTR_ERR(fn); + ret = PTR_ERR(fn); + goto fail; } /* No data here. Only a metadata node, which will be obsoleted by the first data write @@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); ret = jffs2_init_security(inode, dir_i); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; + ret = jffs2_init_acl_post(inode); - if (ret) { - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - if (ret) { - /* Eep. */ - jffs2_clear_inode(inode); - return ret; - } + if (ret) + goto fail; rd = jffs2_alloc_raw_dirent(); if (!rd) { /* Argh. Now we treat it like a normal delete */ jffs2_complete_reservation(c); - jffs2_clear_inode(inode); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } dir_f = JFFS2_INODE_INFO(dir_i); @@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); mutex_unlock(&dir_f->sem); - jffs2_clear_inode(inode); - return PTR_ERR(fd); + ret = PTR_ERR(fd); + goto fail; } dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); @@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); d_instantiate(dentry, inode); - + unlock_new_inode(inode); return 0; + + fail: + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ret; } static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 8bc2c80ab15..459d39d1ea0 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i inode->i_blocks = 0; inode->i_size = 0; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } return inode; } diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index a2d58c96f1b..d258e261bdc 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { - /* success of check_xattr_ref_inode() means taht inode (ic) dose not have + /* success of check_xattr_ref_inode() means that inode (ic) dose not have * duplicate name/value pairs. If duplicate name/value pair would be found, * one will be removed. */ diff --git a/fs/libfs.c b/fs/libfs.c index 09e1016eb77..dcaf972cbf1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -489,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping, * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. */ -int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) +int simple_fill_super(struct super_block *s, unsigned long magic, + struct tree_descr *files) { struct inode *inode; struct dentry *root; diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3d04a..e28f21b9534 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 91969589131..1dbf921ca44 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -75,10 +75,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) if (!IS_ERR(page)) kmap(page); return page; - -fail: - dir_put_page(page); - return ERR_PTR(-EIO); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) diff --git a/fs/namei.c b/fs/namei.c index 868d0cb9d47..42d2d28fb82 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask) if (retval) return retval; - return security_inode_permission(inode, - mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); + return security_inode_permission(inode, mask); } /** @@ -1484,8 +1483,7 @@ static int handle_truncate(struct path *path) */ error = locks_verify_locked(inode); if (!error) - error = security_path_truncate(path, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + error = security_path_truncate(path); if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index fa338515402..1e634deff94 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -728,8 +728,8 @@ out_fput: out_bdi: /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: * - * The previously used put_filp(ncp_filp); was bogous, since - * it doesn't proper unlocking. + * The previously used put_filp(ncp_filp); was bogus, since + * it doesn't perform proper unlocking. */ fput(ncp_filp); out: diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index a43d07e7b92..cc1bb33b59b 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -61,8 +61,8 @@ config NFS_V3_ACL If unsure, say N. config NFS_V4 - bool "NFS client support for NFS version 4 (EXPERIMENTAL)" - depends on NFS_FS && EXPERIMENTAL + bool "NFS client support for NFS version 4" + depends on NFS_FS select RPCSEC_GSS_KRB5 help This option enables support for version 4 of the NFS protocol @@ -72,16 +72,16 @@ config NFS_V4 space programs which can be found in the Linux nfs-utils package, available from http://linux-nfs.org/. - If unsure, say N. + If unsure, say Y. config NFS_V4_1 - bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_V4 && EXPERIMENTAL help This option enables support for minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. - Unless you're an NFS developer, say N. + If unsure, say N. config ROOT_NFS bool "Root file system on NFS" diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a08770a7e85..930d10fecda 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * if (inode == NULL) goto out_putclient; nfsi = NFS_I(inode); - down_read(&nfsi->rwsem); - delegation = nfsi->delegation; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); @@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * args->bitmap[1]; res->status = 0; out_iput: - up_read(&nfsi->rwsem); + rcu_read_unlock(); iput(inode); out_putclient: nfs_put_client(clp); @@ -62,16 +62,6 @@ out: return res->status; } -static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) -{ -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion > 0) - return nfs41_validate_delegation_stateid; -#endif - return nfs4_validate_delegation_stateid; -} - - __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid, - nfs_validate_delegation_stateid(clp))) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: res = 0; break; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 7ec9b34a59f..4e7df2adb21 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) @@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_session = NULL; } - clp->cl_call_sync = _nfs4_call_sync; + clp->cl_mvops = nfs_v4_minor_ops[0]; #endif /* CONFIG_NFS_V4_1 */ } @@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); + nfs_callback_down(clp->cl_mvops->minor_version); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp) return error; } - error = nfs_callback_up(clp->cl_minorversion, + error = nfs_callback_up(clp->cl_mvops->minor_version, clp->cl_rpcclient->cl_xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", @@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp) */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { - clp->cl_call_sync = _nfs4_call_sync; - #if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion) { + if (clp->cl_mvops->minor_version) { struct nfs4_session *session = NULL; /* * Create the session and mark it expired. @@ -1158,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return -ENOMEM; clp->cl_session = session; - clp->cl_call_sync = _nfs4_call_sync_session; + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; } #endif /* CONFIG_NFS_V4_1 */ @@ -1286,6 +1291,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) #endif /* CONFIG_NFS_V4_1 */ } +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server); + if (error < 0) + goto out; + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; +out: + nfs_free_fattr(fattr); + return error; +} + /* * Create a version 4 volume record */ @@ -1346,7 +1400,6 @@ error: struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { - struct nfs_fattr *fattr; struct nfs_server *server; int error; @@ -1356,55 +1409,19 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (!server) return ERR_PTR(-ENOMEM); - error = -ENOMEM; - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - goto error; - /* set up the general RPC client */ error = nfs4_init_server(server, data); if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - error = nfs4_init_session(server); - if (error < 0) - goto error; - - /* Probe the root fh to retrieve its FSID */ - error = nfs4_get_rootfh(server, mntfh); - if (error < 0) - goto error; - - dprintk("Server FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); - - nfs4_session_set_rwsize(server); - - error = nfs_probe_fsinfo(server, mntfh, fattr); + error = nfs4_server_common_setup(server, mntfh); if (error < 0) goto error; - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; dprintk("<-- nfs4_create_server() = %p\n", server); - nfs_free_fattr(fattr); return server; error: - nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); @@ -1418,7 +1435,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; - struct nfs_fattr *fattr; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1427,11 +1443,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (!server) return ERR_PTR(-ENOMEM); - error = -ENOMEM; - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - goto error; - parent_server = NFS_SB(data->sb); parent_client = parent_server->nfs_client; @@ -1448,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout, - parent_client->cl_minorversion); + parent_client->cl_mvops->minor_version); if (error < 0) goto error; @@ -1456,40 +1467,14 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_get_rootfh(server, mntfh); - if (error < 0) - goto error; - - /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, mntfh, fattr); + error = nfs4_server_common_setup(server, mntfh); if (error < 0) goto error; - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - dprintk("Referral FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; - - nfs_free_fattr(fattr); dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: - nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 30163454397..b9c3c43cea1 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -268,14 +268,6 @@ out: return status; } -/* Sync all data to disk upon delegation return */ -static void nfs_msync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - nfs_wb_all(inode); - filemap_fdatawait(inode->i_mapping); -} - /* * Basic procedure for returning a delegation to the server */ @@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode) delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); spin_unlock(&clp->cl_lock); if (delegation != NULL) { - nfs_msync_inode(inode); + nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); } } @@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /* * Asynchronous delegation recall! */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (!validate_stateid(delegation, stateid)) { + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 69e7b814012..2026304bda1 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 782b431ef91..29539ceeb74 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1652,16 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* - * ... prune child dentries and writebacks if needed. - */ - if (atomic_read(&old_dentry->d_count) > 1) { - if (S_ISREG(old_inode->i_mode)) - nfs_wb_all(old_inode); - shrink_dcache_parent(old_dentry); - } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) nfs_inode_return_delegation(new_inode); @@ -1710,7 +1701,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; @@ -1953,7 +1944,7 @@ int nfs_permission(struct inode *inode, int mask) if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (mask & MAY_ACCESS) + if (mask & (MAY_ACCESS | MAY_CHDIR)) goto force_lookup; switch (inode->i_mode & S_IFMT) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad4cd31d605..064a8096167 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -69,6 +69,7 @@ struct nfs_direct_req { /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ @@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->rewrite_list); dreq->iocb = NULL; dreq->ctx = NULL; + dreq->l_ctx = NULL; spin_lock_init(&dreq->lock); atomic_set(&dreq->io_count, 0); dreq->count = 0; @@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); @@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; + if (dreq == NULL) + goto out; dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } @@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.offset = 0; data->args.count = 0; data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, size_t count) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; size_t wsize = NFS_SERVER(inode)->wsize; @@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq = nfs_direct_req_alloc(); if (!dreq) - return -ENOMEM; + goto out; nfs_alloc_commit_data(dreq); if (dreq->commit_data == NULL || count < wsize) @@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx != NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 36a5e74f51b..2d141a74ae8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> #include <linux/aio.h> #include <linux/gfp.h> +#include <linux/swap.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -202,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* - * Helper for nfs_file_flush() and nfs_file_fsync() - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occured, and hence cause it to - * fall back to doing a synchronous write. - */ -static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) -{ - int have_error, status; - int ret = 0; - - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - status = nfs_wb_all(inode); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) - ret = xchg(&ctx->error, 0); - if (!ret) - ret = status; - return ret; -} - -/* * Flush all dirty pages, and check for write errors. */ static int nfs_file_flush(struct file *file, fl_owner_t id) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -245,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return nfs_do_fsync(ctx, inode); + return vfs_fsync(file, 0); } static ssize_t @@ -320,6 +295,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * Flush any dirty pages for this process, and check for write errors. * The return status from this call provides a reliable indication of * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occured, and hence cause it to + * fall back to doing a synchronous write. */ static int nfs_file_fsync(struct file *file, int datasync) @@ -327,13 +309,23 @@ nfs_file_fsync(struct file *file, int datasync) struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - return nfs_do_fsync(ctx, inode); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret) + ret = status; + return ret; } /* @@ -493,11 +485,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if ((gfp & GFP_KERNEL) == GFP_KERNEL) - nfs_wb_page(page->mapping->host, page); + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; @@ -639,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); + int err = vfs_fsync(iocb->ki_filp, 0); if (err < 0) result = err; } @@ -675,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, written = ret; if (ret >= 0 && nfs_need_sync_write(filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(filp), inode); + int err = vfs_fsync(filp, 0); if (err < 0) ret = err; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 7428f7d6273..a70e446e160 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -146,7 +146,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) goto out; } - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fsinfo.fattr->mode)) { printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 099b3518fee..581d8f081e6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -413,10 +413,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) return 0; /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - filemap_write_and_wait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) nfs_wb_all(inode); - } fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -530,6 +528,68 @@ out: return err; } +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) |