From 650b22b941fa03590c4a3671e79ec2c96ea59e9a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:04 +0400 Subject: fuse: Linking file to inode helper When writeback is ON every writeable file should be in per-inode write list, not only mmap-ed ones. Thus introduce a helper for this linkage. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 77bcc30..2489aca 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, } EXPORT_SYMBOL_GPL(fuse_do_open); +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); +} + void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; @@ -1946,20 +1962,9 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_file *ff = file->private_data; - /* - * file may be written through mmap, so chain it onto the - * inodes's write_file list - */ - spin_lock(&fc->lock); - if (list_empty(&ff->write_entry)) - list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); - } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + file_accessed(file); vma->vm_ops = &fuse_file_vm_ops; return 0; -- cgit v0.10.2 From a92adc824ed5feaa2d4f7029f21170f574987aee Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:16 +0400 Subject: fuse: Prepare to handle short reads A helper which gets called when read reports less bytes than was requested. See patch "trust kernel i_size only" for details. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2489aca..592d7b4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -671,6 +671,15 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fc->lock); } +static void fuse_short_read(struct fuse_req *req, struct inode *inode, + u64 attr_ver) +{ + size_t num_read = req->out.args[0].size; + + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); +} + static int fuse_readpage(struct file *file, struct page *page) { struct fuse_io_priv io = { .async = 0, .file = file }; @@ -708,18 +717,18 @@ static int fuse_readpage(struct file *file, struct page *page) req->page_descs[0].length = count; num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; - fuse_put_request(fc, req); if (!err) { /* * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_read_update_size(inode, pos + num_read, attr_ver); + fuse_short_read(req, inode, attr_ver); SetPageUptodate(page); } + fuse_put_request(fc, req); fuse_invalidate_atime(inode); out: unlock_page(page); @@ -742,13 +751,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) { - loff_t pos; + if (!req->out.h.error && num_read < count) + fuse_short_read(req, inode, req->misc.read.attr_ver); - pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, - req->misc.read.attr_ver); - } fuse_invalidate_atime(inode); } -- cgit v0.10.2 From d5cd66c58edf10a7ee786659994595fd43995aab Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:30 +0400 Subject: fuse: Connection bit for enabling writeback Off (0) by default. Will be used in the next patches and will be turned on at the very end. Signed-off-by: Maxim Patlasov Signed-off-by: Pavel Emelyanov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2da5db2..374a8be 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -480,6 +480,9 @@ struct fuse_conn { /** Set if bdi is valid */ unsigned bdi_initialized:1; + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction -- cgit v0.10.2 From 8373200b124d03de7fa2e99be56de8642e604e9e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:10:46 +0400 Subject: fuse: Trust kernel i_size only Make fuse think that when writeback is on the inode's i_size is always up-to-date and not update it with the value received from the userspace. This is done because the page cache code may update i_size without letting the FS know. This assumption implies fixing the previously introduced short-read helper -- when a short read occurs the 'hole' is filled with zeroes. fuse_file_fallocate() is also fixed because now we should keep i_size up to date, so it must be updated if FUSE_FALLOCATE request succeeded. Signed-off-by: Maxim V. Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1d1292c..c52f143 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -839,6 +839,11 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) + attr->size = i_size_read(inode); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -1580,6 +1585,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; + bool is_wb = fc->writeback_cache; loff_t oldsize; int err; @@ -1651,7 +1657,9 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; - i_size_write(inode, outarg.attr.size); + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fc->lock */ @@ -1663,7 +1671,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ - if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 592d7b4..c091a17 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -675,9 +675,26 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, u64 attr_ver) { size_t num_read = req->out.args[0].size; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fc->writeback_cache) { + /* + * A hole in a file. Some data after the hole are in page cache, + * but have not reached the client fs yet. So, the hole is not + * present there. + */ + int i; + int start_idx = num_read >> PAGE_CACHE_SHIFT; + size_t off = num_read & (PAGE_CACHE_SIZE - 1); - loff_t pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, attr_ver); + for (i = start_idx; i < req->num_pages; i++) { + zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + off = 0; + } + } else { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } } static int fuse_readpage(struct file *file, struct page *page) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d468643..c668c84 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool is_wb = fc->writeback_cache; loff_t oldsize; struct timespec old_mtime; @@ -211,10 +212,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; - i_size_write(inode, attr->size); + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!is_wb || !S_ISREG(inode->i_mode)) + i_size_write(inode, attr->size); spin_unlock(&fc->lock); - if (S_ISREG(inode->i_mode)) { + if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { -- cgit v0.10.2 From b0aa760652179072119582375f8dc896ed5b5dfd Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 26 Dec 2013 19:51:11 +0400 Subject: fuse: Trust kernel i_mtime only Let the kernel maintain i_mtime locally: - clear S_NOCMTIME - implement i_op->update_time() - flush mtime on fsync and last close - update i_mtime explicitly on truncate and fallocate Fuse inode flag FUSE_I_MTIME_DIRTY serves as indication that local i_mtime should be flushed to the server eventually. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c52f143..5b4e035 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -842,8 +842,11 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); /* see the comment in fuse_change_attributes() */ - if (fc->writeback_cache && S_ISREG(inode->i_mode)) + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + } stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -1482,12 +1485,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid) +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; @@ -1496,7 +1503,8 @@ static bool update_mtime(unsigned ivalid) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_mtime) { unsigned ivalid = iattr->ia_valid; @@ -1515,11 +1523,11 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET)) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) arg->valid |= FATTR_MTIME_NOW; } } @@ -1568,6 +1576,63 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*inarg_p); + req->in.args[0].value = inarg_p; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(*outarg_p); + req->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_mtime(struct file *file, bool nofail) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req = NULL; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + int err; + + if (nofail) { + req = fuse_get_req_nofail_nopages(fc, file); + } else { + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid |= FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + + return err; +} + /* * Set attributes, and at the same time refresh them. * @@ -1588,6 +1653,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; + bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1616,7 +1682,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg); + iattr_to_fattr(attr, &inarg, trust_local_mtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1627,17 +1693,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1654,6 +1710,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, } spin_lock(&fc->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { + inode->i_mtime = attr->ia_mtime; + clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + } + fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; @@ -1884,6 +1946,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name) return err; } +static int fuse_update_time(struct inode *inode, struct timespec *now, + int flags) +{ + if (flags & S_MTIME) { + inode->i_mtime = *now; + set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); + BUG_ON(!S_ISREG(inode->i_mode)); + } + return 0; +} + static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, @@ -1923,6 +1996,7 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, + .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c091a17..69de9b8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -308,6 +308,9 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) + fuse_flush_mtime(file, true); + fuse_release_common(file, FUSE_RELEASE); /* return value is ignored by VFS */ @@ -475,6 +478,12 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); + if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { + int err = fuse_flush_mtime(file, false); + if (err) + goto out; + } + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -960,16 +969,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, return req->misc.write.out.size; } -void fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + ret = true; + } spin_unlock(&fc->lock); + + return ret; } static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, @@ -2877,8 +2891,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; /* we could have extended the file */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) - fuse_write_update_size(inode, offset + length); + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + bool changed = fuse_write_update_size(inode, offset + length); + + if (changed && fc->writeback_cache) { + struct fuse_inode *fi = get_fuse_inode(inode); + + inode->i_mtime = current_fs_time(inode->i_sb); + set_bit(FUSE_I_MTIME_DIRTY, &fi->state); + } + } if (mode & FALLOC_FL_PUNCH_HOLE) truncate_pagecache_range(inode, offset, offset + length - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 374a8be..1e6ad6d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,6 +119,8 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, + /** i_mtime has been updated locally; a flush to userspace needed */ + FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -876,7 +878,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -void fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_mtime(struct file *file, bool nofail); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c668c84..1061b0d9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -170,8 +170,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + /* mtime from server may be stale due to local buffered write */ + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + } inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; @@ -250,6 +253,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -296,7 +301,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return NULL; if ((inode->i_state & I_NEW)) { - inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); -- cgit v0.10.2 From e7cc133c370f541fa16723ad7df24de375c26fce Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:19:06 +0400 Subject: fuse: Flush files on wb close Any write request requires a file handle to report to the userspace. Thus when we close a file (and free the fuse_file with this info) we have to flush all the outstanding dirty pages. filemap_write_and_wait() is enough because every page under fuse writeback is accounted in ff->count. This delays actual close until all fuse wb is completed. In case of "write cache" turned off, the flush is ensured by fuse_vma_close(). Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 69de9b8..530b1e8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -308,6 +308,12 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see fuse_vma_close() for !writeback_cache case */ + if (fc->writeback_cache) + filemap_write_and_wait(file->f_mapping); + if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) fuse_flush_mtime(file, true); -- cgit v0.10.2 From 482fce55d2809d639fd0d2e6249c89dedc20eeae Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 10 Oct 2013 17:11:25 +0400 Subject: fuse: restructure fuse_readpage() Move the code filling and sending read request to a separate function. Future patches will use it for .write_begin -- partial modification of a page requires reading the page from the storage very similarly to what fuse_readpage does. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 530b1e8..b1873b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -712,7 +712,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, } } -static int fuse_readpage(struct file *file, struct page *page) +static int fuse_do_readpage(struct file *file, struct page *page) { struct fuse_io_priv io = { .async = 0, .file = file }; struct inode *inode = page->mapping->host; @@ -724,10 +724,6 @@ static int fuse_readpage(struct file *file, struct page *page) u64 attr_ver; int err; - err = -EIO; - if (is_bad_inode(inode)) - goto out; - /* * Page writeback can extend beyond the lifetime of the * page-cache page, so make sure we read a properly synced @@ -736,9 +732,8 @@ static int fuse_readpage(struct file *file, struct page *page) fuse_wait_on_page_writeback(inode, page->index); req = fuse_get_req(fc, 1); - err = PTR_ERR(req); if (IS_ERR(req)) - goto out; + return PTR_ERR(req); attr_ver = fuse_get_attr_version(fc); @@ -761,6 +756,20 @@ static int fuse_readpage(struct file *file, struct page *page) } fuse_put_request(fc, req); + + return err; +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = fuse_do_readpage(file, page); fuse_invalidate_atime(inode); out: unlock_page(page); -- cgit v0.10.2 From 6b12c1b37e5556af073c1ebfa04c1f9df3a2beaf Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:11:43 +0400 Subject: fuse: Implement write_begin/write_end callbacks The .write_begin and .write_end are requiered to use generic routines (generic_file_aio_write --> ... --> generic_perform_write) for buffered writes. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b1873b5..d857530 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1952,6 +1952,77 @@ out: return err; } +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode); + struct page *page; + loff_t fsize; + int err = -ENOMEM; + + WARN_ON(!fc->writeback_cache); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + goto error; + + fuse_wait_on_page_writeback(mapping->host, page->index); + + if (PageUptodate(page) || len == PAGE_CACHE_SIZE) + goto success; + /* + * Check if the start this page comes after the end of file, in which + * case the readpage can be optimized away. + */ + fsize = i_size_read(mapping->host); + if (fsize <= (pos & PAGE_CACHE_MASK)) { + size_t off = pos & ~PAGE_CACHE_MASK; + if (off) + zero_user_segment(page, 0, off); + goto success; + } + err = fuse_do_readpage(file, page); + if (err) + goto cleanup; +success: + *pagep = page; + return 0; + +cleanup: + unlock_page(page); + page_cache_release(page); +error: + return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + if (!PageUptodate(page)) { + /* Zero any unwritten bytes at the end of the page */ + size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; + if (endoff) + zero_user_segment(page, endoff, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + fuse_write_update_size(inode, pos + copied); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + static int fuse_launder_page(struct page *page) { int err = 0; @@ -2979,6 +3050,8 @@ static const struct address_space_operations fuse_file_aops = { .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, }; void fuse_init_file_inode(struct inode *inode) -- cgit v0.10.2 From fe38d7df230b022e72014ef7aa799a4f2acfecf3 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 10 Oct 2013 17:11:54 +0400 Subject: fuse: fuse_flush() should wait on writeback The aim of .flush fop is to hint file-system that flushing its state or caches or any other important data to reliable storage would be desirable now. fuse_flush() passes this hint by sending FUSE_FLUSH request to userspace. However, dirty pages and pages under writeback may be not visible to userspace yet if we won't ensure it explicitly. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d857530..d93f2a1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -401,6 +401,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) return 0; } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -416,6 +431,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; + err = filemap_write_and_wait(file->f_mapping); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + mutex_unlock(&inode->i_mutex); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -436,21 +459,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ -static void fuse_sync_writes(struct inode *inode) -{ - fuse_set_nowrite(inode); - fuse_release_nowrite(inode); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int isdir) { -- cgit v0.10.2 From ea8cd33390fafc1eca06a26e6a9c7bf1d386526f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:12:05 +0400 Subject: fuse: Fix O_DIRECT operations vs cached writeback misorder The problem is: 1. write cached data to a file 2. read directly from the same file (via another fd) The 2nd operation may read stale data, i.e. the one that was in a file before the 1st op. Problem is in how fuse manages writeback. When direct op occurs the core kernel code calls filemap_write_and_wait to flush all the cached ops in flight. But fuse acks the writeback right after the ->writepages callback exits w/o waiting for the real write to happen. Thus the subsequent direct op proceeds while the real writeback is still in flight. This is a problem for backends that reorder operation. Fix this by making the fuse direct IO callback explicitly wait on the in-flight writeback to finish. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b96a49b..23e363f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, struct iovec iov = { .iov_base = buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; - return fuse_direct_io(&io, &iov, 1, count, &pos, 0); + return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf, * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(&io, &iov, 1, count, &pos, 1); + return fuse_direct_io(&io, &iov, 1, count, &pos, + FUSE_DIO_WRITE | FUSE_DIO_CUSE); } static int cuse_open(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d93f2a1..2764330 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -358,12 +358,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) } /* - * Check if page is under writeback + * Check if any page in a range is under writeback * * This is currently done by walking the list of writepage requests * for the inode, which can be pretty inefficient. */ -static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -376,8 +377,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (curr_index <= index && - index < curr_index + req->num_pages) { + if (idx_from < curr_index + req->num_pages && + curr_index <= idx_to) { found = true; break; } @@ -387,6 +388,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) return found; } +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + /* * Wait for page writeback to be completed. * @@ -1364,13 +1370,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p) ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, - int write) + int flags) { + int write = flags & FUSE_DIO_WRITE; + int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; + struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; + pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; ssize_t res = 0; struct fuse_req *req; struct iov_iter ii; @@ -1384,6 +1395,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, if (IS_ERR(req)) return PTR_ERR(req); + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!write) + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + if (!write) + mutex_unlock(&inode->i_mutex); + } + while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1472,7 +1491,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io, res = generic_write_checks(file, ppos, &count, 0); if (!res) - res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); + res = fuse_direct_io(io, iov, nr_segs, count, ppos, + FUSE_DIO_WRITE); fuse_invalidate_attr(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1e6ad6d..a257ed8e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -868,9 +868,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, - int write); + int flags); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, -- cgit v0.10.2 From 4d99ff8f12eb20c6cde292f185cb1c8c334ba0ed Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 10 Oct 2013 17:12:18 +0400 Subject: fuse: Turn writeback cache on Introduce a bit kernel and userspace exchange between each-other on the init stage and turn writeback on if the userspace want this and mount option 'allow_wbcache' is present (controlled by fusermount). Also add each writable file into per-inode write list and call the generic_file_aio_write to make use of the Linux page cache engine. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2764330..d03a35d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -224,6 +224,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) spin_unlock(&fc->lock); fuse_invalidate_attr(inode); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) @@ -1197,6 +1199,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct iov_iter i; loff_t endbyte = 0; + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, NULL, file, NULL); + if (err) + return err; + + return generic_file_aio_write(iocb, iov, nr_segs, pos); + } + WARN_ON(iocb->ki_pos != pos); ocount = 0; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1061b0d9..9ba1919 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -887,6 +887,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; + if (arg->flags & FUSE_WRITEBACK_CACHE) + fc->writeback_cache = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -914,7 +916,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | - FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | + FUSE_WRITEBACK_CACHE; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 60bb2f9..cf4750e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -93,6 +93,9 @@ * * 7.22 * - add FUSE_ASYNC_DIO + * + * 7.23 + * - add FUSE_WRITEBACK_CACHE */ #ifndef _LINUX_FUSE_H @@ -128,7 +131,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 22 +#define FUSE_KERNEL_MINOR_VERSION 23 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -219,6 +222,7 @@ struct fuse_file_lock { * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) * FUSE_READDIRPLUS_AUTO: adaptive readdirplus * FUSE_ASYNC_DIO: asynchronous direct I/O submission + * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -236,6 +240,7 @@ struct fuse_file_lock { #define FUSE_DO_READDIRPLUS (1 << 13) #define FUSE_READDIRPLUS_AUTO (1 << 14) #define FUSE_ASYNC_DIO (1 << 15) +#define FUSE_WRITEBACK_CACHE (1 << 16) /** * CUSE INIT request/reply flags -- cgit v0.10.2 From f3846266f593595632a07242fcbc6c24bc2ade68 Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Wed, 5 Feb 2014 15:24:57 -0800 Subject: fuse: fix "uninitialized variable" warning Fix the following warning: In file included from include/linux/fs.h:16:0, from fs/fuse/fuse_i.h:13, from fs/fuse/file.c:9: fs/fuse/file.c: In function 'fuse_file_poll': include/linux/rbtree.h:82:28: warning: 'parent' may be used uninitialized in this function [-Wmaybe-uninitialized] fs/fuse/file.c:2592:27: note: 'parent' was declared here Signed-off-by: Rajat Jain Signed-off-by: Miklos Szeredi diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d03a35d..65df7d8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2772,7 +2772,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *parent; + struct rb_node **link, *uninitialized_var(parent); link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); -- cgit v0.10.2