summaryrefslogtreecommitdiff
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c640
1 files changed, 420 insertions, 220 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1faded..75899a0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
@@ -44,11 +45,12 @@
* and be replaced with calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
- int write_bytes,
+ size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
- size_t copied;
+ size_t copied = 0;
+ size_t total_copied = 0;
int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -56,23 +58,38 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
-again:
- if (unlikely(iov_iter_fault_in_readable(i, count)))
- return -EFAULT;
-
- /* Copy data from userspace to the current page */
- copied = iov_iter_copy_from_user(page, i, offset, count);
+ /*
+ * Copy data from userspace to the current page
+ *
+ * Disable pagefault to avoid recursive lock since
+ * the pages are already locked
+ */
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+ pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
+
+ /*
+ * if we get a partial write, we can end up with
+ * partially up to date pages. These add
+ * a lot of complexity, so make sure they don't
+ * happen by forcing this copy to be retried.
+ *
+ * The rest of the btrfs_file_write code will fall
+ * back to page at a time copies after we return 0.
+ */
+ if (!PageUptodate(page) && copied < count)
+ copied = 0;
+
iov_iter_advance(i, copied);
write_bytes -= copied;
+ total_copied += copied;
- if (unlikely(copied == 0)) {
- count = min_t(size_t, PAGE_CACHE_SIZE - offset,
- iov_iter_single_seg_count(i));
- goto again;
- }
+ /* Return to btrfs_file_aio_write to fault page */
+ if (unlikely(copied == 0))
+ break;
if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
offset += copied;
@@ -81,18 +98,16 @@ again:
offset = 0;
}
}
- return 0;
+ return total_copied;
}
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
- if (!pages[i])
- break;
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* clear it here
@@ -112,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct file *file,
- struct page **pages,
- size_t num_pages,
- loff_t pos,
- size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
int err = 0;
int i;
- struct inode *inode = fdentry(file)->d_inode;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -135,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- NULL);
- BUG_ON(err);
+ cached);
+ if (err)
+ return err;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -144,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
ClearPageChecked(p);
set_page_dirty(p);
}
- if (end_pos > isize) {
+
+ /*
+ * we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ if (end_pos > isize)
i_size_write(inode, end_pos);
- /* we've only changed i_size in ram, and we haven't updated
- * the disk i_size. There is no need to log the inode
- * at this time.
- */
- }
return 0;
}
@@ -181,6 +194,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split = alloc_extent_map(GFP_NOFS);
if (!split2)
split2 = alloc_extent_map(GFP_NOFS);
+ BUG_ON(!split || !split2);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -220,6 +234,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
@@ -234,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
if (compressed) {
split->block_len = em->block_len;
@@ -589,6 +605,8 @@ again:
key.offset = split;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -755,6 +773,27 @@ out:
}
/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+ int ret = 0;
+
+ if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ret;
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+/*
* this gets pages into the page cache and locks them down, it also properly
* waits for data=ordered extents to finish before allowing the pages to be
* modified.
@@ -769,6 +808,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
int err = 0;
+ int faili = 0;
u64 start_pos;
u64 last_pos;
@@ -776,21 +816,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
if (start_pos > inode->i_size) {
- err = btrfs_cont_expand(inode, start_pos);
+ err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
if (err)
return err;
}
- memset(pages, 0, num_pages * sizeof(struct page *));
again:
for (i = 0; i < num_pages; i++) {
pages[i] = grab_cache_page(inode->i_mapping, index + i);
if (!pages[i]) {
+ faili = i - 1;
err = -ENOMEM;
- BUG_ON(1);
+ goto fail;
+ }
+
+ if (i == 0)
+ err = prepare_uptodate_page(pages[i], pos);
+ if (i == num_pages - 1)
+ err = prepare_uptodate_page(pages[i],
+ pos + write_bytes);
+ if (err) {
+ page_cache_release(pages[i]);
+ faili = i - 1;
+ goto fail;
}
wait_on_page_writeback(pages[i]);
}
+ err = 0;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -830,199 +882,264 @@ again:
WARN_ON(!PageLocked(pages[i]));
}
return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ page_cache_release(pages[faili]);
+ faili--;
+ }
+ return err;
+
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+ struct iov_iter *i,
+ loff_t pos)
{
- struct file *file = iocb->ki_filp;
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page *pinned[2];
struct page **pages = NULL;
- struct iov_iter i;
- loff_t *ppos = &iocb->ki_pos;
- loff_t start_pos;
- ssize_t num_written = 0;
- ssize_t err = 0;
- size_t count;
- size_t ocount;
- int ret = 0;
- int nrptrs;
unsigned long first_index;
unsigned long last_index;
- int will_write;
- int buffered = 0;
+ size_t num_written = 0;
+ int nrptrs;
+ int ret = 0;
- will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
- (file->f_flags & O_DIRECT));
+ nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+ (sizeof(struct page *)));
+ pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
- pinned[0] = NULL;
- pinned[1] = NULL;
+ first_index = pos >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
- start_pos = pos;
+ while (iov_iter_count(i) > 0) {
+ size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t write_bytes = min(iov_iter_count(i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
+ offset);
+ size_t num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t dirty_pages;
+ size_t copied;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ WARN_ON(num_pages > nrptrs);
- mutex_lock(&inode->i_mutex);
+ /*
+ * Fault pages before locking them in prepare_pages
+ * to avoid recursive lock
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ ret = -EFAULT;
+ break;
+ }
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- goto out;
- count = ocount;
+ ret = btrfs_delalloc_reserve_space(inode,
+ num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
+ break;
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
+ /*
+ * This is going to setup the pages array with the number of
+ * pages we want, so we don't really need to worry about the
+ * contents of pages from loop to loop
+ */
+ ret = prepare_pages(root, file, pages, num_pages,
+ pos, first_index, last_index,
+ write_bytes);
+ if (ret) {
+ btrfs_delalloc_release_space(inode,
+ num_pages << PAGE_CACHE_SHIFT);
+ break;
+ }
- if (count == 0)
- goto out;
+ copied = btrfs_copy_from_user(pos, num_pages,
+ write_bytes, pages, i);
- err = file_remove_suid(file);
- if (err)
- goto out;
+ /*
+ * if we have trouble faulting in the pages, fall
+ * back to one page at a time
+ */
+ if (copied < write_bytes)
+ nrptrs = 1;
- file_update_time(file);
- BTRFS_I(inode)->sequence++;
+ if (copied == 0)
+ dirty_pages = 0;
+ else
+ dirty_pages = (copied + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
- if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = generic_file_direct_write(iocb, iov, &nr_segs,
- pos, ppos, count,
- ocount);
/*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
+ * If we had a short copy we need to release the excess delaloc
+ * bytes we reserved. We need to increment outstanding_extents
+ * because btrfs_delalloc_release_space will decrement it, but
+ * we still have an outstanding extent for the chunk we actually
+ * managed to copy.
*/
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
+ if (num_pages > dirty_pages) {
+ if (copied > 0)
+ atomic_inc(
+ &BTRFS_I(inode)->outstanding_extents);
+ btrfs_delalloc_release_space(inode,
+ (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT);
}
- if (num_written < 0) {
- ret = num_written;
- num_written = 0;
- goto out;
- } else if (num_written == count) {
- /* pick up pos changes done by the generic code */
- pos = *ppos;
- goto out;
+ if (copied > 0) {
+ ret = btrfs_dirty_pages(root, inode, pages,
+ dirty_pages, pos, copied,
+ NULL);
+ if (ret) {
+ btrfs_delalloc_release_space(inode,
+ dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_drop_pages(pages, num_pages);
+ break;
+ }
}
- /*
- * We are going to do buffered for the rest of the range, so we
- * need to make sure to invalidate the buffered pages when we're
- * done.
- */
- buffered = 1;
- pos += num_written;
+
+ btrfs_drop_pages(pages, num_pages);
+
+ cond_resched();
+
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_pages);
+ if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
+
+ pos += copied;
+ num_written += copied;
}
- iov_iter_init(&i, iov, nr_segs, count, num_written);
- nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
- PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
- (sizeof(struct page *)));
- pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+ kfree(pages);
- /* generic_write_checks can change our pos */
- start_pos = pos;
+ return num_written ? num_written : ret;
+}
- first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos,
+ loff_t *ppos, size_t count, size_t ocount)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct iov_iter i;
+ ssize_t written;
+ ssize_t written_buffered;
+ loff_t endbyte;
+ int err;
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+ count, ocount);
/*
- * there are lots of better ways to do this, but this code
- * makes sure the first and last page in the file range are
- * up to date and ready for cow
+ * the generic O_DIRECT will update in-memory i_size after the
+ * DIOs are done. But our endio handlers that update the on
+ * disk i_size never update past the in memory i_size. So we
+ * need one more update here to catch any additions to the
+ * file
*/
- if ((pos & (PAGE_CACHE_SIZE - 1))) {
- pinned[0] = grab_cache_page(inode->i_mapping, first_index);
- if (!PageUptodate(pinned[0])) {
- ret = btrfs_readpage(NULL, pinned[0]);
- BUG_ON(ret);
- wait_on_page_locked(pinned[0]);
- } else {
- unlock_page(pinned[0]);
- }
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
}
- if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
- pinned[1] = grab_cache_page(inode->i_mapping, last_index);
- if (!PageUptodate(pinned[1])) {
- ret = btrfs_readpage(NULL, pinned[1]);
- BUG_ON(ret);
- wait_on_page_locked(pinned[1]);
- } else {
- unlock_page(pinned[1]);
- }
+
+ if (written < 0 || written == count)
+ return written;
+
+ pos += written;
+ count -= written;
+ iov_iter_init(&i, iov, nr_segs, count, written);
+ written_buffered = __btrfs_buffered_write(file, &i, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
}
+ endbyte = pos + written_buffered - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (err)
+ goto out;
+ written += written_buffered;
+ *ppos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+out:
+ return written ? written : err;
+}
- while (iov_iter_count(&i) > 0) {
- size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(iov_iter_count(&i),
- nrptrs * (size_t)PAGE_CACHE_SIZE -
- offset);
- size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ loff_t *ppos = &iocb->ki_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ size_t count, ocount;
- WARN_ON(num_pages > nrptrs);
- memset(pages, 0, sizeof(struct page *) * nrptrs);
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
- ret = btrfs_delalloc_reserve_space(inode, write_bytes);
- if (ret)
- goto out;
+ mutex_lock(&inode->i_mutex);
- ret = prepare_pages(root, file, pages, num_pages,
- pos, first_index, last_index,
- write_bytes);
- if (ret) {
- btrfs_delalloc_release_space(inode, write_bytes);
- goto out;
- }
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ count = ocount;
- ret = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, &i);
- if (ret == 0) {
- dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
- }
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
- btrfs_drop_pages(pages, num_pages);
- if (ret) {
- btrfs_delalloc_release_space(inode, write_bytes);
- goto out;
- }
+ if (count == 0) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
- if (will_write) {
- filemap_fdatawrite_range(inode->i_mapping, pos,
- pos + write_bytes - 1);
- } else {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- num_pages);
- if (num_pages <
- (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
- }
+ err = file_remove_suid(file);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
- pos += write_bytes;
- num_written += write_bytes;
+ /*
+ * If BTRFS flips readonly due to some impossible error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ mutex_unlock(&inode->i_mutex);
+ err = -EROFS;
+ goto out;
+ }
- cond_resched();
+ file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = __btrfs_direct_write(iocb, iov, nr_segs,
+ pos, ppos, count, ocount);
+ } else {
+ struct iov_iter i;
+
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+
+ num_written = __btrfs_buffered_write(file, &i, pos);
+ if (num_written > 0)
+ *ppos = pos + num_written;
}
-out:
- mutex_unlock(&inode->i_mutex);
- if (ret)
- err = ret;
- kfree(pages);
- if (pinned[0])
- page_cache_release(pinned[0]);
- if (pinned[1])
- page_cache_release(pinned[1]);
- *ppos = pos;
+ mutex_unlock(&inode->i_mutex);
/*
* we want to make sure fsync finds this change
@@ -1037,43 +1154,12 @@ out:
* one running right now.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
-
- if (num_written > 0 && will_write) {
- struct btrfs_trans_handle *trans;
-
- err = btrfs_wait_ordered_range(inode, start_pos, num_written);
- if (err)
+ if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ err = generic_write_sync(file, pos, num_written);
+ if (err < 0 && num_written > 0)
num_written = err;
-
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- num_written = PTR_ERR(trans);
- goto done;
- }
- mutex_lock(&inode->i_mutex);
- ret = btrfs_log_dentry_safe(trans, root,
- file->f_dentry);
- mutex_unlock(&inode->i_mutex);
- if (ret == 0) {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- btrfs_end_transaction(trans, root);
- else
- btrfs_commit_transaction(trans, root);
- } else if (ret != BTRFS_NO_LOG_SYNC) {
- btrfs_commit_transaction(trans, root);
- } else {
- btrfs_end_transaction(trans, root);
- }
- }
- if (file->f_flags & O_DIRECT && buffered) {
- invalidate_mapping_pages(inode->i_mapping,
- start_pos >> PAGE_CACHE_SHIFT,
- (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
- }
}
-done:
+out:
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
@@ -1116,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync)
int ret = 0;
struct btrfs_trans_handle *trans;
+ trace_btrfs_sync_file(file, datasync);
/* we wait first, since the writeback may change the inode */
root->log_batch++;
@@ -1209,6 +1296,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct extent_state *cached_state = NULL;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, i_size_read(inode),
+ alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -1220,6 +1419,7 @@ const struct file_operations btrfs_file_operations = {
.open = generic_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,