From 18ec7d5c3f434aed9661ed10a9e1f48cdeb4981d Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 8 Feb 2006 11:50:51 +0000 Subject: [GFS2] Make journaled data files identical to normal files on disk This is a very large patch, with a few still-to-be-resolved issues, so you might want to check out the previous head of the tree since this is known to be unstable. Fixes for the various bugs will be forthcoming shortly. This patch removes the special data format which has been used up till now for journaled data files. Directories still retain the old format so that they will remain on-disk compatible with earlier releases. As a result you can now do the following with journaled data files: 1) mmap them 2) export them over NFS 3) convert to/from normal files whenever you want to (the zero-length restriction is gone) In addition, the level at which GFS' locking is done has changed for all files (since they all now use the page cache) such that the locking is done at the page cache level rather than the level of the fs operations. This should mean that things like loopback mounts and other things which touch the page cache directly should now work. Current known issues: 1. There is a lock mode inversion problem related to the resource group hold function which needs to be resolved. 2. Any significant amount of I/O causes an oops with an offset of hex 320 (NULL pointer dereference) which appears to be related to a journaled data buffer appearing on a list where it shouldn't be. 3. Direct I/O writes are disabled for the time being (will reappear later) 4. There is probably a deadlock between the page lock and GFS' locks under certain combinations of mmap and fs operation I/O. 5. Issue relating to ref counting on internally used inodes causes a hang on umount (discovered before this patch, and not fixed by it) 6. One part of the directory metadata is different from GFS1 and will need to be resolved before next release. 
Signed-off-by: Steven Whitehouse diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 4e87b86..88f9279 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -10,7 +10,6 @@ gfs2-y := \ glock.o \ glops.o \ inode.o \ - jdata.o \ lm.o \ log.o \ lops.o \ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index bd194f6..4efcd8a 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -18,12 +18,12 @@ #include "bmap.h" #include "glock.h" #include "inode.h" -#include "jdata.h" #include "meta_io.h" #include "page.h" #include "quota.h" #include "rgrp.h" #include "trans.h" +#include "dir.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to @@ -90,7 +90,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer, { struct buffer_head *bh, *dibh; uint64_t block = 0; - int journaled = gfs2_is_jdata(ip); + int isdir = gfs2_is_dir(ip); int error; down_write(&ip->i_rw_mutex); @@ -103,10 +103,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer, /* Get a free block, fill it with the stuffed data, and write it out to disk */ - if (journaled) { + if (isdir) { block = gfs2_alloc_meta(ip); - error = gfs2_jdata_get_buffer(ip, block, 1, &bh); + error = gfs2_dir_get_buffer(ip, block, 1, &bh); if (error) goto out_brelse; gfs2_buffer_copy_tail(bh, @@ -168,7 +168,7 @@ static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size) if (ip->i_di.di_size > size) size = ip->i_di.di_size; - if (gfs2_is_jdata(ip)) { + if (gfs2_is_dir(ip)) { arr = sdp->sd_jheightsize; max = sdp->sd_max_jheight; } else { @@ -377,7 +377,7 @@ static void lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, return; if (height == ip->i_di.di_height - 1 && - !gfs2_is_jdata(ip)) + !gfs2_is_dir(ip)) *block = gfs2_alloc_data(ip); else *block = gfs2_alloc_meta(ip); @@ -430,7 +430,7 @@ int gfs2_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new, if (gfs2_assert_warn(sdp, 
!gfs2_is_stuffed(ip))) goto out; - bsize = (gfs2_is_jdata(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; + bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; height = calc_tree_height(ip, (lblock + 1) * bsize); if (ip->i_di.di_height < height) { @@ -618,7 +618,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, sm->sm_first = 0; } - metadata = (height != ip->i_di.di_height - 1) || gfs2_is_jdata(ip); + metadata = (height != ip->i_di.di_height - 1); if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; @@ -814,33 +814,6 @@ static int do_grow(struct gfs2_inode *ip, uint64_t size) return error; } -static int truncator_journaled(struct gfs2_inode *ip, uint64_t size) -{ - uint64_t lbn, dbn; - uint32_t off; - struct buffer_head *bh; - int new = 0; - int error; - - lbn = size; - off = do_div(lbn, ip->i_sbd->sd_jbsize); - - error = gfs2_block_map(ip, lbn, &new, &dbn, NULL); - if (error || !dbn) - return error; - - error = gfs2_jdata_get_buffer(ip, dbn, 0, &bh); - if (error) - return error; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header) + off); - - brelse(bh); - - return 0; -} - static int trunc_start(struct gfs2_inode *ip, uint64_t size) { struct gfs2_sbd *sdp = ip->i_sbd; @@ -866,12 +839,7 @@ static int trunc_start(struct gfs2_inode *ip, uint64_t size) error = 1; } else { - if (journaled) { - uint64_t junk = size; - /* we're just interested in the modulus */ - if (do_div(junk, sdp->sd_jbsize)) - error = truncator_journaled(ip, size); - } else if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1)) + if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1)) error = gfs2_block_truncate_page(ip->i_vnode->i_mapping); if (!error) { @@ -900,10 +868,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size) if (!size) lblock = 0; - else if (gfs2_is_jdata(ip)) { - lblock = size - 1; - do_div(lblock, ip->i_sbd->sd_jbsize); - } else + else lblock = (size - 1) >> 
ip->i_sbd->sd_sb.sb_bsize_shift; find_metapath(ip, lblock, &mp); @@ -1051,7 +1016,7 @@ void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, struct gfs2_sbd *sdp = ip->i_sbd; unsigned int tmp; - if (gfs2_is_jdata(ip)) { + if (gfs2_is_dir(ip)) { *data_blocks = DIV_RU(len, sdp->sd_jbsize) + 2; *ind_blocks = 3 * (sdp->sd_max_jheight - 1); } else { @@ -1096,7 +1061,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset, return 0; } - if (gfs2_is_jdata(ip)) { + if (gfs2_is_dir(ip)) { unsigned int bsize = sdp->sd_jbsize; lblock = offset; do_div(lblock, bsize); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index ada283a0..c77e180 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -86,8 +86,8 @@ typedef int (*leaf_call_t) (struct gfs2_inode *dip, uint32_t index, uint32_t len, uint64_t leaf_no, void *data); -static int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new, - struct buffer_head **bhp) +int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new, + struct buffer_head **bhp) { struct buffer_head *bh; int error = 0; diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index ff6d1c5..5b01497 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -45,5 +45,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); int gfs2_diradd_alloc_required(struct gfs2_inode *dip, struct qstr *filename, int *alloc_required); +int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new, + struct buffer_head **bhp); #endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index e42ae38..214975c 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -20,6 +20,11 @@ static inline int gfs2_is_jdata(struct gfs2_inode *ip) return ip->i_di.di_flags & GFS2_DIF_JDATA; } +static inline int gfs2_is_dir(struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_di.di_mode); +} + void gfs2_inode_attr_in(struct gfs2_inode *ip); void gfs2_inode_attr_out(struct gfs2_inode *ip); struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip); @@ 
-72,9 +77,9 @@ static inline int gfs2_lookup_simple(struct inode *dip, char *name, err = gfs2_lookupi(get_v2ip(dip), &qstr, 1, &ip); if (err == 0) { *ipp = gfs2_ip2v(ip); + gfs2_inode_put(ip); if (*ipp == NULL) err = -ENOMEM; - gfs2_inode_put(ip); } return err; } diff --git a/fs/gfs2/jdata.c b/fs/gfs2/jdata.c deleted file mode 100644 index e43eaf1..0000000 --- a/fs/gfs2/jdata.c +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License v.2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "bmap.h" -#include "inode.h" -#include "jdata.h" -#include "meta_io.h" -#include "trans.h" - -int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) -{ - return gfs2_jdata_read_mem(ip, buf, *pos, size); -} - -int gfs2_jdata_get_buffer(struct gfs2_inode *ip, uint64_t block, int new, - struct buffer_head **bhp) -{ - struct buffer_head *bh; - int error = 0; - - if (new) { - bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); - gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); - } else { - error = gfs2_meta_read(ip->i_gl, block, - DIO_START | DIO_WAIT, &bh); - if (error) - return error; - if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_JD)) { - brelse(bh); - return -EIO; - } - } - - *bhp = bh; - - return 0; -} - -/** - * gfs2_copy2mem - Trivial copy function for gfs2_jdata_read() - * @bh: The buffer to copy from, or NULL meaning zero the buffer - * @buf: The buffer to copy/zero - * @offset: The offset in the buffer to copy from - * @size: The amount of data to copy/zero 
- * - * Returns: errno - */ - -int gfs2_copy2mem(struct buffer_head *bh, char **buf, unsigned int offset, - unsigned int size) -{ - if (bh) - memcpy(*buf, bh->b_data + offset, size); - else - memset(*buf, 0, size); - *buf += size; - return 0; -} - -/** - * gfs2_copy2user - Copy bytes to user space for gfs2_jdata_read() - * @bh: The buffer - * @buf: The destination of the data - * @offset: The offset into the buffer - * @size: The amount of data to copy - * - * Returns: errno - */ - -int gfs2_copy2user(struct buffer_head *bh, char **buf, unsigned int offset, - unsigned int size) -{ - int error; - - if (bh) - error = copy_to_user(*buf, bh->b_data + offset, size); - else - error = clear_user(*buf, size); - - if (error) - error = -EFAULT; - else - *buf += size; - - return error; -} - -static int jdata_read_stuffed(struct gfs2_inode *ip, char *buf, - unsigned int offset, unsigned int size, - read_copy_fn_t copy_fn) -{ - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (!error) { - error = copy_fn(dibh, &buf, - offset + sizeof(struct gfs2_dinode), size); - brelse(dibh); - } - - return (error) ? error : size; -} - -/** - * gfs2_jdata_read - Read a jdata file - * @ip: The GFS2 Inode - * @buf: The buffer to place result into - * @offset: File offset to begin jdata_readng from - * @size: Amount of data to transfer - * @copy_fn: Function to actually perform the copy - * - * The @copy_fn only copies a maximum of a single block at once so - * we are safe calling it with int arguments. It is done so that - * we don't needlessly put 64bit arguments on the stack and it - * also makes the code in the @copy_fn nicer too. 
- * - * Returns: The amount of data actually copied or the error - */ - -int gfs2_jdata_read(struct gfs2_inode *ip, char __user *buf, uint64_t offset, - unsigned int size, read_copy_fn_t copy_fn) -{ - struct gfs2_sbd *sdp = ip->i_sbd; - uint64_t lblock, dblock; - uint32_t extlen = 0; - unsigned int o; - int copied = 0; - int error = 0; - - if (offset >= ip->i_di.di_size) - return 0; - - if ((offset + size) > ip->i_di.di_size) - size = ip->i_di.di_size - offset; - - if (!size) - return 0; - - if (gfs2_is_stuffed(ip)) - return jdata_read_stuffed(ip, buf, (unsigned int)offset, size, - copy_fn); - - if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) - return -EINVAL; - - lblock = offset; - o = do_div(lblock, sdp->sd_jbsize) + - sizeof(struct gfs2_meta_header); - - while (copied < size) { - unsigned int amount; - struct buffer_head *bh; - int new; - - amount = size - copied; - if (amount > sdp->sd_sb.sb_bsize - o) - amount = sdp->sd_sb.sb_bsize - o; - - if (!extlen) { - new = 0; - error = gfs2_block_map(ip, lblock, &new, - &dblock, &extlen); - if (error) - goto fail; - } - - if (extlen > 1) - gfs2_meta_ra(ip->i_gl, dblock, extlen); - - if (dblock) { - error = gfs2_jdata_get_buffer(ip, dblock, new, &bh); - if (error) - goto fail; - dblock++; - extlen--; - } else - bh = NULL; - - error = copy_fn(bh, &buf, o, amount); - brelse(bh); - if (error) - goto fail; - - copied += amount; - lblock++; - - o = sizeof(struct gfs2_meta_header); - } - - return copied; - - fail: - return (copied) ? 
copied : error; -} - -/** - * gfs2_copy_from_mem - Trivial copy function for gfs2_jdata_write() - * @bh: The buffer to copy to or clear - * @buf: The buffer to copy from - * @offset: The offset in the buffer to write to - * @size: The amount of data to write - * - * Returns: errno - */ - -int gfs2_copy_from_mem(struct gfs2_inode *ip, struct buffer_head *bh, - const char **buf, unsigned int offset, unsigned int size) -{ - gfs2_trans_add_bh(ip->i_gl, bh, 1); - memcpy(bh->b_data + offset, *buf, size); - - *buf += size; - - return 0; -} - -/** - * gfs2_copy_from_user - Copy bytes from user space for gfs2_jdata_write() - * @bh: The buffer to copy to or clear - * @buf: The buffer to copy from - * @offset: The offset in the buffer to write to - * @size: The amount of data to write - * - * Returns: errno - */ - -int gfs2_copy_from_user(struct gfs2_inode *ip, struct buffer_head *bh, - const char __user **buf, unsigned int offset, unsigned int size) -{ - int error = 0; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - if (copy_from_user(bh->b_data + offset, *buf, size)) - error = -EFAULT; - else - *buf += size; - - return error; -} - -static int jdata_write_stuffed(struct gfs2_inode *ip, char *buf, - unsigned int offset, unsigned int size, - write_copy_fn_t copy_fn) -{ - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - error = copy_fn(ip, - dibh, &buf, - offset + sizeof(struct gfs2_dinode), size); - if (!error) { - if (ip->i_di.di_size < offset + size) - ip->i_di.di_size = offset + size; - ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - } - - brelse(dibh); - - return (error) ? 
error : size; -} - -/** - * gfs2_jdata_write - Write bytes to a file - * @ip: The GFS2 inode - * @buf: The buffer containing information to be written - * @offset: The file offset to start writing at - * @size: The amount of data to write - * @copy_fn: Function to do the actual copying - * - * Returns: The number of bytes correctly written or error code - */ - -int gfs2_jdata_write(struct gfs2_inode *ip, const char __user *buf, uint64_t offset, - unsigned int size, write_copy_fn_t copy_fn) -{ - struct gfs2_sbd *sdp = ip->i_sbd; - struct buffer_head *dibh; - uint64_t lblock, dblock; - uint32_t extlen = 0; - unsigned int o; - int copied = 0; - int error = 0; - - if (!size) - return 0; - - if (gfs2_is_stuffed(ip) && - offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) - return jdata_write_stuffed(ip, buf, (unsigned int)offset, size, - copy_fn); - - if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) - return -EINVAL; - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL, NULL); - if (error) - return error; - } - - lblock = offset; - o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); - - while (copied < size) { - unsigned int amount; - struct buffer_head *bh; - int new; - - amount = size - copied; - if (amount > sdp->sd_sb.sb_bsize - o) - amount = sdp->sd_sb.sb_bsize - o; - - if (!extlen) { - new = 1; - error = gfs2_block_map(ip, lblock, &new, - &dblock, &extlen); - if (error) - goto fail; - error = -EIO; - if (gfs2_assert_withdraw(sdp, dblock)) - goto fail; - } - - error = gfs2_jdata_get_buffer(ip, dblock, - (amount == sdp->sd_jbsize) ? 
1 : new, - &bh); - if (error) - goto fail; - - error = copy_fn(ip, bh, &buf, o, amount); - brelse(bh); - if (error) - goto fail; - - copied += amount; - lblock++; - dblock++; - extlen--; - - o = sizeof(struct gfs2_meta_header); - } - - out: - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - if (ip->i_di.di_size < offset + copied) - ip->i_di.di_size = offset + copied; - ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - brelse(dibh); - - return copied; - - fail: - if (copied) - goto out; - return error; -} - diff --git a/fs/gfs2/jdata.h b/fs/gfs2/jdata.h deleted file mode 100644 index 95e18fc..0000000 --- a/fs/gfs2/jdata.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License v.2. 
- */ - -#ifndef __FILE_DOT_H__ -#define __FILE_DOT_H__ - -int gfs2_jdata_get_buffer(struct gfs2_inode *ip, uint64_t block, int new, - struct buffer_head **bhp); - -typedef int (*read_copy_fn_t) (struct buffer_head *bh, char **buf, - unsigned int offset, unsigned int size); -typedef int (*write_copy_fn_t) (struct gfs2_inode *ip, - struct buffer_head *bh, const char **buf, - unsigned int offset, unsigned int size); - -int gfs2_copy2mem(struct buffer_head *bh, char **buf, - unsigned int offset, unsigned int size); -int gfs2_copy2user(struct buffer_head *bh, char __user **buf, - unsigned int offset, unsigned int size); -int gfs2_jdata_read(struct gfs2_inode *ip, char __user *buf, - uint64_t offset, unsigned int size, - read_copy_fn_t copy_fn); - -int gfs2_copy_from_mem(struct gfs2_inode *ip, - struct buffer_head *bh, const char **buf, - unsigned int offset, unsigned int size); -int gfs2_copy_from_user(struct gfs2_inode *ip, - struct buffer_head *bh, const char __user **buf, - unsigned int offset, unsigned int size); -int gfs2_jdata_write(struct gfs2_inode *ip, const char __user *buf, - uint64_t offset, unsigned int size, - write_copy_fn_t copy_fn); - -static inline int gfs2_jdata_read_mem(struct gfs2_inode *ip, char *buf, - uint64_t offset, unsigned int size) -{ - return gfs2_jdata_read(ip, (__force char __user *)buf, offset, size, gfs2_copy2mem); -} - -static inline int gfs2_jdata_write_mem(struct gfs2_inode *ip, const char *buf, - uint64_t offset, unsigned int size) -{ - return gfs2_jdata_write(ip, (__force const char __user *)buf, offset, size, gfs2_copy_from_mem); -} - -#endif /* __FILE_DOT_H__ */ diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f6d0013..9b4484d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -387,8 +387,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); atomic_set(&bh->b_count, 1); bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate); - set_bh_page(bh, 
virt_to_page(real->b_data), - ((unsigned long)real->b_data) & (PAGE_SIZE - 1)); + set_bh_page(bh, real->b_page, bh_offset(real)); bh->b_blocknr = blkno; bh->b_size = sdp->sd_sb.sb_bsize; bh->b_bdev = sdp->sd_vfs->s_bdev; @@ -634,6 +633,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a065f76..dd41863 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -428,49 +428,188 @@ static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); } +/** + * databuf_lo_add - Add a databuf to the transaction. + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. 
+ */ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) { - get_transaction->tr_touched = 1; + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr = get_transaction; + struct address_space *mapping = bd->bd_bh->b_page->mapping; + struct gfs2_inode *ip = get_v2ip(mapping->host); + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list_tr) && + (ip->i_di.di_flags & GFS2_DIF_JDATA)) { + tr->tr_num_buf++; + gfs2_trans_add_gl(bd->bd_gl); + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_pin(sdp, bd->bd_bh); + } else { + clear_buffer_pinned(bd->bd_bh); + } gfs2_log_lock(sdp); + if (ip->i_di.di_flags & GFS2_DIF_JDATA) + sdp->sd_log_num_jdata++; sdp->sd_log_num_databuf++; list_add(&le->le_list, &sdp->sd_log_le_databuf); gfs2_log_unlock(sdp); } +static int gfs2_check_magic(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + void *kaddr; + __be32 *ptr; + int rv = 0; + + kaddr = kmap_atomic(page, KM_USER0); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + rv = 1; + kunmap_atomic(page, KM_USER0); + + return rv; +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + * Here we scan through the lists of buffers and make the assumption + * that any buffer thats been pinned is being journaled, and that + * any unpinned buffer is an ordered write data buffer and therefore + * will be written back rather than journaled. 
+ */ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) { - struct list_head *head = &sdp->sd_log_le_databuf; LIST_HEAD(started); - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt; + struct buffer_head *bh = NULL; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + struct gfs2_log_descriptor *ld; + unsigned int limit; + unsigned int total_dbuf = sdp->sd_log_num_databuf; + unsigned int total_jdata = sdp->sd_log_num_jdata; + unsigned int num, n; + __be64 *ptr; - while (!list_empty(head)) { - bd = list_entry(head->prev, struct gfs2_bufdata, bd_le.le_list); - list_move(&bd->bd_le.le_list, &started); + offset += (2*sizeof(__be64) - 1); + offset &= ~(2*sizeof(__be64) - 1); + limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); - gfs2_log_lock(sdp); - bh = bd->bd_bh; + /* printk(KERN_INFO "totals: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */ + /* + * Start writing ordered buffers, write journaled buffers + * into the log along with a header + */ + bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, bd_le.le_list); + while(total_dbuf) { + num = total_jdata; + if (num > limit) + num = limit; + n = 0; + list_for_each_entry_safe_continue(bd1, bdt, &sdp->sd_log_le_databuf, bd_le.le_list) { + gfs2_log_lock(sdp); + /* An ordered write buffer */ + if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) { + list_move(&bd1->bd_le.le_list, &started); + if (bd1 == bd2) { + bd2 = NULL; + bd2 = list_prepare_entry(bd2, &sdp->sd_log_le_databuf, bd_le.le_list); + } + total_dbuf--; + if (bd1->bd_bh) { + get_bh(bd1->bd_bh); + gfs2_log_unlock(sdp); + if (buffer_dirty(bd1->bd_bh)) { + wait_on_buffer(bd1->bd_bh); + ll_rw_block(WRITE, 1, &bd1->bd_bh); + } + brelse(bd1->bd_bh); + continue; + } + gfs2_log_unlock(sdp); + continue; + } else if (bd1->bd_bh) { /* A journaled buffer */ + int magic; + gfs2_log_unlock(sdp); + /* printk(KERN_INFO "journaled buffer\n"); */ + if (!bh) { + bh = gfs2_log_get_buf(sdp); + ld = (struct 
gfs2_log_descriptor *)bh->b_data; + ptr = (__be64 *)(bh->b_data + offset); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + } + magic = gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + *ptr++ = cpu_to_be64((__u64)magic); + clear_buffer_escaped(bd1->bd_bh); + if (unlikely(magic != 0)) + set_buffer_escaped(bd1->bd_bh); + if (n++ > num) + break; + } + } if (bh) { - get_bh(bh); - gfs2_log_unlock(sdp); - if (buffer_dirty(bh)) { - wait_on_buffer(bh); - ll_rw_block(WRITE, 1, &bh); + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + bh = NULL; + } + n = 0; + /* printk(KERN_INFO "totals2: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */ + list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, bd_le.le_list) { + if (!bd2->bd_bh) + continue; + /* copy buffer if it needs escaping */ + if (unlikely(buffer_escaped(bd2->bd_bh))) { + void *kaddr; + struct page *page = bd2->bd_bh->b_page; + bh = gfs2_log_get_buf(sdp); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(bh->b_data, kaddr + bh_offset(bd2->bd_bh), sdp->sd_sb.sb_bsize); + kunmap_atomic(page, KM_USER0); + *(__be32 *)bh->b_data = 0; + } else { + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); } - brelse(bh); - } else - gfs2_log_unlock(sdp); + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + if (++n >= num) + break; + } + bh = NULL; + total_dbuf -= num; + total_jdata -= num; } - + /* printk(KERN_INFO "wait on ordered data buffers\n"); */ + /* Wait on all ordered buffers */ while (!list_empty(&started)) { - bd = list_entry(started.next, struct gfs2_bufdata, - bd_le.le_list); - list_del(&bd->bd_le.le_list); + bd1 = list_entry(started.next, struct gfs2_bufdata, 
bd_le.le_list); + list_del(&bd1->bd_le.le_list); sdp->sd_log_num_databuf--; gfs2_log_lock(sdp); - bh = bd->bd_bh; + bh = bd1->bd_bh; if (bh) { set_v2bd(bh, NULL); gfs2_log_unlock(sdp); @@ -479,12 +618,103 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp) } else gfs2_log_unlock(sdp); - kfree(bd); + kfree(bd1); } + /* printk(KERN_INFO "sd_log_num_databuf %u sd_log_num_jdata %u\n", sdp->sd_log_num_databuf, sdp->sd_log_num_jdata); */ + /* We've removed all the ordered write bufs here, so only jdata left */ + gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_sbd *sdp = jd->jd_inode->i_sbd; + struct gfs2_glock *gl = jd->jd_inode->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + uint64_t blkno; + uint64_t esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. 
*/ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = jd->jd_inode->i_sbd; + + if (error) { + gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START | DIO_WAIT); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START | DIO_WAIT); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_databuf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_databuf--; + sdp->sd_log_num_jdata--; + gfs2_unpin(sdp, bd->bd_bh, ai); + brelse(bd->bd_bh); + kfree(bd); + } gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata); } + struct gfs2_log_operations gfs2_glock_lops = { .lo_add = glock_lo_add, .lo_after_commit = glock_lo_after_commit, @@ -519,7 +749,11 @@ struct gfs2_log_operations gfs2_rg_lops = { struct gfs2_log_operations gfs2_databuf_lops = { .lo_add = databuf_lo_add, + .lo_incore_commit = buf_lo_incore_commit, .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, .lo_name = "databuf" }; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index b6bd2eb..ef58d43 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -547,10 +547,12 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, int meta { struct gfs2_bufdata *bd; - lock_page(bh->b_page); + if (meta) + lock_page(bh->b_page); if (get_v2bd(bh)) { - unlock_page(bh->b_page); + if (meta) + unlock_page(bh->b_page); return; } @@ -563,14 +565,16 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head 
*bh, int meta bd->bd_gl = gl; INIT_LIST_HEAD(&bd->bd_list_tr); - if (meta) + if (meta) { lops_init_le(&bd->bd_le, &gfs2_buf_lops); - else + } else { lops_init_le(&bd->bd_le, &gfs2_databuf_lops); - + get_bh(bh); + } set_v2bd(bh, bd); - unlock_page(bh->b_page); + if (meta) + unlock_page(bh->b_page); } /** diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index d611b2a..b14357e 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -20,13 +20,13 @@ #include "bmap.h" #include "glock.h" #include "inode.h" -#include "jdata.h" #include "log.h" #include "meta_io.h" #include "ops_address.h" #include "page.h" #include "quota.h" #include "trans.h" +#include "rgrp.h" /** * gfs2_get_block - Fills in a buffer head with details about a block @@ -149,33 +149,55 @@ static int get_blocks_noalloc(struct inode *inode, sector_t lblock, * * Returns: errno * - * Use Linux VFS block_write_full_page() to write one page, - * using GFS2's get_block_noalloc to find which blocks to write. + * Some of this is copied from block_write_full_page() although we still + * call it to do most of the work. */ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; struct gfs2_inode *ip = get_v2ip(page->mapping->host); struct gfs2_sbd *sdp = ip->i_sbd; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; int error; + int done_trans = 0; atomic_inc(&sdp->sd_ops_address); - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { unlock_page(page); return -EIO; } - if (get_transaction) { - redirty_page_for_writepage(wbc, page); + if (get_transaction) + goto out_ignore; + + /* Is the page fully outside i_size? 
(truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + page->mapping->a_ops->invalidatepage(page, 0); unlock_page(page); - return 0; + return 0; /* don't care */ } - error = block_write_full_page(page, get_block_noalloc, wbc); + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) { + error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (error) + goto out_ignore; + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + done_trans = 1; + } + error = block_write_full_page(page, get_block_noalloc, wbc); + if (done_trans) + gfs2_trans_end(sdp); gfs2_meta_cache_flush(ip); - return error; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; } /** @@ -227,40 +249,9 @@ static int zero_readpage(struct page *page) } /** - * jdata_readpage - readpage that goes through gfs2_jdata_read_mem() - * @ip: - * @page: The page to read - * - * Returns: errno - */ - -static int jdata_readpage(struct gfs2_inode *ip, struct page *page) -{ - void *kaddr; - int ret; - - kaddr = kmap(page); - - ret = gfs2_jdata_read_mem(ip, kaddr, - (uint64_t)page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); - if (ret >= 0) { - if (ret < PAGE_CACHE_SIZE) - memset(kaddr + ret, 0, PAGE_CACHE_SIZE - ret); - SetPageUptodate(page); - ret = 0; - } - - kunmap(page); - - unlock_page(page); - - return ret; -} - -/** * gfs2_readpage - readpage with locking - * @file: The file to read a page for + * @file: The file to read a page for. N.B. This may be NULL if we are + * reading an internal file. 
* @page: The page to read * * Returns: errno @@ -270,31 +261,35 @@ static int gfs2_readpage(struct file *file, struct page *page) { struct gfs2_inode *ip = get_v2ip(page->mapping->host); struct gfs2_sbd *sdp = ip->i_sbd; + struct gfs2_holder gh; int error; atomic_inc(&sdp->sd_ops_address); - if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) { - unlock_page(page); - return -EOPNOTSUPP; - } + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + error = gfs2_glock_nq_m_atime(1, &gh); + if (error) + goto out_unlock; - if (!gfs2_is_jdata(ip)) { - if (gfs2_is_stuffed(ip)) { - if (!page->index) { - error = stuffed_readpage(ip, page); - unlock_page(page); - } else - error = zero_readpage(page); + if (gfs2_is_stuffed(ip)) { + if (!page->index) { + error = stuffed_readpage(ip, page); + unlock_page(page); } else - error = mpage_readpage(page, gfs2_get_block); + error = zero_readpage(page); } else - error = jdata_readpage(ip, page); + error = mpage_readpage(page, gfs2_get_block); if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) error = -EIO; + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); +out: return error; +out_unlock: + unlock_page(page); + goto out; } /** @@ -312,28 +307,82 @@ static int gfs2_prepare_write(struct file *file, struct page *page, { struct gfs2_inode *ip = get_v2ip(page->mapping->host); struct gfs2_sbd *sdp = ip->i_sbd; + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required; int error = 0; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from; + loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + struct gfs2_alloc *al; atomic_inc(&sdp->sd_ops_address); - if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) - return -EOPNOTSUPP; + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); + error = gfs2_glock_nq_m_atime(1, &ip->i_gh); + if (error) + goto out_uninit; - if (gfs2_is_stuffed(ip)) { - uint64_t file_size; - file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + 
to; + gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks); + + error = gfs2_write_alloc_required(ip, pos, from - to, &alloc_required); + if (error) + goto out_unlock; - if (file_size > sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode)) { - error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, - page); - if (!error) - error = block_prepare_write(page, from, to, - gfs2_get_block); - } else if (!PageUptodate(page)) + + if (alloc_required) { + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc_put; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_qunlock; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, 0); + if (error) + goto out; + + if (gfs2_is_stuffed(ip)) { + if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, page); + if (error) + goto out; + } else if (!PageUptodate(page)) { error = stuffed_readpage(ip, page); - } else - error = block_prepare_write(page, from, to, gfs2_get_block); + goto out; + } + } + + error = block_prepare_write(page, from, to, gfs2_get_block); + +out: + if (error) { + gfs2_trans_end(sdp); + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq_m(1, &ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + } return error; } @@ -354,48 +403,73 @@ static int gfs2_commit_write(struct file *file, struct page *page, struct inode *inode = page->mapping->host; struct gfs2_inode *ip = get_v2ip(inode); struct gfs2_sbd *sdp = ip->i_sbd; - int error; + int error = 
-EOPNOTSUPP; + struct buffer_head *dibh; + struct gfs2_alloc *al = &ip->i_alloc;; atomic_inc(&sdp->sd_ops_address); + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) + goto fail_nounlock; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_endtrans; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + if (gfs2_is_stuffed(ip)) { - struct buffer_head *dibh; uint64_t file_size; void *kaddr; file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - kaddr = kmap(page); + kaddr = kmap_atomic(page, KM_USER0); memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from, - (char *)kaddr + from, - to - from); - kunmap(page); - - brelse(dibh); + (char *)kaddr + from, to - from); + kunmap_atomic(page, KM_USER0); SetPageUptodate(page); if (inode->i_size < file_size) i_size_write(inode, file_size); } else { - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) gfs2_page_add_databufs(ip, page, from, to); error = generic_commit_write(file, page, from, to); if (error) goto fail; } + if (ip->i_di.di_size < inode->i_size) + ip->i_di.di_size = inode->i_size; + + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); return 0; - fail: +fail: + brelse(dibh); +fail_endtrans: + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); +fail_nounlock: ClearPageUptodate(page); - return error; } @@ -492,12 +566,16 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *io atomic_inc(&sdp->sd_ops_address); - if 
(gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) || - gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) + if (gfs2_is_jdata(ip)) return -EINVAL; - if (rw == WRITE && !get_transaction) - gb = get_blocks_noalloc; + if (rw == WRITE) { + return -EOPNOTSUPP; /* for now */ + } else { + if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) || + gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) + return -EINVAL; + } return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gb, NULL); diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 0f356fc..56820b3 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,6 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "jdata.h" #include "lm.h" #include "log.h" #include "meta_io.h" @@ -67,10 +67,37 @@ struct filldir_reg { void *fdr_opaque; }; -typedef ssize_t(*do_rw_t) (struct file *file, - char __user *buf, - size_t size, loff_t *offset, - unsigned int num_gh, struct gfs2_holder *ghs); +static int gfs2_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + memcpy(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct inode *inode = ip->i_vnode; + read_descriptor_t desc; + desc.written = 0; + desc.arg.buf = buf; + desc.count = size; + desc.error = 0; + do_generic_mapping_read(inode->i_mapping, ra_state, NULL, pos, &desc, gfs2_read_actor); + return desc.written ? 
desc.written : desc.error; +} /** * gfs2_llseek - seek to a location in a file @@ -105,247 +132,114 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) return error; } -static inline unsigned int vma2state(struct vm_area_struct *vma) -{ - if ((vma->vm_flags & (VM_MAYWRITE | VM_MAYSHARE)) == - (VM_MAYWRITE | VM_MAYSHARE)) - return LM_ST_EXCLUSIVE; - return LM_ST_SHARED; -} -static ssize_t walk_vm_hard(struct file *file, const char __user *buf, size_t size, - loff_t *offset, do_rw_t operation) +static ssize_t gfs2_direct_IO_read(struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) { - struct gfs2_holder *ghs; - unsigned int num_gh = 0; - ssize_t count; - struct super_block *sb = file->f_dentry->d_inode->i_sb; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start = (unsigned long)buf; - unsigned long end = start + size; - int dumping = (current->flags & PF_DUMPCORE); - unsigned int x = 0; - - for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { - if (end <= vma->vm_start) - break; - if (vma->vm_file && - vma->vm_file->f_dentry->d_inode->i_sb == sb) { - num_gh++; - } - } - - ghs = kcalloc((num_gh + 1), sizeof(struct gfs2_holder), GFP_KERNEL); - if (!ghs) { - if (!dumping) - up_read(&mm->mmap_sem); - return -ENOMEM; - } + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + ssize_t retval; - for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { - if (end <= vma->vm_start) - break; - if (vma->vm_file) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; - if (inode->i_sb == sb) - gfs2_holder_init(get_v2ip(inode)->i_gl, - vma2state(vma), 0, &ghs[x++]); - } + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(READ, iocb, iov, offset, + nr_segs); } - - if (!dumping) - up_read(&mm->mmap_sem); - - gfs2_assert(get_v2sdp(sb), x == num_gh); - - count = operation(file, buf, size, offset, 
num_gh, ghs); - - while (num_gh--) - gfs2_holder_uninit(&ghs[num_gh]); - kfree(ghs); - - return count; + return retval; } /** - * walk_vm - Walk the vmas associated with a buffer for read or write. - * If any of them are gfs2, pass the gfs2 inode down to the read/write - * worker function so that locks can be acquired in the correct order. - * @file: The file to read/write from/to - * @buf: The buffer to copy to/from - * @size: The amount of data requested - * @offset: The current file offset - * @operation: The read or write worker function - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure + * __gfs2_file_aio_read - The main GFS2 read function + * + * N.B. This is almost, but not quite the same as __generic_file_aio_read(), + * the important subtle difference being that inode->i_size isn't valid + * unless we are holding a lock, and we do this _only_ on the O_DIRECT + * path since otherwise locking is done entirely at the page cache + * layer.
*/ - -static ssize_t walk_vm(struct file *file, const char __user *buf, size_t size, - loff_t *offset, do_rw_t operation) +static ssize_t __gfs2_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { + struct file *filp = iocb->ki_filp; + struct gfs2_inode *ip = get_v2ip(filp->f_mapping->host); struct gfs2_holder gh; - - if (current->mm) { - struct super_block *sb = file->f_dentry->d_inode->i_sb; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start = (unsigned long)buf; - unsigned long end = start + size; - int dumping = (current->flags & PF_DUMPCORE); - - if (!dumping) - down_read(&mm->mmap_sem); - - for (vma = find_vma(mm, start); vma; vma = vma->vm_next) { - if (end <= vma->vm_start) - break; - if (vma->vm_file && - vma->vm_file->f_dentry->d_inode->i_sb == sb) - goto do_locks; - } - - if (!dumping) - up_read(&mm->mmap_sem); - } - - return operation(file, buf, size, offset, 0, &gh); - -do_locks: - return walk_vm_hard(file, buf, size, offset, operation); -} - -static ssize_t do_jdata_read(struct file *file, char __user *buf, size_t size, - loff_t *offset) -{ - struct gfs2_inode *ip = get_v2ip(file->f_mapping->host); - ssize_t count = 0; - - if (*offset < 0) + ssize_t retval; + unsigned long seg; + size_t count; + + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, size)) + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) return -EFAULT; + nr_segs = seg; + count -= iv->iov_len; /* This segment is no good */ + break; + } + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (filp->f_flags & O_DIRECT) { + loff_t pos = *ppos, size; + struct address_space *mapping; + struct inode *inode; + + mapping = filp->f_mapping; + inode = mapping->host; + retval = 0; + if (!count) + goto out; /* skip atime */ + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + retval = gfs2_glock_nq_m_atime(1, &gh); + if (retval) + goto out; - if (!(file->f_flags & O_LARGEFILE)) { - if (*offset >= MAX_NON_LFS) - return -EFBIG; - if (*offset + size > MAX_NON_LFS) - size = MAX_NON_LFS - *offset; - } - - count = gfs2_jdata_read(ip, buf, *offset, size, gfs2_copy2user); - - if (count > 0) - *offset += count; - - return count; -} - -/** - * do_read_direct - Read bytes from a file - * @file: The file to read from - * @buf: The buffer to copy into - * @size: The amount of data requested - * @offset: The current file offset - * @num_gh: The number of other locks we need to do the read - * @ghs: the locks we need plus one for our lock - * - * Outputs: Offset - updated according to number of bytes read - * - * Returns: The number of bytes read, errno on failure - */ - -static ssize_t do_read_direct(struct file *file, char __user *buf, size_t size, - loff_t *offset, unsigned int num_gh, - struct gfs2_holder *ghs) -{ - struct inode *inode = file->f_mapping->host; - struct gfs2_inode *ip = get_v2ip(inode); - unsigned int state = LM_ST_DEFERRED; - int flags = 0; - unsigned int x; - ssize_t count = 0; - int error; - - for (x = 0; x < num_gh; x++) - if (ghs[x].gh_gl == ip->i_gl) { - state = LM_ST_SHARED; - flags |= GL_LOCAL_EXCL; - break; + size = i_size_read(inode); + if (pos < size) 
{ + retval = gfs2_direct_IO_read(iocb, iov, pos, nr_segs); + if (retval > 0 && !is_sync_kiocb(iocb)) + retval = -EIOCBQUEUED; + if (retval > 0) + *ppos = pos + retval; } - - gfs2_holder_init(ip->i_gl, state, flags, &ghs[num_gh]); - - error = gfs2_glock_nq_m(num_gh + 1, ghs); - if (error) + file_accessed(filp); + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); goto out; + } - error = -EINVAL; - if (gfs2_is_jdata(ip)) - goto out_gunlock; - - if (gfs2_is_stuffed(ip)) { - size_t mask = bdev_hardsect_size(inode->i_sb->s_bdev) - 1; - - if (((*offset) & mask) || (((unsigned long)buf) & mask)) - goto out_gunlock; - - count = do_jdata_read(file, buf, size & ~mask, offset); - } else - count = generic_file_read(file, buf, size, offset); - - error = 0; - - out_gunlock: - gfs2_glock_dq_m(num_gh + 1, ghs); - - out: - gfs2_holder_uninit(&ghs[num_gh]); - - return (count) ? count : error; -} - -/** - * do_read_buf - Read bytes from a file - * @file: The file to read from - * @buf: The buffer to copy into - * @size: The amount of data requested - * @offset: The current file offset - * @num_gh: The number of other locks we need to do the read - * @ghs: the locks we need plus one for our lock - * - * Outputs: Offset - updated according to number of bytes read - * - * Returns: The number of bytes read, errno on failure - */ - -static ssize_t do_read_buf(struct file *file, char __user *buf, size_t size, - loff_t *offset, unsigned int num_gh, - struct gfs2_holder *ghs) -{ - struct gfs2_inode *ip = get_v2ip(file->f_mapping->host); - ssize_t count = 0; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &ghs[num_gh]); - - error = gfs2_glock_nq_m_atime(num_gh + 1, ghs); - if (error) - goto out; - - if (gfs2_is_jdata(ip)) - count = do_jdata_read(file, buf, size, offset); - else - count = generic_file_read(file, buf, size, offset); - - gfs2_glock_dq_m(num_gh + 1, ghs); - - out: - gfs2_holder_uninit(&ghs[num_gh]); - - return (count) ? 
count : error; + retval = 0; + if (count) { + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp,ppos,&desc,file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + } + } +out: + return retval; } /** @@ -360,550 +254,49 @@ static ssize_t do_read_buf(struct file *file, char __user *buf, size_t size, * Returns: The number of bytes read, errno on failure */ -static ssize_t gfs2_read(struct file *file, char __user *buf, size_t size, +static ssize_t gfs2_read(struct file *filp, char __user *buf, size_t size, loff_t *offset) { - atomic_inc(&get_v2sdp(file->f_mapping->host->i_sb)->sd_ops_file); - - if (file->f_flags & O_DIRECT) - return walk_vm(file, buf, size, offset, do_read_direct); - else - return walk_vm(file, buf, size, offset, do_read_buf); -} - -/** - * grope_mapping - feel up a mapping that needs to be written - * @buf: the start of the memory to be written - * @size: the size of the memory to be written - * - * We do this after acquiring the locks on the mapping, - * but before starting the write transaction. We need to make - * sure that we don't cause recursive transactions if blocks - * need to be allocated to the file backing the mapping. 
- * - * Returns: errno - */ - -static int grope_mapping(const char __user *buf, size_t size) -{ - const char __user *stop = buf + size; - char c; - - while (buf < stop) { - if (copy_from_user(&c, buf, 1)) - return -EFAULT; - buf += PAGE_CACHE_SIZE; - buf = (const char __user *)PAGE_ALIGN((unsigned long)buf); - } - - return 0; -} - -/** - * do_write_direct_alloc - Write bytes to a file - * @file: The file to write to - * @buf: The buffer to copy from - * @size: The amount of data requested - * @offset: The current file offset - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure - */ - -static ssize_t do_write_direct_alloc(struct file *file, const char __user *buf, size_t size, - loff_t *offset) -{ - struct inode *inode = file->f_mapping->host; - struct gfs2_inode *ip = get_v2ip(inode); - struct gfs2_sbd *sdp = ip->i_sbd; - struct gfs2_alloc *al = NULL; struct iovec local_iov = { .iov_base = buf, .iov_len = size }; - struct buffer_head *dibh; - unsigned int data_blocks, ind_blocks; - ssize_t count; - int error; - - gfs2_write_calc_reserv(ip, size, &data_blocks, &ind_blocks); - - al = gfs2_alloc_get(ip); - - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto fail; - - error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); - if (error) - goto fail_gunlock_q; - - al->al_requested = data_blocks + ind_blocks; - - error = gfs2_inplace_reserve(ip); - if (error) - goto fail_gunlock_q; - - error = gfs2_trans_begin(sdp, - al->al_rgd->rd_ri.ri_length + ind_blocks + - RES_DINODE + RES_STATFS + RES_QUOTA, 0); - if (error) - goto fail_ipres; - - if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) { - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? 
- (~(S_ISUID | S_ISGID)) : (~S_ISUID); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - brelse(dibh); - } - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_sync, NULL); - if (error) - goto fail_end_trans; - } - - count = generic_file_write_nolock(file, &local_iov, 1, offset); - if (count < 0) { - error = count; - goto fail_end_trans; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - if (ip->i_di.di_size < inode->i_size) - ip->i_di.di_size = inode->i_size; - ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - brelse(dibh); - - gfs2_trans_end(sdp); + struct kiocb kiocb; + ssize_t ret; - if (file->f_flags & O_SYNC) - gfs2_log_flush_glock(ip->i_gl); - - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - - if (file->f_mapping->nrpages) { - error = filemap_fdatawrite(file->f_mapping); - if (!error) - error = filemap_fdatawait(file->f_mapping); - } - if (error) - return error; - - return count; - - fail_end_trans: - gfs2_trans_end(sdp); - - fail_ipres: - gfs2_inplace_release(ip); - - fail_gunlock_q: - gfs2_quota_unlock(ip); - - fail: - gfs2_alloc_put(ip); + atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file); - return error; -} - -/** - * do_write_direct - Write bytes to a file - * @file: The file to write to - * @buf: The buffer to copy from - * @size: The amount of data requested - * @offset: The current file offset - * @num_gh: The number of other locks we need to do the read - * @gh: the locks we need plus one for our lock - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure - */ - -static ssize_t do_write_direct(struct file *file, const char __user *buf, size_t size, - loff_t *offset, unsigned int num_gh, - struct gfs2_holder *ghs) -{ - struct gfs2_inode 
*ip = get_v2ip(file->f_mapping->host); - struct gfs2_sbd *sdp = ip->i_sbd; - struct gfs2_file *fp = get_v2fp(file); - unsigned int state = LM_ST_DEFERRED; - int alloc_required; - unsigned int x; - size_t s; - ssize_t count = 0; - int error; - - if (test_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags)) - state = LM_ST_EXCLUSIVE; - else - for (x = 0; x < num_gh; x++) - if (ghs[x].gh_gl == ip->i_gl) { - state = LM_ST_EXCLUSIVE; - break; - } - - restart: - gfs2_holder_init(ip->i_gl, state, 0, &ghs[num_gh]); - - error = gfs2_glock_nq_m(num_gh + 1, ghs); - if (error) - goto out; - - error = -EINVAL; - if (gfs2_is_jdata(ip)) - goto out_gunlock; - - if (num_gh) { - error = grope_mapping(buf, size); - if (error) - goto out_gunlock; - } - - if (file->f_flags & O_APPEND) - *offset = ip->i_di.di_size; - - if (!(file->f_flags & O_LARGEFILE)) { - error = -EFBIG; - if (*offset >= MAX_NON_LFS) - goto out_gunlock; - if (*offset + size > MAX_NON_LFS) - size = MAX_NON_LFS - *offset; - } - - if (gfs2_is_stuffed(ip) || - *offset + size > ip->i_di.di_size || - ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID))) - alloc_required = 1; - else { - error = gfs2_write_alloc_required(ip, *offset, size, - &alloc_required); - if (error) - goto out_gunlock; - } - - if (alloc_required && state != LM_ST_EXCLUSIVE) { - gfs2_glock_dq_m(num_gh + 1, ghs); - gfs2_holder_uninit(&ghs[num_gh]); - state = LM_ST_EXCLUSIVE; - goto restart; - } - - if (alloc_required) { - set_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags); - - /* split large writes into smaller atomic transactions */ - while (size) { - s = gfs2_tune_get(sdp, gt_max_atomic_write); - if (s > size) - s = size; - - error = do_write_direct_alloc(file, buf, s, offset); - if (error < 0) - goto out_gunlock; - - buf += error; - size -= error; - count += error; - } - } else { - struct iovec local_iov = { .iov_base = buf, .iov_len = size }; - struct gfs2_holder t_gh; - - clear_bit(GFF_DID_DIRECT_ALLOC, &fp->f_flags); - - error = 
gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - GL_NEVER_RECURSE, &t_gh); - if (error) - goto out_gunlock; - - count = generic_file_write_nolock(file, &local_iov, 1, offset); - - gfs2_glock_dq_uninit(&t_gh); - } - - error = 0; - - out_gunlock: - gfs2_glock_dq_m(num_gh + 1, ghs); - - out: - gfs2_holder_uninit(&ghs[num_gh]); - - return (count) ? count : error; + init_sync_kiocb(&kiocb, filp); + ret = __gfs2_file_aio_read(&kiocb, &local_iov, 1, offset); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; } -/** - * do_do_write_buf - Write bytes to a file - * @file: The file to write to - * @buf: The buffer to copy from - * @size: The amount of data requested - * @offset: The current file offset - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure - */ - -static ssize_t do_do_write_buf(struct file *file, const char __user *buf, size_t size, - loff_t *offset) +static ssize_t gfs2_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct inode *inode = file->f_mapping->host; - struct gfs2_inode *ip = get_v2ip(inode); - struct gfs2_sbd *sdp = ip->i_sbd; - struct gfs2_alloc *al = NULL; - struct buffer_head *dibh; - unsigned int data_blocks, ind_blocks; - int alloc_required, journaled; - ssize_t count; - int error; - - journaled = gfs2_is_jdata(ip); - - gfs2_write_calc_reserv(ip, size, &data_blocks, &ind_blocks); - - error = gfs2_write_alloc_required(ip, *offset, size, &alloc_required); - if (error) - return error; - - if (alloc_required) { - al = gfs2_alloc_get(ip); - - error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto fail; - - error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); - if (error) - goto fail_gunlock_q; - - al->al_requested = data_blocks + ind_blocks; - - error = gfs2_inplace_reserve(ip); - if (error) - goto fail_gunlock_q; - - error = gfs2_trans_begin(sdp, - 
al->al_rgd->rd_ri.ri_length + - ind_blocks + - ((journaled) ? data_blocks : 0) + - RES_DINODE + RES_STATFS + RES_QUOTA, - 0); - if (error) - goto fail_ipres; - } else { - error = gfs2_trans_begin(sdp, - ((journaled) ? data_blocks : 0) + - RES_DINODE, - 0); - if (error) - goto fail_ipres; - } - - if ((ip->i_di.di_mode & (S_ISUID | S_ISGID)) && !capable(CAP_FSETID)) { - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - ip->i_di.di_mode &= (ip->i_di.di_mode & S_IXGRP) ? - (~(S_ISUID | S_ISGID)) : (~S_ISUID); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - brelse(dibh); - } + struct kiocb kiocb; + ssize_t ret; - if (journaled) { - count = gfs2_jdata_write(ip, buf, *offset, size, - gfs2_copy_from_user); - if (count < 0) { - error = count; - goto fail_end_trans; - } - - *offset += count; - } else { - struct iovec local_iov = { .iov_base = buf, .iov_len = size }; - - count = generic_file_write_nolock(file, &local_iov, 1, offset); - if (count < 0) { - error = count; - goto fail_end_trans; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - if (ip->i_di.di_size < inode->i_size) - ip->i_di.di_size = inode->i_size; - ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(&ip->i_di, dibh->b_data); - brelse(dibh); - } - - gfs2_trans_end(sdp); - - if (file->f_flags & O_SYNC || IS_SYNC(inode)) { - gfs2_log_flush_glock(ip->i_gl); - error = filemap_fdatawrite(file->f_mapping); - if (error == 0) - error = filemap_fdatawait(file->f_mapping); - if (error) - goto fail_ipres; - } - - if (alloc_required) { - gfs2_assert_warn(sdp, count != size || - al->al_alloced); - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - - return count; - - fail_end_trans: - gfs2_trans_end(sdp); - - fail_ipres: - if (alloc_required) - gfs2_inplace_release(ip); - - fail_gunlock_q: - if (alloc_required) - 
gfs2_quota_unlock(ip); + atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file); - fail: - if (alloc_required) - gfs2_alloc_put(ip); - - return error; + init_sync_kiocb(&kiocb, filp); + ret = __gfs2_file_aio_read(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; } -/** - * do_write_buf - Write bytes to a file - * @file: The file to write to - * @buf: The buffer to copy from - * @size: The amount of data requested - * @offset: The current file offset - * @num_gh: The number of other locks we need to do the read - * @gh: the locks we need plus one for our lock - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure - */ - -static ssize_t do_write_buf(struct file *file, const char __user *buf, size_t size, - loff_t *offset, unsigned int num_gh, - struct gfs2_holder *ghs) +static ssize_t gfs2_file_aio_read(struct kiocb *iocb, char __user *buf, + size_t count, loff_t pos) { - struct gfs2_inode *ip = get_v2ip(file->f_mapping->host); - struct gfs2_sbd *sdp = ip->i_sbd; - size_t s; - ssize_t count = 0; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[num_gh]); - - error = gfs2_glock_nq_m(num_gh + 1, ghs); - if (error) - goto out; - - if (num_gh) { - error = grope_mapping(buf, size); - if (error) - goto out_gunlock; - } - - if (file->f_flags & O_APPEND) - *offset = ip->i_di.di_size; - - if (!(file->f_flags & O_LARGEFILE)) { - error = -EFBIG; - if (*offset >= MAX_NON_LFS) - goto out_gunlock; - if (*offset + size > MAX_NON_LFS) - size = MAX_NON_LFS - *offset; - } - - /* split large writes into smaller atomic transactions */ - while (size) { - s = gfs2_tune_get(sdp, gt_max_atomic_write); - if (s > size) - s = size; - - error = do_do_write_buf(file, buf, s, offset); - if (error < 0) - goto out_gunlock; - - buf += error; - size -= error; - count += error; - } - - error = 0; + struct file *filp = iocb->ki_filp; + 
struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - out_gunlock: - gfs2_glock_dq_m(num_gh + 1, ghs); + atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file); - out: - gfs2_holder_uninit(&ghs[num_gh]); - - return (count) ? count : error; + BUG_ON(iocb->ki_pos != pos); + return __gfs2_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); } -/** - * gfs2_write - Write bytes to a file - * @file: The file to write to - * @buf: The buffer to copy from - * @size: The amount of data requested - * @offset: The current file offset - * - * Outputs: Offset - updated according to number of bytes written - * - * Returns: The number of bytes written, errno on failure - */ - -static ssize_t gfs2_write(struct file *file, const char __user *buf, - size_t size, loff_t *offset) -{ - struct inode *inode = file->f_mapping->host; - ssize_t count; - - atomic_inc(&get_v2sdp(inode->i_sb)->sd_ops_file); - - if (*offset < 0) - return -EINVAL; - if (!access_ok(VERIFY_READ, buf, size)) - return -EFAULT; - - mutex_lock(&inode->i_mutex); - if (file->f_flags & O_DIRECT) - count = walk_vm(file, buf, size, offset, - do_write_direct); - else - count = walk_vm(file, buf, size, offset, do_write_buf); - mutex_unlock(&inode->i_mutex); - - return count; -} /** * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read() @@ -1158,9 +551,6 @@ static int gfs2_ioctl_flags(struct gfs2_inode *ip, unsigned int cmd, unsigned lo if (flags & (GFS2_DIF_JDATA|GFS2_DIF_DIRECTIO)) { if (!S_ISREG(ip->i_di.di_mode)) goto out; - /* FIXME: Would be nice not to require the following test */ - if ((flags & GFS2_DIF_JDATA) && ip->i_di.di_size) - goto out; } if (flags & (GFS2_DIF_INHERIT_JDATA|GFS2_DIF_INHERIT_DIRECTIO)) { if (!S_ISDIR(ip->i_di.di_mode)) @@ -1246,21 +636,14 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } - if (gfs2_is_jdata(ip)) { - if (vma->vm_flags & VM_MAYSHARE) - error = -EOPNOTSUPP; - else - vma->vm_ops = &gfs2_vm_ops_private; 
- } else { - /* This is VM_MAYWRITE instead of VM_WRITE because a call - to mprotect() can turn on VM_WRITE later. */ - - if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == - (VM_MAYSHARE | VM_MAYWRITE)) - vma->vm_ops = &gfs2_vm_ops_sharewrite; - else - vma->vm_ops = &gfs2_vm_ops_private; - } + /* This is VM_MAYWRITE instead of VM_WRITE because a call + to mprotect() can turn on VM_WRITE later. */ + + if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == + (VM_MAYSHARE | VM_MAYWRITE)) + vma->vm_ops = &gfs2_vm_ops_sharewrite; + else + vma->vm_ops = &gfs2_vm_ops_private; gfs2_glock_dq_uninit(&i_gh); @@ -1313,13 +696,6 @@ static int gfs2_open(struct inode *inode, struct file *file) if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) file->f_flags |= O_DIRECT; - /* Don't let the user open O_DIRECT on a jdata file */ - - if ((file->f_flags & O_DIRECT) && gfs2_is_jdata(ip)) { - error = -EINVAL; - goto fail_gunlock; - } - gfs2_glock_dq_uninit(&i_gh); } @@ -1446,29 +822,10 @@ static ssize_t gfs2_sendfile(struct file *in_file, loff_t *offset, size_t count, read_actor_t actor, void *target) { struct gfs2_inode *ip = get_v2ip(in_file->f_mapping->host); - struct gfs2_holder gh; - ssize_t retval; atomic_inc(&ip->i_sbd->sd_ops_file); - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - - retval = gfs2_glock_nq_atime(&gh); - if (retval) - goto out; - - if (gfs2_is_jdata(ip)) - retval = -EOPNOTSUPP; - else - retval = generic_file_sendfile(in_file, offset, count, actor, - target); - - gfs2_glock_dq(&gh); - - out: - gfs2_holder_uninit(&gh); - - return retval; + return generic_file_sendfile(in_file, offset, count, actor, target); } static int do_flock(struct file *file, int cmd, struct file_lock *fl) @@ -1567,7 +924,11 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read = gfs2_read, - .write = gfs2_write, + .readv = gfs2_file_readv, + .aio_read = gfs2_file_aio_read, + .write = 
generic_file_write, + .writev = generic_file_writev, + .aio_write = generic_file_aio_write, .ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c index a1b409c..8f77bb7 100644 --- a/fs/gfs2/ops_vm.c +++ b/fs/gfs2/ops_vm.c @@ -155,9 +155,6 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, if (error) return NULL; - if (gfs2_is_jdata(ip)) - goto out; - set_bit(GIF_PAGED, &ip->i_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/gfs2/page.c b/fs/gfs2/page.c index ea31bce..3542aa6 100644 --- a/fs/gfs2/page.c +++ b/fs/gfs2/page.c @@ -172,8 +172,8 @@ int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) - gfs2_trans_add_databuf(sdp, bh); + if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); mark_buffer_dirty(bh); if (release) { @@ -245,8 +245,8 @@ int gfs2_block_truncate_page(struct address_space *mapping) goto unlock; } - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED/* || gfs2_is_jdata(ip)*/) - gfs2_trans_add_databuf(sdp, bh); + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + offset, 0, length); @@ -273,7 +273,7 @@ void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, end = start + bsize; if (end <= from || start >= to) continue; - gfs2_trans_add_databuf(ip->i_sbd, bh); + gfs2_trans_add_bh(ip->i_gl, bh, 0); } } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 69e8f4e..138fdf5 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -43,20 +43,22 @@ #include #include #include +#include #include #include "gfs2.h" #include "bmap.h" #include "glock.h" #include "glops.h" -#include "jdata.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "super.h" 
#include "trans.h" +#include "inode.h" #include "ops_file.h" +#include "ops_address.h" #define QUOTA_USER 1 #define QUOTA_GROUP 0 @@ -561,6 +563,91 @@ static void do_qc(struct gfs2_quota_data *qd, int64_t change) up(&sdp->sd_quota_mutex); } +/** + * gfs2_adjust_quota - adjust one quota value via the page cache + * @ip: inode whose mapping holds the value + * @loc: byte position of the big-endian 64-bit value to adjust + * @change: signed amount added to the stored value + * @qd: quota data whose qd_qb magic and value fields are refreshed + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + * + * Returns: 0 on success, -ENOMEM if the page cannot be grabbed, -EIO if the + * buffer cannot be mapped or read + */ +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + int64_t change, struct gfs2_quota_data *qd) +{ + struct inode *inode = gfs2_ip2v(ip); + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SIZE - 1); /* offset in page */ + unsigned blocksize, iblock, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr; + __be64 *ptr; + u64 value; + int err = -EIO; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + /* Step to the buffer head covering offset, tracking its block number */ + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_get_block(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock; + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock; + } + + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + ptr = (__be64 *)(kaddr + offset); + /* Keep value in CPU order so each store converts exactly once */ + value = be64_to_cpu(*ptr) + change; + *ptr = cpu_to_be64(value); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + err = 0; + qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); +#if 0 + qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit); + qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn); +#endif + qd->qd_qb.qb_value = cpu_to_be64(value); +unlock: + 
unlock_page(page); + page_cache_release(page); + return err; +} + static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; @@ -635,43 +722,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) file_ra_state_init(&ra_state, ip->i_vnode->i_mapping); for (x = 0; x < num_qd; x++) { - char buf[sizeof(struct gfs2_quota)]; - struct gfs2_quota q; - qd = qda[x]; offset = qd2offset(qd); - - /* The quota file may not be a multiple of - sizeof(struct gfs2_quota) bytes. */ - memset(buf, 0, sizeof(struct gfs2_quota)); - - error = gfs2_internal_read(ip, &ra_state, buf, &offset, - sizeof(struct gfs2_quota)); - if (error < 0) + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, + (struct gfs2_quota_data *)qd->qd_gl->gl_lvb); + if (error) goto out_end_trans; - gfs2_quota_in(&q, buf); - q.qu_value += qda[x]->qd_change_sync; - gfs2_quota_out(&q, buf); - - error = gfs2_jdata_write_mem(ip, buf, offset, - sizeof(struct gfs2_quota)); - if (error < 0) - goto out_end_trans; - else if (error != sizeof(struct gfs2_quota)) { - error = -EIO; - goto out_end_trans; - } - do_qc(qd, -qd->qd_change_sync); - - memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb)); - qd->qd_qb.qb_magic = GFS2_MAGIC; - qd->qd_qb.qb_limit = q.qu_limit; - qd->qd_qb.qb_warn = q.qu_warn; - qd->qd_qb.qb_value = q.qu_value; - - gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb); } error = 0; diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index b014591..104e664 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -154,14 +154,13 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) gfs2_attach_bufdata(gl, bh, meta); bd = get_v2bd(bh); } - lops_add(sdp, &bd->bd_le); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno) { struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke), - GFP_KERNEL | __GFP_NOFAIL); + GFP_NOFS | __GFP_NOFAIL); lops_init_le(&rv->rv_le, &gfs2_revoke_lops); rv->rv_blkno =
blkno; lops_add(sdp, &rv->rv_le); @@ -197,19 +196,3 @@ void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) lops_add(rgd->rd_sbd, &rgd->rd_le); } -void gfs2_trans_add_databuf(struct gfs2_sbd *sdp, struct buffer_head *bh) -{ - struct gfs2_bufdata *bd; - - bd = get_v2bd(bh); - if (!bd) { - bd = kmalloc(sizeof(struct gfs2_bufdata), - GFP_NOFS | __GFP_NOFAIL); - lops_init_le(&bd->bd_le, &gfs2_databuf_lops); - get_bh(bh); - bd->bd_bh = bh; - set_v2bd(bh, bd); - lops_add(sdp, &bd->bd_le); - } -} - diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 5a7da1e..f7f3e2a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -35,6 +35,5 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno); void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno); void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); -void gfs2_trans_add_databuf(struct gfs2_sbd *sdp, struct buffer_head *bh); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index ad49153..4fb1704 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -50,6 +50,7 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", sdp->sd_fsname, assertion, sdp->sd_fsname, function, file, line); + dump_stack(); return (me) ? -1 : -2; } @@ -75,6 +76,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, if (sdp->sd_args.ar_debug) BUG(); + else + dump_stack(); sdp->sd_last_warning = jiffies; diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h index f1302e2..99d7ae4 100644 --- a/include/linux/gfs2_ondisk.h +++ b/include/linux/gfs2_ondisk.h @@ -336,6 +336,10 @@ struct gfs2_log_header { /* ld_data1 is the number of revoke blocks in the descriptor. ld_data2 is unused. */ +#define GFS2_LOG_DESC_JDATA 302 +/* ld_data1 is the number of data blocks in the descriptor. + ld_data2 is unused. 
*/ + struct gfs2_log_descriptor { struct gfs2_meta_header ld_header; @@ -400,6 +404,7 @@ struct gfs2_quota_change { __be32 qc_id; }; +#ifdef __KERNEL__ /* Translation functions */ extern void gfs2_inum_in(struct gfs2_inum *no, char *buf); @@ -444,4 +449,6 @@ extern void gfs2_statfs_change_print(struct gfs2_statfs_change *sc); extern void gfs2_unlinked_tag_print(struct gfs2_unlinked_tag *ut); extern void gfs2_quota_change_print(struct gfs2_quota_change *qc); +#endif /* __KERNEL__ */ + #endif /* __GFS2_ONDISK_DOT_H__ */ -- cgit v0.10.2