From a3916e528b917851a4d2379e2fd2579ad5f2b5a7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 1 Jun 2016 17:38:12 +1000 Subject: xfs: fix broken multi-fsb buffer logging Multi-block buffers are logged based on buffer offset in xfs_trans_log_buf(). xfs_buf_item_log() ultimately walks each mapping in the buffer and marks the associated range to be logged in the xfs_buf_log_format bitmap for that mapping. This code is broken, however, in that it marks the actual buffer offsets of the associated range in each bitmap rather than shifting to the byte range for that particular mapping. For example, on a 4k fsb fs, buffer offset 4096 refers to the first byte of the second mapping in the buffer. This means byte 0 of the second log format bitmap should be tagged as dirty. Instead, the current code marks byte offset 4096 of the second log format bitmap, which is invalid and potentially out of range of the mapping. As a result of this, the log item format code invoked at transaction commit time is not be able to correctly identify what parts of the buffer to copy into log vectors. This can lead to NULL log vector pointer dereferences in CIL push context if the item format code was not able to locate any dirty ranges at all. This crash has been reproduced on a 4k FSB filesystem using 16k directory blocks where an unlink operation happened not to log anything in the first block of the mapping. The logged offsets were all over 4k, marked as such in the subsequent log format mappings, and thus left the transaction with an xfs_log_item that is marked DIRTY but without any logged regions. Further, even when the logged regions are marked correctly in the buffer log format bitmaps, the format code doesn't copy the correct ranges of the buffer into the log. This means that any logged region beyond the first block of a multi-block buffer is subject to corruption after a crash and log recovery sequence. This is due to a failure to convert the mapping bm_len field from basic blocks to bytes in the buffer offset tracking code in xfs_buf_item_format(). Update xfs_buf_item_log() to convert buffer offsets to segment relative offsets when logging multi-block buffers. This ensures that the modified regions of a buffer are logged correctly and avoids the aforementioned crash. Also update xfs_buf_item_format() to correctly track the source offset into the buffer for the log vector formatting code. This ensures that the correct data is copied into the log. Signed-off-by: Brian Foster Reviewed-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3425799..2e95ad0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } -- cgit v0.10.2 From 0c871f9a101a290e61af02df49087d118543caeb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 1 Jun 2016 17:38:15 +1000 Subject: xfs: remove spurious shutdown type check from xfs_bmap_finish() The static checker reports that after commit 8d99fe92fed0 ("xfs: fix efi/efd error handling to avoid fs shutdown hangs"), the code has been reworked such that error == -EFSCORRUPTED is not possible in this codepath. Remove the spurious error check and just use SHUTDOWN_META_IO_ERROR unconditionally. Signed-off-by: Brian Foster Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 586bb64..3ce3c61 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -125,9 +125,7 @@ xfs_bmap_finish( if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + SHUTDOWN_META_IO_ERROR); } return error; } -- cgit v0.10.2 From 0d5a75e9e23ee39cd0d8a167393dcedb4f0f47b2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 1 Jun 2016 17:38:15 +1000 Subject: xfs: make several functions static Al Viro noticed that xfs_lock_inodes should be static, and that led to ... a few more. These are just the easy ones, others require moving functions higher in source files, so that's not done here to keep this review simple. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38..99b077c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d..92a66ba 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -212,13 +212,6 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ - int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d3..4f2aed0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044..e2e1106 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 55d2149..be0b79d 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d25f26b..25e76cd 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3ce3c61..28c42fb 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -407,7 +407,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. */ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a..1492348 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee6799e..8825bcf 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +667,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +740,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1085,7 +1077,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1096,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7..99d7522 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bde02f1..63dad9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -788,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1036,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1177,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1302,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. */ -void +static void xfs_log_worker( struct work_struct *work) { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 80ba0c0..b5e7107 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a..355dd9e 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11ea5d5..4700f09 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -546,7 +546,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce..529bce9 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, -- cgit v0.10.2 From 4478fb1f2db4b1473969ed24cf18264e3a4b1d79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jun 2016 17:38:15 +1000 Subject: xfs: define XFS_IOC_FREEZE even if FIFREEZE is defined And the same for XFS_IOC_THAW. Just because we now have a common version of the ioctl we still need to provide the old name for it for anyone using those. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d0..f5ec9c5 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -521,12 +521,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) -- cgit v0.10.2 From 26f1fe858f2744edfc75e92d34a6be0af5e8b45d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 1 Jun 2016 17:38:15 +1000 Subject: xfs: reduce lock hold times in buffer writeback When we have a lot of metadata to flush from the AIL, the buffer list can get very long. The current submission code tries to batch submission to optimise IO order of the metadata (i.e. ascending block order) to maximise block layer merging or IO to adjacent metadata blocks. Unfortunately, the method used can result in long lock times occurring as buffers locked early on in the buffer list might not be dispatched until the end of the IO licst processing. This is because sorting does not occur util after the buffer list has been processed and the buffers that are going to be submitted are locked. Hence when the buffer list is several thousand buffers long, the lock hold times before IO dispatch can be significant. To fix this, sort the buffer list before we start trying to lock and submit buffers. This means we can now submit buffers immediately after they are locked, allowing merging to occur immediately on the plug and dispatch to occur as quickly as possible. This means there is minimal delay between locking the buffer and IO submission occuring, hence reducing the worst case lock hold times seen during delayed write buffer IO submission signficantly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e71cfbd..efa2a73 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1774,18 +1774,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + + list_sort(NULL, buffer_list, xfs_buf_cmp); + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1808,25 +1823,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. */ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1849,8 +1860,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1865,15 +1875,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); -- cgit v0.10.2 From 199a31c6d93ba9dc6f831fa1e77d9926f34f4e8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:22:39 +1000 Subject: fs: move struct iomap from exportfs.h to a separate header Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e55b524..4df16ae 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2016 Christoph Hellwig. */ #include +#include #include #include #include diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 6c3b316..4ebaaf4 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include "nfsd.h" diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index d5b7566..db3c7df 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index d841450..b03c062 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -6,6 +6,7 @@ struct dentry; struct iattr; struct inode; +struct iomap; struct super_block; struct vfsmount; @@ -187,21 +188,6 @@ struct fid { * get_name is not (which is possibly inconsistent) */ -/* types of block ranges for multipage write mappings. */ -#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ -#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ -#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ -#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ - -#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ - -struct iomap { - sector_t blkno; /* first sector of mapping */ - loff_t offset; /* file offset of mapping, bytes */ - u64 length; /* length of mapping, bytes */ - int type; /* type of mapping */ -}; - struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); diff --git a/include/linux/iomap.h b/include/linux/iomap.h new file mode 100644 index 0000000..1b22197 --- /dev/null +++ b/include/linux/iomap.h @@ -0,0 +1,21 @@ +#ifndef LINUX_IOMAP_H +#define LINUX_IOMAP_H 1 + +#include + +/* types of block ranges for multipage write mappings. */ +#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ +#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ +#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ + +#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ + +struct iomap { + sector_t blkno; /* first sector of mapping */ + loff_t offset; /* file offset of mapping, bytes */ + u64 length; /* length of mapping, bytes */ + int type; /* type of mapping */ +}; + +#endif /* LINUX_IOMAP_H */ -- cgit v0.10.2 From ae259a9c8593f98aa60d045df978a5482a67c53f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:23:11 +1000 Subject: fs: introduce iomap infrastructure Add infrastructure for multipage buffered writes. This is implemented using an main iterator that applies an actor function to a range that can be written. This infrastucture is used to implement a buffered write helper, one to zero file ranges and one to implement the ->page_mkwrite VM operations. All of them borrow a fair amount of code from fs/buffers. for now by using an internal version of __block_write_begin that gets passed an iomap and builds the corresponding buffer head. The file system is gets a set of paired ->iomap_begin and ->iomap_end calls which allow it to map/reserve a range and get a notification once the write code is finished with it. Based on earlier code from Dave Chinner. Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/Kconfig b/fs/Kconfig index b8fcb41..4524916 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS if BLOCK +config FS_IOMAP + bool + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 85b6e13..ed2b632 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o +obj-$(CONFIG_FS_IOMAP) += iomap.o obj-y += quota/ diff --git a/fs/buffer.c b/fs/buffer.c index 754813a..228288a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1891,8 +1892,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) +static void +iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, + struct iomap *iomap) +{ + loff_t offset = block << inode->i_blkbits; + + bh->b_bdev = iomap->bdev; + + /* + * Block points to offset in file we need to map, iomap contains + * the offset at which the map starts. If the map ends before the + * current block, then do not map the buffer and let the caller + * handle it. + */ + BUG_ON(offset >= iomap->offset + iomap->length); + + switch (iomap->type) { + case IOMAP_HOLE: + /* + * If the buffer is not up to date or beyond the current EOF, + * we need to mark it as new to ensure sub-block zeroing is + * executed if necessary. + */ + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + break; + case IOMAP_DELALLOC: + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + set_buffer_uptodate(bh); + set_buffer_mapped(bh); + set_buffer_delay(bh); + break; + case IOMAP_UNWRITTEN: + /* + * For unwritten regions, we always need to ensure that + * sub-block writes cause the regions in the block we are not + * writing to are zeroed. Set the buffer as new to ensure this. + */ + set_buffer_new(bh); + set_buffer_unwritten(bh); + /* FALLTHRU */ + case IOMAP_MAPPED: + if (offset >= i_size_read(inode)) + set_buffer_new(bh); + bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + + ((offset - iomap->offset) >> inode->i_blkbits); + set_buffer_mapped(bh); + break; + } +} + +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1928,9 +1983,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - break; + if (get_block) { + err = get_block(inode, block, bh, 1); + if (err) + break; + } else { + iomap_to_bh(inode, block, bh, iomap); + } + if (buffer_new(bh)) { unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); @@ -1971,6 +2031,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, page_zero_new_buffers(page, from, to); return err; } + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + return __block_write_begin_int(page, pos, len, get_block, NULL); +} EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, diff --git a/fs/internal.h b/fs/internal.h index b71deee..c0c6f49 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct file_system_type; +struct iomap; struct linux_binprm; struct path; struct mount; @@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) * buffer.c */ extern void guard_bio_eod(int rw, struct bio *bio); +extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap); /* * char_dev.c diff --git a/fs/iomap.c b/fs/iomap.c new file mode 100644 index 0000000..8e2fc17 --- /dev/null +++ b/fs/iomap.c @@ -0,0 +1,394 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +/* + * Execute a iomap write on a segment of the mapping that spans a + * contiguous range of pages that have identical block mapping state. + * + * This avoids the need to map pages individually, do individual allocations + * for each page and most importantly avoid the need for filesystem specific + * locking per page. Instead, all the operations are amortised over the entire + * range of pages. It is assumed that the filesystems will lock whatever + * resources they require in the iomap_begin call, and release them in the + * iomap_end call. + */ +static loff_t +iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, + struct iomap_ops *ops, void *data, iomap_actor_t actor) +{ + struct iomap iomap = { 0 }; + loff_t written = 0, ret; + + /* + * Need to map a range from start position for length bytes. This can + * span multiple pages - it is only guaranteed to return a range of a + * single type of pages (e.g. all into a hole, all mapped or all + * unwritten). Failure at this point has nothing to undo. + * + * If allocation is required for this range, reserve the space now so + * that the allocation is guaranteed to succeed later on. Once we copy + * the data into the page cache pages, then we cannot fail otherwise we + * expose transient stale data. If the reserve fails, we can safely + * back out at this point as there is nothing to undo. + */ + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + if (ret) + return ret; + if (WARN_ON(iomap.offset > pos)) + return -EIO; + + /* + * Cut down the length to the one actually provided by the filesystem, + * as it might not be able to give us the whole size that we requested. + */ + if (iomap.offset + iomap.length < pos + length) + length = iomap.offset + iomap.length - pos; + + /* + * Now that we have guaranteed that the space allocation will succeed. + * we can do the copy-in page by page without having to worry about + * failures exposing transient data. + */ + written = actor(inode, pos, length, data, &iomap); + + /* + * Now the data has been copied, commit the range we've copied. This + * should not fail unless the filesystem has had a fatal error. + */ + ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, + flags, &iomap); + + return written ? written : ret; +} + +static void +iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) +{ + loff_t i_size = i_size_read(inode); + + /* + * Only truncate newly allocated pages beyoned EOF, even if the + * write started inside the existing inode size. + */ + if (pos + len > i_size) + truncate_pagecache_range(inode, max(pos, i_size), pos + len); +} + +static int +iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, + struct page **pagep, struct iomap *iomap) +{ + pgoff_t index = pos >> PAGE_SHIFT; + struct page *page; + int status = 0; + + BUG_ON(pos + len > iomap->offset + iomap->length); + + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin_int(page, pos, len, NULL, iomap); + if (unlikely(status)) { + unlock_page(page); + put_page(page); + page = NULL; + + iomap_write_failed(inode, pos, len); + } + + *pagep = page; + return status; +} + +static int +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + + ret = generic_write_end(NULL, inode->i_mapping, pos, len, + copied, page, NULL); + if (ret < len) + iomap_write_failed(inode, pos, len); + return ret; +} + +static loff_t +iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *i = data; + long status = 0; + ssize_t written = 0; + unsigned int flags = AOP_FLAG_NOFS; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + if (!iter_is_iovec(i)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_count(i)); +again: + if (bytes > length) + bytes = length; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = iomap_write_begin(inode, pos, bytes, flags, &page, + iomap); + if (unlikely(status)) + break; + + if (mapping_writably_mapped(inode->i_mapping)) + flush_dcache_page(page); + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + + flush_dcache_page(page); + mark_page_accessed(page); + + status = iomap_write_end(inode, pos, bytes, copied, page); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + length -= copied; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (iov_iter_count(i) && length); + + return written ? written : status; +} + +ssize_t +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, written = 0; + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), + IOMAP_WRITE, ops, iter, iomap_write_actor); + if (ret <= 0) + break; + pos += ret; + written += ret; + } + + return written ? written : ret; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + +static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, + unsigned bytes, struct iomap *iomap) +{ + struct page *page; + int status; + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + if (status) + return status; + + zero_user(page, offset, bytes); + mark_page_accessed(page); + + return iomap_write_end(inode, pos, bytes, bytes, page); +} + +static loff_t +iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + void *data, struct iomap *iomap) +{ + bool *did_zero = data; + loff_t written = 0; + int status; + + /* already zeroed? we're done. */ + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return count; + + do { + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ + bytes = min_t(unsigned, PAGE_SIZE - offset, count); + + status = iomap_zero(inode, pos, offset, bytes, iomap); + if (status < 0) + return status; + + pos += bytes; + count -= bytes; + written += bytes; + if (did_zero) + *did_zero = true; + } while (count > 0); + + return written; +} + +int +iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, + ops, did_zero, iomap_zero_range_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_zero_range); + +int +iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + struct iomap_ops *ops) +{ + unsigned blocksize = (1 << inode->i_blkbits); + unsigned off = pos & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!off) + return 0; + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(iomap_truncate_page); + +static loff_t +iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct page *page = data; + int ret; + + ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, + NULL, iomap); + if (ret) + return ret; + + block_commit_write(page, 0, length); + return length; +} + +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + unsigned long length; + loff_t offset, size; + ssize_t ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) > size) + length = size & ~PAGE_MASK; + else + length = PAGE_SIZE; + + offset = page_offset(page); + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_WRITE, + ops, page, iomap_page_mkwrite_actor); + if (unlikely(ret <= 0)) + goto out_unlock; + offset += ret; + length -= ret; + } + + set_page_dirty(page); + wait_for_stable_page(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 1b22197..d2f469a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -3,19 +3,65 @@ #include -/* types of block ranges for multipage write mappings. */ +struct inode; +struct iov_iter; +struct kiocb; +struct vm_area_struct; +struct vm_fault; + +/* + * Types of block ranges for iomap mappings: + */ #define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ #define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ +/* + * Magic value for blkno: + */ #define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ struct iomap { - sector_t blkno; /* first sector of mapping */ - loff_t offset; /* file offset of mapping, bytes */ - u64 length; /* length of mapping, bytes */ - int type; /* type of mapping */ + sector_t blkno; /* 1st sector of mapping, 512b units */ + loff_t offset; /* file offset of mapping, bytes */ + u64 length; /* length of mapping, bytes */ + int type; /* type of mapping */ + struct block_device *bdev; /* block device for I/O */ +}; + +/* + * Flags for iomap_begin / iomap_end. No flag implies a read. + */ +#define IOMAP_WRITE (1 << 0) +#define IOMAP_ZERO (1 << 1) + +struct iomap_ops { + /* + * Return the existing mapping at pos, or reserve space starting at + * pos for up to length, as long as we can do it as a single mapping. + * The actual length is returned in iomap->length. + */ + int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap); + + /* + * Commit and/or unreserve space previous allocated using iomap_begin. + * Written indicates the length of the successful write operation which + * needs to be commited, while the rest needs to be unreserved. + * Written might be zero if no data was written. + */ + int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap); }; +ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, + struct iomap_ops *ops); +int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, + bool *did_zero, struct iomap_ops *ops); +int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + struct iomap_ops *ops); +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); + #endif /* LINUX_IOMAP_H */ -- cgit v0.10.2 From 9a286f0e52a2dac362caf78a458efa8f3c05b99e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:31:39 +1000 Subject: fs: support DAX based iomap zeroing This avoid needing a separate inefficient get_block based DAX zero_range implementation in file systems. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/iomap.c b/fs/iomap.c index 8e2fc17..477817c 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, @@ -268,6 +269,15 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page); } +static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, + struct iomap *iomap) +{ + sector_t sector = iomap->blkno + + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + + return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); +} + static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, void *data, struct iomap *iomap) @@ -286,7 +296,10 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, offset = pos & (PAGE_SIZE - 1); /* Within page */ bytes = min_t(unsigned, PAGE_SIZE - offset, count); - status = iomap_zero(inode, pos, offset, bytes, iomap); + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); + else + status = iomap_zero(inode, pos, offset, bytes, iomap); if (status < 0) return status; -- cgit v0.10.2 From 8be9f564d25e7adc582f9f3689040ce5aa6f1f5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:38:45 +1000 Subject: fs: iomap based fiemap implementation Add a simple fiemap implementation based on iomap_ops, partially based on a previous implementation from Bob Peterson . Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/iomap.c b/fs/iomap.c index 477817c..48141b8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -405,3 +405,93 @@ out_unlock: return ret; } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +struct fiemap_ctx { + struct fiemap_extent_info *fi; + struct iomap prev; +}; + +static int iomap_to_fiemap(struct fiemap_extent_info *fi, + struct iomap *iomap, u32 flags) +{ + switch (iomap->type) { + case IOMAP_HOLE: + /* skip holes */ + return 0; + case IOMAP_DELALLOC: + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; + break; + case IOMAP_UNWRITTEN: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; + } + + return fiemap_fill_next_extent(fi, iomap->offset, + iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, + iomap->length, flags | FIEMAP_EXTENT_MERGED); + +} + +static loff_t +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct fiemap_ctx *ctx = data; + loff_t ret = length; + + if (iomap->type == IOMAP_HOLE) + return length; + + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); + ctx->prev = *iomap; + switch (ret) { + case 0: /* success */ + return length; + case 1: /* extent array full */ + return 0; + default: + return ret; + } +} + +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, + loff_t start, loff_t len, struct iomap_ops *ops) +{ + struct fiemap_ctx ctx; + loff_t ret; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fi = fi; + ctx.prev.type = IOMAP_HOLE; + + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, start, len, 0, ops, &ctx, + iomap_fiemap_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + start += ret; + len -= ret; + } + + if (ctx.prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (ret < 0) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_fiemap); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d2f469a..3267df4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -3,6 +3,7 @@ #include +struct fiemap_extent_info; struct inode; struct iov_iter; struct kiocb; @@ -63,5 +64,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, struct iomap_ops *ops); int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops); +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + loff_t start, loff_t len, struct iomap_ops *ops); #endif /* LINUX_IOMAP_H */ -- cgit v0.10.2 From 3b3dce05279b97525741b7949208017307e05621 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:52:47 +1000 Subject: xfs: make xfs_bmbt_to_iomap available outside of xfs_pnfs.c And ensure it works for RT subvolume files an set the block device, both of which will be needed to be able to use the function in the buffered write path. Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5839135..2f37194 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -940,3 +941,29 @@ error_on_bmapi_transaction: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e66..718f07c 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,7 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +struct iomap; struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +30,7 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index db3c7df..0f14b2e 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -80,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. */ -- cgit v0.10.2 From f0c6bcba74ac51cb77aadb33ad35cb2dc1ad1506 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:52:47 +1000 Subject: xfs: reorder zeroing and flushing sequence in truncate Currently zeroing out blocks and waiting for writeout is a bit of a mess in truncate. This patch gives it a clear order in preparation for the iomap path: (1) we first wait for any direct I/O to complete to prevent any races for it (2) we then perform the actual zeroing, and only use the truncate_page helpers for truncating down. The truncate up case already is handled by the separate call to xfs_zero_eof. (3) only then we write back dirty data, as zeroing block may cause dirty pages when using either xfs_zero_eof or the new iomap infrastructure. Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5d4eba..1a5ca4b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -801,20 +801,35 @@ xfs_setattr_size( return error; /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + if (IS_DAX(inode)) { + error = dax_truncate_page(inode, newsize, + xfs_get_blocks_direct); + } else { + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); + } } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -831,9 +846,6 @@ xfs_setattr_size( return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -851,13 +863,6 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); -- cgit v0.10.2 From 68a9f5e7007c1afa2cf6830b690a90d0187c0684 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:53:44 +1000 Subject: xfs: implement iomap based buffered write path Convert XFS to use the new iomap based multipage write path. This involves implementing the ->iomap_begin and ->iomap_end methods, and switching the buffered file write, page_mkwrite and xfs_iozero paths to the new iomap helpers. With this change __xfs_get_blocks will never be used for buffered writes, and the code handling them can be removed. Based on earlier code from Dave Chinner. Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4d..35faf12 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b9..2ac9f7e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1427,216 +1427,6 @@ xfs_vm_direct_IO( xfs_get_blocks_direct, endio, NULL, flags); } -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. - */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc632..7316d38 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include #include @@ -79,57 +80,27 @@ xfs_rw_ilock_demote( inode_unlock(VFS_I(ip)); } -/* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. - */ -int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +static int +xfs_dax_zero_range( + struct inode *inode, + loff_t pos, + size_t count) { - struct page *page; - struct address_space *mapping; int status = 0; - - mapping = VFS_I(ip)->i_mapping; do { unsigned offset, bytes; - void *fsdata; offset = (pos & (PAGE_SIZE -1)); /* Within page */ bytes = PAGE_SIZE - offset; if (bytes > count) bytes = count; - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); + status = dax_zero_page_range(inode, pos, bytes, + xfs_get_blocks_direct); + if (status) + break; - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } pos += bytes; count -= bytes; } while (count); @@ -137,6 +108,24 @@ xfs_iozero( return status; } +/* + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. + */ +int +xfs_iozero( + struct xfs_inode *ip, + loff_t pos, + size_t count) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(VFS_I(ip))) + return xfs_dax_zero_range(inode, pos, count); + else + return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops); +} + int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -841,7 +830,7 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -1553,7 +1542,7 @@ xfs_filemap_page_mkwrite( if (IS_DAX(inode)) { ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2f37194..620fc91 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -967,3 +967,147 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); } + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 718f07c..e066d04 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,7 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -struct iomap; +#include + struct xfs_inode; struct xfs_bmbt_irec; @@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a5ca4b..5d1fdae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include #include @@ -822,8 +823,8 @@ xfs_setattr_size( error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); } else { - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); + error = iomap_truncate_page(inode, newsize, + &did_zeroing, &xfs_iomap_ops); } } @@ -838,8 +839,8 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0..bb24ce7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), -- cgit v0.10.2 From 6e8a27a816390d1fbcc28903da4dd9bc348e19ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:53:45 +1000 Subject: xfs: remove buffered write support from __xfs_get_blocks Signed-off-by: Christoph Hellwig Reviewed-by: Bob Peterson Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2ac9f7e..80714eb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1143,6 +1143,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1150,22 +1152,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1184,37 +1178,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1235,9 +1211,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1250,7 +1224,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1279,14 +1253,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; -- cgit v0.10.2 From d2bb140e99e6710c1b46e38a6347ada364aadfc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:54:53 +1000 Subject: xfs: use iomap fiemap implementation Note that this removes support for the untested FIEMAP_FLAG_XATTR. It could be added relatively easily with iomap ops for the attr fork, but without test coverage I don't feel safe doing this. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5d1fdae..985a263 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include /* @@ -1004,51 +1004,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1056,38 +1011,13 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int -- cgit v0.10.2 From 459f0fbc2a827da37bbfaf3cae8da4ad8884da12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:55:18 +1000 Subject: xfs: use iomap infrastructure for DAX zeroing Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7316d38..090a90f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,34 +80,6 @@ xfs_rw_ilock_demote( inode_unlock(VFS_I(ip)); } -static int -xfs_dax_zero_range( - struct inode *inode, - loff_t pos, - size_t count) -{ - int status = 0; - - do { - unsigned offset, bytes; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = dax_zero_page_range(inode, pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - - pos += bytes; - count -= bytes; - } while (count); - - return status; -} - /* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. @@ -118,12 +90,7 @@ xfs_iozero( loff_t pos, size_t count) { - struct inode *inode = VFS_I(ip); - - if (IS_DAX(VFS_I(ip))) - return xfs_dax_zero_range(inode, pos, count); - else - return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 985a263..ab820f8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -819,13 +819,8 @@ xfs_setattr_size( if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); } else { - if (IS_DAX(inode)) { - error = dax_truncate_page(inode, newsize, - xfs_get_blocks_direct); - } else { - error = iomap_truncate_page(inode, newsize, - &did_zeroing, &xfs_iomap_ops); - } + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } if (error) -- cgit v0.10.2 From 7bb41db3ea160ea55cc46af07e45f7cb1e2968ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:56:26 +1000 Subject: xfs: handle 64-bit length in xfs_iozero We'll want to use this code for large offsets now that we're skipping holes and unwritten extents efficiently. Also rename it to xfs_zero_range to be a bit more descriptive, and tell the caller if we actually did any zeroing. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 090a90f..294e5f4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -85,10 +85,11 @@ xfs_rw_ilock_demote( * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( +xfs_zero_range( struct xfs_inode *ip, - loff_t pos, - size_t count) + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } @@ -419,7 +420,7 @@ xfs_zero_last_block( if (isize + zero_len > offset) zero_len = offset - isize; *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); + return xfs_zero_range(ip, isize, zero_len, NULL); } /* @@ -518,7 +519,7 @@ xfs_zero_eof( if ((zero_off + zero_len) > offset) zero_len = offset - zero_off; - error = xfs_iozero(ip, zero_off, zero_len); + error = xfs_zero_range(ip, zero_off, zero_len, NULL); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7..dbb0bcf 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -434,7 +434,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); -- cgit v0.10.2 From 570b6211b85692f408cbe47664ab2378eb9519ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 09:57:26 +1000 Subject: xfs: use xfs_zero_range in xfs_zero_eof We now skip holes in it, so no need to have the caller do it as well. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 294e5f4..713991c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -381,49 +381,6 @@ out: } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_zero_range(ip, isize, zero_len, NULL); -} - -/* * Zero any on disk space between the current EOF and the new, larger EOF. * * This handles the normal case of zeroing the remainder of the last block in @@ -441,94 +398,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_zero_range(ip, zero_off, zero_len, NULL); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* -- cgit v0.10.2 From bdb0d04fa66d8d02219ca7c027adf810dd75e9e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 10:00:55 +1000 Subject: xfs: split xfs_free_file_space in manageable pieces Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 586bb64..e36664f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1184,30 +1184,132 @@ xfs_zero_remaining_bytes( return error; } +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &free_list, done); + if (error) + goto out_bmap_cancel; + + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} + +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; + + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; + } + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; +} + int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done, error; trace_xfs_free_file_space(ip); @@ -1215,60 +1317,30 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to ensure + * to zero the whole extent, otherwise we just need to take of block + * boundaries, and xfs_bunmapi will handle the rest. */ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } + return error; } + if ((done = (endoffset_fsb <= startoffset_fsb))) /* * One contiguous piece to clear @@ -1288,62 +1360,12 @@ xfs_free_file_space( offset + len - 1); } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); } - out: return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; } /* -- cgit v0.10.2 From 3c2bdc912a1cc050db7e858aabe564cb382c9c30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2016 10:02:23 +1000 Subject: xfs: kill xfs_zero_remaining_bytes Instead punch the whole first, and the use the our zeroing helper to punch out the edge blocks. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e36664f..569f03d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1089,101 +1089,6 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) -{ - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; - - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. - */ - if (startoff >= XFS_ISIZE(ip)) - return 0; - - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); - - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; - - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); - - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } - - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; - - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } - - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; - - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; - } - return error; -} - static int xfs_unmap_extent( struct xfs_inode *ip, @@ -1309,7 +1214,7 @@ xfs_free_file_space( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; - int done, error; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1341,31 +1246,21 @@ xfs_free_file_space( return error; } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); - } - - while (!error && !done) { - error = xfs_unmap_extent(ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, &done); + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; + } } - return error; + /* + * Now that we've unmap all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created. + */ + return xfs_zero_range(ip, offset, len, NULL); } /* -- cgit v0.10.2 From fa5a4f57ddbece604e07abfe98e01f520635411d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: cancel eofblocks background trimming on remount read-only The filesystem quiesce sequence performs the operations necessary to drain all background work, push pending transactions through the log infrastructure and wait on I/O resulting from the final AIL push. We have had reports of remount,ro hangs in xfs_log_quiesce() -> xfs_wait_buftarg(), however, and some instrumentation code to detect transaction commits at this point in the quiesce sequence has inculpated the eofblocks background scanner as a cause. While higher level remount code generally prevents user modifications by the time the filesystem has made it to xfs_log_quiesce(), the background scanner may still be alive and can perform pending work at any time. If this occurs between the xfs_log_force() and xfs_wait_buftarg() calls within xfs_log_quiesce(), this can lead to an indefinite lockup in xfs_wait_buftarg(). To prevent this problem, cancel the background eofblocks scan worker during the remount read-only quiesce sequence. This suspends background trimming when a filesystem is remounted read-only. This is only done in the remount path because the freeze codepath has already locked out new transactions by the time the filesystem attempts to quiesce (and thus waiting on an active work item could deadlock). Kick the eofblocks worker to pick up where it left off once an fs is remounted back to read-write. Signed-off-by: Brian Foster Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 99ee6eee..fb39a66 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -765,7 +765,7 @@ restart: * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ -STATIC void +void xfs_queue_eofblocks( struct xfs_mount *mp) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91..05bac99 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); +void xfs_queue_eofblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11ea5d5..2d28108 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1294,6 +1294,7 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_queue_eofblocks(mp); } /* rw -> ro */ @@ -1306,6 +1307,13 @@ xfs_fs_remount( * return it to the same size. */ xfs_save_resvblks(mp); + + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } -- cgit v0.10.2 From 408fd484618c48414eb52c86a48f11794de9a248 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly xfs_reserve_blocks() is responsible to update the XFS reserved block pool count at mount time or based on user request. When the caller requests to increase the reserve pool, blocks must be allocated from the global counters such that they are no longer available for general purpose use. If the requested reserve pool size is too large, XFS reserves what blocks are available. The implementation requires looking at the percpu counters and making an educated guess as to how many blocks to try and allocate from xfs_mod_fdblocks(), which can return -ENOSPC if the guess was not accurate due to counters being modified in parallel. xfs_reserve_blocks() retries the guess in this scenario until the allocation succeeds or it is determined that there is no space available in the fs. While not easily reproducible in the current form, the retry code doesn't actually work correctly if xfs_mod_fdblocks() actually fails. The problem is that the percpu calculations use the m_resblks counter to determine how many blocks to allocate, but unconditionally update m_resblks before the block allocation has actually succeeded. Therefore, if xfs_mod_fdblocks() fails, the code jumps to the retry label and uses the already updated m_resblks value to determine how many blocks to try and allocate. If the percpu counters previously suggested that the entire request was available, fdblocks_delta could end up set to 0. In that case, m_resblks is updated to the requested value, yet no blocks have been reserved at all. Refactor xfs_reserve_blocks() to use an explicit loop and make the code easier to follow. Since we have to drop the spinlock across the xfs_mod_fdblocks() call, use a delta value for m_resblks as well and only apply the delta once allocation succeeds. [dchinner: convert to do {} while() loop] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b4d7582..7191c38 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -667,8 +667,11 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta, fdblks_delta; + __int64_t lcounter, delta; + __int64_t fdblks_delta = 0; __uint64_t request; + __int64_t free; + int error = 0; /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { @@ -682,24 +685,23 @@ xfs_reserve_blocks( request = *inval; /* - * With per-cpu counters, this becomes an interesting - * problem. we needto work out if we are freeing or allocation - * blocks first, then we can do the modification as necessary. + * With per-cpu counters, this becomes an interesting problem. we need + * to work out if we are freeing or allocation blocks first, then we can + * do the modification as necessary. * - * We do this under the m_sb_lock so that if we are near - * ENOSPC, we will hold out any changes while we work out - * what to do. This means that the amount of free space can - * change while we do this, so we need to retry if we end up - * trying to reserve more space than is available. + * We do this under the m_sb_lock so that if we are near ENOSPC, we will + * hold out any changes while we work out what to do. This means that + * the amount of free space can change while we do this, so we need to + * retry if we end up trying to reserve more space than is available. */ -retry: spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, - * then move any unused blocks back to the free pool. + * then move any unused blocks back to the free pool. Modify the resblks + * counters directly since we shouldn't have any problems unreserving + * space. */ - fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ @@ -707,54 +709,67 @@ retry: mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; - } else { - __int64_t free; + if (fdblks_delta) { + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } + + goto out; + } + /* + * If the request is larger than the current reservation, reserve the + * blocks before we update the reserve counters. Sample m_fdblocks and + * perform a partial reservation if the request exceeds free space. + */ + error = -ENOSPC; + do { free = percpu_counter_sum(&mp->m_fdblocks) - XFS_ALLOC_SET_ASIDE(mp); if (!free) - goto out; /* ENOSPC and fdblks_delta = 0 */ + break; delta = request - mp->m_resblks; lcounter = free - delta; - if (lcounter < 0) { + if (lcounter < 0) /* We can't satisfy the request, just get what we can */ - mp->m_resblks += free; - mp->m_resblks_avail += free; - fdblks_delta = -free; - } else { - fdblks_delta = -delta; - mp->m_resblks = request; - mp->m_resblks_avail += delta; - } - } -out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); + fdblks_delta = free; + else + fdblks_delta = delta; - if (fdblks_delta) { /* - * If we are putting blocks back here, m_resblks_avail is - * already at its max so this will put it in the free pool. - * - * If we need space, we'll either succeed in getting it - * from the free block count or we'll get an enospc. If - * we get a ENOSPC, it means things changed while we were - * calculating fdblks_delta and so we should try again to - * see if there is anything left to reserve. + * We'll either succeed in getting space from the free block + * count or we'll get an ENOSPC. If we get a ENOSPC, it means + * things changed while we were calculating fdblks_delta and so + * we should try again to see if there is anything left to + * reserve. * * Don't set the reserved flag here - we don't want to reserve * the extra reserve blocks from the reserve..... */ - int error; - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); - if (error == -ENOSPC) - goto retry; + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } while (error == -ENOSPC); + + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error && fdblks_delta) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; } - return 0; + +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + + spin_unlock(&mp->m_sb_lock); + return error; } int -- cgit v0.10.2 From 3f94c441e2c3dea029a46a2326b2170acf2c7713 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: check offsets of variable length structures Some of the directory/attr structures contain variable-length objects, so the enclosing structure doesn't have a meaningful fixed size at compile time. We can check the offsets of the members before the variable-length member, so do those. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44e..0272301 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -22,6 +22,11 @@ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ #structname ") is wrong, expected " #size) +#define XFS_CHECK_OFFSET(structname, member, off) \ + BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + "XFS: offsetof(" #structname ", " #member ") is wrong, " \ + "expected " #off) + static inline void __init xfs_check_ondisk_structs(void) { @@ -75,15 +80,28 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); @@ -94,6 +112,9 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); -- cgit v0.10.2 From 479c641273df632478cda7fe76b833df64e319bc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: enable buffer deadlock postmortem diagnosis via ftrace Create a second buf_trylock tracepoint so that we can distinguish between a successful and a failed trylock. With this piece, we can use a script to look at the ftrace output to detect buffer deadlocks. [dchinner: update to if/else as per hch's suggestion] Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e71cfbd..f14daeb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -944,10 +944,12 @@ xfs_buf_trylock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) + if (locked) { XB_SET_OWNER(bp); - - trace_xfs_buf_trylock(bp, _RET_IP_); + trace_xfs_buf_trylock(bp, _RET_IP_); + } else { + trace_xfs_buf_trylock_fail(bp, _RET_IP_); + } return locked; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0..68f27f7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); -- cgit v0.10.2 From 128f24d5d9def3c47b6b659b2454f0426a347144 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: check for a valid error_tag in errortag_add Currently we don't check the error_tag when someone's trying to set up error injection testing. If userspace passes in a value we don't know about, send back an error. This will help xfstests to _notrun a test that uses error injection to test things like log replay. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a9..355619a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,9 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) int len; int64_t fsid; + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { -- cgit v0.10.2 From 59bad075bd135979b2a484c30f6bcf28d17b8689 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: rearrange xfs_bmap_add_free parameters This is already in xfsprogs' libxfs, so port it to the kernel. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 932381c..8847496 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -570,10 +570,10 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_bmap_free *flist, /* list of extents */ xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ + xfs_filblks_t len) /* length of extent */ { xfs_bmap_free_item_t *cur; /* current (next) element */ xfs_bmap_free_item_t *new; /* new element */ @@ -699,7 +699,7 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -5073,8 +5073,8 @@ xfs_bmap_del_extent( * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); + xfs_bmap_add_free(mp, flist, del->br_startblock, + del->br_blockcount); /* * Adjust inode # blocks in the file. */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 423a34e..e081c76 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -191,8 +191,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist, + xfs_fsblock_t bno, xfs_filblks_t len); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6282f6e..db0c71e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -526,7 +526,7 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 22297f9..e3c0af7 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks); return; } @@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; -- cgit v0.10.2 From 4d89e20bf1b12bd5aa6917efc86da723b331deef Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: separate freelist fixing into a separate helper Break up xfs_free_extent() into a helper that fixes the freelist. This helper will be used subsequently to ensure the freelist during deferred rmap processing. [darrick: refactor to put this at the head of the patchset] Signed-off-by: Dave Chinner Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38..638657a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2658,55 +2658,79 @@ error0: return error; } -/* - * Free an extent. - * Just break up the extent address and hand off to xfs_free_ag_extent - * after fixing up the freelist. - */ -int /* error */ -xfs_free_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ +/* Ensure that the freelist is at full capacity. */ +int +xfs_free_extent_fix_freelist( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **agbp) { - xfs_alloc_arg_t args; - int error; + struct xfs_alloc_arg args; + int error; - ASSERT(len != 0); - memset(&args, 0, sizeof(xfs_alloc_arg_t)); + memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; + args.agno = agno; /* * validate that the block number is legal - the enables us to detect * and handle a silent filesystem corruption rather than crashing. */ - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - if (args.agbno >= args.mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto error0; + goto out; + + *agbp = args.agbp; +out: + xfs_perag_put(args.pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + + ASSERT(len != 0); + + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + + XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); /* validate the extent size is legal now we have the agf locked */ - if (args.agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = -EFSCORRUPTED; - goto error0; - } + XFS_WANT_CORRUPTED_GOTO(mp, + agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), + err); - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); - if (!error) - xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); -error0: - xfs_perag_put(args.pag); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + if (error) + goto err; + + xfs_extent_busy_insert(tp, agno, agbno, len, 0); + return 0; + +err: + xfs_trans_brelse(tp, agbp); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d..f5b35dc 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -236,5 +236,7 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **agbp); #endif /* __XFS_ALLOC_H__ */ -- cgit v0.10.2 From e66a4c678e64932eb4befd95a348b9632603d27c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: convert list of extents to free into a regular list In struct xfs_bmap_free, convert the open-coded free extent list to a regular list, then use list_sort to sort it prior to processing. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8847496..2f2c85c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,9 +575,7 @@ xfs_bmap_add_free( xfs_fsblock_t bno, /* fs block number of extent */ xfs_filblks_t len) /* length of extent */ { - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ + struct xfs_bmap_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -597,17 +595,7 @@ xfs_bmap_add_free( new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); new->xbfi_startblock = bno; new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; + list_add(&new->xbfi_list, &flist->xbf_flist); flist->xbf_count++; } @@ -617,14 +605,10 @@ xfs_bmap_add_free( */ void xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ + struct xfs_bmap_free *flist, /* free item list header */ + struct xfs_bmap_free_item *free) /* list item to be freed */ { - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; + list_del(&free->xbfi_list); flist->xbf_count--; kmem_zone_free(xfs_bmap_free_item_zone, free); } @@ -634,17 +618,16 @@ xfs_bmap_del_free( */ void xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ + struct xfs_bmap_free *flist) /* list of bmap_free_items */ { - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; + struct xfs_bmap_free_item *free; /* free list item */ if (flist->xbf_count == 0) return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); + xfs_bmap_del_free(flist, free); } ASSERT(flist->xbf_count == 0); } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e081c76..f1f3ae6 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -62,12 +62,12 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -typedef struct xfs_bmap_free_item +struct xfs_bmap_free_item { xfs_fsblock_t xbfi_startblock;/* starting fs block number */ xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ -} xfs_bmap_free_item_t; + struct list_head xbfi_list; +}; /* * Header for free extent list. @@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item */ typedef struct xfs_bmap_free { - xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + struct list_head xbf_flist; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; @@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { - ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ - (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); + INIT_LIST_HEAD(&flp->xbf_flist); + flp->xbf_count = 0; + flp->xbf_low = 0; + *fbp = NULLFSBLOCK; } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 586bb64..c53e07a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -79,6 +79,23 @@ xfs_zero_extent( GFP_NOFS, true); } +/* Sort bmap items by AG. */ +static int +xfs_bmap_free_list_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_bmap_free_item *ra; + struct xfs_bmap_free_item *rb; + + ra = container_of(a, struct xfs_bmap_free_item, xbfi_list); + rb = container_of(b, struct xfs_bmap_free_item, xbfi_list); + return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock); +} + /* * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done @@ -99,14 +116,15 @@ xfs_bmap_finish( int error; /* error return value */ int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ - struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) return 0; + list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) + list_for_each_entry(free, &flist->xbf_flist, xbfi_list) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); @@ -138,15 +156,15 @@ xfs_bmap_finish( * on error. */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); if (error) return error; - xfs_bmap_del_free(flist, NULL, free); + xfs_bmap_del_free(flist, free); } return 0; @@ -799,7 +817,7 @@ xfs_bmap_punch_delalloc_range( if (error) break; - ASSERT(!flist.xbf_count && !flist.xbf_first); + ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist)); next_block: start_fsb++; remaining--; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a..4ec85d1 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -43,7 +43,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *prev, struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2d28108..5f3c729 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1700,8 +1700,9 @@ xfs_init_zones(void) if (!xfs_log_ticket_zone) goto out_free_ioend_bioset; - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_zone_init( + sizeof(struct xfs_bmap_free_item), + "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; -- cgit v0.10.2 From 19b54ee66c4c5de8f8db74d5914d9a97161460bf Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Jun 2016 11:53:28 +1000 Subject: xfs: refactor btree maxlevels computation Create a common function to calculate the maximum height of a per-AG btree. This will eventually be used by the rmapbt and refcountbt code to calculate appropriate maxlevels values for each. This is important because the verifiers and the transaction block reservations depend on accurate estimates of how many blocks are needed to satisfy a btree split. We were mistakenly using the max bnobt height for all the btrees, which creates a dangerous situation since the larger records and keys in an rmapbt make it very possible that the rmapbt will be taller than the bnobt and so we can run out of transaction block reservation. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 638657a..e56991d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1839,19 +1839,8 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_ag_maxlevels = level; + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + (mp->m_sb.sb_agblocks + 1) / 2); } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1c..a6779b3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify( return true; } + +/* + * Calculate the number of btree levels needed to store a given number of + * records in a short-format btree. + */ +uint +xfs_btree_compute_maxlevels( + struct xfs_mount *mp, + uint *limits, + unsigned long len) +{ + uint level; + unsigned long maxblocks; + + maxblocks = (len + limits[0] - 1) / limits[0]; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + limits[1] - 1) / limits[1]; + return level; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2e874be..785a996 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, + unsigned long len); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e3c0af7..4b1e408 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2394,20 +2394,11 @@ void xfs_ialloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> - XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_inobt_mnr[0]; - minnoderecs = mp->m_inobt_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_in_maxlevels = level; + uint inodes; + + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + inodes); } /* -- cgit v0.10.2 From 7f1b62457b58f9bb586a1b2ff7fe271b56196bd2 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jul 2016 10:30:30 +1000 Subject: xfs: fix type confusion in xfs_ioc_swapext When calling fdget() in xfs_ioc_swapext(), we need to verify that the file descriptors passed into the ioctl point to XFS inodes before we start operations on them. If we don't do this, we could be referencing arbitrary kernel memory as an XFS inode. THis could lead to memory corruption and/or performing locking operations on attacker-chosen structures in kernel memory. [dchinner: rewrite commit message ] [dchinner: add comment explaining new check ] Signed-off-by: Jann Horn Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index dbca737..408f3ad 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1575,6 +1575,17 @@ xfs_ioc_swapext( goto out_put_tmp_file; } + /* + * We need to ensure that the fds passed in point to XFS inodes + * before we cast and access them as XFS structures as we have no + * control over what the user passes us here. + */ + if (f.file->f_op != &xfs_file_operations || + tmp.file->f_op != &xfs_file_operations) { + error = -EINVAL; + goto out_put_tmp_file; + } + ip = XFS_I(file_inode(f.file)); tip = XFS_I(file_inode(tmp.file)); -- cgit v0.10.2 From fbc21f33cda0a8e13ebd71fe2e23a21d4b79afbb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jul 2016 10:37:13 +1000 Subject: xfs: don't allow negative error tags Errors go from zero which means no error to XFS_ERRTAG_MAX (22). My static checker complains that xfs_errortag_add() puts an upper bound on this but not a lower bound. Let's fix it by making it unsigned. Signed-off-by: Dan Carpenter Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a9..acd9413 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,7 +55,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, } int -xfs_errortag_add(int error_tag, xfs_mount_t *mp) +xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) { int i; int len; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 4ed3042..2e4f67f 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) -- cgit v0.10.2 From fbfb24bf105449eab1339c20f6f6b81d02c59c13 Mon Sep 17 00:00:00 2001 From: Kaho Ng Date: Wed, 20 Jul 2016 10:37:50 +1000 Subject: xfs: indentation fix in xfs_btree_get_iroot() The indentation in this function is different from the other functions. Those spacebars are converted to tabs to improve readability. Signed-off-by: Kaho Ng Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1c..4f84dde 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -543,12 +543,12 @@ xfs_btree_ptr_addr( */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( - struct xfs_btree_cur *cur) + struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); - return (struct xfs_btree_block *)ifp->if_broot; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* -- cgit v0.10.2 From ad70328a503fae813a563dbe97dd3466ac079e8e Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 20 Jul 2016 10:43:11 +1000 Subject: xfs: remove the magic numbers in xfs_btree_block-related len macros replace the magic numbers by offsetof(...) and sizeof(...), and add two extra checks on xfs_check_ondisk_structs() [dchinner: renamed header structures to be more descriptive] Signed-off-by: Hou Tao Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index dc97eb21..adb204d 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; * with the crc feature bit, and all accesses to them must be conditional on * that flag. */ +/* short form block header */ +struct xfs_btree_block_shdr { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; +}; + +/* long form block header */ +struct xfs_btree_block_lhdr { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ +}; + struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ + struct xfs_btree_block_shdr s; + struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* size of a short form block */ +#define XFS_BTREE_SBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_shdr, bb_blkno)) +/* size of a long form block */ +#define XFS_BTREE_LBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) +#define XFS_BTREE_SBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_shdr)) +#define XFS_BTREE_LBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44e..20a0d26 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -34,6 +34,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); -- cgit v0.10.2 From e97f6c545f963abd7de56a58a29ba73a9edee015 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 20 Jul 2016 10:48:51 +1000 Subject: xfs: fix xfs_error_get_cfg for negative errnos xfs_error_get_cfg() is called with bp->b_error as an arg, which is negative, so the switch statement won't ever find any matches. This results in only the default error handler having any effect, as EIO/ENOSPC/ENODEV get ignored due to the wrong sign. It seems simplest to always flip the error sign to positive, so that we can handle either negative errors in bp->b_error, or possibly a positive errno via something like xfs_error_get_cfg(EIO) - this future-proofs the function. Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 4c2c550..79cfd3f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -634,6 +634,9 @@ xfs_error_get_cfg( { struct xfs_error_cfg *cfg; + if (error < 0) + error = -error; + switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; -- cgit v0.10.2 From 0b4db5dff3599b46957bfd8a4c66945c915e26d3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 20 Jul 2016 10:53:22 +1000 Subject: xfs: remove extraneous buffer flag changes Fix up a couple places where extra flag manipulation occurs. In the first case we clear XBF_ASYNC and then immediately reset it - so don't bother clearing in the first place. In the 2nd case we are at a point in the function where the buffer must already be async, so there is no need to reset it. Add consistent spacing around the " | " while we're at it. Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e71cfbd..5d52e44 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1816,7 +1816,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3425799..6a2f429 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1080,10 +1080,9 @@ xfs_buf_iodone_callback_error( * async write failure at least once, but we also need to set the buffer * up to behave correctly now for repeated failures. */ - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL); + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; bp->b_retries = 0; bp->b_first_retry_time = jiffies; -- cgit v0.10.2 From 5539d36752eb789f4067a9f88e72177895d56317 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 20 Jul 2016 10:54:09 +1000 Subject: xfs: don't reset b_retries to 0 on every failure With the code as it stands today, b_retries never increments because it gets reset to 0 in the error callback. Remove that, and fix a similar problem where the first retry time was constantly being overwritten, which defeated the timeout tunable as well. We now only set first retry time if a non-zero timeout is set, to match the behavior of only incrementing retries if a retry value is set. This way max retries & timeouts consistently take effect after a tunable is set, rather than acting retroactively on a buffer which has failed at some point in the past and has accumulated state from those prior failures. Thanks to dchinner for talking through this with me. Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 6a2f429..3b19e52 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1073,6 +1073,8 @@ xfs_buf_iodone_callback_error( trace_xfs_buf_item_iodone_async(bp, _RET_IP_); ASSERT(bp->b_iodone != NULL); + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the * error. If this is the first failure of this type, clear the error @@ -1084,8 +1086,8 @@ xfs_buf_iodone_callback_error( bp->b_last_error != bp->b_error) { bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - bp->b_retries = 0; - bp->b_first_retry_time = jiffies; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); xfs_buf_submit(bp); @@ -1096,7 +1098,6 @@ xfs_buf_iodone_callback_error( * Repeated failure on an async write. Take action according to the * error configuration we have been set up to use. */ - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) -- cgit v0.10.2 From c891c30a4dd1a236bb98630b35fc2769c5ce0d40 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 20 Jul 2016 11:13:43 +1000 Subject: xfs: exclude never-released buffers from buftarg I/O accounting The upcoming buftarg I/O accounting mechanism maintains a count of all buffers that have undergone I/O in the current hold-release cycle. Certain buffers associated with core infrastructure (e.g., the xfs_mount superblock buffer, log buffers) are never released, however. This means that accounting I/O submission on such buffers elevates the buftarg count indefinitely and could lead to lockup on unmount. Define a new buffer flag to explicitly exclude buffers from buftarg I/O accounting. Set the flag on the superblock and associated log buffers. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5d52e44..c0bd5e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -815,7 +815,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8bfb974..e2108da 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bde02f1..216aaa2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1415,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1454,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e39b023..970c19b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -272,13 +272,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); -- cgit v0.10.2 From 9c7504aa72b6e2104ba6dcef518c15672ec51175 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 20 Jul 2016 11:15:28 +1000 Subject: xfs: track and serialize in-flight async buffers against unmount Newly allocated XFS metadata buffers are added to the LRU once the hold count is released, which typically occurs after I/O completion. There is no other mechanism at current that tracks the existence or I/O state of a new buffer. Further, readahead I/O tends to be submitted asynchronously by nature, which means the I/O can remain in flight and actually complete long after the calling context is gone. This means that file descriptors or any other holds on the filesystem can be released, allowing the filesystem to be unmounted while I/O is still in flight. When I/O completion occurs, core data structures may have been freed, causing completion to run into invalid memory accesses and likely to panic. This problem is reproduced on XFS via directory readahead. A filesystem is mounted, a directory is opened/closed and the filesystem immediately unmounted. The open/close cycle triggers a directory readahead that if delayed long enough, runs buffer I/O completion after the unmount has completed. To address this problem, add a mechanism to track all in-flight, asynchronous buffers using per-cpu counters in the buftarg. The buffer is accounted on the first I/O submission after the current reference is acquired and unaccounted once the buffer is returned to the LRU or freed. Update xfs_wait_buftarg() to wait on all in-flight I/O before walking the LRU list. Once in-flight I/O has completed and the workqueue has drained, all new buffers should have been released onto the LRU. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c0bd5e0..2722cb4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -80,6 +80,47 @@ xfs_buf_vmap_len( } /* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for sychronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg. + */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + +/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -102,6 +143,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -867,63 +916,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. + */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -1340,6 +1411,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1525,13 +1597,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); drain_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1628,6 +1706,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1692,6 +1772,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index e2108da..1c2e52b 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -63,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -82,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -116,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; -- cgit v0.10.2 From 8f3e2058e1746dc3fb8145f8fbd5ee358cbc1a30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:29:35 +1000 Subject: xfs: don't pass ioflags around in the ioctl path Instead check the file pointer for the invisble I/O flag directly, and use the chance to drop redundant arguments from the xfs_ioc_space prototype. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index dbca737..6ab5a24 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -595,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -626,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1479,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1619,12 +1617,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1643,7 +1637,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1702,7 +1696,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7..8b52881 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8a..321f577 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); -- cgit v0.10.2 From 3176c3e0ef32963aa5f6f9754142e420a4ba5d64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:31:42 +1000 Subject: xfs: kill ioflags Now that we have the direct I/O kiocb flag there is no real need to sample the value inside of XFS, and the invis flag was always just partially used and isn't worth keeping this infrastructure around for. This also splits the read tracepoint into buffered vs direct as we've done for writes a long time ago. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc632..e51622c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -292,18 +292,12 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - int ioflags = 0; xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -336,7 +330,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -370,7 +364,10 @@ xfs_file_read_iter( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + if (iocb->ki_flags & IOCB_DIRECT) + trace_xfs_file_direct_read(ip, size, pos); + else + trace_xfs_file_buffered_read(ip, size, pos); ret = generic_file_read_iter(iocb, to); if (ret > 0) @@ -389,18 +386,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot ues the page cache for splice, so we have to push @@ -789,7 +782,7 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; ret = mapping->a_ops->direct_IO(iocb, &data); @@ -839,8 +832,7 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = generic_perform_write(file, from, iocb->ki_pos); if (likely(ret >= 0)) iocb->ki_pos += ret; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e52d7c7..57b66d2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -479,14 +479,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0..a1bc5c6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1134,15 +1134,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1150,23 +1149,21 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) -DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -- cgit v0.10.2 From cf810712cc82cbfab8f08a46ca6c0289d386a303 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:31:53 +1000 Subject: xfs: remove s_maxbytes enforcement in xfs_file_read_iter All the three low-level read implementations that we might call already take care of not overflowing the maximum supported bytes, no need to duplicate it here. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e51622c..7ec8225 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -292,7 +292,6 @@ xfs_file_read_iter( struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; - xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(mp, xs_read_calls); @@ -309,13 +308,6 @@ xfs_file_read_iter( } } - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; - - if (n < size) - size = n; - if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; -- cgit v0.10.2 From bbc5a740c4f27a9732a3a3decf3186b4bce21108 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:35:42 +1000 Subject: xfs: split xfs_file_read_iter into buffered and direct I/O helpers Similar to what we did on the write side a while ago. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7ec8225..fdb123f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -282,35 +282,33 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + size_t count = iov_iter_count(to); + struct xfs_buftarg *target; ssize_t ret = 0; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - if ((iocb->ki_flags & IOCB_DIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; + + if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == i_size_read(inode)) return 0; return -EINVAL; } } - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -322,7 +320,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -337,8 +335,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -349,23 +347,56 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } + ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + if (iocb->ki_flags & IOCB_DIRECT) - trace_xfs_file_direct_read(ip, size, pos); + ret = xfs_file_dio_aio_read(iocb, to); else - trace_xfs_file_buffered_read(ip, size, pos); + ret = xfs_file_buffered_aio_read(iocb, to); - ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -747,7 +778,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); -- cgit v0.10.2 From f1285ff0acf9040a39921355d07bd83a3308c402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:36:57 +1000 Subject: xfs: stop using generic_file_read_iter for direct I/O XFS already implement it's own flushing of the pagecache because it implements proper synchronization for direct I/O reads. This means calling generic_file_read_iter for direct I/O is rather useless, as it doesn't do much but updating the atime and iocb position for us. This also gets rid of the buffered I/O fallback that isn't used for XFS. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index fdb123f..440bb8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -289,12 +289,17 @@ xfs_file_dio_aio_read( struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); + loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); + if (!count) + return 0; /* skip atime */ + if (XFS_IS_REALTIME_INODE(ip)) target = ip->i_mount->m_rtdev_targp; else @@ -303,7 +308,7 @@ xfs_file_dio_aio_read( if (!IS_DAX(inode)) { /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == i_size_read(inode)) + if (iocb->ki_pos == isize) return 0; return -EINVAL; } @@ -354,9 +359,15 @@ xfs_file_dio_aio_read( xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - ret = generic_file_read_iter(iocb, to); + data = *to; + ret = mapping->a_ops->direct_IO(iocb, &data); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + file_accessed(iocb->ki_filp); return ret; } -- cgit v0.10.2 From fa8d972d055c723cc427e14d4d7919640f418730 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:38:01 +1000 Subject: xfs: direct calls in the direct I/O path We control both the callers and callees of ->direct_IO, so remove the indirect calls. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b9..3ba0809 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1336,7 +1336,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1407,24 +1407,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); + /* + * We just need the method present so that open/fcntl allow direct I/O. + */ + return -EINVAL; } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 814aab7..bf2d9a1 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 440bb8b..dd5185d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -360,7 +360,13 @@ xfs_file_dio_aio_read( } data = *to; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + NULL, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + } if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -819,7 +825,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + } else { + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); + } /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { -- cgit v0.10.2 From 16d4d43595b4780daac8fcea6d042689124cb094 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:38:55 +1000 Subject: xfs: split direct I/O and DAX path So far the DAX code overloaded the direct I/O code path. There is very little in common between the two, and untangling them allows to clean up both variants. As a side effect we also get separate trace points for both I/O types. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index dd5185d..d97e8cb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -305,13 +305,11 @@ xfs_file_dio_aio_read( else target = ip->i_mount->m_ddev_targp; - if (!IS_DAX(inode)) { - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; } /* @@ -360,13 +358,37 @@ xfs_file_dio_aio_read( } data = *to; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - NULL, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); if (ret > 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); @@ -399,7 +421,8 @@ xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; ssize_t ret = 0; XFS_STATS_INC(mp, xs_read_calls); @@ -407,7 +430,9 @@ xfs_file_read_iter( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (iocb->ki_flags & IOCB_DIRECT) + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_read(iocb, to); else ret = xfs_file_buffered_aio_read(iocb, to); @@ -755,8 +780,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -825,14 +849,9 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - } else { - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - } + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -849,10 +868,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. + */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. */ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -934,7 +1013,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a1bc5c6..c287691 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1164,8 +1164,10 @@ DEFINE_EVENT(xfs_file_class, name, \ TP_ARGS(ip, count, offset)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, -- cgit v0.10.2 From 8353a649f577a5d775f4666a31b286b8a5156dfb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:47:21 +1000 Subject: xfs: kill xfs_dir2_sf_off_t Just use an array of two unsigned chars directly to avoid problems with architectures that pad the size of structures. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 8d4d8bc..a5f4d6e 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t; typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* * Offset in data space of a data entry. */ typedef __uint32_t xfs_dir2_dataptr_t; @@ -251,7 +245,7 @@ typedef struct xfs_dir2_sf_hdr { typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 offset[2]; /* saved offset */ __u8 name[]; /* name, variable size */ /* * A single byte containing the file type field follows the inode @@ -260,7 +254,7 @@ typedef struct xfs_dir2_sf_entry { * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a * variable offset after the name. */ -} __arch_pack xfs_dir2_sf_entry_t; +} xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -272,13 +266,13 @@ static inline int xfs_dir2_sf_hdr_size(int i8count) static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { - return get_unaligned_be16(&sfep->offset.i); + return get_unaligned_be16(sfep->offset); } static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { - put_unaligned_be16(off, &sfep->offset.i); + put_unaligned_be16(off, sfep->offset); } static inline struct xfs_dir2_sf_entry * diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e5bb9cc..f8ccfd5a 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -126,11 +126,10 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = xfs_dir2_sf_hdr_size(i8count) + /* header */ - count + /* namelen */ - count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ - namelen + /* name */ - (i8count ? /* inumber */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count * 3 * sizeof(u8) + /* namelen + offset */ + namelen + /* name */ + (i8count ? /* inumber */ (uint)sizeof(xfs_dir2_ino8_t) * count : (uint)sizeof(xfs_dir2_ino4_t) * count); if (size > XFS_IFORK_DSIZE(dp)) @@ -1048,7 +1047,7 @@ xfs_dir2_sf_toino4( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); @@ -1124,7 +1123,7 @@ xfs_dir2_sf_toino8( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44e..918f13c 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -95,7 +95,6 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); -- cgit v0.10.2 From 266b6969c3dfd3c81d8601754c8b0e25bb52615b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:48:31 +1000 Subject: xfs: kill xfs_dir2_inou_t And use an array of unsigned char values directly to avoid problems with architectures that pad the size of structures. This also gets rid of the xfs_dir2_ino4_t and xfs_dir2_ino8_t types, and introduces new constants for the size of 4 and 8 bytes as well as the size difference between the two. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 9d624a6..f1e8d4d 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -40,8 +40,7 @@ xfs_dir2_sf_entsize( int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ return count; } @@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) + __uint8_t *from) { if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + return get_unaligned_be64(from) & 0x00ffffffffffffffULL; else - return get_unaligned_be32(&from->i4.i); + return get_unaligned_be32(from); } static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, + __uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); + put_unaligned_be64(ino, to); else - put_unaligned_be32(ino, &to->i4.i); + put_unaligned_be32(ino, to); } static xfs_ino_t xfs_dir2_sf_get_parent_ino( struct xfs_dir2_sf_hdr *hdr) { - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); + return xfs_dir2_sf_get_ino(hdr, hdr->parent); } static void @@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); + xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); } /* @@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); } static void @@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); } static xfs_ino_t @@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); } static void @@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); } diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index a5f4d6e..f877bb1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -208,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t; */ typedef __uint32_t xfs_dir2_db_t; -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; +#define XFS_INO32_SIZE 4 +#define XFS_INO64_SIZE 8 +#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE) -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; #define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) /* @@ -240,7 +228,7 @@ typedef union { typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ + __uint8_t parent[8]; /* parent dir inode number */ } __arch_pack xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { @@ -251,16 +239,15 @@ typedef struct xfs_dir2_sf_entry { * A single byte containing the file type field follows the inode * number for version 3 directory entries. * - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. + * A 64-bit or 32-bit inode number follows here, at a variable offset + * after the name. */ } xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); + (i8count == 0) * XFS_INO64_DIFF; } static inline xfs_dir2_data_aoff_t diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index f8ccfd5a..c6809ff 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -130,8 +130,8 @@ xfs_dir2_block_sfsize( count * 3 * sizeof(u8) + /* namelen + offset */ namelen + /* name */ (i8count ? /* inumber */ - (uint)sizeof(xfs_dir2_ino8_t) * count : - (uint)sizeof(xfs_dir2_ino4_t) * count); + count * XFS_INO64_SIZE : + count * XFS_INO32_SIZE); if (size > XFS_IFORK_DSIZE(dp)) return size; /* size value is a failure */ } @@ -318,10 +318,7 @@ xfs_dir2_sf_addname( /* * Yes, adjust the inode size. old count + (parent + new) */ - incr_isize += - (sfp->count + 2) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + incr_isize += (sfp->count + 2) * XFS_INO64_DIFF; objchange = 1; } @@ -896,11 +893,7 @@ xfs_dir2_sf_replace( int error; /* error return value */ int newsize; /* new inode size */ - newsize = - dp->i_df.if_bytes + - (sfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ @@ -1021,10 +1014,7 @@ xfs_dir2_sf_toino4( /* * Compute the new inode size. */ - newsize = - oldsize - - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1097,10 +1087,7 @@ xfs_dir2_sf_toino8( /* * Compute the new inode size (nb: entry count + 1 for parent) */ - newsize = - oldsize + - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 918f13c..19cc8b8 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -86,9 +86,6 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); -- cgit v0.10.2 From aa2dd0ad4d6d7dd85bb13ed64b872803be046f96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jul 2016 11:48:46 +1000 Subject: xfs: remove __arch_pack Instead we always declare struct xfs_dir2_sf_hdr as packed. That's the expected layout, and while most major architectures do the packing by default the new structure size and offset checker showed that not only the ARM old ABI got this wrong, but various minor embedded architectures did as well. [Verified that no code change on x86-64 results from this change] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f877bb1..685f23b 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -229,7 +229,7 @@ typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ __uint8_t parent[8]; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; +} __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index a8192dc..b8d64d5 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } -/* ARM old ABI has some weird alignment/padding */ -#if defined(__arm__) && !defined(__ARM_EABI__) -#define __arch_pack __attribute__((packed)) -#else -#define __arch_pack -#endif - #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -- cgit v0.10.2 From 99579ccec4e271c3d4d4e7c946058766812afdab Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Jul 2016 09:50:38 +1000 Subject: xfs: skip dirty pages in ->releasepage() XFS has had scattered reports of delalloc blocks present at ->releasepage() time. This results in a warning with a stack trace similar to the following: ... Call Trace: [] dump_stack+0x63/0x84 [] warn_slowpath_common+0x97/0xe0 [] warn_slowpath_null+0x1a/0x20 [] xfs_vm_releasepage+0x10f/0x140 [] ? page_mkclean_one+0xd0/0xd0 [] ? anon_vma_prepare+0x150/0x150 [] try_to_release_page+0x32/0x50 [] shrink_active_list+0x3ce/0x3e0 [] shrink_lruvec+0x687/0x7d0 [] shrink_zone+0xdc/0x2c0 [] kswapd+0x4f9/0x970 [] ? mem_cgroup_shrink_node_zone+0x1a0/0x1a0 [] kthread+0xc9/0xe0 [] ? kthread_stop+0x100/0x100 [] ret_from_fork+0x3f/0x70 [] ? kthread_stop+0x100/0x100 This occurs because it is possible for shrink_active_list() to send pages marked dirty to ->releasepage() when certain buffer_head threshold conditions are met. shrink_active_list() doesn't check the page dirty state apparently to handle an old ext3 corner case where in some cases clean pages would not have the dirty bit cleared, thus it is up to the filesystem to determine how to handle the page. XFS currently handles the delalloc case properly, but this behavior makes the warning spurious. Update the XFS ->releasepage() handler to explicitly skip dirty pages. Retain the existing delalloc/unwritten checks so we continue to warn if such buffers exist on clean pages when they shouldn't. Diagnosed-by: Dave Chinner Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3ba0809..6135787 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1040,6 +1040,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) -- cgit v0.10.2 From f021bd071f06b545926b1031348873b05442139f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Jul 2016 09:50:55 +1000 Subject: xfs: remove dax code from object file when disabled We check IS_DAX(inode) before calling either xfs_file_dax_read or xfs_file_dax_write, and this will lead the call being optimized out at compile time when CONFIG_FS_DAX is disabled. However, the two functions are marked STATIC, so they become global symbols when CONFIG_XFS_DEBUG is set, leaving us with two unused global functions that call into an undefined function and a broken "allmodconfig" build: fs/built-in.o: In function `xfs_file_dax_read': fs/xfs/xfs_file.c:348: undefined reference to `dax_do_io' fs/built-in.o: In function `xfs_file_dax_write': fs/xfs/xfs_file.c:758: undefined reference to `dax_do_io' Marking the two functions 'static noinline' instead of 'STATIC' will let the compiler drop the symbols when there are no callers but avoid the implicit inlining. Signed-off-by: Arnd Bergmann Fixes: 16d4d43595b4 ("xfs: split direct I/O and DAX path") Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d97e8cb..49fc9ac 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -370,7 +370,7 @@ xfs_file_dio_aio_read( return ret; } -STATIC ssize_t +static noinline ssize_t xfs_file_dax_read( struct kiocb *iocb, struct iov_iter *to) @@ -875,7 +875,7 @@ out: return ret; } -STATIC ssize_t +static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, struct iov_iter *from) -- cgit v0.10.2 From 160ae76fa1a2ee2345cb66eb343e24a34d2f051d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jul 2016 09:51:05 +1000 Subject: libxfs: directory node splitting does not have an extra block xfsprogs source commit 4280e59dcbc4cd8e01585efe788a68eb378048e8 xfs_da3_split() has to handle all three versions of the directory/attribute btree structure. The attr tree is v1, the dir tre is v2 or v3. The main difference between the v1 and v2/3 trees is the way tree nodes are split - in the v1 tree we can require a double split to occur because the object to be inserted may be larger than the space made by splitting a leaf. In this case we need to do a double split - one to split the full leaf, then another to allocate an empty leaf block in the correct location for the new entry. This does not happen with dir (v2/v3) formats as the objects being inserted are always guaranteed to fit into the new space in the split blocks. Indeed, for directories they *may* be an extra block on this buffer pointer. However, it's guaranteed not to be a leaf block (i.e. a directory data block) - the directory code only ever places hash index or free space blocks in this pointer (as a cursor of sorts), and so to use it as a directory data block will immediately corrupt the directory. The problem is that the code assumes that there may be extra blocks that we need to link into the tree once we've split the root, but this is not true for either dir or attr trees, because the extra attr block is always consumed by the last node split before we split the root. Hence the linking in an extra block is always wrong at the root split level, and this manifests itself in repair as a directory corruption in a repaired directory, leaving the directory rebuild incomplete. This is a dir v2 zero-day bug - it was in the initial dir v2 commit that was made back in February 1998. Fix this by ensuring the linking of the blocks after the root split never tries to make use of the extra blocks that may be held in the cursor. They are held there for other purposes and should never be touched by the root splitting code. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 097bf77..0f1f165 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -356,7 +356,6 @@ xfs_da3_split( struct xfs_da_state_blk *newblk; struct xfs_da_state_blk *addblk; struct xfs_da_intnode *node; - struct xfs_buf *bp; int max; int action = 0; int error; @@ -397,7 +396,9 @@ xfs_da3_split( break; } /* - * Entry wouldn't fit, split the leaf again. + * Entry wouldn't fit, split the leaf again. The new + * extrablk will be consumed by xfs_da3_node_split if + * the node is split. */ state->extravalid = 1; if (state->inleaf) { @@ -446,6 +447,14 @@ xfs_da3_split( return 0; /* + * xfs_da3_node_split() should have consumed any extra blocks we added + * during a double leaf split in the attr fork. This is guaranteed as + * we can't be here if the attr fork only has a single leaf block. + */ + ASSERT(state->extravalid == 0 || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + /* * Split the root node. */ ASSERT(state->path.active == 0); @@ -457,43 +466,33 @@ xfs_da3_split( } /* - * Update pointers to the node which used to be block 0 and - * just got bumped because of the addition of a new root node. - * There might be three blocks involved if a double split occurred, - * and the original block 0 could be at any position in the list. + * Update pointers to the node which used to be block 0 and just got + * bumped because of the addition of a new root node. Note that the + * original block 0 could be at any position in the list of blocks in + * the tree. * - * Note: the magic numbers and sibling pointers are in the same - * physical place for both v2 and v3 headers (by design). Hence it - * doesn't matter which version of the xfs_da_intnode structure we use - * here as the result will be the same using either structure. + * Note: the magic numbers and sibling pointers are in the same physical + * place for both v2 and v3 headers (by design). Hence it doesn't matter + * which version of the xfs_da_intnode structure we use here as the + * result will be the same using either structure. */ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } addblk->bp = NULL; return 0; -- cgit v0.10.2 From b1c5ebb21301fcc47392ba3dfc7214f6c2b54032 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jul 2016 09:52:35 +1000 Subject: xfs: allocate log vector buffers outside CIL context lock One of the problems we currently have with delayed logging is that under serious memory pressure we can deadlock memory reclaim. THis occurs when memory reclaim (such as run by kswapd) is reclaiming XFS inodes and issues a log force to unpin inodes that are dirty in the CIL. The CIL is pushed, but this will only occur once it gets the CIL context lock to ensure that all committing transactions are complete and no new transactions start being committed to the CIL while the push switches to a new context. The deadlock occurs when the CIL context lock is held by a committing process that is doing memory allocation for log vector buffers, and that allocation is then blocked on memory reclaim making progress. Memory reclaim, however, is blocked waiting for a log force to make progress, and so we effectively deadlock at this point. To solve this problem, we have to move the CIL log vector buffer allocation outside of the context lock so that memory reclaim can always make progress when it needs to force the log. The problem with doing this is that a CIL push can take place while we are determining if we need to allocate a new log vector buffer for an item and hence the current log vector may go away without warning. That means we canot rely on the existing log vector being present when we finally grab the context lock and so we must have a replacement buffer ready to go at all times. To ensure this, introduce a "shadow log vector" buffer that is always guaranteed to be present when we gain the CIL context lock and format the item. This shadow buffer may or may not be used during the formatting, but if the log item does not have an existing log vector buffer or that buffer is too small for the new modifications, we swap it for the new shadow buffer and format the modifications into that new log vector buffer. The result of this is that for any object we modify more than once in a given CIL checkpoint, we double the memory required to track dirty regions in the log. For single modifications then we consume the shadow log vectorwe allocate on commit, and that gets consumed by the checkpoint. However, if we make multiple modifications, then the second transaction commit will allocate a shadow log vector and hence we will end up with double the memory usage as only one of the log vectors is consumed by the CIL checkpoint. The remaining shadow vector will be freed when th elog item is freed. This can probably be optimised in future - access to the shadow log vector is serialised by the object lock (as opposited to the active log vector, which is controlled by the CIL context lock) and so we can probably free shadow log vector from some objects when the log item is marked clean on removal from the AIL. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3425799..4561b1e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -949,6 +949,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e064665..ccb0811 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -74,6 +74,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff9..2c7a162 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa0153..ab77946 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,6 +40,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a1b0761..892c2ac 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -651,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5e54e79..a4ab192 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -78,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t. the CIL committing and removing the log + * vector buffers between the check and the formatting of the item into the + * log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * it's size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unecessary data. We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -100,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -133,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simple swap it out for the shadow lv. We don't free it - that is + * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a @@ -170,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -236,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! */ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -783,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9a462e8..9b2b9fa 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -52,6 +52,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; -- cgit v0.10.2 From 28b783e47ad702b8e0f4861ef94cdfce6abd7c80 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jul 2016 09:56:38 +1000 Subject: xfs: bufferhead chains are invalid after end_page_writeback In xfs_finish_page_writeback(), we have a loop that looks like this: do { if (off < bvec->bv_offset) goto next_bh; if (off > end) break; bh->b_end_io(bh, !error); next_bh: off += bh->b_size; } while ((bh = bh->b_this_page) != head); The b_end_io function is end_buffer_async_write(), which will call end_page_writeback() once all the buffers have marked as no longer under IO. This issue here is that the only thing currently protecting both the bufferhead chain and the page from being reclaimed is the PageWriteback state held on the page. While we attempt to limit the loop to just the buffers covered by the IO, we still read from the buffer size and follow the next pointer in the bufferhead chain. There is no guarantee that either of these are valid after the PageWriteback flag has been cleared. Hence, loops like this are completely unsafe, and result in use-after-free issues. One such problem was caught by Calvin Owens with KASAN: ..... INFO: Freed in 0x103fc80ec age=18446651500051355200 cpu=2165122683 pid=-1 free_buffer_head+0x41/0x90 __slab_free+0x1ed/0x340 kmem_cache_free+0x270/0x300 free_buffer_head+0x41/0x90 try_to_free_buffers+0x171/0x240 xfs_vm_releasepage+0xcb/0x3b0 try_to_release_page+0x106/0x190 shrink_page_list+0x118e/0x1a10 shrink_inactive_list+0x42c/0xdf0 shrink_zone_memcg+0xa09/0xfa0 shrink_zone+0x2c3/0xbc0 ..... Call Trace: [] dump_stack+0x68/0x94 [] print_trailer+0x115/0x1a0 [] object_err+0x34/0x40 [] kasan_report_error+0x217/0x530 [] __asan_report_load8_noabort+0x43/0x50 [] xfs_destroy_ioend+0x3bf/0x4c0 [] xfs_end_bio+0x154/0x220 [] bio_endio+0x158/0x1b0 [] blk_update_request+0x18b/0xb80 [] scsi_end_request+0x97/0x5a0 [] scsi_io_completion+0x438/0x1690 [] scsi_finish_command+0x375/0x4e0 [] scsi_softirq_done+0x280/0x340 Where the access is occuring during IO completion after the buffer had been freed from direct memory reclaim. Prevent use-after-free accidents in this end_io processing loop by pre-calculating the loop conditionals before calling bh->b_end_io(). The loop is already limited to just the bufferheads covered by the IO in progress, so the offset checks are sufficient to prevent accessing buffers in the chain after end_page_writeback() has been called by the the bh->b_end_io() callout. Yet another example of why Bufferheads Must Die. cc: # 4.7 Signed-off-by: Dave Chinner Reported-and-Tested-by: Calvin Owens Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6135787..f1c7f8c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -87,6 +87,12 @@ xfs_find_bdev_for_inode( * We're now finished for good with this page. Update the page state via the * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. + * + * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last + * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or + * the page at all, as we may be racing with memory reclaim and it can free both + * the bufferhead chain and the page as it will see the page as clean and + * unused. */ static void xfs_finish_page_writeback( @@ -95,8 +101,9 @@ xfs_finish_page_writeback( int error) { unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh; + struct buffer_head *head, *bh, *next; unsigned int off = 0; + unsigned int bsize; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); @@ -105,15 +112,17 @@ xfs_finish_page_writeback( bh = head = page_buffers(bvec->bv_page); + bsize = bh->b_size; do { + next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; if (off > end) break; bh->b_end_io(bh, !error); next_bh: - off += bh->b_size; - } while ((bh = bh->b_this_page) != head); + off += bsize; + } while ((bh = next) != head); } /* -- cgit v0.10.2 From 72ccbbe154fc307c98153725822be515fc0326d3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 22 Jul 2016 14:10:18 +1000 Subject: xfs: remove EXPERIMENTAL tag from sparse inode feature Been around for long enough now, hasn't caused any regression test failures in the past 3 months, so it's time to make it a fully supported feature. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 11ea5d5..1fa3f8f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1565,10 +1565,6 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; -- cgit v0.10.2