From 5e4b5386a2c29429add601c8cfb45bb10d80c490 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:50:12 +1000 Subject: xfs: disentagle EFI release from the extent count Release of the EFI either occurs based on the reference count or the extent count. The extent count used is either the count tracked in the EFI or EFD, depending on the particular situation. In either case, the count is initialized to the final value and thus always matches the current efi_next_extent value once the EFI is completely constructed. For example, the EFI extent count is increased as the extents are logged in xfs_bmap_finish() and the full free list is always completely processed. Therefore, the count is guaranteed to be complete once the EFI transaction is committed. The EFD uses the efd_nextents counter to release the EFI. This counter is initialized to the count of the EFI when the EFD is created. Thus the EFD, as currently used, has no concept of partial EFI release based on extent count. Given that the EFI extent count is always released in whole, use of the extent count for reference counting is unnecessary. Remove this level of the API and release the EFI based on the core reference count. The efi_next_extent counter remains because it is still used to track the slot to log the next extent to free. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index adc8f8f..6ff738f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -149,7 +149,7 @@ xfs_efi_item_unpin( xfs_efi_item_free(efip); return; } - __xfs_efi_release(efip); + xfs_efi_release(efip); } /* @@ -307,18 +307,15 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) * by this efi item we can free the efi item. */ void -xfs_efi_release(xfs_efi_log_item_t *efip, - uint nextents) +xfs_efi_release( + struct xfs_efi_log_item *efip) { - ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); - + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ - } + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) @@ -442,7 +439,7 @@ xfs_efd_item_committed( * EFI got unpinned and freed before the EFD got aborted. */ if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ffbce3..399562e 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -77,5 +77,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); +void xfs_efi_release(struct xfs_efi_log_item *); #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 01dd228..578bc2d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3739,7 +3739,7 @@ xlog_recover_process_efi( * free the memory associated with it. */ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip, efip->efi_format.efi_nextents); + xfs_efi_release(efip); return -EIO; } } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 3b21b4e..f48e839 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -213,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); -void xfs_efi_release(struct xfs_efi_log_item *, uint); void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efi_log_item *, xfs_fsblock_t, -- cgit v0.10.2 From d43ac29be7a174f93a3d26cc1e68668fe86b782f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:50:13 +1000 Subject: xfs: return committed status from xfs_trans_roll() Some callers need to make error handling decisions based on whether the current transaction successfully committed or not. Rename xfs_trans_roll(), add a new parameter and provide a wrapper to preserve existing callers. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0582a27..a0ab1da 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1019,9 +1019,10 @@ xfs_trans_cancel( * chunk we've been working on and get a new transaction to continue. */ int -xfs_trans_roll( +__xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_inode *dp, + int *committed) { struct xfs_trans *trans; struct xfs_trans_res tres; @@ -1052,6 +1053,7 @@ xfs_trans_roll( if (error) return error; + *committed = 1; trans = *tpp; /* @@ -1074,3 +1076,12 @@ xfs_trans_roll( xfs_trans_ijoin(trans, dp, 0); return 0; } + +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) +{ + int committed = 0; + return __xfs_trans_roll(tpp, dp, &committed); +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f48e839..ba1660b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -225,6 +225,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, xfs_fsblock_t, xfs_extlen_t); int xfs_trans_commit(struct xfs_trans *); +int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); -- cgit v0.10.2 From 8d99fe92fed019e203f458370129fb28b3fb5740 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:51:16 +1000 Subject: xfs: fix efi/efd error handling to avoid fs shutdown hangs Freeing an extent in XFS involves logging an EFI (extent free intention), freeing the actual extent, and logging an EFD (extent free done). The EFI object is created with a reference count of 2: one for the current transaction and one for the subsequently created EFD. Under normal circumstances, the first reference is dropped when the EFI is unpinned and the second reference is dropped when the EFD is committed to the on-disk log. In event of errors or filesystem shutdown, there are various potential cleanup scenarios depending on the state of the EFI/EFD. The cleanup scenarios are confusing and racy, as demonstrated by the following test sequence: # mount $dev $mnt # fsstress -d $mnt -n 99999 -p 16 -z -f fallocate=1 \ -f punch=1 -f creat=1 -f unlink=1 & # sleep 5 # killall -9 fsstress; wait # godown -f $mnt # umount ... in which the final umount can hang due to the AIL being pinned indefinitely by one or more EFI items. This can occur due to several conditions. For example, if the shutdown occurs after the EFI is committed to the on-disk log and the EFD committed to the CIL, but before the EFD committed to the log, the EFD iop_committed() abort handler does not drop its reference to the EFI. Alternatively, manual error injection in the xfs_bmap_finish() codepath shows that if an error occurs after the EFI transaction is committed but before the EFD is constructed and logged, the EFI is never released from the AIL. Update the EFI/EFD item handling code to use a more straightforward and reliable approach to error handling. If an error occurs after the EFI transaction is committed and before the EFD is constructed, release the EFI explicitly from xfs_bmap_finish(). If the EFI transaction is cancelled, release the EFI in the unlock handler. Once the EFD is constructed, it is responsible for releasing the EFI under any circumstances (including whether the EFI item aborts due to log I/O error). Update the EFD item handlers to release the EFI if the transaction is cancelled or aborts due to log I/O error. Finally, update xfs_bmap_finish() to log at least one EFD extent to the transaction before xfs_free_extent() errors are handled to ensure the transaction is dirty and EFD item error handling is triggered. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0f34886c..fa65f67 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -67,16 +67,15 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) */ int /* error */ xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ + struct xfs_trans **tp, /* transaction pointer addr */ + struct xfs_bmap_free *flist, /* i/o: list extents to free */ + int *committed)/* xact committed or not */ { - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ + struct xfs_efd_log_item *efd; /* extent free data */ + struct xfs_efi_log_item *efi; /* extent free intention */ + int error; /* error return value */ + struct xfs_bmap_free_item *free; /* free extent item */ + struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { @@ -88,40 +87,55 @@ xfs_bmap_finish( xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = xfs_trans_roll(tp, NULL); - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) + error = __xfs_trans_roll(tp, NULL, committed); + if (error) { + /* + * If the transaction was committed, drop the EFD reference + * since we're bailing out of here. The other reference is + * dropped when the EFI hits the AIL. + * + * If the transaction was not committed, the EFI is freed by the + * EFI item unlock handler on abort. Also, we have a new + * transaction so we should return committed=1 even though we're + * returning an error. + */ + if (*committed) { + xfs_efi_release(efi); + xfs_force_shutdown((*tp)->t_mountp, + (error == -EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + } else { + *committed = 1; + } + return error; + } efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(*tp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = (*tp)->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } + + /* + * Free the extent and log the EFD to dirty the transaction + * before handling errors. This ensures that the transaction is + * aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + * + * The bmap free list is cleaned up at a higher level. + */ + error = xfs_free_extent(*tp, free->xbfi_startblock, + free->xbfi_blockcount); xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + free->xbfi_blockcount); + if (error) + return error; + xfs_bmap_del_free(flist, NULL, free); } + return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6ff738f..475b9f0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -61,9 +61,15 @@ __xfs_efi_release( if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); + /* + * We don't know whether the EFI made it to the AIL. Remove it + * if so. Note that xfs_trans_ail_delete() drops the AIL lock. + */ + if (efip->efi_item.li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + else + spin_unlock(&ailp->xa_lock); xfs_efi_item_free(efip); } } @@ -128,12 +134,12 @@ xfs_efi_item_pin( } /* - * While EFIs cannot really be pinned, the unpin operation is the last place at - * which the EFI is manipulated during a transaction. If we are being asked to - * remove the EFI it's because the transaction has been cancelled and by - * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() - * to determine who gets to free the EFI. + * The unpin operation is the last place an EFI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the EFI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the EFI to either construct + * and commit the EFD or drop the EFD's reference in the event of error. Simply + * drop the log's EFI reference now that the log is done with it. */ STATIC void xfs_efi_item_unpin( @@ -141,14 +147,6 @@ xfs_efi_item_unpin( int remove) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - if (remove) { - ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); - if (lip->li_desc) - xfs_trans_del_item(lip); - xfs_efi_item_free(efip); - return; - } xfs_efi_release(efip); } @@ -167,6 +165,11 @@ xfs_efi_item_push( return XFS_ITEM_PINNED; } +/* + * The EFI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an EFD isn't going to be + * constructed and thus we free the EFI here directly. + */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -412,20 +415,27 @@ xfs_efd_item_push( return XFS_ITEM_PINNED; } +/* + * The EFD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the EFI and free the EFD. + */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(EFD_ITEM(lip)); + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_efi_release(efdp->efd_efip); + xfs_efd_item_free(efdp); + } } /* - * When the efd item is committed to disk, all we need to do - * is delete our reference to our partner efi item and then - * free ourselves. Since we're freeing ourselves we must - * return -1 to keep the transaction code from further referencing - * this item. + * When the efd item is committed to disk, all we need to do is delete our + * reference to our partner efi item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from further + * referencing this item. */ STATIC xfs_lsn_t xfs_efd_item_committed( @@ -435,13 +445,14 @@ xfs_efd_item_committed( struct xfs_efd_log_item *efdp = EFD_ITEM(lip); /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. + * Drop the EFI reference regardless of whether the EFD has been + * aborted. Once the EFD transaction is constructed, it is the sole + * responsibility of the EFD to release the EFI (even if the EFI is + * aborted due to log I/O error). */ - if (!(lip->li_flags & XFS_LI_ABORTED)) - xfs_efi_release(efdp->efd_efip); - + xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 399562e..8fa8651 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -39,9 +39,28 @@ struct kmem_zone; * "extent free done" log item described below. * * The EFI is reference counted so that it is not freed prior to both the EFI - * and EFD being committed and unpinned. This ensures that when the last - * reference goes away the EFI will always be in the AIL as it has been - * unpinned, regardless of whether the EFD is processed before or after the EFI. + * and EFD being committed and unpinned. This ensures the EFI is inserted into + * the AIL even in the event of out of order EFI/EFD processing. In other words, + * an EFI is born with two references: + * + * 1.) an EFI held reference to track EFI AIL insertion + * 2.) an EFD held reference to track EFD commit + * + * On allocation, both references are the responsibility of the caller. Once the + * EFI is added to and dirtied in a transaction, ownership of reference one + * transfers to the transaction. The reference is dropped once the EFI is + * inserted to the AIL or in the event of failure along the way (e.g., commit + * failure, log I/O error, etc.). Note that the caller remains responsible for + * the EFD reference under all circumstances to this point. The caller has no + * means to detect failure once the transaction is committed, however. + * Therefore, an EFD is required after this point, even in the event of + * unrelated failure. + * + * Once an EFD is allocated and dirtied in a transaction, reference two + * transfers to the transaction. The EFD reference is dropped once it reaches + * the unpin handler. Similar to the EFI, the reference also drops in the event + * of commit failure or log I/O errors. Note that the EFD is not inserted in the + * AIL, so at this point both the EFI and EFD are freed. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; -- cgit v0.10.2 From 6bc43af3d5f507254b8de2058ea51f6ec998ae52 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:51:43 +1000 Subject: xfs: ensure EFD trans aborts on log recovery extent free failure Log recovery attempts to free extents with leftover EFIs in the AIL after initial processing. If the extent free fails (e.g., due to unrelated fs corruption), the transaction is cancelled, though it might not be dirtied at the time. If this is the case, the EFD does not abort and thus does not release the EFI. This can lead to hangs as the EFI pins the AIL. Update xlog_recover_process_efi() to log the EFD in the transaction before xfs_free_extent() errors are handled to ensure the transaction is dirty, aborts the EFD and releases the EFI on error. Since this is a requirement for EFD processing (and consistent with xfs_bmap_finish()), update the EFD logging helper to do the extent free and unconditionally log the EFD. This encodes the required EFD logging behavior into the helper and reduces the likelihood of errors down the road. [dchinner: re-add xfs_alloc.h to xfs_log_recover.c to fix build failure.] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fa65f67..73aab0d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -112,24 +112,17 @@ xfs_bmap_finish( return error; } + /* + * Get an EFD and free each extent in the list, logging to the EFD in + * the process. The remaining bmap free list is cleaned up by the caller + * on error. + */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - /* - * Free the extent and log the EFD to dirty the transaction - * before handling errors. This ensures that the transaction is - * aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - * - * The bmap free list is cleaned up at a higher level. - */ - error = xfs_free_extent(*tp, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); + error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, + free->xbfi_blockcount); if (error) return error; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 578bc2d..434ba2c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3752,11 +3752,11 @@ xlog_recover_process_efi( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len); if (error) goto abort_error; - xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, - extp->ext_len); + } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ba1660b..4643070 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -220,10 +220,9 @@ void xfs_trans_log_efi_extent(xfs_trans_t *, struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, struct xfs_efi_log_item *, uint); -void xfs_trans_log_efd_extent(xfs_trans_t *, - struct xfs_efd_log_item *, - xfs_fsblock_t, - xfs_extlen_t); +int xfs_trans_free_extent(struct xfs_trans *, + struct xfs_efd_log_item *, xfs_fsblock_t, + xfs_extlen_t); int xfs_trans_commit(struct xfs_trans *); int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 284397d..a96ae54 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -25,6 +25,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" +#include "xfs_alloc.h" /* * This routine is called to allocate an "extent free intention" @@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp, } /* - * This routine is called to indicate that the described - * extent is to be logged as having been freed. It should - * be called once for each extent freed. + * Free an extent and log it to the EFD. Note that the transaction is marked + * dirty regardless of whether the extent free succeeds or fails to support the + * EFI/EFD lifecycle rules. */ -void -xfs_trans_log_efd_extent(xfs_trans_t *tp, - xfs_efd_log_item_t *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len) +int +xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len) { uint next_extent; - xfs_extent_t *extp; + struct xfs_extent *extp; + int error; + error = xfs_free_extent(tp, start_block, ext_len); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + */ tp->t_flags |= XFS_TRANS_DIRTY; efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; @@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, extp->ext_start = start_block; extp->ext_len = ext_len; efdp->efd_next_extent++; + + return error; } -- cgit v0.10.2 From e32a1d1fbf6eb2bdc24aa0502e827ff4d2234604 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:52:21 +1000 Subject: xfs: use EFI refcount consistently in log recovery The EFI is initialized with a reference count of 2. One for the EFI to ensure the item makes it to the AIL and one for the subsequently created EFD to release the EFI once the EFD is committed. Log recovery uses the EFI in a similar manner, but implements a hack to remove both references in one call once the EFD is handled. Update log recovery to use EFI reference counting in a manner consistent with the log. When an EFI is encountered during recovery, an EFI item is allocated and inserted to the AIL directly. Since the EFI reference is typically dropped when the EFI is unpinned and this is analogous with AIL insertion, drop the EFI reference at this point. When a corresponding EFD is encountered in the log, this indicates that the extents were freed, no processing is required and the EFI can be dropped. Update xlog_recover_efd_pass2() to simply drop the EFD reference at this point rather than open code the AIL removal and EFI free. Remaining EFIs (i.e., with no corresponding EFD) are processed in xlog_recover_finish(). An EFD transaction is allocated and the extents are freed, which transfers ownership of the EFI reference to the EFD item in the log. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 475b9f0..ce1d4fb 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -47,34 +47,6 @@ xfs_efi_item_free( } /* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -STATIC void -__xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* - * We don't know whether the EFI made it to the AIL. Remove it - * if so. Note that xfs_trans_ail_delete() drops the AIL lock. - */ - if (efip->efi_item.li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - else - spin_unlock(&ailp->xa_lock); - xfs_efi_item_free(efip); - } -} - -/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -304,21 +276,32 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to the given - * efi item. Each efd calls this with the number of extents that it has - * logged, and when the sum of these reaches the total number of extents logged - * by this efi item we can free the efi item. + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ void xfs_efi_release( struct xfs_efi_log_item *efip) { - /* recovery needs us to drop the EFI reference, too */ - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - __xfs_efi_release(efip); + struct xfs_ail *ailp = efip->efi_item.li_ailp; - __xfs_efi_release(efip); - /* efip may now have been freed, do not reference it again. */ + if (atomic_dec_and_test(&efip->efi_refcount)) { + spin_lock(&ailp->xa_lock); + /* + * We don't know whether the EFI made it to the AIL. Remove it + * if so. Note that xfs_trans_ail_delete() drops the AIL lock. + */ + if (efip->efi_item.li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + else + spin_unlock(&ailp->xa_lock); + + xfs_efi_item_free(efip); + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 434ba2c..05c0cc8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2928,16 +2928,16 @@ xlog_recover_efi_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; - xfs_mount_t *mp = log->l_mp; - xfs_efi_log_item_t *efip; - xfs_efi_log_format_t *efi_formatp; + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; efi_formatp = item->ri_buf[0].i_addr; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), - &(efip->efi_format)))) { + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { xfs_efi_item_free(efip); return error; } @@ -2945,20 +2945,23 @@ xlog_recover_efi_pass2( spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_ail_update() drops the AIL lock. + * The EFI has two references. One for the EFD and one for EFI to ensure + * it makes it into the AIL. Insert the EFI into the AIL directly and + * drop the EFI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. */ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); return 0; } /* - * This routine is called when an efd format structure is found in - * a committed transaction in the log. It's purpose is to cancel - * the corresponding efi if it was still in the log. To do this - * it searches the AIL for the efi with an id equal to that in the - * efd format structure. If we find it, we remove the efi from the - * AIL and free it. + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. */ STATIC int xlog_recover_efd_pass2( @@ -2980,8 +2983,8 @@ xlog_recover_efd_pass2( efi_id = efd_formatp->efd_efi_id; /* - * Search for the efi with the id in the efd format structure - * in the AIL. + * Search for the EFI with the id in the EFD format structure in the + * AIL. */ spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); @@ -2990,18 +2993,18 @@ xlog_recover_efd_pass2( efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_ail_delete() drops the - * AIL lock. + * Drop the EFD reference to the EFI. This + * removes the EFI from the AIL and frees it. */ - xfs_trans_ail_delete(ailp, lip, - SHUTDOWN_CORRUPT_INCORE); - xfs_efi_item_free(efip); + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); spin_lock(&ailp->xa_lock); break; } } lip = xfs_trans_ail_cursor_next(ailp, &cur); } + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); -- cgit v0.10.2 From f0b2efad16e78623b5a156f6e4e9166907b83155 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:58:36 +1000 Subject: xfs: don't leave EFIs on AIL on mount failure Log recovery occurs in two phases at mount time. In the first phase, EFIs and EFDs are processed and potentially cancelled out. EFIs without EFD objects are inserted into the AIL for processing and recovery in the second phase. xfs_mountfs() runs various other operations between the phases and is thus subject to failure. If failure occurs after the first phase but before the second, pending EFIs sit on the AIL, pin it and cause the mount to hang. Update the mount sequence to ensure that pending EFIs are cancelled in the event of failure. Add a recovery cancellation mechanism to iterate the AIL and cancel all EFI items when requested. Plumb cancellation support through the log mount finish helper and update xfs_mountfs() to invoke cancellation in the event of failure after recovery has started. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08d4fe4..7ce278d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -700,6 +700,7 @@ xfs_log_mount( if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); + xlog_recover_cancel(mp->m_log); goto out_destroy_ail; } } @@ -740,18 +741,35 @@ out: * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp) +xfs_log_mount_finish( + struct xfs_mount *mp) { int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - error = xlog_recover_finish(mp->m_log); - if (!error) - xfs_log_work_queue(mp); - } else { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; } + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + + return error; +} + +/* + * The mount has failed. Cancel the recovery if it hasn't completed and destroy + * the log. + */ +int +xfs_log_mount_cancel( + struct xfs_mount *mp) +{ + int error; + + error = xlog_recover_cancel(mp->m_log); + xfs_log_unmount(mp); return error; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index fa27aae..09d91d3 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -147,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); +int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c87c8a..950f3f9 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,6 +426,8 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 05c0cc8..fd1ae47 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3791,10 +3791,10 @@ abort_error: */ STATIC int xlog_recover_process_efis( - struct xlog *log) + struct xlog *log) { - xfs_log_item_t *lip; - xfs_efi_log_item_t *efip; + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -3818,7 +3818,7 @@ xlog_recover_process_efis( /* * Skip EFIs that we've already processed. */ - efip = (xfs_efi_log_item_t *)lip; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; @@ -3838,6 +3838,50 @@ out: } /* + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending EFIs so they don't pin the AIL. + */ +STATIC int +xlog_recover_cancel_efis( + struct xlog *log) +{ + struct xfs_log_item *lip; + struct xfs_efi_log_item *efip; + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + /* + * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. + */ + if (lip->li_type != XFS_LI_EFI) { +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif + break; + } + + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; +} + +/* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. */ @@ -4610,6 +4654,17 @@ xlog_recover_finish( return 0; } +int +xlog_recover_cancel( + struct xlog *log) +{ + int error = 0; + + if (log->l_flags & XLOG_RECOVERY_NEEDED) + error = xlog_recover_cancel_efis(log); + + return error; +} #if defined(DEBUG) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 461e791..4825a8a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp) */ int xfs_mountfs( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - __uint64_t resblks; - uint quotamount = 0; - uint quotaflags = 0; - int error = 0; + struct xfs_sb *sbp = &(mp->m_sb); + struct xfs_inode *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; xfs_sb_mount_common(mp, sbp); @@ -799,7 +799,9 @@ xfs_mountfs( } /* - * log's mount-time initialization. Perform 1st part recovery if needed + * Log's mount-time initialization. The first part of recovery can place + * some items on the AIL, to be handled when recovery is finished or + * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), @@ -910,9 +912,9 @@ xfs_mountfs( } /* - * Finish recovering the file system. This part needed to be - * delayed until after the root and real-time bitmap inodes - * were consistently read in. + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently + * read in. */ error = xfs_log_mount_finish(mp); if (error) { @@ -956,7 +958,7 @@ xfs_mountfs( out_rele_rip: IRELE(rip); out_log_dealloc: - xfs_log_unmount(mp); + xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_wait_buftarg(mp->m_logdev_targp); -- cgit v0.10.2 From 78d57e4593bf700e1a4447e3a7769da8dd0e0844 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:58:48 +1000 Subject: xfs: icreate log item recovery and cancellation tracepoints Various log items have recovery tracepoints to identify whether a particular log item is recovered or cancelled. Add the equivalent tracepoints for the icreate transaction. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fd1ae47..0c6641b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3100,9 +3100,12 @@ xlog_recover_do_icreate_pass2( * done easily. */ if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) { + trace_xfs_log_recover_icreate_cancel(log, icl); return 0; + } + trace_xfs_log_recover_icreate_recover(log, icl); xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, be32_to_cpu(icl->icl_gen)); return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8d916d3..9aeeb21 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2089,6 +2089,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); +DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, count) + __field(unsigned int, isize) + __field(xfs_agblock_t, length) + __field(unsigned int, gen) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->agno = be32_to_cpu(in_f->icl_ag); + __entry->agbno = be32_to_cpu(in_f->icl_agbno); + __entry->count = be32_to_cpu(in_f->icl_count); + __entry->isize = be32_to_cpu(in_f->icl_isize); + __entry->length = be32_to_cpu(in_f->icl_length); + __entry->gen = be32_to_cpu(in_f->icl_gen); + ), + TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u " + "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agbno, __entry->count, __entry->isize, + __entry->length, __entry->gen) +) +#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); +DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); + DECLARE_EVENT_CLASS(xfs_discard_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), -- cgit v0.10.2 From fc0d1656964fc53fca84549df5a6bd4a16a29cdf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:59:38 +1000 Subject: xfs: fix broken icreate log item cancellation Inode cluster buffers are invalidated and cancelled when inode chunks are freed to notify log recovery that previous logged updates to the metadata buffer should be skipped. This ensures that log recovery does not overwrite buffers that might have already been reused. On v4 filesystems, inode chunk allocation and inode updates are logged via the cluster buffers and thus cancellation is easily detected via buffer cancellation items. v5 filesystems use the new icreate transaction, which uses logical logging and ordered buffers to log a full inode chunk allocation at once. The resulting icreate item often spans multiple inode cluster buffers. Log recovery checks for cancelled buffers when processing icreate log items, but it has a couple problems. First, it uses the full length of the inode chunk rather than the cluster size. Second, it uses the length in FSB units rather than BB units. Either of these problems prevent icreate recovery from identifying cancelled buffers and thus inode initialization proceeds unconditionally. Update xlog_recover_do_icreate_pass2() to iterate the icreate range in cluster sized increments and check each increment for cancellation. Since icreate is currently only used for the minimum atomic inode chunk allocation, we expect that either all or none of the buffers will be cancelled. Cancel the icreate if at least one buffer is cancelled to avoid making a bad situation worse by initializing a partial inode chunk, but detect such anomalies and warn the user. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0c6641b..2fa55e1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3032,6 +3032,11 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; + int blks_per_cluster; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { @@ -3090,25 +3095,45 @@ xlog_recover_do_icreate_pass2( } /* - * Inode buffers can be freed. Do not replay the inode initialisation as - * we could be overwriting something written after this inode buffer was - * cancelled. + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); + nbufs = length / blks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * blks_per_cluster); + if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. * - * XXX: we need to iterate all buffers and only init those that are not - * cancelled. I think that a more fine grained factoring of - * xfs_ialloc_inode_init may be appropriate here to enable this to be - * done easily. + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. */ - if (xlog_check_buffer_cancelled(log, - XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) { + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); trace_xfs_log_recover_icreate_cancel(log, icl); return 0; } trace_xfs_log_recover_icreate_recover(log, icl); - xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); - return 0; + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); } STATIC void -- cgit v0.10.2 From a3f20014659a1566a4e516e2bf95287960fe2c44 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 09:59:50 +1000 Subject: xfs: checksum log record ext headers based on record size The first 4 bytes of every basic block in the physical log is stamped with the current lsn. To support this mechanism, the log record header (first block of each new log record) contains space for the original first byte of each log record block before it is replaced with the lsn. The log record header has space for 32k worth of blocks. The version 2 log adds new extended record headers for each additional 32k worth of blocks beyond what is supported by the record header. The log record checksum incorporates the log record header, the extended headers and the record payload. xlog_cksum() checksums the extended headers based on log->l_iclog_heads, which specifies the number of extended headers in a log record based on the log buffer size mount option. The log buffer size is variable, however, and thus means the checksum can be calculated differently based on how a filesystem is mounted. This is problematic if a filesystem crashes and recovery occurs on a subsequent mount using a different log buffer size. For example, crash an active filesystem that is mounted with the default (32k) logbsize, attempt remount/recovery using '-o logbsize=64k' and the mount fails on or warns about log checksum failures. To avoid this problem, update xlog_cksum() to calculate the checksum based on the size of the log buffer according to the log record. The size is already included in the h_size field of the log record header and thus is available at log recovery time. Extended log record headers are also only written when the log record is large enough to require them. This makes checksum calculation of log records consistent with the extended record header mechanism as well as how on-disk records are checksummed with various log buffer size mount options. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7ce278d..a98daa6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1670,8 +1670,13 @@ xlog_cksum( if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; + int xheads; - for (i = 1; i < log->l_iclog_heads; i++) { + xheads = size / XLOG_HEADER_CYCLE_SIZE; + if (size % XLOG_HEADER_CYCLE_SIZE) + xheads++; + + for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, sizeof(struct xlog_rec_ext_header)); } -- cgit v0.10.2 From 0ae120f8a81a8dc4f974d0819c97b58c4fa935ac Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:00:28 +1000 Subject: xfs: clean up root inode properly on mount failure The root inode is read as part of the xfs_mountfs() sequence and the reference is dropped in the event of failure after we grab the inode. The reference drop doesn't necessarily free the inode, however. It marks it for reclaim and potentially kicks off the reclaim workqueue. The workqueue is destroyed further up the error path, which means we are subject to crash if the workqueue job runs after this point or a memory leak which is identified if the xfs_inode_zone is destroyed (e.g., on module removal). Both of these outcomes are reproducible via manual instrumentation of a mount error after the root inode xfs_iget() call in xfs_mountfs(). Update the xfs_mountfs() error path to cancel any potential reclaim work items and to run a synchronous inode reclaim if the root inode is marked for reclaim. This ensures that no jobs remain on the queue before it is destroyed and that the root inode is freed before the reclaim mechanism is torn down. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4825a8a..bf92e0c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -957,6 +957,8 @@ xfs_mountfs( xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: xfs_log_mount_cancel(mp); out_fail_wait: -- cgit v0.10.2 From f307080a626569f89bc8fbad9f936b307aded877 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:00:53 +1000 Subject: xfs: fix btree cursor error cleanups The btree cursor cleanup function takes an error parameter that affects how buffers are released from the cursor. All buffers are released in the event of error. Several callers do not specify the XFS_BTREE_ERROR flag in the event of error, however. This can cause buffers to hang around locked or with an elevated hold count and thus lead to umount hangs in the event of errors. Fix up the xfs_btree_del_cursor() callers to pass XFS_BTREE_ERROR if the cursor is being torn down due to error. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 66efc70..0b29918 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2232,7 +2232,7 @@ xfs_imap_lookup( } xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); if (error) return error; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f41b0c3..930ebd8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -473,7 +473,8 @@ xfs_bulkstat( * pending error, then we are done. */ del_cursor: - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, error ? + XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_buf_relse(agbp); if (error) break; -- cgit v0.10.2 From 146e54b71ea4b998d65c25964807ff6792bbf436 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:01:08 +1000 Subject: xfs: add helper to conditionally remove items from the AIL Several areas of code duplicate a pattern where we take the AIL lock, check whether an item is in the AIL and remove it if so. Create a new helper for this pattern and use it where appropriate. Signed-off-by: Brian Foster diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 092d652..919057e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -647,11 +647,7 @@ xfs_buf_item_unlock( xfs_buf_item_relse(bp); else if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&lip->li_ailp->xa_lock); - xfs_trans_ail_delete(lip->li_ailp, lip, - SHUTDOWN_LOG_IO_ERROR); - } + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); } } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 4143dc7..6964d7c 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -954,12 +954,8 @@ xfs_qm_dqflush( struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - spin_lock(&mp->m_ail->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(mp->m_ail, lip, - SHUTDOWN_CORRUPT_INCORE); - else - spin_unlock(&mp->m_ail->xa_lock); + xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); + error = -EIO; goto out_unlock; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index ce1d4fb..4aa0153 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -286,20 +286,8 @@ void xfs_efi_release( struct xfs_efi_log_item *efip) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (atomic_dec_and_test(&efip->efi_refcount)) { - spin_lock(&ailp->xa_lock); - /* - * We don't know whether the EFI made it to the AIL. Remove it - * if so. Note that xfs_trans_ail_delete() drops the AIL lock. - */ - if (efip->efi_item.li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, &efip->efi_item, - SHUTDOWN_LOG_IO_ERROR); - else - spin_unlock(&ailp->xa_lock); - + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bf13a5a..62bd80f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -703,17 +703,10 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &iip->ili_item, - stale ? - SHUTDOWN_LOG_IO_ERROR : + xfs_trans_ail_remove(&iip->ili_item, + stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); - } else - spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 1b73629..49931b7 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -119,6 +119,21 @@ xfs_trans_ail_delete( xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->xa_lock); +} + void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); void xfs_ail_push_all_sync(struct xfs_ail *); -- cgit v0.10.2 From d4a97a04227d5ba91b91888a016e2300861cfbc7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 19 Aug 2015 10:01:40 +1000 Subject: xfs: add missing bmap cancel calls in error paths If a failure occurs after the bmap free list is populated and before xfs_bmap_finish() completes successfully (which returns a partial list on failure), the bmap free list must be cancelled. Otherwise, the extent items on the list are never freed and a memory leak occurs. Several random error paths throughout the code suffer this problem. Fix these up such that xfs_bmap_cancel() is always called on error. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 63e05b6..8e2010d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5945,6 +5945,7 @@ xfs_bmap_split_extent( return xfs_trans_commit(tp); out: + xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73aab0d..3bf4ad0 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1474,7 +1474,7 @@ xfs_shift_file_space( XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -1488,18 +1488,20 @@ xfs_shift_file_space( &done, stop_fsb, &first_block, &free_list, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) - goto out; + goto out_bmap_cancel; error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto out; + goto out_bmap_cancel; error = xfs_trans_commit(tp); } return error; -out: +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3da9f4d..cee2f69 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1791,14 +1791,15 @@ xfs_inactive_ifree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); /* - * Just ignore errors at this point. There is nothing we can - * do except to try to keep going. Make sure it's not a silent - * error. + * Just ignore errors at this point. There is nothing we can do except + * to try to keep going. Make sure it's not a silent error. */ error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) + if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); + xfs_bmap_cancel(&free_list); + } error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f4e8c06..ab1bac6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -757,31 +757,30 @@ xfs_rtallocate_extent_size( /* * Allocate space to the bitmap or summary file, and zero it, for growfs. */ -STATIC int /* error */ +STATIC int xfs_growfs_rt_alloc( - xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_inode_t *ip) /* inode (bitmap/summary) */ + struct xfs_mount *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + struct xfs_inode *ip) /* inode (bitmap/summary) */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ + xfs_fileoff_t bno; /* block number in file */ + struct xfs_buf *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock;/* first block allocated in xaction */ + struct xfs_bmap_free flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + struct xfs_bmbt_irec map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + struct xfs_trans *tp; /* * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - xfs_trans_t *tp; - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* @@ -790,7 +789,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, resblks, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the inode. */ @@ -808,16 +807,16 @@ xfs_growfs_rt_alloc( if (!error && nmap < 1) error = -ENOSPC; if (error) - goto error_cancel; + goto out_bmap_cancel; /* * Free any blocks freed up in the transaction, then commit. */ error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); if (error) - goto error; + return error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. @@ -832,7 +831,7 @@ xfs_growfs_rt_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, 0, 0); if (error) - goto error_cancel; + goto out_trans_cancel; /* * Lock the bitmap inode. */ @@ -846,9 +845,7 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = -EIO; -error_cancel: - xfs_trans_cancel(tp); - goto error; + goto out_trans_cancel; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); @@ -857,16 +854,20 @@ error_cancel: */ error = xfs_trans_commit(tp); if (error) - goto error; + return error; } /* * Go on to the next extent, if any. */ oblocks = map.br_startoff + map.br_blockcount; } + return 0; -error: +out_bmap_cancel: + xfs_bmap_cancel(&flist); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } -- cgit v0.10.2