summaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/kmem.c18
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c71
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_btree.c33
-rw-r--r--fs/xfs/libxfs/xfs_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c59
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c9
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h27
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c14
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h16
-rw-r--r--fs/xfs/xfs_acl.c26
-rw-r--r--fs/xfs/xfs_acl.h1
-rw-r--r--fs/xfs/xfs_aops.c151
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c17
-rw-r--r--fs/xfs/xfs_bmap_util.c135
-rw-r--r--fs/xfs/xfs_bmap_util.h13
-rw-r--r--fs/xfs/xfs_buf.c62
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_buf_item.c156
-rw-r--r--fs/xfs/xfs_buf_item.h5
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_file.c41
-rw-r--r--fs/xfs/xfs_icache.c15
-rw-r--r--fs/xfs/xfs_inode.c43
-rw-r--r--fs/xfs/xfs_inode_item.c158
-rw-r--r--fs/xfs/xfs_ioctl.c44
-rw-r--r--fs/xfs/xfs_iomap.c7
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_iops.c6
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h9
-rw-r--r--fs/xfs/xfs_log.c44
-rw-r--r--fs/xfs/xfs_log_recover.c182
-rw-r--r--fs/xfs/xfs_mount.c12
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c31
-rw-r--r--fs/xfs/xfs_reflink.c13
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--fs/xfs/xfs_trace.h20
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_ail.c72
-rw-r--r--fs/xfs/xfs_trans_bmap.c11
-rw-r--r--fs/xfs/xfs_trans_buf.c98
-rw-r--r--fs/xfs/xfs_trans_priv.h46
54 files changed, 1169 insertions, 570 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 339c696..bb2beae 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -24,24 +24,6 @@
#include "kmem.h"
#include "xfs_message.h"
-/*
- * Greedy allocation. May fail and may return vmalloced memory.
- */
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
-{
- void *ptr;
- size_t kmsize = maxsize;
-
- while (!(ptr = vzalloc(kmsize))) {
- if ((kmsize >>= 1) <= minsize)
- kmsize = minsize;
- }
- if (ptr)
- *size = kmsize;
- return ptr;
-}
-
void *
kmem_alloc(size_t size, xfs_km_flags_t flags)
{
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 689f746..f0fc84f 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -69,8 +69,6 @@ static inline void kmem_free(const void *ptr)
}
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
-
static inline void *
kmem_zalloc(size_t size, xfs_km_flags_t flags)
{
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 33db69b..eed8f58 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -157,7 +157,8 @@ __xfs_ag_resv_free(
trace_xfs_ag_resv_free(pag, type, 0);
resv = xfs_perag_resv(pag, type);
- pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ if (pag->pag_agno == 0)
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
/*
* AGFL blocks are always considered "free", so whatever
* was reserved at mount time must be given back at umount.
@@ -217,7 +218,14 @@ __xfs_ag_resv_init(
return error;
}
- mp->m_ag_max_usable -= ask;
+ /*
+ * Reduce the maximum per-AG allocation length by however much we're
+ * trying to reserve for an AG. Since this is a filesystem-wide
+ * counter, we only make the adjustment for AG 0. This assumes that
+ * there aren't any AGs hungrier for per-AG reservation than AG 0.
+ */
+ if (pag->pag_agno == 0)
+ mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
resv->ar_asked = ask;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9f06a21..c3702cd 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1579,6 +1579,10 @@ xfs_alloc_ag_vextent_small(
bp = xfs_btree_get_bufs(args->mp, args->tp,
args->agno, fbno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_trans_binval(args->tp, bp);
}
args->len = 1;
@@ -2136,6 +2140,10 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto out_agbp_relse;
+ }
xfs_trans_binval(tp, bp);
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2852521..c6c15e5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2a8cbd1..7eb9970 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
#else
#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
#endif /* DEBUG */
/*
@@ -4057,6 +4057,17 @@ xfs_trim_extent(
}
}
+/* Trim the extent so that it does not extend beyond the in-core EOF. */
+void
+xfs_trim_extent_eof(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_inode *ip)
+
+{
+ xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
+ i_size_read(VFS_I(ip))));
+}
+
/*
* Trim the returned map to the required bounds
*/
@@ -5555,6 +5566,8 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t max_len;
+ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5576,6 +5589,16 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
+ /*
+ * Guesstimate how many blocks we can unmap without running the risk of
+ * blowing out the transaction with a mix of EFIs and reflink
+ * adjustments.
+ */
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
+ else
+ max_len = len;
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5621,7 +5644,7 @@ __xfs_bunmapi(
extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
- (nexts == 0 || extno < nexts)) {
+ (nexts == 0 || extno < nexts) && max_len > 0) {
/*
* Is the found extent after a hole in which bno lives?
* Just back up to the previous extent, if so.
@@ -5647,6 +5670,17 @@ __xfs_bunmapi(
ASSERT(ep != NULL);
del = got;
wasdel = isnullstartblock(del.br_startblock);
+
+ /*
+ * Make sure we don't touch multiple AGF headers out of order
+ * in a single transaction, as that could cause AB-BA deadlocks.
+ */
+ if (!wasdel) {
+ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
+ if (prev_agno != NULLAGNUMBER && prev_agno > agno)
+ break;
+ prev_agno = agno;
+ }
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
@@ -5655,6 +5689,15 @@ __xfs_bunmapi(
}
if (del.br_startoff + del.br_blockcount > bno + 1)
del.br_blockcount = bno + 1 - del.br_startoff;
+
+ /* How much can we safely unmap? */
+ if (max_len < del.br_blockcount) {
+ del.br_startoff += del.br_blockcount - max_len;
+ if (!wasdel)
+ del.br_startblock += del.br_blockcount - max_len;
+ del.br_blockcount = max_len;
+ }
+
sum = del.br_startblock + del.br_blockcount;
if (isrt &&
(mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
@@ -5835,6 +5878,7 @@ __xfs_bunmapi(
if (!isrt && wasdel)
xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
+ max_len -= del.br_blockcount;
bno = del.br_startoff - 1;
nodelete:
/*
@@ -6604,25 +6648,33 @@ xfs_bmap_finish_one(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
+ xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
struct xfs_bmbt_irec bmap;
int nimaps = 1;
xfs_fsblock_t firstfsb;
int flags = XFS_BMAPI_REMAP;
- int done;
int error = 0;
bmap.br_startblock = startblock;
bmap.br_startoff = startoff;
- bmap.br_blockcount = blockcount;
+ bmap.br_blockcount = *blockcount;
bmap.br_state = state;
+ /*
+ * firstfsb is tied to the transaction lifetime and is used to
+ * ensure correct AG locking order and schedule work item
+ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
+ * to only making one bmap call per transaction, so it should
+ * be safe to have it as a local variable here.
+ */
+ firstfsb = NULLFSBLOCK;
+
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- ip->i_ino, whichfork, startoff, blockcount, state);
+ ip->i_ino, whichfork, startoff, *blockcount, state);
if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
return -EFSCORRUPTED;
@@ -6641,12 +6693,11 @@ xfs_bmap_finish_one(
bmap.br_blockcount, flags, &firstfsb,
bmap.br_blockcount, &bmap, &nimaps,
dfops);
+ *blockcount = 0;
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, 1, &firstfsb,
- dfops, &done);
- ASSERT(done);
+ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e7d40b3..f1446d1 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -196,6 +196,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
+void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
@@ -265,7 +266,7 @@ struct xfs_bmap_intent {
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, enum xfs_bmap_intent_type type,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t blockcount, xfs_exntst_t state);
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 5c39186..9968a74 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
+ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 91c6891..4ad1e21 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -714,7 +714,8 @@ xfs_btree_firstrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
/*
* It's empty, there is no such record.
*/
@@ -743,7 +744,8 @@ xfs_btree_lastrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
/*
* It's empty, there is no such record.
*/
@@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block(
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
cur->bc_private.b.ip->i_ino)
@@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
+ return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
- else
+ } else {
+ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
+ return 0;
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
+ }
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner(
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
- if (bp) {
- if (cur->bc_tp) {
- xfs_trans_ordered_buf(cur->bc_tp, bp);
+ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ return 0;
+ }
+
+ if (cur->bc_tp) {
+ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
- } else {
- xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ return -EAGAIN;
}
} else {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
- ASSERT(level == cur->bc_nlevels - 1);
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3b0fc1a..33c7be2 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
+#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
+#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 1bdf288..b305dbf 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -263,7 +263,7 @@ xfs_da3_node_read(
err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
which_fork, &xfs_da3_node_buf_ops);
- if (!err && tp) {
+ if (!err && tp && *bpp) {
struct xfs_da_blkinfo *info = (*bpp)->b_addr;
int type;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index aa17cb7..43c902f 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -139,7 +139,7 @@ xfs_dir3_block_read(
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index b887fb2..f2e342e 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
}
@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a2818f6..42fef07 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
* transaction and pin the log appropriately.
*/
xfs_trans_ordered_buf(tp, fbuf);
- xfs_trans_log_buf(tp, fbuf, 0,
- BBTOB(fbuf->b_length) - 1);
}
} else {
fbuf->b_flags |= XBF_DONE;
@@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt(
int error;
int offset;
int i, j;
+ int searchdistance = 10;
pag = xfs_perag_get(mp, agno);
@@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt(
if (pagno == agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
- int searchdistance = 10;
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
@@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt(
/*
* Loop until we find an inode chunk with a free inode.
*/
- while (!doneleft || !doneright) {
+ while (--searchdistance > 0 && (!doneleft || !doneright)) {
int useleft; /* using left inode chunk this time */
- if (!--searchdistance) {
- /*
- * Not in range - save last search
- * location and allocate a new inode
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- pag->pagl_leftrec = trec.ir_startino;
- pag->pagl_rightrec = rec.ir_startino;
- pag->pagl_pagino = pagino;
- goto newino;
- }
-
/* figure out the closer block if both are valid. */
if (!doneleft && !doneright) {
useleft = pagino -
@@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt(
/* free inodes to the left? */
if (useleft && trec.ir_freecount) {
- rec = trec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
cur = tcur;
pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
+ rec = trec;
goto alloc_inode;
}
@@ -1268,26 +1254,37 @@ xfs_dialloc_ag_inobt(
goto error1;
}
- /*
- * We've reached the end of the btree. because
- * we are only searching a small chunk of the
- * btree each search, there is obviously free
- * inodes closer to the parent inode than we
- * are now. restart the search again.
- */
- pag->pagl_pagino = NULLAGINO;
- pag->pagl_leftrec = NULLAGINO;
- pag->pagl_rightrec = NULLAGINO;
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- goto restart_pagno;
+ if (searchdistance <= 0) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+
+ } else {
+ /*
+ * We've reached the end of the btree. Because
+ * we are only searching a small chunk of the
+ * btree each search, there are obviously free
+ * inodes closer to the parent inode than we
+ * are now. Restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
}
/*
* In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
-newino:
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
XFS_LOOKUP_EQ, &i);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe..4e30448 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
{
- int nlists; /* number of irec's (ex lists) */
- int size; /* current indirection array size */
-
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- size = nlists * sizeof(xfs_ext_irec_t);
ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) && (new_size != size));
+ ASSERT((new_size >= 0) &&
+ (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
+ sizeof(xfs_ext_irec_t))));
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 083cdd6..ce6958b 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -270,6 +270,7 @@ typedef struct xfs_inode_log_format {
__uint32_t ilf_fields; /* flags for fields logged */
__uint16_t ilf_asize; /* size of attr d/ext/root */
__uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint32_t ilf_pad; /* pad for 64 bit boundary */
__uint64_t ilf_ino; /* inode number */
union {
__uint32_t ilfu_rdev; /* rdev value for dev inode*/
@@ -280,29 +281,17 @@ typedef struct xfs_inode_log_format {
__int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_t;
-typedef struct xfs_inode_log_format_32 {
- __uint16_t ilf_type; /* inode log item type */
- __uint16_t ilf_size; /* size of this item */
- __uint32_t ilf_fields; /* flags for fields logged */
- __uint16_t ilf_asize; /* size of attr d/ext/root */
- __uint16_t ilf_dsize; /* size of data/ext/root */
- __uint64_t ilf_ino; /* inode number */
- union {
- __uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- __int64_t ilf_blkno; /* blkno of inode buffer */
- __int32_t ilf_len; /* len of inode buffer */
- __int32_t ilf_boffset; /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
__uint16_t ilf_type; /* inode log item type */
__uint16_t ilf_size; /* size of this item */
__uint32_t ilf_fields; /* flags for fields logged */
__uint16_t ilf_asize; /* size of attr d/ext/root */
__uint16_t ilf_dsize; /* size of data/ext/root */
- __uint32_t ilf_pad; /* pad for 64 bit boundary */
__uint64_t ilf_ino; /* inode number */
union {
__uint32_t ilfu_rdev; /* rdev value for dev inode*/
@@ -311,7 +300,7 @@ typedef struct xfs_inode_log_format_64 {
__int64_t ilf_blkno; /* blkno of inode buffer */
__int32_t ilf_len; /* len of inode buffer */
__int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
+} __attribute__((packed));
/*
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 82a38d8..d71cb63 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
}
/*
- * While we're adjusting the refcounts records of an extent, we have
- * to keep an eye on the number of extents we're dirtying -- run too
- * many in a single transaction and we'll exceed the transaction's
- * reservation and crash the fs. Each record adds 12 bytes to the
- * log (plus any key updates) so we'll conservatively assume 24 bytes
- * per record. We must also leave space for btree splits on both ends
- * of the range and space for the CUD and a new CUI.
- *
* XXX: This is a pretty hand-wavy estimate. The penalty for guessing
* true incorrectly is a shutdown FS; the penalty for guessing false
* incorrectly is more transaction rolls than might be necessary.
@@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * 32;
+ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
goto out_trans;
+ if (!agbp) {
+ error = -ENOMEM;
+ goto out_trans;
+ }
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
/* Find all the leftover CoW staging extents. */
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 098dc66..eafb9d1 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD 32
+
+static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
+{
+ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
+}
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b468e04..3354140 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -170,8 +170,8 @@ xfs_get_acl(struct inode *inode, int type)
return acl;
}
-STATIC int
-__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+int
+__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
struct xfs_inode *ip = XFS_I(inode);
unsigned char *ea_name;
@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode)
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ umode_t mode;
+ bool set_mode = false;
int error = 0;
if (!acl)
@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- umode_t mode;
-
error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
- error = xfs_set_mode(inode, mode);
- if (error)
- return error;
+ set_mode = true;
}
set_acl:
- return __xfs_set_acl(inode, type, acl);
+ error = __xfs_set_acl(inode, acl, type);
+ if (error)
+ return error;
+
+ /*
+ * We set the mode after successfully updating the ACL xattr because the
+ * xattr update can fail at ENOSPC and we don't want to change the mode
+ * if the ACL update hasn't been applied.
+ */
+ if (set_mode)
+ error = xfs_set_mode(inode, mode);
+
+ return error;
}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 286fa89..0432731 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -24,6 +24,7 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5789814..d31cd1e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- * the page at all, as we may be racing with memory reclaim and it can free both
- * the bufferhead chain and the page as it will see the page as clean and
- * unused.
+ * Note that we open code the action in end_buffer_async_write here so that we
+ * only have to iterate over the buffers attached to the page once. This is not
+ * only more efficient, but also ensures that we only call end_page_writeback
+ * at the end of the iteration, and thus avoids the pitfall of having the page
+ * and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
- unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
- struct buffer_head *head, *bh, *next;
+ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
+ bool busy = false;
unsigned int off = 0;
- unsigned int bsize;
+ unsigned long flags;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
- ASSERT(end < PAGE_SIZE);
+ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
- bh = head = page_buffers(bvec->bv_page);
-
- bsize = bh->b_size;
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
- if (off > end)
- break;
- next = bh->b_this_page;
- if (off < bvec->bv_offset)
- goto next_bh;
- bh->b_end_io(bh, !error);
-next_bh:
- off += bsize;
- } while ((bh = next) != head);
+ if (off >= bvec->bv_offset &&
+ off < bvec->bv_offset + bvec->bv_len) {
+ ASSERT(buffer_async_write(bh));
+ ASSERT(bh->b_end_io == NULL);
+
+ if (error) {
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
+ set_buffer_write_io_error(bh);
+ clear_buffer_uptodate(bh);
+ SetPageError(bvec->bv_page);
+ } else {
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } else if (buffer_async_write(bh)) {
+ ASSERT(buffer_locked(bh));
+ busy = true;
+ }
+ off += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+
+ if (!busy)
+ end_page_writeback(bvec->bv_page);
}
/*
@@ -138,8 +154,10 @@ xfs_destroy_ioend(
int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *last = ioend->io_bio;
- struct bio *bio, *next;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
@@ -160,6 +178,11 @@ xfs_destroy_ioend(
bio_put(bio);
}
+
+ if (unlikely(error && !quiet)) {
+ xfs_err_ratelimited(XFS_I(inode)->i_mount,
+ "writeback error on sector %llu", start);
+ }
}
/*
@@ -312,7 +335,8 @@ xfs_end_io(
error = xfs_reflink_end_cow(ip, offset, size);
break;
case XFS_IO_UNWRITTEN:
- error = xfs_iomap_write_unwritten(ip, offset, size);
+ /* writeback should never update isize */
+ error = xfs_iomap_write_unwritten(ip, offset, size, false);
break;
default:
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
@@ -414,6 +438,19 @@ xfs_imap_valid(
{
offset >>= inode->i_blkbits;
+ /*
+ * We have to make sure the cached mapping is within EOF to protect
+ * against eofblocks trimming on file release leaving us with a stale
+ * mapping. Otherwise, a page for a subsequent file extending buffered
+ * write could get picked up by this writeback cycle and written to the
+ * wrong blocks.
+ *
+ * Note that what we really want here is a generic mapping invalidation
+ * mechanism to protect us from arbitrary extent modifying contexts, not
+ * just eofblocks.
+ */
+ xfs_trim_extent_eof(imap, XFS_I(inode));
+
return offset >= imap->br_startoff &&
offset < imap->br_startoff + imap->br_blockcount;
}
@@ -427,7 +464,8 @@ xfs_start_buffer_writeback(
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));
- mark_buffer_async_write(bh);
+ bh->b_end_io = NULL;
+ set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}
@@ -701,6 +739,14 @@ xfs_vm_invalidatepage(
{
trace_xfs_invalidatepage(page->mapping->host, page, offset,
length);
+
+ /*
+ * If we are invalidating the entire page, clear the dirty state from it
+ * so that we can check for attempts to release dirty cached pages in
+ * xfs_vm_releasepage().
+ */
+ if (offset == 0 && length >= PAGE_SIZE)
+ cancel_dirty_page(page);
block_invalidatepage(page, offset, length);
}
@@ -1156,25 +1202,27 @@ xfs_vm_releasepage(
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
* ->releasepage() via shrink_active_list(). Conversely,
- * block_invalidatepage() can send pages that are still marked dirty
- * but otherwise have invalidated buffers.
+ * block_invalidatepage() can send pages that are still marked dirty but
+ * otherwise have invalidated buffers.
*
* We want to release the latter to avoid unnecessary buildup of the
- * LRU, skip the former and warn if we've left any lingering
- * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
- * or unwritten buffers and warn if the page is not dirty. Otherwise
- * try to release the buffers.
+ * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+ * that are entirely invalidated and need to be released. Hence the
+ * only time we should get dirty pages here is through
+ * shrink_active_list() and so we can simply skip those now.
+ *
+ * Warn if we've left any lingering delalloc/unwritten buffers on clean
+ * or invalidated pages we are about to release.
*/
+ if (PageDirty(page))
+ return 0;
+
xfs_count_page_state(page, &delalloc, &unwritten);
- if (delalloc) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(delalloc))
return 0;
- }
- if (unwritten) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(unwritten))
return 0;
- }
return try_to_free_buffers(page);
}
@@ -1508,6 +1556,21 @@ xfs_end_io_direct_write(
return 0;
}
+ if (flags & XFS_DIO_FLAG_COW)
+ error = xfs_reflink_end_cow(ip, offset, size);
+
+ /*
+ * Unwritten conversion updates the in-core isize after extent
+ * conversion but before updating the on-disk size. Updating isize any
+ * earlier allows a racing dio read to find unwritten extents before
+ * they are converted.
+ */
+ if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+ trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
+
+ return xfs_iomap_write_unwritten(ip, offset, size, true);
+ }
+
/*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
@@ -1524,13 +1587,6 @@ xfs_end_io_direct_write(
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
- if (flags & XFS_DIO_FLAG_COW)
- error = xfs_reflink_end_cow(ip, offset, size);
- if (flags & XFS_DIO_FLAG_UNWRITTEN) {
- trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-
- error = xfs_iomap_write_unwritten(ip, offset, size);
- }
if (flags & XFS_DIO_FLAG_APPEND) {
trace_xfs_end_io_direct_write_append(ip, offset, size);
@@ -1566,9 +1622,12 @@ xfs_vm_bmap(
* The swap code (ab-)uses ->bmap to get a block mapping and then
* bypasseѕ the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
- * 0 is the magic code for a bmap error..
+ * 0 is the magic code for a bmap error.
+ *
+ * Since we don't pass back blockdev info, we can't return bmap
+ * information for rt files either.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return 0;
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index be0b79d..c664300 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -302,6 +302,8 @@ xfs_attr3_node_inactive(
&bp, XFS_ATTR_FORK);
if (error)
return error;
+ node = bp->b_addr;
+ btree = dp->d_ops->node_tree_p(node);
child_fsb = be32_to_cpu(btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index c4b90e7..5a54dcd 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -395,6 +395,7 @@ xfs_bui_recover(
struct xfs_map_extent *bmap;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
+ xfs_filblks_t count;
bool op_ok;
struct xfs_bud_log_item *budp;
enum xfs_bmap_intent_type type;
@@ -403,6 +404,7 @@ xfs_bui_recover(
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_defer_ops dfops;
+ struct xfs_bmbt_irec irec;
xfs_fsblock_t firstfsb;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -480,13 +482,24 @@ xfs_bui_recover(
}
xfs_trans_ijoin(tp, ip, 0);
+ count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
ip, whichfork, bmap->me_startoff,
- bmap->me_startblock, bmap->me_len,
- state);
+ bmap->me_startblock, &count, state);
if (error)
goto err_dfops;
+ if (count > 0) {
+ ASSERT(type == XFS_BMAP_UNMAP);
+ irec.br_startblock = bmap->me_startblock;
+ irec.br_blockcount = count;
+ irec.br_startoff = bmap->me_startoff;
+ irec.br_state = state;
+ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+ if (error)
+ goto err_dfops;
+ }
+
/* Finish transaction, free inodes. */
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 87b495e..cb62871 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -84,6 +84,7 @@ xfs_zero_extent(
GFP_NOFS, true);
}
+#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
@@ -195,6 +196,7 @@ xfs_bmap_rtalloc(
}
return 0;
}
+#endif /* CONFIG_XFS_RT */
/*
* Check if the endoff is outside the last extent. If so the caller will grow
@@ -1445,7 +1447,19 @@ xfs_shift_file_space(
return error;
/*
- * The extent shiting code works on extent granularity. So, if
+ * Clean out anything hanging around in the cow fork now that
+ * we've flushed all the dirty data out to disk to avoid having
+ * CoW extents at the wrong offsets.
+ */
+ if (xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
+ true);
+ if (error)
+ return error;
+ }
+
+ /*
+ * The extent shifting code works on extent granularity. So, if
* stop_fsb is not the starting block of extent, we need to split
* the extent at stop_fsb.
*/
@@ -1825,29 +1839,18 @@ xfs_swap_extent_forks(
}
/*
- * Before we've swapped the forks, lets set the owners of the forks
- * appropriately. We have to do this as we are demand paging the btree
- * buffers, and so the validation done on read will expect the owner
- * field to be correctly set. Once we change the owners, we can swap the
- * inode forks.
+ * Btree format (v3) inodes have the inode number stamped in the bmbt
+ * block headers. We can't start changing the bmbt blocks until the
+ * inode owner change is logged so recovery does the right thing in the
+ * event of a crash. Set the owner change log flags now and leave the
+ * bmbt scan as the last step.
*/
if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
- tip->i_ino, NULL);
- if (error)
- return error;
- }
-
if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*src_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
- ip->i_ino, NULL);
- if (error)
- return error;
- }
/*
* Swap the data forks of the inodes
@@ -1925,6 +1928,48 @@ xfs_swap_extent_forks(
return 0;
}
+/*
+ * Fix up the owners of the bmbt blocks to refer to the current inode. The
+ * change owner scan attempts to order all modified buffers in the current
+ * transaction. In the event of ordered buffer failure, the offending buffer is
+ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
+ * the transaction in this case to replenish the fallback log reservation and
+ * restart the scan. This process repeats until the scan completes.
+ */
+static int
+xfs_swap_change_owner(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tmpip)
+{
+ int error;
+ struct xfs_trans *tp = *tpp;
+
+ do {
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
+ NULL);
+ /* success or fatal error */
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_trans_roll(tpp, NULL);
+ if (error)
+ break;
+ tp = *tpp;
+
+ /*
+ * Redirty both inodes so they can relog and keep the log tail
+ * moving forward.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tmpip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
+ } while (true);
+
+ return error;
+}
+
int
xfs_swap_extents(
struct xfs_inode *ip, /* target inode */
@@ -1938,8 +1983,8 @@ xfs_swap_extents(
int error = 0;
int lock_flags;
struct xfs_ifork *cowfp;
- __uint64_t f;
- int resblks;
+ uint64_t f;
+ int resblks = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1987,11 +2032,8 @@ xfs_swap_extents(
XFS_SWAP_RMAP_SPACE_RES(mp,
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- 0, 0, &tp);
- } else
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
- 0, 0, &tp);
+ }
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out_unlock;
@@ -2066,17 +2108,54 @@ xfs_swap_extents(
ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+ }
+
+ /* Swap the cow forks. */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ xfs_extnum_t extnum;
+
+ ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+
+ extnum = ip->i_cnextents;
+ ip->i_cnextents = tip->i_cnextents;
+ tip->i_cnextents = extnum;
+
cowfp = ip->i_cowfp;
ip->i_cowfp = tip->i_cowfp;
tip->i_cowfp = cowfp;
- xfs_inode_set_cowblocks_tag(ip);
- xfs_inode_set_cowblocks_tag(tip);
+
+ if (ip->i_cowfp && ip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+ if (tip->i_cowfp && tip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(tip);
+ else
+ xfs_inode_clear_cowblocks_tag(tip);
}
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);
/*
+ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
+ * have inode number owner values in the bmbt blocks that still refer to
+ * the old inode. Scan each bmbt to fix up the owner values with the
+ * inode number of the current inode.
+ */
+ if (src_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, ip, tip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if (target_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, tip, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index f100539..ce330f0 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -28,7 +28,20 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+#else /* !CONFIG_XFS_RT */
+/*
+ * Attempts to allocate RT extents when RT is disabled indicate corruption and
+ * should trigger a shutdown.
+ */
+static inline int
+xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
+{
+ return -EFSCORRUPTED;
+}
+#endif /* CONFIG_XFS_RT */
+
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1626927..eca7bae 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -116,7 +116,7 @@ static inline void
__xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
- ASSERT(spin_is_locked(&bp->b_lock));
+ lockdep_assert_held(&bp->b_lock);
if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit(
return error;
}
+/*
+ * Push a single buffer on a delwri queue.
+ *
+ * The purpose of this function is to submit a single buffer of a delwri queue
+ * and return with the buffer still on the original queue. The waiting delwri
+ * buffer submission infrastructure guarantees transfer of the delwri queue
+ * buffer reference to a temporary wait list. We reuse this infrastructure to
+ * transfer the buffer back to the original queue.
+ *
+ * Note the buffer transitions from the queued state, to the submitted and wait
+ * listed state and back to the queued state during this call. The buffer
+ * locking and queue management logic between _delwri_pushbuf() and
+ * _delwri_queue() guarantee that the buffer cannot be queued to another list
+ * before returning.
+ */
+int
+xfs_buf_delwri_pushbuf(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ LIST_HEAD (submit_list);
+ int error;
+
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
+
+ /*
+ * Isolate the buffer to a new local list so we can submit it for I/O
+ * independently from the rest of the original list.
+ */
+ xfs_buf_lock(bp);
+ list_move(&bp->b_list, &submit_list);
+ xfs_buf_unlock(bp);
+
+ /*
+ * Delwri submission clears the DELWRI_Q buffer flag and returns with
+ * the buffer on the wait list with an associated reference. Rather than
+ * bounce the buffer from a local wait list back to the original list
+ * after I/O completion, reuse the original list as the wait list.
+ */
+ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
+
+ /*
+ * The buffer is now under I/O and wait listed as during typical delwri
+ * submission. Lock the buffer to wait for I/O completion. Rather than
+ * remove the buffer from the wait list and release the reference, we
+ * want to return with the buffer queued to the original list. The
+ * buffer already sits on the original list with a wait list reference,
+ * however. If we let the queue inherit that wait list reference, all we
+ * need to do is reset the DELWRI_Q flag.
+ */
+ xfs_buf_lock(bp);
+ error = bp->b_error;
+ bp->b_flags |= _XBF_DELWRI_Q;
+ xfs_buf_unlock(bp);
+
+ return error;
+}
+
int __init
xfs_buf_init(void)
{
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index ad514a8..f961b19 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
+extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
/* Buffer Daemon Setup Routines */
extern int xfs_buf_init(void);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 0306168..e0a0af0 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -322,6 +323,8 @@ xfs_buf_item_format(
ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
(xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
&& xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
/*
@@ -346,16 +349,6 @@ xfs_buf_item_format(
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
- if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
- XFS_BLI_ORDERED) {
- /*
- * The buffer has been logged just to order it. It is not being
- * included in the transaction commit, so don't format it.
- */
- trace_xfs_buf_item_format_ordered(bip);
- return;
- }
-
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
@@ -574,26 +567,20 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool clean;
- bool aborted;
- int flags;
+ bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
+ bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
+ bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+#endif
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
/*
- * If this is a transaction abort, don't return early. Instead, allow
- * the brelse to happen. Normally it would be done for stale
- * (cancelled) buffers at unpin time, but we'll never go through the
- * pin/unpin cycle if we abort inside commit.
- */
- aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
- /*
- * Before possibly freeing the buf item, copy the per-transaction state
- * so we can reference it safely later after clearing it from the
- * buffer log item.
+ * The per-transaction state has been copied above so clear it from the
+ * bli.
*/
- flags = bip->bli_flags;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
@@ -601,7 +588,7 @@ xfs_buf_item_unlock(
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (flags & XFS_BLI_STALE) {
+ if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -619,40 +606,34 @@ xfs_buf_item_unlock(
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
*
- * Ordered buffers are dirty but may have no recorded changes, so ensure
- * we only release clean items here.
+ * The bli dirty state should match whether the blf has logged segments
+ * except for ordered buffers, where only the bli should be dirty.
*/
- clean = (flags & XFS_BLI_DIRTY) ? false : true;
- if (clean) {
- int i;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = false;
- break;
- }
- }
- }
+ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
+ (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
/*
* Clean buffers, by definition, cannot be in the AIL. However, aborted
- * buffers may be dirty and hence in the AIL. Therefore if we are
- * aborting a buffer and we've just taken the last refernce away, we
- * have to check if it is in the AIL before freeing it. We need to free
- * it in this case, because an aborted transaction has already shut the
- * filesystem down and this is the last chance we will have to do so.
+ * buffers may be in the AIL regardless of dirty state. An aborted
+ * transaction that invalidates a buffer already in the AIL may have
+ * marked it stale and cleared the dirty state, for example.
+ *
+ * Therefore if we are aborting a buffer and we've just taken the last
+ * reference away, we have to check if it is in the AIL before freeing
+ * it. We need to free it in this case, because an aborted transaction
+ * has already shut the filesystem down and this is the last chance we
+ * will have to do so.
*/
if (atomic_dec_and_test(&bip->bli_refcount)) {
- if (clean)
- xfs_buf_item_relse(bp);
- else if (aborted) {
+ if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
- }
+ } else if (!dirty)
+ xfs_buf_item_relse(bp);
}
- if (!(flags & XFS_BLI_HOLD))
+ if (!hold)
xfs_buf_relse(bp);
}
@@ -942,14 +923,22 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has been logged or ordered in a transaction (at any
- * point, not just the current transaction) and 0 if not.
+ * Return true if the buffer has any ranges logged/dirtied by a transaction,
+ * false otherwise.
*/
-uint
-xfs_buf_item_dirty(
- xfs_buf_log_item_t *bip)
+bool
+xfs_buf_item_dirty_format(
+ struct xfs_buf_log_item *bip)
{
- return (bip->bli_flags & XFS_BLI_DIRTY);
+ int i;
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size))
+ return true;
+ }
+
+ return false;
}
STATIC void
@@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks(
}
}
+/*
+ * Invoke the error state callback for each log item affected by the failed I/O.
+ *
+ * If a metadata buffer write fails with a non-permanent error, the buffer is
+ * eventually resubmitted and so the completion callbacks are not run. The error
+ * state may need to be propagated to the log items attached to the buffer,
+ * however, so the next AIL push of the item knows how to handle it correctly.
+ */
+STATIC void
+xfs_buf_do_callbacks_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *next;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ spin_lock(&ailp->xa_lock);
+ for (; lip; lip = next) {
+ next = lip->li_bio_list;
+ if (lip->li_ops->iop_error)
+ lip->li_ops->iop_error(lip, bp);
+ }
+ spin_unlock(&ailp->xa_lock);
+}
+
static bool
xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
@@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error(
if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
goto permanent_error;
- /* still a transient error, higher layers will retry */
+ /*
+ * Still a transient error, run IO completion failure callbacks and let
+ * the higher layers retry the buffer.
+ */
+ xfs_buf_do_callbacks_fail(bp);
xfs_buf_ioerror(bp, 0);
xfs_buf_relse(bp);
return true;
@@ -1201,3 +1219,31 @@ xfs_buf_iodone(
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
+
+/*
+ * Requeue a failed buffer for writeback
+ *
+ * Return true if the buffer has been re-queued properly, false otherwise
+ */
+bool
+xfs_buf_resubmit_failed_buffers(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_log_item *next;
+
+ /*
+ * Clear XFS_LI_FAILED flag from all items before resubmit
+ *
+ * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this
+ * function already holds it.
+ */
+ for (; lip; lip = next) {
+ next = lip->li_bio_list;
+ xfs_clear_li_failed(lip);
+ }
+
+ /* Add this buffer back to the delayed write list */
+ return xfs_buf_delwri_queue(bp, buffer_list);
+}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f7eba99..9690ce6 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
-uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
+bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
+ struct xfs_log_item *,
+ struct list_head *);
extern kmem_zone_t *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed7ee4e..bcf7297 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -167,7 +167,7 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
__return_address, bp->b_ops->name, bp->b_bn);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index df206cf..362c6b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -92,7 +92,7 @@ xfs_zero_range(
xfs_off_t count,
bool *did_zero)
{
- return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
+ return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
}
int
@@ -729,6 +729,7 @@ write_retry:
xfs_rw_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ xfs_icache_free_cowblocks(ip->i_mount, &eofb);
goto write_retry;
}
@@ -1139,29 +1140,8 @@ xfs_find_get_desired_pgoff(
want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
want);
- /*
- * No page mapped into given range. If we are searching holes
- * and if this is the first time we got into the loop, it means
- * that the given offset is landed in a hole, return it.
- *
- * If we have already stepped through some block buffers to find
- * holes but they all contains data. In this case, the last
- * offset is already updated and pointed to the end of the last
- * mapped page, if it does not reach the endpoint to search,
- * that means there should be a hole between them.
- */
- if (nr_pages == 0) {
- /* Data search found nothing */
- if (type == DATA_OFF)
- break;
-
- ASSERT(type == HOLE_OFF);
- if (lastoff == startoff || lastoff < endoff) {
- found = true;
- *offset = lastoff;
- }
+ if (nr_pages == 0)
break;
- }
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff(
/*
* The number of returned pages less than our desired, search
- * done. In this case, nothing was found for searching data,
- * but we found a hole behind the last offset.
+ * done.
*/
- if (nr_pages < want) {
- if (type == HOLE_OFF) {
- *offset = lastoff;
- found = true;
- }
+ if (nr_pages < want)
break;
- }
index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
+ /* No page at lastoff and we are not done - we found a hole. */
+ if (type == HOLE_OFF && lastoff < endoff) {
+ *offset = lastoff;
+ found = true;
+ }
out:
pagevec_release(&pvec);
return found;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 74304b6..86a4911 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -66,7 +66,6 @@ xfs_inode_alloc(
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (pag->pag_ici_reclaimable++)
return;
@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag(
{
struct xfs_mount *mp = pag->pag_mount;
- ASSERT(spin_is_locked(&pag->pag_ici_lock));
+ lockdep_assert_held(&pag->pag_ici_lock);
if (--pag->pag_ici_reclaimable)
return;
@@ -1079,11 +1078,11 @@ reclaim:
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
* We do this as early as possible under the ILOCK so that
- * xfs_iflush_cluster() can be guaranteed to detect races with us here.
- * By doing this, we guarantee that once xfs_iflush_cluster has locked
- * XFS_ILOCK that it will see either a valid, flushable inode that will
- * serialise correctly, or it will see a clean (and invalid) inode that
- * it can skip.
+ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
+ * detect races with us here. By doing this, we guarantee that once
+ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
+ * it will see either a valid inode that will serialise correctly, or it
+ * will see an invalid inode that it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7a0b4ee..fe9a9a1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -881,7 +881,6 @@ xfs_ialloc(
case S_IFREG:
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint64_t di_flags2 = 0;
uint di_flags = 0;
if (S_ISDIR(mode)) {
@@ -918,20 +917,23 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_NODEFRAG;
if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
ip->i_d.di_flags |= di_flags;
- ip->i_d.di_flags2 |= di_flags2;
}
if (pip &&
(pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
pip->i_d.di_version == 3 &&
ip->i_d.di_version == 3) {
+ uint64_t di_flags2 = 0;
+
if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
}
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+
+ ip->i_d.di_flags2 |= di_flags2;
}
/* FALLTHROUGH */
case S_IFLNK:
@@ -1630,10 +1632,12 @@ xfs_itruncate_extents(
goto out;
/*
- * Clear the reflink flag if we truncated everything.
+ * Clear the reflink flag if there are no data fork blocks and
+ * there are no extents staged in the cow fork.
*/
- if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
+ if (ip->i_d.di_nblocks == 0)
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
}
@@ -2366,11 +2370,24 @@ retry:
* already marked stale. If we can't lock it, back off
* and retry.
*/
- if (ip != free_ip &&
- !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+
+ /*
+ * Check the inode number again in case we're
+ * racing with freeing in xfs_reclaim_inode().
+ * See the comments in that function for more
+ * information as to why the initial check is
+ * not sufficient.
+ */
+ if (ip->i_ino != inum + i) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ continue;
+ }
}
rcu_read_unlock();
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e781..d0a3c4b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -363,6 +364,9 @@ xfs_inode_to_log_dinode(
to->di_dmstate = from->di_dmstate;
to->di_flags = from->di_flags;
+ /* log a dummy value to ensure log structure is fully initialised */
+ to->di_next_unlinked = NULLAGINO;
+
if (from->di_version == 3) {
to->di_changecount = inode->i_version;
to->di_crtime.t_sec = from->di_crtime.t_sec;
@@ -403,6 +407,11 @@ xfs_inode_item_format_core(
* the second with the on-disk inode structure, and a possible third and/or
* fourth with the inode data/extents/b-tree root and inode attributes
* data/extents/b-tree root.
+ *
+ * Note: Always use the 64 bit inode log format structure so we don't
+ * leave an uninitialised hole in the format item on 64 bit systems. Log
+ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using and initialising the properly padded structure all the time.
*/
STATIC void
xfs_inode_item_format(
@@ -411,8 +420,8 @@ xfs_inode_item_format(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- struct xfs_inode_log_format *ilf;
struct xfs_log_iovec *vecp = NULL;
+ struct xfs_inode_log_format *ilf;
ASSERT(ip->i_d.di_version > 1);
@@ -424,7 +433,17 @@ xfs_inode_item_format(
ilf->ilf_boffset = ip->i_imap.im_boffset;
ilf->ilf_fields = XFS_ILOG_CORE;
ilf->ilf_size = 2; /* format + core */
- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+ /*
+ * make sure we don't leak uninitialised data into the log in the case
+ * when we don't log every field in the inode.
+ */
+ ilf->ilf_dsize = 0;
+ ilf->ilf_asize = 0;
+ ilf->ilf_pad = 0;
+ memset(&ilf->ilf_u.ilfu_uuid, 0, sizeof(ilf->ilf_u.ilfu_uuid));
+
+ xlog_finish_iovec(lv, vecp, sizeof(*ilf));
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
@@ -475,6 +494,23 @@ xfs_inode_item_unpin(
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
+/*
+ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
+ * have been failed during writeback
+ *
+ * This informs the AIL that the inode is already flush locked on the next push,
+ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
+ * dirty data makes it to disk.
+ */
+STATIC void
+xfs_inode_item_error(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
+ xfs_set_li_failed(lip, bp);
+}
+
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -484,13 +520,28 @@ xfs_inode_item_push(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- struct xfs_buf *bp = NULL;
+ struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
+ /*
+ * The buffer containing this item failed to be written back
+ * previously. Resubmit the buffer for IO.
+ */
+ if (lip->li_flags & XFS_LI_FAILED) {
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+
+ xfs_buf_unlock(bp);
+ return rval;
+ }
+
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
@@ -622,7 +673,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_committing = xfs_inode_item_committing
+ .iop_committing = xfs_inode_item_committing,
+ .iop_error = xfs_inode_item_error
};
@@ -710,7 +762,8 @@ xfs_iflush_done(
* the AIL lock.
*/
iip = INODE_ITEM(blip);
- if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+ if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
+ (blip->li_flags & XFS_LI_FAILED))
need_ail++;
blip = next;
@@ -718,7 +771,8 @@ xfs_iflush_done(
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+ if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
+ lip->li_flags & XFS_LI_FAILED)
need_ail++;
/*
@@ -731,22 +785,30 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- struct xfs_log_item *log_items[need_ail];
- int i = 0;
+ bool mlip_changed = false;
+
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->xa_lock);
for (blip = lip; blip; blip = blip->li_bio_list) {
- iip = INODE_ITEM(blip);
- if (iip->ili_logged &&
- blip->li_lsn == iip->ili_flush_lsn) {
- log_items[i++] = blip;
+ if (INODE_ITEM(blip)->ili_logged &&
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+ mlip_changed |= xfs_ail_delete_one(ailp, blip);
+ else {
+ xfs_clear_li_failed(blip);
}
- ASSERT(i <= need_ail);
}
- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i,
- SHUTDOWN_CORRUPT_INCORE);
- }
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+ }
+ spin_unlock(&ailp->xa_lock);
+
+ if (mlip_changed)
+ xfs_log_space_wake(ailp->xa_mount);
+ }
/*
* clean up and unlock the flush lock now we are done. We can clear the
@@ -811,48 +873,30 @@ xfs_istale_done(
}
/*
- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
- * (which can have different field alignments) to the native version
+ * convert an xfs_inode_log_format struct from the old 32 bit version
+ * (which can have different field alignments) to the native 64 bit version
*/
int
xfs_inode_item_format_convert(
- xfs_log_iovec_t *buf,
- xfs_inode_log_format_t *in_f)
+ struct xfs_log_iovec *buf,
+ struct xfs_inode_log_format *in_f)
{
- if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
- xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
-
- in_f->ilf_type = in_f32->ilf_type;
- in_f->ilf_size = in_f32->ilf_size;
- in_f->ilf_fields = in_f32->ilf_fields;
- in_f->ilf_asize = in_f32->ilf_asize;
- in_f->ilf_dsize = in_f32->ilf_dsize;
- in_f->ilf_ino = in_f32->ilf_ino;
- /* copy biggest field of ilf_u */
- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
- in_f32->ilf_u.ilfu_uuid.__u_bits,
- sizeof(uuid_t));
- in_f->ilf_blkno = in_f32->ilf_blkno;
- in_f->ilf_len = in_f32->ilf_len;
- in_f->ilf_boffset = in_f32->ilf_boffset;
- return 0;
- } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
- xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
-
- in_f->ilf_type = in_f64->ilf_type;
- in_f->ilf_size = in_f64->ilf_size;
- in_f->ilf_fields = in_f64->ilf_fields;
- in_f->ilf_asize = in_f64->ilf_asize;
- in_f->ilf_dsize = in_f64->ilf_dsize;
- in_f->ilf_ino = in_f64->ilf_ino;
- /* copy biggest field of ilf_u */
- memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
- in_f64->ilf_u.ilfu_uuid.__u_bits,
- sizeof(uuid_t));
- in_f->ilf_blkno = in_f64->ilf_blkno;
- in_f->ilf_len = in_f64->ilf_len;
- in_f->ilf_boffset = in_f64->ilf_boffset;
- return 0;
- }
- return -EFSCORRUPTED;
+ struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+
+ if (buf->i_len != sizeof(*in_f32))
+ return -EFSCORRUPTED;
+
+ in_f->ilf_type = in_f32->ilf_type;
+ in_f->ilf_size = in_f32->ilf_size;
+ in_f->ilf_fields = in_f32->ilf_fields;
+ in_f->ilf_asize = in_f32->ilf_asize;
+ in_f->ilf_dsize = in_f32->ilf_dsize;
+ in_f->ilf_ino = in_f32->ilf_ino;
+ /* copy biggest field of ilf_u */
+ memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+ in_f32->ilf_u.ilfu_uuid.__u_bits, sizeof(uuid_t));
+ in_f->ilf_blkno = in_f32->ilf_blkno;
+ in_f->ilf_len = in_f32->ilf_len;
+ in_f->ilf_boffset = in_f32->ilf_boffset;
+ return 0;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 73cfc71..6c95812 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr(
return 0;
}
-STATIC void
-xfs_set_diflags(
+STATIC uint16_t
+xfs_flags2diflags(
struct xfs_inode *ip,
unsigned int xflags)
{
- unsigned int di_flags;
- uint64_t di_flags2;
-
/* can't set PREALLOC this way, just preserve it */
- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+ uint16_t di_flags =
+ (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+
if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
if (xflags & FS_XFLAG_APPEND)
@@ -967,19 +966,24 @@ xfs_set_diflags(
if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
- ip->i_d.di_flags = di_flags;
- /* diflags2 only valid for v3 inodes. */
- if (ip->i_d.di_version < 3)
- return;
+ return di_flags;
+}
+
+STATIC uint64_t
+xfs_flags2diflags2(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ uint64_t di_flags2 =
+ (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
- di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
if (xflags & FS_XFLAG_COWEXTSIZE)
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_flags2 = di_flags2;
+ return di_flags2;
}
STATIC void
@@ -1005,11 +1009,12 @@ xfs_diflags_to_linux(
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+#if 0 /* disabled until the flag switching races are sorted out */
if (xflags & FS_XFLAG_DAX)
inode->i_flags |= S_DAX;
else
inode->i_flags &= ~S_DAX;
-
+#endif
}
static int
@@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ uint64_t di_flags2;
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
@@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags(
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
- xfs_set_diflags(ip, fa->fsx_xflags);
+ /* diflags2 only valid for v3 inodes. */
+ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
+ if (di_flags2 && ip->i_d.di_version < 3)
+ return -EINVAL;
+
+ ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ ip->i_d.di_flags2 = di_flags2;
+
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1072,6 +1085,7 @@ xfs_ioctl_setattr_dax_invalidate(
int *join_flags)
{
struct inode *inode = VFS_I(ip);
+ struct super_block *sb = inode->i_sb;
int error;
*join_flags = 0;
@@ -1084,7 +1098,7 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+ if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
return -EINVAL;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 65740d1..f286f63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -836,7 +836,8 @@ int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
xfs_off_t offset,
- xfs_off_t count)
+ xfs_off_t count,
+ bool update_isize)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -847,6 +848,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
struct xfs_defer_ops dfops;
+ struct inode *inode = VFS_I(ip);
xfs_fsize_t i_size;
uint resblks;
int error;
@@ -906,7 +908,8 @@ xfs_iomap_write_unwritten(
i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
if (i_size > offset + count)
i_size = offset + count;
-
+ if (update_isize && i_size > i_size_read(inode))
+ i_size_write(inode, i_size);
i_size = xfs_new_eof(ip, i_size);
if (i_size) {
ip->i_d.di_size = i_size;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6d45cf0..d71703a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f5e0f60..5b81f7f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -190,12 +190,12 @@ xfs_generic_create(
#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
goto out_cleanup_inode;
}
if (acl) {
- error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ error = __xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
if (error)
goto out_cleanup_inode;
}
@@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize(
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
-int
+STATIC int
xfs_setattr_size(
struct xfs_inode *ip,
struct iattr *iattr)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index d8a77db..26d67ce 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -361,7 +361,6 @@ xfs_bulkstat(
xfs_agino_t agino; /* inode # in allocation group */
xfs_agnumber_t agno; /* allocation group number */
xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
- size_t irbsize; /* size of irec buffer in bytes */
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
int nirbuf; /* size of irbuf */
int ubcount; /* size of user's buffer */
@@ -388,11 +387,10 @@ xfs_bulkstat(
*ubcountp = 0;
*done = 0;
- irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+ irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP);
if (!irbuf)
return -ENOMEM;
-
- nirbuf = irbsize / sizeof(*irbuf);
+ nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf);
/*
* Loop over the allocation groups, starting from the last
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 1455b2520..3ebed16 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -363,7 +363,14 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
#endif /* DEBUG */
#ifdef CONFIG_XFS_RT
-#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+
+/*
+ * make sure we ignore the inode flag if the filesystem doesn't have a
+ * configured realtime device.
+ */
+#define XFS_IS_REALTIME_INODE(ip) \
+ (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \
+ (ip)->i_mount->m_rtdev_targp)
#else
#define XFS_IS_REALTIME_INODE(ip) (0)
#endif
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b57ab34..33c9a3a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -743,15 +743,45 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
int error = 0;
+ bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
+ } else if (readonly) {
+ /* Allow unlinked processing to proceed */
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
+ /*
+ * During the second phase of log recovery, we need iget and
+ * iput to behave like they do for an active filesystem.
+ * xfs_fs_drop_inode needs to be able to prevent the deletion
+ * of inodes before we're done replaying log items on those
+ * inodes. Turn it off immediately after recovery finishes
+ * so that we don't leak the quota inodes if subsequent mount
+ * activities fail.
+ *
+ * We let all inodes involved in redo item processing end up on
+ * the LRU instead of being evicted immediately so that if we do
+ * something to an unlinked inode, the irele won't cause
+ * premature truncation and freeing of the inode, which results
+ * in log recovery failure. We have to evict the unreferenced
+ * lru inodes after clearing MS_ACTIVE because we don't
+ * otherwise clean up the lru if there's a subsequent failure in
+ * xfs_mountfs, which leads to us leaking the inodes if nothing
+ * else (e.g. quotacheck) references the inodes before the
+ * mount failure occurs.
+ */
+ mp->m_super->s_flags |= MS_ACTIVE;
error = xlog_recover_finish(mp->m_log);
if (!error)
xfs_log_work_queue(mp);
+ mp->m_super->s_flags &= ~MS_ACTIVE;
+ evict_inodes(mp->m_super);
+
+ if (readonly)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
return error;
}
@@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
int error;
/*
- * Don't write out unmount record on read-only mounts.
+ * Don't write out unmount record on norecovery mounts or ro devices.
* Or, if we are doing a forced umount (typically because of IO errors).
*/
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
+ xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
+ }
error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
@@ -3304,8 +3337,6 @@ maybe_sleep:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
- if (log_flushed)
- *log_flushed = 1;
} else {
no_sleep:
@@ -3409,8 +3440,6 @@ try_again:
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
- if (log_flushed)
- *log_flushed = 1;
already_slept = 1;
goto try_again;
}
@@ -3444,9 +3473,6 @@ try_again:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
-
- if (log_flushed)
- *log_flushed = 1;
} else { /* just return */
spin_unlock(&log->l_icloglock);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9b3d7c7..0590926 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1029,61 +1029,106 @@ out_error:
}
/*
- * Check the log tail for torn writes. This is required when torn writes are
- * detected at the head and the head had to be walked back to a previous record.
- * The tail of the previous record must now be verified to ensure the torn
- * writes didn't corrupt the previous tail.
+ * Calculate distance from head to tail (i.e., unused space in the log).
+ */
+static inline int
+xlog_tail_distance(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ if (head_blk < tail_blk)
+ return tail_blk - head_blk;
+
+ return tail_blk + (log->l_logBBsize - head_blk);
+}
+
+/*
+ * Verify the log tail. This is particularly important when torn or incomplete
+ * writes have been detected near the front of the log and the head has been
+ * walked back accordingly.
*
- * Return an error if CRC verification fails as recovery cannot proceed.
+ * We also have to handle the case where the tail was pinned and the head
+ * blocked behind the tail right before a crash. If the tail had been pushed
+ * immediately prior to the crash and the subsequent checkpoint was only
+ * partially written, it's possible it overwrote the last referenced tail in the
+ * log with garbage. This is not a coherency problem because the tail must have
+ * been pushed before it can be overwritten, but appears as log corruption to
+ * recovery because we have no way to know the tail was updated if the
+ * subsequent checkpoint didn't write successfully.
+ *
+ * Therefore, CRC check the log from tail to head. If a failure occurs and the
+ * offending record is within max iclog bufs from the head, walk the tail
+ * forward and retry until a valid tail is found or corruption is detected out
+ * of the range of a possible overwrite.
*/
STATIC int
xlog_verify_tail(
struct xlog *log,
xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ xfs_daddr_t *tail_blk,
+ int hsize)
{
struct xlog_rec_header *thead;
struct xfs_buf *bp;
xfs_daddr_t first_bad;
- int count;
int error = 0;
bool wrapped;
- xfs_daddr_t tmp_head;
+ xfs_daddr_t tmp_tail;
+ xfs_daddr_t orig_tail = *tail_blk;
bp = xlog_get_bp(log, 1);
if (!bp)
return -ENOMEM;
/*
- * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
- * a temporary head block that points after the last possible
- * concurrently written record of the tail.
+ * Make sure the tail points to a record (returns positive count on
+ * success).
*/
- count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
- XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
- &wrapped);
- if (count < 0) {
- error = count;
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+ &tmp_tail, &thead, &wrapped);
+ if (error < 0)
goto out;
- }
-
- /*
- * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
- * into the actual log head. tmp_head points to the start of the record
- * so update it to the actual head block.
- */
- if (count < XLOG_MAX_ICLOGS + 1)
- tmp_head = head_blk;
+ if (*tail_blk != tmp_tail)
+ *tail_blk = tmp_tail;
/*
- * We now have a tail and temporary head block that covers at least
- * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
- * records were completely written. Run a CRC verification pass from
- * tail to head and return the result.
+ * Run a CRC check from the tail to the head. We can't just check
+ * MAX_ICLOGS records past the tail because the tail may point to stale
+ * blocks cleared during the search for the head/tail. These blocks are
+ * overwritten with zero-length records and thus record count is not a
+ * reliable indicator of the iclog state before a crash.
*/
- error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
+ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ int tail_distance;
+
+ /*
+ * Is corruption within range of the head? If so, retry from
+ * the next record. Otherwise return an error.
+ */
+ tail_distance = xlog_tail_distance(log, head_blk, first_bad);
+ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
+ break;
+
+ /* skip to the next record; returns positive count on success */
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
+ &tmp_tail, &thead, &wrapped);
+ if (error < 0)
+ goto out;
+
+ *tail_blk = tmp_tail;
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ }
+ if (!error && *tail_blk != orig_tail)
+ xfs_warn(log->l_mp,
+ "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
+ orig_tail, *tail_blk);
out:
xlog_put_bp(bp);
return error;
@@ -1143,7 +1188,7 @@ xlog_verify_head(
*/
error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
- if (error == -EFSBADCRC) {
+ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
/*
* We've hit a potential torn write. Reset the error and warn
* about it.
@@ -1183,31 +1228,12 @@ xlog_verify_head(
ASSERT(0);
return 0;
}
-
- /*
- * Now verify the tail based on the updated head. This is
- * required because the torn writes trimmed from the head could
- * have been written over the tail of a previous record. Return
- * any errors since recovery cannot proceed if the tail is
- * corrupt.
- *
- * XXX: This leaves a gap in truly robust protection from torn
- * writes in the log. If the head is behind the tail, the tail
- * pushes forward to create some space and then a crash occurs
- * causing the writes into the previous record's tail region to
- * tear, log recovery isn't able to recover.
- *
- * How likely is this to occur? If possible, can we do something
- * more intelligent here? Is it safe to push the tail forward if
- * we can determine that the tail is within the range of the
- * torn write (e.g., the kernel can only overwrite the tail if
- * it has actually been pushed forward)? Alternatively, could we
- * somehow prevent this condition at runtime?
- */
- error = xlog_verify_tail(log, *head_blk, *tail_blk);
}
+ if (error)
+ return error;
- return error;
+ return xlog_verify_tail(log, *head_blk, tail_blk,
+ be32_to_cpu((*rhead)->h_size));
}
/*
@@ -4152,7 +4178,7 @@ xlog_recover_commit_trans(
#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
- hlist_del(&trans->r_list);
+ hlist_del_init(&trans->r_list);
error = xlog_recover_reorder_trans(log, trans, pass);
if (error)
@@ -4354,6 +4380,8 @@ xlog_recover_free_trans(
xlog_recover_item_t *item, *n;
int i;
+ hlist_del_init(&trans->r_list);
+
list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
/* Free the regions in the item. */
list_del(&item->ri_list);
@@ -4799,12 +4827,16 @@ xlog_recover_process_intents(
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
+#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
+#endif
ailp = log->l_ailp;
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
+#endif
while (lip != NULL) {
/*
* We're done when we see something other than an intent.
@@ -5214,7 +5246,7 @@ xlog_do_recovery_pass(
xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
- xfs_daddr_t blk_no;
+ xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
@@ -5222,11 +5254,15 @@ xlog_do_recovery_pass(
int error2 = 0;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
+ int i;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
- rhead_blk = 0;
+ blk_no = rhead_blk = tail_blk;
+
+ for (i = 0; i < XLOG_RHASH_SIZE; i++)
+ INIT_HLIST_HEAD(&rhash[i]);
/*
* Read the header of the tail block and get the iclog buffer size from
@@ -5301,7 +5337,6 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
- blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -5363,9 +5398,19 @@ xlog_do_recovery_pass(
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
blk_no += hblks;
- /* Read in data for log record */
- if (blk_no + bblks <= log->l_logBBsize) {
- error = xlog_bread(log, blk_no, bblks, dbp,
+ /*
+ * Read the log record data in multiple reads if it
+ * wraps around the end of the log. Note that if the
+ * header already wrapped, blk_no could point past the
+ * end of the log. The record data is contiguous in
+ * that case.
+ */
+ if (blk_no + bblks <= log->l_logBBsize ||
+ blk_no >= log->l_logBBsize) {
+ /* mod blk_no in case the header wrapped and
+ * pushed it beyond the end of the log */
+ rblk_no = do_mod(blk_no, log->l_logBBsize);
+ error = xlog_bread(log, rblk_no, bblks, dbp,
&offset);
if (error)
goto bread_err2;
@@ -5464,6 +5509,19 @@ xlog_do_recovery_pass(
if (error && first_bad)
*first_bad = rhead_blk;
+ /*
+ * Transactions are freed at commit time but transactions without commit
+ * records on disk are never committed. Free any that may be left in the
+ * hash table.
+ */
+ for (i = 0; i < XLOG_RHASH_SIZE; i++) {
+ struct hlist_node *tmp;
+ struct xlog_recover *trans;
+
+ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
+ xlog_recover_free_trans(trans);
+ }
+
return error ? error : error2;
}
@@ -5542,6 +5600,8 @@ xlog_do_recover(
xfs_buf_t *bp;
xfs_sb_t *sbp;
+ trace_xfs_log_recover(log, head_blk, tail_blk);
+
/*
* First replay the images in the log.
*/
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 13796f2..d4ce8d2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -925,15 +925,6 @@ xfs_mountfs(
}
/*
- * During the second phase of log recovery, we need iget and
- * iput to behave like they do for an active filesystem.
- * xfs_fs_drop_inode needs to be able to prevent the deletion
- * of inodes before we're done replaying log items on those
- * inodes.
- */
- mp->m_super->s_flags |= MS_ACTIVE;
-
- /*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
* read in.
@@ -1008,12 +999,13 @@ xfs_mountfs(
out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
- mp->m_super->s_flags &= ~MS_ACTIVE;
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
+ /* Clean out dquots that might be in memory after quotacheck. */
+ xfs_qm_unmount(mp);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 0c381d7..0492436 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
- XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
}
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 93a7aaf..cecd375 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -279,7 +279,7 @@ xfs_fs_commit_blocks(
(end - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(error);
- error = xfs_iomap_write_unwritten(ip, start, length);
+ error = xfs_iomap_write_unwritten(ip, start, length, false);
if (error)
goto out_drop_iolock;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 8b9a9f1..1fdd3fa 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -111,6 +111,9 @@ restart:
skipped = 0;
break;
}
+ /* we're done if id overflows back to zero */
+ if (!next_index)
+ break;
}
if (skipped) {
@@ -1247,6 +1250,7 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
+ struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
@@ -1257,7 +1261,32 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- xfs_dqflock(dqp);
+ /*
+ * The only way the dquot is already flush locked by the time quotacheck
+ * gets here is if reclaim flushed it before the dqadjust walk dirtied
+ * it for the final time. Quotacheck collects all dquot bufs in the
+ * local delwri queue before dquots are dirtied, so reclaim can't have
+ * possibly queued it for I/O. The only way out is to push the buffer to
+ * cycle the flush lock.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ /* buf is pinned in-core by delwri list */
+ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen);
+ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
+ if (!bp) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+ xfs_buf_unlock(bp);
+
+ xfs_buf_delwri_pushbuf(bp, buffer_list);
+ xfs_buf_rele(bp);
+
+ error = -EAGAIN;
+ goto out_unlock;
+ }
+
error = xfs_qm_dqflush(dqp, &bp);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 29a75ec..17d3c96 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -169,6 +169,8 @@ xfs_reflink_find_shared(
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
if (error)
return error;
+ if (!agbp)
+ return -ENOMEM;
cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
@@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent(
struct xfs_defer_ops *dfops)
{
struct xfs_bmbt_irec irec = *imap;
- xfs_fsblock_t first_block;
+ xfs_fsblock_t first_block = NULLFSBLOCK;
int nimaps = 1;
if (imap->br_state == XFS_EXT_NORM)
@@ -765,7 +767,13 @@ xfs_reflink_end_cow(
/* If there is a hole at end_fsb - 1 go to the previous extent */
if (eof || got.br_startoff > end_fsb) {
- ASSERT(idx > 0);
+ /*
+ * In case of racing, overlapping AIO writes, no COW extents
+ * might be left by the time I/O completes for the loser of
+ * the race. In that case we are done.
+ */
+ if (idx <= 0)
+ goto out_cancel;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
}
@@ -839,6 +847,7 @@ next_extent:
out_defer:
xfs_defer_cancel(&dfops);
+out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 882fb85..67d589e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1214,7 +1214,7 @@ xfs_test_remount_options(
tmp_mp->m_super = sb;
error = xfs_parseargs(tmp_mp, options);
xfs_free_fsname(tmp_mp);
- kfree(tmp_mp);
+ kmem_free(tmp_mp);
return error;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 828f383..bdf69e1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
@@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
@@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+TRACE_EVENT(xfs_log_recover,
+ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
+ TP_ARGS(log, headblk, tailblk),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, headblk)
+ __field(xfs_daddr_t, tailblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->headblk = headblk;
+ __entry->tailblk = tailblk;
+ ),
+ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
+ __entry->tailblk)
+)
+
TRACE_EVENT(xfs_log_recover_record,
TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
TP_ARGS(log, rhead, pass),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 98024cb..5669cf0 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -50,6 +50,7 @@ typedef struct xfs_log_item {
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
uint li_flags; /* misc flags */
+ struct xfs_buf *li_buf; /* real buffer pointer */
struct xfs_log_item *li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
struct xfs_log_item *);
@@ -65,11 +66,13 @@ typedef struct xfs_log_item {
} xfs_log_item_t;
#define XFS_LI_IN_AIL 0x1
-#define XFS_LI_ABORTED 0x2
+#define XFS_LI_ABORTED 0x2
+#define XFS_LI_FAILED 0x4
#define XFS_LI_FLAGS \
{ XFS_LI_IN_AIL, "IN_AIL" }, \
- { XFS_LI_ABORTED, "ABORTED" }
+ { XFS_LI_ABORTED, "ABORTED" }, \
+ { XFS_LI_FAILED, "FAILED" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -80,6 +83,7 @@ struct xfs_item_ops {
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+ void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
};
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
@@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
-void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
-void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
+ uint);
+void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);
@@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
enum xfs_bmap_intent_type type, struct xfs_inode *ip,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t blockcount, xfs_exntst_t state);
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d6c9c3e..70f5ab0 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk(
}
}
-/*
- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+bool
+xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
+ xfs_clear_li_failed(lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+
+ return mlip == lip;
+}
+
+/**
+ * Remove a log item from the AIL
*
* @xfs_trans_ail_delete_bulk takes an array of log items that all need to
* removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk(
* before returning.
*/
void
-xfs_trans_ail_delete_bulk(
+xfs_trans_ail_delete(
struct xfs_ail *ailp,
- struct xfs_log_item **log_items,
- int nr_items,
+ struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
- xfs_log_item_t *mlip;
- int mlip_changed = 0;
- int i;
-
- mlip = xfs_ail_min(ailp);
+ struct xfs_mount *mp = ailp->xa_mount;
+ bool mlip_changed;
- for (i = 0; i < nr_items; i++) {
- struct xfs_log_item *lip = log_items[i];
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
- struct xfs_mount *mp = ailp->xa_mount;
-
- spin_unlock(&ailp->xa_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
- "%s: attempting to delete a log item that is not in the AIL",
- __func__);
- xfs_force_shutdown(mp, shutdown_type);
- }
- return;
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, shutdown_type);
}
-
- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
- xfs_ail_delete(ailp, lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
- lip->li_lsn = 0;
- if (mlip == lip)
- mlip_changed = 1;
+ return;
}
+ mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
- spin_unlock(&ailp->xa_lock);
+ }
+ spin_unlock(&ailp->xa_lock);
+ if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
}
int
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 6408e7d..14543d9 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
+ xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
int error;
@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
void **state)
{
struct xfs_bmap_intent *bmap;
+ xfs_filblks_t count;
int error;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ count = bmap->bi_bmap.br_blockcount;
error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
bmap->bi_type,
bmap->bi_owner, bmap->bi_whichfork,
bmap->bi_bmap.br_startoff,
bmap->bi_bmap.br_startblock,
- bmap->bi_bmap.br_blockcount,
+ &count,
bmap->bi_bmap.br_state);
+ if (!error && count > 0) {
+ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+ bmap->bi_bmap.br_blockcount = count;
+ return -EAGAIN;
+ }
kmem_free(bmap);
return error;
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee29ca..3ba7a96 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
xfs_buf_t *bp)
{
xfs_buf_log_item_t *bip;
+ int freed;
/*
* Default to a normal brelse() call if the tp is NULL.
@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp,
/*
* Drop our reference to the buf log item.
*/
- atomic_dec(&bip->bli_refcount);
+ freed = atomic_dec_and_test(&bip->bli_refcount);
/*
- * If the buf item is not tracking data in the log, then
- * we must free it before releasing the buffer back to the
- * free pool. Before releasing the buffer to the free pool,
- * clear the transaction pointer in b_fsprivate2 to dissolve
- * its relation to this transaction.
+ * If the buf item is not tracking data in the log, then we must free it
+ * before releasing the buffer back to the free pool.
+ *
+ * If the fs has shutdown and we dropped the last reference, it may fall
+ * on us to release a (possibly dirty) bli if it never made it to the
+ * AIL (e.g., the aborted unpin already happened and didn't release it
+ * due to our reference). Since we're already shutdown and need xa_lock,
+ * just force remove from the AIL and release the bli here.
*/
- if (!xfs_buf_item_dirty(bip)) {
+ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
+ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bp);
+ } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
/***
ASSERT(bp->b_pincount == 0);
***/
@@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
}
/*
- * This is called to mark bytes first through last inclusive of the given
- * buffer as needing to be logged when the transaction is committed.
- * The buffer must already be associated with the given transaction.
- *
- * First and last are numbers relative to the beginning of this buffer,
- * so the first byte in the buffer is numbered 0 regardless of the
- * value of b_blkno.
+ * Mark a buffer dirty in the transaction.
*/
void
-xfs_trans_log_buf(xfs_trans_t *tp,
- xfs_buf_t *bp,
- uint first,
- uint last)
+xfs_trans_dirty_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
- ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(bp->b_iodone == NULL ||
bp->b_iodone == xfs_buf_iodone_callbacks);
@@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
- trace_xfs_trans_log_buf(bip);
-
/*
* If we invalidated the buffer within this transaction, then
* cancel the invalidation now that we're dirtying the buffer
@@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
- /*
- * If we have an ordered buffer we are not logging any dirty range but
- * it still needs to be marked dirty and that it has been logged.
- */
- bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
- if (!(bip->bli_flags & XFS_BLI_ORDERED))
- xfs_buf_item_log(bip, first, last);
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint first,
+ uint last)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(first <= last && last < BBTOB(bp->b_length));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+
+ xfs_trans_dirty_buf(tp, bp);
+
+ trace_xfs_trans_log_buf(bip);
+ xfs_buf_item_log(bip, first, last);
}
@@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf(
}
/*
- * Mark the buffer as ordered for this transaction. This means
- * that the contents of the buffer are not recorded in the transaction
- * but it is tracked in the AIL as though it was. This allows us
- * to record logical changes in transactions rather than the physical
- * changes we make to the buffer without changing writeback ordering
- * constraints of metadata buffers.
+ * Mark the buffer as ordered for this transaction. This means that the contents
+ * of the buffer are not recorded in the transaction but it is tracked in the
+ * AIL as though it was. This allows us to record logical changes in
+ * transactions rather than the physical changes we make to the buffer without
+ * changing writeback ordering constraints of metadata buffers.
*/
-void
+bool
xfs_trans_ordered_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
@@ -719,8 +735,18 @@ xfs_trans_ordered_buf(
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (xfs_buf_item_dirty_format(bip))
+ return false;
+
bip->bli_flags |= XFS_BLI_ORDERED;
trace_xfs_buf_item_ordered(bip);
+
+ /*
+ * We don't log a dirty range of an ordered buffer, but it still needs
+ * to be marked dirty and flagged as having been logged.
+ */
+ xfs_trans_dirty_buf(tp, bp);
+ return true;
}
/*
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 49931b7..b317a36 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,18 +106,9 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
- struct xfs_log_item **log_items, int nr_items,
- int shutdown_type)
- __releases(ailp->xa_lock);
-static inline void
-xfs_trans_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip,
- int shutdown_type) __releases(ailp->xa_lock)
-{
- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
-}
+bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ int shutdown_type) __releases(ailp->xa_lock);
static inline void
xfs_trans_ail_remove(
@@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn(
*dst = *src;
}
#endif
+
+static inline void
+xfs_clear_li_failed(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+ if (lip->li_flags & XFS_LI_FAILED) {
+ lip->li_flags &= ~XFS_LI_FAILED;
+ lip->li_buf = NULL;
+ xfs_buf_rele(bp);
+ }
+}
+
+static inline void
+xfs_set_li_failed(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+ if (!(lip->li_flags & XFS_LI_FAILED)) {
+ xfs_buf_hold(bp);
+ lip->li_flags |= XFS_LI_FAILED;
+ lip->li_buf = bp;
+ }
+}
+
#endif /* __XFS_TRANS_PRIV_H__ */