From ee1a47ab0e77600fcbdf1c87d461bd8f3f63150d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Apr 2013 14:53:46 -0500 Subject: xfs: add support for large btree blocks Add support for larger btree blocks that contains a CRC32C checksum, a filesystem uuid and block number for detecting filesystem consistency and out of place writes. [dchinner@redhat.com] Also include an owner field to allow reverse mappings to be implemented for improved repairability and a LSN field to so that log recovery can easily determine the last modification that made it to disk for each buffer. [dchinner@redhat.com] Add buffer log format flags to indicate the type of buffer to recovery so that we don't have to do blind magic number tests to determine what the buffer is. [dchinner@redhat.com] Modified to fit into the verifier structure. Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b1ddef6..30c4c14 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC struct xfs_btree_cur * @@ -272,7 +273,7 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -static void +static bool xfs_allocbt_verify( struct xfs_buf *bp) { @@ -280,66 +281,103 @@ xfs_allocbt_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ /* * magic number and level verification * - * During growfs operations, we can't verify the exact level as the - * perag is not fully initialised and hence not attached to the buffer. - * In this case, check against the maximum tree depth. + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. */ level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; default: - sblock_ok = 0; - break; + return false; } /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_allocbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_allocbt_buf_ops = { @@ -444,6 +482,9 @@ xfs_allocbt_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 7e89a2b..e3a3f74 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -31,8 +31,10 @@ struct xfs_mount; * by blockcount and blockno. All blocks look the same to make the code * simpler; if we have time later, we'll make the optimizations. */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ /* * Data record/key structure @@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index f96a734..aa4765f 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -232,7 +232,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; return dp->i_d.di_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 20efb39..0531cd3 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -439,11 +439,15 @@ xfs_bmap_sanity_check( { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || - be16_to_cpu(block->bb_level) != level || + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || be16_to_cpu(block->bb_numrecs) == 0 || be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) return 0; + return 1; } @@ -1031,6 +1035,7 @@ xfs_bmap_extents_to_btree( xfs_extnum_t nextents; /* number of file extents */ xfs_bmbt_ptr_t *pp; /* root block address pointer */ + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -1044,16 +1049,18 @@ xfs_bmap_extents_to_btree( * Fill in the root. */ block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); /* * Need a cursor. Can't allocate until bb_level is filled in. */ - mp = ip->i_mount; cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; @@ -1102,10 +1109,15 @@ xfs_bmap_extents_to_btree( */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); for (cnt = i = 0; i < nextents; i++) { @@ -1155,7 +1167,8 @@ xfs_bmap_local_to_extents( xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ int whichfork, - void (*init_fn)(struct xfs_buf *bp, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp)) { @@ -1207,7 +1220,7 @@ xfs_bmap_local_to_extents( bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); /* initialise the block and copy the data */ - init_fn(bp, ip, ifp); + init_fn(tp, bp, ip, ifp); /* account for the change in fork size and log everything */ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); @@ -1314,16 +1327,19 @@ xfs_bmap_add_attrfork_extents( */ STATIC void xfs_bmap_local_to_extents_init_fn( + struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp) { bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_buf_set_type(tp, bp, XFS_BLF_BTREE_BUF); } STATIC void xfs_symlink_local_to_remote( + struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp) @@ -1342,8 +1358,7 @@ xfs_symlink_local_to_remote( * * XXX (dgc): investigate whether directory conversion can use the generic * formatting callout. It should be possible - it's just a very complex - * formatter. it would also require passing the transaction through to the init - * function. + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 061b45c..3a86c3f 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -37,6 +37,7 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Determine the extent state. @@ -59,24 +60,31 @@ xfs_extent_state( */ void xfs_bmdr_to_bmbt( - struct xfs_mount *mp, + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; __be64 *fpp; xfs_bmbt_key_t *tkp; __be64 *tpp; - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); @@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_level != 0); @@ -708,59 +722,89 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static void +static int xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); unsigned int level; - int lblock_ok; /* block passes checks */ - /* magic number and level verification. + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. * - * We don't know waht fork we belong to, so just verify that the level + * We don't know what fork we belong to, so just verify that the level * is less than the maximum of the two. Later checks will be more * precise. */ level = be16_to_cpu(block->bb_level); - lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && - level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); - - /* numrecs verification */ - lblock_ok = lblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; /* sibling pointer verification */ - lblock_ok = lblock_ok && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (!lblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; + } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!(xfs_btree_lblock_verify_crc(bp) && + xfs_bmbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_bmbt_verify(bp)) { + xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_btree_lblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -838,6 +882,8 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 88469ca..70c43d9 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,7 +18,8 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ struct xfs_btree_cur; struct xfs_btree_block; @@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ ((xfs_bmbt_rec_t *) \ @@ -186,12 +187,12 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(XFS_BTREE_LBLOCK_LEN + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) @@ -204,7 +205,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index db01040..ec77036 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -30,9 +30,11 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Cursor allocation zone. @@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] STATIC int /* error (0 or EFSCORRUPTED) */ @@ -54,30 +60,38 @@ xfs_btree_check_lblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer for block, if any */ { - int lblock_ok; /* block passes checks */ + int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && + be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -90,16 +104,26 @@ xfs_btree_check_sblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block */ { + struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for ag. freespace struct */ struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + int sblock_ok = 1; /* block passes checks */ + mp = cur->bc_mp; agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -109,13 +133,13 @@ xfs_btree_check_sblock( (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", - XFS_ERRLEVEL_LOW, cur->bc_mp, block); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -194,6 +218,72 @@ xfs_btree_check_ptr( #endif /* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); + return true; +} + +/* * Delete the btree cursor. */ void @@ -277,10 +367,8 @@ xfs_btree_dup_cursor( *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(!xfs_buf_geterror(bp)); - } else - new->bc_bufs[i] = NULL; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; @@ -321,9 +409,14 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? - XFS_BTREE_LBLOCK_LEN : - XFS_BTREE_SBLOCK_LEN; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; } /* @@ -863,43 +956,85 @@ xfs_btree_set_sibling( } void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + } + } +} + +void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags) { - struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); - - new->bb_magic = cpu_to_be32(magic); - new->bb_level = cpu_to_be16(level); - new->bb_numrecs = cpu_to_be16(numrecs); - - if (flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - } else { - new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - } + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); } STATIC void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, - int numrecs, - struct xfs_buf *bp) + int numrecs) { - xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], - level, numrecs, cur->bc_flags); + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); } /* * Return true if ptr is the last record in the btree and - * we need to track updateѕ to this record. The decision + * we need to track updates to this record. The decision * will be further refined in the update_lastrec method. */ STATIC int @@ -1147,6 +1282,7 @@ xfs_btree_log_keys( XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); @@ -1171,6 +1307,7 @@ xfs_btree_log_recs( XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); @@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); @@ -1223,7 +1361,12 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), - XFS_BTREE_SBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), @@ -1231,17 +1374,40 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), - XFS_BTREE_LBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN }; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBI(cur, bp, fields); if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } xfs_btree_offsets(fields, (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? loffsets : soffsets, - XFS_BB_NUM_BITS, &first, &last); + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, @@ -2204,7 +2370,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. @@ -2513,7 +2679,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index f932897..6e6c915 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -42,11 +42,15 @@ extern kmem_zone_t *xfs_btree_cur_zone; * Generic btree header. * * This is a combination of the actual format used on disk for short and long - * format btrees. The first three fields are shared by both format, but - * the pointers are different and should be used with care. + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. * - * To get the size of the actual short or long form headers please use - * the size macros below. Never use sizeof(xfs_btree_block). + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. */ struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ @@ -56,10 +60,23 @@ struct xfs_btree_block { struct { __be32 bb_leftsib; __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; } s; /* short form pointers */ struct { __be64 bb_leftsib; __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ } l; /* long form pointers */ } bb_u; /* rest */ }; @@ -67,6 +84,16 @@ struct xfs_btree_block { #define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ #define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + /* * Generic key, ptr and record wrapper structures. @@ -101,13 +128,11 @@ union xfs_btree_rec { #define XFS_BB_NUMRECS 0x04 #define XFS_BB_LEFTSIB 0x08 #define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_BLKNO 0x20 #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) - -/* - * Magic numbers for btree blocks. - */ -extern const __uint32_t xfs_magics[]; +#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -256,6 +281,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_NOERROR 0 @@ -393,8 +419,20 @@ xfs_btree_init_block( __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags); +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* * Common btree core entry points. */ @@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); /* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index ee36c88..101ef83 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -24,19 +24,33 @@ extern kmem_zone_t *xfs_buf_item_zone; * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLF_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF (1<<0) /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLF_CANCEL 0x2 +#define XFS_BLF_CANCEL (1<<1) + /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLF_UDQUOT_BUF 0x4 -#define XFS_BLF_PDQUOT_BUF 0x8 -#define XFS_BLF_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * all buffers now need flags to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + */ +#define XFS_BLF_BTREE_BUF (1<<5) + +#define XFS_BLF_TYPE_MASK \ + (XFS_BLF_UDQUOT_BUF | \ + XFS_BLF_PDQUOT_BUF | \ + XFS_BLF_GDQUOT_BUF | \ + XFS_BLF_BTREE_BUF) #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 88a3368..6b5bd17 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -107,8 +107,8 @@ typedef enum xfs_dinode_fmt { #define XFS_LITINO(mp, version) \ ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) -#define XFS_BROOT_SIZE_ADJ \ - (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) +#define XFS_BROOT_SIZE_ADJ(ip) \ + (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) /* * Inode data & attribute fork sizes, per inode. diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2866b8c..6fe286a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -316,7 +316,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -339,7 +345,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -363,7 +375,12 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index bec344b..c82ac88 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -34,6 +34,7 @@ #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC int @@ -182,52 +183,88 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -void +static int xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ - /* magic number and level verification */ - level = be16_to_cpu(block->bb_level); - sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && - level < mp->m_in_maxlevels; + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + break; + default: + return 0; + } - /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_inobt_read_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_inobt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_inobt_buf_ops = { @@ -301,6 +338,8 @@ xfs_inobt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_inobt_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 25c0239..3ac36b76 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -29,7 +29,8 @@ struct xfs_mount; /* * There is a btree for the inode map per allocation group. */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) @@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f20165..202ce37 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -786,6 +786,7 @@ xfs_iformat_btree( xfs_dinode_t *dip, int whichfork) { + struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; xfs_ifork_t *ifp; /* REFERENCED */ @@ -794,7 +795,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); /* @@ -805,14 +806,14 @@ xfs_iformat_btree( * blocks. */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return XFS_ERROR(EFSCORRUPTED); } @@ -823,8 +824,7 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. */ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -2037,7 +2037,7 @@ xfs_iroot_realloc( * allocate it now and get out. */ if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; @@ -2051,9 +2051,9 @@ xfs_iroot_realloc( */ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -2061,7 +2061,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2076,7 +2076,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -2084,7 +2084,8 @@ xfs_iroot_realloc( /* * First copy over the btree block header. */ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2114,7 +2115,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); return; } @@ -2427,7 +2428,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); + XFS_BROOT_SIZE_ADJ(ip))); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3ca3380..3762ce2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -29,6 +29,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -1928,6 +1929,33 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); + + switch (buf_f->blf_flags & XFS_BLF_TYPE_MASK) { + case XFS_BLF_BTREE_BUF: + switch (be32_to_cpu(*(__be32 *)bp->b_addr)) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + default: + break; + } } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index cd29f61..1b04fe5 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -505,6 +505,8 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + uint); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3edf5db..f950edd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -659,6 +659,7 @@ xfs_trans_binval( ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_TYPE_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); @@ -671,6 +672,7 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLF_TYPE_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); @@ -751,6 +753,26 @@ xfs_trans_inode_alloc_buf( bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; } +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((type & XFS_BLF_TYPE_MASK) != 0); + + bip->__bli_format.blf_flags &= ~XFS_BLF_TYPE_MASK; + bip->__bli_format.blf_flags |= type; +} /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of @@ -769,14 +791,9 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip = bp->b_fspriv; - - ASSERT(bp->b_transp == tp); - ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->__bli_format.blf_flags |= type; + xfs_trans_buf_set_type(tp, bp, type); } -- cgit v0.10.2