diff options
Diffstat (limited to 'fs/xfs')
-rw-r--r-- | fs/xfs/xfs_acl.c | 31 | ||||
-rw-r--r-- | fs/xfs/xfs_acl.h | 31 | ||||
-rw-r--r-- | fs/xfs/xfs_aops.c | 19 | ||||
-rw-r--r-- | fs/xfs/xfs_attr_leaf.c | 98 | ||||
-rw-r--r-- | fs/xfs/xfs_attr_leaf.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_attr_remote.c | 408 | ||||
-rw-r--r-- | fs/xfs/xfs_attr_remote.h | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_btree.c | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_da_btree.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_dfrag.c | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_dir2_format.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_dir2_leaf.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_dir2_node.c | 13 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot.c | 37 | ||||
-rw-r--r-- | fs/xfs/xfs_extfree_item.c | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_fs.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 16 | ||||
-rw-r--r-- | fs/xfs/xfs_iops.c | 47 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.c | 114 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.c | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_qm.c | 40 | ||||
-rw-r--r-- | fs/xfs/xfs_qm_syscalls.c | 40 | ||||
-rw-r--r-- | fs/xfs/xfs_quota.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 11 | ||||
-rw-r--r-- | fs/xfs/xfs_symlink.c | 20 | ||||
-rw-r--r-- | fs/xfs/xfs_vnodeops.c | 4 |
30 files changed, 681 insertions, 332 deletions
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d..306d883 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -21,6 +21,8 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -34,7 +36,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type) struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; + int len; acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) @@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); @@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) goto out_release; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d9..4016a56 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2b2691b..41a6950 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -725,6 +725,25 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 08d5457..31d3cd1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) @@ -954,15 +956,15 @@ xfs_attr_shortform_allfit( return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* @@ -1410,7 +1412,7 @@ xfs_attr3_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -1443,11 +1445,12 @@ xfs_attr3_leaf_add_work( STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, - struct xfs_attr3_icleaf_hdr *ichdr_d, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - struct xfs_attr3_icleaf_hdr ichdr_s; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1455,29 +1458,38 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - ichdr_s = *ichdr_d; /* struct copy */ - ichdr_d->firstused = XFS_LBSIZE(mp); - ichdr_d->usedbytes = 0; - ichdr_d->count = 0; - ichdr_d->holes = 0; - ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); - ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, - ichdr_s.count, mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. @@ -2179,14 +2191,24 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); - memset(tmp_leaf, 0, state->blocksize); - memset(&tmphdr, 0, sizeof(tmphdr)); + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, @@ -2330,9 +2352,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); return XFS_ERROR(EEXIST); } } @@ -2383,7 +2407,8 @@ xfs_attr3_leaf_getvalue( ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; return 0; @@ -2709,7 +2734,8 @@ xfs_attr3_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; @@ -3232,7 +3258,7 @@ xfs_attr3_leaf_inactive( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, be32_to_cpu(name_rmt->valuelen)); lp++; } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f9d7846..444a770 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr { __u8 holes; __u8 pad1; struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ }; #define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index dee8446..ef6b0c1 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -47,22 +47,55 @@ * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ -static int +int xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, - mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; } static bool xfs_attr3_rmt_verify( - struct xfs_buf *bp) + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; @@ -70,7 +103,9 @@ xfs_attr3_rmt_verify( return false; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) return false; - if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) @@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF) || - !xfs_attr3_rmt_verify(bp)) { + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); - } + } else + ASSERT(len == 0); } static void @@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_attr3_rmt_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); - if (bip) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF); + ASSERT(len == 0); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { @@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .verify_write = xfs_attr3_rmt_write_verify, }; -static int +STATIC int xfs_attr3_rmt_hdr_set( struct xfs_mount *mp, + void *ptr, xfs_ino_t ino, uint32_t offset, uint32_t size, - struct xfs_buf *bp) + xfs_daddr_t bno) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; @@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set( rmt->rm_bytes = cpu_to_be32(size); uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); rmt->rm_owner = cpu_to_be64(ino); - rmt->rm_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_attr3_rmt_buf_ops; + rmt->rm_blkno = cpu_to_be64(bno); return sizeof(struct xfs_attr3_rmt_hdr); } /* - * Checking of the remote attribute header is split into two parts. the verifier - * does CRC, location and bounds checking, the unpacking function checks the - * attribute parameters and owner. + * Helper functions to copy attribute data in and out of the one disk extents */ -static bool -xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); - if (offset != be32_to_cpu(rmt->rm_offset)) - return false; - if (size != be32_to_cpu(rmt->rm_bytes)) - return false; - if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + ASSERT(len >= XFS_LBSIZE(mp)); - /* ok */ - return true; + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } } /* @@ -190,13 +336,12 @@ xfs_attr_rmtval_get( struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno = args->rmtblkno; - void *dst = args->value; + char *dst = args->value; int valuelen = args->valuelen; int nmap; int error; - int blkcnt; + int blkcnt = args->rmtblkcnt; int i; int offset = 0; @@ -207,52 +352,36 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, + blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap >= 1); for (i = 0; (i < nmap) && (valuelen > 0); i++) { - int byte_cnt; - char *src; + xfs_daddr_t dblkno; + int dblkcnt; ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, + dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; - byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - - src = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, - offset, byte_cnt, bp)) { - xfs_alert(mp, -"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, args->dp->i_ino); - xfs_buf_relse(bp); - return EFSCORRUPTED; - - } - - src += sizeof(struct xfs_attr3_rmt_hdr); - } - - memcpy(dst, src, byte_cnt); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); xfs_buf_relse(bp); + if (error) + return error; - offset += byte_cnt; - dst += byte_cnt; - valuelen -= byte_cnt; - + /* roll attribute extent map forwards */ lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; } } ASSERT(valuelen == 0); @@ -270,17 +399,13 @@ xfs_attr_rmtval_set( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - void *src = args->value; + char *src = args->value; int blkcnt; int valuelen; int nmap; int error; - int hdrcnt = 0; - bool crcs = xfs_sb_version_hascrc(&mp->m_sb); int offset = 0; trace_xfs_attr_rmtval_set(args); @@ -289,24 +414,14 @@ xfs_attr_rmtval_set( * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB - * conversion. We calculate the worst case block count in this case - * and we may not need that many, so we have to handle this when - * allocating the blocks below. + * conversion and have to take the header space into account. */ - if (!crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - else - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); - + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; - /* Start with the attribute data. We'll allocate the rest afterwards. */ - if (crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; @@ -349,26 +464,6 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - hdrcnt++; - - /* - * If we have enough blocks for the attribute data, calculate - * how many extra blocks we need for headers. We might run - * through this multiple times in the case that the additional - * headers in the blocks needed for the data fragments spills - * into requiring more blocks. e.g. for 512 byte blocks, we'll - * spill for another block every 9 headers we require in this - * loop. - */ - if (crcs && blkcnt == 0) { - int total_len; - - total_len = args->valuelen + - hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); - blkcnt = XFS_B_TO_FSB(mp, total_len); - blkcnt -= args->rmtblkcnt; - args->rmtblkcnt += blkcnt; - } /* * Start the next trans in the chain. @@ -385,18 +480,19 @@ xfs_attr_rmtval_set( * the INCOMPLETE flag. */ lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; valuelen = args->valuelen; while (valuelen > 0) { - int byte_cnt; - char *buf; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); - /* - * Try to remember where we decided to put the value. - */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); @@ -405,41 +501,27 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - byte_cnt = BBTOB(bp->b_length); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - if (valuelen < byte_cnt) - byte_cnt = valuelen; - - buf = bp->b_addr; - buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, - byte_cnt, bp); - memcpy(buf, src, byte_cnt); - - if (byte_cnt < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt, - BBTOB(bp->b_length) - byte_cnt); + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; - src += byte_cnt; - valuelen -= byte_cnt; - offset += byte_cnt; - hdrcnt--; + /* roll attribute extent map forwards */ lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); - ASSERT(hdrcnt == 0); return 0; } @@ -448,33 +530,40 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove(xfs_da_args_t *args) +xfs_attr_rmtval_remove( + struct xfs_da_args *args) { - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; trace_xfs_attr_rmtval_remove(args); - mp = args->dp->i_mount; - /* - * Roll through the "value", invalidating the attribute value's - * blocks. + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. */ lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap == 1); @@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } - valuelen -= map.br_blockcount; - lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } /* @@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) blkcnt = args->rmtblkcnt; done = 0; while (!done) { + int committed; + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index c7cca60..92a8fd7 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -20,6 +20,14 @@ #define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ struct xfs_attr3_rmt_hdr { __be32 rm_magic; __be32 rm_offset; @@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr { extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 8804b8a..0903960 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -2544,7 +2544,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82b70bd..1b2472a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } @@ -1649,7 +1650,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf26347..4ec4317 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -262,12 +262,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9b26a99..0b8b2a1 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -270,6 +270,7 @@ xfs_da3_node_read_verify( break; return; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; @@ -2464,7 +2465,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2520,7 +2522,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b08..c407e1c 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -219,6 +219,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a3b1bd8..7826782b 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr { struct xfs_dir3_data_hdr { struct xfs_dir3_blk_hdr hdr; xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ }; #define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) @@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr { struct xfs_da3_blkinfo info; /* header for da routines */ __be16 count; /* count of entries */ __be16 stale; /* count of stale entries */ - __be32 pad; + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_icleaf_hdr { @@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_free { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 721ba2f..da71a18 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1336,7 +1336,7 @@ xfs_dir2_leaf_getdents( mp->m_sb.sb_blocksize); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); + KM_SLEEP | KM_NOFS); map_info->map_size = length; /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5246de4..2226a00 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -263,18 +263,19 @@ xfs_dir3_free_get_buf( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - hdr.magic = XFS_DIR2_FREE_MAGIC; - hdr.firstdb = 0; - hdr.nused = 0; - hdr.nvalid = 0; + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); - } + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; @@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int( */ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * xfs_dir3_free_max_bests(mp); - free->hdr.nvalid = 0; - free->hdr.nused = 0; } else { free = fbp->b_addr; bests = xfs_dir3_free_bests_p(mp, free); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a41f8bf..044e97a 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -STATIC void -xfs_dquot_buf_calc_crc( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - int i; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc)); - } -} - STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, @@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc))) + XFS_DQUOT_CRC_OFF)) return false; if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) return false; } - return true; } @@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify( } } +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ void xfs_dquot_buf_write_verify( struct xfs_buf *bp) @@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); return; } - xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -1151,11 +1140,17 @@ xfs_qm_dqflush( * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); } /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c0f3750..452920a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - __xfs_efi_release(efip); - /* recovery needs us to drop the EFI reference, too */ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f9..d046955 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 87595b2..3c3644e 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efbe1ac..7f7be5f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1638,6 +1638,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1723,6 +1727,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1796,6 +1804,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1809,6 +1821,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa..ca9ecaa 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e3d0b85..d0833b5 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs( new_lv = kmem_zalloc(sizeof(*new_lv) + niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + KM_SLEEP|KM_NOFS); /* The allocated iovec region lies beyond the log vector. */ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93f03ec..7cf5e4e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers */ STATIC int xlog_recover_reorder_trans( @@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { @@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); break; } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, @@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans( } } ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } @@ -1794,7 +1845,13 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - bp->b_ops = &xfs_inode_buf_ops; + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1861,6 +1918,15 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; @@ -2097,6 +2163,17 @@ xlog_recover_do_reg_buffer( ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. @@ -2134,7 +2211,16 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recovery_validate_buf_type(mp, bp, buf_f); + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2255,6 +2341,12 @@ xfs_qm_dqcheck( d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + return errs; } @@ -2782,6 +2874,10 @@ xlog_recover_dquot_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f6bfbd7..e8e310c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,7 +314,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - bool check_inprogress) + bool check_inprogress, + bool check_version) { /* @@ -337,9 +338,10 @@ xfs_mount_validate_sb( /* * Version 5 superblock feature mask validation. Reject combinations the - * kernel cannot support up front before checking anything else. + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { xfs_alert(mp, "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" "Use of these features in this kernel is at your own risk!"); @@ -675,7 +677,8 @@ xfs_sb_to_disk( static int xfs_sb_verify( - struct xfs_buf *bp) + struct xfs_buf *bp, + bool check_version) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; @@ -686,7 +689,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); } /* @@ -719,7 +723,7 @@ xfs_sb_read_verify( goto out_error; } } - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, true); out_error: if (error) { @@ -758,7 +762,7 @@ xfs_sb_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; int error; - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, false); if (error) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, error); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f41702b..b75c9bb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -41,6 +41,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire @@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error) - break; /* - * XXX(hch): need to figure out if it makes sense to validate - * the CRC here. + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c41190c..6cdf6ff 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -621,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index c61e31c..c38068f 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,6 +87,8 @@ typedef struct xfs_dqblk { uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + /* * flags for q_flags field in the dquot. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea341ce..3033ba5 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1373,6 +1373,17 @@ xfs_finish_flags( } /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5f234389..195a403 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -56,16 +56,9 @@ xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { - int fsblocks = 0; - int len = pathlen; + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - do { - fsblocks++; - len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - } while (len > 0); - - ASSERT(fsblocks <= XFS_SYMLINK_MAPS); - return fsblocks; + return (pathlen + buflen - 1) / buflen; } static int @@ -405,7 +398,7 @@ xfs_symlink( if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) fs_blocks = 0; else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); + fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); @@ -512,7 +505,7 @@ xfs_symlink( cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { - char *buf; + char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); @@ -525,9 +518,7 @@ xfs_symlink( bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } + byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, @@ -542,6 +533,7 @@ xfs_symlink( xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } + ASSERT(pathlen == 0); } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 1501f4f..0176bb2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1453,7 +1453,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - uint rounding; + xfs_off_t rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -1482,7 +1482,7 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); } - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); |