From eedf32bfcace7d8e20cc66757d74fc68f3439ff7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 4 Aug 2014 11:35:35 +1000 Subject: xfs: fix rounding error of fiemap length parameter The offset and length parameters are converted from bytes to basic blocks by xfs_vn_fiemap(). The BTOBB() converter rounds the value up to the nearest basic block. This leads to unexpected behavior when unaligned offsets are provided to FIEMAP. Fix the conversions of byte values to block values to cover the provided offsets. Round down the start offset to the nearest basic block. Calculate the end offset based on the provided values, round up and calculate length based on the start block offset. Reported-by: Chandan Rajendra Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d75621a..7212949 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1055,12 +1055,12 @@ xfs_vn_fiemap( return error; /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBB(start); + bm.bmv_offset = BTOBBT(start); /* Special case for whole file */ if (length == FIEMAP_MAX_OFFSET) bm.bmv_length = -1LL; else - bm.bmv_length = BTOBB(length); + bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; /* We add one because in getbmap world count includes the header */ bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : -- cgit v0.10.2 From 5ef828c4152726f56751c78ea844f08d2b2a4fa3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 4 Aug 2014 11:35:44 +1000 Subject: xfs: avoid false quotacheck after unclean shutdown The commit 83e782e xfs: Remove incore use of XFS_OQUOTA_ENFD and XFS_OQUOTA_CHKD added a new function xfs_sb_quota_from_disk() which swaps on-disk XFS_OQUOTA_* flags for in-core XFS_GQUOTA_* and XFS_PQUOTA_* flags after the superblock is read. However, if log recovery is required, the superblock is read again, and the modified in-core flags are re-read from disk, so we have XFS_OQUOTA_* flags in memory again. This causes the XFS_QM_NEED_QUOTACHECK() test to be true, because the XFS_OQUOTA_CHKD is still set, and not XFS_GQUOTA_CHKD or XFS_PQUOTA_CHKD. Change xfs_sb_from_disk to call xfs_sb_quota_from disk and always convert the disk flags to in-memory flags. Add a lower-level function which can be called with "false" to not convert the flags, so that the sb verifier can verify exactly what was on disk, per Brian Foster's suggestion. Reported-by: Cyril B. Signed-off-by: Eric Sandeen diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f5ca028..8db9e92 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -386,10 +386,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) } } -void -xfs_sb_from_disk( +static void +__xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from) + xfs_dsb_t *from, + bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); @@ -445,6 +446,17 @@ xfs_sb_from_disk( to->sb_pad = 0; to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* Convert on-disk flags to in-memory flags? */ + if (convert_xquota) + xfs_sb_quota_from_disk(to); +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + __xfs_sb_from_disk(to, from, true); } static inline void @@ -560,7 +572,11 @@ xfs_sb_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + /* + * Use call variant which doesn't convert quota flags from disk + * format, because xfs_mount_validate_sb checks the on-disk flags. + */ + __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); /* * Only check the in progress field for the primary superblock as diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5c44a6..5612aa8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -324,7 +324,6 @@ reread: * Initialize the mount structure from the superblock. */ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); - xfs_sb_quota_from_disk(sbp); /* * If we haven't validated the superblock, do so now before we try -- cgit v0.10.2 From 400b9d88757c0bfbdfa97014e090ec40a31c1282 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 12:42:40 +1000 Subject: xfs: catch buffers written without verifiers attached We recently had a bug where buffers were slipping through log recovery without any verifier attached to them. This was resulting in on-disk CRC mismatches for valid data. Add some warning code to catch this occurrence so that we catch such bugs during development rather than not being aware they exist. Note that we cannot do this verification unconditionally as non-CRC filesystems don't always attach verifiers to the buffers being written. e.g. during log recovery we cannot identify all the different types of buffers correctly on non-CRC filesystems, so we can't attach the correct verifiers in all cases and so we don't attach any. Hence we don't want on non-CRC filesystems to avoid spamming the logs with false indications. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index a6dc83e..cd7b8ca 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1330,6 +1330,20 @@ _xfs_buf_ioapply( SHUTDOWN_CORRUPT_INCORE); return; } + } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * non-crc filesystems don't attach verifiers during + * log recovery, so don't warn for such filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_warn(mp, + "%s: no ops on block 0x%llx/0x%x", + __func__, bp->b_bn, bp->b_length); + xfs_hex_dump(bp->b_addr, 64); + dump_stack(); + } } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7647818..d015ed7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1369,8 +1369,14 @@ xlog_alloc_log( xlog_get_iclog_buffer_size(mp, log); + /* + * Use a NULL block for the extra log buffer used during splits so that + * it will trigger errors if we ever try to do IO on it without first + * having set it up properly. + */ error = -ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); + bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; -- cgit v0.10.2 From 67dc288c21064b31a98a53dc64f6b9714b819fd6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 12:43:06 +1000 Subject: xfs: ensure verifiers are attached to recovered buffers Crash testing of CRC enabled filesystems has resulted in a number of reports of bad CRCs being detected after the filesystem was mounted. Errors such as the following were being seen: XFS (sdb3): Mounting V5 Filesystem XFS (sdb3): Starting recovery (logdev: internal) XFS (sdb3): Metadata CRC error detected at xfs_agf_read_verify+0x5a/0x100 [xfs], block 0x1 XFS (sdb3): Unmount and run xfs_repair XFS (sdb3): First 64 bytes of corrupted metadata buffer: ffff880136ffd600: 58 41 47 46 00 00 00 01 00 00 00 00 00 0f aa 40 XAGF...........@ ffff880136ffd610: 00 02 6d 53 00 02 77 f8 00 00 00 00 00 00 00 01 ..mS..w......... ffff880136ffd620: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 03 ................ ffff880136ffd630: 00 00 00 04 00 08 81 d0 00 08 81 a7 00 00 00 00 ................ XFS (sdb3): metadata I/O error: block 0x1 ("xfs_trans_read_buf_map") error 74 numblks 1 The errors were typically being seen in AGF, AGI and their related btree block buffers some time after log recovery had run. Often it wasn't until later subsequent mounts that the problem was discovered. The common symptom was a buffer with the correct contents, but a CRC and an LSN that matched an older version of the contents. Some debug added to _xfs_buf_ioapply() indicated that buffers were being written without verifiers attached to them from log recovery, and Jan Kara isolated the cause to log recovery readahead an dit's interactions with buffers that had a more recent LSN on disk than the transaction being recovered. In this case, the buffer did not get a verifier attached, and os when the second phase of log recovery ran and recovered EFIs and unlinked inodes, the buffers were modified and written without the verifier running. Hence they had up to date contents, but stale LSNs and CRCs. Fix it by attaching verifiers to buffers we skip due to future LSN values so they don't escape into the buffer cache without the correct verifier attached. This patch is based on analysis and a patch from Jan Kara. cc: Reported-by: Jan Kara Reported-by: Fanael Linithien Reported-by: Grozdan Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fbc2362..8a7d8a7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2126,6 +2126,17 @@ xlog_recover_validate_buf_type( __uint16_t magic16; __uint16_t magicda; + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); magic16 = be16_to_cpu(*(__be16*)bp->b_addr); magicda = be16_to_cpu(info->magic); @@ -2163,8 +2174,6 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: - if (!xfs_sb_version_hascrc(&mp->m_sb)) - break; if (magic32 != XFS_AGFL_MAGIC) { xfs_warn(mp, "Bad AGFL block magic!"); ASSERT(0); @@ -2197,10 +2206,6 @@ xlog_recover_validate_buf_type( #endif break; case XFS_BLFT_DINO_BUF: - /* - * we get here with inode allocation buffers, not buffers that - * track unlinked list changes. - */ if (magic16 != XFS_DINODE_MAGIC) { xfs_warn(mp, "Bad INODE block magic!"); ASSERT(0); @@ -2280,8 +2285,6 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: - if (!xfs_sb_version_hascrc(&mp->m_sb)) - break; if (magic32 != XFS_ATTR3_RMT_MAGIC) { xfs_warn(mp, "Bad attr remote magic!"); ASSERT(0); @@ -2388,16 +2391,7 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - /* - * We can only do post recovery validation on items on CRC enabled - * fielsystems as we need to know when the buffer was written to be able - * to determine if we should have replayed the item. If we replay old - * metadata over a newer buffer, then it will enter a temporarily - * inconsistent state resulting in verification failures. Hence for now - * just avoid the verification stage for non-crc filesystems - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - xlog_recover_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* @@ -2505,12 +2499,29 @@ xlog_recover_buffer_pass2( } /* - * recover the buffer only if we get an LSN from it and it's less than + * Recover the buffer only if we get an LSN from it and it's less than * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verfiers to the buffers so if we don't + * actually do any replay after readahead because of the LSN we found + * in the buffer if more recent than that current transaction then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers and they won't get the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recover turns that + * buffer into. */ lsn = xlog_recover_get_buf_lsn(mp, bp); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + xlog_recover_validate_buf_type(mp, bp, buf_f); goto out_release; + } if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); -- cgit v0.10.2 From 5fd364fee81a7888af806e42ed8a91c845894f2d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 12:43:26 +1000 Subject: xfs: quotacheck leaves dquot buffers without verifiers When running xfs/305, I noticed that quotacheck was flushing dquot buffers that did not have the xfs_dquot_buf_ops verifiers attached: XFS (vdb): _xfs_buf_ioapply: no ops on block 0x1dc8/0x1dc8 ffff880052489000: 44 51 01 04 00 00 65 b8 00 00 00 00 00 00 00 00 DQ....e......... ffff880052489010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ ffff880052489020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ ffff880052489030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ CPU: 1 PID: 2376 Comm: mount Not tainted 3.16.0-rc2-dgc+ #306 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 ffff88006fe38000 ffff88004a0ffae8 ffffffff81cf1cca 0000000000000001 ffff88004a0ffb88 ffffffff814d50ca 000010004a0ffc70 0000000000000000 ffff88006be56dc4 0000000000000021 0000000000001dc8 ffff88007c773d80 Call Trace: [] dump_stack+0x45/0x56 [] _xfs_buf_ioapply+0x3ca/0x3d0 [] ? wake_up_state+0x20/0x20 [] ? xfs_bdstrat_cb+0x55/0xb0 [] xfs_buf_iorequest+0x6b/0xd0 [] xfs_bdstrat_cb+0x55/0xb0 [] __xfs_buf_delwri_submit+0x15b/0x220 [] ? xfs_buf_delwri_submit+0x30/0x90 [] xfs_buf_delwri_submit+0x30/0x90 [] xfs_qm_quotacheck+0x17d/0x3c0 [] xfs_qm_mount_quotas+0x151/0x1e0 [] xfs_mountfs+0x56c/0x7d0 [] xfs_fs_fill_super+0x2c2/0x340 [] mount_bdev+0x194/0x1d0 [] ? xfs_finish_flags+0x170/0x170 [] xfs_fs_mount+0x15/0x20 [] mount_fs+0x39/0x1b0 [] vfs_kern_mount+0x67/0x120 [] do_mount+0x23e/0xad0 [] ? __get_free_pages+0xe/0x50 [] ? copy_mount_options+0x36/0x150 [] SyS_mount+0x83/0xc0 [] tracesys+0xdd/0xe2 This was caused by dquot buffer readahead not attaching a verifier structure to the buffer when readahead was issued, resulting in the followup read of the buffer finding a valid buffer and so not attaching new verifiers to the buffer as part of the read. Also, when a verifier failure occurs, we then read the buffer without verifiers. Attach the verifiers manually after this read so that if the buffer is then written it will be verified that the corruption has been repaired. Further, when flushing a dquot we don't ask for a verifier when reading in the dquot buffer the dquot belongs to. Most of the time this isn't an issue because the buffer is still cached, but when it is not cached it will result in writing the dquot buffer without having the verfier attached. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8a44a79..63c2de4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -974,7 +974,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ba284f6..e261547 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1005,6 +1005,12 @@ xfs_qm_dqiter_bufs( if (error) break; + /* + * A corrupt buffer might not have a verifier attached, so + * make sure we have the correct one attached before writeback + * occurs. + */ + bp->b_ops = &xfs_dquot_buf_ops; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); @@ -1090,7 +1096,7 @@ xfs_qm_dqiterate( xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), mp->m_quotainfo->qi_dqchunklen, - NULL); + &xfs_dquot_buf_ops); rablkno++; } } -- cgit v0.10.2 From ad3714b82c631a34724da09a7daa53afcab952fa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 12:59:31 +1000 Subject: xfs: dquot recovery needs verifiers dquot recovery should add verifiers to the dquot buffers that it recovers changes into. Unfortunately, it doesn't attached the verifiers to the buffers in a consistent manner. For example, xlog_recover_dquot_pass2() reads dquot buffers without a verifier and then writes it without ever having attached a verifier to the buffer. Further, dquot buffer recovery may write a dquot buffer that has not been modified, or indeed, shoul dbe written because quotas are not enabled and hence changes to the buffer were not replayed. In this case, we again write buffers without verifiers attached because that doesn't happen until after the buffer changes have been replayed. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8a7d8a7..1fd5787 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2399,8 +2399,11 @@ xlog_recover_do_reg_buffer( * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. */ -STATIC void +STATIC bool xlog_recover_do_dquot_buffer( struct xfs_mount *mp, struct xlog *log, @@ -2415,9 +2418,8 @@ xlog_recover_do_dquot_buffer( /* * Filesystems are required to send in quota flags at mount time. */ - if (mp->m_qflags == 0) { - return; - } + if (!mp->m_qflags) + return false; type = 0; if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) @@ -2430,9 +2432,10 @@ xlog_recover_do_dquot_buffer( * This type of quotas was turned off, so ignore this buffer */ if (log->l_quotaoffs_flag & type) - return; + return false; xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + return true; } /* @@ -2525,14 +2528,18 @@ xlog_recover_buffer_pass2( if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } - if (error) - goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -3022,9 +3029,16 @@ xlog_recover_dquot_pass2( return -EIO; ASSERT(dq_f->qlf_len == 1); + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then the + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, - NULL); + &xfs_dquot_buf_ops); if (error) return error; @@ -3032,18 +3046,6 @@ xlog_recover_dquot_pass2( ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); /* - * At least the magic num portion should be on disk because this - * was among a chunk of dquots created earlier, and we did some - * minimal initialization then. - */ - error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_dquot_pass2"); - if (error) { - xfs_buf_relse(bp); - return -EIO; - } - - /* * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ -- cgit v0.10.2 From eac152b4742ec5f1ed04d73d699ae2be3607d56b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:22:49 +1000 Subject: xfs: kill VN_DIRTY() Only one user of the macro and the dirty mapping check is redundant so just get rid of it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1a5e068..fea3c92 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1635,7 +1635,7 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + if (ip->i_delayed_blks > 0) { error = filemap_flush(VFS_I(ip)->i_mapping); if (error) return error; diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index e8a7738..07b475a 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -39,8 +39,6 @@ struct attrlist_cursor_kern; */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) #define VN_CACHED(vp) (vp->i_mapping->nrpages) -#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ - PAGECACHE_TAG_DIRTY) #endif /* __XFS_VNODE_H__ */ -- cgit v0.10.2 From 2667c6f935d979cea1ab7fa58568fd0fd725525f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:23:15 +1000 Subject: xfs: kill VN_CACHED Only has 2 users, has outlived it's usefulness. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 56f050e..1b79b8c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -862,7 +862,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) * have speculative prealloc/delalloc blocks to remove. */ if (VFS_I(ip)->i_size == 0 && - VN_CACHED(VFS_I(ip)) == 0 && + VFS_I(ip)->i_mapping->nrpages == 0 && ip->i_delayed_blks == 0) return false; @@ -1720,7 +1720,7 @@ xfs_swap_extents( truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(VFS_I(tip)) != 0) { + if (VFS_I(tip)->i_mapping->nrpages) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 07b475a..bcf0c74 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -38,7 +38,6 @@ struct attrlist_cursor_kern; * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) -#define VN_CACHED(vp) (vp->i_mapping->nrpages) #endif /* __XFS_VNODE_H__ */ -- cgit v0.10.2 From dd8c38bab0d88615e0bdf013e6de3d4345f8cda2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:23:35 +1000 Subject: xfs: kill VN_MAPPED Only one user, no longer needed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1b79b8c..bf7e615 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1766,7 +1766,7 @@ xfs_swap_extents( * vop_read (or write in the case of autogrow) they block on the iolock * until we have switched the extents. */ - if (VN_MAPPED(VFS_I(ip))) { + if (mapping_mapped(VFS_I(ip)->i_mapping)) { error = -EBUSY; goto out_unlock; } diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index bcf0c74..300725d 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -34,10 +34,4 @@ struct attrlist_cursor_kern; { IO_ISDIRECT, "DIRECT" }, \ { IO_INVIS, "INVIS"} -/* - * Some useful predicates. - */ -#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) - - #endif /* __XFS_VNODE_H__ */ -- cgit v0.10.2 From b92cc59f69537f26d5a42e4171ccc864ae4d9383 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:28:20 +1000 Subject: xfs: kill xfs_vnode.h Move the IO flag definitions to xfs_inode.h and kill the header file as it is now empty. Removing the xfs_vnode.h file showed up an implicit header include path: xfs_linux.h -> xfs_vnode.h -> xfs_fs.h And so every xfs header file has been inplicitly been including xfs_fs.h where it is needed or not. Hence the removal of xfs_vnode.h causes all sorts of build issues because BBTOB() and friends are no longer automatically included in the build. This also gets fixed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 181605d..5284a7e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -246,11 +246,11 @@ xfs_file_read_iter( XFS_STATS_INC(xs_read_calls); if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; + ioflags |= XFS_IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & IO_ISDIRECT)) { + if (unlikely(ioflags & XFS_IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -283,7 +283,7 @@ xfs_file_read_iter( * proceeed concurrently without serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -325,7 +325,7 @@ xfs_file_splice_read( XFS_STATS_INC(xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f72bffa..c10e3fa 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -398,4 +398,14 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* + * Flags for read/write calls + */ +#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ +#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { XFS_IO_ISDIRECT, "DIRECT" }, \ + { XFS_IO_INVIS, "INVIS"} + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 30983b8..12ef44e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -736,7 +736,7 @@ xfs_ioc_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - if (!(ioflags & IO_INVIS)) { + if (!(ioflags & XFS_IO_INVIS)) { ip->i_d.di_mode &= ~S_ISUID; if (ip->i_d.di_mode & S_IXGRP) ip->i_d.di_mode &= ~S_ISGID; @@ -1376,7 +1376,7 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & IO_INVIS) + if (ioflags & XFS_IO_INVIS) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, @@ -1520,7 +1520,7 @@ xfs_file_ioctl( int error; if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; trace_xfs_file_ioctl(ip); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e65ea67..04ffc1b 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -28,7 +28,6 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_vnode.h" #include "xfs_inode.h" #include "xfs_itable.h" #include "xfs_error.h" @@ -537,7 +536,7 @@ xfs_file_compat_ioctl( int error; if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; + ioflags |= XFS_IO_INVIS; trace_xfs_file_compat_ioctl(ip); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f59b966..f76e44e 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -113,7 +113,7 @@ typedef __uint64_t __psunsigned_t; #include #include -#include "xfs_vnode.h" +#include "xfs_fs.h" #include "xfs_stats.h" #include "xfs_sysctl.h" #include "xfs_iops.h" diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h deleted file mode 100644 index 300725d..0000000 --- a/fs/xfs/xfs_vnode.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VNODE_H__ -#define __XFS_VNODE_H__ - -#include "xfs_fs.h" - -struct file; -struct xfs_inode; -struct attrlist_cursor_kern; - -/* - * Flags for read/write calls - same values as IRIX - */ -#define IO_ISDIRECT 0x00004 /* bypass page cache */ -#define IO_INVIS 0x00020 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { IO_ISDIRECT, "DIRECT" }, \ - { IO_INVIS, "INVIS"} - -#endif /* __XFS_VNODE_H__ */ -- cgit v0.10.2 From 812176832169c77b4bacddd01edc3e55340263fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:29:32 +1000 Subject: xfs: fix swapext ilock deadlock xfs_swap_extents() holds the ilock over a call to filemap_write_and_wait(), which can then try to write data and take the ilock. That causes a self-deadlock. Fix the deadlock and clean up the code by separating the locking appropriately. Add a lockflags variable to track what locks we are holding as we gain and drop them and cleanup the error handling to always use "out_unlock" with the lockflags variable. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index bf7e615..5d29aa1 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1686,6 +1686,7 @@ xfs_swap_extents( int aforkblks = 0; int taforkblks = 0; __uint64_t tmp; + int lock_flags; tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { @@ -1694,13 +1695,13 @@ xfs_swap_extents( } /* - * we have to do two separate lock calls here to keep lockdep - * happy. If we try to get all the locks in one call, lock will - * report false positives when we drop the ILOCK and regain them - * below. + * Lock up the inodes against other IO and truncate to begin with. + * Then we can ensure the inodes are flushed and have no page cache + * safely. Once we have done this we can take the ilocks and do the rest + * of the checks. */ + lock_flags = XFS_IOLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { @@ -1719,6 +1720,9 @@ xfs_swap_extents( goto out_unlock; truncate_pagecache_range(VFS_I(tip), 0, -1); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + /* Verify O_DIRECT for ftmp */ if (VFS_I(tip)->i_mapping->nrpages) { error = -EINVAL; @@ -1773,6 +1777,7 @@ xfs_swap_extents( xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(tip, XFS_ILOCK_EXCL); + lock_flags &= ~XFS_ILOCK_EXCL; /* * There is a race condition here since we gave up the @@ -1785,13 +1790,11 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_IOLOCK_EXCL); - xfs_trans_cancel(tp, 0); - goto out; - } + if (error) + goto out_trans_cancel; + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; /* * Count the number of extended attribute blocks @@ -1810,8 +1813,8 @@ xfs_swap_extents( goto out_trans_cancel; } - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); /* * Before we've swapped the forks, lets set the owners of the forks @@ -1940,8 +1943,8 @@ out: return error; out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); goto out; out_trans_cancel: -- cgit v0.10.2 From 4ef897a27543b513351262881660147366c042a1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Aug 2014 13:44:08 +1000 Subject: xfs: flush both inodes in xfs_swap_extents We need to treat both inodes identically from a page cache point of view when prepareing them for extent swapping. We don't do this right now - we assume that one of the inodes empty, because that's what xfs_fsr currently does. Remove this assumption from the code. While factoring out the flushing and related checks, move the transactions reservation to immeidately after the flushes so that we don't need to pick up and then drop the ilock to do the transaction reservation. There are no issues with aborting the transaction it if the checks fail before we join the inodes to the transaction and dirty them, so this is a safe change to make. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5d29aa1..8f7da58 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1672,6 +1672,30 @@ xfs_swap_extents_check_format( } int +xfs_swap_extent_flush( + struct xfs_inode *ip) +{ + int error; + + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + truncate_pagecache_range(VFS_I(ip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VFS_I(ip)->i_mapping->nrpages) + return -EINVAL; + + /* + * Don't try to swap extents on mmap()d files because we can't lock + * out races against page faults safely. + */ + if (mapping_mapped(VFS_I(ip)->i_mapping)) + return -EBUSY; + return 0; +} + +int xfs_swap_extents( xfs_inode_t *ip, /* target inode */ xfs_inode_t *tip, /* tmp inode */ @@ -1715,26 +1739,28 @@ xfs_swap_extents( goto out_unlock; } - error = filemap_write_and_wait(VFS_I(tip)->i_mapping); + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(tip), 0, -1); - - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - /* Verify O_DIRECT for ftmp */ - if (VFS_I(tip)->i_mapping->nrpages) { - error = -EINVAL; + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); goto out_unlock; } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || sxp->sx_length != ip->i_d.di_size || sxp->sx_length != tip->i_d.di_size) { error = -EFAULT; - goto out_unlock; + goto out_trans_cancel; } trace_xfs_swap_extent_before(ip, 0); @@ -1746,7 +1772,7 @@ xfs_swap_extents( xfs_notice(mp, "%s: inode 0x%llx format is incompatible for exchanging.", __func__, ip->i_ino); - goto out_unlock; + goto out_trans_cancel; } /* @@ -1761,41 +1787,8 @@ xfs_swap_extents( (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = -EBUSY; - goto out_unlock; - } - - /* We need to fail if the file is memory mapped. Once we have tossed - * all existing pages, the page fault will have no option - * but to go to the filesystem for pages. By making the page fault call - * vop_read (or write in the case of autogrow) they block on the iolock - * until we have switched the extents. - */ - if (mapping_mapped(VFS_I(ip)->i_mapping)) { - error = -EBUSY; - goto out_unlock; - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; - - /* - * There is a race condition here since we gave up the - * ilock. However, the data fork will not change since - * we have the iolock (locked for truncation too) so we - * are safe. We don't really care if non-io related - * fields change. - */ - truncate_pagecache_range(VFS_I(ip), 0, -1); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) goto out_trans_cancel; - - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - + } /* * Count the number of extended attribute blocks */ -- cgit v0.10.2