author     Scott Wood <scottwood@freescale.com>  2015-02-13 22:12:06 (GMT)
committer  Scott Wood <scottwood@freescale.com>  2015-02-13 22:19:22 (GMT)
commit     6faa2909871d8937cb2f79a10e1b21ffe193fac1 (patch)
tree       f558a94f1553814cc122ab8d9e04c0ebad5262a5 /fs/xfs
parent     fcb2fb84301c673ee15ca04e7a2fc965712d49a0 (diff)
download   linux-fsl-qoriq-6faa2909871d8937cb2f79a10e1b21ffe193fac1.tar.xz
Reset to 3.12.37
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/xfs_aops.c        | 80
-rw-r--r--  fs/xfs/xfs_discard.c     |  5
-rw-r--r--  fs/xfs/xfs_dquot.c       |  3
-rw-r--r--  fs/xfs/xfs_file.c        | 23
-rw-r--r--  fs/xfs/xfs_fsops.c       | 22
-rw-r--r--  fs/xfs/xfs_inode.c       | 72
-rw-r--r--  fs/xfs/xfs_inode_fork.c  | 18
-rw-r--r--  fs/xfs/xfs_ioctl.c       |  2
-rw-r--r--  fs/xfs/xfs_iops.c        |  9
-rw-r--r--  fs/xfs/xfs_log.c         | 56
-rw-r--r--  fs/xfs/xfs_log_cil.c     | 14
-rw-r--r--  fs/xfs/xfs_log_priv.h    | 10
-rw-r--r--  fs/xfs/xfs_log_recover.c | 51
-rw-r--r--  fs/xfs/xfs_mount.c       |  1
-rw-r--r--  fs/xfs/xfs_qm.c          |  8
-rw-r--r--  fs/xfs/xfs_sb.c          | 29
16 files changed, 285 insertions(+), 118 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e51e581..ab28ad5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -431,10 +431,22 @@ xfs_start_page_writeback(
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
- if (clear_dirty)
+
+ /*
+ * If the page was not fully cleaned, we need to ensure that the higher
+ * layers come back to it correctly. That means we need to keep the page
+ * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+ * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+ * write this page in this writeback sweep will be made.
+ */
+ if (clear_dirty) {
clear_page_dirty_for_io(page);
- set_page_writeback(page);
+ set_page_writeback(page);
+ } else
+ set_page_writeback_keepwrite(page);
+
unlock_page(page);
+
/* If no buffers on the page are to be written, finish it here */
if (!buffers)
end_page_writeback(page);
@@ -1569,8 +1581,7 @@ xfs_vm_write_begin(
ASSERT(len <= PAGE_CACHE_SIZE);
- page = grab_cache_page_write_begin(mapping, index,
- flags | AOP_FLAG_NOFS);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -1658,11 +1669,72 @@ xfs_vm_readpages(
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+ struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ loff_t end_offset;
+ loff_t offset;
+ int newly_dirty;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ end_offset = i_size_read(inode);
+ offset = page_offset(page);
+
+ spin_lock(&mapping->private_lock);
+ if (page_has_buffers(page)) {
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+
+ do {
+ if (offset < end_offset)
+ set_buffer_dirty(bh);
+ bh = bh->b_this_page;
+ offset += 1 << inode->i_blkbits;
+ } while (bh != head);
+ }
+ newly_dirty = !TestSetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ if (newly_dirty) {
+ /* sigh - __set_page_dirty() is static, so copy it here, too */
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ return newly_dirty;
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
+ .set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.write_begin = xfs_vm_write_begin,
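
The beyond-EOF rule in xfs_vm_set_page_dirty() above is easier to see with concrete numbers. Below is a minimal userspace sketch, with all values assumed for illustration (a 4k page at file offset 8192, 512-byte buffers, i_size = 9216), of which buffers the loop would mark dirty:

#include <stdio.h>

int main(void)
{
	long long i_size = 9216;	/* assumed i_size_read(inode) */
	long long offset = 8192;	/* assumed page_offset(page) */
	int blkbits = 9;		/* 512-byte buffers */
	int nr_bufs = 4096 >> blkbits;	/* buffers per 4k page */

	for (int i = 0; i < nr_bufs; i++) {
		printf("buffer at %lld: %s\n", offset,
		       offset < i_size ? "set dirty" : "beyond EOF, left clean");
		offset += 1LL << blkbits;
	}
	return 0;
}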
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 45560ee..19d9fd6 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -158,7 +158,7 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
@@ -181,7 +181,8 @@ xfs_ioc_trim(
* matter as trimming blocks is an advisory interface.
*/
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
- range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+ range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+ range.len < mp->m_sb.sb_blocksize)
return -XFS_ERROR(EINVAL);
start = BTOBB(range.start);
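
For context, the interface being validated here is the FITRIM ioctl, which userspace drives with a struct fstrim_range; the new check rejects a range.len smaller than one filesystem block. A minimal sketch of a caller (the mount point path is hypothetical):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = ~0ULL,	/* trim the whole filesystem */
		.minlen = 0,		/* let the kernel pick the floor */
	};
	int fd = open("/mnt/xfs", O_RDONLY);	/* hypothetical mount point */

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}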
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1ee776d..895db7a 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1121,7 +1121,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4c749ab..aa60645 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -299,7 +299,16 @@ xfs_file_aio_read(
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT, -1);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
@@ -678,7 +687,15 @@ xfs_file_dio_aio_write(
pos, -1);
if (ret)
goto out;
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT, -1);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
/*
@@ -794,7 +811,7 @@ xfs_file_aio_write(
XFS_STATS_ADD(xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- err = generic_write_sync(file, pos, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c888040..20ccca1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -153,7 +153,7 @@ xfs_growfs_data_private(
xfs_buf_t *bp;
int bucket;
int dpct;
- int error;
+ int error, saved_error = 0;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
@@ -500,29 +500,33 @@ xfs_growfs_data_private(
error = ENOMEM;
}
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
if (error) {
xfs_warn(mp,
"error %d reading secondary superblock for ag %d",
error, agno);
- break;
+ saved_error = error;
+ continue;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
- /*
- * If we get an error writing out the alternate superblocks,
- * just issue a warning and continue. The real work is
- * already done and committed.
- */
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error) {
xfs_warn(mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
- break; /* no point in continuing */
+ saved_error = error;
+ continue;
}
}
- return error;
+ return saved_error ? saved_error : error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
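
The error handling adopted above is a record-and-continue pattern: remember the failure, keep iterating so every remaining AG still gets an update attempt, and only report the saved error at the end. A standalone sketch of the pattern (update_one() is a stub standing in for the per-AG superblock update, not XFS code):

#include <stdio.h>

static int update_one(int agno)
{
	return agno == 2 ? -5 : 0;	/* stub: pretend AG 2 fails */
}

static int update_all(int nags)
{
	int error = 0, saved_error = 0;

	for (int agno = 0; agno < nags; agno++) {
		error = update_one(agno);
		if (error) {
			fprintf(stderr, "error %d updating ag %d\n",
				error, agno);
			saved_error = error;	/* remember the failure... */
			continue;		/* ...but keep going */
		}
	}
	return saved_error ? saved_error : error;
}

int main(void)
{
	printf("update_all returned %d\n", update_all(5));
	return 0;
}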
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e3d7538..7a460d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2370,6 +2370,33 @@ xfs_iunpin_wait(
__xfs_iunpin_wait(ip);
}
+/*
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
+ *
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
+ */
int
xfs_remove(
xfs_inode_t *dp,
@@ -2439,6 +2466,7 @@ xfs_remove(
/*
* If we're removing a directory perform some additional validation.
*/
+ cancel_flags |= XFS_TRANS_ABORT;
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
@@ -2449,31 +2477,16 @@ xfs_remove(
error = XFS_ERROR(ENOTEMPTY);
goto out_trans_cancel;
}
- }
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_dir_removename(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks);
- if (error) {
- ASSERT(error != ENOENT);
- goto out_bmap_cancel;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- if (is_dir) {
- /*
- * Drop the link from ip's "..".
- */
+ /* Drop the link from ip's "..". */
error = xfs_droplink(tp, dp);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
- /*
- * Drop the "." link from ip to self.
- */
+ /* Drop the "." link from ip to self. */
error = xfs_droplink(tp, ip);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
} else {
/*
* When removing a non-directory we need to log the parent
@@ -2482,20 +2495,24 @@ xfs_remove(
*/
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- /*
- * Drop the link from dp to ip.
- */
+ /* Drop the link from dp to ip. */
error = xfs_droplink(tp, ip);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
- /*
- * Determine if this is the last link while
- * we are in the transaction.
- */
+ /* Determine if this is the last link while the inode is locked */
link_zero = (ip->i_d.di_nlink == 0);
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error) {
+ ASSERT(error != ENOENT);
+ goto out_bmap_cancel;
+ }
+
/*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
@@ -2525,7 +2542,6 @@ xfs_remove(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
xfs_trans_cancel(tp, cancel_flags);
std_return:
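
The AGI-before-AGF rule described in the comment block above is a standard lock-ordering discipline: deadlock can only arise when two paths take the same pair of locks in opposite orders, so every path must honour one global order. A generic pthread illustration of the discipline (not XFS code; the lock names are stand-ins):

#include <pthread.h>

static pthread_mutex_t agi_lock = PTHREAD_MUTEX_INITIALIZER;	/* "AGI" */
static pthread_mutex_t agf_lock = PTHREAD_MUTEX_INITIALIZER;	/* "AGF" */

static void alloc_path(void)		/* cf. inode allocation */
{
	pthread_mutex_lock(&agi_lock);	/* AGI first... */
	pthread_mutex_lock(&agf_lock);	/* ...then AGF */
	/* allocate inode chunk, then extents */
	pthread_mutex_unlock(&agf_lock);
	pthread_mutex_unlock(&agi_lock);
}

static void remove_path(void)		/* cf. unlink */
{
	pthread_mutex_lock(&agi_lock);	/* unlinked list first... */
	pthread_mutex_lock(&agf_lock);	/* ...then free space */
	/* drop link counts, then free directory blocks */
	pthread_mutex_unlock(&agf_lock);
	pthread_mutex_unlock(&agi_lock);
}

int main(void)
{
	alloc_path();
	remove_path();
	return 0;
}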
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 02f1083..6829134 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -1031,15 +1031,14 @@ xfs_iext_add(
* the next index needed in the indirection array.
*/
else {
- int count = ext_diff;
+ uint count = ext_diff;
while (count) {
erp = xfs_iext_irec_new(ifp, erp_idx);
- erp->er_extcount = count;
- count -= MIN(count, (int)XFS_LINEAR_EXTS);
- if (count) {
+ erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+ count -= erp->er_extcount;
+ if (count)
erp_idx++;
- }
}
}
}
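
The rewritten loop fills each new indirection record to capacity and only advances the record index while extents remain to be placed. The same chunking logic in standalone form (the XFS_LINEAR_EXTS value is assumed for illustration):

#include <stdio.h>

#define LINEAR_EXTS 8U			/* assumed XFS_LINEAR_EXTS */

static unsigned int min_uint(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int count = 21;	/* extents still to place */
	int erp_idx = 0;		/* indirection record index */

	while (count) {
		unsigned int extcount = min_uint(count, LINEAR_EXTS);

		printf("record %d holds %u extents\n", erp_idx, extcount);
		count -= extcount;
		if (count)
			erp_idx++;	/* advance only if more remain */
	}
	return 0;
}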
@@ -1359,7 +1358,7 @@ xfs_iext_remove_indirect(
void
xfs_iext_realloc_direct(
xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new size of extents */
+ int new_size) /* new size of extents after adding */
{
int rnew_size; /* real new size of extents */
@@ -1397,13 +1396,8 @@ xfs_iext_realloc_direct(
rnew_size - ifp->if_real_bytes);
}
}
- /*
- * Switch from the inline extent buffer to a direct
- * extent list. Be sure to include the inline extent
- * bytes in new_size.
- */
+ /* Switch from the inline extent buffer to a direct extent list */
else {
- new_size += ifp->if_bytes;
if (!is_power_of_2(new_size)) {
rnew_size = roundup_pow_of_two(new_size);
}
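
The sizing policy in xfs_iext_realloc_direct() rounds the requested byte size up to the next power of two, so a run of small extensions doesn't force a reallocation on every call. A quick standalone demonstration (roundup_pow_of_two() reimplemented here for userspace):

#include <stdio.h>

static unsigned int roundup_pow_of_two(unsigned int n)	/* userspace stand-in */
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int sizes[] = { 16, 48, 100, 256 };

	for (int i = 0; i < 4; i++)
		printf("new_size %u -> allocate %u bytes\n",
		       sizes[i], roundup_pow_of_two(sizes[i]));
	return 0;
}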
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8c8ef24..52b5375 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1133,7 +1133,7 @@ xfs_ioctl_setattr(
* cleared upon successful return from chown()
*/
if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
- !inode_capable(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b8952d..584996c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1169,6 +1169,7 @@ xfs_setup_inode(
struct xfs_inode *ip)
{
struct inode *inode = &ip->i_vnode;
+ gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
inode->i_state = I_NEW;
@@ -1229,6 +1230,14 @@ xfs_setup_inode(
}
/*
+ * Ensure all page cache allocations are done from GFP_NOFS context to
+ * prevent direct reclaim recursion back into the filesystem and blowing
+ * stacks or deadlocking.
+ */
+ gfp_mask = mapping_gfp_mask(inode->i_mapping);
+ mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2dea108..3c4ddc1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1000,27 +1000,34 @@ xfs_log_space_wake(
}
/*
- * Determine if we have a transaction that has gone to disk
- * that needs to be covered. To begin the transition to the idle state
- * firstly the log needs to be idle (no AIL and nothing in the iclogs).
- * If we are then in a state where covering is needed, the caller is informed
- * that dummy transactions are required to move the log into the idle state.
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs need to be empty before
+ * we start attempting to cover the log.
*
- * Because this is called as part of the sync process, we should also indicate
- * that dummy transactions should be issued in anything but the covered or
- * idle states. This ensures that the log tail is accurately reflected in
- * the log at the end of the sync, hence if a crash occurrs avoids replay
- * of transactions where the metadata is already on disk.
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
+ *
+ * If there are any items in the AIL or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL. Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
*/
int
xfs_log_need_covered(xfs_mount_t *mp)
{
- int needed = 0;
struct xlog *log = mp->m_log;
+ int needed = 0;
if (!xfs_fs_writable(mp))
return 0;
+ if (!xlog_cil_empty(log))
+ return 0;
+
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
case XLOG_STATE_COVER_DONE:
@@ -1029,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp)
break;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
- if (!xfs_ail_min_lsn(log->l_ailp) &&
- xlog_iclogs_empty(log)) {
- if (log->l_covered_state == XLOG_STATE_COVER_NEED)
- log->l_covered_state = XLOG_STATE_COVER_DONE;
- else
- log->l_covered_state = XLOG_STATE_COVER_DONE2;
- }
- /* FALLTHRU */
+ if (xfs_ail_min_lsn(log->l_ailp))
+ break;
+ if (!xlog_iclogs_empty(log))
+ break;
+
+ needed = 1;
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ break;
default:
needed = 1;
break;
@@ -3702,11 +3712,9 @@ xlog_verify_iclog(
/* check validity of iclog pointers */
spin_lock(&log->l_icloglock);
icptr = log->l_iclog;
- for (i=0; i < log->l_iclog_bufs; i++) {
- if (icptr == NULL)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
- icptr = icptr->ic_next;
- }
+ for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+ ASSERT(icptr);
+
if (icptr != log->l_iclog)
xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index cfe9797..da8524e77 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -711,6 +711,20 @@ xlog_cil_push_foreground(
xlog_cil_push(log);
}
+bool
+xlog_cil_empty(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ bool empty = false;
+
+ spin_lock(&cil->xc_push_lock);
+ if (list_empty(&cil->xc_cil))
+ empty = true;
+ spin_unlock(&cil->xc_push_lock);
+ return empty;
+}
+
/*
* Commit a transaction with the given vector to the Committed Item List.
*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 136654b..f80cff2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -514,12 +514,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
/*
* Committed Item List interfaces
*/
-int
-xlog_cil_init(struct xlog *log);
-void
-xlog_cil_init_post_recovery(struct xlog *log);
-void
-xlog_cil_destroy(struct xlog *log);
+int xlog_cil_init(struct xlog *log);
+void xlog_cil_init_post_recovery(struct xlog *log);
+void xlog_cil_destroy(struct xlog *log);
+bool xlog_cil_empty(struct xlog *log);
/*
* CIL force routines
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3979749..5b166a0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2121,6 +2121,17 @@ xlog_recover_validate_buf_type(
__uint16_t magic16;
__uint16_t magicda;
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * fielsystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
magicda = be16_to_cpu(info->magic);
@@ -2156,8 +2167,6 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_agf_buf_ops;
break;
case XFS_BLFT_AGFL_BUF:
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- break;
if (magic32 != XFS_AGFL_MAGIC) {
xfs_warn(mp, "Bad AGFL block magic!");
ASSERT(0);
@@ -2190,10 +2199,6 @@ xlog_recover_validate_buf_type(
#endif
break;
case XFS_BLFT_DINO_BUF:
- /*
- * we get here with inode allocation buffers, not buffers that
- * track unlinked list changes.
- */
if (magic16 != XFS_DINODE_MAGIC) {
xfs_warn(mp, "Bad INODE block magic!");
ASSERT(0);
@@ -2273,8 +2278,6 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_attr3_leaf_buf_ops;
break;
case XFS_BLFT_ATTR_RMT_BUF:
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- break;
if (magic32 != XFS_ATTR3_RMT_MAGIC) {
xfs_warn(mp, "Bad attr remote magic!");
ASSERT(0);
@@ -2381,16 +2384,7 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
- /*
- * We can only do post recovery validation on items on CRC enabled
- * fielsystems as we need to know when the buffer was written to be able
- * to determine if we should have replayed the item. If we replay old
- * metadata over a newer buffer, then it will enter a temporarily
- * inconsistent state resulting in verification failures. Hence for now
- * just avoid the verification stage for non-crc filesystems
- */
- if (xfs_sb_version_hascrc(&mp->m_sb))
- xlog_recover_validate_buf_type(mp, bp, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
}
/*
@@ -2625,12 +2619,29 @@ xlog_recover_buffer_pass2(
}
/*
- * recover the buffer only if we get an LSN from it and it's less than
+ * Recover the buffer only if we get an LSN from it and it's less than
* the lsn of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here.
+ * Readahead does not attach verfiers to the buffers so if we don't
+ * actually do any replay after readahead because of the LSN we found
+ * in the buffer if more recent than that current transaction then we
+ * need to attach the verifier directly. Failure to do so can lead to
+ * future recovery actions (e.g. EFI and unlinked list recovery) can
+ * operate on the buffers and they won't get the verifier attached. This
+ * can lead to blocks on disk having the correct content but a stale
+ * CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date.
+ * If the buffer is dirtied by a later transaction being replayed, then
+ * the verifier will be reset to match whatever recovery turns that
+ * buffer into.
*/
lsn = xlog_recover_get_buf_lsn(mp, bp);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
goto out_release;
+ }
if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5dcc680..dc602b5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -318,7 +318,6 @@ reread:
* Initialize the mount structure from the superblock.
*/
xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
- xfs_sb_quota_from_disk(&mp->m_sb);
/*
* We must be able to do sector-sized and sector-aligned IO.
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 4688a62..794aa2f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1193,6 +1193,12 @@ xfs_qm_dqiter_bufs(
if (error)
break;
+ /*
+ * A corrupt buffer might not have a verifier attached, so
+ * make sure we have the correct one attached before writeback
+ * occurs.
+ */
+ bp->b_ops = &xfs_dquot_buf_ops;
xfs_qm_reset_dqcounts(mp, bp, firstid, type);
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
@@ -1276,7 +1282,7 @@ xfs_qm_dqiterate(
xfs_buf_readahead(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, rablkno),
mp->m_quotainfo->qi_dqchunklen,
- NULL);
+ &xfs_dquot_buf_ops);
rablkno++;
}
}
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 0397081..1351ff0 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -406,10 +406,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
}
}
-void
-xfs_sb_from_disk(
+static void
+__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ xfs_dsb_t *from,
+ bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -465,6 +466,17 @@ xfs_sb_from_disk(
to->sb_pad = 0;
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /* Convert on-disk flags to in-memory flags? */
+ if (convert_xquota)
+ xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ xfs_dsb_t *from)
+{
+ __xfs_sb_from_disk(to, from, true);
}
static inline void
@@ -580,7 +592,11 @@ xfs_sb_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_sb sb;
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+ /*
+ * Use the call variant which doesn't convert quota flags from disk
+ * format, because xfs_mount_validate_sb checks the on-disk flags.
+ */
+ __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
/*
* Only check the in progress field for the primary superblock as
@@ -633,8 +649,9 @@ xfs_sb_read_verify(
out_error:
if (error) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- mp, bp->b_addr);
+ if (error != EWRONGFS)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ mp, bp->b_addr);
xfs_buf_ioerror(bp, error);
}
}