xfs: validate metadata LSNs against log on v5 superblocks
author Brian Foster <bfoster@redhat.com>
Mon, 12 Oct 2015 04:59:25 +0000 (15:59 +1100)
committer Dave Chinner <david@fromorbit.com>
Mon, 12 Oct 2015 04:59:25 +0000 (15:59 +1100)
Since the onset of v5 superblocks, the LSN of the last modification has
been included in a variety of on-disk data structures. This LSN is used
to provide log recovery ordering guarantees (e.g., to ensure an older
log recovery item is not replayed over a newer target data structure).

While this works correctly from the point a filesystem is formatted and
mounted, userspace tools have some problematic behaviors that defeat
this mechanism. For example, xfs_repair historically zeroes out the log
unconditionally (regardless of whether corruption is detected). If this
occurs, the LSN of the filesystem is reset and the log is now in a
problematic state with respect to on-disk metadata structures that might
have a larger LSN. Until either the log catches up to the highest
previously used metadata LSN or each affected data structure is modified
and written out without incident (which resets the metadata LSN), log
recovery is susceptible to filesystem corruption.

This problem is ultimately addressed and repaired in the associated
userspace tools. The kernel is still responsible for detecting the problem
and notifying the user that something is wrong. Check the superblock LSN at
mount time and fail the mount if it is invalid. From that point on,
trigger verifier failure on any metadata I/O where an invalid LSN is
detected. This results in a filesystem shutdown and guarantees that we
do not log metadata changes with invalid LSNs on disk. Since this is a
known issue with a known recovery path, present a warning to instruct
the user how to recover.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
15 files changed:
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_dir2_block.c
fs/xfs/libxfs/xfs_dir2_data.c
fs/xfs/libxfs/xfs_dir2_leaf.c
fs/xfs/libxfs/xfs_dir2_node.c
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_symlink_remote.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c

index ffad7f20342f6e328b8daafa918ffeb1ae076ab1..d39e6d8a4949cfac23def2c30113226692f45158 100644 (file)
@@ -482,7 +482,9 @@ xfs_agfl_verify(
                    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
                        return false;
        }
-       return true;
+
+       return xfs_log_check_lsn(mp,
+                                be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
 }
 
 static void
@@ -2259,9 +2261,13 @@ xfs_agf_verify(
  {
        struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
+               if (!xfs_log_check_lsn(mp,
+                               be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+                       return false;
+       }
 
        if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
              XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
index 33df52d97ec77bfeb07e4eb8bf45c33d7033a15c..aa187f7ba2dd341a4a797ab268747881f3f4deca 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 #include "xfs_dir2.h"
+#include "xfs_log.h"
 
 
 /*
@@ -266,6 +267,8 @@ xfs_attr3_leaf_verify(
                        return false;
                if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+                       return false;
        } else {
                if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
                        return false;
index f7d7ee7a26072587262dfd27509ba8f502214cc7..235d026c7f9ce8f23f0b33ec1d8bf2fcfeb62823 100644 (file)
@@ -32,6 +32,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_alloc.h"
+#include "xfs_log.h"
 
 /*
  * Cursor allocation zone.
@@ -243,8 +244,14 @@ bool
 xfs_btree_lblock_verify_crc(
        struct xfs_buf          *bp)
 {
-       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+                       return false;
                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+       }
 
        return true;
 }
@@ -275,8 +282,14 @@ bool
 xfs_btree_sblock_verify_crc(
        struct xfs_buf          *bp)
 {
-       if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+                       return false;
                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+       }
 
        return true;
 }
index be43248a5822844f642007ad40bd1e00c808d54e..e89a0f8f827ce0da6829c7189471a4438d019b05 100644 (file)
@@ -39,6 +39,7 @@
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 /*
  * xfs_da_btree.c
@@ -150,6 +151,8 @@ xfs_da3_node_verify(
                        return false;
                if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+                       return false;
        } else {
                if (ichdr.magic != XFS_DA_NODE_MAGIC)
                        return false;
@@ -322,6 +325,7 @@ xfs_da3_node_create(
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 
+               memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
                ichdr.magic = XFS_DA3_NODE_MAGIC;
                hdr3->info.blkno = cpu_to_be64(bp->b_bn);
                hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
index 4778d1dd511afae50eee4c6a7b5d0a7220fc4c4d..9c10e2b8cfcb594101a38011ee68bfcf6a5e09a1 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Local function prototypes.
@@ -71,6 +72,8 @@ xfs_dir3_block_verify(
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+                       return false;
        } else {
                if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
                        return false;
index 824131e71bc53017082f518355c47f963e792cd8..af71a84f343c33b98471665df2d2f37b025571cc 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Check the consistency of the data block.
@@ -224,6 +225,8 @@ xfs_dir3_data_verify(
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+                       return false;
        } else {
                if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
                        return false;
index f300240ebb8d1575191379978ba5ed722c009718..3923e1f9469761548b34a6db73076bb8f90d0c66 100644 (file)
@@ -33,6 +33,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Local function declarations.
@@ -164,6 +165,8 @@ xfs_dir3_leaf_verify(
                        return false;
                if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+                       return false;
        } else {
                if (leaf->hdr.info.magic != cpu_to_be16(magic))
                        return false;
index cc28e924545b52159cb8b0201b5d52dba58f7b1d..70b0cb2fd55606422eaf08ce9e016e33f66fe6a2 100644 (file)
@@ -33,6 +33,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
+#include "xfs_log.h"
 
 /*
  * Function declarations.
@@ -97,6 +98,8 @@ xfs_dir3_free_verify(
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
+               if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+                       return false;
        } else {
                if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
                        return false;
index 54deb2d12ac6bdfeb55ca2e05b56c81ed169e1a5..70c1db99f6a720df26fa0d4c430ad401141f1ad2 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_icreate_item.h"
 #include "xfs_icache.h"
 #include "xfs_trace.h"
+#include "xfs_log.h"
 
 
 /*
@@ -2500,9 +2501,14 @@ xfs_agi_verify(
        struct xfs_mount *mp = bp->b_target->bt_mount;
        struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
 
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+                       return false;
+               if (!xfs_log_check_lsn(mp,
+                               be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
                        return false;
+       }
+
        /*
         * Validate the magic number of the agi block.
         */
index 47425140f34303745bd8ff4b2b2dd0d34d0f033e..a0b071d881a02aa3d99b08e83aadb999b65270a4 100644 (file)
@@ -35,6 +35,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -163,6 +164,15 @@ xfs_mount_validate_sb(
 "Filesystem can not be safely mounted by this kernel.");
                        return -EINVAL;
                }
+       } else if (xfs_sb_version_hascrc(sbp)) {
+               /*
+                * We can't read verify the sb LSN because the read verifier is
+                * called before the log is allocated and processed. We know the
+                * log is set up before write verifier (!check_version) calls,
+                * so just check it here.
+                */
+               if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+                       return -EFSCORRUPTED;
        }
 
        if (xfs_sb_version_has_pquotino(sbp)) {
index b9884aab46d32eb2e51c790357ea4a6c4bf6c193..cb6fd20a4d3d19b61ac5db3dde29345a18432845 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_cksum.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_log.h"
 
 
 /*
@@ -60,6 +61,7 @@ xfs_symlink_hdr_set(
        if (!xfs_sb_version_hascrc(&mp->m_sb))
                return 0;
 
+       memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
        dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
        dsl->sl_offset = cpu_to_be32(offset);
        dsl->sl_bytes = cpu_to_be32(size);
@@ -116,6 +118,8 @@ xfs_symlink_verify(
                return false;
        if (dsl->sl_owner == 0)
                return false;
+       if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+               return false;
 
        return true;
 }
index aaadee0969c929022725b11dcc2cd3a66995bd29..0c8ef767c3a97dea298e69fdaef53db7d4ede871 100644 (file)
@@ -3165,11 +3165,19 @@ xlog_state_switch_iclogs(
        }
 
        if (log->l_curr_block >= log->l_logBBsize) {
+               /*
+                * Rewind the current block before the cycle is bumped to make
+                * sure that the combined LSN never transiently moves forward
+                * when the log wraps to the next cycle. This is to support the
+                * unlocked sample of these fields from xlog_valid_lsn(). Most
+                * other cases should acquire l_icloglock.
+                */
+               log->l_curr_block -= log->l_logBBsize;
+               ASSERT(log->l_curr_block >= 0);
+               smp_wmb();
                log->l_curr_cycle++;
                if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
                        log->l_curr_cycle++;
-               log->l_curr_block -= log->l_logBBsize;
-               ASSERT(log->l_curr_block >= 0);
        }
        ASSERT(iclog == log->l_iclog);
        log->l_iclog = iclog->ic_next;
@@ -4023,3 +4031,45 @@ xlog_iclogs_empty(
        return 1;
 }
 
+/*
+ * Verify that an LSN stamped into a piece of metadata is not ahead of the
+ * current LSN of the log. Intended for use in verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+       struct xfs_mount        *mp,
+       xfs_lsn_t               lsn)
+{
+       struct xlog             *log = mp->m_log;
+       bool                    valid;
+
+       /*
+        * norecovery mode skips mount-time log processing and unconditionally
+        * resets the in-core LSN. We can't validate in this mode, but
+        * modifications are not allowed anyway so just return true.
+        */
+       if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+               return true;
+
+       /*
+        * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+        * handled by recovery and thus safe to ignore here.
+        */
+       if (lsn == NULLCOMMITLSN)
+               return true;
+
+       valid = xlog_valid_lsn(mp->m_log, lsn);
+
+       /* tell the user what went wrong; read the current LSN under l_icloglock */
+       if (!valid) {
+               spin_lock(&log->l_icloglock);
+               xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+                        CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+                        log->l_curr_cycle, log->l_curr_block);
+               spin_unlock(&log->l_icloglock);
+       }
+
+       return valid;
+}
index 09d91d3166cde43479366e0a3a3ad8e325350cf5..aa533a7d50f2186f051b09c7dcfd78b431571d15 100644 (file)
@@ -181,5 +181,6 @@ bool        xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 void   xfs_log_work_queue(struct xfs_mount *mp);
 void   xfs_log_worker(struct work_struct *work);
 void   xfs_log_quiesce(struct xfs_mount *mp);
+bool   xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
 
 #endif /* __XFS_LOG_H__ */
index 950f3f94720c66524baa9a6a4dc0b853e41c3d1b..8daba7491b13f094750d082d5eb8f14a4b3d03d6 100644 (file)
@@ -560,4 +560,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
        remove_wait_queue(wq, &wait);
 }
 
+/*
+ * The LSN is valid so long as it is not ahead of the current LSN (an equal LSN
+ * is accepted). Otherwise, the next log record that includes this metadata
+ * could have a smaller LSN. In turn, this means that the modification in the
+ * log would not replay.
+ */
+static inline bool
+xlog_valid_lsn(
+       struct xlog     *log,
+       xfs_lsn_t       lsn)
+{
+       int             cur_cycle;
+       int             cur_block;
+       bool            valid = true;
+
+       /*
+        * First, sample the current lsn without locking to avoid added
+        * contention from metadata I/O. The current cycle and block are updated
+        * (in xlog_state_switch_iclogs()) and read here in a particular order
+        * to avoid false negatives (e.g., thinking the metadata LSN is valid
+        * when it is not).
+        *
+        * The current block is always rewound before the cycle is bumped in
+        * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+        * a transiently forward state. Instead, we can see the LSN in a
+        * transiently behind state if we happen to race with a cycle wrap.
+        */
+       cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+       smp_rmb();
+       cur_block = ACCESS_ONCE(log->l_curr_block);
+
+       if ((CYCLE_LSN(lsn) > cur_cycle) ||
+           (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+               /*
+                * If the metadata LSN appears invalid, it's possible the check
+                * above raced with a wrap to the next log cycle. Grab the lock
+                * to check for sure.
+                */
+               spin_lock(&log->l_icloglock);
+               cur_cycle = log->l_curr_cycle;
+               cur_block = log->l_curr_block;
+               spin_unlock(&log->l_icloglock);
+
+               if ((CYCLE_LSN(lsn) > cur_cycle) ||
+                   (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+                       valid = false;
+       }
+
+       return valid;
+}
+
 #endif /* __XFS_LOG_PRIV_H__ */
index 512a0945d52ac4e023e2181510e70e65f3da04e8..f8f1363dc045d40be43befb102bf0f473ef75501 100644 (file)
@@ -4609,9 +4609,19 @@ xlog_recover(
        int             error;
 
        /* find the tail of the log */
-       if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+       error = xlog_find_tail(log, &head_blk, &tail_blk);
+       if (error)
                return error;
 
+       /*
+        * The superblock was read before the log was available and thus the LSN
+        * could not be verified. Check the superblock LSN against the current
+        * LSN now that it's known.
+        */
+       if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+           !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+               return -EINVAL;
+
        if (tail_blk != head_blk) {
                /* There used to be a comment here:
                 *