Message ID | 150706335008.19351.13005092237576865356.stgit@magnolia (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree( > > return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); > } > + > +/* > + * Given an inode and the scrub control structure, grab either the > + * inode referenced in the control structure or the inode passed in. > + * The inode is not locked. > + */ > +int > +xfs_scrub_get_inode( > + struct xfs_scrub_context *sc, > + struct xfs_inode *ip_in) > +{ > + struct xfs_mount *mp = sc->mp; > + struct xfs_inode *ips = NULL; *ip? > + int error; > + > + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) > + return -EINVAL; What's this checking? > + > + /* We want to scan the inode we already had opened. */ > + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { > + sc->ip = ip_in; > + return 0; > + } > + > + /* Look up the inode, see if the generation number matches. */ > + if (xfs_internal_inum(mp, sc->sm->sm_ino)) > + return -ENOENT; maybe xfs_internal_inum should be moved to the same place as all the inode/agbno/bno verification functions.... > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > + 0, &ips); I think we also want XFS_IGET_DONTCACHE here, so we don't trash the inode cache with inodes that we use once for scrub and never touch again. > + if (error == -ENOENT || error == -EINVAL) { > + /* inode doesn't exist... */ > + return -ENOENT; > + } else if (error) { > + trace_xfs_scrub_op_error(sc, > + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), > + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), > + error, __return_address); > + return error; > + } > + if (VFS_I(ips)->i_generation != sc->sm->sm_gen) { > + iput(VFS_I(ips)); > + return -ENOENT; > + } > + > + sc->ip = ips; > + return 0; > +} > + > +/* Push everything out of the log onto disk. */ > +int > +xfs_scrub_checkpoint_log( > + struct xfs_mount *mp) > +{ > + int error; > + > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > + if (error) > + return error; > + xfs_ail_push_all_sync(mp->m_ail); > + return 0; > +} Oooo, that's a nasty thing to do on busy systems with large dirty logs. I hope this is a "last resort" kind of thing.... > +/* Set us up with an inode. */ What state are we trying to get the inode into here? We grab all the various locks, but we can still have data changing via mmap pages that are already faulted in and delalloc extents in the incore extent list that aren't reflected on disk... A comment explaining what we expect here would be nice. > +int > +xfs_scrub_setup_inode( > + struct xfs_scrub_context *sc, > + struct xfs_inode *ip) > +{ > + struct xfs_mount *mp = sc->mp; > + int error; > + > + /* > + * Try to get the inode. If the verifiers fail, we try again > + * in raw mode. > + */ > + error = xfs_scrub_get_inode(sc, ip); > + switch (error) { > + case 0: > + break; > + case -EFSCORRUPTED: > + case -EFSBADCRC: > + return xfs_scrub_checkpoint_log(mp); > + default: > + return error; > + } > + > + /* Got the inode, lock it and we're ready to go. */ > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > + xfs_ilock(sc->ip, sc->ilock_flags); > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > + if (error) > + goto out_unlock; > + sc->ilock_flags |= XFS_ILOCK_EXCL; > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); Should the inode be joined to the transaction so that cancelling the transaction unlocks the inode? Then the need for the ilock_flags variable goes away.... > + > + return error; > +out_unlock: > + xfs_iunlock(sc->ip, sc->ilock_flags); > + if (sc->ip != ip) > + iput(VFS_I(sc->ip)); > + sc->ip = NULL; > + return error; > +} > + > +/* Inode core */ > + > +/* Scrub an inode. */ > +int > +xfs_scrub_inode( > + struct xfs_scrub_context *sc) > +{ > + struct xfs_imap imap; > + struct xfs_dinode di; > + struct xfs_mount *mp = sc->mp; > + struct xfs_buf *bp = NULL; > + struct xfs_dinode *dip; > + xfs_ino_t ino; > + size_t fork_recs; > + unsigned long long isize; > + uint64_t flags2; > + uint32_t nextents; > + uint32_t extsize; > + uint32_t cowextsize; > + uint16_t flags; > + uint16_t mode; > + bool has_shared; > + int error = 0; > + > + /* Did we get the in-core inode, or are we doing this manually? */ > + if (sc->ip) { > + ino = sc->ip->i_ino; > + xfs_inode_to_disk(sc->ip, &di, 0); > + dip = &di; > + } else { > + /* Map & read inode. */ > + ino = sc->sm->sm_ino; > + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); > + if (error == -EINVAL) { > + /* > + * Inode could have gotten deleted out from under us; > + * just forget about it. > + */ > + error = -ENOENT; > + goto out; > + } > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > + XFS_INO_TO_AGBNO(mp, ino), &error)) > + goto out; > + > + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, > + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, > + NULL); > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > + XFS_INO_TO_AGBNO(mp, ino), &error)) > + goto out; > + > + /* Is this really an inode? */ > + bp->b_ops = &xfs_inode_buf_ops; > + dip = xfs_buf_offset(bp, imap.im_boffset); > + if (!xfs_dinode_verify(mp, ino, dip) || > + !xfs_dinode_good_version(mp, dip->di_version)) { > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + goto out; > + } > + > + /* ...and is it the one we asked for? */ > + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { > + error = -ENOENT; > + goto out; > + } > + } Can we factor the manual mapping into a separate function? Just makes it a bit cleaner and gets rid of a bunch of local variables from this function that are just used to map and read the inode. ANd reading on and getting ahead of the code, could we split it further into <setup dip> xfs_scrub_dinode(sc, ino, dip, bp); <do live incore inode stuff> > + > + flags = be16_to_cpu(dip->di_flags); > + if (dip->di_version >= 3) > + flags2 = be64_to_cpu(dip->di_flags2); > + else > + flags2 = 0; > + > + /* di_mode */ > + mode = be16_to_cpu(dip->di_mode); > + if (mode & ~(S_IALLUGO | S_IFMT)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* v1/v2 fields */ > + switch (dip->di_version) { > + case 1: > + if (dip->di_nlink != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_mode == 0 && sc->ip) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; We don't really support v1 inode format anymore - we convert it to version 2 automatically in xfs_inode_from_disk() so the in-memory inode is always v2 or v3, never v1. And when we write it back out, we write it as a v2 inode, never as a v1 inode. Hence I'm not sure whether we should be worrying about scrubbing such inodes - they are going to be in an ever shrinking minority of filesystems. At minimum, they should always return "preen". > + case 2: > + case 3: > + if (dip->di_onlink != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_mode == 0 && sc->ip) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (dip->di_projid_hi != 0 && > + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + default: > + ASSERT(0); If we don't understand the version, it's corrupt. > + break; > + } > + > + /* > + * di_uid/di_gid -- -1 isn't invalid, but there's no way that > + * userspace could have created that. > + */ > + if (dip->di_uid == cpu_to_be32(-1U) || > + dip->di_gid == cpu_to_be32(-1U)) > + xfs_scrub_ino_set_warning(sc, bp); > + > + /* di_format */ > + switch (dip->di_format) { > + case XFS_DINODE_FMT_DEV: > + if (!S_ISCHR(mode) && !S_ISBLK(mode) && > + !S_ISFIFO(mode) && !S_ISSOCK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + if (!S_ISDIR(mode) && !S_ISLNK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_EXTENTS: > + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (!S_ISREG(mode) && !S_ISDIR(mode)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_UUID: > + default: > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } > + > + /* di_size */ > + isize = be64_to_cpu(dip->di_size); > + if (isize & (1ULL << 63)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); Should we be checking against the on disk format size, or the mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? 32 or 64 bit systems are going to have different maximum valid file sizes.. Directories have a maximum bound size, too - the data space, leaf space and freespace space, each of which are 32GB in size, IIRC. And symlinks have a different maximum size, too. > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* di_nblocks */ > + if (flags2 & XFS_DIFLAG2_REFLINK) { > + ; /* nblocks can exceed dblocks */ > + } else if (flags & XFS_DIFLAG_REALTIME) { > + if (be64_to_cpu(dip->di_nblocks) >= > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); That doesn't seem right. the file can be on either the data or the rt device, so the maximum file blocks is the size of one device or the other, not both combined. > + } else { > + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } > + > + /* di_extsize */ > + extsize = be32_to_cpu(dip->di_extsize); > + if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) { > + if (extsize <= 0 || extsize > MAXEXTLEN) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + if (!(flags & XFS_DIFLAG_REALTIME) && > + extsize > mp->m_sb.sb_agblocks / 2) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } else { > + if (extsize != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + } There more to validating the extentsize hints than this. From xfs_ioctl_setattr_check_extsize(): /* * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files * 2. FS_XFLAG_EXTSIZE is only valid for regular files * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. * 7. Extent size must be a multiple of the appropriate block size. * 8. for non-realtime files, the extent size hint must be limited * to half the AG size to avoid alignment extending the extent beyond the * limits of the AG. */ Maybe there's some commonality between these two functions... > + > + /* di_flags */ > + if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); What about project id flags? Also, there are flags that are allowed only on regular files, and there are flags that are only allowed on directories. Those should probably also be checked for preening. > + > + /* di_nextents */ > + nextents = be32_to_cpu(dip->di_nextents); > + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > + switch (dip->di_format) { > + case XFS_DINODE_FMT_EXTENTS: > + if (nextents > fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (nextents <= fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + case XFS_DINODE_FMT_DEV: > + case XFS_DINODE_FMT_UUID: > + default: > + if (nextents != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } > + > + /* di_anextents */ > + nextents = be16_to_cpu(dip->di_anextents); > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > + switch (dip->di_aformat) { > + case XFS_DINODE_FMT_EXTENTS: > + if (nextents > fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_BTREE: > + if (nextents <= fork_recs) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + case XFS_DINODE_FMT_LOCAL: > + case XFS_DINODE_FMT_DEV: > + case XFS_DINODE_FMT_UUID: > + default: > + if (nextents != 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + break; > + } Don't we need a check here first to see whether an attribute fork exists or not? > + > + /* di_forkoff */ > + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + if (dip->di_anextents != 0 && dip->di_forkoff == 0) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > + > + /* di_aformat */ > + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && > + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && > + dip->di_aformat != XFS_DINODE_FMT_BTREE) > + xfs_scrub_ino_set_corrupt(sc, ino, bp); Shouldn't this come before we use dip->di_aformat in a switch statement? Hmmm - aren't we missing the same checks for the data fork? Cheers, Dave.
On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote: > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > > @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree( > > > > return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); > > } > > + > > +/* > > + * Given an inode and the scrub control structure, grab either the > > + * inode referenced in the control structure or the inode passed in. > > + * The inode is not locked. > > + */ > > +int > > +xfs_scrub_get_inode( > > + struct xfs_scrub_context *sc, > > + struct xfs_inode *ip_in) > > +{ > > + struct xfs_mount *mp = sc->mp; > > + struct xfs_inode *ips = NULL; > > *ip? > > > + int error; > > + > > + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) > > + return -EINVAL; > > What's this checking? /* * Jump out if userspace fed us an AG number or a inode generation * without an inode number. Both indicate that userspace hasn't got a * clue. */ > > + > > + /* We want to scan the inode we already had opened. */ > > + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { > > + sc->ip = ip_in; > > + return 0; > > + } > > + > > + /* Look up the inode, see if the generation number matches. */ > > + if (xfs_internal_inum(mp, sc->sm->sm_ino)) > > + return -ENOENT; > > maybe xfs_internal_inum should be moved to the same place as all the > inode/agbno/bno verification functions.... Yes. > > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > > + 0, &ips); > > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the > inode cache with inodes that we use once for scrub and never touch > again. I thought about adding this, but if we let the inodes fall out of the cache now then we'll just have to load them back in for the bmap checks, right? > > + if (error == -ENOENT || error == -EINVAL) { > > + /* inode doesn't exist... */ > > + return -ENOENT; > > + } else if (error) { > > + trace_xfs_scrub_op_error(sc, > > + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), > > + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), > > + error, __return_address); > > + return error; > > + } > > + if (VFS_I(ips)->i_generation != sc->sm->sm_gen) { > > + iput(VFS_I(ips)); > > + return -ENOENT; > > + } > > + > > + sc->ip = ips; > > + return 0; > > +} > > + > > +/* Push everything out of the log onto disk. */ > > +int > > +xfs_scrub_checkpoint_log( > > + struct xfs_mount *mp) > > +{ > > + int error; > > + > > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > > + if (error) > > + return error; > > + xfs_ail_push_all_sync(mp->m_ail); > > + return 0; > > +} > > Oooo, that's a nasty thing to do on busy systems with large dirty > logs. I hope this is a "last resort" kind of thing.... It is; we only do this if the inobt says there's an inode there and the inode verifiers fail. > > +/* Set us up with an inode. */ > > What state are we trying to get the inode into here? We grab all the > various locks, but we can still have data changing via mmap pages > that are already faulted in and delalloc extents in the incore > extent list that aren't reflected on disk... > > A comment explaining what we expect here would be nice. /* * Grab total control of the inode metadata. It doesn't matter here if * the file data is still changing, we just want exclusive access to the * metadata. */ > > +int > > +xfs_scrub_setup_inode( > > + struct xfs_scrub_context *sc, > > + struct xfs_inode *ip) > > +{ > > + struct xfs_mount *mp = sc->mp; > > + int error; > > + > > + /* > > + * Try to get the inode. If the verifiers fail, we try again > > + * in raw mode. > > + */ > > + error = xfs_scrub_get_inode(sc, ip); > > + switch (error) { > > + case 0: > > + break; > > + case -EFSCORRUPTED: > > + case -EFSBADCRC: > > + return xfs_scrub_checkpoint_log(mp); > > + default: > > + return error; > > + } > > + > > + /* Got the inode, lock it and we're ready to go. */ > > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > > + xfs_ilock(sc->ip, sc->ilock_flags); > > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > > + if (error) > > + goto out_unlock; > > + sc->ilock_flags |= XFS_ILOCK_EXCL; > > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); > > Should the inode be joined to the transaction so that cancelling the > transaction unlocks the inode? Then the need for the ilock_flags > variable goes away.... This is the confluence of two semi-icky things: first, some of the scrubbers (particularly the dir and parent pointer scrubbers) will need to drop the ILOCK for short periods of time; later on, repair will want to keep the inode locked across all the repair transactions, so it makes more sense to control the lock and unlock directly. > > + > > + return error; > > +out_unlock: > > + xfs_iunlock(sc->ip, sc->ilock_flags); > > + if (sc->ip != ip) > > + iput(VFS_I(sc->ip)); > > + sc->ip = NULL; > > + return error; > > +} > > + > > +/* Inode core */ > > + > > +/* Scrub an inode. */ > > +int > > +xfs_scrub_inode( > > + struct xfs_scrub_context *sc) > > +{ > > + struct xfs_imap imap; > > + struct xfs_dinode di; > > + struct xfs_mount *mp = sc->mp; > > + struct xfs_buf *bp = NULL; > > + struct xfs_dinode *dip; > > + xfs_ino_t ino; > > + size_t fork_recs; > > + unsigned long long isize; > > + uint64_t flags2; > > + uint32_t nextents; > > + uint32_t extsize; > > + uint32_t cowextsize; > > + uint16_t flags; > > + uint16_t mode; > > + bool has_shared; > > + int error = 0; > > + > > + /* Did we get the in-core inode, or are we doing this manually? */ > > + if (sc->ip) { > > + ino = sc->ip->i_ino; > > + xfs_inode_to_disk(sc->ip, &di, 0); > > + dip = &di; > > + } else { > > + /* Map & read inode. */ > > + ino = sc->sm->sm_ino; > > + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); > > + if (error == -EINVAL) { > > + /* > > + * Inode could have gotten deleted out from under us; > > + * just forget about it. > > + */ > > + error = -ENOENT; > > + goto out; > > + } > > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > > + XFS_INO_TO_AGBNO(mp, ino), &error)) > > + goto out; > > + > > + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, > > + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, > > + NULL); > > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > > + XFS_INO_TO_AGBNO(mp, ino), &error)) > > + goto out; > > + > > + /* Is this really an inode? */ > > + bp->b_ops = &xfs_inode_buf_ops; > > + dip = xfs_buf_offset(bp, imap.im_boffset); > > + if (!xfs_dinode_verify(mp, ino, dip) || > > + !xfs_dinode_good_version(mp, dip->di_version)) { > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + goto out; > > + } > > + > > + /* ...and is it the one we asked for? */ > > + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { > > + error = -ENOENT; > > + goto out; > > + } > > + } > > Can we factor the manual mapping into a separate function? Just > makes it a bit cleaner and gets rid of a bunch of local variables > from this function that are just used to map and read the inode. > > ANd reading on and getting ahead of the code, could we split it > further into > > <setup dip> > > xfs_scrub_dinode(sc, ino, dip, bp); > > <do live incore inode stuff> Yes, good plan. > > + > > + flags = be16_to_cpu(dip->di_flags); > > + if (dip->di_version >= 3) > > + flags2 = be64_to_cpu(dip->di_flags2); > > + else > > + flags2 = 0; > > + > > + /* di_mode */ > > + mode = be16_to_cpu(dip->di_mode); > > + if (mode & ~(S_IALLUGO | S_IFMT)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + /* v1/v2 fields */ > > + switch (dip->di_version) { > > + case 1: > > + if (dip->di_nlink != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_mode == 0 && sc->ip) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > We don't really support v1 inode format anymore - we convert it to > version 2 automatically in xfs_inode_from_disk() so the in-memory > inode is always v2 or v3, never v1. And when we write it back out, > we write it as a v2 inode, never as a v1 inode. > > Hence I'm not sure whether we should be worrying about scrubbing > such inodes - they are going to be in an ever shrinking minority > of filesystems. At minimum, they should always return "preen". Ok. I figured that we might end up at "v1 => preen" but decided to play this straight until we got to review. > > + case 2: > > + case 3: > > + if (dip->di_onlink != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_mode == 0 && sc->ip) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_projid_hi != 0 && > > + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + default: > > + ASSERT(0); > > If we don't understand the version, it's corrupt. Yikes an assert. Will replace that stat! > > + break; > > + } > > + > > + /* > > + * di_uid/di_gid -- -1 isn't invalid, but there's no way that > > + * userspace could have created that. > > + */ > > + if (dip->di_uid == cpu_to_be32(-1U) || > > + dip->di_gid == cpu_to_be32(-1U)) > > + xfs_scrub_ino_set_warning(sc, bp); > > + > > + /* di_format */ > > + switch (dip->di_format) { > > + case XFS_DINODE_FMT_DEV: > > + if (!S_ISCHR(mode) && !S_ISBLK(mode) && > > + !S_ISFIFO(mode) && !S_ISSOCK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + if (!S_ISDIR(mode) && !S_ISLNK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_EXTENTS: > > + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (!S_ISREG(mode) && !S_ISDIR(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_UUID: > > + default: > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > + > > + /* di_size */ > > + isize = be64_to_cpu(dip->di_size); > > + if (isize & (1ULL << 63)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > Should we be checking against the on disk format size, or the > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? > 32 or 64 bit systems are going to have different maximum valid file > sizes.. It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only thing we can really check for here is that the upper bit isn't set (because the VFS does not check, but barfs on, files with that large of a size). > Directories have a maximum bound size, too - the data space, leaf > space and freespace space, each of which are 32GB in size, IIRC. > > And symlinks have a different maximum size, too. Fair enough, I'll expand the i_size checks, though ISTR the verifiers now check that for us. > > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + > > + /* di_nblocks */ > > + if (flags2 & XFS_DIFLAG2_REFLINK) { > > + ; /* nblocks can exceed dblocks */ > > + } else if (flags & XFS_DIFLAG_REALTIME) { > > + if (be64_to_cpu(dip->di_nblocks) >= > > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > That doesn't seem right. the file can be on either the data or the > rt device, so the maximum file blocks is the size of one device or > the other, not both combined. di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks), right? So in theory if you had a rt file with 1000 data blocks, 10 bmbt blocks to map the data blocks, and 100 attr blocks then di_nblocks has to be 1110. > > + } else { > > + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } > > + > > + /* di_extsize */ > > + extsize = be32_to_cpu(dip->di_extsize); > > + if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) { > > + if (extsize <= 0 || extsize > MAXEXTLEN) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (!(flags & XFS_DIFLAG_REALTIME) && > > + extsize > mp->m_sb.sb_agblocks / 2) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } else { > > + if (extsize != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } > > There more to validating the extentsize hints than this. From > xfs_ioctl_setattr_check_extsize(): > > /* > * extent size hint validation is somewhat cumbersome. Rules are: > * > * 1. extent size hint is only valid for directories and regular files > * 2. FS_XFLAG_EXTSIZE is only valid for regular files > * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. > * 4. can only be changed on regular files if no extents are allocated > * 5. can be changed on directories at any time > * 6. extsize hint of 0 turns off hints, clears inode flags. > * 7. Extent size must be a multiple of the appropriate block size. > * 8. for non-realtime files, the extent size hint must be limited > * to half the AG size to avoid alignment extending the extent beyond the > * limits of the AG. > */ > > Maybe there's some commonality between these two functions... Definitely, will refactor both. > > + > > + /* di_flags */ > > + if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > What about project id flags? > > Also, there are flags that are allowed only on regular files, and > there are flags that are only allowed on directories. Those should > probably also be checked for preening. <nod> Some of these are checked by the dinode verifier, but there needs to be a comment (or more comment) explaining that. > > + > > + /* di_nextents */ > > + nextents = be32_to_cpu(dip->di_nextents); > > + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > + switch (dip->di_format) { > > + case XFS_DINODE_FMT_EXTENTS: > > + if (nextents > fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (nextents <= fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + case XFS_DINODE_FMT_DEV: > > + case XFS_DINODE_FMT_UUID: > > + default: > > + if (nextents != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > + > > + /* di_anextents */ > > + nextents = be16_to_cpu(dip->di_anextents); > > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > + switch (dip->di_aformat) { > > + case XFS_DINODE_FMT_EXTENTS: > > + if (nextents > fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (nextents <= fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + case XFS_DINODE_FMT_DEV: > > + case XFS_DINODE_FMT_UUID: > > + default: > > + if (nextents != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > Don't we need a check here first to see whether an attribute fork > exists or not? Do you mean the xfs_inode_fork, or something else? XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on di_forkoff so we're really only checking that di_aformat makes sense given the number of extents and the size of the attr fork area. We're not actually checking anything in the attr fork; that's a different scrubber. > > + > > + /* di_forkoff */ > > + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + if (dip->di_anextents != 0 && dip->di_forkoff == 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + /* di_aformat */ > > + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && > > + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && > > + dip->di_aformat != XFS_DINODE_FMT_BTREE) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > Shouldn't this come before we use dip->di_aformat in a switch > statement? Yes. Good catch. > Hmmm - aren't we missing the same checks for the data fork? I believe you'll find them further up in the function. --D > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Oct 04, 2017 at 10:22:19PM -0700, Darrick J. Wong wrote: > On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote: > > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > > > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > > > + 0, &ips); > > > > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the > > inode cache with inodes that we use once for scrub and never touch > > again. > > I thought about adding this, but if we let the inodes fall out of the > cache now then we'll just have to load them back in for the bmap checks, > right? Well, I'm looking at ensuring that we don't blow out the memory side of things. We've still got the inode buffer in the buffer cache, so I don't see why we should double cache these things and then leave both cached copied hanging around after we've finished with them. Leave the buffer around because we do a fair few checks with it, but don't use excessive icache memory and trash the working set if we can avoid it... > > > +xfs_scrub_checkpoint_log( > > > + struct xfs_mount *mp) > > > +{ > > > + int error; > > > + > > > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > > > + if (error) > > > + return error; > > > + xfs_ail_push_all_sync(mp->m_ail); > > > + return 0; > > > +} > > > > Oooo, that's a nasty thing to do on busy systems with large dirty > > logs. I hope this is a "last resort" kind of thing.... > > It is; we only do this if the inobt says there's an inode there and the > inode verifiers fail. Ok, so why would pushing the log and the AIL make the verifier then succeed? how likely is this to occur on a busy system? > > > +/* Set us up with an inode. */ > > > > What state are we trying to get the inode into here? We grab all the > > various locks, but we can still have data changing via mmap pages > > that are already faulted in and delalloc extents in the incore > > extent list that aren't reflected on disk... > > > > A comment explaining what we expect here would be nice. > > /* > * Grab total control of the inode metadata. It doesn't matter here if > * the file data is still changing, we just want exclusive access to the > * metadata. > */ *nod* > > > + /* Got the inode, lock it and we're ready to go. */ > > > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > > > + xfs_ilock(sc->ip, sc->ilock_flags); > > > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > > > + if (error) > > > + goto out_unlock; > > > + sc->ilock_flags |= XFS_ILOCK_EXCL; > > > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); > > > > Should the inode be joined to the transaction so that cancelling the > > transaction unlocks the inode? Then the need for the ilock_flags > > variable goes away.... > > This is the confluence of two semi-icky things: first, some of the > scrubbers (particularly the dir and parent pointer scrubbers) will need > to drop the ILOCK for short periods of time; later on, repair will want > to keep the inode locked across all the repair transactions, so it makes > more sense to control the lock and unlock directly. Ok, I'll pass on this for now, see how the rest of the code falls out. > > > + /* di_size */ > > > + isize = be64_to_cpu(dip->di_size); > > > + if (isize & (1ULL << 63)) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > Should we be checking against the on disk format size, or the > > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? > > 32 or 64 bit systems are going to have different maximum valid file > > sizes.. > > It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only Ugh. We can't do IO past 16TB on 32 bit systems, so I'm kinda surprised truncate doesn't have the same s_maxbytes restriction... > thing we can really check for here is that the upper bit isn't set > (because the VFS does not check, but barfs on, files with that large of > a size). xfs_max_file_offset() sets the max file offset to 2^63 - 1, so it looks like the lack of checking in truncate is the problem here, not the IO path. > > Directories have a maximum bound size, too - the data space, leaf > > space and freespace space, each of which are 32GB in size, IIRC. > > > > And symlinks have a different maximum size, too. > > Fair enough, I'll expand the i_size checks, though ISTR the verifiers > now check that for us. If they do, then just drop a comment in there to say what is checked by the verifier. > > > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > > + > > > + /* di_nblocks */ > > > + if (flags2 & XFS_DIFLAG2_REFLINK) { > > > + ; /* nblocks can exceed dblocks */ > > > + } else if (flags & XFS_DIFLAG_REALTIME) { > > > + if (be64_to_cpu(dip->di_nblocks) >= > > > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > That doesn't seem right. the file can be on either the data or the > > rt device, so the maximum file blocks is the size of one device or > > the other, not both combined. > > di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks), > right? Yeah, forgot it was more than just data extents. > So in theory if you had a rt file with 1000 data blocks, 10 bmbt > blocks to map the data blocks, and 100 attr blocks then di_nblocks has > to be 1110. Yup, but the additional metadata on the data device is not going to be anywhere near the size of the data device. /me shrugs I can't think of an easy way to get a maximum block count, so I guess that'll have to do... > > > + if (nextents > fork_recs) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + case XFS_DINODE_FMT_BTREE: > > > + if (nextents <= fork_recs) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + case XFS_DINODE_FMT_LOCAL: > > > + case XFS_DINODE_FMT_DEV: > > > + case XFS_DINODE_FMT_UUID: > > > + default: > > > + if (nextents != 0) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + } > > > + > > > + /* di_anextents */ > > > + nextents = be16_to_cpu(dip->di_anextents); > > > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > > + switch (dip->di_aformat) { > > > + case XFS_DINODE_FMT_EXTENTS: > > > + if (nextents > fork_recs) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + case XFS_DINODE_FMT_BTREE: > > > + if (nextents <= fork_recs) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + case XFS_DINODE_FMT_LOCAL: > > > + case XFS_DINODE_FMT_DEV: > > > + case XFS_DINODE_FMT_UUID: > > > + default: > > > + if (nextents != 0) > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + break; > > > + } > > > > Don't we need a check here first to see whether an attribute fork > > exists or not? > > Do you mean the xfs_inode_fork, or something else? SOmething else. :P > XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on > di_forkoff so we're really only checking that di_aformat makes sense > given the number of extents and the size of the attr fork area. Right, but if XFS_DFORK_ASIZE == 0, the dip->di_aformat *must* be XFS_DINODE_FMT_EXTENTS. That's the only valid configuration when there is no attribute fork present. If there is an attribute fork present, then it can be XFS_DINODE_FMT_LOCAL, EXTENT or BTREE, and then the extent count needs checking. XFS_DINODE_FMT_DEV and XFS_DINODE_FMT_UUID are both invalid for the attribute fork. Cheers, Dave.
On Thu, Oct 05, 2017 at 06:13:39PM +1100, Dave Chinner wrote: > On Wed, Oct 04, 2017 at 10:22:19PM -0700, Darrick J. Wong wrote: > > On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote: > > > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > > > > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > > > > + 0, &ips); > > > > > > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the > > > inode cache with inodes that we use once for scrub and never touch > > > again. > > > > I thought about adding this, but if we let the inodes fall out of the > > cache now then we'll just have to load them back in for the bmap checks, > > right? > > Well, I'm looking at ensuring that we don't blow out the memory > side of things. We've still got the inode buffer in the buffer > cache, so I don't see why we should double cache these things > and then leave both cached copied hanging around after we've > finished with them. Leave the buffer around because we do a fair few > checks with it, but don't use excessive icache memory and trash the > working set if we can avoid it... Sure, I'll give it a try and see what the performance impacts are. (I /have/ been looking into OOM problems on artificially memory-constrained VMs.) > > > > +xfs_scrub_checkpoint_log( > > > > + struct xfs_mount *mp) > > > > +{ > > > > + int error; > > > > + > > > > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > > > > + if (error) > > > > + return error; > > > > + xfs_ail_push_all_sync(mp->m_ail); > > > > + return 0; > > > > +} > > > > > > Oooo, that's a nasty thing to do on busy systems with large dirty > > > logs. I hope this is a "last resort" kind of thing.... > > > > It is; we only do this if the inobt says there's an inode there and the > > inode verifiers fail. > > Ok, so why would pushing the log and the AIL make the verifier then > succeed? how likely is this to occur on a busy system? Hmm, looking through my notes the original reason for shoving on the log is to force any dirty inode items to get checkpointed back to disk so that we could read the raw buffers. I think there's still a desire to be able to checkpoint the log in order to resolve discrepancies via raw disk buffer reads ... but at this point the only code that does this is: (a) the code that checks inobt freemask against the on-disk inodes if the inode isn't in the cache (we can't rely on the inobt to look up uncached inodes in order to check inobt fields) (b) AGF/AGI repair will want to ->verify_read the alleged btree root blocks. If the root blocks are new and have never been checkpointed, they'll have a crc/lsn of zero, which causes verifier error. (Granted this should never happen except when we're artificially forcing repair to rebuild otherwise ok metadata as part of testing.) (c) bnobt repair will want to be able to flush the log to empty out the busy extent list because it works by freeing all the blocks not listed by the rmap and the busy extent list can't handle overlapping entries. Though now that I look at it, this function belongs in the ialloc patch, and the if (force_log) xfs_scrub_checkpoint_log() call has somehow migrated out to one of the repair patches. Ugh. > > > > +/* Set us up with an inode. */ > > > > > > What state are we trying to get the inode into here? We grab all the > > > various locks, but we can still have data changing via mmap pages > > > that are already faulted in and delalloc extents in the incore > > > extent list that aren't reflected on disk... > > > > > > A comment explaining what we expect here would be nice. > > > > /* > > * Grab total control of the inode metadata. It doesn't matter here if > > * the file data is still changing, we just want exclusive access to the > > * metadata. > > */ > > *nod* > > > > > + /* Got the inode, lock it and we're ready to go. */ > > > > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > > > > + xfs_ilock(sc->ip, sc->ilock_flags); > > > > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > > > > + if (error) > > > > + goto out_unlock; > > > > + sc->ilock_flags |= XFS_ILOCK_EXCL; > > > > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); > > > > > > Should the inode be joined to the transaction so that cancelling the > > > transaction unlocks the inode? Then the need for the ilock_flags > > > variable goes away.... > > > > This is the confluence of two semi-icky things: first, some of the > > scrubbers (particularly the dir and parent pointer scrubbers) will need > > to drop the ILOCK for short periods of time; later on, repair will want > > to keep the inode locked across all the repair transactions, so it makes > > more sense to control the lock and unlock directly. > > Ok, I'll pass on this for now, see how the rest of the code falls > out. > > > > > + /* di_size */ > > > > + isize = be64_to_cpu(dip->di_size); > > > > + if (isize & (1ULL << 63)) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > > > Should we be checking against the on disk format size, or the > > > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? > > > 32 or 64 bit systems are going to have different maximum valid file > > > sizes.. > > > > It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only > > Ugh. We can't do IO past 16TB on 32 bit systems, so I'm kinda > surprised truncate doesn't have the same s_maxbytes restriction... > > > thing we can really check for here is that the upper bit isn't set > > (because the VFS does not check, but barfs on, files with that large of > > a size). > > xfs_max_file_offset() sets the max file offset to 2^63 - 1, so it > looks like the lack of checking in truncate is the problem here, > not the IO path. Imagine that you format an xfs on a x64 system, create a 20TB file, and scrub/repair say it's ok. Then you mount the same fs on your 2014-era smartphone^W^Warm32 system. Yes, the smartphone can't handle such a big file, but is this fs corruption? I don't think this qualifies for OFLAG_CORRUPT. Come to think of it, this is one of the scenarios for which OFLAG_WARNING was intended -- not technically a violation of the disk spec, but needs review anyway. Ok, I've convinced myself. :) /* * Warn if the running kernel can't handle the kinds of offsets * needed to deal with the file size. In other words, if the * pagecache can't cache all the blocks in this file due to * overly large offsets, flag the inode for admin review. */ if (isize >= mp->m_super->s_maxbytes) xfs_scrub_ino_set_warning(sc, bp); > > > Directories have a maximum bound size, too - the data space, leaf > > > space and freespace space, each of which are 32GB in size, IIRC. > > > > > > And symlinks have a different maximum size, too. > > > > Fair enough, I'll expand the i_size checks, though ISTR the verifiers > > now check that for us. > > If they do, then just drop a comment in there to say what is checked > by the verifier. Hmm, the dinode verifier checks for the upper bit and for zero-length symlinks and directories. Will add the symlink/directory isize checks. > > > > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > > > > + > > > > + /* di_nblocks */ > > > > + if (flags2 & XFS_DIFLAG2_REFLINK) { > > > > + ; /* nblocks can exceed dblocks */ > > > > + } else if (flags & XFS_DIFLAG_REALTIME) { > > > > + if (be64_to_cpu(dip->di_nblocks) >= > > > > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > > > That doesn't seem right. the file can be on either the data or the > > > rt device, so the maximum file blocks is the size of one device or > > > the other, not both combined. > > > > di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks), > > right? > > Yeah, forgot it was more than just data extents. > > > So in theory if you had a rt file with 1000 data blocks, 10 bmbt > > blocks to map the data blocks, and 100 attr blocks then di_nblocks has > > to be 1110. > > Yup, but the additional metadata on the data device is not going to > be anywhere near the size of the data device. > > /me shrugs > > I can't think of an easy way to get a maximum block count, so I > guess that'll have to do... Yeah, I couldn't come up with a more precise check that doesn't involve checking the data forks directly (which comes up later). > > > > + if (nextents > fork_recs) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + case XFS_DINODE_FMT_BTREE: > > > > + if (nextents <= fork_recs) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + case XFS_DINODE_FMT_LOCAL: > > > > + case XFS_DINODE_FMT_DEV: > > > > + case XFS_DINODE_FMT_UUID: > > > > + default: > > > > + if (nextents != 0) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + } > > > > + > > > > + /* di_anextents */ > > > > + nextents = be16_to_cpu(dip->di_anextents); > > > > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > > > + switch (dip->di_aformat) { > > > > + case XFS_DINODE_FMT_EXTENTS: > > > > + if (nextents > fork_recs) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + case XFS_DINODE_FMT_BTREE: > > > > + if (nextents <= fork_recs) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + case XFS_DINODE_FMT_LOCAL: > > > > + case XFS_DINODE_FMT_DEV: > > > > + case XFS_DINODE_FMT_UUID: > > > > + default: > > > > + if (nextents != 0) > > > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > > + break; > > > > + } > > > > > > Don't we need a check here first to see whether an attribute fork > > > exists or not? > > > > Do you mean the xfs_inode_fork, or something else? > > SOmething else. :P > > > XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on > > di_forkoff so we're really only checking that di_aformat makes sense > > given the number of extents and the size of the attr fork area. > > Right, but if XFS_DFORK_ASIZE == 0, the dip->di_aformat *must* be > XFS_DINODE_FMT_EXTENTS. That's the only valid configuration when > there is no attribute fork present. > > If there is an attribute fork present, then it can be XFS_DINODE_FMT_LOCAL, > EXTENT or BTREE, and then the extent count needs checking. > XFS_DINODE_FMT_DEV and XFS_DINODE_FMT_UUID are both invalid for the > attribute fork. Ok. Will rearrange this to check di_forkoff and di_aformat before bothering with the di_anextents checks. --D > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a7c5752..28e14b7 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -151,6 +151,7 @@ xfs-y += $(addprefix scrub/, \ btree.o \ common.o \ ialloc.o \ + inode.o \ refcount.o \ rmap.o \ scrub.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b3f992c..f8463e0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -494,9 +494,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */ #define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */ #define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */ +#define XFS_SCRUB_TYPE_INODE 11 /* inode record */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 11 +#define XFS_SCRUB_TYPE_NR 12 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1 << 0) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 4f8d103..415dec6 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -30,6 +30,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" #include "xfs_alloc.h" #include "xfs_alloc_btree.h" #include "xfs_bmap.h" @@ -40,6 +42,8 @@ #include "xfs_refcount_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree( return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); } + +/* + * Given an inode and the scrub control structure, grab either the + * inode referenced in the control structure or the inode passed in. + * The inode is not locked. + */ +int +xfs_scrub_get_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip_in) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ips = NULL; + int error; + + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) + return -EINVAL; + + /* We want to scan the inode we already had opened. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return 0; + } + + /* Look up the inode, see if the generation number matches. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, + 0, &ips); + if (error == -ENOENT || error == -EINVAL) { + /* inode doesn't exist... */ + return -ENOENT; + } else if (error) { + trace_xfs_scrub_op_error(sc, + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; + } + if (VFS_I(ips)->i_generation != sc->sm->sm_gen) { + iput(VFS_I(ips)); + return -ENOENT; + } + + sc->ip = ips; + return 0; +} + +/* Push everything out of the log onto disk. */ +int +xfs_scrub_checkpoint_log( + struct xfs_mount *mp) +{ + int error; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + if (error) + return error; + xfs_ail_push_all_sync(mp->m_ail); + return 0; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 6325f03..c8143e8 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -74,6 +74,8 @@ void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc); +int xfs_scrub_checkpoint_log(struct xfs_mount *mp); + /* Setup functions */ int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip); int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, @@ -86,6 +88,8 @@ int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc, struct xfs_inode *ip); int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc, struct xfs_inode *ip); +int xfs_scrub_setup_inode(struct xfs_scrub_context *sc, + struct xfs_inode *ip); void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); @@ -106,5 +110,6 @@ int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc, struct xfs_inode *ip, bool force_log); +int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c new file mode 100644 index 0000000..88dd3fc --- /dev/null +++ b/fs/xfs/scrub/inode.c @@ -0,0 +1,353 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_ialloc.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Set us up with an inode. */ +int +xfs_scrub_setup_inode( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = sc->mp; + int error; + + /* + * Try to get the inode. If the verifiers fail, we try again + * in raw mode. + */ + error = xfs_scrub_get_inode(sc, ip); + switch (error) { + case 0: + break; + case -EFSCORRUPTED: + case -EFSBADCRC: + return xfs_scrub_checkpoint_log(mp); + default: + return error; + } + + /* Got the inode, lock it and we're ready to go. */ + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + if (error) + goto out_unlock; + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + + return error; +out_unlock: + xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ip != ip) + iput(VFS_I(sc->ip)); + sc->ip = NULL; + return error; +} + +/* Inode core */ + +/* Scrub an inode. */ +int +xfs_scrub_inode( + struct xfs_scrub_context *sc) +{ + struct xfs_imap imap; + struct xfs_dinode di; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp = NULL; + struct xfs_dinode *dip; + xfs_ino_t ino; + size_t fork_recs; + unsigned long long isize; + uint64_t flags2; + uint32_t nextents; + uint32_t extsize; + uint32_t cowextsize; + uint16_t flags; + uint16_t mode; + bool has_shared; + int error = 0; + + /* Did we get the in-core inode, or are we doing this manually? */ + if (sc->ip) { + ino = sc->ip->i_ino; + xfs_inode_to_disk(sc->ip, &di, 0); + dip = &di; + } else { + /* Map & read inode. */ + ino = sc->sm->sm_ino; + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); + if (error == -EINVAL) { + /* + * Inode could have gotten deleted out from under us; + * just forget about it. + */ + error = -ENOENT; + goto out; + } + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + + /* Is this really an inode? */ + bp->b_ops = &xfs_inode_buf_ops; + dip = xfs_buf_offset(bp, imap.im_boffset); + if (!xfs_dinode_verify(mp, ino, dip) || + !xfs_dinode_good_version(mp, dip->di_version)) { + xfs_scrub_ino_set_corrupt(sc, ino, bp); + goto out; + } + + /* ...and is it the one we asked for? */ + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { + error = -ENOENT; + goto out; + } + } + + flags = be16_to_cpu(dip->di_flags); + if (dip->di_version >= 3) + flags2 = be64_to_cpu(dip->di_flags2); + else + flags2 = 0; + + /* di_mode */ + mode = be16_to_cpu(dip->di_mode); + if (mode & ~(S_IALLUGO | S_IFMT)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* v1/v2 fields */ + switch (dip->di_version) { + case 1: + if (dip->di_nlink != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_mode == 0 && sc->ip) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case 2: + case 3: + if (dip->di_onlink != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_mode == 0 && sc->ip) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (dip->di_projid_hi != 0 && + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + default: + ASSERT(0); + break; + } + + /* + * di_uid/di_gid -- -1 isn't invalid, but there's no way that + * userspace could have created that. + */ + if (dip->di_uid == cpu_to_be32(-1U) || + dip->di_gid == cpu_to_be32(-1U)) + xfs_scrub_ino_set_warning(sc, bp); + + /* di_format */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (!S_ISCHR(mode) && !S_ISBLK(mode) && + !S_ISFIFO(mode) && !S_ISSOCK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_LOCAL: + if (!S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_EXTENTS: + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (!S_ISREG(mode) && !S_ISDIR(mode)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_UUID: + default: + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* di_size */ + isize = be64_to_cpu(dip->di_size); + if (isize & (1ULL << 63)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_nblocks */ + if (flags2 & XFS_DIFLAG2_REFLINK) { + ; /* nblocks can exceed dblocks */ + } else if (flags & XFS_DIFLAG_REALTIME) { + if (be64_to_cpu(dip->di_nblocks) >= + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } else { + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + /* di_extsize */ + extsize = be32_to_cpu(dip->di_extsize); + if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) { + if (extsize <= 0 || extsize > MAXEXTLEN) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + if (!(flags & XFS_DIFLAG_REALTIME) && + extsize > mp->m_sb.sb_agblocks / 2) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } else { + if (extsize != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + /* di_flags */ + if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_nextents */ + nextents = be32_to_cpu(dip->di_nextents); + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_format) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_UUID: + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* di_anextents */ + nextents = be16_to_cpu(dip->di_anextents); + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); + switch (dip->di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if (nextents > fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_BTREE: + if (nextents <= fork_recs) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_UUID: + default: + if (nextents != 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + break; + } + + /* di_forkoff */ + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (dip->di_anextents != 0 && dip->di_forkoff == 0) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_aformat */ + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && + dip->di_aformat != XFS_DINODE_FMT_BTREE) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + + /* di_cowextsize */ + if (flags2 & XFS_DIFLAG2_COWEXTSIZE) { + cowextsize = be32_to_cpu(dip->di_cowextsize); + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (cowextsize <= 0 || cowextsize > MAXEXTLEN) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + if (cowextsize > mp->m_sb.sb_agblocks / 2) + xfs_scrub_ino_set_corrupt(sc, ino, bp); + } + + /* Now let's do the things that require a live inode. */ + if (!sc->ip) + goto out; + + /* + * Does this inode have the reflink flag set but no shared extents? + * Set the preening flag if this is the case. + */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &has_shared); + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), + XFS_INO_TO_AGBNO(mp, ino), &error)) + goto out; + if (!has_shared) + xfs_scrub_ino_set_preen(sc, bp); + } + +out: + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7b44364..f014ef0 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -30,6 +30,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_itable.h" #include "xfs_alloc.h" #include "xfs_alloc_btree.h" #include "xfs_bmap.h" @@ -136,6 +138,7 @@ xfs_scrub_probe( STATIC int xfs_scrub_teardown( struct xfs_scrub_context *sc, + struct xfs_inode *ip_in, int error) { xfs_scrub_ag_free(sc, &sc->sa); @@ -143,6 +146,12 @@ xfs_scrub_teardown( xfs_trans_cancel(sc->tp); sc->tp = NULL; } + if (sc->ip) { + xfs_iunlock(sc->ip, sc->ilock_flags); + if (sc->ip != ip_in) + iput(VFS_I(sc->ip)); + sc->ip = NULL; + } return error; } @@ -196,6 +205,10 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, }, + { /* inode record */ + .setup = xfs_scrub_setup_inode, + .scrub = xfs_scrub_inode, + }, }; /* This isn't a stable feature, warn once per day. */ @@ -281,7 +294,7 @@ xfs_scrub_metadata( * Tear down everything we hold, then set up again with * preparation for worst-case scenarios. */ - error = xfs_scrub_teardown(&sc, 0); + error = xfs_scrub_teardown(&sc, ip, 0); if (error) goto out; try_harder = true; @@ -294,7 +307,7 @@ xfs_scrub_metadata( xfs_alert_ratelimited(mp, "Corruption detected during scrub."); out_teardown: - error = xfs_scrub_teardown(&sc, error); + error = xfs_scrub_teardown(&sc, ip, error); out: trace_xfs_scrub_done(ip, sm, error); return error; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1c80bf5..ec635d4 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -59,6 +59,7 @@ struct xfs_scrub_context { const struct xfs_scrub_meta_ops *ops; struct xfs_trans *tp; struct xfs_inode *ip; + uint ilock_flags; bool try_harder; /* State tracking for single-AG operations. */ @@ -77,5 +78,6 @@ int xfs_scrub_inobt(struct xfs_scrub_context *sc); int xfs_scrub_finobt(struct xfs_scrub_context *sc); int xfs_scrub_rmapbt(struct xfs_scrub_context *sc); int xfs_scrub_refcountbt(struct xfs_scrub_context *sc); +int xfs_scrub_inode(struct xfs_scrub_context *sc); #endif /* __XFS_SCRUB_SCRUB_H__ */