Message ID | 150706333138.19351.1481631767118687082.stgit@magnolia (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Tue, Oct 03, 2017 at 01:42:11PM -0700, Darrick J. Wong wrote: > +/* > + * Set us up to scrub inode btrees. > + * If we detect a discrepancy between the inobt and the inode, > + * try again after forcing logged inode cores out to disk. > + */ > +int > +xfs_scrub_setup_ag_iallocbt( > + struct xfs_scrub_context *sc, > + struct xfs_inode *ip) > +{ > + return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder); > +} > + > +/* Inode btree scrubber. */ > + > +/* Is this chunk worth checking? */ > +STATIC bool > +xfs_scrub_iallocbt_chunk( > + struct xfs_scrub_btree *bs, > + struct xfs_inobt_rec_incore *irec, > + xfs_agino_t agino, > + xfs_extlen_t len) > +{ > + struct xfs_mount *mp = bs->cur->bc_mp; > + struct xfs_agf *agf; > + unsigned long long rec_end; > + xfs_agblock_t eoag; > + xfs_agblock_t bno; > + > + agf = XFS_BUF_TO_AGF(bs->sc->sa.agf_bp); > + eoag = be32_to_cpu(agf->agf_length); Probably should use the AGI for this. > + bno = XFS_AGINO_TO_AGBNO(mp, agino); > + rec_end = (unsigned long long)bno + len; > + > + if (bno >= mp->m_sb.sb_agblocks || bno >= eoag || > + rec_end > mp->m_sb.sb_agblocks || rec_end > eoag) { Same comment as last patch. > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + return false; > + } > + > + return true; > +} I note there is no check on the length passed in for the inode chunk - should that be verified? > + > +/* Count the number of free inodes. */ > +static unsigned int > +xfs_scrub_iallocbt_freecount( > + xfs_inofree_t freemask) > +{ > + int bits = XFS_INODES_PER_CHUNK; > + unsigned int ret = 0; > + > + while (bits--) { > + if (freemask & 1) > + ret++; > + freemask >>= 1; > + } Seems a little cumbersome. Perhaps a loop using xfs_next_bit() might be a bit faster, something like: nextbit = xfs_next_bit(&freemask, 1, 0); while (nextbit != -1) { ret++; nextbit = xfs_next_bit(&freemask, 1, nextbit + 1); } > +/* Check a particular inode with ir_free. 
*/ > +STATIC int > +xfs_scrub_iallocbt_check_cluster_freemask( > + struct xfs_scrub_btree *bs, > + xfs_ino_t fsino, > + xfs_agino_t chunkino, > + xfs_agino_t clusterino, > + struct xfs_inobt_rec_incore *irec, > + struct xfs_buf *bp) > +{ > + struct xfs_dinode *dip; > + struct xfs_mount *mp = bs->cur->bc_mp; > + bool freemask_ok; > + bool inuse; > + int error; > + > + dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); > + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || > + (dip->di_version >= 3 && > + be64_to_cpu(dip->di_ino) != fsino + clusterino)) { > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + goto out; > + } > + > + freemask_ok = !!(irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)); No need for !!(...) for a bool type - the compiler will squash it down to 0/1 autmoatically. > + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, > + fsino + clusterino, &inuse); > + if (error == -ENODATA) { > + /* Not cached, just read the disk buffer */ I think that is wrong. xfs_icache_inode_is_allocated() returns -ENOENT if the inode is not in cache.... > + freemask_ok ^= !!(dip->di_mode); > + if (!bs->sc->try_harder && !freemask_ok) > + return -EDEADLOCK; > + } else if (error < 0) { > + /* Inode is only half assembled, don't bother. */ > + freemask_ok = true; Or we had an IO error looking it up. i.e. -EAGAIN is the "half assembled" state (i.e. in the XFS_INEW state) or the half *disasembled* state (i.e. XFS_IRECLAIMABLE), anything else is an error... > + } else { > + /* Inode is all there. */ > + freemask_ok ^= inuse; So inuse is returned from a mode check after iget succeeds. The mode isn't zeroed until /after/ XFS_IRECLAIMABLE is set, but it's also set before XFS_INEW is cleared. IOWs, how can xfs_icache_inode_is_allocated() report anything other than inuse == true here? If that's the case, what's the point of the mode check inside xfs_icache_inode_is_allocated()? 
> + } > + if (!freemask_ok) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > +out: > + return 0; > +} > + > +/* Make sure the free mask is consistent with what the inodes think. */ > +STATIC int > +xfs_scrub_iallocbt_check_freemask( > + struct xfs_scrub_btree *bs, > + struct xfs_inobt_rec_incore *irec) > +{ > + struct xfs_owner_info oinfo; > + struct xfs_imap imap; > + struct xfs_mount *mp = bs->cur->bc_mp; > + struct xfs_dinode *dip; > + struct xfs_buf *bp; > + xfs_ino_t fsino; > + xfs_agino_t nr_inodes; > + xfs_agino_t agino; > + xfs_agino_t chunkino; > + xfs_agino_t clusterino; > + xfs_agblock_t agbno; > + int blks_per_cluster; > + uint16_t holemask; > + uint16_t ir_holemask; > + int error = 0; > + > + /* Make sure the freemask matches the inode records. */ > + blks_per_cluster = xfs_icluster_size_fsb(mp); > + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); Does this setup and loop work for the case where we have 64k filesystem blocks and so two or more inode chunks per filesystem block (i.e. ppc64)? > +/* Scrub an inobt/finobt record. 
*/ > +STATIC int > +xfs_scrub_iallocbt_helper( > + struct xfs_scrub_btree *bs, > + union xfs_btree_rec *rec) > +{ > + struct xfs_mount *mp = bs->cur->bc_mp; > + struct xfs_agi *agi; > + struct xfs_inobt_rec_incore irec; > + uint64_t holes; > + xfs_agino_t agino; > + xfs_agblock_t agbno; > + xfs_extlen_t len; > + int holecount; > + int i; > + int error = 0; > + unsigned int real_freecount; > + uint16_t holemask; > + > + xfs_inobt_btrec_to_irec(mp, rec, &irec); > + > + if (irec.ir_count > XFS_INODES_PER_CHUNK || > + irec.ir_freecount > XFS_INODES_PER_CHUNK) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + > + real_freecount = irec.ir_freecount + > + (XFS_INODES_PER_CHUNK - irec.ir_count); > + if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free)) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + > + agi = XFS_BUF_TO_AGI(bs->sc->sa.agi_bp); > + agino = irec.ir_startino; > + agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); > + if (agbno >= be32_to_cpu(agi->agi_length)) { Validate as per every other agbno? > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + goto out; > + } > + > + if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || > + (agbno & (xfs_icluster_size_fsb(mp) - 1))) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); What's the magic masking checks being done here? (comment?) > + /* Handle non-sparse inodes */ > + if (!xfs_inobt_issparse(irec.ir_holemask)) { > + len = XFS_B_TO_FSB(mp, > + XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); > + if (irec.ir_count != XFS_INODES_PER_CHUNK) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + > + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) > + goto out; > + goto check_freemask; > + } > + > + /* Check each chunk of a sparse inode cluster. 
*/ > + holemask = irec.ir_holemask; > + holecount = 0; > + len = XFS_B_TO_FSB(mp, > + XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); > + holes = ~xfs_inobt_irec_to_allocmask(&irec); > + if ((holes & irec.ir_free) != holes || > + irec.ir_freecount > irec.ir_count) > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > + > + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, > + i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { Urk. THat's a bit hard to read. > + if (holemask & 1) { > + holecount += XFS_INODES_PER_HOLEMASK_BIT; > + continue; > + } > + > + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) > + break; > + } How about for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { if (holemask & 1) { holecount += XFS_INODES_PER_HOLEMASK_BIT; } else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) break; holemask >>= 1; agino += XFS_INODES_PER_HOLEMASK_BIT; } Cheers, Dave.
On Thu, Oct 05, 2017 at 01:08:10PM +1100, Dave Chinner wrote: > On Tue, Oct 03, 2017 at 01:42:11PM -0700, Darrick J. Wong wrote: > > +/* > > + * Set us up to scrub inode btrees. > > + * If we detect a discrepancy between the inobt and the inode, > > + * try again after forcing logged inode cores out to disk. > > + */ > > +int > > +xfs_scrub_setup_ag_iallocbt( > > + struct xfs_scrub_context *sc, > > + struct xfs_inode *ip) > > +{ > > + return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder); > > +} > > + > > +/* Inode btree scrubber. */ > > + > > +/* Is this chunk worth checking? */ > > +STATIC bool > > +xfs_scrub_iallocbt_chunk( > > + struct xfs_scrub_btree *bs, > > + struct xfs_inobt_rec_incore *irec, > > + xfs_agino_t agino, > > + xfs_extlen_t len) > > +{ > > + struct xfs_mount *mp = bs->cur->bc_mp; > > + struct xfs_agf *agf; > > + unsigned long long rec_end; > > + xfs_agblock_t eoag; > > + xfs_agblock_t bno; > > + > > + agf = XFS_BUF_TO_AGF(bs->sc->sa.agf_bp); > > + eoag = be32_to_cpu(agf->agf_length); <nod> > Probably should use the AGI for this. > > > + bno = XFS_AGINO_TO_AGBNO(mp, agino); > > + rec_end = (unsigned long long)bno + len; > > + > > + if (bno >= mp->m_sb.sb_agblocks || bno >= eoag || > > + rec_end > mp->m_sb.sb_agblocks || rec_end > eoag) { > > Same comment as last patch. Yup yup. > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + return false; > > + } > > + > > + return true; > > +} > > I note there is no check on the length passed in for the inode > chunk - should that be verified? len is computed from constants, so in theory it should be safe since if the geometry is all messed up then we likely wouldn't have been able to mount. OTOH it's pretty cheap to put in, so sure. > > + > > +/* Count the number of free inodes. 
*/ > > +static unsigned int > > +xfs_scrub_iallocbt_freecount( > > + xfs_inofree_t freemask) > > +{ > > + int bits = XFS_INODES_PER_CHUNK; > > + unsigned int ret = 0; > > + > > + while (bits--) { > > + if (freemask & 1) > > + ret++; > > + freemask >>= 1; > > + } > > > Seems a little cumbersome. Perhaps a loop using xfs_next_bit() > might be a bit faster, something like: > > nextbit = xfs_next_bit(&freemask, 1, 0); > while (nextbit != -1) { > ret++; > nextbit = xfs_next_bit(&freemask, 1, nextbit + 1); > } <nod> A pity there's no popcnt()... > > +/* Check a particular inode with ir_free. */ > > +STATIC int > > +xfs_scrub_iallocbt_check_cluster_freemask( > > + struct xfs_scrub_btree *bs, > > + xfs_ino_t fsino, > > + xfs_agino_t chunkino, > > + xfs_agino_t clusterino, > > + struct xfs_inobt_rec_incore *irec, > > + struct xfs_buf *bp) > > +{ > > + struct xfs_dinode *dip; > > + struct xfs_mount *mp = bs->cur->bc_mp; > > + bool freemask_ok; > > + bool inuse; > > + int error; > > + > > + dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); > > + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || > > + (dip->di_version >= 3 && > > + be64_to_cpu(dip->di_ino) != fsino + clusterino)) { > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + goto out; > > + } > > + > > + freemask_ok = !!(irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)); > > No need for !!(...) for a bool type - the compiler will squash it > down to 0/1 autmoatically. <nod> > > + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, > > + fsino + clusterino, &inuse); > > + if (error == -ENODATA) { > > + /* Not cached, just read the disk buffer */ > > I think that is wrong. xfs_icache_inode_is_allocated() returns > -ENOENT if the inode is not in cache.... I changed it to ENODATA so that we can tell the difference between inode not in cache (ENODATA) and inode racing with unlink (ENOENT). (Patch was sent to the ML a while ago and I omitted it from this posting...) 
> > + freemask_ok ^= !!(dip->di_mode); > > + if (!bs->sc->try_harder && !freemask_ok) > > + return -EDEADLOCK; > > + } else if (error < 0) { > > + /* Inode is only half assembled, don't bother. */ > > + freemask_ok = true; > > Or we had an IO error looking it up. i.e. -EAGAIN is the "half > assembled" state (i.e. in the XFS_INEW state) or the half > *disasembled* state (i.e. XFS_IRECLAIMABLE), anything > else is an error... <nod> > > + } else { > > + /* Inode is all there. */ > > + freemask_ok ^= inuse; > > So inuse is returned from a mode check after iget succeeds. The mode > isn't zeroed until /after/ XFS_IRECLAIMABLE is set, but it's also > set before XFS_INEW is cleared. IOWs, how can > xfs_icache_inode_is_allocated() report anything > other than inuse == true here? If that's the case, what's the point > of the mode check inside xfs_icache_inode_is_allocated()? I think you're right about this, I'll have a look in the morning. > > + } > > + if (!freemask_ok) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > +out: > > + return 0; > > +} > > + > > +/* Make sure the free mask is consistent with what the inodes think. */ > > +STATIC int > > +xfs_scrub_iallocbt_check_freemask( > > + struct xfs_scrub_btree *bs, > > + struct xfs_inobt_rec_incore *irec) > > +{ > > + struct xfs_owner_info oinfo; > > + struct xfs_imap imap; > > + struct xfs_mount *mp = bs->cur->bc_mp; > > + struct xfs_dinode *dip; > > + struct xfs_buf *bp; > > + xfs_ino_t fsino; > > + xfs_agino_t nr_inodes; > > + xfs_agino_t agino; > > + xfs_agino_t chunkino; > > + xfs_agino_t clusterino; > > + xfs_agblock_t agbno; > > + int blks_per_cluster; > > + uint16_t holemask; > > + uint16_t ir_holemask; > > + int error = 0; > > + > > + /* Make sure the freemask matches the inode records. 
*/ > > + blks_per_cluster = xfs_icluster_size_fsb(mp); > > + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); > > Does this setup and loop work for the case where we have 64k > filesystem blocks and so two or more inode chunks per filesystem > block (i.e. ppc64)? I think the answer is yes, at worst case we end up processing a block's worth of inodes at a time. The last time I ran scrub on ppc64 (last week) it worked fine. > > +/* Scrub an inobt/finobt record. */ > > +STATIC int > > +xfs_scrub_iallocbt_helper( > > + struct xfs_scrub_btree *bs, > > + union xfs_btree_rec *rec) > > +{ > > + struct xfs_mount *mp = bs->cur->bc_mp; > > + struct xfs_agi *agi; > > + struct xfs_inobt_rec_incore irec; > > + uint64_t holes; > > + xfs_agino_t agino; > > + xfs_agblock_t agbno; > > + xfs_extlen_t len; > > + int holecount; > > + int i; > > + int error = 0; > > + unsigned int real_freecount; > > + uint16_t holemask; > > + > > + xfs_inobt_btrec_to_irec(mp, rec, &irec); > > + > > + if (irec.ir_count > XFS_INODES_PER_CHUNK || > > + irec.ir_freecount > XFS_INODES_PER_CHUNK) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + > > + real_freecount = irec.ir_freecount + > > + (XFS_INODES_PER_CHUNK - irec.ir_count); > > + if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free)) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + > > + agi = XFS_BUF_TO_AGI(bs->sc->sa.agi_bp); > > + agino = irec.ir_startino; > > + agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); > > + if (agbno >= be32_to_cpu(agi->agi_length)) { > > Validate as per every other agbno? <nod> > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + goto out; > > + } > > + > > + if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || > > + (agbno & (xfs_icluster_size_fsb(mp) - 1))) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > What's the magic masking checks being done here? (comment?) /* Make sure this record is aligned to cluster and inoalignmnt size. 
*/ > > + /* Handle non-sparse inodes */ > > + if (!xfs_inobt_issparse(irec.ir_holemask)) { > > + len = XFS_B_TO_FSB(mp, > > + XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); > > + if (irec.ir_count != XFS_INODES_PER_CHUNK) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + > > + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) > > + goto out; > > + goto check_freemask; > > + } > > + > > + /* Check each chunk of a sparse inode cluster. */ > > + holemask = irec.ir_holemask; > > + holecount = 0; > > + len = XFS_B_TO_FSB(mp, > > + XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); > > + holes = ~xfs_inobt_irec_to_allocmask(&irec); > > + if ((holes & irec.ir_free) != holes || > > + irec.ir_freecount > irec.ir_count) > > + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); > > + > > + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, > > + i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { > > Urk. THat's a bit hard to read. > > > + if (holemask & 1) { > > + holecount += XFS_INODES_PER_HOLEMASK_BIT; > > + continue; > > + } > > + > > + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) > > + break; > > + } > > How about > > for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { > if (holemask & 1) { > holecount += XFS_INODES_PER_HOLEMASK_BIT; > } else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) > break; > > holemask >>= 1; > agino += XFS_INODES_PER_HOLEMASK_BIT; > } Looks better than mine. :) --D > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Oct 04, 2017 at 10:47:14PM -0700, Darrick J. Wong wrote: > On Thu, Oct 05, 2017 at 01:08:10PM +1100, Dave Chinner wrote: > > On Tue, Oct 03, 2017 at 01:42:11PM -0700, Darrick J. Wong wrote: > > > +/* Count the number of free inodes. */ > > > +static unsigned int > > > +xfs_scrub_iallocbt_freecount( > > > + xfs_inofree_t freemask) > > > +{ > > > + int bits = XFS_INODES_PER_CHUNK; > > > + unsigned int ret = 0; > > > + > > > + while (bits--) { > > > + if (freemask & 1) > > > + ret++; > > > + freemask >>= 1; > > > + } > > > > > > Seems a little cumbersome. Perhaps a loop using xfs_next_bit() > > might be a bit faster, something like: > > > > nextbit = xfs_next_bit(&freemask, 1, 0); > > while (nextbit != -1) { > > ret++; > > nextbit = xfs_next_bit(&freemask, 1, nextbit + 1); > > } > > <nod> A pity there's no popcnt()... Oh, hweight64(). > > > + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, > > > + fsino + clusterino, &inuse); > > > + if (error == -ENODATA) { > > > + /* Not cached, just read the disk buffer */ > > > > I think that is wrong. xfs_icache_inode_is_allocated() returns > > -ENOENT if the inode is not in cache.... > > I changed it to ENODATA so that we can tell the difference between > inode not in cache (ENODATA) and inode racing with unlink (ENOENT). > > (Patch was sent to the ML a while ago and I omitted it from this posting...) Ah, if it's not committed upstream yet, include it in the next posting, please :) > > > + /* Make sure the freemask matches the inode records. */ > > > + blks_per_cluster = xfs_icluster_size_fsb(mp); > > > + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); > > > > Does this setup and loop work for the case where we have 64k > > filesystem blocks and so two or more inode chunks per filesystem > > block (i.e. ppc64)? > > I think the answer is yes, at worst case we end up processing a block's > worth of inodes at a time. The last time I ran scrub on ppc64 (last > week) it worked fine. 
Hmmm - there's nothing to count how many inodes are scrubbed, is there? Perhaps it would be good to count as we go so we know if we've scrubbed all the inodes? Hmmm - I might have missed it, but is there anywhere in this code where we check the inode number in the inode that we have read actually matches the agino we are attempting to validate? Cheers, Dave.
On Thu, Oct 05, 2017 at 06:22:45PM +1100, Dave Chinner wrote: > On Wed, Oct 04, 2017 at 10:47:14PM -0700, Darrick J. Wong wrote: > > On Thu, Oct 05, 2017 at 01:08:10PM +1100, Dave Chinner wrote: > > > On Tue, Oct 03, 2017 at 01:42:11PM -0700, Darrick J. Wong wrote: > > > > +/* Count the number of free inodes. */ > > > > +static unsigned int > > > > +xfs_scrub_iallocbt_freecount( > > > > + xfs_inofree_t freemask) > > > > +{ > > > > + int bits = XFS_INODES_PER_CHUNK; > > > > + unsigned int ret = 0; > > > > + > > > > + while (bits--) { > > > > + if (freemask & 1) > > > > + ret++; > > > > + freemask >>= 1; > > > > + } > > > > > > > > > Seems a little cumbersome. Perhaps a loop using xfs_next_bit() > > > might be a bit faster, something like: > > > > > > nextbit = xfs_next_bit(&freemask, 1, 0); > > > while (nextbit != -1) { > > > ret++; > > > nextbit = xfs_next_bit(&freemask, 1, nextbit + 1); > > > } > > > > <nod> A pity there's no popcnt()... > > Oh, hweight64(). > > > > > + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, > > > > + fsino + clusterino, &inuse); > > > > + if (error == -ENODATA) { > > > > + /* Not cached, just read the disk buffer */ > > > > > > I think that is wrong. xfs_icache_inode_is_allocated() returns > > > -ENOENT if the inode is not in cache.... > > > > I changed it to ENODATA so that we can tell the difference between > > inode not in cache (ENODATA) and inode racing with unlink (ENOENT). > > > > (Patch was sent to the ML a while ago and I omitted it from this posting...) > > Ah, if it's not committed upstream yet, include it in the next > posting, please :) > > > > > + /* Make sure the freemask matches the inode records. */ > > > > + blks_per_cluster = xfs_icluster_size_fsb(mp); > > > > + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); > > > > > > Does this setup and loop work for the case where we have 64k > > > filesystem blocks and so two or more inode chunks per filesystem > > > block (i.e. ppc64)? 
> > > > I think the answer is yes, at worst case we end up processing a block's > > worth of inodes at a time. The last time I ran scrub on ppc64 (last > > week) it worked fine. > > Hmmm - there's nothing to count how many inodes are scrubbed, is > there? Perhaps it would be good to gcount as we go so we know if > we've scrubbed all the inodes? Userspace will count the number of inodes scrubbed (since it's initiating all the scrub requests) and check it against statfs. > Hmmm - I might have missed it, but is there anywhere in this code > where we check the inode number in the inode that we have read > actually matches the agino we are attempting to validate? The dinode verifier checks di_ino, so if they don't match then xfs_iread -> xfs_iget return -EFSCORRUPTED. --D > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 84ac733..82326b7 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -150,6 +150,7 @@ xfs-y += $(addprefix scrub/, \ alloc.o \ btree.o \ common.o \ + ialloc.o \ scrub.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 23229f0..154c3dd 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -518,7 +518,7 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } -static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 1e23d13..74df6ec 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -490,9 +490,11 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */ #define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */ #define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */ +#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */ +#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 7 +#define XFS_SCRUB_TYPE_NR 9 /* i: Repair this metadata. 
*/ #define XFS_SCRUB_IFLAG_REPAIR (1 << 0) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 9a37e05..60b159a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -80,6 +80,8 @@ int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc, struct xfs_inode *ip); int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc, struct xfs_inode *ip); +int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc, + struct xfs_inode *ip); void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c new file mode 100644 index 0000000..db8404c --- /dev/null +++ b/fs/xfs/scrub/ialloc.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" + +/* + * Set us up to scrub inode btrees. + * If we detect a discrepancy between the inobt and the inode, + * try again after forcing logged inode cores out to disk. + */ +int +xfs_scrub_setup_ag_iallocbt( + struct xfs_scrub_context *sc, + struct xfs_inode *ip) +{ + return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder); +} + +/* Inode btree scrubber. */ + +/* Is this chunk worth checking? */ +STATIC bool +xfs_scrub_iallocbt_chunk( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + xfs_extlen_t len) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_agf *agf; + unsigned long long rec_end; + xfs_agblock_t eoag; + xfs_agblock_t bno; + + agf = XFS_BUF_TO_AGF(bs->sc->sa.agf_bp); + eoag = be32_to_cpu(agf->agf_length); + bno = XFS_AGINO_TO_AGBNO(mp, agino); + rec_end = (unsigned long long)bno + len; + + if (bno >= mp->m_sb.sb_agblocks || bno >= eoag || + rec_end > mp->m_sb.sb_agblocks || rec_end > eoag) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + return false; + } + + return true; +} + +/* Count the number of free inodes. */ +static unsigned int +xfs_scrub_iallocbt_freecount( + xfs_inofree_t freemask) +{ + int bits = XFS_INODES_PER_CHUNK; + unsigned int ret = 0; + + while (bits--) { + if (freemask & 1) + ret++; + freemask >>= 1; + } + + return ret; +} + +/* Check a particular inode with ir_free. 
*/ +STATIC int +xfs_scrub_iallocbt_check_cluster_freemask( + struct xfs_scrub_btree *bs, + xfs_ino_t fsino, + xfs_agino_t chunkino, + xfs_agino_t clusterino, + struct xfs_inobt_rec_incore *irec, + struct xfs_buf *bp) +{ + struct xfs_dinode *dip; + struct xfs_mount *mp = bs->cur->bc_mp; + bool freemask_ok; + bool inuse; + int error; + + dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || + (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + freemask_ok = !!(irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, + fsino + clusterino, &inuse); + if (error == -ENODATA) { + /* Not cached, just read the disk buffer */ + freemask_ok ^= !!(dip->di_mode); + if (!bs->sc->try_harder && !freemask_ok) + return -EDEADLOCK; + } else if (error < 0) { + /* Inode is only half assembled, don't bother. */ + freemask_ok = true; + } else { + /* Inode is all there. */ + freemask_ok ^= inuse; + } + if (!freemask_ok) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); +out: + return 0; +} + +/* Make sure the free mask is consistent with what the inodes think. */ +STATIC int +xfs_scrub_iallocbt_check_freemask( + struct xfs_scrub_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_owner_info oinfo; + struct xfs_imap imap; + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_dinode *dip; + struct xfs_buf *bp; + xfs_ino_t fsino; + xfs_agino_t nr_inodes; + xfs_agino_t agino; + xfs_agino_t chunkino; + xfs_agino_t clusterino; + xfs_agblock_t agbno; + int blks_per_cluster; + uint16_t holemask; + uint16_t ir_holemask; + int error = 0; + + /* Make sure the freemask matches the inode records. 
*/ + blks_per_cluster = xfs_icluster_size_fsb(mp); + nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + + for (agino = irec->ir_startino; + agino < irec->ir_startino + XFS_INODES_PER_CHUNK; + agino += blks_per_cluster * mp->m_sb.sb_inopblock) { + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + chunkino = agino - irec->ir_startino; + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + + /* Compute the holemask mask for this cluster. */ + for (clusterino = 0, holemask = 0; clusterino < nr_inodes; + clusterino += XFS_INODES_PER_HOLEMASK_BIT) + holemask |= XFS_INOBT_MASK((chunkino + clusterino) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* The whole cluster must be a hole or not a hole. */ + ir_holemask = (irec->ir_holemask & holemask); + if (ir_holemask != holemask && ir_holemask != 0) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) + continue; + + /* Grab the inode cluster buffer. */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, + agbno); + imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap.im_boffset = 0; + + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, + &dip, &bp, 0, 0); + if (!xfs_scrub_btree_op_ok(bs->sc, bs->cur, 0, &error)) + continue; + + /* Which inodes are free? */ + for (clusterino = 0; clusterino < nr_inodes; clusterino++) { + error = xfs_scrub_iallocbt_check_cluster_freemask(bs, + fsino, chunkino, clusterino, irec, bp); + if (error) { + xfs_trans_brelse(bs->cur->bc_tp, bp); + return error; + } + } + + xfs_trans_brelse(bs->cur->bc_tp, bp); + } + + return error; +} + +/* Scrub an inobt/finobt record. 
*/ +STATIC int +xfs_scrub_iallocbt_helper( + struct xfs_scrub_btree *bs, + union xfs_btree_rec *rec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_agi *agi; + struct xfs_inobt_rec_incore irec; + uint64_t holes; + xfs_agino_t agino; + xfs_agblock_t agbno; + xfs_extlen_t len; + int holecount; + int i; + int error = 0; + unsigned int real_freecount; + uint16_t holemask; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + + if (irec.ir_count > XFS_INODES_PER_CHUNK || + irec.ir_freecount > XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + real_freecount = irec.ir_freecount + + (XFS_INODES_PER_CHUNK - irec.ir_count); + if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free)) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + agi = XFS_BUF_TO_AGI(bs->sc->sa.agi_bp); + agino = irec.ir_startino; + agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); + if (agbno >= be32_to_cpu(agi->agi_length)) { + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + + if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || + (agbno & (xfs_icluster_size_fsb(mp) - 1))) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + /* Handle non-sparse inodes */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); + if (irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + goto out; + goto check_freemask; + } + + /* Check each chunk of a sparse inode cluster. 
*/ + holemask = irec.ir_holemask; + holecount = 0; + len = XFS_B_TO_FSB(mp, + XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); + holes = ~xfs_inobt_irec_to_allocmask(&irec); + if ((holes & irec.ir_free) != holes || + irec.ir_freecount > irec.ir_count) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + + for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, + i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { + if (holemask & 1) { + holecount += XFS_INODES_PER_HOLEMASK_BIT; + continue; + } + + if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len)) + break; + } + + if (holecount > XFS_INODES_PER_CHUNK || + holecount + irec.ir_count != XFS_INODES_PER_CHUNK) + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0); + +check_freemask: + error = xfs_scrub_iallocbt_check_freemask(bs, &irec); + if (error) + goto out; + +out: + return error; +} + +/* Scrub the inode btrees for some AG. */ +STATIC int +xfs_scrub_iallocbt( + struct xfs_scrub_context *sc, + xfs_btnum_t which) +{ + struct xfs_btree_cur *cur; + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); + cur = which == XFS_BTNUM_INO ? 
sc->sa.ino_cur : sc->sa.fino_cur; + return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_helper, + &oinfo, NULL); +} + +int +xfs_scrub_inobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_INO); +} + +int +xfs_scrub_finobt( + struct xfs_scrub_context *sc) +{ + return xfs_scrub_iallocbt(sc, XFS_BTNUM_FINO); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f543ce9..f209348 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -177,6 +177,15 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, }, + { /* inobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_inobt, + }, + { /* finobt */ + .setup = xfs_scrub_setup_ag_iallocbt, + .scrub = xfs_scrub_finobt, + .has = xfs_sb_version_hasfinobt, + }, }; /* This isn't a stable feature, warn once per day. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a4af99c..5d97453 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -73,5 +73,7 @@ int xfs_scrub_agfl(struct xfs_scrub_context *sc); int xfs_scrub_agi(struct xfs_scrub_context *sc); int xfs_scrub_bnobt(struct xfs_scrub_context *sc); int xfs_scrub_cntbt(struct xfs_scrub_context *sc); +int xfs_scrub_inobt(struct xfs_scrub_context *sc); +int xfs_scrub_finobt(struct xfs_scrub_context *sc); #endif /* __XFS_SCRUB_SCRUB_H__ */