Message ID | 153292970169.24509.4581630892233165448.stgit@magnolia |
---|---|
State | Superseded, archived |
Series | xfs-4.19: online repair support |
On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Rebuild the free space btrees from the gaps in the rmap btree. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- > fs/xfs/Makefile | 1 > fs/xfs/scrub/alloc.c | 1 > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/scrub/common.c | 8 + > fs/xfs/scrub/repair.h | 2 > fs/xfs/scrub/scrub.c | 4 > fs/xfs/scrub/trace.h | 2 > fs/xfs/xfs_extent_busy.c | 14 + > fs/xfs/xfs_extent_busy.h | 2 > 9 files changed, 610 insertions(+), 5 deletions(-) > create mode 100644 fs/xfs/scrub/alloc_repair.c > > ... > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > new file mode 100644 > index 000000000000..b228c2906de2 > --- /dev/null > +++ b/fs/xfs/scrub/alloc_repair.c > @@ -0,0 +1,581 @@ ... > +/* Record extents that aren't in use from gaps in the rmap records. */ > +STATIC int > +xrep_abt_walk_rmap( > + struct xfs_btree_cur *cur, > + struct xfs_rmap_irec *rec, > + void *priv) > +{ > + struct xrep_abt *ra = priv; > + struct xrep_abt_extent *rae; > + xfs_fsblock_t fsb; > + int error; > + > + /* Record all the OWN_AG blocks... */ > + if (rec->rm_owner == XFS_RMAP_OWN_AG) { > + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, > + rec->rm_startblock); > + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); > + if (error) > + return error; > + } > + > + /* ...and all the rmapbt blocks... */ > + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); > + if (error) > + return error; > + > + /* ...and all the free space. */ > + if (rec->rm_startblock > ra->next_bno) { > + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, > + ra->next_bno, rec->rm_startblock - ra->next_bno, > + XFS_RMAP_OWN_NULL, 0, 0); > + > + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); > + if (!rae) > + return -ENOMEM; > + INIT_LIST_HEAD(&rae->list); > + rae->bno = ra->next_bno; > + rae->len = rec->rm_startblock - ra->next_bno; > + list_add_tail(&rae->list, ra->extlist); Any reason we don't use a bitmap for this one? > + ra->nr_records++; > + ra->nr_blocks += rae->len; > + } > + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, > + rec->rm_startblock + rec->rm_blockcount); The max_t() is to cover the record overlap case, right? If so, another one liner comment would be good. > + return 0; > +} > + ... > +/* Free an extent, which creates a record in the bnobt/cntbt. */ > +STATIC int > +xrep_abt_free_extent( > + struct xfs_scrub *sc, > + xfs_fsblock_t fsbno, > + xfs_extlen_t len, > + struct xfs_owner_info *oinfo) > +{ > + int error; > + > + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); > + if (error) > + return error; > + error = xrep_roll_ag_trans(sc); > + if (error) > + return error; > + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); What's this call for? Is it because the blocks we're freeing were already free? (Similar question on the other xfs_mod_fdblocks() call further down). BTW, what prevents some other task from coming along and screwing with this? For example, could a large falloc or buffered write come in and allocate these global blocks before we take them away here (causing the whole sequence to fail)? > +} > + ... > +/* > + * Allocate a block from the (cached) first extent in the AG. In theory > + * this should never fail, since we already checked that there was enough > + * space to handle the new btrees. 
> + */ > +STATIC xfs_fsblock_t > +xrep_abt_alloc_block( > + struct xfs_scrub *sc, > + struct list_head *free_extents) > +{ > + struct xrep_abt_extent *ext; > + > + /* Pull the first free space extent off the list, and... */ > + ext = list_first_entry(free_extents, struct xrep_abt_extent, list); > + > + /* ...take its first block. */ > + ext->bno++; > + ext->len--; > + if (ext->len == 0) { > + list_del(&ext->list); > + kmem_free(ext); > + } > + > + return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1); Looks like a potential use after free of ext. > +} > + ... > +/* > + * Reset the global free block counter and the per-AG counters to make it look > + * like this AG has no free space. > + */ > +STATIC int > +xrep_abt_reset_counters( > + struct xfs_scrub *sc, > + int *log_flags) > +{ > + struct xfs_perag *pag = sc->sa.pag; > + struct xfs_agf *agf; > + xfs_agblock_t new_btblks; > + xfs_agblock_t to_free; > + int error; > + > + /* > + * Since we're abandoning the old bnobt/cntbt, we have to decrease > + * fdblocks by the # of blocks in those trees. btreeblks counts the > + * non-root blocks of the free space and rmap btrees. Do this before > + * resetting the AGF counters. > + */ Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks() below. to_free looks like it's the count of all current btree blocks minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating" those blocks here because we're going to free them later? > + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > + > + /* rmap_blocks accounts root block, btreeblks doesn't */ > + new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1; > + > + /* btreeblks doesn't account bno/cnt root blocks */ > + to_free = pag->pagf_btreeblks + 2; > + > + /* and don't account for the blocks we aren't freeing */ > + to_free -= new_btblks; > + > + error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false); > + if (error) > + return error; > + > + /* > + * Reset the per-AG info, both incore and ondisk. Mark the incore > + * state stale in case we fail out of here. > + */ > + ASSERT(pag->pagf_init); > + pag->pagf_init = 0; > + pag->pagf_btreeblks = new_btblks; > + pag->pagf_freeblks = 0; > + pag->pagf_longest = 0; > + > + agf->agf_btreeblks = cpu_to_be32(new_btblks); > + agf->agf_freeblks = 0; > + agf->agf_longest = 0; > + *log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS; > + > + return 0; > +} > + > +/* Initialize a new free space btree root and implant into AGF. */ > +STATIC int > +xrep_abt_reset_btree( > + struct xfs_scrub *sc, > + xfs_btnum_t btnum, > + struct list_head *free_extents) > +{ > + struct xfs_owner_info oinfo; > + struct xfs_buf *bp; > + struct xfs_perag *pag = sc->sa.pag; > + struct xfs_mount *mp = sc->mp; > + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > + xfs_fsblock_t fsbno; > + int error; > + > + /* Allocate new root block. */ > + fsbno = xrep_abt_alloc_block(sc, free_extents); xrep_abt_alloc_block() converts an agbno to return an fsb. This function passes the fsb to the init call just below and then converts it back to an agbno in two places. It seems like there might be less conversions to follow if the above just returned an agbno and we converted it to an fsb once for xrep_init_btblock(). > + if (fsbno == NULLFSBLOCK) > + return -ENOSPC; > + > + /* Initialize new tree root. */ > + error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops); > + if (error) > + return error; > + > + /* Implant into AGF. 
*/ > + agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno)); > + agf->agf_levels[btnum] = cpu_to_be32(1); > + > + /* Add rmap records for the btree roots */ > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, > + XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo); > + if (error) > + return error; > + > + /* Reset the incore state. */ > + pag->pagf_levels[btnum] = 1; > + > + return 0; > +} > + ... > + > +/* > + * Make our new freespace btree roots permanent so that we can start freeing > + * unused space back into the AG. > + */ > +STATIC int > +xrep_abt_commit_new( > + struct xfs_scrub *sc, > + struct xfs_bitmap *old_allocbt_blocks, > + int log_flags) > +{ > + int error; > + > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > + > + /* Invalidate the old freespace btree blocks and commit. */ > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > + if (error) > + return error; It looks like the above invalidation all happens in the same transaction. Those aren't logging buffer data or anything, but any idea how many log formats we can get away with in this single transaction? > + error = xrep_roll_ag_trans(sc); > + if (error) > + return error; > + > + /* Now that we've succeeded, mark the incore state valid again. */ > + sc->sa.pag->pagf_init = 1; > + return 0; > +} > + > +/* Build new free space btrees and dispose of the old one. */ > +STATIC int > +xrep_abt_rebuild_trees( > + struct xfs_scrub *sc, > + struct list_head *free_extents, > + struct xfs_bitmap *old_allocbt_blocks) > +{ > + struct xfs_owner_info oinfo; > + struct xrep_abt_extent *rae; > + struct xrep_abt_extent *n; > + struct xrep_abt_extent *longest; > + int error; > + > + xfs_rmap_skip_owner_update(&oinfo); > + > + /* > + * Insert the longest free extent in case it's necessary to > + * refresh the AGFL with multiple blocks. If there is no longest > + * extent, we had exactly the free space we needed; we're done. > + */ I'm confused by the last sentence. longest should only be NULL if the free space list is empty and haven't we already bailed out with -ENOSPC if that's the case? > + longest = xrep_abt_get_longest(free_extents); > + if (!longest) > + goto done; > + error = xrep_abt_free_extent(sc, > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > + longest->len, &oinfo); > + list_del(&longest->list); > + kmem_free(longest); > + if (error) > + return error; > + > + /* Insert records into the new btrees. */ > + list_for_each_entry_safe(rae, n, free_extents, list) { > + error = xrep_abt_free_extent(sc, > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > + rae->len, &oinfo); > + if (error) > + return error; > + list_del(&rae->list); > + kmem_free(rae); > + } Ok, at this point we've reset the btree roots and we start freeing the free ranges that were discovered via the rmapbt analysis. AFAICT, if we fail or crash at this point, we leave the allocbts in a partially constructed state. I take it that is Ok with respect to the broader repair algorithm because we'd essentially start over by inspecting the rmapbt again on a retry. The blocks allocated for the btrees that we've begun to construct here end up mapped in the rmapbt as we go, right? IIUC, that means we don't necessarily have infinite retries to make sure this completes. IOW, suppose that a first repair attempt finds just enough free space to construct new trees, gets far enough along to consume most of that free space and then crashes. 
Is it possible that a subsequent repair attempt includes the btree blocks allocated during the previous failed repair attempt in the sum of "old btree blocks" and determines we don't have enough free space to repair? > + > +done: > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > + XFS_AG_RESV_NONE); > +} > + ... > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > index 0ed68379e551..82f99633a597 100644 > --- a/fs/xfs/xfs_extent_busy.c > +++ b/fs/xfs/xfs_extent_busy.c > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > diff = b1->bno - b2->bno; > return diff; > } > + > +/* Are there any busy extents in this AG? */ > +bool > +xfs_extent_busy_list_empty( > + struct xfs_perag *pag) > +{ > + spin_lock(&pag->pagb_lock); > + if (pag->pagb_tree.rb_node) { RB_EMPTY_ROOT()? Brian > + spin_unlock(&pag->pagb_lock); > + return false; > + } > + spin_unlock(&pag->pagb_lock); > + return true; > +} > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > index 990ab3891971..2f8c73c712c6 100644 > --- a/fs/xfs/xfs_extent_busy.h > +++ b/fs/xfs/xfs_extent_busy.h > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > } > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > + > #endif /* __XFS_EXTENT_BUSY_H__ */ > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
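For reference, a minimal sketch of the busy-extent helper with the RB_EMPTY_ROOT() suggestion applied. It only reuses the pagb_lock/pagb_tree fields from the hunk quoted above and is illustrative, not the posted code:

/* Are there any busy extents in this AG? */
bool
xfs_extent_busy_list_empty(
	struct xfs_perag	*pag)
{
	bool			res;

	spin_lock(&pag->pagb_lock);
	res = RB_EMPTY_ROOT(&pag->pagb_tree);
	spin_unlock(&pag->pagb_lock);
	return res;
}

Besides being shorter, this keeps a single unlock path, which was the other minor wart in the version in the patch.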
On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > fs/xfs/Makefile | 1 > > fs/xfs/scrub/alloc.c | 1 > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > fs/xfs/scrub/common.c | 8 + > > fs/xfs/scrub/repair.h | 2 > > fs/xfs/scrub/scrub.c | 4 > > fs/xfs/scrub/trace.h | 2 > > fs/xfs/xfs_extent_busy.c | 14 + > > fs/xfs/xfs_extent_busy.h | 2 > > 9 files changed, 610 insertions(+), 5 deletions(-) > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > ... > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > new file mode 100644 > > index 000000000000..b228c2906de2 > > --- /dev/null > > +++ b/fs/xfs/scrub/alloc_repair.c > > @@ -0,0 +1,581 @@ > ... > > +/* Record extents that aren't in use from gaps in the rmap records. */ > > +STATIC int > > +xrep_abt_walk_rmap( > > + struct xfs_btree_cur *cur, > > + struct xfs_rmap_irec *rec, > > + void *priv) > > +{ > > + struct xrep_abt *ra = priv; > > + struct xrep_abt_extent *rae; > > + xfs_fsblock_t fsb; > > + int error; > > + > > + /* Record all the OWN_AG blocks... */ > > + if (rec->rm_owner == XFS_RMAP_OWN_AG) { > > + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, > > + rec->rm_startblock); > > + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); > > + if (error) > > + return error; > > + } > > + > > + /* ...and all the rmapbt blocks... */ > > + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); > > + if (error) > > + return error; > > + > > + /* ...and all the free space. */ > > + if (rec->rm_startblock > ra->next_bno) { > > + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, > > + ra->next_bno, rec->rm_startblock - ra->next_bno, > > + XFS_RMAP_OWN_NULL, 0, 0); > > + > > + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); > > + if (!rae) > > + return -ENOMEM; > > + INIT_LIST_HEAD(&rae->list); > > + rae->bno = ra->next_bno; > > + rae->len = rec->rm_startblock - ra->next_bno; > > + list_add_tail(&rae->list, ra->extlist); > > Any reason we don't use a bitmap for this one? > > > + ra->nr_records++; > > + ra->nr_blocks += rae->len; > > + } > > + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, > > + rec->rm_startblock + rec->rm_blockcount); > > The max_t() is to cover the record overlap case, right? If so, another > one liner comment would be good. Right. Will add a comment. > > + return 0; > > +} > > + > ... > > +/* Free an extent, which creates a record in the bnobt/cntbt. */ > > +STATIC int > > +xrep_abt_free_extent( > > + struct xfs_scrub *sc, > > + xfs_fsblock_t fsbno, > > + xfs_extlen_t len, > > + struct xfs_owner_info *oinfo) > > +{ > > + int error; > > + > > + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); > > + if (error) > > + return error; > > + error = xrep_roll_ag_trans(sc); > > + if (error) > > + return error; > > + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); > > What's this call for? Is it because the blocks we're freeing were > already free? (Similar question on the other xfs_mod_fdblocks() call > further down). Yes. The goal here is to free the (already free) extent with no net change in fdblocks... > BTW, what prevents some other task from coming along and screwing with > this? 
For example, could a large falloc or buffered write come in and > allocate these global blocks before we take them away here (causing the > whole sequence to fail)? ...but you're right that here is a window of opportunity for someone to swoop in and reserve the blocks while we still have the AGF locked, which means that we'll fail here even though that other process will never get the space. Thinking about this a bit more, what we really want to do is to skip the xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the record into the bno/cntbt. Hm. If a record insertion requires an expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we separately force those to be accounted to XFS_AG_RESV_AGFL. Therefore, we could make a "fake" per-AG reservation type that would skip the fdblocks update. That avoids the problem where we commit the free space record but someone else reserves all the free space and then we blow out with ENOSPC and a half-rebuilt bnobt. For the second case (which I assume is xrep_abt_reset_counters?) I'll respond below. > > +} > > + > ... > > +/* > > + * Allocate a block from the (cached) first extent in the AG. In theory > > + * this should never fail, since we already checked that there was enough > > + * space to handle the new btrees. > > + */ > > +STATIC xfs_fsblock_t > > +xrep_abt_alloc_block( > > + struct xfs_scrub *sc, > > + struct list_head *free_extents) > > +{ > > + struct xrep_abt_extent *ext; > > + > > + /* Pull the first free space extent off the list, and... */ > > + ext = list_first_entry(free_extents, struct xrep_abt_extent, list); Missing a if (!ext) return NULLFSBLOCK; here for some reason... > > + /* ...take its first block. */ > > + ext->bno++; > > + ext->len--; > > + if (ext->len == 0) { > > + list_del(&ext->list); > > + kmem_free(ext); > > + } > > + > > + return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1); > > Looks like a potential use after free of ext. Oops, good catch! I'll add a temporary variable to hold the value for the return. > > +} > > + > ... > > +/* > > + * Reset the global free block counter and the per-AG counters to make it look > > + * like this AG has no free space. > > + */ > > +STATIC int > > +xrep_abt_reset_counters( > > + struct xfs_scrub *sc, > > + int *log_flags) > > +{ > > + struct xfs_perag *pag = sc->sa.pag; > > + struct xfs_agf *agf; > > + xfs_agblock_t new_btblks; > > + xfs_agblock_t to_free; > > + int error; > > + > > + /* > > + * Since we're abandoning the old bnobt/cntbt, we have to decrease > > + * fdblocks by the # of blocks in those trees. btreeblks counts the > > + * non-root blocks of the free space and rmap btrees. Do this before > > + * resetting the AGF counters. > > + */ > > Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks() > below. to_free looks like it's the count of all current btree blocks > minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating" > those blocks here because we're going to free them later? Yes. Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can skip the racy mod_fdblocks thing here too. 
> > + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > > + > > + /* rmap_blocks accounts root block, btreeblks doesn't */ > > + new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1; > > + > > + /* btreeblks doesn't account bno/cnt root blocks */ > > + to_free = pag->pagf_btreeblks + 2; > > + > > + /* and don't account for the blocks we aren't freeing */ > > + to_free -= new_btblks; > > + > > + error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false); > > + if (error) > > + return error; > > + > > + /* > > + * Reset the per-AG info, both incore and ondisk. Mark the incore > > + * state stale in case we fail out of here. > > + */ > > + ASSERT(pag->pagf_init); > > + pag->pagf_init = 0; > > + pag->pagf_btreeblks = new_btblks; > > + pag->pagf_freeblks = 0; > > + pag->pagf_longest = 0; > > + > > + agf->agf_btreeblks = cpu_to_be32(new_btblks); > > + agf->agf_freeblks = 0; > > + agf->agf_longest = 0; > > + *log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS; > > + > > + return 0; > > +} > > + > > +/* Initialize a new free space btree root and implant into AGF. */ > > +STATIC int > > +xrep_abt_reset_btree( > > + struct xfs_scrub *sc, > > + xfs_btnum_t btnum, > > + struct list_head *free_extents) > > +{ > > + struct xfs_owner_info oinfo; > > + struct xfs_buf *bp; > > + struct xfs_perag *pag = sc->sa.pag; > > + struct xfs_mount *mp = sc->mp; > > + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > > + xfs_fsblock_t fsbno; > > + int error; > > + > > + /* Allocate new root block. */ > > + fsbno = xrep_abt_alloc_block(sc, free_extents); > > xrep_abt_alloc_block() converts an agbno to return an fsb. This function > passes the fsb to the init call just below and then converts it back to > an agbno in two places. It seems like there might be less conversions to > follow if the above just returned an agbno and we converted it to an fsb > once for xrep_init_btblock(). Yep, will fix. > > + if (fsbno == NULLFSBLOCK) > > + return -ENOSPC; > > + > > + /* Initialize new tree root. */ > > + error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops); > > + if (error) > > + return error; > > + > > + /* Implant into AGF. */ > > + agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno)); > > + agf->agf_levels[btnum] = cpu_to_be32(1); > > + > > + /* Add rmap records for the btree roots */ > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, > > + XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo); > > + if (error) > > + return error; > > + > > + /* Reset the incore state. */ > > + pag->pagf_levels[btnum] = 1; > > + > > + return 0; > > +} > > + > ... > > + > > +/* > > + * Make our new freespace btree roots permanent so that we can start freeing > > + * unused space back into the AG. > > + */ > > +STATIC int > > +xrep_abt_commit_new( > > + struct xfs_scrub *sc, > > + struct xfs_bitmap *old_allocbt_blocks, > > + int log_flags) > > +{ > > + int error; > > + > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > + > > + /* Invalidate the old freespace btree blocks and commit. */ > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > + if (error) > > + return error; > > It looks like the above invalidation all happens in the same > transaction. Those aren't logging buffer data or anything, but any idea > how many log formats we can get away with in this single transaction? Hm... well, on my computer a log format is ~88 bytes. 
Assuming 4K blocks, the max AG size of 1TB, maximum free space fragmentation, and two btrees, the tree could be up to ~270 million records. Assuming ~505 records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks for both btrees. If we invalidate both, that's ~46M of RAM? > > + error = xrep_roll_ag_trans(sc); > > + if (error) > > + return error; > > + > > + /* Now that we've succeeded, mark the incore state valid again. */ > > + sc->sa.pag->pagf_init = 1; > > + return 0; > > +} > > + > > +/* Build new free space btrees and dispose of the old one. */ > > +STATIC int > > +xrep_abt_rebuild_trees( > > + struct xfs_scrub *sc, > > + struct list_head *free_extents, > > + struct xfs_bitmap *old_allocbt_blocks) > > +{ > > + struct xfs_owner_info oinfo; > > + struct xrep_abt_extent *rae; > > + struct xrep_abt_extent *n; > > + struct xrep_abt_extent *longest; > > + int error; > > + > > + xfs_rmap_skip_owner_update(&oinfo); > > + > > + /* > > + * Insert the longest free extent in case it's necessary to > > + * refresh the AGFL with multiple blocks. If there is no longest > > + * extent, we had exactly the free space we needed; we're done. > > + */ > > I'm confused by the last sentence. longest should only be NULL if the > free space list is empty and haven't we already bailed out with -ENOSPC > if that's the case? > > > + longest = xrep_abt_get_longest(free_extents); xrep_abt_rebuild_trees is called after we allocate and initialize two new btree roots in xrep_abt_reset_btrees. If free_extents is an empty list here, then we found exactly two blocks worth of free space and used them to set up new btree roots. > > + if (!longest) > > + goto done; > > + error = xrep_abt_free_extent(sc, > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > + longest->len, &oinfo); > > + list_del(&longest->list); > > + kmem_free(longest); > > + if (error) > > + return error; > > + > > + /* Insert records into the new btrees. */ > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > + error = xrep_abt_free_extent(sc, > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > + rae->len, &oinfo); > > + if (error) > > + return error; > > + list_del(&rae->list); > > + kmem_free(rae); > > + } > > Ok, at this point we've reset the btree roots and we start freeing the > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > fail or crash at this point, we leave the allocbts in a partially > constructed state. I take it that is Ok with respect to the broader > repair algorithm because we'd essentially start over by inspecting the > rmapbt again on a retry. Right. Though in the crash/shutdown case, you'll end up with the filesystem in an offline state at some point before you can retry the scrub, it's probably faster to run xfs_repair to fix the damage. > The blocks allocated for the btrees that we've begun to construct here > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > necessarily have infinite retries to make sure this completes. IOW, > suppose that a first repair attempt finds just enough free space to > construct new trees, gets far enough along to consume most of that free > space and then crashes. Is it possible that a subsequent repair attempt > includes the btree blocks allocated during the previous failed repair > attempt in the sum of "old btree blocks" and determines we don't have > enough free space to repair? Yes, that's a risk of running the free space repair. 
> > + > > +done: > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > + XFS_AG_RESV_NONE); > > +} > > + > ... > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > index 0ed68379e551..82f99633a597 100644 > > --- a/fs/xfs/xfs_extent_busy.c > > +++ b/fs/xfs/xfs_extent_busy.c > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > diff = b1->bno - b2->bno; > > return diff; > > } > > + > > +/* Are there any busy extents in this AG? */ > > +bool > > +xfs_extent_busy_list_empty( > > + struct xfs_perag *pag) > > +{ > > + spin_lock(&pag->pagb_lock); > > + if (pag->pagb_tree.rb_node) { > > RB_EMPTY_ROOT()? Good suggestion, thank you! --D > Brian > > > + spin_unlock(&pag->pagb_lock); > > + return false; > > + } > > + spin_unlock(&pag->pagb_lock); > > + return true; > > +} > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > index 990ab3891971..2f8c73c712c6 100644 > > --- a/fs/xfs/xfs_extent_busy.h > > +++ b/fs/xfs/xfs_extent_busy.h > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > } > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > + > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
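Putting together the two fixes acknowledged above (the missing empty-list guard and the use-after-free of ext), a sketch of how xrep_abt_alloc_block() might end up looking. Note that list_first_entry() never returns NULL, so the guard has to test the list itself; the local agbno variable is the "temporary variable" mentioned above and is otherwise invented for the sketch:

/*
 * Allocate a block from the (cached) first extent in the AG.  In theory
 * this should never fail, since we already checked that there was enough
 * space to handle the new btrees.
 */
STATIC xfs_fsblock_t
xrep_abt_alloc_block(
	struct xfs_scrub	*sc,
	struct list_head	*free_extents)
{
	struct xrep_abt_extent	*ext;
	xfs_agblock_t		agbno;

	if (list_empty(free_extents))
		return NULLFSBLOCK;

	/* Pull the first free space extent off the list, and... */
	ext = list_first_entry(free_extents, struct xrep_abt_extent, list);

	/* ...take its first block, remembering it before ext can go away. */
	agbno = ext->bno;
	ext->bno++;
	ext->len--;
	if (ext->len == 0) {
		list_del(&ext->list);
		kmem_free(ext);
	}

	return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, agbno);
}

Brian's separate point about returning an agbno instead of an fsbno would change the prototype as well; the sketch keeps the posted signature so it can be compared hunk for hunk.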
On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > --- > > > fs/xfs/Makefile | 1 > > > fs/xfs/scrub/alloc.c | 1 > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > fs/xfs/scrub/common.c | 8 + > > > fs/xfs/scrub/repair.h | 2 > > > fs/xfs/scrub/scrub.c | 4 > > > fs/xfs/scrub/trace.h | 2 > > > fs/xfs/xfs_extent_busy.c | 14 + > > > fs/xfs/xfs_extent_busy.h | 2 > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > ... > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > new file mode 100644 > > > index 000000000000..b228c2906de2 > > > --- /dev/null > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > @@ -0,0 +1,581 @@ > > ... > > > +/* Record extents that aren't in use from gaps in the rmap records. */ > > > +STATIC int > > > +xrep_abt_walk_rmap( > > > + struct xfs_btree_cur *cur, > > > + struct xfs_rmap_irec *rec, > > > + void *priv) > > > +{ > > > + struct xrep_abt *ra = priv; > > > + struct xrep_abt_extent *rae; > > > + xfs_fsblock_t fsb; > > > + int error; > > > + > > > + /* Record all the OWN_AG blocks... */ > > > + if (rec->rm_owner == XFS_RMAP_OWN_AG) { > > > + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, > > > + rec->rm_startblock); > > > + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); > > > + if (error) > > > + return error; > > > + } > > > + > > > + /* ...and all the rmapbt blocks... */ > > > + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); > > > + if (error) > > > + return error; > > > + > > > + /* ...and all the free space. */ > > > + if (rec->rm_startblock > ra->next_bno) { > > > + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, > > > + ra->next_bno, rec->rm_startblock - ra->next_bno, > > > + XFS_RMAP_OWN_NULL, 0, 0); > > > + > > > + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); > > > + if (!rae) > > > + return -ENOMEM; > > > + INIT_LIST_HEAD(&rae->list); > > > + rae->bno = ra->next_bno; > > > + rae->len = rec->rm_startblock - ra->next_bno; > > > + list_add_tail(&rae->list, ra->extlist); > > > > Any reason we don't use a bitmap for this one? > > ?? > > > + ra->nr_records++; > > > + ra->nr_blocks += rae->len; > > > + } > > > + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, > > > + rec->rm_startblock + rec->rm_blockcount); > > > > The max_t() is to cover the record overlap case, right? If so, another > > one liner comment would be good. > > Right. Will add a comment. > > > > + return 0; > > > +} > > > + > > ... > > > +/* Free an extent, which creates a record in the bnobt/cntbt. */ > > > +STATIC int > > > +xrep_abt_free_extent( > > > + struct xfs_scrub *sc, > > > + xfs_fsblock_t fsbno, > > > + xfs_extlen_t len, > > > + struct xfs_owner_info *oinfo) > > > +{ > > > + int error; > > > + > > > + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); > > > + if (error) > > > + return error; > > > + error = xrep_roll_ag_trans(sc); > > > + if (error) > > > + return error; > > > + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); > > > > What's this call for? Is it because the blocks we're freeing were > > already free? 
(Similar question on the other xfs_mod_fdblocks() call > > further down). > > Yes. The goal here is to free the (already free) extent with no net > change in fdblocks... > > > BTW, what prevents some other task from coming along and screwing with > > this? For example, could a large falloc or buffered write come in and > > allocate these global blocks before we take them away here (causing the > > whole sequence to fail)? > > ...but you're right that here is a window of opportunity for someone to > swoop in and reserve the blocks while we still have the AGF locked, > which means that we'll fail here even though that other process will > never get the space. > > Thinking about this a bit more, what we really want to do is to skip the > xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the > record into the bno/cntbt. Hm. If a record insertion requires an > expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we > separately force those to be accounted to XFS_AG_RESV_AGFL. Therefore, > we could make a "fake" per-AG reservation type that would skip the > fdblocks update. That avoids the problem where we commit the free space > record but someone else reserves all the free space and then we blow out > with ENOSPC and a half-rebuilt bnobt. > Ok, that sounds a bit more straightforward to me. > For the second case (which I assume is xrep_abt_reset_counters?) I'll > respond below. > > > > +} > > > + ... > > > +/* > > > + * Reset the global free block counter and the per-AG counters to make it look > > > + * like this AG has no free space. > > > + */ > > > +STATIC int > > > +xrep_abt_reset_counters( > > > + struct xfs_scrub *sc, > > > + int *log_flags) > > > +{ > > > + struct xfs_perag *pag = sc->sa.pag; > > > + struct xfs_agf *agf; > > > + xfs_agblock_t new_btblks; > > > + xfs_agblock_t to_free; > > > + int error; > > > + > > > + /* > > > + * Since we're abandoning the old bnobt/cntbt, we have to decrease > > > + * fdblocks by the # of blocks in those trees. btreeblks counts the > > > + * non-root blocks of the free space and rmap btrees. Do this before > > > + * resetting the AGF counters. > > > + */ > > > > Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks() > > below. to_free looks like it's the count of all current btree blocks > > minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating" > > those blocks here because we're going to free them later? > > Yes. Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just > pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can > skip the racy mod_fdblocks thing here too. > I think I'll ultimately need to see the code to make sure I follow the ignore thing correctly, but that overall sounds better to me. If we do retain these kind of calls to undo/work-around underlying infrastructure, I think we need a bit more specific comments that describe precisely what behavior the call is offsetting. 
> > > + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > > > + > > > + /* rmap_blocks accounts root block, btreeblks doesn't */ > > > + new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1; > > > + > > > + /* btreeblks doesn't account bno/cnt root blocks */ > > > + to_free = pag->pagf_btreeblks + 2; > > > + > > > + /* and don't account for the blocks we aren't freeing */ > > > + to_free -= new_btblks; > > > + > > > + error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false); > > > + if (error) > > > + return error; > > > + > > > + /* > > > + * Reset the per-AG info, both incore and ondisk. Mark the incore > > > + * state stale in case we fail out of here. > > > + */ > > > + ASSERT(pag->pagf_init); > > > + pag->pagf_init = 0; > > > + pag->pagf_btreeblks = new_btblks; > > > + pag->pagf_freeblks = 0; > > > + pag->pagf_longest = 0; > > > + > > > + agf->agf_btreeblks = cpu_to_be32(new_btblks); > > > + agf->agf_freeblks = 0; > > > + agf->agf_longest = 0; > > > + *log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS; > > > + > > > + return 0; > > > +} > > > + ... > > > + > > > +/* > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > + * unused space back into the AG. > > > + */ > > > +STATIC int > > > +xrep_abt_commit_new( > > > + struct xfs_scrub *sc, > > > + struct xfs_bitmap *old_allocbt_blocks, > > > + int log_flags) > > > +{ > > > + int error; > > > + > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > + > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > + if (error) > > > + return error; > > > > It looks like the above invalidation all happens in the same > > transaction. Those aren't logging buffer data or anything, but any idea > > how many log formats we can get away with in this single transaction? > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > blocks, the max AG size of 1TB, maximum free space fragmentation, and > two btrees, the tree could be up to ~270 million records. Assuming ~505 > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > for both btrees. If we invalidate both, that's ~46M of RAM? > I was thinking more about transaction reservation than RAM. It may not currently be an issue, but it might be worth putting something down in a comment to note that this is a single transaction and we expect to not have to invalidate more than N (ballpark) blocks in a single go, whatever that value happens to be. > > > + error = xrep_roll_ag_trans(sc); > > > + if (error) > > > + return error; > > > + > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > + sc->sa.pag->pagf_init = 1; > > > + return 0; > > > +} > > > + > > > +/* Build new free space btrees and dispose of the old one. */ > > > +STATIC int > > > +xrep_abt_rebuild_trees( > > > + struct xfs_scrub *sc, > > > + struct list_head *free_extents, > > > + struct xfs_bitmap *old_allocbt_blocks) > > > +{ > > > + struct xfs_owner_info oinfo; > > > + struct xrep_abt_extent *rae; > > > + struct xrep_abt_extent *n; > > > + struct xrep_abt_extent *longest; > > > + int error; > > > + > > > + xfs_rmap_skip_owner_update(&oinfo); > > > + > > > + /* > > > + * Insert the longest free extent in case it's necessary to > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > + * extent, we had exactly the free space we needed; we're done. > > > + */ > > > > I'm confused by the last sentence. 
longest should only be NULL if the > > free space list is empty and haven't we already bailed out with -ENOSPC > > if that's the case? > > > > > + longest = xrep_abt_get_longest(free_extents); > > xrep_abt_rebuild_trees is called after we allocate and initialize two > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > list here, then we found exactly two blocks worth of free space and used > them to set up new btree roots. > Got it, thanks. > > > + if (!longest) > > > + goto done; > > > + error = xrep_abt_free_extent(sc, > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > + longest->len, &oinfo); > > > + list_del(&longest->list); > > > + kmem_free(longest); > > > + if (error) > > > + return error; > > > + > > > + /* Insert records into the new btrees. */ > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > + error = xrep_abt_free_extent(sc, > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > + rae->len, &oinfo); > > > + if (error) > > > + return error; > > > + list_del(&rae->list); > > > + kmem_free(rae); > > > + } > > > > Ok, at this point we've reset the btree roots and we start freeing the > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > fail or crash at this point, we leave the allocbts in a partially > > constructed state. I take it that is Ok with respect to the broader > > repair algorithm because we'd essentially start over by inspecting the > > rmapbt again on a retry. > > Right. Though in the crash/shutdown case, you'll end up with the > filesystem in an offline state at some point before you can retry the > scrub, it's probably faster to run xfs_repair to fix the damage. > Can we really assume that if we're already up and running an online repair? The filesystem has to be mountable in that case in the first place. If we've already reset and started reconstructing the allocation btrees then I'd think those transactions would recover just fine on a power loss or something (perhaps not in the event of some other corruption related shutdown). > > The blocks allocated for the btrees that we've begun to construct here > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > necessarily have infinite retries to make sure this completes. IOW, > > suppose that a first repair attempt finds just enough free space to > > construct new trees, gets far enough along to consume most of that free > > space and then crashes. Is it possible that a subsequent repair attempt > > includes the btree blocks allocated during the previous failed repair > > attempt in the sum of "old btree blocks" and determines we don't have > > enough free space to repair? > > Yes, that's a risk of running the free space repair. > Can we improve on that? For example, are the rmapbt entries for the old allocation btree blocks necessary once we commit the btree resets? If not, could we remove those entries before we start tree reconstruction? Alternatively, could we incorporate use of the old btree blocks? As it is, we discover those blocks simply so we can free them at the end. Perhaps we could free them sooner or find a more clever means to reallocate directly from that in-core list? I guess we have to consider whether they were really valid/sane btree blocks, but either way ISTM that the old blocks list is essentially invalidated once we reset the btrees. Brian > > > + > > > +done: > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. 
*/ > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > + XFS_AG_RESV_NONE); > > > +} > > > + > > ... > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > index 0ed68379e551..82f99633a597 100644 > > > --- a/fs/xfs/xfs_extent_busy.c > > > +++ b/fs/xfs/xfs_extent_busy.c > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > diff = b1->bno - b2->bno; > > > return diff; > > > } > > > + > > > +/* Are there any busy extents in this AG? */ > > > +bool > > > +xfs_extent_busy_list_empty( > > > + struct xfs_perag *pag) > > > +{ > > > + spin_lock(&pag->pagb_lock); > > > + if (pag->pagb_tree.rb_node) { > > > > RB_EMPTY_ROOT()? > > Good suggestion, thank you! > > --D > > > Brian > > > > > + spin_unlock(&pag->pagb_lock); > > > + return false; > > > + } > > > + spin_unlock(&pag->pagb_lock); > > > + return true; > > > +} > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > index 990ab3891971..2f8c73c712c6 100644 > > > --- a/fs/xfs/xfs_extent_busy.h > > > +++ b/fs/xfs/xfs_extent_busy.h > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > } > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > + > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
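To make the "fake reservation type" idea concrete, here is a rough sketch of what xrep_abt_free_extent() could collapse to if a hypothetical XFS_AG_RESV_IGNORE (the name floated above; it does not exist in the tree at this point) taught the extent-freeing path to skip the xfs_trans_mod_sb() counter update after xfs_free_ag_extent() inserts the record:

/* Free an extent, which creates a record in the bnobt/cntbt. */
STATIC int
xrep_abt_free_extent(
	struct xfs_scrub	*sc,
	xfs_fsblock_t		fsbno,
	xfs_extlen_t		len,
	struct xfs_owner_info	*oinfo)
{
	int			error;

	/* Insert the bnobt/cntbt records without touching fdblocks. */
	error = xfs_free_extent(sc->tp, fsbno, len, oinfo,
			XFS_AG_RESV_IGNORE);
	if (error)
		return error;
	return xrep_roll_ag_trans(sc);
}

The win is that the free space record and the counter handling are decided entirely inside the AGF-locked transaction, so there is no window in which another thread can reserve the blocks and leave repair failing on a counter it never needed to touch.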
On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > --- > > > > fs/xfs/Makefile | 1 > > > > fs/xfs/scrub/alloc.c | 1 > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > fs/xfs/scrub/common.c | 8 + > > > > fs/xfs/scrub/repair.h | 2 > > > > fs/xfs/scrub/scrub.c | 4 > > > > fs/xfs/scrub/trace.h | 2 > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > ... > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > new file mode 100644 > > > > index 000000000000..b228c2906de2 > > > > --- /dev/null > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > @@ -0,0 +1,581 @@ > > > ... > > > > +/* Record extents that aren't in use from gaps in the rmap records. */ > > > > +STATIC int > > > > +xrep_abt_walk_rmap( > > > > + struct xfs_btree_cur *cur, > > > > + struct xfs_rmap_irec *rec, > > > > + void *priv) > > > > +{ > > > > + struct xrep_abt *ra = priv; > > > > + struct xrep_abt_extent *rae; > > > > + xfs_fsblock_t fsb; > > > > + int error; > > > > + > > > > + /* Record all the OWN_AG blocks... */ > > > > + if (rec->rm_owner == XFS_RMAP_OWN_AG) { > > > > + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, > > > > + rec->rm_startblock); > > > > + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); > > > > + if (error) > > > > + return error; > > > > + } > > > > + > > > > + /* ...and all the rmapbt blocks... */ > > > > + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* ...and all the free space. */ > > > > + if (rec->rm_startblock > ra->next_bno) { > > > > + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, > > > > + ra->next_bno, rec->rm_startblock - ra->next_bno, > > > > + XFS_RMAP_OWN_NULL, 0, 0); > > > > + > > > > + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); > > > > + if (!rae) > > > > + return -ENOMEM; > > > > + INIT_LIST_HEAD(&rae->list); > > > > + rae->bno = ra->next_bno; > > > > + rae->len = rec->rm_startblock - ra->next_bno; > > > > + list_add_tail(&rae->list, ra->extlist); > > > > > > Any reason we don't use a bitmap for this one? > > > > > ?? Yes, I could probably do that, let's see if it works... > > > > + ra->nr_records++; > > > > + ra->nr_blocks += rae->len; > > > > + } > > > > + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, > > > > + rec->rm_startblock + rec->rm_blockcount); > > > > > > The max_t() is to cover the record overlap case, right? If so, another > > > one liner comment would be good. > > > > Right. Will add a comment. > > > > > > + return 0; > > > > +} > > > > + > > > ... > > > > +/* Free an extent, which creates a record in the bnobt/cntbt. 
*/ > > > > +STATIC int > > > > +xrep_abt_free_extent( > > > > + struct xfs_scrub *sc, > > > > + xfs_fsblock_t fsbno, > > > > + xfs_extlen_t len, > > > > + struct xfs_owner_info *oinfo) > > > > +{ > > > > + int error; > > > > + > > > > + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); > > > > + if (error) > > > > + return error; > > > > + error = xrep_roll_ag_trans(sc); > > > > + if (error) > > > > + return error; > > > > + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); > > > > > > What's this call for? Is it because the blocks we're freeing were > > > already free? (Similar question on the other xfs_mod_fdblocks() call > > > further down). > > > > Yes. The goal here is to free the (already free) extent with no net > > change in fdblocks... > > > > > BTW, what prevents some other task from coming along and screwing with > > > this? For example, could a large falloc or buffered write come in and > > > allocate these global blocks before we take them away here (causing the > > > whole sequence to fail)? > > > > ...but you're right that here is a window of opportunity for someone to > > swoop in and reserve the blocks while we still have the AGF locked, > > which means that we'll fail here even though that other process will > > never get the space. > > > > Thinking about this a bit more, what we really want to do is to skip the > > xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the > > record into the bno/cntbt. Hm. If a record insertion requires an > > expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we > > separately force those to be accounted to XFS_AG_RESV_AGFL. Therefore, > > we could make a "fake" per-AG reservation type that would skip the > > fdblocks update. That avoids the problem where we commit the free space > > record but someone else reserves all the free space and then we blow out > > with ENOSPC and a half-rebuilt bnobt. > > > > Ok, that sounds a bit more straightforward to me. > > > For the second case (which I assume is xrep_abt_reset_counters?) I'll > > respond below. > > > > > > +} > > > > + > ... > > > > +/* > > > > + * Reset the global free block counter and the per-AG counters to make it look > > > > + * like this AG has no free space. > > > > + */ > > > > +STATIC int > > > > +xrep_abt_reset_counters( > > > > + struct xfs_scrub *sc, > > > > + int *log_flags) > > > > +{ > > > > + struct xfs_perag *pag = sc->sa.pag; > > > > + struct xfs_agf *agf; > > > > + xfs_agblock_t new_btblks; > > > > + xfs_agblock_t to_free; > > > > + int error; > > > > + > > > > + /* > > > > + * Since we're abandoning the old bnobt/cntbt, we have to decrease > > > > + * fdblocks by the # of blocks in those trees. btreeblks counts the > > > > + * non-root blocks of the free space and rmap btrees. Do this before > > > > + * resetting the AGF counters. > > > > + */ > > > > > > Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks() > > > below. to_free looks like it's the count of all current btree blocks > > > minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating" > > > those blocks here because we're going to free them later? > > > > Yes. Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just > > pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can > > skip the racy mod_fdblocks thing here too. > > > > I think I'll ultimately need to see the code to make sure I follow the > ignore thing correctly, but that overall sounds better to me. 
If we do > retain these kind of calls to undo/work-around underlying > infrastructure, I think we need a bit more specific comments that > describe precisely what behavior the call is offsetting. I'll push out a new revision after I finish rebasing everything atop your latest dfops refactoring series. > > > > + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > > > > + > > > > + /* rmap_blocks accounts root block, btreeblks doesn't */ > > > > + new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1; > > > > + > > > > + /* btreeblks doesn't account bno/cnt root blocks */ > > > > + to_free = pag->pagf_btreeblks + 2; > > > > + > > > > + /* and don't account for the blocks we aren't freeing */ > > > > + to_free -= new_btblks; > > > > + > > > > + error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* > > > > + * Reset the per-AG info, both incore and ondisk. Mark the incore > > > > + * state stale in case we fail out of here. > > > > + */ > > > > + ASSERT(pag->pagf_init); > > > > + pag->pagf_init = 0; > > > > + pag->pagf_btreeblks = new_btblks; > > > > + pag->pagf_freeblks = 0; > > > > + pag->pagf_longest = 0; > > > > + > > > > + agf->agf_btreeblks = cpu_to_be32(new_btblks); > > > > + agf->agf_freeblks = 0; > > > > + agf->agf_longest = 0; > > > > + *log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS; > > > > + > > > > + return 0; > > > > +} > > > > + > ... > > > > + > > > > +/* > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > + * unused space back into the AG. > > > > + */ > > > > +STATIC int > > > > +xrep_abt_commit_new( > > > > + struct xfs_scrub *sc, > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > + int log_flags) > > > > +{ > > > > + int error; > > > > + > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > + > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > + if (error) > > > > + return error; > > > > > > It looks like the above invalidation all happens in the same > > > transaction. Those aren't logging buffer data or anything, but any idea > > > how many log formats we can get away with in this single transaction? > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > I was thinking more about transaction reservation than RAM. It may not Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's about ... ~7300 log format items? Not a lot, maybe it should roll the transaction every 1000 invalidations or so... > currently be an issue, but it might be worth putting something down in a > comment to note that this is a single transaction and we expect to not > have to invalidate more than N (ballpark) blocks in a single go, > whatever that value happens to be. > > > > > + error = xrep_roll_ag_trans(sc); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > + sc->sa.pag->pagf_init = 1; > > > > + return 0; > > > > +} > > > > + > > > > +/* Build new free space btrees and dispose of the old one. 
*/ > > > > +STATIC int > > > > +xrep_abt_rebuild_trees( > > > > + struct xfs_scrub *sc, > > > > + struct list_head *free_extents, > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > +{ > > > > + struct xfs_owner_info oinfo; > > > > + struct xrep_abt_extent *rae; > > > > + struct xrep_abt_extent *n; > > > > + struct xrep_abt_extent *longest; > > > > + int error; > > > > + > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > + > > > > + /* > > > > + * Insert the longest free extent in case it's necessary to > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > + * extent, we had exactly the free space we needed; we're done. > > > > + */ > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > if that's the case? > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > list here, then we found exactly two blocks worth of free space and used > > them to set up new btree roots. > > > > Got it, thanks. > > > > > + if (!longest) > > > > + goto done; > > > > + error = xrep_abt_free_extent(sc, > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > + longest->len, &oinfo); > > > > + list_del(&longest->list); > > > > + kmem_free(longest); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* Insert records into the new btrees. */ > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > + error = xrep_abt_free_extent(sc, > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > + rae->len, &oinfo); > > > > + if (error) > > > > + return error; > > > > + list_del(&rae->list); > > > > + kmem_free(rae); > > > > + } > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > fail or crash at this point, we leave the allocbts in a partially > > > constructed state. I take it that is Ok with respect to the broader > > > repair algorithm because we'd essentially start over by inspecting the > > > rmapbt again on a retry. > > > > Right. Though in the crash/shutdown case, you'll end up with the > > filesystem in an offline state at some point before you can retry the > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > Can we really assume that if we're already up and running an online > repair? The filesystem has to be mountable in that case in the first > place. If we've already reset and started reconstructing the allocation > btrees then I'd think those transactions would recover just fine on a > power loss or something (perhaps not in the event of some other > corruption related shutdown). Right, for the system crash case, whatever transactions committed should replay just fine, and you can even start up the online repair again, and if the AG isn't particularly close to ENOSPC then (barring rmap corruption) it should work just fine. If the fs went down because either (a) repair hit other corruption or (b) some other thread hit an error in some other part of the filesystem, then it's not so clear -- in (b) you could probably try again, but for (a) you'll definitely have to unmount and run xfs_repair. Perhaps the guideline here is that if the fs goes down more than once during online repair then unmount it and run xfs_repair. 
> > > The blocks allocated for the btrees that we've begun to construct here > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > necessarily have infinite retries to make sure this completes. IOW, > > > suppose that a first repair attempt finds just enough free space to > > > construct new trees, gets far enough along to consume most of that free > > > space and then crashes. Is it possible that a subsequent repair attempt > > > includes the btree blocks allocated during the previous failed repair > > > attempt in the sum of "old btree blocks" and determines we don't have > > > enough free space to repair? > > > > Yes, that's a risk of running the free space repair. > > > > Can we improve on that? For example, are the rmapbt entries for the old > allocation btree blocks necessary once we commit the btree resets? If > not, could we remove those entries before we start tree reconstruction? > > Alternatively, could we incorporate use of the old btree blocks? As it > is, we discover those blocks simply so we can free them at the end. > Perhaps we could free them sooner or find a more clever means to > reallocate directly from that in-core list? I guess we have to consider > whether they were really valid/sane btree blocks, but either way ISTM > that the old blocks list is essentially invalidated once we reset the > btrees. Hmm, it's a little tricky to do that -- we could reap the old bnobt and cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a record causes a btree split we'll pull blocks from the AGFL, and if there aren't enough blocks in the bnobt to fill the AGFL back up then fix_freelist won't succeed. That complication is why it finds the longest extent in the unclaimed list and pushes that in first, then works on the rest of the extents. I suppose one could try to avoid ENOSPC by pushing that longest extent in first (since we know that won't trigger a split), then reap the old alloc btree blocks, and then add everything else back in... --D > Brian > > > > > + > > > > +done: > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > + XFS_AG_RESV_NONE); > > > > +} > > > > + > > > ... > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > index 0ed68379e551..82f99633a597 100644 > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > diff = b1->bno - b2->bno; > > > > return diff; > > > > } > > > > + > > > > +/* Are there any busy extents in this AG? */ > > > > +bool > > > > +xfs_extent_busy_list_empty( > > > > + struct xfs_perag *pag) > > > > +{ > > > > + spin_lock(&pag->pagb_lock); > > > > + if (pag->pagb_tree.rb_node) { > > > > > > RB_EMPTY_ROOT()? > > > > Good suggestion, thank you! 
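For reference, a version of the helper that takes the RB_EMPTY_ROOT() suggestion above might look like this (untested sketch; RB_EMPTY_ROOT() comes from <linux/rbtree.h>, and the locking is the same as in the posted patch):

/* Are there any busy extents in this AG? */
bool
xfs_extent_busy_list_empty(
	struct xfs_perag	*pag)
{
	bool			empty;

	spin_lock(&pag->pagb_lock);
	empty = RB_EMPTY_ROOT(&pag->pagb_tree);
	spin_unlock(&pag->pagb_lock);
	return empty;
}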
> > > > --D > > > > > Brian > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > + return false; > > > > + } > > > > + spin_unlock(&pag->pagb_lock); > > > > + return true; > > > > +} > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > } > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > + > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > the body of a message to majordomo@vger.kernel.org > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
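To make the "roll the transaction every 1000 invalidations or so" idea floated above a bit more concrete, the invalidation pass could be bounded per transaction along these lines. This is only a rough, untested sketch: xrep_abt_binval_one() is a made-up helper name, the walk over the old_allocbt_blocks bitmap that would hand it each locked buffer is left out, and the batch size of 1000 is just the ballpark from the discussion.

/*
 * Sketch: invalidate one old bnobt/cntbt buffer and roll the repair
 * transaction every 1000 invalidations so that a single transaction
 * never accumulates an unbounded pile of buffer log items.
 */
STATIC int
xrep_abt_binval_one(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp,
	unsigned int		*invalidated)
{
	/* Mark the buffer stale so the old btree block never hits disk. */
	xfs_trans_binval(sc->tp, bp);

	if (++(*invalidated) < 1000)
		return 0;

	/* Batch limit reached; commit what we have and keep going. */
	*invalidated = 0;
	return xrep_roll_ag_trans(sc);
}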
On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > --- > > > > > fs/xfs/Makefile | 1 > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > fs/xfs/scrub/common.c | 8 + > > > > > fs/xfs/scrub/repair.h | 2 > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > fs/xfs/scrub/trace.h | 2 > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > ... > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > new file mode 100644 > > > > > index 000000000000..b228c2906de2 > > > > > --- /dev/null > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > @@ -0,0 +1,581 @@ ... > > > > > + > > > > > +/* > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > + * unused space back into the AG. > > > > > + */ > > > > > +STATIC int > > > > > +xrep_abt_commit_new( > > > > > + struct xfs_scrub *sc, > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > + int log_flags) > > > > > +{ > > > > > + int error; > > > > > + > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > + > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > + if (error) > > > > > + return error; > > > > > > > > It looks like the above invalidation all happens in the same > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > how many log formats we can get away with in this single transaction? > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > about ... ~7300 log format items? Not a lot, maybe it should roll the > transaction every 1000 invalidations or so... > I'm not really sure what categorizes as a lot here given that the blocks would need to be in-core, but rolling on some fixed/safe interval sounds reasonable to me. > > currently be an issue, but it might be worth putting something down in a > > comment to note that this is a single transaction and we expect to not > > have to invalidate more than N (ballpark) blocks in a single go, > > whatever that value happens to be. > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > + if (error) > > > > > + return error; > > > > > + > > > > > + /* Now that we've succeeded, mark the incore state valid again. 
*/ > > > > > + sc->sa.pag->pagf_init = 1; > > > > > + return 0; > > > > > +} > > > > > + > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > +STATIC int > > > > > +xrep_abt_rebuild_trees( > > > > > + struct xfs_scrub *sc, > > > > > + struct list_head *free_extents, > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > +{ > > > > > + struct xfs_owner_info oinfo; > > > > > + struct xrep_abt_extent *rae; > > > > > + struct xrep_abt_extent *n; > > > > > + struct xrep_abt_extent *longest; > > > > > + int error; > > > > > + > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > + > > > > > + /* > > > > > + * Insert the longest free extent in case it's necessary to > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > + */ > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > if that's the case? > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > list here, then we found exactly two blocks worth of free space and used > > > them to set up new btree roots. > > > > > > > Got it, thanks. > > > > > > > + if (!longest) > > > > > + goto done; > > > > > + error = xrep_abt_free_extent(sc, > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > + longest->len, &oinfo); > > > > > + list_del(&longest->list); > > > > > + kmem_free(longest); > > > > > + if (error) > > > > > + return error; > > > > > + > > > > > + /* Insert records into the new btrees. */ > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > + error = xrep_abt_free_extent(sc, > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > + rae->len, &oinfo); > > > > > + if (error) > > > > > + return error; > > > > > + list_del(&rae->list); > > > > > + kmem_free(rae); > > > > > + } > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > fail or crash at this point, we leave the allocbts in a partially > > > > constructed state. I take it that is Ok with respect to the broader > > > > repair algorithm because we'd essentially start over by inspecting the > > > > rmapbt again on a retry. > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > filesystem in an offline state at some point before you can retry the > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > Can we really assume that if we're already up and running an online > > repair? The filesystem has to be mountable in that case in the first > > place. If we've already reset and started reconstructing the allocation > > btrees then I'd think those transactions would recover just fine on a > > power loss or something (perhaps not in the event of some other > > corruption related shutdown). > > Right, for the system crash case, whatever transactions committed should > replay just fine, and you can even start up the online repair again, and > if the AG isn't particularly close to ENOSPC then (barring rmap > corruption) it should work just fine. 
> > If the fs went down because either (a) repair hit other corruption or > (b) some other thread hit an error in some other part of the filesystem, > then it's not so clear -- in (b) you could probably try again, but for > (a) you'll definitely have to unmount and run xfs_repair. > Indeed, there are certainly cases where we simply won't be able to do an online repair. I'm trying to think about scenarios where we should be able to do an online repair, but we lose power or hit some kind of transient error like a memory allocation failure before it completes. It would be nice if the online repair itself didn't contribute (within reason) to the inability to simply try again just because the fs was close to -ENOSPC. For one, I think it's potentially confusing behavior. Second, it might be concerning to regular users who perceive it as an online repair leaving the fs in a worse off state. Us fs devs know that may not really be the case, but I think we're better for addressing it if we can reasonably do so. > Perhaps the guideline here is that if the fs goes down more than once > during online repair then unmount it and run xfs_repair. > Yep, I think that makes sense if the filesystem or repair itself is tripping over other corruptions that fail to keep it active for the duration of the repair. > > > > The blocks allocated for the btrees that we've begun to construct here > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > suppose that a first repair attempt finds just enough free space to > > > > construct new trees, gets far enough along to consume most of that free > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > includes the btree blocks allocated during the previous failed repair > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > enough free space to repair? > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > allocation btree blocks necessary once we commit the btree resets? If > > not, could we remove those entries before we start tree reconstruction? > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > is, we discover those blocks simply so we can free them at the end. > > Perhaps we could free them sooner or find a more clever means to > > reallocate directly from that in-core list? I guess we have to consider > > whether they were really valid/sane btree blocks, but either way ISTM > > that the old blocks list is essentially invalidated once we reset the > > btrees. > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > record causes a btree split we'll pull blocks from the AGFL, and if > there aren't enough blocks in the bnobt to fill the AGFL back up then > fix_freelist won't succeed. That complication is why it finds the > longest extent in the unclaimed list and pushes that in first, then > works on the rest of the extents. > Hmm, but doesn't a btree split require at least one full space btree block per-level? In conjunction, the agfl minimum size requirement grows with the height of the tree, which implies available free space..? I could be missing something, perhaps we have to account for the rmapbt in that case as well? Regardless... 
> I suppose one could try to avoid ENOSPC by pushing that longest extent > in first (since we know that won't trigger a split), then reap the old > alloc btree blocks, and then add everything else back in... > I think it would be reasonable to seed the btree with the longest record or some fixed number of longest records (~1/2 a root block, for example) before making actual use of the btrees to reap the old blocks. I think then you'd only have a very short window of a single block leak on a poorly timed power loss and repair retry sequence before you start actually freeing originally used space (which in practice, I think solves the problem). Given that we're starting from empty, I wonder if another option may be to over fill the agfl with old btree blocks or something. The first real free should shift enough blocks back into the btrees to ensure the agfl can be managed from that point forward, right? That may be more work than it's worth though and/or a job for another patch. (FWIW, we also have that NOSHRINK agfl fixup flag for userspace repair.) Brian > --D > > > Brian > > > > > > > + > > > > > +done: > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > + XFS_AG_RESV_NONE); > > > > > +} > > > > > + > > > > ... > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > diff = b1->bno - b2->bno; > > > > > return diff; > > > > > } > > > > > + > > > > > +/* Are there any busy extents in this AG? */ > > > > > +bool > > > > > +xfs_extent_busy_list_empty( > > > > > + struct xfs_perag *pag) > > > > > +{ > > > > > + spin_lock(&pag->pagb_lock); > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > RB_EMPTY_ROOT()? > > > > > > Good suggestion, thank you! 
> > > > > > --D > > > > > > > Brian > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > + return false; > > > > > + } > > > > > + spin_unlock(&pag->pagb_lock); > > > > > + return true; > > > > > +} > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > } > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > + > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > -- > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > the body of a message to majordomo@vger.kernel.org > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > the body of a message to majordomo@vger.kernel.org > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
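To sketch the reordering discussed in this exchange (seed the new btrees with the longest free extent, reap the old bnobt/cntbt blocks while that free space is visible, then insert the remaining records), the tail of xrep_abt_rebuild_trees() might be rearranged roughly as below. Untested; the list bookkeeping for @longest and the cleanup paths are trimmed, and @ag_oinfo is a second owner_info local introduced here so the skip-owner @oinfo used for the frees is not clobbered.

	/* 1. Seed the new btrees with the longest discovered free extent. */
	error = xrep_abt_free_extent(sc,
			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
			longest->len, &oinfo);
	if (error)
		return error;

	/* 2. Reap the old bnobt/cntbt blocks before consuming more space. */
	xfs_rmap_ag_owner(&ag_oinfo, XFS_RMAP_OWN_AG);
	error = xrep_reap_extents(sc, old_allocbt_blocks, &ag_oinfo,
			XFS_AG_RESV_NONE);
	if (error)
		return error;

	/* 3. Now add the rest of the free space records. */
	list_for_each_entry_safe(rae, n, free_extents, list) {
		error = xrep_abt_free_extent(sc,
				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
				rae->len, &oinfo);
		if (error)
			return error;
		list_del(&rae->list);
		kmem_free(rae);
	}

Whether reaping that early is safe with respect to AGFL refills is exactly the fix_freelist concern raised above, so this is at most a starting point.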
On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > --- > > > > > > fs/xfs/Makefile | 1 > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > ... > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > new file mode 100644 > > > > > > index 000000000000..b228c2906de2 > > > > > > --- /dev/null > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > @@ -0,0 +1,581 @@ > ... > > > > > > + > > > > > > +/* > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > + * unused space back into the AG. > > > > > > + */ > > > > > > +STATIC int > > > > > > +xrep_abt_commit_new( > > > > > > + struct xfs_scrub *sc, > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > + int log_flags) > > > > > > +{ > > > > > > + int error; > > > > > > + > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > + > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > + if (error) > > > > > > + return error; > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > transaction every 1000 invalidations or so... > > > > I'm not really sure what categorizes as a lot here given that the blocks > would need to be in-core, but rolling on some fixed/safe interval sounds > reasonable to me. > > > > currently be an issue, but it might be worth putting something down in a > > > comment to note that this is a single transaction and we expect to not > > > have to invalidate more than N (ballpark) blocks in a single go, > > > whatever that value happens to be. 
> > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > + if (error) > > > > > > + return error; > > > > > > + > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > + return 0; > > > > > > +} > > > > > > + > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > +STATIC int > > > > > > +xrep_abt_rebuild_trees( > > > > > > + struct xfs_scrub *sc, > > > > > > + struct list_head *free_extents, > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > +{ > > > > > > + struct xfs_owner_info oinfo; > > > > > > + struct xrep_abt_extent *rae; > > > > > > + struct xrep_abt_extent *n; > > > > > > + struct xrep_abt_extent *longest; > > > > > > + int error; > > > > > > + > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > + > > > > > > + /* > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > + */ > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > if that's the case? > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > list here, then we found exactly two blocks worth of free space and used > > > > them to set up new btree roots. > > > > > > > > > > Got it, thanks. > > > > > > > > > + if (!longest) > > > > > > + goto done; > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > + longest->len, &oinfo); > > > > > > + list_del(&longest->list); > > > > > > + kmem_free(longest); > > > > > > + if (error) > > > > > > + return error; > > > > > > + > > > > > > + /* Insert records into the new btrees. */ > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > + rae->len, &oinfo); > > > > > > + if (error) > > > > > > + return error; > > > > > > + list_del(&rae->list); > > > > > > + kmem_free(rae); > > > > > > + } > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > rmapbt again on a retry. > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > filesystem in an offline state at some point before you can retry the > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > Can we really assume that if we're already up and running an online > > > repair? The filesystem has to be mountable in that case in the first > > > place. 
If we've already reset and started reconstructing the allocation > > > btrees then I'd think those transactions would recover just fine on a > > > power loss or something (perhaps not in the event of some other > > > corruption related shutdown). > > > > Right, for the system crash case, whatever transactions committed should > > replay just fine, and you can even start up the online repair again, and > > if the AG isn't particularly close to ENOSPC then (barring rmap > > corruption) it should work just fine. > > > > If the fs went down because either (a) repair hit other corruption or > > (b) some other thread hit an error in some other part of the filesystem, > > then it's not so clear -- in (b) you could probably try again, but for > > (a) you'll definitely have to unmount and run xfs_repair. > > > > Indeed, there are certainly cases where we simply won't be able to do an > online repair. I'm trying to think about scenarios where we should be > able to do an online repair, but we lose power or hit some kind of > transient error like a memory allocation failure before it completes. It > would be nice if the online repair itself didn't contribute (within > reason) to the inability to simply try again just because the fs was > close to -ENOSPC. Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we start modifying on-disk metadata. If that happens, we just free all the memory and bail out having done nothing. > For one, I think it's potentially confusing behavior. Second, it might > be concerning to regular users who perceive it as an online repair > leaving the fs in a worse off state. Us fs devs know that may not really > be the case, but I think we're better for addressing it if we can > reasonably do so. <nod> Further in the future I want to add the ability to offline an AG, so the worst that happens is that scrub turns the AG off, repair doesn't fix it, and the AG simply stays offline. That might give us the ability to survive cancelling the repair transaction, since if the AG's offline already anyway we could just throw away the dirty buffers and resurrect the AG later. I don't know, that's purely speculative. > > Perhaps the guideline here is that if the fs goes down more than once > > during online repair then unmount it and run xfs_repair. > > > > Yep, I think that makes sense if the filesystem or repair itself is > tripping over other corruptions that fail to keep it active for the > duration of the repair. <nod> > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > suppose that a first repair attempt finds just enough free space to > > > > > construct new trees, gets far enough along to consume most of that free > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > includes the btree blocks allocated during the previous failed repair > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > enough free space to repair? > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > allocation btree blocks necessary once we commit the btree resets? If > > > not, could we remove those entries before we start tree reconstruction? > > > > > > Alternatively, could we incorporate use of the old btree blocks? 
As it > > > is, we discover those blocks simply so we can free them at the end. > > > Perhaps we could free them sooner or find a more clever means to > > > reallocate directly from that in-core list? I guess we have to consider > > > whether they were really valid/sane btree blocks, but either way ISTM > > > that the old blocks list is essentially invalidated once we reset the > > > btrees. > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > record causes a btree split we'll pull blocks from the AGFL, and if > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > fix_freelist won't succeed. That complication is why it finds the > > longest extent in the unclaimed list and pushes that in first, then > > works on the rest of the extents. > > > > Hmm, but doesn't a btree split require at least one full space btree > block per-level? In conjunction, the agfl minimum size requirement grows > with the height of the tree, which implies available free space..? I > could be missing something, perhaps we have to account for the rmapbt in > that case as well? Regardless... > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > in first (since we know that won't trigger a split), then reap the old > > alloc btree blocks, and then add everything else back in... > > > > I think it would be reasonable to seed the btree with the longest record > or some fixed number of longest records (~1/2 a root block, for example) > before making actual use of the btrees to reap the old blocks. I think > then you'd only have a very short window of a single block leak on a > poorly timed power loss and repair retry sequence before you start > actually freeing originally used space (which in practice, I think > solves the problem). > > Given that we're starting from empty, I wonder if another option may be > to over fill the agfl with old btree blocks or something. The first real > free should shift enough blocks back into the btrees to ensure the agfl > can be managed from that point forward, right? That may be more work > than it's worth though and/or a job for another patch. (FWIW, we also > have that NOSHRINK agfl fixup flag for userspace repair.) Yes, I'll give that a try tomorrow, now that I've finished porting all the 4.19 stuff to xfsprogs. :) Looping back to something we discussed earlier in this thread, I'd prefer to hold off on converting the list of already-freed extents to xfs_bitmap because the same problem exists in all the repair functions of having to store a large number of records for the rebuilt btree, and maybe there's some way to <cough> use pageable memory for that, since the access patterns for that are append, sort, and iterate; for those three uses we don't necessarily require all the records to be in memory all the time. For the allocbt repair I expect the free space records to be far more numerous than the list of old bnobt/cntbt blocks. --D > Brian > > > --D > > > > > Brian > > > > > > > > > + > > > > > > +done: > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > + XFS_AG_RESV_NONE); > > > > > > +} > > > > > > + > > > > > ... 
> > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > diff = b1->bno - b2->bno; > > > > > > return diff; > > > > > > } > > > > > > + > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > +bool > > > > > > +xfs_extent_busy_list_empty( > > > > > > + struct xfs_perag *pag) > > > > > > +{ > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > Good suggestion, thank you! > > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > + return false; > > > > > > + } > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > + return true; > > > > > > +} > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > } > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > + > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > -- > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > -- > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > the body of a message to majordomo@vger.kernel.org > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > the body of a message to majordomo@vger.kernel.org > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
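For what it's worth, the "append, sort, and iterate" access pattern mentioned above only needs a very small interface, which is what makes a pageable backing store plausible. The sketch below is purely illustrative: struct xrep_abt_rec, struct xrep_abt_recs and the helpers are invented names, a flat array stands in for whatever backing store is eventually used, and sort() is the stock kernel helper from <linux/sort.h>.

/* One rebuilt free space record. */
struct xrep_abt_rec {
	xfs_agblock_t		bno;
	xfs_extlen_t		len;
};

/* Append-only record store; sorted once before iteration. */
struct xrep_abt_recs {
	struct xrep_abt_rec	*recs;
	unsigned int		nr;
	unsigned int		max;
};

static int
xrep_abt_recs_append(
	struct xrep_abt_recs	*rr,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	if (rr->nr >= rr->max)
		return -ENOMEM;
	rr->recs[rr->nr].bno = bno;
	rr->recs[rr->nr].len = len;
	rr->nr++;
	return 0;
}

static int
xrep_abt_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xrep_abt_rec	*ra = a;
	const struct xrep_abt_rec	*rb = b;

	if (ra->bno < rb->bno)
		return -1;
	if (ra->bno > rb->bno)
		return 1;
	return 0;
}

static void
xrep_abt_recs_sort(
	struct xrep_abt_recs	*rr)
{
	sort(rr->recs, rr->nr, sizeof(struct xrep_abt_rec),
			xrep_abt_rec_cmp, NULL);
}

Iteration is then just a linear walk over recs[0..nr), which is the part that could tolerate records being paged in on demand.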
On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > --- > > > > > > > fs/xfs/Makefile | 1 > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > ... > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > new file mode 100644 > > > > > > > index 000000000000..b228c2906de2 > > > > > > > --- /dev/null > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > @@ -0,0 +1,581 @@ > > ... > > > > > > > + > > > > > > > +/* > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > + * unused space back into the AG. > > > > > > > + */ > > > > > > > +STATIC int > > > > > > > +xrep_abt_commit_new( > > > > > > > + struct xfs_scrub *sc, > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > + int log_flags) > > > > > > > +{ > > > > > > > + int error; > > > > > > > + > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > + > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > + if (error) > > > > > > > + return error; > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > transaction every 1000 invalidations or so... > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > would need to be in-core, but rolling on some fixed/safe interval sounds > > reasonable to me. 
> > > > > > currently be an issue, but it might be worth putting something down in a > > > > comment to note that this is a single transaction and we expect to not > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > whatever that value happens to be. > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > + if (error) > > > > > > > + return error; > > > > > > > + > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > + return 0; > > > > > > > +} > > > > > > > + > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > +STATIC int > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > + struct xfs_scrub *sc, > > > > > > > + struct list_head *free_extents, > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > +{ > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > + struct xrep_abt_extent *n; > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > + int error; > > > > > > > + > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > + > > > > > > > + /* > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > + */ > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > if that's the case? > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > them to set up new btree roots. > > > > > > > > > > > > > Got it, thanks. > > > > > > > > > > > + if (!longest) > > > > > > > + goto done; > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > + longest->len, &oinfo); > > > > > > > + list_del(&longest->list); > > > > > > > + kmem_free(longest); > > > > > > > + if (error) > > > > > > > + return error; > > > > > > > + > > > > > > > + /* Insert records into the new btrees. */ > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > + rae->len, &oinfo); > > > > > > > + if (error) > > > > > > > + return error; > > > > > > > + list_del(&rae->list); > > > > > > > + kmem_free(rae); > > > > > > > + } > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > rmapbt again on a retry. > > > > > > > > > > Right. 
Though in the crash/shutdown case, you'll end up with the > > > > > filesystem in an offline state at some point before you can retry the > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > repair? The filesystem has to be mountable in that case in the first > > > > place. If we've already reset and started reconstructing the allocation > > > > btrees then I'd think those transactions would recover just fine on a > > > > power loss or something (perhaps not in the event of some other > > > > corruption related shutdown). > > > > > > Right, for the system crash case, whatever transactions committed should > > > replay just fine, and you can even start up the online repair again, and > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > corruption) it should work just fine. > > > > > > If the fs went down because either (a) repair hit other corruption or > > > (b) some other thread hit an error in some other part of the filesystem, > > > then it's not so clear -- in (b) you could probably try again, but for > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > online repair. I'm trying to think about scenarios where we should be > > able to do an online repair, but we lose power or hit some kind of > > transient error like a memory allocation failure before it completes. It > > would be nice if the online repair itself didn't contribute (within > > reason) to the inability to simply try again just because the fs was > > close to -ENOSPC. > > Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we > start modifying on-disk metadata. If that happens, we just free all the > memory and bail out having done nothing. > > > For one, I think it's potentially confusing behavior. Second, it might > > be concerning to regular users who perceive it as an online repair > > leaving the fs in a worse off state. Us fs devs know that may not really > > be the case, but I think we're better for addressing it if we can > > reasonably do so. > > <nod> Further in the future I want to add the ability to offline an AG, > so the worst that happens is that scrub turns the AG off, repair doesn't > fix it, and the AG simply stays offline. That might give us the > ability to survive cancelling the repair transaction, since if the AG's > offline already anyway we could just throw away the dirty buffers and > resurrect the AG later. I don't know, that's purely speculative. > > > > Perhaps the guideline here is that if the fs goes down more than once > > > during online repair then unmount it and run xfs_repair. > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > tripping over other corruptions that fail to keep it active for the > > duration of the repair. > > <nod> > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > space and then crashes. 
Is it possible that a subsequent repair attempt > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > enough free space to repair? > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > is, we discover those blocks simply so we can free them at the end. > > > > Perhaps we could free them sooner or find a more clever means to > > > > reallocate directly from that in-core list? I guess we have to consider > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > that the old blocks list is essentially invalidated once we reset the > > > > btrees. > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > fix_freelist won't succeed. That complication is why it finds the > > > longest extent in the unclaimed list and pushes that in first, then > > > works on the rest of the extents. > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > block per-level? In conjunction, the agfl minimum size requirement grows > > with the height of the tree, which implies available free space..? I > > could be missing something, perhaps we have to account for the rmapbt in > > that case as well? Regardless... > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > in first (since we know that won't trigger a split), then reap the old > > > alloc btree blocks, and then add everything else back in... > > > > > > > I think it would be reasonable to seed the btree with the longest record > > or some fixed number of longest records (~1/2 a root block, for example) > > before making actual use of the btrees to reap the old blocks. I think > > then you'd only have a very short window of a single block leak on a > > poorly timed power loss and repair retry sequence before you start > > actually freeing originally used space (which in practice, I think > > solves the problem). > > > > Given that we're starting from empty, I wonder if another option may be > > to over fill the agfl with old btree blocks or something. The first real > > free should shift enough blocks back into the btrees to ensure the agfl > > can be managed from that point forward, right? That may be more work > > than it's worth though and/or a job for another patch. (FWIW, we also > > have that NOSHRINK agfl fixup flag for userspace repair.) > > Yes, I'll give that a try tomorrow, now that I've finished porting all > the 4.19 stuff to xfsprogs. 
:) > > Looping back to something we discussed earlier in this thread, I'd > prefer to hold off on converting the list of already-freed extents to > xfs_bitmap because the same problem exists in all the repair functions > of having to store a large number of records for the rebuilt btree, and > maybe there's some way to <cough> use pageable memory for that, since > the access patterns for that are append, sort, and iterate; for those > three uses we don't necessarily require all the records to be in memory > all the time. For the allocbt repair I expect the free space records to > be far more numerous than the list of old bnobt/cntbt blocks. > Ok, it's fair enough that we'll probably want to find some kind of generic, more efficient technique for handling this across the various applicable repair algorithms. One other high level thing that crossed my mind with regard to the general btree reconstruction algorithms is whether we need to build up this kind of central record list at all. For example, rather than slurp up the entire list of btree records in-core, sort it and dump it back out, could we take advantage of the fact that our existing on-disk structure insertion mechanisms already handle out of order records (simply stated, an extent free knows how to insert the associated record at the right place in the space btrees)? For example, suppose we reset the existing btrees first, then scanned the rmapbt and repopulated the new btrees as records are discovered..? The obvious problem is that we still have some checks that allow the whole repair operation to bail out before we determine whether we can start to rebuild the on-disk btrees. These are things like making sure we can actually read the associated rmapbt blocks (i.e., no read errors or verifier failures), basic record sanity checks, etc. But ISTM that isn't anything we couldn't get around with a multi-pass implementation. Secondary issues might be things like no longer being able to easily insert the longest free extent range(s) first (meaning we'd have to stuff the agfl with old btree blocks or figure out some other approach). BTW, isn't the typical scrub sequence already multi-pass by virtue of the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() callout could not only detect corruption, but validate whether repair (if requested) is possible based on the kind of checks that are currently in the repair side rmapbt walkers. Thoughts? Are there future changes that are better supported by an in-core tracking structure in general (assuming we'll eventually replace the linked lists with something more efficient) as opposed to attempting to optimize out the need for that tracking at all? Brian > --D > > > Brian > > > > > --D > > > > > > > Brian > > > > > > > > > > > + > > > > > > > +done: > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > +} > > > > > > > + > > > > > > ... > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > diff = b1->bno - b2->bno; > > > > > > > return diff; > > > > > > > } > > > > > > > + > > > > > > > +/* Are there any busy extents in this AG? 
*/ > > > > > > > +bool > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > + struct xfs_perag *pag) > > > > > > > +{ > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > Good suggestion, thank you! > > > > > > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > + return false; > > > > > > > + } > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > + return true; > > > > > > > +} > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > } > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > + > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > -- > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > -- > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > -- > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > the body of a message to majordomo@vger.kernel.org > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > the body of a message to majordomo@vger.kernel.org > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
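To give Brian's "repopulate the new btrees as records are discovered" idea a concrete shape, the rmapbt walk could in principle free each gap straight into the new bnobt/cntbt instead of queueing it in memory, e.g. via a callback driven by xfs_rmap_query_all(). Very rough sketch only: struct xrep_abt_inplace, its next_agbno field and xrep_abt_free_gap() are invented for illustration, and transaction rolling, the trailing gap after the last rmap record, the OWN_AG/old-block handling and the AGFL seeding question discussed above are all ignored.

struct xrep_abt_inplace {
	struct xfs_scrub	*sc;
	xfs_agblock_t		next_agbno;	/* next unexamined agbno */
};

/* Free the gap before this reverse mapping directly into the new btrees. */
STATIC int
xrep_abt_free_gap(
	struct xfs_btree_cur	*cur,
	struct xfs_rmap_irec	*rec,
	void			*priv)
{
	struct xrep_abt_inplace	*ai = priv;
	struct xfs_owner_info	oinfo;
	int			error = 0;

	if (rec->rm_startblock > ai->next_agbno) {
		xfs_rmap_skip_owner_update(&oinfo);
		error = xrep_abt_free_extent(ai->sc,
				XFS_AGB_TO_FSB(cur->bc_mp,
					       cur->bc_private.a.agno,
					       ai->next_agbno),
				rec->rm_startblock - ai->next_agbno,
				&oinfo);
	}

	/* Rmap records can overlap, so only ever move the cursor forward. */
	ai->next_agbno = max_t(xfs_agblock_t, ai->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return error;
}

Whether the extra validation passes needed before committing to this approach make it worthwhile is the open question raised above.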
On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > --- > > > > > > > > fs/xfs/Makefile | 1 > > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > > new file mode 100644 > > > > > > > > index 000000000000..b228c2906de2 > > > > > > > > --- /dev/null > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > > @@ -0,0 +1,581 @@ > > > ... > > > > > > > > + > > > > > > > > +/* > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > > + * unused space back into the AG. > > > > > > > > + */ > > > > > > > > +STATIC int > > > > > > > > +xrep_abt_commit_new( > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > > + int log_flags) > > > > > > > > +{ > > > > > > > > + int error; > > > > > > > > + > > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > > + > > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > > + if (error) > > > > > > > > + return error; > > > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > > transaction every 1000 invalidations or so... 
> > > > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > > would need to be in-core, but rolling on some fixed/safe interval sounds > > > reasonable to me. > > > > > > > > currently be an issue, but it might be worth putting something down in a > > > > > comment to note that this is a single transaction and we expect to not > > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > > whatever that value happens to be. > > > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > > + if (error) > > > > > > > > + return error; > > > > > > > > + > > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > > + return 0; > > > > > > > > +} > > > > > > > > + > > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > > +STATIC int > > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > + struct list_head *free_extents, > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > > +{ > > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > > + struct xrep_abt_extent *n; > > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > > + int error; > > > > > > > > + > > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > > + > > > > > > > > + /* > > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > > + */ > > > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > > if that's the case? > > > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > > them to set up new btree roots. > > > > > > > > > > > > > > > > Got it, thanks. > > > > > > > > > > > > > + if (!longest) > > > > > > > > + goto done; > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > > + longest->len, &oinfo); > > > > > > > > + list_del(&longest->list); > > > > > > > > + kmem_free(longest); > > > > > > > > + if (error) > > > > > > > > + return error; > > > > > > > > + > > > > > > > > + /* Insert records into the new btrees. */ > > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > > + rae->len, &oinfo); > > > > > > > > + if (error) > > > > > > > > + return error; > > > > > > > > + list_del(&rae->list); > > > > > > > > + kmem_free(rae); > > > > > > > > + } > > > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > > constructed state. 
I take it that is Ok with respect to the broader > > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > > rmapbt again on a retry. > > > > > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > > > filesystem in an offline state at some point before you can retry the > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > > repair? The filesystem has to be mountable in that case in the first > > > > > place. If we've already reset and started reconstructing the allocation > > > > > btrees then I'd think those transactions would recover just fine on a > > > > > power loss or something (perhaps not in the event of some other > > > > > corruption related shutdown). > > > > > > > > Right, for the system crash case, whatever transactions committed should > > > > replay just fine, and you can even start up the online repair again, and > > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > > corruption) it should work just fine. > > > > > > > > If the fs went down because either (a) repair hit other corruption or > > > > (b) some other thread hit an error in some other part of the filesystem, > > > > then it's not so clear -- in (b) you could probably try again, but for > > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > > online repair. I'm trying to think about scenarios where we should be > > > able to do an online repair, but we lose power or hit some kind of > > > transient error like a memory allocation failure before it completes. It > > > would be nice if the online repair itself didn't contribute (within > > > reason) to the inability to simply try again just because the fs was > > > close to -ENOSPC. > > > > Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we > > start modifying on-disk metadata. If that happens, we just free all the > > memory and bail out having done nothing. > > > > > For one, I think it's potentially confusing behavior. Second, it might > > > be concerning to regular users who perceive it as an online repair > > > leaving the fs in a worse off state. Us fs devs know that may not really > > > be the case, but I think we're better for addressing it if we can > > > reasonably do so. > > > > <nod> Further in the future I want to add the ability to offline an AG, > > so the worst that happens is that scrub turns the AG off, repair doesn't > > fix it, and the AG simply stays offline. That might give us the > > ability to survive cancelling the repair transaction, since if the AG's > > offline already anyway we could just throw away the dirty buffers and > > resurrect the AG later. I don't know, that's purely speculative. > > > > > > Perhaps the guideline here is that if the fs goes down more than once > > > > during online repair then unmount it and run xfs_repair. > > > > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > > tripping over other corruptions that fail to keep it active for the > > > duration of the repair. > > > > <nod> > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > > necessarily have infinite retries to make sure this completes. 
IOW, > > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > > enough free space to repair? > > > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > > is, we discover those blocks simply so we can free them at the end. > > > > > Perhaps we could free them sooner or find a more clever means to > > > > > reallocate directly from that in-core list? I guess we have to consider > > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > > that the old blocks list is essentially invalidated once we reset the > > > > > btrees. > > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > > fix_freelist won't succeed. That complication is why it finds the > > > > longest extent in the unclaimed list and pushes that in first, then > > > > works on the rest of the extents. > > > > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > > block per-level? In conjunction, the agfl minimum size requirement grows > > > with the height of the tree, which implies available free space..? I > > > could be missing something, perhaps we have to account for the rmapbt in > > > that case as well? Regardless... > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > > in first (since we know that won't trigger a split), then reap the old > > > > alloc btree blocks, and then add everything else back in... > > > > > > > > > > I think it would be reasonable to seed the btree with the longest record > > > or some fixed number of longest records (~1/2 a root block, for example) > > > before making actual use of the btrees to reap the old blocks. I think > > > then you'd only have a very short window of a single block leak on a > > > poorly timed power loss and repair retry sequence before you start > > > actually freeing originally used space (which in practice, I think > > > solves the problem). > > > > > > Given that we're starting from empty, I wonder if another option may be > > > to over fill the agfl with old btree blocks or something. The first real > > > free should shift enough blocks back into the btrees to ensure the agfl > > > can be managed from that point forward, right? That may be more work > > > than it's worth though and/or a job for another patch. (FWIW, we also > > > have that NOSHRINK agfl fixup flag for userspace repair.) > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all > > the 4.19 stuff to xfsprogs. 
:) > > > > Looping back to something we discussed earlier in this thread, I'd > > prefer to hold off on converting the list of already-freed extents to > > xfs_bitmap because the same problem exists in all the repair functions > > of having to store a large number of records for the rebuilt btree, and > > maybe there's some way to <cough> use pageable memory for that, since > > the access patterns for that are append, sort, and iterate; for those > > three uses we don't necessarily require all the records to be in memory > > all the time. For the allocbt repair I expect the free space records to > > be far more numerous than the list of old bnobt/cntbt blocks. > > > > Ok, it's fair enough that we'll probably want to find some kind of > generic, more efficient technique for handling this across the various > applicable repair algorithms. > > One other high level thing that crossed my mind with regard to the > general btree reconstruction algorithms is whether we need to build up > this kind of central record list at all. For example, rather than slurp > up the entire list of btree records in-core, sort it and dump it back > out, could we take advantage of the fact that our existing on-disk > structure insertion mechanisms already handle out of order records > (simply stated, an extent free knows how to insert the associated record > at the right place in the space btrees)? For example, suppose we reset > the existing btrees first, then scanned the rmapbt and repopulated the > new btrees as records are discovered..? I tried that in an earlier draft of the bnobt repair function. The biggest problem with inserting as we go is dealing with the inevitable transaction rolls (right now we do after every record insertion to avoid playing games with guessing how much reservation is left). Btree cursor state can't survive transaction rolls because the transaction commit releases all the buffers that aren't bhold'en, and we can't bhold that many buffers across a _defer_finish. So, that early draft spent a lot of time tearing down and reconstructing rmapbt cursors since the standard _btree_query_all isn't suited to that kind of usage. It was easily twice as slow on a RAM-backed disk just from the rmap cursor overhead and much more complex, so I rewrote it to be simpler. I also have a slight preference for not touching anything until we're absolutely sure we have all the data we need to repair the structure. For other repair functions (like the data/attr fork repairs) we have to scan all the rmapbts for extents, and I'd prefer to lock those AGs only for as long as necessary to extract the extents we want. > The obvious problem is that we still have some checks that allow the > whole repair operation to bail out before we determine whether we can > start to rebuild the on-disk btrees. These are things like making sure > we can actually read the associated rmapbt blocks (i.e., no read errors > or verifier failures), basic record sanity checks, etc. But ISTM that > isn't anything we couldn't get around with a multi-pass implementation. > Secondary issues might be things like no longer being able to easily > insert the longest free extent range(s) first (meaning we'd have to > stuff the agfl with old btree blocks or figure out some other approach). Well, you could scan the rmapbt twice -- once to find the longest record, then again to do the actual insertion. > BTW, isn't the typical scrub sequence already multi-pass by virtue of > the xfs_scrub_metadata() implementation? 
I'm wondering if the ->scrub() > callout could not only detect corruption, but validate whether repair > (if requested) is possible based on the kind of checks that are > currently in the repair side rmapbt walkers. Thoughts? Yes, scrub basically validates that for us now, with the notable exception of the notorious rmapbt scrubber, which doesn't cross-reference with inode block mappings because that would be a locking nightmare. > Are there future > changes that are better supported by an in-core tracking structure in > general (assuming we'll eventually replace the linked lists with > something more efficient) as opposed to attempting to optimize out the > need for that tracking at all? Well, I was thinking that we could just allocate a memfd (or a file on the same xfs once we have AG offlining) and store the records in there. That saves us the list_head overhead and potentially enables access to a lot more storage than pinning things in RAM. --D > Brian > > > --D > > > > > Brian > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > > > + > > > > > > > > +done: > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > +} > > > > > > > > + > > > > > > > ... > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > return diff; > > > > > > > > } > > > > > > > > + > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > +bool > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > + struct xfs_perag *pag) > > > > > > > > +{ > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > --D > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > + return false; > > > > > > > > + } > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > + return true; > > > > > > > > +} > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > } > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > + > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */
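To make the "roll every N invalidations" idea discussed above concrete, the invalidation loop could be batched roughly as follows. This is only a sketch: XREP_INVALIDATE_BATCH and xrep_abt_next_old_block() are invented names, and the posted patch instead does all of the invalidation inside xrep_invalidate_blocks() in a single transaction.

/*
 * Sketch: invalidate the old btree blocks in batches, rolling the repair
 * transaction periodically so a single transaction never carries an
 * unbounded number of buffer log items.
 */
#define XREP_INVALIDATE_BATCH	1024	/* hypothetical batch size */

STATIC int
xrep_abt_invalidate_batched(
	struct xfs_scrub	*sc,
	struct xfs_bitmap	*old_allocbt_blocks)
{
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;
	unsigned int		invalidated = 0;
	int			error;

	/* xrep_abt_next_old_block() stands in for walking the bitmap. */
	while (xrep_abt_next_old_block(old_allocbt_blocks, &fsbno)) {
		bp = xfs_trans_get_buf(sc->tp, sc->mp->m_ddev_targp,
				XFS_FSB_TO_DADDR(sc->mp, fsbno),
				XFS_FSB_TO_BB(sc->mp, 1), 0);
		if (!bp)
			return -ENOMEM;
		xfs_trans_binval(sc->tp, bp);

		/* Commit the invalidations so far and get a fresh transaction. */
		if (++invalidated % XREP_INVALIDATE_BATCH == 0) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
		}
	}
	return 0;
}

The batch size is the tunable under discussion; anything comfortably below the transaction reservation divided by the size of a buffer log format item should be safe.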
On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > --- > > > > > > > > > fs/xfs/Makefile | 1 > > > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > new file mode 100644 > > > > > > > > > index 000000000000..b228c2906de2 > > > > > > > > > --- /dev/null > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > @@ -0,0 +1,581 @@ > > > > ... > > > > > > > > > + > > > > > > > > > +/* > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > > > + * unused space back into the AG. > > > > > > > > > + */ > > > > > > > > > +STATIC int > > > > > > > > > +xrep_abt_commit_new( > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > > > + int log_flags) > > > > > > > > > +{ > > > > > > > > > + int error; > > > > > > > > > + > > > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > > > + > > > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > > > + if (error) > > > > > > > > > + return error; > > > > > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > > > > > Hmm. 
tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > > > transaction every 1000 invalidations or so... > > > > > > > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > > > would need to be in-core, but rolling on some fixed/safe interval sounds > > > > reasonable to me. > > > > > > > > > > currently be an issue, but it might be worth putting something down in a > > > > > > comment to note that this is a single transaction and we expect to not > > > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > > > whatever that value happens to be. > > > > > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > > > + if (error) > > > > > > > > > + return error; > > > > > > > > > + > > > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > > > + return 0; > > > > > > > > > +} > > > > > > > > > + > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > > > +STATIC int > > > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > + struct list_head *free_extents, > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > > > +{ > > > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > > > + struct xrep_abt_extent *n; > > > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > > > + int error; > > > > > > > > > + > > > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > > > + > > > > > > > > > + /* > > > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > > > + */ > > > > > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > > > if that's the case? > > > > > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > > > them to set up new btree roots. > > > > > > > > > > > > > > > > > > > Got it, thanks. > > > > > > > > > > > > > > > + if (!longest) > > > > > > > > > + goto done; > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > > > + longest->len, &oinfo); > > > > > > > > > + list_del(&longest->list); > > > > > > > > > + kmem_free(longest); > > > > > > > > > + if (error) > > > > > > > > > + return error; > > > > > > > > > + > > > > > > > > > + /* Insert records into the new btrees. 
*/ > > > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > > > + rae->len, &oinfo); > > > > > > > > > + if (error) > > > > > > > > > + return error; > > > > > > > > > + list_del(&rae->list); > > > > > > > > > + kmem_free(rae); > > > > > > > > > + } > > > > > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > > > rmapbt again on a retry. > > > > > > > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > > > > filesystem in an offline state at some point before you can retry the > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > > > repair? The filesystem has to be mountable in that case in the first > > > > > > place. If we've already reset and started reconstructing the allocation > > > > > > btrees then I'd think those transactions would recover just fine on a > > > > > > power loss or something (perhaps not in the event of some other > > > > > > corruption related shutdown). > > > > > > > > > > Right, for the system crash case, whatever transactions committed should > > > > > replay just fine, and you can even start up the online repair again, and > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > > > corruption) it should work just fine. > > > > > > > > > > If the fs went down because either (a) repair hit other corruption or > > > > > (b) some other thread hit an error in some other part of the filesystem, > > > > > then it's not so clear -- in (b) you could probably try again, but for > > > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > > > online repair. I'm trying to think about scenarios where we should be > > > > able to do an online repair, but we lose power or hit some kind of > > > > transient error like a memory allocation failure before it completes. It > > > > would be nice if the online repair itself didn't contribute (within > > > > reason) to the inability to simply try again just because the fs was > > > > close to -ENOSPC. > > > > > > Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we > > > start modifying on-disk metadata. If that happens, we just free all the > > > memory and bail out having done nothing. > > > > > > > For one, I think it's potentially confusing behavior. Second, it might > > > > be concerning to regular users who perceive it as an online repair > > > > leaving the fs in a worse off state. Us fs devs know that may not really > > > > be the case, but I think we're better for addressing it if we can > > > > reasonably do so. > > > > > > <nod> Further in the future I want to add the ability to offline an AG, > > > so the worst that happens is that scrub turns the AG off, repair doesn't > > > fix it, and the AG simply stays offline. 
That might give us the > > > ability to survive cancelling the repair transaction, since if the AG's > > > offline already anyway we could just throw away the dirty buffers and > > > resurrect the AG later. I don't know, that's purely speculative. > > > > > > > > Perhaps the guideline here is that if the fs goes down more than once > > > > > during online repair then unmount it and run xfs_repair. > > > > > > > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > > > tripping over other corruptions that fail to keep it active for the > > > > duration of the repair. > > > > > > <nod> > > > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > > > enough free space to repair? > > > > > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > > > is, we discover those blocks simply so we can free them at the end. > > > > > > Perhaps we could free them sooner or find a more clever means to > > > > > > reallocate directly from that in-core list? I guess we have to consider > > > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > > > that the old blocks list is essentially invalidated once we reset the > > > > > > btrees. > > > > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > > > fix_freelist won't succeed. That complication is why it finds the > > > > > longest extent in the unclaimed list and pushes that in first, then > > > > > works on the rest of the extents. > > > > > > > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > > > block per-level? In conjunction, the agfl minimum size requirement grows > > > > with the height of the tree, which implies available free space..? I > > > > could be missing something, perhaps we have to account for the rmapbt in > > > > that case as well? Regardless... > > > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > > > in first (since we know that won't trigger a split), then reap the old > > > > > alloc btree blocks, and then add everything else back in... 
> > > > > > > > > > > > > I think it would be reasonable to seed the btree with the longest record > > > > or some fixed number of longest records (~1/2 a root block, for example) > > > > before making actual use of the btrees to reap the old blocks. I think > > > > then you'd only have a very short window of a single block leak on a > > > > poorly timed power loss and repair retry sequence before you start > > > > actually freeing originally used space (which in practice, I think > > > > solves the problem). > > > > > > > > Given that we're starting from empty, I wonder if another option may be > > > > to over fill the agfl with old btree blocks or something. The first real > > > > free should shift enough blocks back into the btrees to ensure the agfl > > > > can be managed from that point forward, right? That may be more work > > > > than it's worth though and/or a job for another patch. (FWIW, we also > > > > have that NOSHRINK agfl fixup flag for userspace repair.) > > > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all > > > the 4.19 stuff to xfsprogs. :) > > > > > > Looping back to something we discussed earlier in this thread, I'd > > > prefer to hold off on converting the list of already-freed extents to > > > xfs_bitmap because the same problem exists in all the repair functions > > > of having to store a large number of records for the rebuilt btree, and > > > maybe there's some way to <cough> use pageable memory for that, since > > > the access patterns for that are append, sort, and iterate; for those > > > three uses we don't necessarily require all the records to be in memory > > > all the time. For the allocbt repair I expect the free space records to > > > be far more numerous than the list of old bnobt/cntbt blocks. > > > > > > > Ok, it's fair enough that we'll probably want to find some kind of > > generic, more efficient technique for handling this across the various > > applicable repair algorithms. > > > > One other high level thing that crossed my mind with regard to the > > general btree reconstruction algorithms is whether we need to build up > > this kind of central record list at all. For example, rather than slurp > > up the entire list of btree records in-core, sort it and dump it back > > out, could we take advantage of the fact that our existing on-disk > > structure insertion mechanisms already handle out of order records > > (simply stated, an extent free knows how to insert the associated record > > at the right place in the space btrees)? For example, suppose we reset > > the existing btrees first, then scanned the rmapbt and repopulated the > > new btrees as records are discovered..? > > I tried that in an earlier draft of the bnobt repair function. The > biggest problem with inserting as we go is dealing with the inevitable > transaction rolls (right now we do after every record insertion to avoid > playing games with guessing how much reservation is left). Btree > cursor state can't survive transaction rolls because the transaction > commit releases all the buffers that aren't bhold'en, and we can't bhold > that many buffers across a _defer_finish. > Ok, interesting. Where do we need to run an xfs_defer_finish() during the reconstruction sequence, btw? I thought that would only run on final commit as opposed to intermediate rolls. We could just try and make the automatic buffer relogging list a dynamic allocation if there are enough held buffers in the transaction. 
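For reference, the "hold the cursor's buffers across the roll" idea being discussed here might look roughly like the following. This is purely illustrative: xrep_roll_trans_with_cursor() does not exist in the posted series, it leans on the 4.18-era xfs_btree_cur layout (bc_nlevels/bc_bufs/bc_tp), and error unwinding is elided.

STATIC int
xrep_roll_trans_with_cursor(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur)
{
	int			level;
	int			error;

	/* Keep the cursor's buffers locked across the commit. */
	for (level = 0; level < cur->bc_nlevels; level++)
		if (cur->bc_bufs[level])
			xfs_trans_bhold(sc->tp, cur->bc_bufs[level]);

	error = xrep_roll_ag_trans(sc);
	if (error)
		return error;	/* unwinding the bholds is elided */

	/* Rejoin the held buffers to the new transaction. */
	for (level = 0; level < cur->bc_nlevels; level++)
		if (cur->bc_bufs[level])
			xfs_trans_bjoin(sc->tp, cur->bc_bufs[level]);
	cur->bc_tp = sc->tp;
	return 0;
}

Even with something like this, the _defer_finish problem raised above remains: every roll driven by deferred-op processing would need the same treatment, which seems to be where the relogging-list size concern comes in.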
> So, that early draft spent a lot of time tearing down and reconstructing > rmapbt cursors since the standard _btree_query_all isn't suited to that > kind of usage. It was easily twice as slow on a RAM-backed disk just > from the rmap cursor overhead and much more complex, so I rewrote it to > be simpler. I also have a slight preference for not touching anything > until we're absolutely sure we have all the data we need to repair the > structure. > Yes, I think that is sane in principle. I'm just a bit concerned about how reliable that xfs_repair-like approach will be in the kernel longer term, particularly once we start having to deal with large filesystems and limited or contended memory, etc. We already have xfs_repair users that need to tweak settings because there isn't enough memory available to repair the fs. Granted that is for fs-wide repairs and the flipside is that we know a single AG can only be up to 1TB. It's certainly possible that putting some persistent backing behind the in-core data is enough to resolve the problem (and the current approach is certainly reasonable enough to me for the initial implementation). bjoin limitations aside, I wonder if a cursor roll mechanism that held all of the cursor buffers, rolled the transaction and then rejoined all said buffers would help us get around that. (Not sure I follow the early prototype behavior, but it sounds like we had to restart the rmapbt lookup over and over...). Another caveat with that approach may be that I think we'd need to be sure that the reconstruction operation doesn't ever need to update the rmapbt while we're mid walk of the latter. That may be an issue for inode btree reconstruction, for example, since it looks like inobt block allocation requires rmapbt updates. We'd probably need some way to share (or invalidate) a cursor across different contexts to deal with that. > For other repair functions (like the data/attr fork repairs) we have to > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > for as long as necessary to extract the extents we want. > > > The obvious problem is that we still have some checks that allow the > > whole repair operation to bail out before we determine whether we can > > start to rebuild the on-disk btrees. These are things like making sure > > we can actually read the associated rmapbt blocks (i.e., no read errors > > or verifier failures), basic record sanity checks, etc. But ISTM that > > isn't anything we couldn't get around with a multi-pass implementation. > > Secondary issues might be things like no longer being able to easily > > insert the longest free extent range(s) first (meaning we'd have to > > stuff the agfl with old btree blocks or figure out some other approach). > > Well, you could scan the rmapbt twice -- once to find the longest > record, then again to do the actual insertion. > Yep, that's what I meant by multi-pass. > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() > > callout could not only detect corruption, but validate whether repair > > (if requested) is possible based on the kind of checks that are > > currently in the repair side rmapbt walkers. Thoughts?r > > Yes, scrub basically validates that for us now, with the notable > exception of the notorious rmapbt scrubber, which doesn't > cross-reference with inode block mappings because that would be a > locking nightmare. 
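The two-pass rmapbt scan mentioned a few paragraphs up (one pass just to find the longest free gap so it can be inserted first, a second pass to insert everything else) could be sketched along these lines. The struct and function names are invented for illustration, and the free space after the last rmap record is ignored here for brevity.

/* Track the longest gap between reverse mappings; invented for illustration. */
struct xrep_abt_longest {
	xfs_agblock_t		next_agbno;
	xfs_agblock_t		longest_agbno;
	xfs_extlen_t		longest_len;
};

/* First pass: every gap between rmap records is free space. */
STATIC int
xrep_abt_longest_helper(
	struct xfs_btree_cur	*cur,
	struct xfs_rmap_irec	*rec,
	void			*priv)
{
	struct xrep_abt_longest	*rl = priv;

	if (rec->rm_startblock > rl->next_agbno &&
	    rec->rm_startblock - rl->next_agbno > rl->longest_len) {
		rl->longest_agbno = rl->next_agbno;
		rl->longest_len = rec->rm_startblock - rl->next_agbno;
	}
	/* Records can overlap, so only ever move the cursor forward. */
	rl->next_agbno = max_t(xfs_agblock_t, rl->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}

/* cur is an rmapbt cursor for the AG being repaired. */
STATIC int
xrep_abt_find_longest(
	struct xfs_btree_cur	*cur,
	struct xrep_abt_longest	*rl)
{
	return xfs_rmap_query_all(cur, xrep_abt_longest_helper, rl);
}

The second pass would then walk the rmapbt again and free each gap as it is found, starting with the recorded longest extent, instead of accumulating the whole record list in memory first.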
> > > Are there future > > changes that are better supported by an in-core tracking structure in > > general (assuming we'll eventually replace the linked lists with > > something more efficient) as opposed to attempting to optimize out the > > need for that tracking at all? > > Well, I was thinking that we could just allocate a memfd (or a file on > the same xfs once we have AG offlining) and store the records in there. > That saves us the list_head overhead and potentially enables access to a > lot more storage than pinning things in RAM. > Would using the same fs mean we have to store the repair data in a separate AG, or somehow locate/use free space in the target AG? I presume either way we'd have to ensure that AG is either consistent or locked out from outside I/O. If we have the total record count we can preallocate the file and hope there is no such other free space corruption or something that would allow some other task to mess with our blocks. I'm a little skeptical overall on relying on a corrupted filesystem to store repair data, but perhaps there are ways to mitigate the risks. I'm not familiar with memfd. The manpage suggests it's ram backed, is it swappable or something? If so, that sounds a reasonable option provided the swap space requirement can be made clear to users and the failure characteristics aren't more severe than for userspace. An online repair that puts the broader system at risk of OOM as opposed to predictably failing gracefully may not be the most useful tool. Brian > --D > > > Brian > > > > > --D > > > > > > > Brian > > > > > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > > > > > > > + > > > > > > > > > +done: > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > +} > > > > > > > > > + > > > > > > > > ... > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > return diff; > > > > > > > > > } > > > > > > > > > + > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > +bool > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > +{ > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > Good suggestion, thank you! 
> > > > > > > > > > > > > > --D > > > > > > > > Brian > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > + return false; > > > > > > > > > + } > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > + return true; > > > > > > > > > +} > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > } > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > + > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */
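On the memfd question above: memfd_create() gives userspace an anonymous shmem-backed file, so its pages are swappable rather than pinned. An in-kernel equivalent for staging fixed-size repair records might look vaguely like the sketch below. Nothing like this exists in the posted series; the names are invented, the sort step and teardown (fput) are omitted, and a real implementation would need to handle memory pressure and errors much more carefully.

#include <linux/fs.h>
#include <linux/shmem_fs.h>

/* Hypothetical staging file for fixed-size repair records. */
struct xrep_record_file {
	struct file		*filp;
	loff_t			bytes;	/* bytes appended so far */
};

STATIC int
xrep_record_file_create(
	struct xrep_record_file	*rf)
{
	rf->filp = shmem_file_setup("xfs_repair_records", 0, 0);
	if (IS_ERR(rf->filp))
		return PTR_ERR(rf->filp);
	rf->bytes = 0;
	return 0;
}

/* Append one record to the staging file. */
STATIC int
xrep_record_file_append(
	struct xrep_record_file	*rf,
	const void		*rec,
	size_t			len)
{
	ssize_t			ret;

	ret = kernel_write(rf->filp, rec, len, &rf->bytes);
	if (ret < 0)
		return ret;
	return ret == len ? 0 : -EIO;
}

/* Read back record @idx during the iteration phase. */
STATIC int
xrep_record_file_get(
	struct xrep_record_file	*rf,
	void			*rec,
	size_t			len,
	uint64_t		idx)
{
	loff_t			pos = idx * len;
	ssize_t			ret;

	ret = kernel_read(rf->filp, rec, len, &pos);
	if (ret < 0)
		return ret;
	return ret == len ? 0 : -EIO;
}

Whether swappable backing like this actually addresses the memory-pressure concern raised above is exactly the open question in this subthread.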
On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > --- > > > > > > > > > > fs/xfs/Makefile | 1 > > > > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > new file mode 100644 > > > > > > > > > > index 000000000000..b228c2906de2 > > > > > > > > > > --- /dev/null > > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > @@ -0,0 +1,581 @@ > > > > > ... > > > > > > > > > > + > > > > > > > > > > +/* > > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > > > > + * unused space back into the AG. > > > > > > > > > > + */ > > > > > > > > > > +STATIC int > > > > > > > > > > +xrep_abt_commit_new( > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > > > > + int log_flags) > > > > > > > > > > +{ > > > > > > > > > > + int error; > > > > > > > > > > + > > > > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > > > > + > > > > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > > > > + if (error) > > > > > > > > > > + return error; > > > > > > > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. Assuming 4K > > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > > > > for both btrees. 
If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > > > > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > > > > transaction every 1000 invalidations or so... > > > > > > > > > > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > > > > would need to be in-core, but rolling on some fixed/safe interval sounds > > > > > reasonable to me. > > > > > > > > > > > > currently be an issue, but it might be worth putting something down in a > > > > > > > comment to note that this is a single transaction and we expect to not > > > > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > > > > whatever that value happens to be. > > > > > > > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > > > > + if (error) > > > > > > > > > > + return error; > > > > > > > > > > + > > > > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > > > > + return 0; > > > > > > > > > > +} > > > > > > > > > > + > > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > > > > +STATIC int > > > > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > + struct list_head *free_extents, > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > > > > +{ > > > > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > > > > + struct xrep_abt_extent *n; > > > > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > > > > + int error; > > > > > > > > > > + > > > > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > > > > + > > > > > > > > > > + /* > > > > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > > > > + */ > > > > > > > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > > > > if that's the case? > > > > > > > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > > > > them to set up new btree roots. > > > > > > > > > > > > > > > > > > > > > > Got it, thanks. > > > > > > > > > > > > > > > > > + if (!longest) > > > > > > > > > > + goto done; > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > > > > + longest->len, &oinfo); > > > > > > > > > > + list_del(&longest->list); > > > > > > > > > > + kmem_free(longest); > > > > > > > > > > + if (error) > > > > > > > > > > + return error; > > > > > > > > > > + > > > > > > > > > > + /* Insert records into the new btrees. 
*/ > > > > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > > > > + rae->len, &oinfo); > > > > > > > > > > + if (error) > > > > > > > > > > + return error; > > > > > > > > > > + list_del(&rae->list); > > > > > > > > > > + kmem_free(rae); > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > > > > rmapbt again on a retry. > > > > > > > > > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > > > > > filesystem in an offline state at some point before you can retry the > > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > > > > repair? The filesystem has to be mountable in that case in the first > > > > > > > place. If we've already reset and started reconstructing the allocation > > > > > > > btrees then I'd think those transactions would recover just fine on a > > > > > > > power loss or something (perhaps not in the event of some other > > > > > > > corruption related shutdown). > > > > > > > > > > > > Right, for the system crash case, whatever transactions committed should > > > > > > replay just fine, and you can even start up the online repair again, and > > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > > > > corruption) it should work just fine. > > > > > > > > > > > > If the fs went down because either (a) repair hit other corruption or > > > > > > (b) some other thread hit an error in some other part of the filesystem, > > > > > > then it's not so clear -- in (b) you could probably try again, but for > > > > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > > > > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > > > > online repair. I'm trying to think about scenarios where we should be > > > > > able to do an online repair, but we lose power or hit some kind of > > > > > transient error like a memory allocation failure before it completes. It > > > > > would be nice if the online repair itself didn't contribute (within > > > > > reason) to the inability to simply try again just because the fs was > > > > > close to -ENOSPC. > > > > > > > > Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we > > > > start modifying on-disk metadata. If that happens, we just free all the > > > > memory and bail out having done nothing. > > > > > > > > > For one, I think it's potentially confusing behavior. Second, it might > > > > > be concerning to regular users who perceive it as an online repair > > > > > leaving the fs in a worse off state. Us fs devs know that may not really > > > > > be the case, but I think we're better for addressing it if we can > > > > > reasonably do so. 
> > > > > > > > <nod> Further in the future I want to add the ability to offline an AG, > > > > so the worst that happens is that scrub turns the AG off, repair doesn't > > > > fix it, and the AG simply stays offline. That might give us the > > > > ability to survive cancelling the repair transaction, since if the AG's > > > > offline already anyway we could just throw away the dirty buffers and > > > > resurrect the AG later. I don't know, that's purely speculative. > > > > > > > > > > Perhaps the guideline here is that if the fs goes down more than once > > > > > > during online repair then unmount it and run xfs_repair. > > > > > > > > > > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > > > > tripping over other corruptions that fail to keep it active for the > > > > > duration of the repair. > > > > > > > > <nod> > > > > > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > > > > enough free space to repair? > > > > > > > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > > > > is, we discover those blocks simply so we can free them at the end. > > > > > > > Perhaps we could free them sooner or find a more clever means to > > > > > > > reallocate directly from that in-core list? I guess we have to consider > > > > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > > > > that the old blocks list is essentially invalidated once we reset the > > > > > > > btrees. > > > > > > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > > > > fix_freelist won't succeed. That complication is why it finds the > > > > > > longest extent in the unclaimed list and pushes that in first, then > > > > > > works on the rest of the extents. > > > > > > > > > > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > > > > block per-level? In conjunction, the agfl minimum size requirement grows > > > > > with the height of the tree, which implies available free space..? I > > > > > could be missing something, perhaps we have to account for the rmapbt in > > > > > that case as well? Regardless... 
> > > > > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > > > > in first (since we know that won't trigger a split), then reap the old > > > > > > alloc btree blocks, and then add everything else back in... > > > > > > > > > > > > > > > > I think it would be reasonable to seed the btree with the longest record > > > > > or some fixed number of longest records (~1/2 a root block, for example) > > > > > before making actual use of the btrees to reap the old blocks. I think > > > > > then you'd only have a very short window of a single block leak on a > > > > > poorly timed power loss and repair retry sequence before you start > > > > > actually freeing originally used space (which in practice, I think > > > > > solves the problem). > > > > > > > > > > Given that we're starting from empty, I wonder if another option may be > > > > > to over fill the agfl with old btree blocks or something. The first real > > > > > free should shift enough blocks back into the btrees to ensure the agfl > > > > > can be managed from that point forward, right? That may be more work > > > > > than it's worth though and/or a job for another patch. (FWIW, we also > > > > > have that NOSHRINK agfl fixup flag for userspace repair.) > > > > > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all > > > > the 4.19 stuff to xfsprogs. :) > > > > > > > > Looping back to something we discussed earlier in this thread, I'd > > > > prefer to hold off on converting the list of already-freed extents to > > > > xfs_bitmap because the same problem exists in all the repair functions > > > > of having to store a large number of records for the rebuilt btree, and > > > > maybe there's some way to <cough> use pageable memory for that, since > > > > the access patterns for that are append, sort, and iterate; for those > > > > three uses we don't necessarily require all the records to be in memory > > > > all the time. For the allocbt repair I expect the free space records to > > > > be far more numerous than the list of old bnobt/cntbt blocks. > > > > > > > > > > Ok, it's fair enough that we'll probably want to find some kind of > > > generic, more efficient technique for handling this across the various > > > applicable repair algorithms. > > > > > > One other high level thing that crossed my mind with regard to the > > > general btree reconstruction algorithms is whether we need to build up > > > this kind of central record list at all. For example, rather than slurp > > > up the entire list of btree records in-core, sort it and dump it back > > > out, could we take advantage of the fact that our existing on-disk > > > structure insertion mechanisms already handle out of order records > > > (simply stated, an extent free knows how to insert the associated record > > > at the right place in the space btrees)? For example, suppose we reset > > > the existing btrees first, then scanned the rmapbt and repopulated the > > > new btrees as records are discovered..? > > > > I tried that in an earlier draft of the bnobt repair function. The > > biggest problem with inserting as we go is dealing with the inevitable > > transaction rolls (right now we do after every record insertion to avoid > > playing games with guessing how much reservation is left). Btree > > cursor state can't survive transaction rolls because the transaction > > commit releases all the buffers that aren't bhold'en, and we can't bhold > > that many buffers across a _defer_finish. > > > > Ok, interesting. 
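For concreteness, the reordering floated above (seed the new btrees with the longest free extent, reap the old bnobt/cntbt blocks so they become allocatable again, then insert the remaining records) might look roughly like the sketch below. It reuses only helpers that already appear in this patch, but it is untested and glosses over the AGFL accounting questions raised above.

STATIC int
xrep_abt_rebuild_trees_reordered(
	struct xfs_scrub	*sc,
	struct list_head	*free_extents,
	struct xfs_bitmap	*old_allocbt_blocks)
{
	struct xfs_owner_info	oinfo;
	struct xrep_abt_extent	*rae;
	struct xrep_abt_extent	*n;
	struct xrep_abt_extent	*longest;
	int			error;

	/* Seed the new bnobt/cntbt with the longest extent; a single
	 * record can't split the freshly initialized root blocks. */
	xfs_rmap_skip_owner_update(&oinfo);
	longest = xrep_abt_get_longest(free_extents);
	if (longest) {
		error = xrep_abt_free_extent(sc,
				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno,
					       longest->bno),
				longest->len, &oinfo);
		list_del(&longest->list);
		kmem_free(longest);
		if (error)
			return error;
	}

	/* Reap the old allocbt blocks now so that later splits (and
	 * AGFL refills) can draw from that space. */
	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
	error = xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
			XFS_AG_RESV_NONE);
	if (error)
		return error;

	/* Insert the remaining free space records. */
	xfs_rmap_skip_owner_update(&oinfo);
	list_for_each_entry_safe(rae, n, free_extents, list) {
		error = xrep_abt_free_extent(sc,
				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
				rae->len, &oinfo);
		if (error)
			return error;
		list_del(&rae->list);
		kmem_free(rae);
	}
	return 0;
}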
> > Where do we need to run an xfs_defer_finish() during the reconstruction > sequence, btw? Not here, as I'm sure you were thinking. :) For the AG btrees themselves it's sufficient to roll the transaction. I suppose we could simply have a xfs_btree_bhold function that would bhold every buffer so that a cursor could survive a roll. Inode fork reconstruction is going to require _defer_finish, however. > I thought that would only run on final commit as opposed to > intermediate rolls. We could let the deferred items sit around until final commit, but I think I'd prefer to process them as soon as possible since iirc deferred items pin the log until they're finished. I would hope that userspace isn't banging on the log while repair runs, but it's certainly possible. > We could just try and make the automatic buffer relogging list a > dynamic allocation if there are enough held buffers in the > transaction. Hmm. Might be worth pursuing... > > So, that early draft spent a lot of time tearing down and reconstructing > > rmapbt cursors since the standard _btree_query_all isn't suited to that > > kind of usage. It was easily twice as slow on a RAM-backed disk just > > from the rmap cursor overhead and much more complex, so I rewrote it to > > be simpler. I also have a slight preference for not touching anything > > until we're absolutely sure we have all the data we need to repair the > > structure. > > > > Yes, I think that is sane in principle. I'm just a bit concerned about > how reliable that xfs_repair-like approach will be in the kernel longer > term, particularly once we start having to deal with large filesystems > and limited or contended memory, etc. We already have xfs_repair users > that need to tweak settings because there isn't enough memory available > to repair the fs. Granted that is for fs-wide repairs and the flipside > is that we know a single AG can only be up to 1TB. It's certainly > possible that putting some persistent backing behind the in-core data is > enough to resolve the problem (and the current approach is certainly > reasonable enough to me for the initial implementation). > > bjoin limitations aside, I wonder if a cursor roll mechanism that held > all of the cursor buffers, rolled the transaction and then rejoined all > said buffers would help us get around that. (Not sure I follow the early > prototype behavior, but it sounds like we had to restart the rmapbt > lookup over and over...). Correct. > Another caveat with that approach may be that I think we'd need to be > sure that the reconstruction operation doesn't ever need to update the > rmapbt while we're mid walk of the latter. <nod> Looking even farther back in my notes, that was also an issue -- fixing the free list causes blocks to go on or off the agfl, which causes rmapbt updates, which meant that the only way I could get in-place updates to work was to re-lookup where we were in the btree and also try to deal with any rmapbt entries that might have crept in as result of the record insertion. Getting the concurrency right for each repair function looked like a difficult problem to solve, but amassing all the records elsewhere and rebuilding was easy to understand. > That may be an issue for inode btree reconstruction, for example, > since it looks like inobt block allocation requires rmapbt updates. > We'd probably need some way to share (or invalidate) a cursor across > different contexts to deal with that. 
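A hypothetical helper for the "bhold every buffer attached to the cursor, roll, then rejoin" idea might look like the sketch below. The name is invented, it is not compile-tested, error unwinding is elided, and the bjoin caveats mentioned above still apply; it only illustrates the mechanism.

STATIC int
xfs_btree_roll_cursor(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	**tpp)
{
	struct xfs_buf		*bp;
	int			level;
	int			error;

	/* Hold the cursor's buffers so the commit doesn't unlock them. */
	for (level = 0; level < cur->bc_nlevels; level++) {
		bp = cur->bc_bufs[level];
		if (bp)
			xfs_trans_bhold(*tpp, bp);
	}

	error = xfs_trans_roll(tpp);
	if (error)
		return error;

	/* Rejoin the held buffers to the new transaction and point the
	 * cursor at it. */
	for (level = 0; level < cur->bc_nlevels; level++) {
		bp = cur->bc_bufs[level];
		if (bp)
			xfs_trans_bjoin(*tpp, bp);
	}
	cur->bc_tp = *tpp;
	return 0;
}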
I might pursue that strategy if we ever hit the point where we can't find space to store the records (see below). Another option could be to divert all deferred items for an AG, build a replacement btree in new space, then finish all the deferred items... but that's starting to get into offlineable AGs, which is its own project that I want to tackle later. (Not that much later, just not this cycle.) > > For other repair functions (like the data/attr fork repairs) we have to > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > > for as long as necessary to extract the extents we want. > > > > > The obvious problem is that we still have some checks that allow the > > > whole repair operation to bail out before we determine whether we can > > > start to rebuild the on-disk btrees. These are things like making sure > > > we can actually read the associated rmapbt blocks (i.e., no read errors > > > or verifier failures), basic record sanity checks, etc. But ISTM that > > > isn't anything we couldn't get around with a multi-pass implementation. > > > Secondary issues might be things like no longer being able to easily > > > insert the longest free extent range(s) first (meaning we'd have to > > > stuff the agfl with old btree blocks or figure out some other approach). > > > > Well, you could scan the rmapbt twice -- once to find the longest > > record, then again to do the actual insertion. > > > > Yep, that's what I meant by multi-pass. > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() > > > callout could not only detect corruption, but validate whether repair > > > (if requested) is possible based on the kind of checks that are > > > currently in the repair side rmapbt walkers. Thoughts?r > > > > Yes, scrub basically validates that for us now, with the notable > > exception of the notorious rmapbt scrubber, which doesn't > > cross-reference with inode block mappings because that would be a > > locking nightmare. > > > > > Are there future > > > changes that are better supported by an in-core tracking structure in > > > general (assuming we'll eventually replace the linked lists with > > > something more efficient) as opposed to attempting to optimize out the > > > need for that tracking at all? > > > > Well, I was thinking that we could just allocate a memfd (or a file on > > the same xfs once we have AG offlining) and store the records in there. > > That saves us the list_head overhead and potentially enables access to a > > lot more storage than pinning things in RAM. > > > > Would using the same fs mean we have to store the repair data in a > separate AG, or somehow locate/use free space in the target AG? As part of building an "offline AG" feature we'd presumably have to teach the allocators to avoid the offline AGs for allocations, which would make it so that we could host the repair data files in the same XFS that's being fixed. That seems a little risky to me, but the disk is probably larger than mem+swap. > presume either way we'd have to ensure that AG is either consistent or > locked out from outside I/O. If we have the total record count we can We usually don't, but for the btrees that have their own record/blocks counters we might be able to guess a number, fallocate it, and see if that doesn't ENOSPC. > preallocate the file and hope there is no such other free space > corruption or something that would allow some other task to mess with > our blocks. 
I'm a little skeptical overall on relying on a corrupted > filesystem to store repair data, but perhaps there are ways to mitigate > the risks. Store it elsewhere? /home for root repairs, /root for any other repair... though if we're going to do that, why not just add a swap file temporarily? > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > swappable or something? It's supposed to be. The quick test I ran (allocate a memfd, write 1GB of junk to it on a VM with 400M of RAM) seemed to push about 980MB into the swap file. > If so, that sounds a reasonable option provided the swap space > requirement can be made clear to users We can document it. I don't think it's any worse than xfs_repair being able to use up all the memory + swap... and since we're probably only going to be repairing one thing at a time, most likely scrub won't need as much memory. > and the failure characteristics aren't more severe than for userspace. > An online repair that puts the broader system at risk of OOM as > opposed to predictably failing gracefully may not be the most useful > tool. Agreed. One huge downside of memfd seems to be the lack of a mechanism for the vm to push back on us if we successfully write all we need to the memfd but then other processes need some memory. Obviously, if the memfd write itself comes up short or fails then we dump the memfd and error back to userspace. We might simply have to free array memory while we iterate the records to minimize the time spent at peak memory usage. --D > > Brian > > > --D > > > > > Brian > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > --D > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > + > > > > > > > > > > +done: > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > +} > > > > > > > > > > + > > > > > > > > > ... > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > return diff; > > > > > > > > > > } > > > > > > > > > > + > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > +bool > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > +{ > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > Good suggestion, thank you! 
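With RB_EMPTY_ROOT() the helper would presumably collapse to something like this (untested sketch, same semantics as the quoted version):

bool
xfs_extent_busy_list_empty(
	struct xfs_perag	*pag)
{
	bool			res;

	spin_lock(&pag->pagb_lock);
	res = RB_EMPTY_ROOT(&pag->pagb_tree);
	spin_unlock(&pag->pagb_lock);
	return res;
}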
> > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > + return false; > > > > > > > > > > + } > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > + return true; > > > > > > > > > > +} > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > + > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */
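The quick memfd swap experiment described in the message above can be approximated from userspace with something like the following (assumes glibc 2.27+ for memfd_create(); the sizes are arbitrary). Run it on a small-memory VM with swap enabled and watch free(1) or /proc/meminfo while it sleeps.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t chunk = 1U << 20;		/* 1 MiB per write */
	const size_t nr_chunks = 1024;		/* ~1 GiB total */
	char *buf;
	size_t i;
	int fd;

	fd = memfd_create("xrep_swap_test", 0);
	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	buf = malloc(chunk);
	if (!buf)
		return 1;
	memset(buf, 0x58, chunk);

	for (i = 0; i < nr_chunks; i++) {
		if (write(fd, buf, chunk) != (ssize_t)chunk) {
			perror("write");
			break;
		}
	}

	printf("wrote %zu MiB into the memfd; check swap usage now\n", i);
	pause();	/* keep the fd alive while you look around */
	return 0;
}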
On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > --- > > > > > > > > > > > fs/xfs/Makefile | 1 > > > > > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > new file mode 100644 > > > > > > > > > > > index 000000000000..b228c2906de2 > > > > > > > > > > > --- /dev/null > > > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > @@ -0,0 +1,581 @@ > > > > > > ... > > > > > > > > > > > + > > > > > > > > > > > +/* > > > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > > > > > + * unused space back into the AG. > > > > > > > > > > > + */ > > > > > > > > > > > +STATIC int > > > > > > > > > > > +xrep_abt_commit_new( > > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > > > > > + int log_flags) > > > > > > > > > > > +{ > > > > > > > > > > > + int error; > > > > > > > > > > > + > > > > > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > > > > > + > > > > > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > > > > > + if (error) > > > > > > > > > > > + return error; > > > > > > > > > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes. 
Assuming 4K > > > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > > > > > > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > > > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > > > > > transaction every 1000 invalidations or so... > > > > > > > > > > > > > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > > > > > would need to be in-core, but rolling on some fixed/safe interval sounds > > > > > > reasonable to me. > > > > > > > > > > > > > > currently be an issue, but it might be worth putting something down in a > > > > > > > > comment to note that this is a single transaction and we expect to not > > > > > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > > > > > whatever that value happens to be. > > > > > > > > > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > > > > > + if (error) > > > > > > > > > > > + return error; > > > > > > > > > > > + > > > > > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > > > > > + return 0; > > > > > > > > > > > +} > > > > > > > > > > > + > > > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > > > > > +STATIC int > > > > > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > > + struct list_head *free_extents, > > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > > > > > +{ > > > > > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > > > > > + struct xrep_abt_extent *n; > > > > > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > > > > > + int error; > > > > > > > > > > > + > > > > > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > > > > > + > > > > > > > > > > > + /* > > > > > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > > > > > + */ > > > > > > > > > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > > > > > if that's the case? > > > > > > > > > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > > > > > them to set up new btree roots. > > > > > > > > > > > > > > > > > > > > > > > > > Got it, thanks. 
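As a sketch of the "roll every ~1000 invalidations" idea discussed above: the bitmap iterator below is a made-up stand-in for walking old_allocbt_blocks, but the buffer and transaction calls are real, and xrep_roll_ag_trans() is the helper this series already uses. Not compile-tested.

STATIC int
xrep_abt_invalidate_blocks_batched(
	struct xfs_scrub	*sc,
	struct xfs_bitmap	*old_allocbt_blocks)
{
	struct xfs_buf		*bp;
	xfs_agblock_t		agbno;
	unsigned int		invalidated = 0;
	int			error;

	/* for_each_old_btree_block() is hypothetical. */
	for_each_old_btree_block(agbno, old_allocbt_blocks) {
		bp = xfs_trans_get_buf(sc->tp, sc->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(sc->mp, sc->sa.agno, agbno),
				XFS_FSB_TO_BB(sc->mp, 1), 0);
		if (!bp)
			return -ENOMEM;
		xfs_trans_binval(sc->tp, bp);

		/* Cap the number of log format items per transaction. */
		if (++invalidated % 1000 == 0) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
		}
	}
	return 0;
}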
> > > > > > > > > > > > > > > > > > > + if (!longest) > > > > > > > > > > > + goto done; > > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > > > > > + longest->len, &oinfo); > > > > > > > > > > > + list_del(&longest->list); > > > > > > > > > > > + kmem_free(longest); > > > > > > > > > > > + if (error) > > > > > > > > > > > + return error; > > > > > > > > > > > + > > > > > > > > > > > + /* Insert records into the new btrees. */ > > > > > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > > > > > + rae->len, &oinfo); > > > > > > > > > > > + if (error) > > > > > > > > > > > + return error; > > > > > > > > > > > + list_del(&rae->list); > > > > > > > > > > > + kmem_free(rae); > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > > > > > rmapbt again on a retry. > > > > > > > > > > > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > > > > > > filesystem in an offline state at some point before you can retry the > > > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > > > > > repair? The filesystem has to be mountable in that case in the first > > > > > > > > place. If we've already reset and started reconstructing the allocation > > > > > > > > btrees then I'd think those transactions would recover just fine on a > > > > > > > > power loss or something (perhaps not in the event of some other > > > > > > > > corruption related shutdown). > > > > > > > > > > > > > > Right, for the system crash case, whatever transactions committed should > > > > > > > replay just fine, and you can even start up the online repair again, and > > > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > > > > > corruption) it should work just fine. > > > > > > > > > > > > > > If the fs went down because either (a) repair hit other corruption or > > > > > > > (b) some other thread hit an error in some other part of the filesystem, > > > > > > > then it's not so clear -- in (b) you could probably try again, but for > > > > > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > > > > > > > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > > > > > online repair. I'm trying to think about scenarios where we should be > > > > > > able to do an online repair, but we lose power or hit some kind of > > > > > > transient error like a memory allocation failure before it completes. It > > > > > > would be nice if the online repair itself didn't contribute (within > > > > > > reason) to the inability to simply try again just because the fs was > > > > > > close to -ENOSPC. > > > > > > > > > > Agreed. 
Most of the, uh, opportunities to hit ENOMEM happen before we > > > > > start modifying on-disk metadata. If that happens, we just free all the > > > > > memory and bail out having done nothing. > > > > > > > > > > > For one, I think it's potentially confusing behavior. Second, it might > > > > > > be concerning to regular users who perceive it as an online repair > > > > > > leaving the fs in a worse off state. Us fs devs know that may not really > > > > > > be the case, but I think we're better for addressing it if we can > > > > > > reasonably do so. > > > > > > > > > > <nod> Further in the future I want to add the ability to offline an AG, > > > > > so the worst that happens is that scrub turns the AG off, repair doesn't > > > > > fix it, and the AG simply stays offline. That might give us the > > > > > ability to survive cancelling the repair transaction, since if the AG's > > > > > offline already anyway we could just throw away the dirty buffers and > > > > > resurrect the AG later. I don't know, that's purely speculative. > > > > > > > > > > > > Perhaps the guideline here is that if the fs goes down more than once > > > > > > > during online repair then unmount it and run xfs_repair. > > > > > > > > > > > > > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > > > > > tripping over other corruptions that fail to keep it active for the > > > > > > duration of the repair. > > > > > > > > > > <nod> > > > > > > > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > > > > > enough free space to repair? > > > > > > > > > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > > > > > is, we discover those blocks simply so we can free them at the end. > > > > > > > > Perhaps we could free them sooner or find a more clever means to > > > > > > > > reallocate directly from that in-core list? I guess we have to consider > > > > > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > > > > > that the old blocks list is essentially invalidated once we reset the > > > > > > > > btrees. 
> > > > > > > > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > > > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > > > > > fix_freelist won't succeed. That complication is why it finds the > > > > > > > longest extent in the unclaimed list and pushes that in first, then > > > > > > > works on the rest of the extents. > > > > > > > > > > > > > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > > > > > block per-level? In conjunction, the agfl minimum size requirement grows > > > > > > with the height of the tree, which implies available free space..? I > > > > > > could be missing something, perhaps we have to account for the rmapbt in > > > > > > that case as well? Regardless... > > > > > > > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > > > > > in first (since we know that won't trigger a split), then reap the old > > > > > > > alloc btree blocks, and then add everything else back in... > > > > > > > > > > > > > > > > > > > I think it would be reasonable to seed the btree with the longest record > > > > > > or some fixed number of longest records (~1/2 a root block, for example) > > > > > > before making actual use of the btrees to reap the old blocks. I think > > > > > > then you'd only have a very short window of a single block leak on a > > > > > > poorly timed power loss and repair retry sequence before you start > > > > > > actually freeing originally used space (which in practice, I think > > > > > > solves the problem). > > > > > > > > > > > > Given that we're starting from empty, I wonder if another option may be > > > > > > to over fill the agfl with old btree blocks or something. The first real > > > > > > free should shift enough blocks back into the btrees to ensure the agfl > > > > > > can be managed from that point forward, right? That may be more work > > > > > > than it's worth though and/or a job for another patch. (FWIW, we also > > > > > > have that NOSHRINK agfl fixup flag for userspace repair.) > > > > > > > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all > > > > > the 4.19 stuff to xfsprogs. :) > > > > > > > > > > Looping back to something we discussed earlier in this thread, I'd > > > > > prefer to hold off on converting the list of already-freed extents to > > > > > xfs_bitmap because the same problem exists in all the repair functions > > > > > of having to store a large number of records for the rebuilt btree, and > > > > > maybe there's some way to <cough> use pageable memory for that, since > > > > > the access patterns for that are append, sort, and iterate; for those > > > > > three uses we don't necessarily require all the records to be in memory > > > > > all the time. For the allocbt repair I expect the free space records to > > > > > be far more numerous than the list of old bnobt/cntbt blocks. > > > > > > > > > > > > > Ok, it's fair enough that we'll probably want to find some kind of > > > > generic, more efficient technique for handling this across the various > > > > applicable repair algorithms. > > > > > > > > One other high level thing that crossed my mind with regard to the > > > > general btree reconstruction algorithms is whether we need to build up > > > > this kind of central record list at all. 
For example, rather than slurp > > > > up the entire list of btree records in-core, sort it and dump it back > > > > out, could we take advantage of the fact that our existing on-disk > > > > structure insertion mechanisms already handle out of order records > > > > (simply stated, an extent free knows how to insert the associated record > > > > at the right place in the space btrees)? For example, suppose we reset > > > > the existing btrees first, then scanned the rmapbt and repopulated the > > > > new btrees as records are discovered..? > > > > > > I tried that in an earlier draft of the bnobt repair function. The > > > biggest problem with inserting as we go is dealing with the inevitable > > > transaction rolls (right now we do after every record insertion to avoid > > > playing games with guessing how much reservation is left). Btree > > > cursor state can't survive transaction rolls because the transaction > > > commit releases all the buffers that aren't bhold'en, and we can't bhold > > > that many buffers across a _defer_finish. > > > > > > > Ok, interesting. > > > > Where do we need to run an xfs_defer_finish() during the reconstruction > > sequence, btw? > > Not here, as I'm sure you were thinking. :) For the AG btrees > themselves it's sufficient to roll the transaction. I suppose we could > simply have a xfs_btree_bhold function that would bhold every buffer so > that a cursor could survive a roll. > > Inode fork reconstruction is going to require _defer_finish, however. > Ok, just wasn't sure if I missed something in the bits I've looked through so far.. > > I thought that would only run on final commit as opposed to > > intermediate rolls. > > We could let the deferred items sit around until final commit, but I > think I'd prefer to process them as soon as possible since iirc deferred > items pin the log until they're finished. I would hope that userspace > isn't banging on the log while repair runs, but it's certainly possible. > I was just surmising in general, not necessarily suggesting we change behavior. > > We could just try and make the automatic buffer relogging list a > > dynamic allocation if there are enough held buffers in the > > transaction. > > Hmm. Might be worth pursuing... > > > > So, that early draft spent a lot of time tearing down and reconstructing > > > rmapbt cursors since the standard _btree_query_all isn't suited to that > > > kind of usage. It was easily twice as slow on a RAM-backed disk just > > > from the rmap cursor overhead and much more complex, so I rewrote it to > > > be simpler. I also have a slight preference for not touching anything > > > until we're absolutely sure we have all the data we need to repair the > > > structure. > > > > > > > Yes, I think that is sane in principle. I'm just a bit concerned about > > how reliable that xfs_repair-like approach will be in the kernel longer > > term, particularly once we start having to deal with large filesystems > > and limited or contended memory, etc. We already have xfs_repair users > > that need to tweak settings because there isn't enough memory available > > to repair the fs. Granted that is for fs-wide repairs and the flipside > > is that we know a single AG can only be up to 1TB. It's certainly > > possible that putting some persistent backing behind the in-core data is > > enough to resolve the problem (and the current approach is certainly > > reasonable enough to me for the initial implementation). 
> > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held > > all of the cursor buffers, rolled the transaction and then rejoined all > > said buffers would help us get around that. (Not sure I follow the early > > prototype behavior, but it sounds like we had to restart the rmapbt > > lookup over and over...). > > Correct. > > > Another caveat with that approach may be that I think we'd need to be > > sure that the reconstruction operation doesn't ever need to update the > > rmapbt while we're mid walk of the latter. > > <nod> Looking even farther back in my notes, that was also an issue -- > fixing the free list causes blocks to go on or off the agfl, which > causes rmapbt updates, which meant that the only way I could get > in-place updates to work was to re-lookup where we were in the btree and > also try to deal with any rmapbt entries that might have crept in as > result of the record insertion. > > Getting the concurrency right for each repair function looked like a > difficult problem to solve, but amassing all the records elsewhere and > rebuilding was easy to understand. > Yeah. This all points to this kind of strategy being too complex to be worth the prospective benefits in the short term. Clearly we have several, potentially tricky roadblocks to work through before this can be made feasible. Thanks for the background, it's still useful to have this context to compare with whatever we may have to do to support a reclaimable memory approach. > > That may be an issue for inode btree reconstruction, for example, > > since it looks like inobt block allocation requires rmapbt updates. > > We'd probably need some way to share (or invalidate) a cursor across > > different contexts to deal with that. > > I might pursue that strategy if we ever hit the point where we can't > find space to store the records (see below). Another option could be to > divert all deferred items for an AG, build a replacement btree in new > space, then finish all the deferred items... but that's starting to get > into offlineable AGs, which is its own project that I want to tackle > later. > > (Not that much later, just not this cycle.) > *nod* > > > For other repair functions (like the data/attr fork repairs) we have to > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > > > for as long as necessary to extract the extents we want. > > > > > > > The obvious problem is that we still have some checks that allow the > > > > whole repair operation to bail out before we determine whether we can > > > > start to rebuild the on-disk btrees. These are things like making sure > > > > we can actually read the associated rmapbt blocks (i.e., no read errors > > > > or verifier failures), basic record sanity checks, etc. But ISTM that > > > > isn't anything we couldn't get around with a multi-pass implementation. > > > > Secondary issues might be things like no longer being able to easily > > > > insert the longest free extent range(s) first (meaning we'd have to > > > > stuff the agfl with old btree blocks or figure out some other approach). > > > > > > Well, you could scan the rmapbt twice -- once to find the longest > > > record, then again to do the actual insertion. > > > > > > > Yep, that's what I meant by multi-pass. > > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > > > the xfs_scrub_metadata() implementation? 
I'm wondering if the ->scrub() > > > > callout could not only detect corruption, but validate whether repair > > > > (if requested) is possible based on the kind of checks that are > > > > currently in the repair side rmapbt walkers. Thoughts?r > > > > > > Yes, scrub basically validates that for us now, with the notable > > > exception of the notorious rmapbt scrubber, which doesn't > > > cross-reference with inode block mappings because that would be a > > > locking nightmare. > > > > > > > Are there future > > > > changes that are better supported by an in-core tracking structure in > > > > general (assuming we'll eventually replace the linked lists with > > > > something more efficient) as opposed to attempting to optimize out the > > > > need for that tracking at all? > > > > > > Well, I was thinking that we could just allocate a memfd (or a file on > > > the same xfs once we have AG offlining) and store the records in there. > > > That saves us the list_head overhead and potentially enables access to a > > > lot more storage than pinning things in RAM. > > > > > > > Would using the same fs mean we have to store the repair data in a > > separate AG, or somehow locate/use free space in the target AG? > > As part of building an "offline AG" feature we'd presumably have to > teach the allocators to avoid the offline AGs for allocations, which > would make it so that we could host the repair data files in the same > XFS that's being fixed. That seems a little risky to me, but the disk > is probably larger than mem+swap. > Got it, so we'd use the remaining space in the fs outside of the target AG. ISTM that still presumes the rest of the fs is coherent, but I suppose the offline AG thing helps us with that. We'd just have to make sure we've shut down all currently corrupted AGs before we start to repair a particular corrupted one, and then hope there's still enough free space in the fs to proceed. That makes more sense, but I still agree that it seems risky in general. Technical risk aside, there's also usability concerns in that the local free space requirement is another bit of non-determinism around the ability to online repair vs. having to punt to xfs_repair, or if the repair consumes whatever free space remains in the fs to the detriment of whatever workload the user presumably wanted to keep the fs online for, etc. > > presume either way we'd have to ensure that AG is either consistent or > > locked out from outside I/O. If we have the total record count we can > > We usually don't, but for the btrees that have their own record/blocks > counters we might be able to guess a number, fallocate it, and see if > that doesn't ENOSPC. > > > preallocate the file and hope there is no such other free space > > corruption or something that would allow some other task to mess with > > our blocks. I'm a little skeptical overall on relying on a corrupted > > filesystem to store repair data, but perhaps there are ways to mitigate > > the risks. > > Store it elsewhere? /home for root repairs, /root for any other > repair... though if we're going to do that, why not just add a swap file > temporarily? > Indeed. The thought crossed my mind about whether we could do something like have an internal/isolated swap file for dedicated XFS allocations to avoid contention with the traditional swap. Userspace could somehow set it up or communicate to the kernel. I have no idea how realistic that is though or if there's a better interface for that kind of thing (i.e., file backed kmem cache?). 
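As a userspace illustration of the "append, sort, iterate" access pattern backed by a memfd (so cold record pages can be pushed to swap instead of being pinned in RAM), a minimal sketch might be the following; the record layout is made up and a kernel-side analogue would obviously look different, but the access pattern is the part being exercised.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct rec {
	uint32_t	bno;
	uint32_t	len;
};

static int rec_cmp(const void *a, const void *b)
{
	const struct rec *ra = a, *rb = b;

	return (ra->bno > rb->bno) - (ra->bno < rb->bno);
}

int main(void)
{
	size_t nr = 1000000, i;
	struct rec *recs;
	int fd;

	fd = memfd_create("xrep_records", 0);
	if (fd < 0 || ftruncate(fd, nr * sizeof(*recs)) < 0)
		return 1;

	recs = mmap(NULL, nr * sizeof(*recs), PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (recs == MAP_FAILED)
		return 1;

	/* append */
	for (i = 0; i < nr; i++) {
		recs[i].bno = rand();
		recs[i].len = 1;
	}

	/* sort */
	qsort(recs, nr, sizeof(*recs), rec_cmp);

	/* iterate */
	for (i = 1; i < nr; i++)
		if (recs[i - 1].bno > recs[i].bno)
			abort();

	printf("sorted %zu records in a memfd-backed array\n", nr);
	return 0;
}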
What _seems_ beneficial about that approach is we get (potentially external) persistent backing and memory reclaim ability with the traditional memory allocation model. ISTM that if we used a regular file, we'd need to deal with the traditional file interface somehow or another (file read/pagecache lookup -> record ??). We could repurpose some existing mechanism like the directory code or quota inode mechanism to use xfs buffers for that purpose, but I think that would require us to always use an internal inode. Allowing userspace to pass an fd/file passes that consideration on to the user, which might be more flexible. We could always warn about additional limitations if that fd happens to be based on the target fs. > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > swappable or something? > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > the swap file. > Ok. > > If so, that sounds a reasonable option provided the swap space > > requirement can be made clear to users > > We can document it. I don't think it's any worse than xfs_repair being > able to use up all the memory + swap... and since we're probably only > going to be repairing one thing at a time, most likely scrub won't need > as much memory. > Right, but as noted below, my concerns with the xfs_repair comparison are that 1.) the kernel generally has more of a limit on anonymous memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) it's not clear how effectively running the system out of memory via the kernel will behave from a failure perspective. IOW, xfs_repair can run the system out of memory but for the most part that ends up being a simple problem for the system: OOM kill the bloated xfs_repair process. For an online repair in a similar situation, I have no idea what's going to happen. The hope is that the online repair hits -ENOMEM and unwinds, but ISTM we'd still be at risk of other subsystems running into memory allocation problems, filling up swap, the OOM killer going after unrelated processes, etc. What if, for example, the OOM killer starts picking off processes in service to a running online repair that immediately consumes freed up memory until the system is borked? I don't know how likely that is or if it really ends up much different from the analogous xfs_repair situation. My only point right now is that failure scenario is something we should explore for any solution we ultimately consider because it may be an unexpected use case of the underlying mechanism. (To the contrary, just using a cached file seems a natural fit from that perspective.) > > and the failure characteristics aren't more severe than for userspace. > > An online repair that puts the broader system at risk of OOM as > > opposed to predictably failing gracefully may not be the most useful > > tool. > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > for the vm to push back on us if we successfully write all we need to > the memfd but then other processes need some memory. Obviously, if the > memfd write itself comes up short or fails then we dump the memfd and > error back to userspace. We might simply have to free array memory > while we iterate the records to minimize the time spent at peak memory > usage. > Hm, yeah. 
Some kind of fixed/relative size in-core memory pool approach may simplify things because we could allocate it up front and know right away whether we just don't have enough memory available to repair. Brian > --D
On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > > > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree. > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > > > --- > > > > > > > > > > > > fs/xfs/Makefile | 1 > > > > > > > > > > > > fs/xfs/scrub/alloc.c | 1 > > > > > > > > > > > > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > > > > > > > > > > > > fs/xfs/scrub/common.c | 8 + > > > > > > > > > > > > fs/xfs/scrub/repair.h | 2 > > > > > > > > > > > > fs/xfs/scrub/scrub.c | 4 > > > > > > > > > > > > fs/xfs/scrub/trace.h | 2 > > > > > > > > > > > > fs/xfs/xfs_extent_busy.c | 14 + > > > > > > > > > > > > fs/xfs/xfs_extent_busy.h | 2 > > > > > > > > > > > > 9 files changed, 610 insertions(+), 5 deletions(-) > > > > > > > > > > > > create mode 100644 fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > new file mode 100644 > > > > > > > > > > > > index 000000000000..b228c2906de2 > > > > > > > > > > > > --- /dev/null > > > > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c > > > > > > > > > > > > @@ -0,0 +1,581 @@ > > > > > > > ... > > > > > > > > > > > > + > > > > > > > > > > > > +/* > > > > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing > > > > > > > > > > > > + * unused space back into the AG. > > > > > > > > > > > > + */ > > > > > > > > > > > > +STATIC int > > > > > > > > > > > > +xrep_abt_commit_new( > > > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks, > > > > > > > > > > > > + int log_flags) > > > > > > > > > > > > +{ > > > > > > > > > > > > + int error; > > > > > > > > > > > > + > > > > > > > > > > > > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > > > > > > > > > > > > + > > > > > > > > > > > > + /* Invalidate the old freespace btree blocks and commit. */ > > > > > > > > > > > > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > > > > > > > > > > > > + if (error) > > > > > > > > > > > > + return error; > > > > > > > > > > > > > > > > > > > > > > It looks like the above invalidation all happens in the same > > > > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea > > > > > > > > > > > how many log formats we can get away with in this single transaction? > > > > > > > > > > > > > > > > > > > > Hm... 
well, on my computer a log format is ~88 bytes. Assuming 4K > > > > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and > > > > > > > > > > two btrees, the tree could be up to ~270 million records. Assuming ~505 > > > > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks > > > > > > > > > > for both btrees. If we invalidate both, that's ~46M of RAM? > > > > > > > > > > > > > > > > > > > > > > > > > > > > I was thinking more about transaction reservation than RAM. It may not > > > > > > > > > > > > > > > > Hmm. tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's > > > > > > > > about ... ~7300 log format items? Not a lot, maybe it should roll the > > > > > > > > transaction every 1000 invalidations or so... > > > > > > > > > > > > > > > > > > > > > > I'm not really sure what categorizes as a lot here given that the blocks > > > > > > > would need to be in-core, but rolling on some fixed/safe interval sounds > > > > > > > reasonable to me. > > > > > > > > > > > > > > > > currently be an issue, but it might be worth putting something down in a > > > > > > > > > comment to note that this is a single transaction and we expect to not > > > > > > > > > have to invalidate more than N (ballpark) blocks in a single go, > > > > > > > > > whatever that value happens to be. > > > > > > > > > > > > > > > > > > > > > + error = xrep_roll_ag_trans(sc); > > > > > > > > > > > > + if (error) > > > > > > > > > > > > + return error; > > > > > > > > > > > > + > > > > > > > > > > > > + /* Now that we've succeeded, mark the incore state valid again. */ > > > > > > > > > > > > + sc->sa.pag->pagf_init = 1; > > > > > > > > > > > > + return 0; > > > > > > > > > > > > +} > > > > > > > > > > > > + > > > > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */ > > > > > > > > > > > > +STATIC int > > > > > > > > > > > > +xrep_abt_rebuild_trees( > > > > > > > > > > > > + struct xfs_scrub *sc, > > > > > > > > > > > > + struct list_head *free_extents, > > > > > > > > > > > > + struct xfs_bitmap *old_allocbt_blocks) > > > > > > > > > > > > +{ > > > > > > > > > > > > + struct xfs_owner_info oinfo; > > > > > > > > > > > > + struct xrep_abt_extent *rae; > > > > > > > > > > > > + struct xrep_abt_extent *n; > > > > > > > > > > > > + struct xrep_abt_extent *longest; > > > > > > > > > > > > + int error; > > > > > > > > > > > > + > > > > > > > > > > > > + xfs_rmap_skip_owner_update(&oinfo); > > > > > > > > > > > > + > > > > > > > > > > > > + /* > > > > > > > > > > > > + * Insert the longest free extent in case it's necessary to > > > > > > > > > > > > + * refresh the AGFL with multiple blocks. If there is no longest > > > > > > > > > > > > + * extent, we had exactly the free space we needed; we're done. > > > > > > > > > > > > + */ > > > > > > > > > > > > > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the > > > > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC > > > > > > > > > > > if that's the case? > > > > > > > > > > > > > > > > > > > > > > > + longest = xrep_abt_get_longest(free_extents); > > > > > > > > > > > > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two > > > > > > > > > > new btree roots in xrep_abt_reset_btrees. If free_extents is an empty > > > > > > > > > > list here, then we found exactly two blocks worth of free space and used > > > > > > > > > > them to set up new btree roots. 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > Got it, thanks. > > > > > > > > > > > > > > > > > > > > > + if (!longest) > > > > > > > > > > > > + goto done; > > > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > > > > > > > > > > > > + longest->len, &oinfo); > > > > > > > > > > > > + list_del(&longest->list); > > > > > > > > > > > > + kmem_free(longest); > > > > > > > > > > > > + if (error) > > > > > > > > > > > > + return error; > > > > > > > > > > > > + > > > > > > > > > > > > + /* Insert records into the new btrees. */ > > > > > > > > > > > > + list_for_each_entry_safe(rae, n, free_extents, list) { > > > > > > > > > > > > + error = xrep_abt_free_extent(sc, > > > > > > > > > > > > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > > > > > > > > > > > > + rae->len, &oinfo); > > > > > > > > > > > > + if (error) > > > > > > > > > > > > + return error; > > > > > > > > > > > > + list_del(&rae->list); > > > > > > > > > > > > + kmem_free(rae); > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the > > > > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we > > > > > > > > > > > fail or crash at this point, we leave the allocbts in a partially > > > > > > > > > > > constructed state. I take it that is Ok with respect to the broader > > > > > > > > > > > repair algorithm because we'd essentially start over by inspecting the > > > > > > > > > > > rmapbt again on a retry. > > > > > > > > > > > > > > > > > > > > Right. Though in the crash/shutdown case, you'll end up with the > > > > > > > > > > filesystem in an offline state at some point before you can retry the > > > > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Can we really assume that if we're already up and running an online > > > > > > > > > repair? The filesystem has to be mountable in that case in the first > > > > > > > > > place. If we've already reset and started reconstructing the allocation > > > > > > > > > btrees then I'd think those transactions would recover just fine on a > > > > > > > > > power loss or something (perhaps not in the event of some other > > > > > > > > > corruption related shutdown). > > > > > > > > > > > > > > > > Right, for the system crash case, whatever transactions committed should > > > > > > > > replay just fine, and you can even start up the online repair again, and > > > > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap > > > > > > > > corruption) it should work just fine. > > > > > > > > > > > > > > > > If the fs went down because either (a) repair hit other corruption or > > > > > > > > (b) some other thread hit an error in some other part of the filesystem, > > > > > > > > then it's not so clear -- in (b) you could probably try again, but for > > > > > > > > (a) you'll definitely have to unmount and run xfs_repair. > > > > > > > > > > > > > > > > > > > > > > Indeed, there are certainly cases where we simply won't be able to do an > > > > > > > online repair. I'm trying to think about scenarios where we should be > > > > > > > able to do an online repair, but we lose power or hit some kind of > > > > > > > transient error like a memory allocation failure before it completes. 
It > > > > > > > would be nice if the online repair itself didn't contribute (within > > > > > > > reason) to the inability to simply try again just because the fs was > > > > > > > close to -ENOSPC. > > > > > > > > > > > > Agreed. Most of the, uh, opportunities to hit ENOMEM happen before we > > > > > > start modifying on-disk metadata. If that happens, we just free all the > > > > > > memory and bail out having done nothing. > > > > > > > > > > > > > For one, I think it's potentially confusing behavior. Second, it might > > > > > > > be concerning to regular users who perceive it as an online repair > > > > > > > leaving the fs in a worse off state. Us fs devs know that may not really > > > > > > > be the case, but I think we're better for addressing it if we can > > > > > > > reasonably do so. > > > > > > > > > > > > <nod> Further in the future I want to add the ability to offline an AG, > > > > > > so the worst that happens is that scrub turns the AG off, repair doesn't > > > > > > fix it, and the AG simply stays offline. That might give us the > > > > > > ability to survive cancelling the repair transaction, since if the AG's > > > > > > offline already anyway we could just throw away the dirty buffers and > > > > > > resurrect the AG later. I don't know, that's purely speculative. > > > > > > > > > > > > > > Perhaps the guideline here is that if the fs goes down more than once > > > > > > > > during online repair then unmount it and run xfs_repair. > > > > > > > > > > > > > > > > > > > > > > Yep, I think that makes sense if the filesystem or repair itself is > > > > > > > tripping over other corruptions that fail to keep it active for the > > > > > > > duration of the repair. > > > > > > > > > > > > <nod> > > > > > > > > > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here > > > > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't > > > > > > > > > > > necessarily have infinite retries to make sure this completes. IOW, > > > > > > > > > > > suppose that a first repair attempt finds just enough free space to > > > > > > > > > > > construct new trees, gets far enough along to consume most of that free > > > > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt > > > > > > > > > > > includes the btree blocks allocated during the previous failed repair > > > > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have > > > > > > > > > > > enough free space to repair? > > > > > > > > > > > > > > > > > > > > Yes, that's a risk of running the free space repair. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old > > > > > > > > > allocation btree blocks necessary once we commit the btree resets? If > > > > > > > > > not, could we remove those entries before we start tree reconstruction? > > > > > > > > > > > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it > > > > > > > > > is, we discover those blocks simply so we can free them at the end. > > > > > > > > > Perhaps we could free them sooner or find a more clever means to > > > > > > > > > reallocate directly from that in-core list? I guess we have to consider > > > > > > > > > whether they were really valid/sane btree blocks, but either way ISTM > > > > > > > > > that the old blocks list is essentially invalidated once we reset the > > > > > > > > > btrees. 
> > > > > > > > > > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and > > > > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a > > > > > > > > record causes a btree split we'll pull blocks from the AGFL, and if > > > > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then > > > > > > > > fix_freelist won't succeed. That complication is why it finds the > > > > > > > > longest extent in the unclaimed list and pushes that in first, then > > > > > > > > works on the rest of the extents. > > > > > > > > > > > > > > > > > > > > > > Hmm, but doesn't a btree split require at least one full space btree > > > > > > > block per-level? In conjunction, the agfl minimum size requirement grows > > > > > > > with the height of the tree, which implies available free space..? I > > > > > > > could be missing something, perhaps we have to account for the rmapbt in > > > > > > > that case as well? Regardless... > > > > > > > > > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent > > > > > > > > in first (since we know that won't trigger a split), then reap the old > > > > > > > > alloc btree blocks, and then add everything else back in... > > > > > > > > > > > > > > > > > > > > > > I think it would be reasonable to seed the btree with the longest record > > > > > > > or some fixed number of longest records (~1/2 a root block, for example) > > > > > > > before making actual use of the btrees to reap the old blocks. I think > > > > > > > then you'd only have a very short window of a single block leak on a > > > > > > > poorly timed power loss and repair retry sequence before you start > > > > > > > actually freeing originally used space (which in practice, I think > > > > > > > solves the problem). > > > > > > > > > > > > > > Given that we're starting from empty, I wonder if another option may be > > > > > > > to over fill the agfl with old btree blocks or something. The first real > > > > > > > free should shift enough blocks back into the btrees to ensure the agfl > > > > > > > can be managed from that point forward, right? That may be more work > > > > > > > than it's worth though and/or a job for another patch. (FWIW, we also > > > > > > > have that NOSHRINK agfl fixup flag for userspace repair.) > > > > > > > > > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all > > > > > > the 4.19 stuff to xfsprogs. :) > > > > > > > > > > > > Looping back to something we discussed earlier in this thread, I'd > > > > > > prefer to hold off on converting the list of already-freed extents to > > > > > > xfs_bitmap because the same problem exists in all the repair functions > > > > > > of having to store a large number of records for the rebuilt btree, and > > > > > > maybe there's some way to <cough> use pageable memory for that, since > > > > > > the access patterns for that are append, sort, and iterate; for those > > > > > > three uses we don't necessarily require all the records to be in memory > > > > > > all the time. For the allocbt repair I expect the free space records to > > > > > > be far more numerous than the list of old bnobt/cntbt blocks. > > > > > > > > > > > > > > > > Ok, it's fair enough that we'll probably want to find some kind of > > > > > generic, more efficient technique for handling this across the various > > > > > applicable repair algorithms. 
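To make the ordering discussed above concrete, here is a rough sketch built only from the helpers already quoted in this thread, and not the author's final code, of "seed the new btrees with the longest extent, reap the old allocbt blocks, then insert the remaining records":

STATIC int
xrep_abt_rebuild_trees_reordered(
        struct xfs_scrub        *sc,
        struct list_head        *free_extents,
        struct xfs_bitmap       *old_allocbt_blocks)
{
        struct xfs_owner_info   oinfo;
        struct xrep_abt_extent  *rae;
        struct xrep_abt_extent  *n;
        struct xrep_abt_extent  *longest;
        int                     error;

        /* Seed the new bnobt/cntbt with the longest extent; no split possible. */
        xfs_rmap_skip_owner_update(&oinfo);
        longest = xrep_abt_get_longest(free_extents);
        if (longest) {
                error = xrep_abt_free_extent(sc,
                                XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
                                longest->len, &oinfo);
                list_del(&longest->list);
                kmem_free(longest);
                if (error)
                        return error;
        }

        /* Reap the old btree blocks while the new trees are still tiny. */
        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
        error = xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
                        XFS_AG_RESV_NONE);
        if (error)
                return error;

        /* Now insert everything else. */
        xfs_rmap_skip_owner_update(&oinfo);
        list_for_each_entry_safe(rae, n, free_extents, list) {
                error = xrep_abt_free_extent(sc,
                                XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
                                rae->len, &oinfo);
                if (error)
                        return error;
                list_del(&rae->list);
                kmem_free(rae);
        }
        return 0;
}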
> > > > > > > > > > One other high level thing that crossed my mind with regard to the > > > > > general btree reconstruction algorithms is whether we need to build up > > > > > this kind of central record list at all. For example, rather than slurp > > > > > up the entire list of btree records in-core, sort it and dump it back > > > > > out, could we take advantage of the fact that our existing on-disk > > > > > structure insertion mechanisms already handle out of order records > > > > > (simply stated, an extent free knows how to insert the associated record > > > > > at the right place in the space btrees)? For example, suppose we reset > > > > > the existing btrees first, then scanned the rmapbt and repopulated the > > > > > new btrees as records are discovered..? > > > > > > > > I tried that in an earlier draft of the bnobt repair function. The > > > > biggest problem with inserting as we go is dealing with the inevitable > > > > transaction rolls (right now we do after every record insertion to avoid > > > > playing games with guessing how much reservation is left). Btree > > > > cursor state can't survive transaction rolls because the transaction > > > > commit releases all the buffers that aren't bhold'en, and we can't bhold > > > > that many buffers across a _defer_finish. > > > > > > > > > > Ok, interesting. > > > > > > Where do we need to run an xfs_defer_finish() during the reconstruction > > > sequence, btw? > > > > Not here, as I'm sure you were thinking. :) For the AG btrees > > themselves it's sufficient to roll the transaction. I suppose we could > > simply have a xfs_btree_bhold function that would bhold every buffer so > > that a cursor could survive a roll. > > > > Inode fork reconstruction is going to require _defer_finish, however. > > > > Ok, just wasn't sure if I missed something in the bits I've looked > through so far.. > > > > I thought that would only run on final commit as opposed to > > > intermediate rolls. > > > > We could let the deferred items sit around until final commit, but I > > think I'd prefer to process them as soon as possible since iirc deferred > > items pin the log until they're finished. I would hope that userspace > > isn't banging on the log while repair runs, but it's certainly possible. > > > > I was just surmising in general, not necessarily suggesting we change > behavior. Oh, ok. Sorry, I misinterpreted you. :) > > > We could just try and make the automatic buffer relogging list a > > > dynamic allocation if there are enough held buffers in the > > > transaction. > > > > Hmm. Might be worth pursuing... > > > > > > So, that early draft spent a lot of time tearing down and reconstructing > > > > rmapbt cursors since the standard _btree_query_all isn't suited to that > > > > kind of usage. It was easily twice as slow on a RAM-backed disk just > > > > from the rmap cursor overhead and much more complex, so I rewrote it to > > > > be simpler. I also have a slight preference for not touching anything > > > > until we're absolutely sure we have all the data we need to repair the > > > > structure. > > > > > > > > > > Yes, I think that is sane in principle. I'm just a bit concerned about > > > how reliable that xfs_repair-like approach will be in the kernel longer > > > term, particularly once we start having to deal with large filesystems > > > and limited or contended memory, etc. We already have xfs_repair users > > > that need to tweak settings because there isn't enough memory available > > > to repair the fs. 
Granted that is for fs-wide repairs and the flipside > > > is that we know a single AG can only be up to 1TB. It's certainly > > > possible that putting some persistent backing behind the in-core data is > > > enough to resolve the problem (and the current approach is certainly > > > reasonable enough to me for the initial implementation). > > > > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held > > > all of the cursor buffers, rolled the transaction and then rejoined all > > > said buffers would help us get around that. (Not sure I follow the early > > > prototype behavior, but it sounds like we had to restart the rmapbt > > > lookup over and over...). > > > > Correct. > > > > > Another caveat with that approach may be that I think we'd need to be > > > sure that the reconstruction operation doesn't ever need to update the > > > rmapbt while we're mid walk of the latter. > > > > <nod> Looking even farther back in my notes, that was also an issue -- > > fixing the free list causes blocks to go on or off the agfl, which > > causes rmapbt updates, which meant that the only way I could get > > in-place updates to work was to re-lookup where we were in the btree and > > also try to deal with any rmapbt entries that might have crept in as > > result of the record insertion. > > > > Getting the concurrency right for each repair function looked like a > > difficult problem to solve, but amassing all the records elsewhere and > > rebuilding was easy to understand. > > > > Yeah. This all points to this kind of strategy being too complex to be > worth the prospective benefits in the short term. Clearly we have > several, potentially tricky roadblocks to work through before this can > be made feasible. Thanks for the background, it's still useful to have > this context to compare with whatever we may have to do to support a > reclaimable memory approach. <nod> Reclaimable memfd "memory" isn't too difficult, we can call kernel_read and kernel_write, though lockdep gets pretty mad about xfs taking sb_start_write (on the memfd filesystem) at the same time it has sb_starT_write on the xfs (not to mention the stack usage) so I had to throw in the extra twist of delegating the actual file io to a workqueue item (a la xfs_btree_split). > > > That may be an issue for inode btree reconstruction, for example, > > > since it looks like inobt block allocation requires rmapbt updates. > > > We'd probably need some way to share (or invalidate) a cursor across > > > different contexts to deal with that. > > > > I might pursue that strategy if we ever hit the point where we can't > > find space to store the records (see below). Another option could be to > > divert all deferred items for an AG, build a replacement btree in new > > space, then finish all the deferred items... but that's starting to get > > into offlineable AGs, which is its own project that I want to tackle > > later. > > > > (Not that much later, just not this cycle.) > > > > *nod* > > > > > For other repair functions (like the data/attr fork repairs) we have to > > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > > > > for as long as necessary to extract the extents we want. > > > > > > > > > The obvious problem is that we still have some checks that allow the > > > > > whole repair operation to bail out before we determine whether we can > > > > > start to rebuild the on-disk btrees. 
These are things like making sure > > > > > we can actually read the associated rmapbt blocks (i.e., no read errors > > > > > or verifier failures), basic record sanity checks, etc. But ISTM that > > > > > isn't anything we couldn't get around with a multi-pass implementation. > > > > > Secondary issues might be things like no longer being able to easily > > > > > insert the longest free extent range(s) first (meaning we'd have to > > > > > stuff the agfl with old btree blocks or figure out some other approach). > > > > > > > > Well, you could scan the rmapbt twice -- once to find the longest > > > > record, then again to do the actual insertion. > > > > > > > > > > Yep, that's what I meant by multi-pass. > > > > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() > > > > > callout could not only detect corruption, but validate whether repair > > > > > (if requested) is possible based on the kind of checks that are > > > > > currently in the repair side rmapbt walkers. Thoughts?r > > > > > > > > Yes, scrub basically validates that for us now, with the notable > > > > exception of the notorious rmapbt scrubber, which doesn't > > > > cross-reference with inode block mappings because that would be a > > > > locking nightmare. > > > > > > > > > Are there future > > > > > changes that are better supported by an in-core tracking structure in > > > > > general (assuming we'll eventually replace the linked lists with > > > > > something more efficient) as opposed to attempting to optimize out the > > > > > need for that tracking at all? > > > > > > > > Well, I was thinking that we could just allocate a memfd (or a file on > > > > the same xfs once we have AG offlining) and store the records in there. > > > > That saves us the list_head overhead and potentially enables access to a > > > > lot more storage than pinning things in RAM. > > > > > > > > > > Would using the same fs mean we have to store the repair data in a > > > separate AG, or somehow locate/use free space in the target AG? > > > > As part of building an "offline AG" feature we'd presumably have to > > teach the allocators to avoid the offline AGs for allocations, which > > would make it so that we could host the repair data files in the same > > XFS that's being fixed. That seems a little risky to me, but the disk > > is probably larger than mem+swap. > > > > Got it, so we'd use the remaining space in the fs outside of the target > AG. ISTM that still presumes the rest of the fs is coherent, but I > suppose the offline AG thing helps us with that. We'd just have to make > sure we've shut down all currently corrupted AGs before we start to > repair a particular corrupted one, and then hope there's still enough > free space in the fs to proceed. That's a pretty big hope. :) I think for now > That makes more sense, but I still agree that it seems risky in general. > Technical risk aside, there's also usability concerns in that the local > free space requirement is another bit of non-determinism I don't think it's non-deterministic, it's just hard for the filesystem to communicate to the user/admin ahead of time. Roughly speaking, we need to have about as much disk space for the new btree as we had allocated for the old one. 
As far as memory requirements go, in last week's revising of the patches I compressed the in-memory record structs down about as far as possible; with the removal of the list heads, the memory requirements drop by 30-60%. We require the same amount of memory as would be needed to store all of the records in the leaf nodes, and no more, and we can use swap space to do it. > around the ability to online repair vs. having to punt to xfs_repair, > or if the repair consumes whatever free space remains in the fs to the > detriment of whatever workload the user presumably wanted to keep the > fs online for, etc. I've occasionally thought that future xfs_scrub could ask the kernel to estimate how much disk and memory it will need for the repair (and whether the disk space requirement is fs-scope or AG-scope); then it could forego a repair action and recommend xfs_repair if running the online repair would take the system below some configurable threshold. > > > presume either way we'd have to ensure that AG is either consistent or > > > locked out from outside I/O. If we have the total record count we can > > > > We usually don't, but for the btrees that have their own record/blocks > > counters we might be able to guess a number, fallocate it, and see if > > that doesn't ENOSPC. > > > > > preallocate the file and hope there is no such other free space > > > corruption or something that would allow some other task to mess with > > > our blocks. I'm a little skeptical overall on relying on a corrupted > > > filesystem to store repair data, but perhaps there are ways to mitigate > > > the risks. > > > > Store it elsewhere? /home for root repairs, /root for any other > > repair... though if we're going to do that, why not just add a swap file > > temporarily? > > > > Indeed. The thought crossed my mind about whether we could do something > like have an internal/isolated swap file for dedicated XFS allocations > to avoid contention with the traditional swap. Heh, I think e2fsck has some feature like that where you can pass it a swap file. No idea how much good that does on modern systems where there's one huge partition... :) > Userspace could somehow set it up or communicate to the kernel. I have > no idea how realistic that is though or if there's a better interface > for that kind of thing (i.e., file backed kmem cache?). I looked, and there aren't any other mechanisms for unpinnned kernel memory allocations. > What _seems_ beneficial about that approach is we get (potentially > external) persistent backing and memory reclaim ability with the > traditional memory allocation model. > > ISTM that if we used a regular file, we'd need to deal with the > traditional file interface somehow or another (file read/pagecache > lookup -> record ??). Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so all we need is a (struct file *). > We could repurpose some existing mechanism like the directory code or > quota inode mechanism to use xfs buffers for that purpose, but I think > that would require us to always use an internal inode. Allowing > userspace to pass an fd/file passes that consideration on to the user, > which might be more flexible. We could always warn about additional > limitations if that fd happens to be based on the target fs. <nod> A second advantage of the struct file/kernel_{read,write} approach is that we if we ever decide to let userspace pass in a fd, it's trivial to feed that struct file to the kernel io routines instead of a memfd one. 
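As a purely illustrative picture of the struct file / kernel_{read,write} approach, the sketch below appends and reads back fixed-size records through an unlinked tmpfs file. The xrep_xfile name and its helpers are invented here, and the lockdep/stack issue mentioned above is ignored; the real prototype reportedly pushes the actual I/O to a workqueue.

#include <linux/fs.h>
#include <linux/err.h>
#include <linux/shmem_fs.h>

/* Pageable record store backed by an unlinked tmpfs file (illustration only). */
struct xrep_xfile {
        struct file             *file;
        loff_t                  size;
};

static int
xrep_xfile_create(
        struct xrep_xfile       *xf)
{
        xf->file = shmem_file_setup("XFS online repair records", 0, 0);
        if (IS_ERR(xf->file))
                return PTR_ERR(xf->file);
        xf->size = 0;
        return 0;
}

/* Append one record; treat a short write as running out of space. */
static int
xrep_xfile_append(
        struct xrep_xfile       *xf,
        const void              *rec,
        size_t                  len)
{
        ssize_t                 ret;

        ret = kernel_write(xf->file, rec, len, &xf->size);
        if (ret < 0)
                return ret;
        return ret == (ssize_t)len ? 0 : -ENOMEM;
}

/* Read back record @idx, e.g. while iterating the (sorted) output. */
static int
xrep_xfile_read(
        struct xrep_xfile       *xf,
        void                    *rec,
        size_t                  len,
        loff_t                  idx)
{
        loff_t                  pos = idx * len;
        ssize_t                 ret;

        ret = kernel_read(xf->file, rec, len, &pos);
        if (ret < 0)
                return ret;
        return ret == (ssize_t)len ? 0 : -ENODATA;
}

/* fput(xf->file) releases the whole store once the repair is done. */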
> > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > swappable or something? > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > the swap file. > > > > Ok. > > > > If so, that sounds a reasonable option provided the swap space > > > requirement can be made clear to users > > > > We can document it. I don't think it's any worse than xfs_repair being > > able to use up all the memory + swap... and since we're probably only > > going to be repairing one thing at a time, most likely scrub won't need > > as much memory. > > > > Right, but as noted below, my concerns with the xfs_repair comparison > are that 1.) the kernel generally has more of a limit on anonymous > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > it's not clear how effectively running the system out of memory via the > kernel will behave from a failure perspective. > > IOW, xfs_repair can run the system out of memory but for the most part > that ends up being a simple problem for the system: OOM kill the bloated > xfs_repair process. For an online repair in a similar situation, I have > no idea what's going to happen. Back in the days of the huge linked lists the oom killer would target other proceses because it doesn't know that the online repair thread is sitting on a ton of pinned kernel memory... > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > we'd still be at risk of other subsystems running into memory > allocation problems, filling up swap, the OOM killer going after > unrelated processes, etc. What if, for example, the OOM killer starts > picking off processes in service to a running online repair that > immediately consumes freed up memory until the system is borked? Yeah. One thing we /could/ do is register an oom notifier that would urge any running repair threads to bail out if they can. It seems to me that the oom killer blocks on the oom_notify_list chain, so our handler could wait until at least one thread exits before returning. > I don't know how likely that is or if it really ends up much different > from the analogous xfs_repair situation. My only point right now is > that failure scenario is something we should explore for any solution > we ultimately consider because it may be an unexpected use case of the > underlying mechanism. Ideally, online repair would always be the victim since we know we have a reasonable fallback. At least for memfd, however, I think the only clues we have to decide the question "is this memfd getting in the way of other threads?" is either seeing ENOMEM, short writes, or getting kicked by an oom notification. Maybe that'll be enough? > (To the contrary, just using a cached file seems a natural fit from > that perspective.) Same here. > > > and the failure characteristics aren't more severe than for userspace. > > > An online repair that puts the broader system at risk of OOM as > > > opposed to predictably failing gracefully may not be the most useful > > > tool. > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > for the vm to push back on us if we successfully write all we need to > > the memfd but then other processes need some memory. Obviously, if the > > memfd write itself comes up short or fails then we dump the memfd and > > error back to userspace. 
We might simply have to free array memory > > while we iterate the records to minimize the time spent at peak memory > > usage. > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > may simplify things because we could allocate it up front and know right > away whether we just don't have enough memory available to repair. Hmm. Apparently we actually /can/ call fallocate on memfd to grab all the pages at once, provided we have some guesstimate beforehand of how much space we think we'll need. So long as my earlier statement about the memory requirements being no more than the size of the btree leaves is actually true (I haven't rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * blocksize) worth of space in the memfd file. Maybe we ask for 1.5x that and if we don't get it, we kill the memfd and exit. --D > > Brian > > > --D > > > > > > > > Brian > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > --D > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > +done: > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > +} > > > > > > > > > > > > + > > > > > > > > > > > ... > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > return diff; > > > > > > > > > > > > } > > > > > > > > > > > > + > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > +bool > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > +{ > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! 
> > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > + return false; > > > > > > > > > > > > + } > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > + return true; > > > > > > > > > > > > +} > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > + > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */
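For reference, the RB_EMPTY_ROOT() version suggested above would presumably collapse to something like:

/* Are there any busy extents in this AG? */
bool
xfs_extent_busy_list_empty(
        struct xfs_perag        *pag)
{
        bool                    res;

        spin_lock(&pag->pagb_lock);
        res = RB_EMPTY_ROOT(&pag->pagb_tree);
        spin_unlock(&pag->pagb_lock);
        return res;
}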
the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: ... > > > > > So, that early draft spent a lot of time tearing down and reconstructing > > > > > rmapbt cursors since the standard _btree_query_all isn't suited to that > > > > > kind of usage. It was easily twice as slow on a RAM-backed disk just > > > > > from the rmap cursor overhead and much more complex, so I rewrote it to > > > > > be simpler. I also have a slight preference for not touching anything > > > > > until we're absolutely sure we have all the data we need to repair the > > > > > structure. > > > > > > > > > > > > > Yes, I think that is sane in principle. I'm just a bit concerned about > > > > how reliable that xfs_repair-like approach will be in the kernel longer > > > > term, particularly once we start having to deal with large filesystems > > > > and limited or contended memory, etc. We already have xfs_repair users > > > > that need to tweak settings because there isn't enough memory available > > > > to repair the fs. Granted that is for fs-wide repairs and the flipside > > > > is that we know a single AG can only be up to 1TB. It's certainly > > > > possible that putting some persistent backing behind the in-core data is > > > > enough to resolve the problem (and the current approach is certainly > > > > reasonable enough to me for the initial implementation). > > > > > > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held > > > > all of the cursor buffers, rolled the transaction and then rejoined all > > > > said buffers would help us get around that. (Not sure I follow the early > > > > prototype behavior, but it sounds like we had to restart the rmapbt > > > > lookup over and over...). > > > > > > Correct. > > > > > > > Another caveat with that approach may be that I think we'd need to be > > > > sure that the reconstruction operation doesn't ever need to update the > > > > rmapbt while we're mid walk of the latter. > > > > > > <nod> Looking even farther back in my notes, that was also an issue -- > > > fixing the free list causes blocks to go on or off the agfl, which > > > causes rmapbt updates, which meant that the only way I could get > > > in-place updates to work was to re-lookup where we were in the btree and > > > also try to deal with any rmapbt entries that might have crept in as > > > result of the record insertion. > > > > > > Getting the concurrency right for each repair function looked like a > > > difficult problem to solve, but amassing all the records elsewhere and > > > rebuilding was easy to understand. > > > > > > > Yeah. 
This all points to this kind of strategy being too complex to be > > worth the prospective benefits in the short term. Clearly we have > > several, potentially tricky roadblocks to work through before this can > > be made feasible. Thanks for the background, it's still useful to have > > this context to compare with whatever we may have to do to support a > > reclaimable memory approach. > > <nod> Reclaimable memfd "memory" isn't too difficult, we can call > kernel_read and kernel_write, though lockdep gets pretty mad about xfs > taking sb_start_write (on the memfd filesystem) at the same time it has > sb_starT_write on the xfs (not to mention the stack usage) so I had to > throw in the extra twist of delegating the actual file io to a workqueue > item (a la xfs_btree_split). > Ok, I'm more curious what the surrounding code looks like around managing the underlying file pages. Now that I think of it, the primary usage was to dump everything into the file and read it back sequentually, so perhaps this really isn't that difficult to deal with since the file content is presumably fixed size data structures. (Hmm, was there a sort in there somewhere as well?). > > > > That may be an issue for inode btree reconstruction, for example, > > > > since it looks like inobt block allocation requires rmapbt updates. > > > > We'd probably need some way to share (or invalidate) a cursor across > > > > different contexts to deal with that. > > > > > > I might pursue that strategy if we ever hit the point where we can't > > > find space to store the records (see below). Another option could be to > > > divert all deferred items for an AG, build a replacement btree in new > > > space, then finish all the deferred items... but that's starting to get > > > into offlineable AGs, which is its own project that I want to tackle > > > later. > > > > > > (Not that much later, just not this cycle.) > > > > > > > *nod* > > > > > > > For other repair functions (like the data/attr fork repairs) we have to > > > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > > > > > for as long as necessary to extract the extents we want. > > > > > > > > > > > The obvious problem is that we still have some checks that allow the > > > > > > whole repair operation to bail out before we determine whether we can > > > > > > start to rebuild the on-disk btrees. These are things like making sure > > > > > > we can actually read the associated rmapbt blocks (i.e., no read errors > > > > > > or verifier failures), basic record sanity checks, etc. But ISTM that > > > > > > isn't anything we couldn't get around with a multi-pass implementation. > > > > > > Secondary issues might be things like no longer being able to easily > > > > > > insert the longest free extent range(s) first (meaning we'd have to > > > > > > stuff the agfl with old btree blocks or figure out some other approach). > > > > > > > > > > Well, you could scan the rmapbt twice -- once to find the longest > > > > > record, then again to do the actual insertion. > > > > > > > > > > > > > Yep, that's what I meant by multi-pass. > > > > > > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() > > > > > > callout could not only detect corruption, but validate whether repair > > > > > > (if requested) is possible based on the kind of checks that are > > > > > > currently in the repair side rmapbt walkers. 
Thoughts?r > > > > > > > > > > Yes, scrub basically validates that for us now, with the notable > > > > > exception of the notorious rmapbt scrubber, which doesn't > > > > > cross-reference with inode block mappings because that would be a > > > > > locking nightmare. > > > > > > > > > > > Are there future > > > > > > changes that are better supported by an in-core tracking structure in > > > > > > general (assuming we'll eventually replace the linked lists with > > > > > > something more efficient) as opposed to attempting to optimize out the > > > > > > need for that tracking at all? > > > > > > > > > > Well, I was thinking that we could just allocate a memfd (or a file on > > > > > the same xfs once we have AG offlining) and store the records in there. > > > > > That saves us the list_head overhead and potentially enables access to a > > > > > lot more storage than pinning things in RAM. > > > > > > > > > > > > > Would using the same fs mean we have to store the repair data in a > > > > separate AG, or somehow locate/use free space in the target AG? > > > > > > As part of building an "offline AG" feature we'd presumably have to > > > teach the allocators to avoid the offline AGs for allocations, which > > > would make it so that we could host the repair data files in the same > > > XFS that's being fixed. That seems a little risky to me, but the disk > > > is probably larger than mem+swap. > > > > > > > Got it, so we'd use the remaining space in the fs outside of the target > > AG. ISTM that still presumes the rest of the fs is coherent, but I > > suppose the offline AG thing helps us with that. We'd just have to make > > sure we've shut down all currently corrupted AGs before we start to > > repair a particular corrupted one, and then hope there's still enough > > free space in the fs to proceed. > > That's a pretty big hope. :) I think for now > > > That makes more sense, but I still agree that it seems risky in general. > > Technical risk aside, there's also usability concerns in that the local > > free space requirement is another bit of non-determinism > > I don't think it's non-deterministic, it's just hard for the filesystem > to communicate to the user/admin ahead of time. Roughly speaking, we > need to have about as much disk space for the new btree as we had > allocated for the old one. > Right, maybe non-deterministic is not the best term. What I mean is that it's not clear to the user why a particular filesystem may not be able to run a repair (e.g., if it has plenty of reported free space but enough AGs may be shut down due to corruption). So in certain scenarios an unrelated corruption or particular ordering of AG repairs could be the difference between whether an online repair succeeds or defers to offline repair on the otherwise same filesystem. > As far as memory requirements go, in last week's revising of the patches > I compressed the in-memory record structs down about as far as possible; > with the removal of the list heads, the memory requirements drop by > 30-60%. We require the same amount of memory as would be needed to > store all of the records in the leaf nodes, and no more, and we can use > swap space to do it. > Nice. When looking at the existing structures it looked like a worst case (1TB AG, every other 1k block allocated) could require up to 10-12GB RAM (but I could have easily messed that up). That's not insane on its own, it's just the question of allocating that much memory in the kernel. 
Slimming that down and pushing it into something swappable doesn't _sound_ too overbearing. I'm not really sure what default distro swap sizes are these days (some % of RAM?), but it shouldn't be that hard to find ~10GB of disk space somewhere to facilitate a repair. > > around the ability to online repair vs. having to punt to xfs_repair, > > or if the repair consumes whatever free space remains in the fs to the > > detriment of whatever workload the user presumably wanted to keep the > > fs online for, etc. > > I've occasionally thought that future xfs_scrub could ask the kernel to > estimate how much disk and memory it will need for the repair (and > whether the disk space requirement is fs-scope or AG-scope); then it > could forego a repair action and recommend xfs_repair if running the > online repair would take the system below some configurable threshold. > I think something like that would improve usability once we nail down the core mechanism. > > > > presume either way we'd have to ensure that AG is either consistent or > > > > locked out from outside I/O. If we have the total record count we can > > > > > > We usually don't, but for the btrees that have their own record/blocks > > > counters we might be able to guess a number, fallocate it, and see if > > > that doesn't ENOSPC. > > > > > > > preallocate the file and hope there is no such other free space > > > > corruption or something that would allow some other task to mess with > > > > our blocks. I'm a little skeptical overall on relying on a corrupted > > > > filesystem to store repair data, but perhaps there are ways to mitigate > > > > the risks. > > > > > > Store it elsewhere? /home for root repairs, /root for any other > > > repair... though if we're going to do that, why not just add a swap file > > > temporarily? > > > > > > > Indeed. The thought crossed my mind about whether we could do something > > like have an internal/isolated swap file for dedicated XFS allocations > > to avoid contention with the traditional swap. > > Heh, I think e2fsck has some feature like that where you can pass it a > swap file. No idea how much good that does on modern systems where > there's one huge partition... :) > Interesting. Couldn't you always create an additional swap file, run the repair then kill it off when it's no longer needed? > > Userspace could somehow set it up or communicate to the kernel. I have > > no idea how realistic that is though or if there's a better interface > > for that kind of thing (i.e., file backed kmem cache?). > > I looked, and there aren't any other mechanisms for unpinnned kernel > memory allocations. > Ok, it looks like swap or traditional files it is then. ;P > > What _seems_ beneficial about that approach is we get (potentially > > external) persistent backing and memory reclaim ability with the > > traditional memory allocation model. > > > > ISTM that if we used a regular file, we'd need to deal with the > > traditional file interface somehow or another (file read/pagecache > > lookup -> record ??). > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > all we need is a (struct file *). > > > We could repurpose some existing mechanism like the directory code or > > quota inode mechanism to use xfs buffers for that purpose, but I think > > that would require us to always use an internal inode. Allowing > > userspace to pass an fd/file passes that consideration on to the user, > > which might be more flexible. 
We could always warn about additional > > limitations if that fd happens to be based on the target fs. > > <nod> A second advantage of the struct file/kernel_{read,write} approach > is that we if we ever decide to let userspace pass in a fd, it's trivial > to feed that struct file to the kernel io routines instead of a memfd > one. > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do something like this anyways. Could/should xfs_scrub be responsible for allocating a memfd and passing along the fd? Another advantage of doing that is whatever logic we may need to clean up old repair files or whatever is pushed to userspace. > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > swappable or something? > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > the swap file. > > > > > > > Ok. > > > > > > If so, that sounds a reasonable option provided the swap space > > > > requirement can be made clear to users > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > able to use up all the memory + swap... and since we're probably only > > > going to be repairing one thing at a time, most likely scrub won't need > > > as much memory. > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > are that 1.) the kernel generally has more of a limit on anonymous > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > it's not clear how effectively running the system out of memory via the > > kernel will behave from a failure perspective. > > > > IOW, xfs_repair can run the system out of memory but for the most part > > that ends up being a simple problem for the system: OOM kill the bloated > > xfs_repair process. For an online repair in a similar situation, I have > > no idea what's going to happen. > > Back in the days of the huge linked lists the oom killer would target > other proceses because it doesn't know that the online repair thread is > sitting on a ton of pinned kernel memory... > Makes sense, kind of what I'd expect... > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > we'd still be at risk of other subsystems running into memory > > allocation problems, filling up swap, the OOM killer going after > > unrelated processes, etc. What if, for example, the OOM killer starts > > picking off processes in service to a running online repair that > > immediately consumes freed up memory until the system is borked? > > Yeah. One thing we /could/ do is register an oom notifier that would > urge any running repair threads to bail out if they can. It seems to me > that the oom killer blocks on the oom_notify_list chain, so our handler > could wait until at least one thread exits before returning. > Ok, something like that could be useful. I agree that we probably don't need to go that far until the mechanism is nailed down and testing shows that OOM is a problem. > > I don't know how likely that is or if it really ends up much different > > from the analogous xfs_repair situation. My only point right now is > > that failure scenario is something we should explore for any solution > > we ultimately consider because it may be an unexpected use case of the > > underlying mechanism. > > Ideally, online repair would always be the victim since we know we have > a reasonable fallback. 
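A bare-bones version of the oom notifier idea above might look like the sketch below; the flag name and the polling in the repair loops are assumptions, and it does not yet wait for a repair thread to exit as suggested:

#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/atomic.h>

/* Set when the OOM killer fires; long-running repair loops would poll this. */
static atomic_t xrep_abort_requested = ATOMIC_INIT(0);

static int
xrep_oom_notify(
        struct notifier_block   *nb,
        unsigned long           action,
        void                    *freed)
{
        atomic_set(&xrep_abort_requested, 1);
        /* We free no pages directly, so leave *freed alone. */
        return NOTIFY_OK;
}

static struct notifier_block xrep_oom_nb = {
        .notifier_call          = xrep_oom_notify,
};

/*
 * register_oom_notifier(&xrep_oom_nb) at scrub setup,
 * unregister_oom_notifier(&xrep_oom_nb) at teardown; repair loops would
 * check atomic_read(&xrep_abort_requested) and unwind with -ENOMEM if set.
 */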
At least for memfd, however, I think the only > clues we have to decide the question "is this memfd getting in the way > of other threads?" is either seeing ENOMEM, short writes, or getting > kicked by an oom notification. Maybe that'll be enough? > Hm, yeah. It may be challenging to track memfd usage as such. If userspace has access to the fd on an OOM notification or whatever, it might be able to do more accurate analysis based on an fstat() or something. Related question... is the online repair sequence currently interruptible, if xfs_scrub receives a fatal signal while pulling in entries during an allocbt scan for example? > > (To the contrary, just using a cached file seems a natural fit from > > that perspective.) > > Same here. > > > > > and the failure characteristics aren't more severe than for userspace. > > > > An online repair that puts the broader system at risk of OOM as > > > > opposed to predictably failing gracefully may not be the most useful > > > > tool. > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > for the vm to push back on us if we successfully write all we need to > > > the memfd but then other processes need some memory. Obviously, if the > > > memfd write itself comes up short or fails then we dump the memfd and > > > error back to userspace. We might simply have to free array memory > > > while we iterate the records to minimize the time spent at peak memory > > > usage. > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > may simplify things because we could allocate it up front and know right > > away whether we just don't have enough memory available to repair. > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > the pages at once, provided we have some guesstimate beforehand of how > much space we think we'll need. > > So long as my earlier statement about the memory requirements being no > more than the size of the btree leaves is actually true (I haven't > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > that and if we don't get it, we kill the memfd and exit. > Indeed. It would be nice if we could do all of the file management bits in userspace. Brian > --D > > > > > Brian > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > +done: > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > +} > > > > > > > > > > > > > + > > > > > > > > > > > > ... 
> > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > } > > > > > > > > > > > > > + > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > +bool > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > +{ > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > Good suggestion, thank you! > > > > > > > > > > > --D > > > > > > > > > > > > Brian > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > + } > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > +} > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > + > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */
On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > ... > > > > > > So, that early draft spent a lot of time tearing down and reconstructing > > > > > > rmapbt cursors since the standard _btree_query_all isn't suited to that > > > > > > kind of usage. It was easily twice as slow on a RAM-backed disk just > > > > > > from the rmap cursor overhead and much more complex, so I rewrote it to > > > > > > be simpler. I also have a slight preference for not touching anything > > > > > > until we're absolutely sure we have all the data we need to repair the > > > > > > structure. > > > > > > > > > > > > > > > > Yes, I think that is sane in principle. I'm just a bit concerned about > > > > > how reliable that xfs_repair-like approach will be in the kernel longer > > > > > term, particularly once we start having to deal with large filesystems > > > > > and limited or contended memory, etc. We already have xfs_repair users > > > > > that need to tweak settings because there isn't enough memory available > > > > > to repair the fs. Granted that is for fs-wide repairs and the flipside > > > > > is that we know a single AG can only be up to 1TB. It's certainly > > > > > possible that putting some persistent backing behind the in-core data is > > > > > enough to resolve the problem (and the current approach is certainly > > > > > reasonable enough to me for the initial implementation). > > > > > > > > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held > > > > > all of the cursor buffers, rolled the transaction and then rejoined all > > > > > said buffers would help us get around that. (Not sure I follow the early > > > > > prototype behavior, but it sounds like we had to restart the rmapbt > > > > > lookup over and over...). > > > > > > > > Correct. > > > > > > > > > Another caveat with that approach may be that I think we'd need to be > > > > > sure that the reconstruction operation doesn't ever need to update the > > > > > rmapbt while we're mid walk of the latter. > > > > > > > > <nod> Looking even farther back in my notes, that was also an issue -- > > > > fixing the free list causes blocks to go on or off the agfl, which > > > > causes rmapbt updates, which meant that the only way I could get > > > > in-place updates to work was to re-lookup where we were in the btree and > > > > also try to deal with any rmapbt entries that might have crept in as > > > > result of the record insertion. 
> > > > > > > > Getting the concurrency right for each repair function looked like a > > > > difficult problem to solve, but amassing all the records elsewhere and > > > > rebuilding was easy to understand. > > > > > > > > > > Yeah. This all points to this kind of strategy being too complex to be > > > worth the prospective benefits in the short term. Clearly we have > > > several, potentially tricky roadblocks to work through before this can > > > be made feasible. Thanks for the background, it's still useful to have > > > this context to compare with whatever we may have to do to support a > > > reclaimable memory approach. > > > > <nod> Reclaimable memfd "memory" isn't too difficult, we can call > > kernel_read and kernel_write, though lockdep gets pretty mad about xfs > > taking sb_start_write (on the memfd filesystem) at the same time it has > > sb_starT_write on the xfs (not to mention the stack usage) so I had to > > throw in the extra twist of delegating the actual file io to a workqueue > > item (a la xfs_btree_split). > > > > Ok, I'm more curious what the surrounding code looks like around > managing the underlying file pages. Now that I think of it, the primary > usage was to dump everything into the file and read it back > sequentually, Yep. Simplified, the code is more or less: array_init(array) { array->filp = shmem_file_create(...); } array_destroy(array) { fput(array->filp); } array_set(array, nr, ptr) { kernel_write(array->filp, ptr, array->obj_size, nr * array->obj_size); } array_get(array, nr, ptr) { kernel_read(array->filp, ptr, array->obj_size, nr * array->obj_size); } That's leaving out all the bookkeeping and other weird details to show pseudocode versions of the file manipulation calls. I did end up playing a bit of sleight-of-hand with the file io, however -- all the io is deferred to a workqueue for the dual purpose of avoiding stack overflows in the memfd file's io paths and to avoid some sort of deadlock in the page fault handler of the memfd write. I didn't investigate the deadlock too deeply, as solving the first problem seemed to make the second go away. > so perhaps this really isn't that difficult to deal with since the > file content is presumably fixed size data structures. Correct. There is one user that needs variable-sized records (the extended attribute repair) for which I've constructed the 'xblob' data structure which stores blobs in a second memfd and returns the file offset of a blob as a magic cookie that is recorded in the (fixed size) attr keys. Presumably the future directory rebuilder will use xblob too. > (Hmm, was there a sort in there somewhere as well?). Yes. I spent a couple of days implementing a hybrid quicksort/insertion sort that won't blow out the call stack. > > > > > That may be an issue for inode btree reconstruction, for example, > > > > > since it looks like inobt block allocation requires rmapbt updates. > > > > > We'd probably need some way to share (or invalidate) a cursor across > > > > > different contexts to deal with that. > > > > > > > > I might pursue that strategy if we ever hit the point where we can't > > > > find space to store the records (see below). Another option could be to > > > > divert all deferred items for an AG, build a replacement btree in new > > > > space, then finish all the deferred items... but that's starting to get > > > > into offlineable AGs, which is its own project that I want to tackle > > > > later. > > > > > > > > (Not that much later, just not this cycle.) 
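A minimal, self-contained sketch of the memfd-backed fixed-record array from the pseudocode earlier in this message, assuming the 4.19-era shmem_file_setup()/kernel_read()/kernel_write() interfaces. The xfb_* names are illustrative rather than the series' actual ones, and the workqueue indirection described above (to keep the io off the deep xfs stack) is omitted here.

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/shmem_fs.h>
#include <linux/err.h>

struct xfb_array {
        struct file     *filp;          /* shmem backing file */
        size_t          obj_size;       /* size of one fixed-size record */
        uint64_t        nr;             /* number of records stored */
};

static int xfb_array_init(struct xfb_array *array, size_t obj_size)
{
        array->filp = shmem_file_setup("xfs-repair-array", 0, 0);
        if (IS_ERR(array->filp))
                return PTR_ERR(array->filp);
        array->obj_size = obj_size;
        array->nr = 0;
        return 0;
}

static void xfb_array_destroy(struct xfb_array *array)
{
        /* Zero-linkcount, zero-refcount file goes away on the final fput. */
        fput(array->filp);
}

static int xfb_array_set(struct xfb_array *array, uint64_t nr, const void *ptr)
{
        loff_t  pos = nr * array->obj_size;
        ssize_t ret;

        ret = kernel_write(array->filp, ptr, array->obj_size, &pos);
        if (ret != (ssize_t)array->obj_size)
                return ret < 0 ? ret : -ENOMEM; /* short write = no memory */
        if (nr >= array->nr)
                array->nr = nr + 1;
        return 0;
}

static int xfb_array_get(struct xfb_array *array, uint64_t nr, void *ptr)
{
        loff_t  pos = nr * array->obj_size;
        ssize_t ret;

        ret = kernel_read(array->filp, ptr, array->obj_size, &pos);
        if (ret != (ssize_t)array->obj_size)
                return ret < 0 ? ret : -ENODATA;
        return 0;
}

The point of the shape is that the record index maps directly to a file offset, so the "array" is as swappable as any other tmpfs page and the caller never holds more than one record in kernel memory at a time.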
> > > > > > > > > > *nod* > > > > > > > > > For other repair functions (like the data/attr fork repairs) we have to > > > > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only > > > > > > for as long as necessary to extract the extents we want. > > > > > > > > > > > > > The obvious problem is that we still have some checks that allow the > > > > > > > whole repair operation to bail out before we determine whether we can > > > > > > > start to rebuild the on-disk btrees. These are things like making sure > > > > > > > we can actually read the associated rmapbt blocks (i.e., no read errors > > > > > > > or verifier failures), basic record sanity checks, etc. But ISTM that > > > > > > > isn't anything we couldn't get around with a multi-pass implementation. > > > > > > > Secondary issues might be things like no longer being able to easily > > > > > > > insert the longest free extent range(s) first (meaning we'd have to > > > > > > > stuff the agfl with old btree blocks or figure out some other approach). > > > > > > > > > > > > Well, you could scan the rmapbt twice -- once to find the longest > > > > > > record, then again to do the actual insertion. > > > > > > > > > > > > > > > > Yep, that's what I meant by multi-pass. > > > > > > > > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of > > > > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub() > > > > > > > callout could not only detect corruption, but validate whether repair > > > > > > > (if requested) is possible based on the kind of checks that are > > > > > > > currently in the repair side rmapbt walkers. Thoughts?r > > > > > > > > > > > > Yes, scrub basically validates that for us now, with the notable > > > > > > exception of the notorious rmapbt scrubber, which doesn't > > > > > > cross-reference with inode block mappings because that would be a > > > > > > locking nightmare. > > > > > > > > > > > > > Are there future > > > > > > > changes that are better supported by an in-core tracking structure in > > > > > > > general (assuming we'll eventually replace the linked lists with > > > > > > > something more efficient) as opposed to attempting to optimize out the > > > > > > > need for that tracking at all? > > > > > > > > > > > > Well, I was thinking that we could just allocate a memfd (or a file on > > > > > > the same xfs once we have AG offlining) and store the records in there. > > > > > > That saves us the list_head overhead and potentially enables access to a > > > > > > lot more storage than pinning things in RAM. > > > > > > > > > > > > > > > > Would using the same fs mean we have to store the repair data in a > > > > > separate AG, or somehow locate/use free space in the target AG? > > > > > > > > As part of building an "offline AG" feature we'd presumably have to > > > > teach the allocators to avoid the offline AGs for allocations, which > > > > would make it so that we could host the repair data files in the same > > > > XFS that's being fixed. That seems a little risky to me, but the disk > > > > is probably larger than mem+swap. > > > > > > > > > > Got it, so we'd use the remaining space in the fs outside of the target > > > AG. ISTM that still presumes the rest of the fs is coherent, but I > > > suppose the offline AG thing helps us with that. 
We'd just have to make > > > sure we've shut down all currently corrupted AGs before we start to > > > repair a particular corrupted one, and then hope there's still enough > > > free space in the fs to proceed. > > > > That's a pretty big hope. :) I think for now > > > > > That makes more sense, but I still agree that it seems risky in general. > > > Technical risk aside, there's also usability concerns in that the local > > > free space requirement is another bit of non-determinism > > > > I don't think it's non-deterministic, it's just hard for the filesystem > > to communicate to the user/admin ahead of time. Roughly speaking, we > > need to have about as much disk space for the new btree as we had > > allocated for the old one. > > > > Right, maybe non-deterministic is not the best term. What I mean is that > it's not clear to the user why a particular filesystem may not be able > to run a repair (e.g., if it has plenty of reported free space but > enough AGs may be shut down due to corruption). So in certain scenarios > an unrelated corruption or particular ordering of AG repairs could be > the difference between whether an online repair succeeds or defers to > offline repair on the otherwise same filesystem. <nod> > > As far as memory requirements go, in last week's revising of the patches > > I compressed the in-memory record structs down about as far as possible; > > with the removal of the list heads, the memory requirements drop by > > 30-60%. We require the same amount of memory as would be needed to > > store all of the records in the leaf nodes, and no more, and we can use > > swap space to do it. > > > > Nice. When looking at the existing structures it looked like a worst > case (1TB AG, every other 1k block allocated) could require up to > 10-12GB RAM (but I could have easily messed that up).r Sounds about right. 1TB AG = 268 million 4k blocks bnobt: 8-byte records, or ~2.2GB of memory inobt: 16-byte records, or ~4.3GB of memory refcountbt: 12-byte records, or ~3.2GB of memory rmapbt: 24-byte records, or ~6.4GB of memory Multiply by 4 for a 1k block filesystem, divide by 16 for a 64k block fs. Note that if the AG is full and heavily shared then the rmapbt requirements can exceed that, but that's a known property of rmap in general. > That's not insane on its own, it's just the question of allocating > that much memory in the kernel. Slimming that down and pushing it into > something swappable doesn't _sound_ too overbearing. I'm not really > sure what default distro swap sizes are these days (some % of RAM?), I think so? I think RH/Centos/OL default to the size of RAM + 2GB nowadays, and Ubuntu seems to do RAM+sqrt(RAM)? > but it shouldn't be that hard to find ~10GB of disk space somewhere to > facilitate a repair. > > > > around the ability to online repair vs. having to punt to xfs_repair, > > > or if the repair consumes whatever free space remains in the fs to the > > > detriment of whatever workload the user presumably wanted to keep the > > > fs online for, etc. > > > > I've occasionally thought that future xfs_scrub could ask the kernel to > > estimate how much disk and memory it will need for the repair (and > > whether the disk space requirement is fs-scope or AG-scope); then it > > could forego a repair action and recommend xfs_repair if running the > > online repair would take the system below some configurable threshold. > > > > I think something like that would improve usability once we nail down > the core mechanism. Ok, I'll put it on my list of things to do. 
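For what it's worth, a quick back-of-the-envelope check of those per-btree figures (throwaway userspace C, decimal GB, a full 1TB AG of 4k blocks):

#include <stdio.h>

int main(void)
{
        unsigned long long blocks = (1ULL << 40) / 4096;        /* 268435456 */

        printf("bnobt:      %.1f GB\n", blocks * 8  / 1e9);     /* ~2.1 */
        printf("inobt:      %.1f GB\n", blocks * 16 / 1e9);     /* ~4.3 */
        printf("refcountbt: %.1f GB\n", blocks * 12 / 1e9);     /* ~3.2 */
        printf("rmapbt:     %.1f GB\n", blocks * 24 / 1e9);     /* ~6.4 */
        return 0;
}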
> > > > > presume either way we'd have to ensure that AG is either consistent or > > > > > locked out from outside I/O. If we have the total record count we can > > > > > > > > We usually don't, but for the btrees that have their own record/blocks > > > > counters we might be able to guess a number, fallocate it, and see if > > > > that doesn't ENOSPC. > > > > > > > > > preallocate the file and hope there is no such other free space > > > > > corruption or something that would allow some other task to mess with > > > > > our blocks. I'm a little skeptical overall on relying on a corrupted > > > > > filesystem to store repair data, but perhaps there are ways to mitigate > > > > > the risks. > > > > > > > > Store it elsewhere? /home for root repairs, /root for any other > > > > repair... though if we're going to do that, why not just add a swap file > > > > temporarily? > > > > > > > > > > Indeed. The thought crossed my mind about whether we could do something > > > like have an internal/isolated swap file for dedicated XFS allocations > > > to avoid contention with the traditional swap. > > > > Heh, I think e2fsck has some feature like that where you can pass it a > > swap file. No idea how much good that does on modern systems where > > there's one huge partition... :) > > > > Interesting. Couldn't you always create an additional swap file, run the > repair then kill it off when it's no longer needed? Yes, though as I think you said in an earlier reply, it would be nice to have our own private swap file instead of risking some other process taking it. > > > Userspace could somehow set it up or communicate to the kernel. I have > > > no idea how realistic that is though or if there's a better interface > > > for that kind of thing (i.e., file backed kmem cache?). > > > > I looked, and there aren't any other mechanisms for unpinnned kernel > > memory allocations. > > > > Ok, it looks like swap or traditional files it is then. ;P > > > > What _seems_ beneficial about that approach is we get (potentially > > > external) persistent backing and memory reclaim ability with the > > > traditional memory allocation model. > > > > > > ISTM that if we used a regular file, we'd need to deal with the > > > traditional file interface somehow or another (file read/pagecache > > > lookup -> record ??). > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > > all we need is a (struct file *). > > > > > We could repurpose some existing mechanism like the directory code or > > > quota inode mechanism to use xfs buffers for that purpose, but I think > > > that would require us to always use an internal inode. Allowing > > > userspace to pass an fd/file passes that consideration on to the user, > > > which might be more flexible. We could always warn about additional > > > limitations if that fd happens to be based on the target fs. > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach > > is that we if we ever decide to let userspace pass in a fd, it's trivial > > to feed that struct file to the kernel io routines instead of a memfd > > one. > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do > something like this anyways. Could/should xfs_scrub be responsible for > allocating a memfd and passing along the fd? Another advantage of doing > that is whatever logic we may need to clean up old repair files or > whatever is pushed to userspace. 
There are two ways we could do this -- one is to have the kernel manage the memfd creation internally (like my patches do now); the other is for xfs_scrub to pass in creat(O_TMPFILE). When repair fputs the file (or fdputs the fd if we switch to using that), the kernel will perform the usual deletion of the zero-linkcount zero-refcount file. We get all the "cleanup" for free by closing the file. One other potential complication is that a couple of the repair functions need two memfds. The extended attribute repair creates a fixed-record array for attr keys and an xblob to hold names and values; each structure gets its own memfd. The refcount repair creates two fixed-record arrays, one for refcount records and another to act as a stack of rmaps to compute reference counts. (In theory the xbitmap could also be converted to use the fixed record array, but in practice they haven't (yet) become large enough to warrant it, and there's currently no way to insert or delete records from the middle of the array.) > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > swappable or something? > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > the swap file. > > > > > > > > > > Ok. > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > requirement can be made clear to users > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > able to use up all the memory + swap... and since we're probably only > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > as much memory. > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > are that 1.) the kernel generally has more of a limit on anonymous > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > it's not clear how effectively running the system out of memory via the > > > kernel will behave from a failure perspective. > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > that ends up being a simple problem for the system: OOM kill the bloated > > > xfs_repair process. For an online repair in a similar situation, I have > > > no idea what's going to happen. > > > > Back in the days of the huge linked lists the oom killer would target > > other proceses because it doesn't know that the online repair thread is > > sitting on a ton of pinned kernel memory... > > > > Makes sense, kind of what I'd expect... > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > we'd still be at risk of other subsystems running into memory > > > allocation problems, filling up swap, the OOM killer going after > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > picking off processes in service to a running online repair that > > > immediately consumes freed up memory until the system is borked? > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > urge any running repair threads to bail out if they can. It seems to me > > that the oom killer blocks on the oom_notify_list chain, so our handler > > could wait until at least one thread exits before returning. > > > > Ok, something like that could be useful. I agree that we probably don't > need to go that far until the mechanism is nailed down and testing shows > that OOM is a problem. 
It already is a problem on my contrived "2TB hardlink/reflink farm fs" + "400M of RAM and no swap" scenario. Granted, pretty much every other xfs utility also blows out on that so I'm not sure how hard I really need to try... > > > I don't know how likely that is or if it really ends up much different > > > from the analogous xfs_repair situation. My only point right now is > > > that failure scenario is something we should explore for any solution > > > we ultimately consider because it may be an unexpected use case of the > > > underlying mechanism. > > > > Ideally, online repair would always be the victim since we know we have > > a reasonable fallback. At least for memfd, however, I think the only > > clues we have to decide the question "is this memfd getting in the way > > of other threads?" is either seeing ENOMEM, short writes, or getting > > kicked by an oom notification. Maybe that'll be enough? > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > userspace has access to the fd on an OOM notification or whatever, it > might be able to do more accurate analysis based on an fstat() or > something. > > Related question... is the online repair sequence currently > interruptible, if xfs_scrub receives a fatal signal while pulling in > entries during an allocbt scan for example? It's interruptible (fatal signals only) during the scan phase, but once it starts logging metadata updates it will run all the way to completion. > > > (To the contrary, just using a cached file seems a natural fit from > > > that perspective.) > > > > Same here. > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > An online repair that puts the broader system at risk of OOM as > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > tool. > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > for the vm to push back on us if we successfully write all we need to > > > > the memfd but then other processes need some memory. Obviously, if the > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > error back to userspace. We might simply have to free array memory > > > > while we iterate the records to minimize the time spent at peak memory > > > > usage. > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > may simplify things because we could allocate it up front and know right > > > away whether we just don't have enough memory available to repair. > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > the pages at once, provided we have some guesstimate beforehand of how > > much space we think we'll need. > > > > So long as my earlier statement about the memory requirements being no > > more than the size of the btree leaves is actually true (I haven't > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > that and if we don't get it, we kill the memfd and exit. > > > > Indeed. It would be nice if we could do all of the file management bits > in userspace. Agreed, though no file management would be even better. 
:)

--D
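A sketch of the oom notifier idea mentioned above, assuming the existing register_oom_notifier() interface; xrep_abort_all_scans() is a made-up stand-in for whatever mechanism would actually nudge the scanning repair threads.

#include <linux/oom.h>
#include <linux/notifier.h>

/*
 * When the OOM killer is about to act, ask any repairs still in the scan
 * phase to unwind so they drop their scratch files and release memory.
 * A fancier version might wait here for at least one scan to exit, as
 * suggested above; we don't bump *freed because nothing is freed
 * synchronously in this handler.
 */
static int
xrep_oom_notify(
        struct notifier_block   *nb,
        unsigned long           unused,
        void                    *freed)
{
        xrep_abort_all_scans();         /* hypothetical hook */
        return NOTIFY_OK;
}

static struct notifier_block xrep_oom_nb = {
        .notifier_call  = xrep_oom_notify,
};

/* e.g. registered once when the first online repair starts */
static int
xrep_register_oom_hook(void)
{
        return register_oom_notifier(&xrep_oom_nb);
}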
On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: ... > > > > What _seems_ beneficial about that approach is we get (potentially > > > > external) persistent backing and memory reclaim ability with the > > > > traditional memory allocation model. > > > > > > > > ISTM that if we used a regular file, we'd need to deal with the > > > > traditional file interface somehow or another (file read/pagecache > > > > lookup -> record ??). > > > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > > > all we need is a (struct file *). > > > > > > > We could repurpose some existing mechanism like the directory code or > > > > quota inode mechanism to use xfs buffers for that purpose, but I think > > > > that would require us to always use an internal inode. Allowing > > > > userspace to pass an fd/file passes that consideration on to the user, > > > > which might be more flexible. We could always warn about additional > > > > limitations if that fd happens to be based on the target fs. > > > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach > > > is that we if we ever decide to let userspace pass in a fd, it's trivial > > > to feed that struct file to the kernel io routines instead of a memfd > > > one. > > > > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do > > something like this anyways. Could/should xfs_scrub be responsible for > > allocating a memfd and passing along the fd? Another advantage of doing > > that is whatever logic we may need to clean up old repair files or > > whatever is pushed to userspace. > > There are two ways we could do this -- one is to have the kernel manage > the memfd creation internally (like my patches do now); the other is for > xfs_scrub to pass in creat(O_TMPFILE). > > When repair fputs the file (or fdputs the fd if we switch to using > that), the kernel will perform the usual deletion of the zero-linkcount > zero-refcount file. We get all the "cleanup" for free by closing the > file. > Ok. FWIW, the latter approach where xfs_scrub creates a file and passes the fd along to the kernel seems preferable to me, but perhaps others have different opinions. We could accept a pathname from the user to create the file or otherwise attempt to allocate an memfd by default and pass that along. > One other potential complication is that a couple of the repair > functions need two memfds. 
The extended attribute repair creates a > fixed-record array for attr keys and an xblob to hold names and values; > each structure gets its own memfd. The refcount repair creates two > fixed-record arrays, one for refcount records and another to act as a > stack of rmaps to compute reference counts. > Hmm, I guess there's nothing stopping scrub from passing in two fds. Maybe it would make more sense for the userspace option to be a path basename or directory where scrub is allowed to create whatever scratch files it needs. That aside, is there any reason the repair mechanism couldn't emulate multiple files with a single fd via a magic offset delimeter or something? E.g., "file 1" starts at offset 0, "file 2" starts at offset 1TB, etc. (1TB is probably overkill, but you get the idea..). Brian > (In theory the xbitmap could also be converted to use the fixed record > array, but in practice they haven't (yet) become large enough to warrant > it, and there's currently no way to insert or delete records from the > middle of the array.) > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > > swappable or something? > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > the swap file. > > > > > > > > > > > > > Ok. > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > requirement can be made clear to users > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > able to use up all the memory + swap... and since we're probably only > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > as much memory. > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > it's not clear how effectively running the system out of memory via the > > > > kernel will behave from a failure perspective. > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > no idea what's going to happen. > > > > > > Back in the days of the huge linked lists the oom killer would target > > > other proceses because it doesn't know that the online repair thread is > > > sitting on a ton of pinned kernel memory... > > > > > > > Makes sense, kind of what I'd expect... > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > we'd still be at risk of other subsystems running into memory > > > > allocation problems, filling up swap, the OOM killer going after > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > picking off processes in service to a running online repair that > > > > immediately consumes freed up memory until the system is borked? > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > urge any running repair threads to bail out if they can. It seems to me > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > could wait until at least one thread exits before returning. 
> > > > > > > Ok, something like that could be useful. I agree that we probably don't > > need to go that far until the mechanism is nailed down and testing shows > > that OOM is a problem. > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > "400M of RAM and no swap" scenario. Granted, pretty much every other > xfs utility also blows out on that so I'm not sure how hard I really > need to try... > > > > > I don't know how likely that is or if it really ends up much different > > > > from the analogous xfs_repair situation. My only point right now is > > > > that failure scenario is something we should explore for any solution > > > > we ultimately consider because it may be an unexpected use case of the > > > > underlying mechanism. > > > > > > Ideally, online repair would always be the victim since we know we have > > > a reasonable fallback. At least for memfd, however, I think the only > > > clues we have to decide the question "is this memfd getting in the way > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > userspace has access to the fd on an OOM notification or whatever, it > > might be able to do more accurate analysis based on an fstat() or > > something. > > > > Related question... is the online repair sequence currently > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > entries during an allocbt scan for example? > > It's interruptible (fatal signals only) during the scan phase, but once > it starts logging metadata updates it will run all the way to > completion. > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > that perspective.) > > > > > > Same here. > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > tool. > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > for the vm to push back on us if we successfully write all we need to > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > error back to userspace. We might simply have to free array memory > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > usage. > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > may simplify things because we could allocate it up front and know right > > > > away whether we just don't have enough memory available to repair. > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > the pages at once, provided we have some guesstimate beforehand of how > > > much space we think we'll need. > > > > > > So long as my earlier statement about the memory requirements being no > > > more than the size of the btree leaves is actually true (I haven't > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > Indeed. It would be nice if we could do all of the file management bits > > in userspace. 
> > Agreed, though no file management would be even better. :) > > --D
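A tiny sketch of the single-fd idea above, carving one scratch file's offset space into fixed regions; the names and the 1TB stride are illustrative only, and the 32-bit page cache offset limit raised later in the thread would cap how many regions are usable there.

#include <linux/types.h>

/* One backing file, several logical files at fixed offsets. */
#define XREP_SCRATCH_STRIDE     (1ULL << 40)    /* 1TB per logical file */

static inline loff_t
xrep_scratch_pos(
        unsigned int    logical_file,   /* 0 = records, 1 = blobs, ... */
        loff_t          offset)
{
        return logical_file * XREP_SCRATCH_STRIDE + offset;
}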
On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote: > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > ... > > > > > What _seems_ beneficial about that approach is we get (potentially > > > > > external) persistent backing and memory reclaim ability with the > > > > > traditional memory allocation model. > > > > > > > > > > ISTM that if we used a regular file, we'd need to deal with the > > > > > traditional file interface somehow or another (file read/pagecache > > > > > lookup -> record ??). > > > > > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > > > > all we need is a (struct file *). > > > > > > > > > We could repurpose some existing mechanism like the directory code or > > > > > quota inode mechanism to use xfs buffers for that purpose, but I think > > > > > that would require us to always use an internal inode. Allowing > > > > > userspace to pass an fd/file passes that consideration on to the user, > > > > > which might be more flexible. We could always warn about additional > > > > > limitations if that fd happens to be based on the target fs. > > > > > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach > > > > is that we if we ever decide to let userspace pass in a fd, it's trivial > > > > to feed that struct file to the kernel io routines instead of a memfd > > > > one. > > > > > > > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do > > > something like this anyways. Could/should xfs_scrub be responsible for > > > allocating a memfd and passing along the fd? Another advantage of doing > > > that is whatever logic we may need to clean up old repair files or > > > whatever is pushed to userspace. > > > > There are two ways we could do this -- one is to have the kernel manage > > the memfd creation internally (like my patches do now); the other is for > > xfs_scrub to pass in creat(O_TMPFILE). > > > > When repair fputs the file (or fdputs the fd if we switch to using > > that), the kernel will perform the usual deletion of the zero-linkcount > > zero-refcount file. We get all the "cleanup" for free by closing the > > file. > > > > Ok. FWIW, the latter approach where xfs_scrub creates a file and passes > the fd along to the kernel seems preferable to me, but perhaps others > have different opinions. 
We could accept a pathname from the user to > create the file or otherwise attempt to allocate an memfd by default and > pass that along. > > > One other potential complication is that a couple of the repair > > functions need two memfds. The extended attribute repair creates a > > fixed-record array for attr keys and an xblob to hold names and values; > > each structure gets its own memfd. The refcount repair creates two > > fixed-record arrays, one for refcount records and another to act as a > > stack of rmaps to compute reference counts. > > > > Hmm, I guess there's nothing stopping scrub from passing in two fds. > Maybe it would make more sense for the userspace option to be a path > basename or directory where scrub is allowed to create whatever scratch > files it needs. > > That aside, is there any reason the repair mechanism couldn't emulate > multiple files with a single fd via a magic offset delimeter or > something? E.g., "file 1" starts at offset 0, "file 2" starts at offset > 1TB, etc. (1TB is probably overkill, but you get the idea..). Hmm, ok, so to summarize, I see five options: 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...) however many files it needs. 2) Pass in a however many file fds we need and segment the space. 3) Pass in a single file fd. 4) Let the repair code create as many memfd files as it wants. 5) Let the repair code create one memfd file and segment the space. I'm pretty sure we don't want to support (2) because that just seems like a requirements communication nightmare and can burn up a lot of space in struct xfs_scrub_metadata. (3) and (5) are basically the same except for where the file comes from. For (3) we'd have to make sure the fd filesystem supports large sparse files (and presumably isn't the xfs we're trying to repair), which shouldn't be too difficult to probe. For (5) we know that tmpfs already supports large sparse files. Another difficulty might be that on 32-bit the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE), though I suppose at this point we only need two files and 8TB should be enough for anyone. (I also think it's reasonable to consider not supporting online repair on a 32-bit system with a large filesystem...) In general, the "pass in a thing from userspace" variants come with the complication that we have to check the functionality of whatever gets passed in. On the plus side it likely unlocks access to a lot more storage than we could get with mem+swap. On the minus side someone passes in a fd to a drive-managed SMR on USB 2.0, and... (1) seems like it would maximize the kernel's flexibility to create as many (regular, non-sparse) files as it needs, but now we're calling do_sys_open and managing files ourselves, which might be avoided. (4) of course is what we do right now. :) Soooo... the simplest userspace interface (I think) is to allow userspace to pass in a single file fd. Scrub can reject it if it doesn't measure up (fs is the same, sparse not supported, high offsets not supported, etc.). If userspace doesn't pass in an fd then we create a memfd and use that instead. We end up with a hybrid between (3) and (5). --D > Brian > > > (In theory the xbitmap could also be converted to use the fixed record > > array, but in practice they haven't (yet) become large enough to warrant > > it, and there's currently no way to insert or delete records from the > > middle of the array.) > > > > > > > > > I'm not familiar with memfd. 
The manpage suggests it's ram backed, is it > > > > > > > swappable or something? > > > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > > the swap file. > > > > > > > > > > > > > > > > Ok. > > > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > > requirement can be made clear to users > > > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > > able to use up all the memory + swap... and since we're probably only > > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > > as much memory. > > > > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > > it's not clear how effectively running the system out of memory via the > > > > > kernel will behave from a failure perspective. > > > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > > no idea what's going to happen. > > > > > > > > Back in the days of the huge linked lists the oom killer would target > > > > other proceses because it doesn't know that the online repair thread is > > > > sitting on a ton of pinned kernel memory... > > > > > > > > > > Makes sense, kind of what I'd expect... > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > > we'd still be at risk of other subsystems running into memory > > > > > allocation problems, filling up swap, the OOM killer going after > > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > > picking off processes in service to a running online repair that > > > > > immediately consumes freed up memory until the system is borked? > > > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > > urge any running repair threads to bail out if they can. It seems to me > > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > > could wait until at least one thread exits before returning. > > > > > > > > > > Ok, something like that could be useful. I agree that we probably don't > > > need to go that far until the mechanism is nailed down and testing shows > > > that OOM is a problem. > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > > "400M of RAM and no swap" scenario. Granted, pretty much every other > > xfs utility also blows out on that so I'm not sure how hard I really > > need to try... > > > > > > > I don't know how likely that is or if it really ends up much different > > > > > from the analogous xfs_repair situation. My only point right now is > > > > > that failure scenario is something we should explore for any solution > > > > > we ultimately consider because it may be an unexpected use case of the > > > > > underlying mechanism. > > > > > > > > Ideally, online repair would always be the victim since we know we have > > > > a reasonable fallback. 
At least for memfd, however, I think the only > > > > clues we have to decide the question "is this memfd getting in the way > > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > > userspace has access to the fd on an OOM notification or whatever, it > > > might be able to do more accurate analysis based on an fstat() or > > > something. > > > > > > Related question... is the online repair sequence currently > > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > > entries during an allocbt scan for example? > > > > It's interruptible (fatal signals only) during the scan phase, but once > > it starts logging metadata updates it will run all the way to > > completion. > > > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > > that perspective.) > > > > > > > > Same here. > > > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > > tool. > > > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > > for the vm to push back on us if we successfully write all we need to > > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > > error back to userspace. We might simply have to free array memory > > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > > usage. > > > > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > > may simplify things because we could allocate it up front and know right > > > > > away whether we just don't have enough memory available to repair. > > > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > > the pages at once, provided we have some guesstimate beforehand of how > > > > much space we think we'll need. > > > > > > > > So long as my earlier statement about the memory requirements being no > > > > more than the size of the btree leaves is actually true (I haven't > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > > > > Indeed. It would be nice if we could do all of the file management bits > > > in userspace. > > > > Agreed, though no file management would be even better. :) > > > > --D > > > > > Brian > > > > > > > --D > > > > > > > > > > > > > > Brian > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > +done: > > > > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. 
*/ > > > > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > > > > +bool > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > > > > +{ > > > > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > More majordomo info at 
http://vger.kernel.org/majordomo-info.html
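To make the proposed (3)/(5) hybrid concrete, the userspace half might look roughly like the sketch below: xfs_scrub creates the fallback memfd itself, preallocates about 1.5x of the estimated worst case so a repair fails up front rather than midway through, and hands the fd to the kernel. The helper name and the sizing are illustrative assumptions, not code from this series; an open(scratch_dir, O_TMPFILE | O_RDWR, 0600) fd would slot in the same way if disk-backed scratch space is preferred, with the unlinked file cleaned up automatically on close.

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

/* Sketch only: create and preallocate the fallback scratch file. */
static int scrub_scratch_fd(off_t est_bytes)
{
	int fd = memfd_create("xfs_scrub-scratch", MFD_CLOEXEC);

	if (fd < 0) {
		perror("memfd_create");
		return -1;
	}
	/* Grab ~1.5x the estimate now so we bail out before repairing. */
	if (fallocate(fd, 0, 0, est_bytes + est_bytes / 2) < 0) {
		perror("fallocate");
		close(fd);
		return -1;
	}
	return fd;	/* hand this to the repair ioctl */
}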
On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote: > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote: > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > ... > > > > > > What _seems_ beneficial about that approach is we get (potentially > > > > > > external) persistent backing and memory reclaim ability with the > > > > > > traditional memory allocation model. > > > > > > > > > > > > ISTM that if we used a regular file, we'd need to deal with the > > > > > > traditional file interface somehow or another (file read/pagecache > > > > > > lookup -> record ??). > > > > > > > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > > > > > all we need is a (struct file *). > > > > > > > > > > > We could repurpose some existing mechanism like the directory code or > > > > > > quota inode mechanism to use xfs buffers for that purpose, but I think > > > > > > that would require us to always use an internal inode. Allowing > > > > > > userspace to pass an fd/file passes that consideration on to the user, > > > > > > which might be more flexible. We could always warn about additional > > > > > > limitations if that fd happens to be based on the target fs. > > > > > > > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach > > > > > is that we if we ever decide to let userspace pass in a fd, it's trivial > > > > > to feed that struct file to the kernel io routines instead of a memfd > > > > > one. > > > > > > > > > > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do > > > > something like this anyways. Could/should xfs_scrub be responsible for > > > > allocating a memfd and passing along the fd? Another advantage of doing > > > > that is whatever logic we may need to clean up old repair files or > > > > whatever is pushed to userspace. > > > > > > There are two ways we could do this -- one is to have the kernel manage > > > the memfd creation internally (like my patches do now); the other is for > > > xfs_scrub to pass in creat(O_TMPFILE). > > > > > > When repair fputs the file (or fdputs the fd if we switch to using > > > that), the kernel will perform the usual deletion of the zero-linkcount > > > zero-refcount file. We get all the "cleanup" for free by closing the > > > file. > > > > > > > Ok. 
FWIW, the latter approach where xfs_scrub creates a file and passes > > the fd along to the kernel seems preferable to me, but perhaps others > > have different opinions. We could accept a pathname from the user to > > create the file or otherwise attempt to allocate an memfd by default and > > pass that along. > > > > > One other potential complication is that a couple of the repair > > > functions need two memfds. The extended attribute repair creates a > > > fixed-record array for attr keys and an xblob to hold names and values; > > > each structure gets its own memfd. The refcount repair creates two > > > fixed-record arrays, one for refcount records and another to act as a > > > stack of rmaps to compute reference counts. > > > > > > > Hmm, I guess there's nothing stopping scrub from passing in two fds. > > Maybe it would make more sense for the userspace option to be a path > > basename or directory where scrub is allowed to create whatever scratch > > files it needs. > > > > That aside, is there any reason the repair mechanism couldn't emulate > > multiple files with a single fd via a magic offset delimeter or > > something? E.g., "file 1" starts at offset 0, "file 2" starts at offset > > 1TB, etc. (1TB is probably overkill, but you get the idea..). > > Hmm, ok, so to summarize, I see five options: > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...) > however many files it needs. > > 2) Pass in a however many file fds we need and segment the space. > > 3) Pass in a single file fd. > > 4) Let the repair code create as many memfd files as it wants. > > 5) Let the repair code create one memfd file and segment the space. > > I'm pretty sure we don't want to support (2) because that just seems > like a requirements communication nightmare and can burn up a lot of > space in struct xfs_scrub_metadata. > > (3) and (5) are basically the same except for where the file comes from. > For (3) we'd have to make sure the fd filesystem supports large sparse > files (and presumably isn't the xfs we're trying to repair), which > shouldn't be too difficult to probe. For (5) we know that tmpfs already > supports large sparse files. Another difficulty might be that on 32-bit > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE), > though I suppose at this point we only need two files and 8TB should be > enough for anyone. > > (I also think it's reasonable to consider not supporting online repair > on a 32-bit system with a large filesystem...) > > In general, the "pass in a thing from userspace" variants come with the > complication that we have to check the functionality of whatever gets > passed in. On the plus side it likely unlocks access to a lot more > storage than we could get with mem+swap. On the minus side someone > passes in a fd to a drive-managed SMR on USB 2.0, and... > > (1) seems like it would maximize the kernel's flexibility to create as > many (regular, non-sparse) files as it needs, but now we're calling > do_sys_open and managing files ourselves, which might be avoided. > > (4) of course is what we do right now. :) > > Soooo... the simplest userspace interface (I think) is to allow > userspace to pass in a single file fd. Scrub can reject it if it > doesn't measure up (fs is the same, sparse not supported, high offsets > not supported, etc.). If userspace doesn't pass in an fd then we create > a memfd and use that instead. We end up with a hybrid between (3) and (5). 
> That all sounds about right to me except I was thinking userspace would do the memfd fallback of #5 rather than the kernel, just to keep the policy out of the kernel as much as possible. Is there any major advantage to doing it in the kernel? I guess it would slightly complicate 'xfs_io -c repair' ... Brian > --D > > > Brian > > > > > (In theory the xbitmap could also be converted to use the fixed record > > > array, but in practice they haven't (yet) become large enough to warrant > > > it, and there's currently no way to insert or delete records from the > > > middle of the array.) > > > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > > > > swappable or something? > > > > > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > > > the swap file. > > > > > > > > > > > > > > > > > > > Ok. > > > > > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > > > requirement can be made clear to users > > > > > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > > > able to use up all the memory + swap... and since we're probably only > > > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > > > as much memory. > > > > > > > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > > > it's not clear how effectively running the system out of memory via the > > > > > > kernel will behave from a failure perspective. > > > > > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > > > no idea what's going to happen. > > > > > > > > > > Back in the days of the huge linked lists the oom killer would target > > > > > other proceses because it doesn't know that the online repair thread is > > > > > sitting on a ton of pinned kernel memory... > > > > > > > > > > > > > Makes sense, kind of what I'd expect... > > > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > > > we'd still be at risk of other subsystems running into memory > > > > > > allocation problems, filling up swap, the OOM killer going after > > > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > > > picking off processes in service to a running online repair that > > > > > > immediately consumes freed up memory until the system is borked? > > > > > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > > > urge any running repair threads to bail out if they can. It seems to me > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > > > could wait until at least one thread exits before returning. > > > > > > > > > > > > > Ok, something like that could be useful. I agree that we probably don't > > > > need to go that far until the mechanism is nailed down and testing shows > > > > that OOM is a problem. 
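For what it's worth, the oom notifier idea discussed above would boil down to something like the sketch below. xrep_request_abort() is a hypothetical hook that would ask in-flight repairs to unwind and release their scratch memory; it does not exist in the posted series.

#include <linux/oom.h>
#include <linux/notifier.h>

static int
xrep_oom_notify(
	struct notifier_block	*nb,
	unsigned long		unused,
	void			*freed)
{
	/* Ask any running repair threads to bail out at the next opportunity. */
	xrep_request_abort();
	return NOTIFY_OK;
}

static struct notifier_block xrep_oom_nb = {
	.notifier_call	= xrep_oom_notify,
};

/* register_oom_notifier(&xrep_oom_nb) at scrub setup; unregister on teardown. */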
> > > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > > > "400M of RAM and no swap" scenario. Granted, pretty much every other > > > xfs utility also blows out on that so I'm not sure how hard I really > > > need to try... > > > > > > > > > I don't know how likely that is or if it really ends up much different > > > > > > from the analogous xfs_repair situation. My only point right now is > > > > > > that failure scenario is something we should explore for any solution > > > > > > we ultimately consider because it may be an unexpected use case of the > > > > > > underlying mechanism. > > > > > > > > > > Ideally, online repair would always be the victim since we know we have > > > > > a reasonable fallback. At least for memfd, however, I think the only > > > > > clues we have to decide the question "is this memfd getting in the way > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > > > userspace has access to the fd on an OOM notification or whatever, it > > > > might be able to do more accurate analysis based on an fstat() or > > > > something. > > > > > > > > Related question... is the online repair sequence currently > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > > > entries during an allocbt scan for example? > > > > > > It's interruptible (fatal signals only) during the scan phase, but once > > > it starts logging metadata updates it will run all the way to > > > completion. > > > > > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > > > that perspective.) > > > > > > > > > > Same here. > > > > > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > > > tool. > > > > > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > > > for the vm to push back on us if we successfully write all we need to > > > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > > > error back to userspace. We might simply have to free array memory > > > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > > > usage. > > > > > > > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > > > may simplify things because we could allocate it up front and know right > > > > > > away whether we just don't have enough memory available to repair. > > > > > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > > > the pages at once, provided we have some guesstimate beforehand of how > > > > > much space we think we'll need. > > > > > > > > > > So long as my earlier statement about the memory requirements being no > > > > > more than the size of the btree leaves is actually true (I haven't > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > > > > > > > Indeed. 
It would be nice if we could do all of the file management bits > > > > in userspace. > > > > > > Agreed, though no file management would be even better. :) > > > > > > --D > > > > > > > Brian > > > > > > > > > --D > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > +done: > > > > > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > > > > > +bool > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > > > > > +{ > > > > > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > -- > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > -- > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > -- > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > -- > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > the body of a message to majordomo@vger.kernel.org 
> > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html
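For reference, the busy-extent helper quoted in the hunk above would presumably end up looking something like this once the RB_EMPTY_ROOT() suggestion is applied; a sketch of the likely revision, not the reposted patch:

/* Is the busy extent tree for this AG empty? */
bool
xfs_extent_busy_list_empty(
	struct xfs_perag	*pag)
{
	bool			res;

	spin_lock(&pag->pagb_lock);
	res = RB_EMPTY_ROOT(&pag->pagb_tree);
	spin_unlock(&pag->pagb_lock);
	return res;
}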
On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote: > On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote: > > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote: > > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > > > ... > > > > > > > What _seems_ beneficial about that approach is we get (potentially > > > > > > > external) persistent backing and memory reclaim ability with the > > > > > > > traditional memory allocation model. > > > > > > > > > > > > > > ISTM that if we used a regular file, we'd need to deal with the > > > > > > > traditional file interface somehow or another (file read/pagecache > > > > > > > lookup -> record ??). > > > > > > > > > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so > > > > > > all we need is a (struct file *). > > > > > > > > > > > > > We could repurpose some existing mechanism like the directory code or > > > > > > > quota inode mechanism to use xfs buffers for that purpose, but I think > > > > > > > that would require us to always use an internal inode. Allowing > > > > > > > userspace to pass an fd/file passes that consideration on to the user, > > > > > > > which might be more flexible. We could always warn about additional > > > > > > > limitations if that fd happens to be based on the target fs. > > > > > > > > > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach > > > > > > is that we if we ever decide to let userspace pass in a fd, it's trivial > > > > > > to feed that struct file to the kernel io routines instead of a memfd > > > > > > one. > > > > > > > > > > > > > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do > > > > > something like this anyways. Could/should xfs_scrub be responsible for > > > > > allocating a memfd and passing along the fd? Another advantage of doing > > > > > that is whatever logic we may need to clean up old repair files or > > > > > whatever is pushed to userspace. > > > > > > > > There are two ways we could do this -- one is to have the kernel manage > > > > the memfd creation internally (like my patches do now); the other is for > > > > xfs_scrub to pass in creat(O_TMPFILE). 
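Whatever the scratch file's origin, the repair code would consume it through the kernel_read()/kernel_write() helpers mentioned above. A rough sketch of the "segment one fd at magic offsets" idea from earlier in the thread follows; the xrep_scratch structure and the 1TB stride are illustrative assumptions, not code from the series.

#include <linux/fs.h>

#define XREP_SCRATCH_STRIDE	(1ULL << 40)	/* 1TB per logical scratch file */

struct xrep_scratch {
	struct file	*filp;	/* one backing file: memfd or user-supplied */
	unsigned int	index;	/* which logical scratch file */
	loff_t		pos;	/* position within that logical file */
};

static ssize_t
xrep_scratch_write(
	struct xrep_scratch	*xs,
	const void		*buf,
	size_t			count)
{
	loff_t			pos = xs->index * XREP_SCRATCH_STRIDE + xs->pos;
	ssize_t			ret;

	ret = kernel_write(xs->filp, buf, count, &pos);
	if (ret > 0)
		xs->pos += ret;
	return ret;
}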
> > > > > > > > When repair fputs the file (or fdputs the fd if we switch to using > > > > that), the kernel will perform the usual deletion of the zero-linkcount > > > > zero-refcount file. We get all the "cleanup" for free by closing the > > > > file. > > > > > > > > > > Ok. FWIW, the latter approach where xfs_scrub creates a file and passes > > > the fd along to the kernel seems preferable to me, but perhaps others > > > have different opinions. We could accept a pathname from the user to > > > create the file or otherwise attempt to allocate an memfd by default and > > > pass that along. > > > > > > > One other potential complication is that a couple of the repair > > > > functions need two memfds. The extended attribute repair creates a > > > > fixed-record array for attr keys and an xblob to hold names and values; > > > > each structure gets its own memfd. The refcount repair creates two > > > > fixed-record arrays, one for refcount records and another to act as a > > > > stack of rmaps to compute reference counts. > > > > > > > > > > Hmm, I guess there's nothing stopping scrub from passing in two fds. > > > Maybe it would make more sense for the userspace option to be a path > > > basename or directory where scrub is allowed to create whatever scratch > > > files it needs. > > > > > > That aside, is there any reason the repair mechanism couldn't emulate > > > multiple files with a single fd via a magic offset delimeter or > > > something? E.g., "file 1" starts at offset 0, "file 2" starts at offset > > > 1TB, etc. (1TB is probably overkill, but you get the idea..). > > > > Hmm, ok, so to summarize, I see five options: > > > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...) > > however many files it needs. > > > > 2) Pass in a however many file fds we need and segment the space. > > > > 3) Pass in a single file fd. > > > > 4) Let the repair code create as many memfd files as it wants. > > > > 5) Let the repair code create one memfd file and segment the space. > > > > I'm pretty sure we don't want to support (2) because that just seems > > like a requirements communication nightmare and can burn up a lot of > > space in struct xfs_scrub_metadata. > > > > (3) and (5) are basically the same except for where the file comes from. > > For (3) we'd have to make sure the fd filesystem supports large sparse > > files (and presumably isn't the xfs we're trying to repair), which > > shouldn't be too difficult to probe. For (5) we know that tmpfs already > > supports large sparse files. Another difficulty might be that on 32-bit > > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE), > > though I suppose at this point we only need two files and 8TB should be > > enough for anyone. > > > > (I also think it's reasonable to consider not supporting online repair > > on a 32-bit system with a large filesystem...) > > > > In general, the "pass in a thing from userspace" variants come with the > > complication that we have to check the functionality of whatever gets > > passed in. On the plus side it likely unlocks access to a lot more > > storage than we could get with mem+swap. On the minus side someone > > passes in a fd to a drive-managed SMR on USB 2.0, and... > > > > (1) seems like it would maximize the kernel's flexibility to create as > > many (regular, non-sparse) files as it needs, but now we're calling > > do_sys_open and managing files ourselves, which might be avoided. > > > > (4) of course is what we do right now. :) > > > > Soooo... 
the simplest userspace interface (I think) is to allow > > userspace to pass in a single file fd. Scrub can reject it if it > > doesn't measure up (fs is the same, sparse not supported, high offsets > > not supported, etc.). If userspace doesn't pass in an fd then we create > > a memfd and use that instead. We end up with a hybrid between (3) and (5). > > > > That all sounds about right to me except I was thinking userspace would > do the memfd fallback of #5 rather than the kernel, just to keep the > policy out of the kernel as much as possible. Is there any major > advantage to doing it in the kernel? I guess it would slightly > complicate 'xfs_io -c repair' ... Hm. We'll have to use one of the reserved areas of struct xfs_scrub_metadata to pass in the file descriptor. If we create a new XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file descriptor then either we lose compatibility with old kernels (because they reject unknown flags) or xfs_scrub will have to try a repair without a fd (to see if the kernel even cares) and retry if the repair fails with some prearranged error code that means "give me a swapfile, please". Alternately we simply require that the fd cannot be fd 0 since using stdin for swap space is a stupid idea anyways. Technically we're not supposed to have flag days, but otoh this is a xfs-only ioctl for a feature that's still experimental, so perhaps it's not crucial to maintain compatibility with old kernels where the feature is incomplete and experimental? Hmm. We could define the fd field with the requirement that fd > 0, and if the repair function requires an fd and one hasn't been provided, it can fail out with ENOMEM. If it doesn't need extra memory it can just ignore the contents of the fd field. xfs_scrub can then arrange to pass in mem fds or file fds or whatever. --D > Brian > > > --D > > > > > Brian > > > > > > > (In theory the xbitmap could also be converted to use the fixed record > > > > array, but in practice they haven't (yet) become large enough to warrant > > > > it, and there's currently no way to insert or delete records from the > > > > middle of the array.) > > > > > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > > > > > swappable or something? > > > > > > > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > > > > the swap file. > > > > > > > > > > > > > > > > > > > > > > Ok. > > > > > > > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > > > > requirement can be made clear to users > > > > > > > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > > > > able to use up all the memory + swap... and since we're probably only > > > > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > > > > as much memory. > > > > > > > > > > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > > > > it's not clear how effectively running the system out of memory via the > > > > > > > kernel will behave from a failure perspective. 
> > > > > > > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > > > > no idea what's going to happen. > > > > > > > > > > > > Back in the days of the huge linked lists the oom killer would target > > > > > > other proceses because it doesn't know that the online repair thread is > > > > > > sitting on a ton of pinned kernel memory... > > > > > > > > > > > > > > > > Makes sense, kind of what I'd expect... > > > > > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > > > > we'd still be at risk of other subsystems running into memory > > > > > > > allocation problems, filling up swap, the OOM killer going after > > > > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > > > > picking off processes in service to a running online repair that > > > > > > > immediately consumes freed up memory until the system is borked? > > > > > > > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > > > > urge any running repair threads to bail out if they can. It seems to me > > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > > > > could wait until at least one thread exits before returning. > > > > > > > > > > > > > > > > Ok, something like that could be useful. I agree that we probably don't > > > > > need to go that far until the mechanism is nailed down and testing shows > > > > > that OOM is a problem. > > > > > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > > > > "400M of RAM and no swap" scenario. Granted, pretty much every other > > > > xfs utility also blows out on that so I'm not sure how hard I really > > > > need to try... > > > > > > > > > > > I don't know how likely that is or if it really ends up much different > > > > > > > from the analogous xfs_repair situation. My only point right now is > > > > > > > that failure scenario is something we should explore for any solution > > > > > > > we ultimately consider because it may be an unexpected use case of the > > > > > > > underlying mechanism. > > > > > > > > > > > > Ideally, online repair would always be the victim since we know we have > > > > > > a reasonable fallback. At least for memfd, however, I think the only > > > > > > clues we have to decide the question "is this memfd getting in the way > > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > > > > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > > > > userspace has access to the fd on an OOM notification or whatever, it > > > > > might be able to do more accurate analysis based on an fstat() or > > > > > something. > > > > > > > > > > Related question... is the online repair sequence currently > > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > > > > entries during an allocbt scan for example? > > > > > > > > It's interruptible (fatal signals only) during the scan phase, but once > > > > it starts logging metadata updates it will run all the way to > > > > completion. > > > > > > > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > > > > that perspective.) > > > > > > > > > > > > Same here. 
> > > > > > > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > > > > tool. > > > > > > > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > > > > for the vm to push back on us if we successfully write all we need to > > > > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > > > > error back to userspace. We might simply have to free array memory > > > > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > > > > usage. > > > > > > > > > > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > > > > may simplify things because we could allocate it up front and know right > > > > > > > away whether we just don't have enough memory available to repair. > > > > > > > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > > > > the pages at once, provided we have some guesstimate beforehand of how > > > > > > much space we think we'll need. > > > > > > > > > > > > So long as my earlier statement about the memory requirements being no > > > > > > more than the size of the btree leaves is actually true (I haven't > > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > > > > > > > > > > Indeed. It would be nice if we could do all of the file management bits > > > > > in userspace. > > > > > > > > Agreed, though no file management would be even better. :) > > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > +done: > > > > > > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > ... 
> > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > > > > > > +bool > > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > > > > > > +{ > > > > > > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > To 
unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html
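To make the "fd > 0 in a reserved field" idea above concrete, the kernel side might look roughly like the sketch below. sm_fd is a hypothetical addition to struct xfs_scrub_metadata and xrep_get_scratch_file() is a made-up helper; neither exists in the posted series.

#include <linux/file.h>
#include <linux/err.h>

/* Sketch: fetch the userspace-provided scratch file, if any. */
static struct file *
xrep_get_scratch_file(
	struct xfs_scrub_metadata	*sm)
{
	struct file			*filp;

	if (sm->sm_fd <= 0)
		return ERR_PTR(-ENOMEM);	/* repair needs scratch space, none given */

	filp = fget(sm->sm_fd);
	if (!filp)
		return ERR_PTR(-EBADF);

	/* Caller still checks: seekable, sparse files OK, not on the fs being repaired. */
	return filp;
}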
On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote: > On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote: > > On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote: > > > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote: > > > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > > > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: ... > > > > > > Hmm, ok, so to summarize, I see five options: > > > > > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...) > > > however many files it needs. > > > > > > 2) Pass in a however many file fds we need and segment the space. > > > > > > 3) Pass in a single file fd. > > > > > > 4) Let the repair code create as many memfd files as it wants. > > > > > > 5) Let the repair code create one memfd file and segment the space. > > > > > > I'm pretty sure we don't want to support (2) because that just seems > > > like a requirements communication nightmare and can burn up a lot of > > > space in struct xfs_scrub_metadata. > > > > > > (3) and (5) are basically the same except for where the file comes from. > > > For (3) we'd have to make sure the fd filesystem supports large sparse > > > files (and presumably isn't the xfs we're trying to repair), which > > > shouldn't be too difficult to probe. For (5) we know that tmpfs already > > > supports large sparse files. Another difficulty might be that on 32-bit > > > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE), > > > though I suppose at this point we only need two files and 8TB should be > > > enough for anyone. > > > > > > (I also think it's reasonable to consider not supporting online repair > > > on a 32-bit system with a large filesystem...) > > > > > > In general, the "pass in a thing from userspace" variants come with the > > > complication that we have to check the functionality of whatever gets > > > passed in. On the plus side it likely unlocks access to a lot more > > > storage than we could get with mem+swap. On the minus side someone > > > passes in a fd to a drive-managed SMR on USB 2.0, and... > > > > > > (1) seems like it would maximize the kernel's flexibility to create as > > > many (regular, non-sparse) files as it needs, but now we're calling > > > do_sys_open and managing files ourselves, which might be avoided. 
> > > > > > (4) of course is what we do right now. :) > > > > > > Soooo... the simplest userspace interface (I think) is to allow > > > userspace to pass in a single file fd. Scrub can reject it if it > > > doesn't measure up (fs is the same, sparse not supported, high offsets > > > not supported, etc.). If userspace doesn't pass in an fd then we create > > > a memfd and use that instead. We end up with a hybrid between (3) and (5). > > > > > > > That all sounds about right to me except I was thinking userspace would > > do the memfd fallback of #5 rather than the kernel, just to keep the > > policy out of the kernel as much as possible. Is there any major > > advantage to doing it in the kernel? I guess it would slightly > > complicate 'xfs_io -c repair' ... > > Hm. We'll have to use one of the reserved areas of struct > xfs_scrub_metadata to pass in the file descriptor. If we create a new > XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file > descriptor then either we lose compatibility with old kernels (because > they reject unknown flags) or xfs_scrub will have to try a repair > without a fd (to see if the kernel even cares) and retry if the repair > fails with some prearranged error code that means "give me a swapfile, > please". Alternately we simply require that the fd cannot be fd 0 since > using stdin for swap space is a stupid idea anyways. > I'm assuming that the kernel would have some basic checks on the fd to ensure it's usable (seekable, large offsets, etc.), as you mentioned previously. With regard to xfs_scrub_metadata, it sounds like we need to deal with that regardless if we want to support the ability to specify an external file. Is the issue backwards compatibility with the interface as it exists today..? > Technically we're not supposed to have flag days, but otoh this is a > xfs-only ioctl for a feature that's still experimental, so perhaps it's > not crucial to maintain compatibility with old kernels where the feature > is incomplete and experimental? > In my mind, I kind of take the experimental status as all bits/interface may explode and are otherwise subject to change or disappear. Perhaps others feel differently, it does seem we've kind of hinted towards the contrary recently with respect to the per-inode dax bits and then now in this discussion, but IMO that's kind of an inherent risk of doing incremental work on complex features upstream. I dunno, perhaps that's just a misunderstanding on my part. If so, I do wonder if we should be a bit more cautious (in the future) about exposing interfaces to experimental features (DEBUG mode only, for example) for a period of time until the underlying mechanism is fleshed out enough to establish confidence in the interface. It's one thing if an experimental feature is shiny new and potentially unstable at the time it is merged, but enough bits are there for reviewers to understand the design and interface requirements. It's another thing if the implementation is not yet complete, because then it's obviously harder to surmise whether the interface is ultimately sufficient. This of course is all higher level discussion from how to handle scrub.. > Hmm. We could define the fd field with the requirement that fd > 0, and > if the repair function requires an fd and one hasn't been provided, it > can fail out with ENOMEM. If it doesn't need extra memory it can just > ignore the contents of the fd field. xfs_scrub can then arrange to pass > in mem fds or file fds or whatever. 
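A rough userspace-side sketch of the (3)+(5) hybrid being discussed: prefer an admin-supplied scratch file, fall back to a swappable memfd, and preallocate the guessed peak usage so repair fails early instead of OOMing partway through. memfd_create() and fallocate() are real syscalls, but the helper name, the sizing policy, and the idea of handing the fd to the repair ioctl are assumptions drawn from this thread, not existing xfs_scrub code.

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Hypothetical helper, not existing xfs_scrub code: get a scratch fd for
 * online repair to spill records into.  Prefer a file supplied by the
 * admin; otherwise fall back to a swappable memfd.  Preallocate the
 * guessed peak usage up front so we bail out early instead of running
 * the box out of memory halfway through a repair.
 */
static int
get_repair_scratch_fd(const char *path, off_t need_bytes)
{
        int     fd;

        if (path)
                fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0600);
        else
                fd = memfd_create("xfs_scrub scratch", MFD_CLOEXEC);
        if (fd < 0)
                return -1;

        /* Reserve the space now; if we can't get it, don't even start. */
        if (fallocate(fd, 0, 0, need_bytes) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}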
> Is there a versioning mechanism to the interface? I thought we used that approach (or planned to..) in other similar internal commands, so a particular kernel could bump the version and appropriately decide how to handle older versions. Brian > --D > > > Brian > > > > > --D > > > > > > > Brian > > > > > > > > > (In theory the xbitmap could also be converted to use the fixed record > > > > > array, but in practice they haven't (yet) become large enough to warrant > > > > > it, and there's currently no way to insert or delete records from the > > > > > middle of the array.) > > > > > > > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > > > > > > swappable or something? > > > > > > > > > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > > > > > the swap file. > > > > > > > > > > > > > > > > > > > > > > > > > Ok. > > > > > > > > > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > > > > > requirement can be made clear to users > > > > > > > > > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > > > > > able to use up all the memory + swap... and since we're probably only > > > > > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > > > > > as much memory. > > > > > > > > > > > > > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > > > > > it's not clear how effectively running the system out of memory via the > > > > > > > > kernel will behave from a failure perspective. > > > > > > > > > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > > > > > no idea what's going to happen. > > > > > > > > > > > > > > Back in the days of the huge linked lists the oom killer would target > > > > > > > other proceses because it doesn't know that the online repair thread is > > > > > > > sitting on a ton of pinned kernel memory... > > > > > > > > > > > > > > > > > > > Makes sense, kind of what I'd expect... > > > > > > > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > > > > > we'd still be at risk of other subsystems running into memory > > > > > > > > allocation problems, filling up swap, the OOM killer going after > > > > > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > > > > > picking off processes in service to a running online repair that > > > > > > > > immediately consumes freed up memory until the system is borked? > > > > > > > > > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > > > > > urge any running repair threads to bail out if they can. It seems to me > > > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > > > > > could wait until at least one thread exits before returning. > > > > > > > > > > > > > > > > > > > Ok, something like that could be useful. 
I agree that we probably don't > > > > > > need to go that far until the mechanism is nailed down and testing shows > > > > > > that OOM is a problem. > > > > > > > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > > > > > "400M of RAM and no swap" scenario. Granted, pretty much every other > > > > > xfs utility also blows out on that so I'm not sure how hard I really > > > > > need to try... > > > > > > > > > > > > > I don't know how likely that is or if it really ends up much different > > > > > > > > from the analogous xfs_repair situation. My only point right now is > > > > > > > > that failure scenario is something we should explore for any solution > > > > > > > > we ultimately consider because it may be an unexpected use case of the > > > > > > > > underlying mechanism. > > > > > > > > > > > > > > Ideally, online repair would always be the victim since we know we have > > > > > > > a reasonable fallback. At least for memfd, however, I think the only > > > > > > > clues we have to decide the question "is this memfd getting in the way > > > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > > > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > > > > > > > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > > > > > userspace has access to the fd on an OOM notification or whatever, it > > > > > > might be able to do more accurate analysis based on an fstat() or > > > > > > something. > > > > > > > > > > > > Related question... is the online repair sequence currently > > > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > > > > > entries during an allocbt scan for example? > > > > > > > > > > It's interruptible (fatal signals only) during the scan phase, but once > > > > > it starts logging metadata updates it will run all the way to > > > > > completion. > > > > > > > > > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > > > > > that perspective.) > > > > > > > > > > > > > > Same here. > > > > > > > > > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > > > > > tool. > > > > > > > > > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > > > > > for the vm to push back on us if we successfully write all we need to > > > > > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > > > > > error back to userspace. We might simply have to free array memory > > > > > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > > > > > usage. > > > > > > > > > > > > > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > > > > > may simplify things because we could allocate it up front and know right > > > > > > > > away whether we just don't have enough memory available to repair. > > > > > > > > > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > > > > > the pages at once, provided we have some guesstimate beforehand of how > > > > > > > much space we think we'll need. 
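To make the oom notifier idea above slightly more concrete, here is a minimal sketch. register_oom_notifier() and the notifier callback signature are real kernel interfaces; the xchk_* counters, abort flag, and completion are hypothetical hooks that running repair threads would have to poll and signal, and none of this is in the posted patches.

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/notifier.h>
#include <linux/oom.h>

static atomic_t xchk_repairs_running = ATOMIC_INIT(0);
static atomic_t xchk_oom_abort = ATOMIC_INIT(0);
static DECLARE_COMPLETION(xchk_repair_exited);

/* Ask any running repairs to bail out, then wait for one of them to do so. */
static int
xchk_oom_notify(
        struct notifier_block   *nb,
        unsigned long           action,
        void                    *data)
{
        if (!atomic_read(&xchk_repairs_running))
                return NOTIFY_OK;

        atomic_set(&xchk_oom_abort, 1);
        wait_for_completion(&xchk_repair_exited);
        return NOTIFY_OK;
}

static struct notifier_block xchk_oom_nb = {
        .notifier_call  = xchk_oom_notify,
};

/* e.g. at module init time: register_oom_notifier(&xchk_oom_nb); */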
> > > > > > > > > > > > > > So long as my earlier statement about the memory requirements being no > > > > > > > more than the size of the btree leaves is actually true (I haven't > > > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > > > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > > > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > > > > > > > > > > > > > Indeed. It would be nice if we could do all of the file management bits > > > > > > in userspace. > > > > > > > > > > Agreed, though no file management would be even better. :) > > > > > > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > +done: > > > > > > > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > > > > > > > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > > > > > > > +bool > > > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > > > > > > > +{ > > > > > > > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > -- > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > -- > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > More majordomo info at 
On Sat, Aug 11, 2018 at 08:50:49AM -0400, Brian Foster wrote: > On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote: > > On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote: > > > On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote: > > > > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote: > > > > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote: > > > > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote: > > > > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote: > > > > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote: > > > > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote: > > > > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote: > > > > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote: > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote: > > > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote: > > > > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > ... > > > > > > > > Hmm, ok, so to summarize, I see five options: > > > > > > > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...) > > > > however many files it needs. > > > > > > > > 2) Pass in a however many file fds we need and segment the space. > > > > > > > > 3) Pass in a single file fd. > > > > > > > > 4) Let the repair code create as many memfd files as it wants. > > > > > > > > 5) Let the repair code create one memfd file and segment the space. > > > > > > > > I'm pretty sure we don't want to support (2) because that just seems > > > > like a requirements communication nightmare and can burn up a lot of > > > > space in struct xfs_scrub_metadata. > > > > > > > > (3) and (5) are basically the same except for where the file comes from. > > > > For (3) we'd have to make sure the fd filesystem supports large sparse > > > > files (and presumably isn't the xfs we're trying to repair), which > > > > shouldn't be too difficult to probe. For (5) we know that tmpfs already > > > > supports large sparse files. Another difficulty might be that on 32-bit > > > > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE), > > > > though I suppose at this point we only need two files and 8TB should be > > > > enough for anyone. > > > > > > > > (I also think it's reasonable to consider not supporting online repair > > > > on a 32-bit system with a large filesystem...) > > > > > > > > In general, the "pass in a thing from userspace" variants come with the > > > > complication that we have to check the functionality of whatever gets > > > > passed in. On the plus side it likely unlocks access to a lot more > > > > storage than we could get with mem+swap. On the minus side someone > > > > passes in a fd to a drive-managed SMR on USB 2.0, and... 
> > > > > > > > (1) seems like it would maximize the kernel's flexibility to create as > > > > many (regular, non-sparse) files as it needs, but now we're calling > > > > do_sys_open and managing files ourselves, which might be avoided. > > > > > > > > (4) of course is what we do right now. :) > > > > > > > > Soooo... the simplest userspace interface (I think) is to allow > > > > userspace to pass in a single file fd. Scrub can reject it if it > > > > doesn't measure up (fs is the same, sparse not supported, high offsets > > > > not supported, etc.). If userspace doesn't pass in an fd then we create > > > > a memfd and use that instead. We end up with a hybrid between (3) and (5). > > > > > > > > > > That all sounds about right to me except I was thinking userspace would > > > do the memfd fallback of #5 rather than the kernel, just to keep the > > > policy out of the kernel as much as possible. Is there any major > > > advantage to doing it in the kernel? I guess it would slightly > > > complicate 'xfs_io -c repair' ... > > > > Hm. We'll have to use one of the reserved areas of struct > > xfs_scrub_metadata to pass in the file descriptor. If we create a new > > XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file > > descriptor then either we lose compatibility with old kernels (because > > they reject unknown flags) or xfs_scrub will have to try a repair > > without a fd (to see if the kernel even cares) and retry if the repair > > fails with some prearranged error code that means "give me a swapfile, > > please". Alternately we simply require that the fd cannot be fd 0 since > > using stdin for swap space is a stupid idea anyways. > > > > I'm assuming that the kernel would have some basic checks on the fd to > ensure it's usable (seekable, large offsets, etc.), as you mentioned > previously. Of course. :) > With regard to xfs_scrub_metadata, it sounds like we need to deal with > that regardless if we want to support the ability to specify an external > file. Is the issue backwards compatibility with the interface as it > exists today..? Yes, my question is how hard do we try to maintain backwards compatibility with an ioctl that controls an EXPERIMENTAL feature that is disabled by default in Kconfig? :) > > Technically we're not supposed to have flag days, but otoh this is a > > xfs-only ioctl for a feature that's still experimental, so perhaps it's > > not crucial to maintain compatibility with old kernels where the feature > > is incomplete and experimental? > > > > In my mind, I kind of take the experimental status as all bits/interface > may explode and are otherwise subject to change or disappear. Perhaps > others feel differently, it does seem we've kind of hinted towards the > contrary recently with respect to the per-inode dax bits and then now in > this discussion, but IMO that's kind of an inherent risk of doing > incremental work on complex features upstream. > > I dunno, perhaps that's just a misunderstanding on my part. If so, I do > wonder if we should be a bit more cautious (in the future) about > exposing interfaces to experimental features (DEBUG mode only, for > example) for a period of time until the underlying mechanism is fleshed > out enough to establish confidence in the interface. That was my reason for hiding it all behind a 'default N' Kconfig option -- to limit the number of users to those who build their own kernels. 
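For reference, the "basic checks on the fd" mentioned above might look something like the kernel-side sketch below. The helper, its name, and the notion of an sm_fd handed in through the scrub ioctl are assumptions from this thread, not code from the series; only the VFS accessors used here are real.

/*
 * Hypothetical sanity checks for a userspace-supplied scratch fd; not
 * part of the posted series.  sc->mp is the filesystem under repair.
 */
STATIC int
xrep_check_scratch_fd(
        struct xfs_scrub        *sc,
        int                     fd)
{
        struct file             *file;
        struct inode            *inode;
        int                     error = 0;

        file = fget(fd);
        if (!file)
                return -EBADF;
        inode = file_inode(file);

        if (!S_ISREG(inode->i_mode) ||
            !(file->f_mode & FMODE_READ) ||
            !(file->f_mode & FMODE_WRITE)) {
                /* Need a regular file we can read and write. */
                error = -EINVAL;
        } else if (inode->i_sb == sc->mp->m_super) {
                /* Don't let scrub swap to the filesystem it is repairing. */
                error = -EINVAL;
        } else if (inode->i_sb->s_maxbytes < MAX_LFS_FILESIZE) {
                /* Need room for large sparse offsets. */
                error = -EFBIG;
        }

        fput(file);
        return error;
}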
> It's one thing if an experimental feature is shiny new and potentially > unstable at the time it is merged, but enough bits are there for > reviewers to understand the design and interface requirements. It's > another thing if the implementation is not yet complete, because then > it's obviously harder to surmise whether the interface is ultimately > sufficient. <nod> I decided that it if we left experimental warnings in dmesg and the xfs_scrub output and forced users to rebuild their kernel to turn on scrub/repair then it was reasonable that we could change the ioctl interface without worrying too much about backwards compatibility. I think it's fine to add a 's32 sm_fd' field that can't be zero and can be picked up by scrub or repair if they want access to more space. > This of course is all higher level discussion from how to handle scrub.. > > > Hmm. We could define the fd field with the requirement that fd > 0, and > > if the repair function requires an fd and one hasn't been provided, it > > can fail out with ENOMEM. If it doesn't need extra memory it can just > > ignore the contents of the fd field. xfs_scrub can then arrange to pass > > in mem fds or file fds or whatever. > > > > Is there a versioning mechanism to the interface? I thought we used that > approach (or planned to..) in other similar internal commands, so a > particular kernel could bump the version and appropriately decide how to > handle older versions. There's plenty of space in the structure that's all required to be zero, so we could easily add a u8 sm_version some day. The IFLAG bit I mentioned would be sufficient for the fd field. --D > > Brian > > > --D > > > > > Brian > > > > > > > --D > > > > > > > > > Brian > > > > > > > > > > > (In theory the xbitmap could also be converted to use the fixed record > > > > > > array, but in practice they haven't (yet) become large enough to warrant > > > > > > it, and there's currently no way to insert or delete records from the > > > > > > middle of the array.) > > > > > > > > > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it > > > > > > > > > > > swappable or something? > > > > > > > > > > > > > > > > > > > > It's supposed to be. The quick test I ran (allocate a memfd, write 1GB > > > > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into > > > > > > > > > > the swap file. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Ok. > > > > > > > > > > > > > > > > > > > > If so, that sounds a reasonable option provided the swap space > > > > > > > > > > > requirement can be made clear to users > > > > > > > > > > > > > > > > > > > > We can document it. I don't think it's any worse than xfs_repair being > > > > > > > > > > able to use up all the memory + swap... and since we're probably only > > > > > > > > > > going to be repairing one thing at a time, most likely scrub won't need > > > > > > > > > > as much memory. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison > > > > > > > > > are that 1.) the kernel generally has more of a limit on anonymous > > > > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.) > > > > > > > > > it's not clear how effectively running the system out of memory via the > > > > > > > > > kernel will behave from a failure perspective. 
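Purely as an illustration of the sm_fd / sm_version idea above, the proposed fields could be carved out of the existing reserved space roughly as follows. Neither field exists in the real ioctl, and the exact placement was not settled in this thread; this just shows that the structure still pads to 64 bytes.

struct xfs_scrub_metadata {
        __u32   sm_type;        /* What to check? */
        __u32   sm_flags;       /* flags; see below. */
        __u64   sm_ino;         /* inode number. */
        __u32   sm_gen;         /* inode generation. */
        __u32   sm_agno;        /* ag number. */
        __s32   sm_fd;          /* proposed: scratch file fd, must be > 0 */
        __u8    sm_version;     /* proposed: interface version */
        __u8    sm_pad[3];      /* must be zero */
        __u64   sm_reserved[4]; /* still padded to 64 bytes */
};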
> > > > > > > > > > > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part > > > > > > > > > that ends up being a simple problem for the system: OOM kill the bloated > > > > > > > > > xfs_repair process. For an online repair in a similar situation, I have > > > > > > > > > no idea what's going to happen. > > > > > > > > > > > > > > > > Back in the days of the huge linked lists the oom killer would target > > > > > > > > other proceses because it doesn't know that the online repair thread is > > > > > > > > sitting on a ton of pinned kernel memory... > > > > > > > > > > > > > > > > > > > > > > Makes sense, kind of what I'd expect... > > > > > > > > > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM > > > > > > > > > we'd still be at risk of other subsystems running into memory > > > > > > > > > allocation problems, filling up swap, the OOM killer going after > > > > > > > > > unrelated processes, etc. What if, for example, the OOM killer starts > > > > > > > > > picking off processes in service to a running online repair that > > > > > > > > > immediately consumes freed up memory until the system is borked? > > > > > > > > > > > > > > > > Yeah. One thing we /could/ do is register an oom notifier that would > > > > > > > > urge any running repair threads to bail out if they can. It seems to me > > > > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler > > > > > > > > could wait until at least one thread exits before returning. > > > > > > > > > > > > > > > > > > > > > > Ok, something like that could be useful. I agree that we probably don't > > > > > > > need to go that far until the mechanism is nailed down and testing shows > > > > > > > that OOM is a problem. > > > > > > > > > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" + > > > > > > "400M of RAM and no swap" scenario. Granted, pretty much every other > > > > > > xfs utility also blows out on that so I'm not sure how hard I really > > > > > > need to try... > > > > > > > > > > > > > > > I don't know how likely that is or if it really ends up much different > > > > > > > > > from the analogous xfs_repair situation. My only point right now is > > > > > > > > > that failure scenario is something we should explore for any solution > > > > > > > > > we ultimately consider because it may be an unexpected use case of the > > > > > > > > > underlying mechanism. > > > > > > > > > > > > > > > > Ideally, online repair would always be the victim since we know we have > > > > > > > > a reasonable fallback. At least for memfd, however, I think the only > > > > > > > > clues we have to decide the question "is this memfd getting in the way > > > > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting > > > > > > > > kicked by an oom notification. Maybe that'll be enough? > > > > > > > > > > > > > > > > > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If > > > > > > > userspace has access to the fd on an OOM notification or whatever, it > > > > > > > might be able to do more accurate analysis based on an fstat() or > > > > > > > something. > > > > > > > > > > > > > > Related question... is the online repair sequence currently > > > > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in > > > > > > > entries during an allocbt scan for example? 
> > > > > > > > > > > > It's interruptible (fatal signals only) during the scan phase, but once > > > > > > it starts logging metadata updates it will run all the way to > > > > > > completion. > > > > > > > > > > > > > > > (To the contrary, just using a cached file seems a natural fit from > > > > > > > > > that perspective.) > > > > > > > > > > > > > > > > Same here. > > > > > > > > > > > > > > > > > > > and the failure characteristics aren't more severe than for userspace. > > > > > > > > > > > An online repair that puts the broader system at risk of OOM as > > > > > > > > > > > opposed to predictably failing gracefully may not be the most useful > > > > > > > > > > > tool. > > > > > > > > > > > > > > > > > > > > Agreed. One huge downside of memfd seems to be the lack of a mechanism > > > > > > > > > > for the vm to push back on us if we successfully write all we need to > > > > > > > > > > the memfd but then other processes need some memory. Obviously, if the > > > > > > > > > > memfd write itself comes up short or fails then we dump the memfd and > > > > > > > > > > error back to userspace. We might simply have to free array memory > > > > > > > > > > while we iterate the records to minimize the time spent at peak memory > > > > > > > > > > usage. > > > > > > > > > > > > > > > > > > > > > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach > > > > > > > > > may simplify things because we could allocate it up front and know right > > > > > > > > > away whether we just don't have enough memory available to repair. > > > > > > > > > > > > > > > > Hmm. Apparently we actually /can/ call fallocate on memfd to grab all > > > > > > > > the pages at once, provided we have some guesstimate beforehand of how > > > > > > > > much space we think we'll need. > > > > > > > > > > > > > > > > So long as my earlier statement about the memory requirements being no > > > > > > > > more than the size of the btree leaves is actually true (I haven't > > > > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() * > > > > > > > > blocksize) worth of space in the memfd file. Maybe we ask for 1.5x > > > > > > > > that and if we don't get it, we kill the memfd and exit. > > > > > > > > > > > > > > > > > > > > > > Indeed. It would be nice if we could do all of the file management bits > > > > > > > in userspace. > > > > > > > > > > > > Agreed, though no file management would be even better. :) > > > > > > > > > > > > --D > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > > +done: > > > > > > > > > > > > > > > > > > > > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. 
*/ > > > > > > > > > > > > > > > > > > > > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > > > > > > > > > > > > > > > > > > > > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > > > > > > > > > > > > > > > > > > > > + XFS_AG_RESV_NONE); > > > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > ... > > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644 > > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c > > > > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > > > > > > > > > > > > > > > > > > > > diff = b1->bno - b2->bno; > > > > > > > > > > > > > > > > > > > > return diff; > > > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */ > > > > > > > > > > > > > > > > > > > > +bool > > > > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty( > > > > > > > > > > > > > > > > > > > > + struct xfs_perag *pag) > > > > > > > > > > > > > > > > > > > > +{ > > > > > > > > > > > > > > > > > > > > + spin_lock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > > + if (pag->pagb_tree.rb_node) { > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()? > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Good suggestion, thank you! > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > --D > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Brian > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > > + return false; > > > > > > > > > > > > > > > > > > > > + } > > > > > > > > > > > > > > > > > > > > + spin_unlock(&pag->pagb_lock); > > > > > > > > > > > > > > > > > > > > + return true; > > > > > > > > > > > > > > > > > > > > +} > > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644 > > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h > > > > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > > > > > > > > > > > > > > > > > > > > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > > > > > > > > > > > > > > > > > > > > } > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > > > > > > > > > > > > > > > > > > > > + > > > > > > > > > > > > > > > > > > > > #endif /* __XFS_EXTENT_BUSY_H__ */ > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org > > > > > > > > > > > > > > > > > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > 
On Sat, Aug 11, 2018 at 08:50:49AM -0400, Brian Foster wrote: > On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote: > > Technically we're not supposed to have flag days, but otoh this is a > > xfs-only ioctl for a feature that's still experimental, so perhaps it's > > not crucial to maintain compatibility with old kernels where the feature > > is incomplete and experimental? > > > > In my mind, I kind of take the experimental status as all bits/interface > may explode and are otherwise subject to change or disappear. Perhaps > others feel differently, it does seem we've kind of hinted towards the > contrary recently with respect to the per-inode dax bits and then now in > this discussion, but IMO that's kind of an inherent risk of doing > incremental work on complex features upstream. I've always considered that the experimental tag covers the user/ioctl interfaces as much as it does the functionality and on-disk format. i.e. like the on-disk format, the ioctl interfaces are subject to change until we clear the exp. tag, at which point they are essentially fixed forever. We /try/ not to have to change them after the initial merge, but sometimes we screw up and need to fix them before we commit to long term support. Cheers, Dave.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 57ec46951ede..44ddd112acd2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -164,6 +164,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ bitmap.o \ repair.o \ ) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 036b5c7021eb..c9b34ba312ab 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -15,7 +15,6 @@ #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_alloc.h" #include "xfs_rmap.h" #include "xfs_alloc.h" #include "scrub/xfs_scrub.h" diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..b228c2906de2 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recalculate the free extents in an AG by looking for + * gaps in the physical extents recorded in the rmapbt. On a reflink + * filesystem this is a little more tricky in that we have to be aware that + * the rmap records are allowed to overlap. + * + * We derive which blocks belonged to the old bnobt/cntbt by recording all the + * OWN_AG extents and subtracting out the blocks owned by all other OWN_AG + * metadata: the rmapbt blocks visited while iterating the reverse mappings + * and the AGFL blocks. + * + * Once we have both of those pieces, we can reconstruct the bnobt and cntbt + * by blowing out the free block state and freeing all the extents that we + * found. This adds the requirement that we can't have any busy extents in + * the AG because the busy code cannot handle duplicate records. + * + * Note that we can only rebuild both free space btrees at the same time + * because the regular extent freeing infrastructure loads both btrees at the + * same time. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt_extent { + struct list_head list; + xfs_agblock_t bno; + xfs_extlen_t len; +}; + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xfs_bitmap nobtlist; + + /* All OWN_AG blocks. */ + struct xfs_bitmap *btlist; + + /* Free space extents. */ + struct list_head *extlist; + + struct xfs_scrub *sc; + + /* Length of extlist. */ + uint64_t nr_records; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_bno, we have found unused space. + */ + xfs_agblock_t next_bno; + + /* Number of free blocks in this AG. 
*/ + xfs_agblock_t nr_blocks; +}; + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + struct xrep_abt_extent *rae; + xfs_fsblock_t fsb; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + rec->rm_startblock); + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_bno) { + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, + ra->next_bno, rec->rm_startblock - ra->next_bno, + XFS_RMAP_OWN_NULL, 0, 0); + + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); + if (!rae) + return -ENOMEM; + INIT_LIST_HEAD(&rae->list); + rae->bno = ra->next_bno; + rae->len = rec->rm_startblock - ra->next_bno; + list_add_tail(&rae->list, ra->extlist); + ra->nr_records++; + ra->nr_blocks += rae->len; + } + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t bno, + void *priv) +{ + struct xrep_abt *ra = priv; + xfs_fsblock_t fsb; + + fsb = XFS_AGB_TO_FSB(mp, ra->sc->sa.agno, bno); + return xfs_bitmap_set(&ra->nobtlist, fsb, 1); +} + +/* Compare two free space extents. */ +static int +xrep_abt_extent_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xrep_abt_extent *ap; + struct xrep_abt_extent *bp; + + ap = container_of(a, struct xrep_abt_extent, list); + bp = container_of(b, struct xrep_abt_extent, list); + + if (ap->bno > bp->bno) + return 1; + else if (ap->bno < bp->bno) + return -1; + return 0; +} + +/* Free an extent, which creates a record in the bnobt/cntbt. */ +STATIC int +xrep_abt_free_extent( + struct xfs_scrub *sc, + xfs_fsblock_t fsbno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + int error; + + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); + if (error) + return error; + error = xrep_roll_ag_trans(sc); + if (error) + return error; + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); +} + +/* Find the longest free extent in the list. */ +static struct xrep_abt_extent * +xrep_abt_get_longest( + struct list_head *free_extents) +{ + struct xrep_abt_extent *rae; + struct xrep_abt_extent *res = NULL; + + list_for_each_entry(rae, free_extents, list) { + if (!res || rae->len > res->len) + res = rae; + } + return res; +} + +/* + * Allocate a block from the (cached) first extent in the AG. In theory + * this should never fail, since we already checked that there was enough + * space to handle the new btrees. + */ +STATIC xfs_fsblock_t +xrep_abt_alloc_block( + struct xfs_scrub *sc, + struct list_head *free_extents) +{ + struct xrep_abt_extent *ext; + + /* Pull the first free space extent off the list, and... */ + ext = list_first_entry(free_extents, struct xrep_abt_extent, list); + + /* ...take its first block. */ + ext->bno++; + ext->len--; + if (ext->len == 0) { + list_del(&ext->list); + kmem_free(ext); + } + + return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1); +} + +/* Free every record in the extent list. 
> + */
> +STATIC void
> +xrep_abt_cancel_freelist(
> +	struct list_head	*extlist)
> +{
> +	struct xrep_abt_extent	*rae;
> +	struct xrep_abt_extent	*n;
> +
> +	list_for_each_entry_safe(rae, n, extlist, list) {
> +		list_del(&rae->list);
> +		kmem_free(rae);
> +	}
> +}
> +
> +/*
> + * Iterate all reverse mappings to find (1) the free extents, (2) the OWN_AG
> + * extents, (3) the rmapbt blocks, and (4) the AGFL blocks.  The free space is
> + * (1) + (2) - (3) - (4).  Figure out if we have enough free space to
> + * reconstruct the free space btrees.  Caller must clean up the input lists
> + * if something goes wrong.
> + */
> +STATIC int
> +xrep_abt_find_freespace(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents,
> +	struct xfs_bitmap	*old_allocbt_blocks)
> +{
> +	struct xrep_abt		ra;
> +	struct xrep_abt_extent	*rae;
> +	struct xfs_btree_cur	*cur;
> +	struct xfs_mount	*mp = sc->mp;
> +	xfs_agblock_t		agend;
> +	xfs_agblock_t		nr_blocks;
> +	int			error;
> +
> +	ra.extlist = free_extents;
> +	ra.btlist = old_allocbt_blocks;
> +	xfs_bitmap_init(&ra.nobtlist);
> +	ra.next_bno = 0;
> +	ra.nr_records = 0;
> +	ra.nr_blocks = 0;
> +	ra.sc = sc;
> +
> +	/*
> +	 * Iterate all the reverse mappings to find gaps in the physical
> +	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
> +	 */
> +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno);
> +	error = xfs_rmap_query_all(cur, xrep_abt_walk_rmap, &ra);
> +	if (error)
> +		goto err;
> +	xfs_btree_del_cursor(cur, error);
> +	cur = NULL;
> +
> +	/* Insert a record for space between the last rmap and EOAG. */
> +	agend = be32_to_cpu(XFS_BUF_TO_AGF(sc->sa.agf_bp)->agf_length);
> +	if (ra.next_bno < agend) {
> +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> +		if (!rae) {
> +			error = -ENOMEM;
> +			goto err;
> +		}
> +		INIT_LIST_HEAD(&rae->list);
> +		rae->bno = ra.next_bno;
> +		rae->len = agend - ra.next_bno;
> +		list_add_tail(&rae->list, free_extents);
> +		ra.nr_records++;
> +		ra.nr_blocks += rae->len;
> +	}
> +
> +	/* Collect all the AGFL blocks. */
> +	error = xfs_agfl_walk(mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
> +			sc->sa.agfl_bp, xrep_abt_walk_agfl, &ra);
> +	if (error)
> +		goto err;
> +
> +	/* Do we have enough space to rebuild both freespace btrees? */
> +	nr_blocks = 2 * xfs_allocbt_calc_size(mp, ra.nr_records);
> +	if (!xrep_ag_has_space(sc->sa.pag, nr_blocks, XFS_AG_RESV_NONE) ||
> +	    ra.nr_blocks < nr_blocks) {
> +		error = -ENOSPC;
> +		goto err;
> +	}
> +
> +	/* Compute the old bnobt/cntbt blocks. */
> +	error = xfs_bitmap_disunion(old_allocbt_blocks, &ra.nobtlist);
> +err:
> +	xfs_bitmap_destroy(&ra.nobtlist);
> +	if (cur)
> +		xfs_btree_del_cursor(cur, error);
> +	return error;
> +}
> +
> +/*
> + * Reset the global free block counter and the per-AG counters to make it look
> + * like this AG has no free space.
> + */
> +STATIC int
> +xrep_abt_reset_counters(
> +	struct xfs_scrub	*sc,
> +	int			*log_flags)
> +{
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_agf		*agf;
> +	xfs_agblock_t		new_btblks;
> +	xfs_agblock_t		to_free;
> +	int			error;
> +
> +	/*
> +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> +	 * non-root blocks of the free space and rmap btrees.  Do this before
> +	 * resetting the AGF counters.
> +	 */
> +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +
> +	/* rmap_blocks accounts root block, btreeblks doesn't */
> +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> +
> +	/* btreeblks doesn't account bno/cnt root blocks */
> +	to_free = pag->pagf_btreeblks + 2;
> +
> +	/* and don't account for the blocks we aren't freeing */
> +	to_free -= new_btblks;
> +
> +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> +	 * state stale in case we fail out of here.
> +	 */
> +	ASSERT(pag->pagf_init);
> +	pag->pagf_init = 0;
> +	pag->pagf_btreeblks = new_btblks;
> +	pag->pagf_freeblks = 0;
> +	pag->pagf_longest = 0;
> +
> +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> +	agf->agf_freeblks = 0;
> +	agf->agf_longest = 0;
> +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> +
> +	return 0;
> +}
> +
> +/* Initialize a new free space btree root and implant into AGF. */
> +STATIC int
> +xrep_abt_reset_btree(
> +	struct xfs_scrub	*sc,
> +	xfs_btnum_t		btnum,
> +	struct list_head	*free_extents)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xfs_buf		*bp;
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +	xfs_fsblock_t		fsbno;
> +	int			error;
> +
> +	/* Allocate new root block. */
> +	fsbno = xrep_abt_alloc_block(sc, free_extents);
> +	if (fsbno == NULLFSBLOCK)
> +		return -ENOSPC;
> +
> +	/* Initialize new tree root. */
> +	error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops);
> +	if (error)
> +		return error;
> +
> +	/* Implant into AGF. */
> +	agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno));
> +	agf->agf_levels[btnum] = cpu_to_be32(1);
> +
> +	/* Add rmap records for the btree roots */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
> +			XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo);
> +	if (error)
> +		return error;
> +
> +	/* Reset the incore state. */
> +	pag->pagf_levels[btnum] = 1;
> +
> +	return 0;
> +}
> +
> +/* Initialize new bnobt/cntbt roots and implant them into the AGF. */
> +STATIC int
> +xrep_abt_reset_btrees(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents,
> +	int			*log_flags)
> +{
> +	int			error;
> +
> +	error = xrep_abt_reset_btree(sc, XFS_BTNUM_BNOi, free_extents);
> +	if (error)
> +		return error;
> +	error = xrep_abt_reset_btree(sc, XFS_BTNUM_CNTi, free_extents);
> +	if (error)
> +		return error;
> +
> +	*log_flags |= XFS_AGF_ROOTS | XFS_AGF_LEVELS;
> +	return 0;
> +}
> +
> +/*
> + * Make our new freespace btree roots permanent so that we can start freeing
> + * unused space back into the AG.
> + */
> +STATIC int
> +xrep_abt_commit_new(
> +	struct xfs_scrub	*sc,
> +	struct xfs_bitmap	*old_allocbt_blocks,
> +	int			log_flags)
> +{
> +	int			error;
> +
> +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> +
> +	/* Invalidate the old freespace btree blocks and commit. */
> +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> +	if (error)
> +		return error;
> +	error = xrep_roll_ag_trans(sc);
> +	if (error)
> +		return error;
> +
> +	/* Now that we've succeeded, mark the incore state valid again. */
> +	sc->sa.pag->pagf_init = 1;
> +	return 0;
> +}
> +
> +/* Build new free space btrees and dispose of the old one. */
> +STATIC int
> +xrep_abt_rebuild_trees(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents,
> +	struct xfs_bitmap	*old_allocbt_blocks)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xrep_abt_extent	*rae;
> +	struct xrep_abt_extent	*n;
> +	struct xrep_abt_extent	*longest;
> +	int			error;
> +
> +	xfs_rmap_skip_owner_update(&oinfo);
> +
> +	/*
> +	 * Insert the longest free extent in case it's necessary to
> +	 * refresh the AGFL with multiple blocks.  If there is no longest
> +	 * extent, we had exactly the free space we needed; we're done.
> +	 */
> +	longest = xrep_abt_get_longest(free_extents);
> +	if (!longest)
> +		goto done;
> +	error = xrep_abt_free_extent(sc,
> +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> +			longest->len, &oinfo);
> +	list_del(&longest->list);
> +	kmem_free(longest);
> +	if (error)
> +		return error;
> +
> +	/* Insert records into the new btrees. */
> +	list_for_each_entry_safe(rae, n, free_extents, list) {
> +		error = xrep_abt_free_extent(sc,
> +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> +				rae->len, &oinfo);
> +		if (error)
> +			return error;
> +		list_del(&rae->list);
> +		kmem_free(rae);
> +	}
> +
> +done:
> +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> +			XFS_AG_RESV_NONE);
> +}
> +
> +/* Repair the freespace btrees for some AG. */
> +int
> +xrep_allocbt(
> +	struct xfs_scrub	*sc)
> +{
> +	struct list_head	free_extents;
> +	struct xfs_bitmap	old_allocbt_blocks;
> +	struct xfs_mount	*mp = sc->mp;
> +	int			log_flags = 0;
> +	int			error;
> +
> +	/* We require the rmapbt to rebuild anything. */
> +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> +		return -EOPNOTSUPP;
> +
> +	xchk_perag_get(sc->mp, &sc->sa);
> +
> +	/*
> +	 * Make sure the busy extent list is clear because we can't put
> +	 * extents on there twice.
> +	 */
> +	if (!xfs_extent_busy_list_empty(sc->sa.pag))
> +		return -EDEADLOCK;
> +
> +	/* Collect the free space data and find the old btree blocks. */
> +	INIT_LIST_HEAD(&free_extents);
> +	xfs_bitmap_init(&old_allocbt_blocks);
> +	error = xrep_abt_find_freespace(sc, &free_extents, &old_allocbt_blocks);
> +	if (error)
> +		goto out;
> +
> +	/* Make sure we got some free space. */
> +	if (list_empty(&free_extents)) {
> +		error = -ENOSPC;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Sort the free extents by block number to avoid bnobt splits when we
> +	 * rebuild the free space btrees.
> +	 */
> +	list_sort(NULL, &free_extents, xrep_abt_extent_cmp);
> +
> +	/*
> +	 * Blow out the old free space btrees.  This is the point at which
> +	 * we are no longer able to bail out gracefully.
> +	 */
> +	error = xrep_abt_reset_counters(sc, &log_flags);
> +	if (error)
> +		goto out;
> +	error = xrep_abt_reset_btrees(sc, &free_extents, &log_flags);
> +	if (error)
> +		goto out;
> +	error = xrep_abt_commit_new(sc, &old_allocbt_blocks, log_flags);
> +	if (error)
> +		goto out;
> +
> +	/* Now rebuild the freespace information. */
> +	error = xrep_abt_rebuild_trees(sc, &free_extents, &old_allocbt_blocks);
> +out:
> +	xrep_abt_cancel_freelist(&free_extents);
> +	xfs_bitmap_destroy(&old_allocbt_blocks);
> +	return error;
> +}
> diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
> index 346b02abccf7..0fb949afaca9 100644
> --- a/fs/xfs/scrub/common.c
> +++ b/fs/xfs/scrub/common.c
> @@ -623,8 +623,14 @@ xchk_setup_ag_btree(
>  	 * expensive operation should be performed infrequently and only
>  	 * as a last resort.  Any caller that sets force_log should
>  	 * document why they need to do so.
> +	 *
> +	 * Force everything in memory out to disk if we're repairing.
> +	 * This ensures we won't get tripped up by btree blocks sitting
> +	 * in memory waiting to have LSNs stamped in.  The AGF/AGI repair
> +	 * routines use any available rmap data to try to find a btree
> +	 * root that also passes the read verifiers.
>  	 */
> -	if (force_log) {
> +	if (force_log || (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) {
>  		error = xchk_checkpoint_log(mp);
>  		if (error)
>  			return error;
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 9de321eee4ab..bc1a5f1cbcdc 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -61,6 +61,7 @@ int xrep_superblock(struct xfs_scrub *sc);
>  int xrep_agf(struct xfs_scrub *sc);
>  int xrep_agfl(struct xfs_scrub *sc);
>  int xrep_agi(struct xfs_scrub *sc);
> +int xrep_allocbt(struct xfs_scrub *sc);
>  
>  #else
>  
> @@ -87,6 +88,7 @@ xrep_calc_ag_resblks(
>  #define xrep_agf			xrep_notsupported
>  #define xrep_agfl			xrep_notsupported
>  #define xrep_agi			xrep_notsupported
> +#define xrep_allocbt			xrep_notsupported
>  
>  #endif /* CONFIG_XFS_ONLINE_REPAIR */
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index 4bfae1e61d30..2133a3199372 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -232,13 +232,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
>  		.type	= ST_PERAG,
>  		.setup	= xchk_setup_ag_allocbt,
>  		.scrub	= xchk_bnobt,
> -		.repair	= xrep_notsupported,
> +		.repair	= xrep_allocbt,
>  	},
>  	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
>  		.type	= ST_PERAG,
>  		.setup	= xchk_setup_ag_allocbt,
>  		.scrub	= xchk_cntbt,
> -		.repair	= xrep_notsupported,
> +		.repair	= xrep_allocbt,
>  	},
>  	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
>  		.type	= ST_PERAG,
> diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> index 4e20f0e48232..26bd5dc68efe 100644
> --- a/fs/xfs/scrub/trace.h
> +++ b/fs/xfs/scrub/trace.h
> @@ -551,7 +551,7 @@ DEFINE_EVENT(xrep_rmap_class, name, \
>  		 xfs_agblock_t agbno, xfs_extlen_t len, \
>  		 uint64_t owner, uint64_t offset, unsigned int flags), \
>  	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
> -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
> +DEFINE_REPAIR_RMAP_EVENT(xrep_abt_walk_rmap);
>  DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
>  DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
>  DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 0ed68379e551..82f99633a597 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
>  		diff = b1->bno - b2->bno;
>  	return diff;
>  }
> +
> +/* Are there any busy extents in this AG? */
> +bool
> +xfs_extent_busy_list_empty(
> +	struct xfs_perag	*pag)
> +{
> +	spin_lock(&pag->pagb_lock);
> +	if (pag->pagb_tree.rb_node) {
> +		spin_unlock(&pag->pagb_lock);
> +		return false;
> +	}
> +	spin_unlock(&pag->pagb_lock);
> +	return true;
> +}
> diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> index 990ab3891971..2f8c73c712c6 100644
> --- a/fs/xfs/xfs_extent_busy.h
> +++ b/fs/xfs/xfs_extent_busy.h
> @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
>  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
>  }
>  
> +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> +
>  #endif /* __XFS_EXTENT_BUSY_H__ */