Message ID | 155916878781.757870.16686660432312494436.stgit@magnolia (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | xfs: refactor and improve inode iteration | expand |
On Wed, May 29, 2019 at 03:26:27PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Description? :) > +/* > + * Walking All the Inodes in the Filesystem > + * ======================================== > + * Starting at some @startino, call a walk function on every allocated inode in > + * the system. The walk function is called with the relevant inode number and > + * a pointer to caller-provided data. The walk function can return the usual > + * negative error code, 0, or XFS_IWALK_ABORT to stop the iteration. This > + * return value is returned to the caller. The walker iterates inodes in what order? What does it do with inodes before @startino? > + * Internally, we allow the walk function to do anything, which means that we > + * cannot maintain the inobt cursor or our lock on the AGI buffer. We > + * therefore build up a batch of inobt records in kernel memory and only call > + * the walk function when our memory buffer is full. > + */ "It is the responsibility of the walk function to ensure it accesses allocated inodes, as the inobt records may be stale by the time they are acted upon." > +struct xfs_iwalk_ag { > + struct xfs_mount *mp; > + struct xfs_trans *tp; > + > + /* Where do we start the traversal? */ > + xfs_ino_t startino; > + > + /* Array of inobt records we cache. */ > + struct xfs_inobt_rec_incore *recs; > + unsigned int sz_recs; > + unsigned int nr_recs; sz is the size of the allocated array, nr is the number of entries used? > + /* Inode walk function and data pointer. */ > + xfs_iwalk_fn iwalk_fn; > + void *data; > +}; > + > +/* Allocate memory for a walk. */ > +STATIC int > +xfs_iwalk_allocbuf( > + struct xfs_iwalk_ag *iwag) > +{ > + size_t size; > + > + ASSERT(iwag->recs == NULL); > + iwag->nr_recs = 0; > + > + /* Allocate a prefetch buffer for inobt records. */ > + size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); > + iwag->recs = kmem_alloc(size, KM_SLEEP); > + if (iwag->recs == NULL) > + return -ENOMEM; KM_SLEEP will never fail. You mean to use KM_MAYFAIL here? > + > + return 0; > +} > + > +/* Free memory we allocated for a walk. */ > +STATIC void > +xfs_iwalk_freebuf( > + struct xfs_iwalk_ag *iwag) > +{ > + ASSERT(iwag->recs != NULL); > + kmem_free(iwag->recs); > +} No need for the assert here - kmem_free() handles null pointers just fine. > +/* For each inuse inode in each cached inobt record, call our function. */ > +STATIC int > +xfs_iwalk_ag_recs( > + struct xfs_iwalk_ag *iwag) > +{ > + struct xfs_mount *mp = iwag->mp; > + struct xfs_trans *tp = iwag->tp; > + struct xfs_inobt_rec_incore *irec; > + xfs_ino_t ino; > + unsigned int i, j; > + xfs_agnumber_t agno; > + int error; > + > + agno = XFS_INO_TO_AGNO(mp, iwag->startino); > + for (i = 0, irec = iwag->recs; i < iwag->nr_recs; i++, irec++) { I kinda prefer single iterator loops for array walking like this: for (i = 0; i < iwag->nr_recs; i++) { irec = &iwag->recs[i]; It's much easier to read and understand what is going on... > + trace_xfs_iwalk_ag_rec(mp, agno, irec->ir_startino, > + irec->ir_free); Could just pass irec to the trace function and extract startino/free within the tracepoint macro.... > + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > + /* Skip if this inode is free */ > + if (XFS_INOBT_MASK(j) & irec->ir_free) > + continue; > + > + /* Otherwise call our function. */ > + ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); > + error = iwag->iwalk_fn(mp, tp, ino, iwag->data); > + if (error) > + return error; > + } > + } > + > + iwag->nr_recs = 0; Why is this zeroed here? > + return 0; > +} > + > +/* Read AGI and create inobt cursor. */ > +static inline int > +xfs_iwalk_inobt_cur( > + struct xfs_mount *mp, > + struct xfs_trans *tp, > + xfs_agnumber_t agno, > + struct xfs_btree_cur **curpp, > + struct xfs_buf **agi_bpp) > +{ > + struct xfs_btree_cur *cur; > + int error; > + > + ASSERT(*agi_bpp == NULL); > + > + error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); > + if (error) > + return error; > + > + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO); > + if (!cur) > + return -ENOMEM; > + *curpp = cur; > + return 0; > +} This is a common pattern. Used in xfs_imap_lookup(), xfs_bulkstat(), xfs_inumbers and xfs_inobt_count_blocks. Perhaps should be a common inobt function? > + > +/* Delete cursor and let go of AGI. */ > +static inline void > +xfs_iwalk_del_inobt( > + struct xfs_trans *tp, > + struct xfs_btree_cur **curpp, > + struct xfs_buf **agi_bpp, > + int error) > +{ > + if (*curpp) { > + xfs_btree_del_cursor(*curpp, error); > + *curpp = NULL; > + } > + if (*agi_bpp) { > + xfs_trans_brelse(tp, *agi_bpp); > + *agi_bpp = NULL; > + } > +} > + > +/* > + * Set ourselves up for walking inobt records starting from a given point in > + * the filesystem. > + * > + * If caller passed in a nonzero start inode number, load the record from the > + * inobt and make the record look like all the inodes before agino are free so > + * that we skip them, and then move the cursor to the next inobt record. This > + * is how we support starting an iwalk in the middle of an inode chunk. > + * > + * If the caller passed in a start number of zero, move the cursor to the first > + * inobt record. > + * > + * The caller is responsible for cleaning up the cursor and buffer pointer > + * regardless of the error status. > + */ > +STATIC int > +xfs_iwalk_ag_start( > + struct xfs_iwalk_ag *iwag, > + xfs_agnumber_t agno, > + xfs_agino_t agino, > + struct xfs_btree_cur **curpp, > + struct xfs_buf **agi_bpp, > + int *has_more) > +{ > + struct xfs_mount *mp = iwag->mp; > + struct xfs_trans *tp = iwag->tp; > + int icount; > + int error; > + > + /* Set up a fresh cursor and empty the inobt cache. */ > + iwag->nr_recs = 0; > + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); > + if (error) > + return error; > + > + /* Starting at the beginning of the AG? That's easy! */ > + if (agino == 0) > + return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); > + > + /* > + * Otherwise, we have to grab the inobt record where we left off, stuff > + * the record into our cache, and then see if there are more records. > + * We require a lookup cache of at least two elements so that we don't > + * have to deal with tearing down the cursor to walk the records. > + */ > + error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount, > + &iwag->recs[iwag->nr_recs]); > + if (error) > + return error; > + if (icount) > + iwag->nr_recs++; > + > + ASSERT(iwag->nr_recs < iwag->sz_recs); Why this code does what it does with nr_recs is a bit of a mystery to me... > + return xfs_btree_increment(*curpp, 0, has_more); > +} > + > +typedef int (*xfs_iwalk_ag_recs_fn)(struct xfs_iwalk_ag *iwag); > + > +/* > + * Acknowledge that we added an inobt record to the cache. Flush the inobt > + * record cache if the buffer is full, and position the cursor wherever it > + * needs to be so that we can keep going. > + */ > +STATIC int > +xfs_iwalk_ag_increment( > + struct xfs_iwalk_ag *iwag, > + xfs_iwalk_ag_recs_fn walk_ag_recs_fn, > + xfs_agnumber_t agno, > + struct xfs_btree_cur **curpp, > + struct xfs_buf **agi_bpp, > + int *has_more) > +{ > + struct xfs_mount *mp = iwag->mp; > + struct xfs_trans *tp = iwag->tp; > + struct xfs_inobt_rec_incore *irec; > + xfs_agino_t restart; > + int error; > + > + iwag->nr_recs++; > + > + /* If there's space, just increment and look for more records. */ > + if (iwag->nr_recs < iwag->sz_recs) > + return xfs_btree_increment(*curpp, 0, has_more); Incrementing before explaining why we're incrementing seems a bit fack-to-bront.... > + /* > + * Otherwise the record cache is full; delete the cursor and walk the > + * records... > + */ > + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); > + irec = &iwag->recs[iwag->nr_recs - 1]; > + restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; > + > + error = walk_ag_recs_fn(iwag); > + if (error) > + return error; Urk, so an "increment" function actually run all the object callbacks? But only if it fails to increment? > + > + /* ...and recreate cursor where we left off. */ > + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); > + if (error) > + return error; > + > + return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); And then it goes an increments anyway? That's all a bit .... non-obvious. Especially as it has a single caller - this should really be something like xfs_iwalk_run_callbacks(). Bit more context below... > +} > + > +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ > +STATIC int > +xfs_iwalk_ag( > + struct xfs_iwalk_ag *iwag) > +{ > + struct xfs_mount *mp = iwag->mp; > + struct xfs_trans *tp = iwag->tp; > + struct xfs_buf *agi_bp = NULL; > + struct xfs_btree_cur *cur = NULL; > + xfs_agnumber_t agno; > + xfs_agino_t agino; > + int has_more; > + int error = 0; > + > + /* Set up our cursor at the right place in the inode btree. */ > + agno = XFS_INO_TO_AGNO(mp, iwag->startino); > + agino = XFS_INO_TO_AGINO(mp, iwag->startino); > + error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > + if (error) > + goto out_cur; > + > + while (has_more) { > + struct xfs_inobt_rec_incore *irec; > + > + /* Fetch the inobt record. */ > + irec = &iwag->recs[iwag->nr_recs]; > + error = xfs_inobt_get_rec(cur, irec, &has_more); > + if (error) > + goto out_cur; > + if (!has_more) > + break; > + > + /* No allocated inodes in this chunk; skip it. */ > + if (irec->ir_freecount == irec->ir_count) { > + error = xfs_btree_increment(cur, 0, &has_more); > + goto next_loop; > + } > + > + /* > + * Start readahead for this inode chunk in anticipation of > + * walking the inodes. > + */ > + xfs_bulkstat_ichunk_ra(mp, agno, irec); > + > + /* > + * Add this inobt record to our cache, flush the cache if > + * needed, and move on to the next record. > + */ > + error = xfs_iwalk_ag_increment(iwag, xfs_iwalk_ag_recs, agno, > + &cur, &agi_bp, &has_more); Ok, so given this loop already has an increment case in it, it seems like it would be better to pull some of this function into the loop somewhat like: while (has_more) { struct xfs_inobt_rec_incore *irec; cond_resched(); /* Fetch the inobt record. */ irec = &iwag->recs[iwag->nr_recs]; error = xfs_inobt_get_rec(cur, irec, &has_more); if (error || !has_more) break; /* No allocated inodes in this chunk; skip it. */ if (irec->ir_freecount == irec->ir_count) { error = xfs_btree_increment(cur, 0, &has_more); if (error) break; continue; } /* * Start readahead for this inode chunk in anticipation of * walking the inodes. */ xfs_bulkstat_ichunk_ra(mp, agno, irec); /* If there's space in the buffer, just grab more records. */ if (++iwag->nr_recs < iwag->sz_recs) error = xfs_btree_increment(cur, 0, &has_more); if (error) break; continue; } error = xfs_iwalk_run_callbacks(iwag, ...); } xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); if (!iwag->nr_recs || error) return error; return xfs_iwalk_ag_recs(iwag); } > + /* Walk any records left behind in the cache. */ > + if (iwag->nr_recs) { > + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > + return xfs_iwalk_ag_recs(iwag); > + } > + > +out_cur: > + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > + return error; > +} > + > +/* > + * Given the number of inodes to prefetch, set the number of inobt records that > + * we cache in memory, which controls the number of inodes we try to read > + * ahead. > + * > + * If no max prefetch was given, default to one page's worth of inobt records; > + * this should be plenty of inodes to read ahead. That's a lot of inodes on a 64k page size machine. I think it would be better capped at number that doesn't change with processor architecture... > + */ > +static inline void > +xfs_iwalk_set_prefetch( > + struct xfs_iwalk_ag *iwag, > + unsigned int max_prefetch) > +{ > + if (max_prefetch) > + iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) / > + XFS_INODES_PER_CHUNK; > + else > + iwag->sz_recs = PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore); > + > + /* > + * Allocate enough space to prefetch at least two records so that we > + * can cache both the inobt record where the iwalk started and the next > + * record. This simplifies the AG inode walk loop setup code. > + */ > + if (iwag->sz_recs < 2) > + iwag->sz_recs = 2; iwag->sz_recs = max(iwag->sz_recs, 2); .... > + xfs_iwalk_set_prefetch(&iwag, max_prefetch); > + error = xfs_iwalk_allocbuf(&iwag); .... > + xfs_iwalk_freebuf(&iwag); I'd drop the "buf" from the names of those two functions... Cheers, Dave.
On Tue, Jun 04, 2019 at 05:41:42PM +1000, Dave Chinner wrote: > On Wed, May 29, 2019 at 03:26:27PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > Description? :) > > > +/* > > + * Walking All the Inodes in the Filesystem > > + * ======================================== > > + * Starting at some @startino, call a walk function on every allocated inode in > > + * the system. The walk function is called with the relevant inode number and > > + * a pointer to caller-provided data. The walk function can return the usual > > + * negative error code, 0, or XFS_IWALK_ABORT to stop the iteration. This > > + * return value is returned to the caller. > > The walker iterates inodes in what order? What does it do with > inodes before @startino? They're walked in increasing order, and it ignores the ones before @startino. How about: /* * This iterator function walks a subset of filesystem inodes in increasing * order from @startino until there are no more inodes. For each allocated * inode it finds, it calls a walk function with the relevant inode number and * a pointer to caller-provided data. The walk function can return the usual * negative error code to stop the iteration; 0 to continue the iteration; or * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the * caller. */ > > + * Internally, we allow the walk function to do anything, which means that we > > + * cannot maintain the inobt cursor or our lock on the AGI buffer. We > > + * therefore build up a batch of inobt records in kernel memory and only call > > + * the walk function when our memory buffer is full. > > + */ > > "It is the responsibility of the walk function to ensure it accesses > allocated inodes, as the inobt records may be stale by the time they are > acted upon." Added. > > > +struct xfs_iwalk_ag { > > + struct xfs_mount *mp; > > + struct xfs_trans *tp; > > + > > + /* Where do we start the traversal? */ > > + xfs_ino_t startino; > > + > > + /* Array of inobt records we cache. */ > > + struct xfs_inobt_rec_incore *recs; > > + unsigned int sz_recs; > > + unsigned int nr_recs; > > sz is the size of the allocated array, nr is the number of entries > used? Yes. I'll clarify that: /* Number of entries allocated for the @recs array. */ unsigned int sz_recs; /* Number of entries in the @recs array that are in use. */ unsigned int nr_recs; > > + /* Inode walk function and data pointer. */ > > + xfs_iwalk_fn iwalk_fn; > > + void *data; > > +}; > > + > > +/* Allocate memory for a walk. */ > > +STATIC int > > +xfs_iwalk_allocbuf( > > + struct xfs_iwalk_ag *iwag) > > +{ > > + size_t size; > > + > > + ASSERT(iwag->recs == NULL); > > + iwag->nr_recs = 0; > > + > > + /* Allocate a prefetch buffer for inobt records. */ > > + size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); > > + iwag->recs = kmem_alloc(size, KM_SLEEP); > > + if (iwag->recs == NULL) > > + return -ENOMEM; > > KM_SLEEP will never fail. You mean to use KM_MAYFAIL here? > > > + > > + return 0; > > +} > > + > > +/* Free memory we allocated for a walk. */ > > +STATIC void > > +xfs_iwalk_freebuf( > > + struct xfs_iwalk_ag *iwag) > > +{ > > + ASSERT(iwag->recs != NULL); > > + kmem_free(iwag->recs); > > +} > > No need for the assert here - kmem_free() handles null pointers just > fine. > > > +/* For each inuse inode in each cached inobt record, call our function. */ > > +STATIC int > > +xfs_iwalk_ag_recs( > > + struct xfs_iwalk_ag *iwag) > > +{ > > + struct xfs_mount *mp = iwag->mp; > > + struct xfs_trans *tp = iwag->tp; > > + struct xfs_inobt_rec_incore *irec; > > + xfs_ino_t ino; > > + unsigned int i, j; > > + xfs_agnumber_t agno; > > + int error; > > + > > + agno = XFS_INO_TO_AGNO(mp, iwag->startino); > > + for (i = 0, irec = iwag->recs; i < iwag->nr_recs; i++, irec++) { > > I kinda prefer single iterator loops for array walking like this: > > for (i = 0; i < iwag->nr_recs; i++) { > irec = &iwag->recs[i]; > > It's much easier to read and understand what is going on... Ok, I'll shorten the variable scope while I'm at it. > > + trace_xfs_iwalk_ag_rec(mp, agno, irec->ir_startino, > > + irec->ir_free); > > Could just pass irec to the trace function and extract startino/free > within the tracepoint macro.... <nod> > > + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > > + /* Skip if this inode is free */ > > + if (XFS_INOBT_MASK(j) & irec->ir_free) > > + continue; > > + > > + /* Otherwise call our function. */ > > + ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); > > + error = iwag->iwalk_fn(mp, tp, ino, iwag->data); > > + if (error) > > + return error; > > + } > > + } > > + > > + iwag->nr_recs = 0; > > Why is this zeroed here? Hmm, that should be pushed to the caller, especially given the name... > > + return 0; > > +} > > + > > +/* Read AGI and create inobt cursor. */ > > +static inline int > > +xfs_iwalk_inobt_cur( > > + struct xfs_mount *mp, > > + struct xfs_trans *tp, > > + xfs_agnumber_t agno, > > + struct xfs_btree_cur **curpp, > > + struct xfs_buf **agi_bpp) > > +{ > > + struct xfs_btree_cur *cur; > > + int error; > > + > > + ASSERT(*agi_bpp == NULL); > > + > > + error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); > > + if (error) > > + return error; > > + > > + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO); > > + if (!cur) > > + return -ENOMEM; > > + *curpp = cur; > > + return 0; > > +} > > This is a common pattern. Used in xfs_imap_lookup(), xfs_bulkstat(), > xfs_inumbers and xfs_inobt_count_blocks. Perhaps should be a common > inobt function? We're about to zap the middle two callers, but yes, these two could be common functions. I wasn't sure if it was worth it to save a few lines. > > + > > +/* Delete cursor and let go of AGI. */ > > +static inline void > > +xfs_iwalk_del_inobt( > > + struct xfs_trans *tp, > > + struct xfs_btree_cur **curpp, > > + struct xfs_buf **agi_bpp, > > + int error) > > +{ > > + if (*curpp) { > > + xfs_btree_del_cursor(*curpp, error); > > + *curpp = NULL; > > + } > > + if (*agi_bpp) { > > + xfs_trans_brelse(tp, *agi_bpp); > > + *agi_bpp = NULL; > > + } > > +} > > + > > +/* > > + * Set ourselves up for walking inobt records starting from a given point in > > + * the filesystem. > > + * > > + * If caller passed in a nonzero start inode number, load the record from the > > + * inobt and make the record look like all the inodes before agino are free so > > + * that we skip them, and then move the cursor to the next inobt record. This > > + * is how we support starting an iwalk in the middle of an inode chunk. > > + * > > + * If the caller passed in a start number of zero, move the cursor to the first > > + * inobt record. > > + * > > + * The caller is responsible for cleaning up the cursor and buffer pointer > > + * regardless of the error status. > > + */ > > +STATIC int > > +xfs_iwalk_ag_start( > > + struct xfs_iwalk_ag *iwag, > > + xfs_agnumber_t agno, > > + xfs_agino_t agino, > > + struct xfs_btree_cur **curpp, > > + struct xfs_buf **agi_bpp, > > + int *has_more) > > +{ > > + struct xfs_mount *mp = iwag->mp; > > + struct xfs_trans *tp = iwag->tp; > > + int icount; > > + int error; > > + > > + /* Set up a fresh cursor and empty the inobt cache. */ > > + iwag->nr_recs = 0; > > + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); > > + if (error) > > + return error; > > + > > + /* Starting at the beginning of the AG? That's easy! */ > > + if (agino == 0) > > + return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); > > + > > + /* > > + * Otherwise, we have to grab the inobt record where we left off, stuff > > + * the record into our cache, and then see if there are more records. > > + * We require a lookup cache of at least two elements so that we don't > > + * have to deal with tearing down the cursor to walk the records. > > + */ > > + error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount, > > + &iwag->recs[iwag->nr_recs]); > > + if (error) > > + return error; > > + if (icount) > > + iwag->nr_recs++; > > + > > + ASSERT(iwag->nr_recs < iwag->sz_recs); > > Why this code does what it does with nr_recs is a bit of a mystery > to me... sz_recs is the number of records we can store in the inobt record cache, and nr_recs is the number of records that are actually cached. Therefore, nr_recs should start at zero and increase until nr == sz at which point we have to run_callbacks(). I'll add the following to the assert if that was a point of confusion: /* * set_prefetch is supposed to give us a large enough inobt * record cache that grab_ichunk can stage a partial first * record and the loop body can cache a record without having to * check for cache space until after it reads an inobt record. */ > > + return xfs_btree_increment(*curpp, 0, has_more); > > +} > > + > > +typedef int (*xfs_iwalk_ag_recs_fn)(struct xfs_iwalk_ag *iwag); > > + > > +/* > > + * Acknowledge that we added an inobt record to the cache. Flush the inobt > > + * record cache if the buffer is full, and position the cursor wherever it > > + * needs to be so that we can keep going. > > + */ > > +STATIC int > > +xfs_iwalk_ag_increment( > > + struct xfs_iwalk_ag *iwag, > > + xfs_iwalk_ag_recs_fn walk_ag_recs_fn, > > + xfs_agnumber_t agno, > > + struct xfs_btree_cur **curpp, > > + struct xfs_buf **agi_bpp, > > + int *has_more) > > +{ > > + struct xfs_mount *mp = iwag->mp; > > + struct xfs_trans *tp = iwag->tp; > > + struct xfs_inobt_rec_incore *irec; > > + xfs_agino_t restart; > > + int error; > > + > > + iwag->nr_recs++; > > + > > + /* If there's space, just increment and look for more records. */ > > + if (iwag->nr_recs < iwag->sz_recs) > > + return xfs_btree_increment(*curpp, 0, has_more); > > Incrementing before explaining why we're incrementing seems a bit > fack-to-bront.... > > > + /* > > + * Otherwise the record cache is full; delete the cursor and walk the > > + * records... > > + */ > > + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); > > + irec = &iwag->recs[iwag->nr_recs - 1]; > > + restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; > > + > > + error = walk_ag_recs_fn(iwag); > > + if (error) > > + return error; > > Urk, so an "increment" function actually run all the object callbacks? > But only if it fails to increment? > > > + > > + /* ...and recreate cursor where we left off. */ > > + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); > > + if (error) > > + return error; > > + > > + return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); > > And then it goes an increments anyway? > > That's all a bit .... non-obvious. Especially as it has a single > caller - this should really be something like > xfs_iwalk_run_callbacks(). Bit more context below... (I'll just skip to the big code blob below...) > > +} > > + > > +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ > > +STATIC int > > +xfs_iwalk_ag( > > + struct xfs_iwalk_ag *iwag) > > +{ > > + struct xfs_mount *mp = iwag->mp; > > + struct xfs_trans *tp = iwag->tp; > > + struct xfs_buf *agi_bp = NULL; > > + struct xfs_btree_cur *cur = NULL; > > + xfs_agnumber_t agno; > > + xfs_agino_t agino; > > + int has_more; > > + int error = 0; > > + > > + /* Set up our cursor at the right place in the inode btree. */ > > + agno = XFS_INO_TO_AGNO(mp, iwag->startino); > > + agino = XFS_INO_TO_AGINO(mp, iwag->startino); > > + error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > + if (error) > > + goto out_cur; > > + > > + while (has_more) { > > + struct xfs_inobt_rec_incore *irec; > > + > > + /* Fetch the inobt record. */ > > + irec = &iwag->recs[iwag->nr_recs]; > > + error = xfs_inobt_get_rec(cur, irec, &has_more); > > + if (error) > > + goto out_cur; > > + if (!has_more) > > + break; > > + > > + /* No allocated inodes in this chunk; skip it. */ > > + if (irec->ir_freecount == irec->ir_count) { > > + error = xfs_btree_increment(cur, 0, &has_more); > > + goto next_loop; > > + } > > + > > + /* > > + * Start readahead for this inode chunk in anticipation of > > + * walking the inodes. > > + */ > > + xfs_bulkstat_ichunk_ra(mp, agno, irec); > > + > > + /* > > + * Add this inobt record to our cache, flush the cache if > > + * needed, and move on to the next record. > > + */ > > + error = xfs_iwalk_ag_increment(iwag, xfs_iwalk_ag_recs, agno, > > + &cur, &agi_bp, &has_more); > > Ok, so given this loop already has an increment case in it, it seems > like it would be better to pull some of this function into the loop > somewhat like: > > while (has_more) { > struct xfs_inobt_rec_incore *irec; > > cond_resched(); > > /* Fetch the inobt record. */ > irec = &iwag->recs[iwag->nr_recs]; > error = xfs_inobt_get_rec(cur, irec, &has_more); > if (error || !has_more) > break; > > /* No allocated inodes in this chunk; skip it. */ > if (irec->ir_freecount == irec->ir_count) { > error = xfs_btree_increment(cur, 0, &has_more); > if (error) > break; > continue; > } > > /* > * Start readahead for this inode chunk in anticipation of > * walking the inodes. > */ > xfs_bulkstat_ichunk_ra(mp, agno, irec); > > /* If there's space in the buffer, just grab more records. */ > if (++iwag->nr_recs < iwag->sz_recs) > error = xfs_btree_increment(cur, 0, &has_more); > if (error) > break; > continue; > } > > error = xfs_iwalk_run_callbacks(iwag, ...); > } > > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > if (!iwag->nr_recs || error) > return error; > return xfs_iwalk_ag_recs(iwag); > } Yeah, that is cleaner. :) > > + /* Walk any records left behind in the cache. */ > > + if (iwag->nr_recs) { > > + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > + return xfs_iwalk_ag_recs(iwag); > > + } > > + > > +out_cur: > > + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > + return error; > > +} > > + > > +/* > > + * Given the number of inodes to prefetch, set the number of inobt records that > > + * we cache in memory, which controls the number of inodes we try to read > > + * ahead. > > + * > > + * If no max prefetch was given, default to one page's worth of inobt records; > > + * this should be plenty of inodes to read ahead. > > That's a lot of inodes on a 64k page size machine. I think it would > be better capped at number that doesn't change with processor > architecture... 4096 / sizeof(...); then? since that's a single x86 page which means we're unlikely to fail the memory allocation? :) > > + */ > > +static inline void > > +xfs_iwalk_set_prefetch( > > + struct xfs_iwalk_ag *iwag, > > + unsigned int max_prefetch) > > +{ > > + if (max_prefetch) > > + iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) / > > + XFS_INODES_PER_CHUNK; > > + else > > + iwag->sz_recs = PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore); > > + > > + /* > > + * Allocate enough space to prefetch at least two records so that we > > + * can cache both the inobt record where the iwalk started and the next > > + * record. This simplifies the AG inode walk loop setup code. > > + */ > > + if (iwag->sz_recs < 2) > > + iwag->sz_recs = 2; > > iwag->sz_recs = max(iwag->sz_recs, 2); > > .... > > + xfs_iwalk_set_prefetch(&iwag, max_prefetch); > > + error = xfs_iwalk_allocbuf(&iwag); > .... > > + xfs_iwalk_freebuf(&iwag); > > I'd drop the "buf" from the names of those two functions... <nod> --D > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 91831975363b..74d30ef0dbce 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -80,6 +80,7 @@ xfs-y += xfs_aops.o \ xfs_iops.o \ xfs_inode.o \ xfs_itable.o \ + xfs_iwalk.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cff28ee73deb..96590d9f917c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -19,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_health.h" +#include "xfs_iwalk.h" /* * Return stat information for one inode. @@ -161,7 +162,7 @@ xfs_bulkstat_one( * Loop over all clusters in a chunk for a given incore inode allocation btree * record. Do a readahead if there are any allocated inodes in that cluster. */ -STATIC void +void xfs_bulkstat_ichunk_ra( struct xfs_mount *mp, xfs_agnumber_t agno, @@ -195,7 +196,7 @@ xfs_bulkstat_ichunk_ra( * are some left allocated, update the data for the pointed-to record as well as * return the count of grabbed inodes. */ -STATIC int +int xfs_bulkstat_grab_ichunk( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t agino, /* starting inode of chunk */ diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 8a822285b671..369e3f159d4e 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -84,4 +84,12 @@ xfs_inumbers( void __user *buffer, /* buffer with inode info */ inumbers_fmt_pf formatter); +/* Temporarily needed while we refactor functions. */ +struct xfs_btree_cur; +struct xfs_inobt_rec_incore; +void xfs_bulkstat_ichunk_ra(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec); +int xfs_bulkstat_grab_ichunk(struct xfs_btree_cur *cur, xfs_agino_t agino, + int *icount, struct xfs_inobt_rec_incore *irec); + #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c new file mode 100644 index 000000000000..0ce3baa159ba --- /dev/null +++ b/fs/xfs/xfs_iwalk.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_health.h" +#include "xfs_trans.h" +#include "xfs_iwalk.h" + +/* + * Walking All the Inodes in the Filesystem + * ======================================== + * Starting at some @startino, call a walk function on every allocated inode in + * the system. The walk function is called with the relevant inode number and + * a pointer to caller-provided data. The walk function can return the usual + * negative error code, 0, or XFS_IWALK_ABORT to stop the iteration. This + * return value is returned to the caller. + * + * Internally, we allow the walk function to do anything, which means that we + * cannot maintain the inobt cursor or our lock on the AGI buffer. We + * therefore build up a batch of inobt records in kernel memory and only call + * the walk function when our memory buffer is full. + */ + +struct xfs_iwalk_ag { + struct xfs_mount *mp; + struct xfs_trans *tp; + + /* Where do we start the traversal? */ + xfs_ino_t startino; + + /* Array of inobt records we cache. */ + struct xfs_inobt_rec_incore *recs; + unsigned int sz_recs; + unsigned int nr_recs; + + /* Inode walk function and data pointer. */ + xfs_iwalk_fn iwalk_fn; + void *data; +}; + +/* Allocate memory for a walk. */ +STATIC int +xfs_iwalk_allocbuf( + struct xfs_iwalk_ag *iwag) +{ + size_t size; + + ASSERT(iwag->recs == NULL); + iwag->nr_recs = 0; + + /* Allocate a prefetch buffer for inobt records. */ + size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); + iwag->recs = kmem_alloc(size, KM_SLEEP); + if (iwag->recs == NULL) + return -ENOMEM; + + return 0; +} + +/* Free memory we allocated for a walk. */ +STATIC void +xfs_iwalk_freebuf( + struct xfs_iwalk_ag *iwag) +{ + ASSERT(iwag->recs != NULL); + kmem_free(iwag->recs); +} + +/* For each inuse inode in each cached inobt record, call our function. */ +STATIC int +xfs_iwalk_ag_recs( + struct xfs_iwalk_ag *iwag) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_inobt_rec_incore *irec; + xfs_ino_t ino; + unsigned int i, j; + xfs_agnumber_t agno; + int error; + + agno = XFS_INO_TO_AGNO(mp, iwag->startino); + for (i = 0, irec = iwag->recs; i < iwag->nr_recs; i++, irec++) { + trace_xfs_iwalk_ag_rec(mp, agno, irec->ir_startino, + irec->ir_free); + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { + /* Skip if this inode is free */ + if (XFS_INOBT_MASK(j) & irec->ir_free) + continue; + + /* Otherwise call our function. */ + ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); + error = iwag->iwalk_fn(mp, tp, ino, iwag->data); + if (error) + return error; + } + } + + iwag->nr_recs = 0; + return 0; +} + +/* Read AGI and create inobt cursor. */ +static inline int +xfs_iwalk_inobt_cur( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(*agi_bpp == NULL); + + error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO); + if (!cur) + return -ENOMEM; + *curpp = cur; + return 0; +} + +/* Delete cursor and let go of AGI. */ +static inline void +xfs_iwalk_del_inobt( + struct xfs_trans *tp, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int error) +{ + if (*curpp) { + xfs_btree_del_cursor(*curpp, error); + *curpp = NULL; + } + if (*agi_bpp) { + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + } +} + +/* + * Set ourselves up for walking inobt records starting from a given point in + * the filesystem. + * + * If caller passed in a nonzero start inode number, load the record from the + * inobt and make the record look like all the inodes before agino are free so + * that we skip them, and then move the cursor to the next inobt record. This + * is how we support starting an iwalk in the middle of an inode chunk. + * + * If the caller passed in a start number of zero, move the cursor to the first + * inobt record. + * + * The caller is responsible for cleaning up the cursor and buffer pointer + * regardless of the error status. + */ +STATIC int +xfs_iwalk_ag_start( + struct xfs_iwalk_ag *iwag, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int *has_more) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + int icount; + int error; + + /* Set up a fresh cursor and empty the inobt cache. */ + iwag->nr_recs = 0; + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); + if (error) + return error; + + /* Starting at the beginning of the AG? That's easy! */ + if (agino == 0) + return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); + + /* + * Otherwise, we have to grab the inobt record where we left off, stuff + * the record into our cache, and then see if there are more records. + * We require a lookup cache of at least two elements so that we don't + * have to deal with tearing down the cursor to walk the records. + */ + error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount, + &iwag->recs[iwag->nr_recs]); + if (error) + return error; + if (icount) + iwag->nr_recs++; + + ASSERT(iwag->nr_recs < iwag->sz_recs); + return xfs_btree_increment(*curpp, 0, has_more); +} + +typedef int (*xfs_iwalk_ag_recs_fn)(struct xfs_iwalk_ag *iwag); + +/* + * Acknowledge that we added an inobt record to the cache. Flush the inobt + * record cache if the buffer is full, and position the cursor wherever it + * needs to be so that we can keep going. + */ +STATIC int +xfs_iwalk_ag_increment( + struct xfs_iwalk_ag *iwag, + xfs_iwalk_ag_recs_fn walk_ag_recs_fn, + xfs_agnumber_t agno, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int *has_more) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_inobt_rec_incore *irec; + xfs_agino_t restart; + int error; + + iwag->nr_recs++; + + /* If there's space, just increment and look for more records. */ + if (iwag->nr_recs < iwag->sz_recs) + return xfs_btree_increment(*curpp, 0, has_more); + + /* + * Otherwise the record cache is full; delete the cursor and walk the + * records... + */ + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + irec = &iwag->recs[iwag->nr_recs - 1]; + restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; + + error = walk_ag_recs_fn(iwag); + if (error) + return error; + + /* ...and recreate cursor where we left off. */ + error = xfs_iwalk_inobt_cur(mp, tp, agno, curpp, agi_bpp); + if (error) + return error; + + return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); +} + +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ +STATIC int +xfs_iwalk_ag( + struct xfs_iwalk_ag *iwag) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_buf *agi_bp = NULL; + struct xfs_btree_cur *cur = NULL; + xfs_agnumber_t agno; + xfs_agino_t agino; + int has_more; + int error = 0; + + /* Set up our cursor at the right place in the inode btree. */ + agno = XFS_INO_TO_AGNO(mp, iwag->startino); + agino = XFS_INO_TO_AGINO(mp, iwag->startino); + error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); + if (error) + goto out_cur; + + while (has_more) { + struct xfs_inobt_rec_incore *irec; + + /* Fetch the inobt record. */ + irec = &iwag->recs[iwag->nr_recs]; + error = xfs_inobt_get_rec(cur, irec, &has_more); + if (error) + goto out_cur; + if (!has_more) + break; + + /* No allocated inodes in this chunk; skip it. */ + if (irec->ir_freecount == irec->ir_count) { + error = xfs_btree_increment(cur, 0, &has_more); + goto next_loop; + } + + /* + * Start readahead for this inode chunk in anticipation of + * walking the inodes. + */ + xfs_bulkstat_ichunk_ra(mp, agno, irec); + + /* + * Add this inobt record to our cache, flush the cache if + * needed, and move on to the next record. + */ + error = xfs_iwalk_ag_increment(iwag, xfs_iwalk_ag_recs, agno, + &cur, &agi_bp, &has_more); +next_loop: + if (error) + goto out_cur; + cond_resched(); + } + + /* Walk any records left behind in the cache. */ + if (iwag->nr_recs) { + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + return xfs_iwalk_ag_recs(iwag); + } + +out_cur: + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + return error; +} + +/* + * Given the number of inodes to prefetch, set the number of inobt records that + * we cache in memory, which controls the number of inodes we try to read + * ahead. + * + * If no max prefetch was given, default to one page's worth of inobt records; + * this should be plenty of inodes to read ahead. + */ +static inline void +xfs_iwalk_set_prefetch( + struct xfs_iwalk_ag *iwag, + unsigned int max_prefetch) +{ + if (max_prefetch) + iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) / + XFS_INODES_PER_CHUNK; + else + iwag->sz_recs = PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore); + + /* + * Allocate enough space to prefetch at least two records so that we + * can cache both the inobt record where the iwalk started and the next + * record. This simplifies the AG inode walk loop setup code. + */ + if (iwag->sz_recs < 2) + iwag->sz_recs = 2; +} + +/* + * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn + * will be called for each allocated inode, being passed the inode's number and + * @data. @max_prefetch controls how many inobt records' worth of inodes we + * try to readahead. + */ +int +xfs_iwalk( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t startino, + xfs_iwalk_fn iwalk_fn, + unsigned int max_prefetch, + void *data) +{ + struct xfs_iwalk_ag iwag = { + .mp = mp, + .tp = tp, + .iwalk_fn = iwalk_fn, + .data = data, + .startino = startino, + }; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + int error; + + ASSERT(agno < mp->m_sb.sb_agcount); + + xfs_iwalk_set_prefetch(&iwag, max_prefetch); + error = xfs_iwalk_allocbuf(&iwag); + if (error) + return error; + + for (; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_iwalk_ag(&iwag); + if (error) + break; + iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + } + + xfs_iwalk_freebuf(&iwag); + return error; +} diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h new file mode 100644 index 000000000000..45b1baabcd2d --- /dev/null +++ b/fs/xfs/xfs_iwalk.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_IWALK_H__ +#define __XFS_IWALK_H__ + +/* Walk all inodes in the filesystem starting from @startino. */ +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_ino_t ino, void *data); +/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */ +#define XFS_IWALK_ABORT (1) + +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, + xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); + +#endif /* __XFS_IWALK_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2464ea351f83..a2881659f776 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); +TRACE_EVENT(xfs_iwalk_ag, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino), + TP_ARGS(mp, agno, startino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + ), + TP_printk("dev %d:%d agno %d startino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->startino) +) + +TRACE_EVENT(xfs_iwalk_ag_rec, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, uint64_t freemask), + TP_ARGS(mp, agno, startino, freemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->freemask = freemask; + ), + TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->startino, __entry->freemask) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH