Message ID | 160192209677.2569942.16673759463630442919.stgit@magnolia (mailing list archive) |
---|---|
State | Accepted, archived |
Headers | show |
Series | xfs: a few fixes and cleanups to GETFSMAP | expand |
On Mon, Oct 05, 2020 at 11:21:36AM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Refactor xfs_getfsmap to improve its performance: instead of indirectly > calling a function that copies one record to userspace at a time, create > a shadow buffer in the kernel and copy the whole array once at the end. > On the author's computer, this reduces the runtime on his /home by ~20%. > > This also eliminates a deadlock when running GETFSMAP against the > realtime device. The current code locks the rtbitmap to create > fsmappings and copies them into userspace, having not released the > rtbitmap lock. If the userspace buffer is an mmap of a sparse file that > itself resides on the realtime device, the write page fault will recurse > into the fs for allocation, which will deadlock on the rtbitmap lock. > > Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Looks good, Reviewed-by: Christoph Hellwig <hch@lst.de>
On Monday 5 October 2020 11:51:36 PM IST Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Refactor xfs_getfsmap to improve its performance: instead of indirectly > calling a function that copies one record to userspace at a time, create > a shadow buffer in the kernel and copy the whole array once at the end. > On the author's computer, this reduces the runtime on his /home by ~20%. > > This also eliminates a deadlock when running GETFSMAP against the > realtime device. The current code locks the rtbitmap to create > fsmappings and copies them into userspace, having not released the > rtbitmap lock. If the userspace buffer is an mmap of a sparse file that > itself resides on the realtime device, the write page fault will recurse > into the fs for allocation, which will deadlock on the rtbitmap lock. > The changes look good to me, Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> > Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap") > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- > fs/xfs/xfs_fsmap.c | 45 +++++++++------- > fs/xfs/xfs_fsmap.h | 6 -- > fs/xfs/xfs_ioctl.c | 146 +++++++++++++++++++++++++++++++++++----------------- > 3 files changed, 125 insertions(+), 72 deletions(-) > > > diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c > index aa36e7daf82c..9ce5e7d5bf8f 100644 > --- a/fs/xfs/xfs_fsmap.c > +++ b/fs/xfs/xfs_fsmap.c > @@ -26,7 +26,7 @@ > #include "xfs_rtalloc.h" > > /* Convert an xfs_fsmap to an fsmap. */ > -void > +static void > xfs_fsmap_from_internal( > struct fsmap *dest, > struct xfs_fsmap *src) > @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap( > /* getfsmap query state */ > struct xfs_getfsmap_info { > struct xfs_fsmap_head *head; > - xfs_fsmap_format_t formatter; /* formatting fn */ > - void *format_arg; /* format buffer */ > + struct fsmap *fsmap_recs; /* mapping records */ > struct xfs_buf *agf_bp; /* AGF, for refcount queries */ > xfs_daddr_t next_daddr; /* next daddr we expect */ > u64 missing_owner; /* owner of holes */ > @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared( > return 0; > } > > +static inline void > +xfs_getfsmap_format( > + struct xfs_mount *mp, > + struct xfs_fsmap *xfm, > + struct xfs_getfsmap_info *info) > +{ > + struct fsmap *rec; > + > + trace_xfs_getfsmap_mapping(mp, xfm); > + > + rec = &info->fsmap_recs[info->head->fmh_entries++]; > + xfs_fsmap_from_internal(rec, xfm); > +} > + > /* > * Format a reverse mapping for getfsmap, having translated rm_startblock > * into the appropriate daddr units. > @@ -288,10 +301,7 @@ xfs_getfsmap_helper( > fmr.fmr_offset = 0; > fmr.fmr_length = rec_daddr - info->next_daddr; > fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; > - error = info->formatter(&fmr, info->format_arg); > - if (error) > - return error; > - info->head->fmh_entries++; > + xfs_getfsmap_format(mp, &fmr, info); > } > > if (info->last) > @@ -323,11 +333,8 @@ xfs_getfsmap_helper( > if (shared) > fmr.fmr_flags |= FMR_OF_SHARED; > } > - error = info->formatter(&fmr, info->format_arg); > - if (error) > - return error; > - info->head->fmh_entries++; > > + xfs_getfsmap_format(mp, &fmr, info); > out: > rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); > if (info->next_daddr < rec_daddr) > @@ -795,11 +802,11 @@ xfs_getfsmap_check_keys( > #endif /* CONFIG_XFS_RT */ > > /* > - * Get filesystem's extents as described in head, and format for > - * output. Calls formatter to fill the user's buffer until all > - * extents are mapped, until the passed-in head->fmh_count slots have > - * been filled, or until the formatter short-circuits the loop, if it > - * is tracking filled-in extents on its own. > + * Get filesystem's extents as described in head, and format for output. Fills > + * in the supplied records array until there are no more reverse mappings to > + * return or head.fmh_entries == head.fmh_count. In the second case, this > + * function returns -ECANCELED to indicate that more records would have been > + * returned. > * > * Key to Confusion > * ---------------- > @@ -819,8 +826,7 @@ int > xfs_getfsmap( > struct xfs_mount *mp, > struct xfs_fsmap_head *head, > - xfs_fsmap_format_t formatter, > - void *arg) > + struct fsmap *fsmap_recs) > { > struct xfs_trans *tp = NULL; > struct xfs_fsmap dkeys[2]; /* per-dev keys */ > @@ -895,8 +901,7 @@ xfs_getfsmap( > > info.next_daddr = head->fmh_keys[0].fmr_physical + > head->fmh_keys[0].fmr_length; > - info.formatter = formatter; > - info.format_arg = arg; > + info.fsmap_recs = fsmap_recs; > info.head = head; > > /* > diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h > index c6c57739b862..a0775788e7b1 100644 > --- a/fs/xfs/xfs_fsmap.h > +++ b/fs/xfs/xfs_fsmap.h > @@ -27,13 +27,9 @@ struct xfs_fsmap_head { > struct xfs_fsmap fmh_keys[2]; /* low and high keys */ > }; > > -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); > void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); > > -/* fsmap to userspace formatter - copy to user & advance pointer */ > -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); > - > int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, > - xfs_fsmap_format_t formatter, void *arg); > + struct fsmap *out_recs); > > #endif /* __XFS_FSMAP_H__ */ > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index bca7659fb5c6..3fbd98f61ea5 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -1716,39 +1716,17 @@ xfs_ioc_getbmap( > return error; > } > > -struct getfsmap_info { > - struct xfs_mount *mp; > - struct fsmap_head __user *data; > - unsigned int idx; > - __u32 last_flags; > -}; > - > -STATIC int > -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) > -{ > - struct getfsmap_info *info = priv; > - struct fsmap fm; > - > - trace_xfs_getfsmap_mapping(info->mp, xfm); > - > - info->last_flags = xfm->fmr_flags; > - xfs_fsmap_from_internal(&fm, xfm); > - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, > - sizeof(struct fsmap))) > - return -EFAULT; > - > - return 0; > -} > - > STATIC int > xfs_ioc_getfsmap( > struct xfs_inode *ip, > struct fsmap_head __user *arg) > { > - struct getfsmap_info info = { NULL }; > struct xfs_fsmap_head xhead = {0}; > struct fsmap_head head; > - bool aborted = false; > + struct fsmap *recs; > + unsigned int count; > + __u32 last_flags = 0; > + bool done = false; > int error; > > if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) > @@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap( > sizeof(head.fmh_keys[1].fmr_reserved))) > return -EINVAL; > > + /* > + * Use an internal memory buffer so that we don't have to copy fsmap > + * data to userspace while holding locks. Start by trying to allocate > + * up to 128k for the buffer, but fall back to a single page if needed. > + */ > + count = min_t(unsigned int, head.fmh_count, > + 131072 / sizeof(struct fsmap)); > + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); > + if (!recs) { > + count = min_t(unsigned int, head.fmh_count, > + PAGE_SIZE / sizeof(struct fsmap)); > + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); > + if (!recs) > + return -ENOMEM; > + } > + > xhead.fmh_iflags = head.fmh_iflags; > - xhead.fmh_count = head.fmh_count; > xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); > xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); > > trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); > trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); > > - info.mp = ip->i_mount; > - info.data = arg; > - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); > - if (error == -ECANCELED) { > - error = 0; > - aborted = true; > - } else if (error) > - return error; > - > - /* If we didn't abort, set the "last" flag in the last fmx */ > - if (!aborted && info.idx) { > - info.last_flags |= FMR_OF_LAST; > - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, > - &info.last_flags, sizeof(info.last_flags))) > - return -EFAULT; > + head.fmh_entries = 0; > + do { > + struct fsmap __user *user_recs; > + struct fsmap *last_rec; > + > + user_recs = &arg->fmh_recs[head.fmh_entries]; > + xhead.fmh_entries = 0; > + xhead.fmh_count = min_t(unsigned int, count, > + head.fmh_count - head.fmh_entries); > + > + /* Run query, record how many entries we got. */ > + error = xfs_getfsmap(ip->i_mount, &xhead, recs); > + switch (error) { > + case 0: > + /* > + * There are no more records in the result set. Copy > + * whatever we got to userspace and break out. > + */ > + done = true; > + break; > + case -ECANCELED: > + /* > + * The internal memory buffer is full. Copy whatever > + * records we got to userspace and go again if we have > + * not yet filled the userspace buffer. > + */ > + error = 0; > + break; > + default: > + goto out_free; > + } > + head.fmh_entries += xhead.fmh_entries; > + head.fmh_oflags = xhead.fmh_oflags; > + > + /* > + * If the caller wanted a record count or there aren't any > + * new records to return, we're done. > + */ > + if (head.fmh_count == 0 || xhead.fmh_entries == 0) > + break; > + > + /* Copy all the records we got out to userspace. */ > + if (copy_to_user(user_recs, recs, > + xhead.fmh_entries * sizeof(struct fsmap))) { > + error = -EFAULT; > + goto out_free; > + } > + > + /* Remember the last record flags we copied to userspace. */ > + last_rec = &recs[xhead.fmh_entries - 1]; > + last_flags = last_rec->fmr_flags; > + > + /* Set up the low key for the next iteration. */ > + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); > + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); > + } while (!done && head.fmh_entries < head.fmh_count); > + > + /* > + * If there are no more records in the query result set and we're not > + * in counting mode, mark the last record returned with the LAST flag. > + */ > + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { > + struct fsmap __user *user_rec; > + > + last_flags |= FMR_OF_LAST; > + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; > + > + if (copy_to_user(&user_rec->fmr_flags, &last_flags, > + sizeof(last_flags))) { > + error = -EFAULT; > + goto out_free; > + } > } > > /* copy back header */ > - head.fmh_entries = xhead.fmh_entries; > - head.fmh_oflags = xhead.fmh_oflags; > - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) > - return -EFAULT; > + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { > + error = -EFAULT; > + goto out_free; > + } > > - return 0; > +out_free: > + kmem_free(recs); > + return error; > } > > STATIC int > >
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index aa36e7daf82c..9ce5e7d5bf8f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -26,7 +26,7 @@ #include "xfs_rtalloc.h" /* Convert an xfs_fsmap to an fsmap. */ -void +static void xfs_fsmap_from_internal( struct fsmap *dest, struct xfs_fsmap *src) @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap( /* getfsmap query state */ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; - xfs_fsmap_format_t formatter; /* formatting fn */ - void *format_arg; /* format buffer */ + struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared( return 0; } +static inline void +xfs_getfsmap_format( + struct xfs_mount *mp, + struct xfs_fsmap *xfm, + struct xfs_getfsmap_info *info) +{ + struct fsmap *rec; + + trace_xfs_getfsmap_mapping(mp, xfm); + + rec = &info->fsmap_recs[info->head->fmh_entries++]; + xfs_fsmap_from_internal(rec, xfm); +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. @@ -288,10 +301,7 @@ xfs_getfsmap_helper( fmr.fmr_offset = 0; fmr.fmr_length = rec_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); } if (info->last) @@ -323,11 +333,8 @@ xfs_getfsmap_helper( if (shared) fmr.fmr_flags |= FMR_OF_SHARED; } - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) @@ -795,11 +802,11 @@ xfs_getfsmap_check_keys( #endif /* CONFIG_XFS_RT */ /* - * Get filesystem's extents as described in head, and format for - * output. Calls formatter to fill the user's buffer until all - * extents are mapped, until the passed-in head->fmh_count slots have - * been filled, or until the formatter short-circuits the loop, if it - * is tracking filled-in extents on its own. + * Get filesystem's extents as described in head, and format for output. Fills + * in the supplied records array until there are no more reverse mappings to + * return or head.fmh_entries == head.fmh_count. In the second case, this + * function returns -ECANCELED to indicate that more records would have been + * returned. * * Key to Confusion * ---------------- @@ -819,8 +826,7 @@ int xfs_getfsmap( struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, - void *arg) + struct fsmap *fsmap_recs) { struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ @@ -895,8 +901,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.formatter = formatter; - info.format_arg = arg; + info.fsmap_recs = fsmap_recs; info.head = head; /* diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index c6c57739b862..a0775788e7b1 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -27,13 +27,9 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); -/* fsmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); - int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, void *arg); + struct fsmap *out_recs); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bca7659fb5c6..3fbd98f61ea5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1716,39 +1716,17 @@ xfs_ioc_getbmap( return error; } -struct getfsmap_info { - struct xfs_mount *mp; - struct fsmap_head __user *data; - unsigned int idx; - __u32 last_flags; -}; - -STATIC int -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) -{ - struct getfsmap_info *info = priv; - struct fsmap fm; - - trace_xfs_getfsmap_mapping(info->mp, xfm); - - info->last_flags = xfm->fmr_flags; - xfs_fsmap_from_internal(&fm, xfm); - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, - sizeof(struct fsmap))) - return -EFAULT; - - return 0; -} - STATIC int xfs_ioc_getfsmap( struct xfs_inode *ip, struct fsmap_head __user *arg) { - struct getfsmap_info info = { NULL }; struct xfs_fsmap_head xhead = {0}; struct fsmap_head head; - bool aborted = false; + struct fsmap *recs; + unsigned int count; + __u32 last_flags = 0; + bool done = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) @@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap( sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; + /* + * Use an internal memory buffer so that we don't have to copy fsmap + * data to userspace while holding locks. Start by trying to allocate + * up to 128k for the buffer, but fall back to a single page if needed. + */ + count = min_t(unsigned int, head.fmh_count, + 131072 / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) { + count = min_t(unsigned int, head.fmh_count, + PAGE_SIZE / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) + return -ENOMEM; + } + xhead.fmh_iflags = head.fmh_iflags; - xhead.fmh_count = head.fmh_count; xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); - info.mp = ip->i_mount; - info.data = arg; - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == -ECANCELED) { - error = 0; - aborted = true; - } else if (error) - return error; - - /* If we didn't abort, set the "last" flag in the last fmx */ - if (!aborted && info.idx) { - info.last_flags |= FMR_OF_LAST; - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, - &info.last_flags, sizeof(info.last_flags))) - return -EFAULT; + head.fmh_entries = 0; + do { + struct fsmap __user *user_recs; + struct fsmap *last_rec; + + user_recs = &arg->fmh_recs[head.fmh_entries]; + xhead.fmh_entries = 0; + xhead.fmh_count = min_t(unsigned int, count, + head.fmh_count - head.fmh_entries); + + /* Run query, record how many entries we got. */ + error = xfs_getfsmap(ip->i_mount, &xhead, recs); + switch (error) { + case 0: + /* + * There are no more records in the result set. Copy + * whatever we got to userspace and break out. + */ + done = true; + break; + case -ECANCELED: + /* + * The internal memory buffer is full. Copy whatever + * records we got to userspace and go again if we have + * not yet filled the userspace buffer. + */ + error = 0; + break; + default: + goto out_free; + } + head.fmh_entries += xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + + /* + * If the caller wanted a record count or there aren't any + * new records to return, we're done. + */ + if (head.fmh_count == 0 || xhead.fmh_entries == 0) + break; + + /* Copy all the records we got out to userspace. */ + if (copy_to_user(user_recs, recs, + xhead.fmh_entries * sizeof(struct fsmap))) { + error = -EFAULT; + goto out_free; + } + + /* Remember the last record flags we copied to userspace. */ + last_rec = &recs[xhead.fmh_entries - 1]; + last_flags = last_rec->fmr_flags; + + /* Set up the low key for the next iteration. */ + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + } while (!done && head.fmh_entries < head.fmh_count); + + /* + * If there are no more records in the query result set and we're not + * in counting mode, mark the last record returned with the LAST flag. + */ + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { + struct fsmap __user *user_rec; + + last_flags |= FMR_OF_LAST; + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; + + if (copy_to_user(&user_rec->fmr_flags, &last_flags, + sizeof(last_flags))) { + error = -EFAULT; + goto out_free; + } } /* copy back header */ - head.fmh_entries = xhead.fmh_entries; - head.fmh_oflags = xhead.fmh_oflags; - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) - return -EFAULT; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { + error = -EFAULT; + goto out_free; + } - return 0; +out_free: + kmem_free(recs); + return error; } STATIC int