Message ID | 20190821083820.11725-3-david@fromorbit.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | xfs: avoid IO issues unaligned memory allocation | expand |
On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > Memory we use to submit for IO needs strict alignment to the > underlying driver contraints. Worst case, this is 512 bytes. Given > that all allocations for IO are always a power of 2 multiple of 512 > bytes, the kernel heap provides natural alignment for objects of > these sizes and that suffices. > > Until, of course, memory debugging of some kind is turned on (e.g. > red zones, poisoning, KASAN) and then the alignment of the heap > objects is thrown out the window. Then we get weird IO errors and > data corruption problems because drivers don't validate alignment > and do the wrong thing when passed unaligned memory buffers in bios. > > TO fix this, introduce kmem_alloc_io(), which will guaranteeat least s/TO/To/ > 512 byte alignment of buffers for IO, even if memory debugging > options are turned on. It is assumed that the minimum allocation > size will be 512 bytes, and that sizes will be power of 2 mulitples > of 512 bytes. > > Use this everywhere we allocate buffers for IO. > > This no longer fails with log recovery errors when KASAN is enabled > due to the brd driver not handling unaligned memory buffers: > > # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > --- > fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- > fs/xfs/kmem.h | 1 + > fs/xfs/xfs_buf.c | 4 +-- > fs/xfs/xfs_log.c | 2 +- > fs/xfs/xfs_log_recover.c | 2 +- > fs/xfs/xfs_trace.h | 1 + > 6 files changed, 50 insertions(+), 21 deletions(-) > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > index edcf393c8fd9..ec693c0fdcff 100644 > --- a/fs/xfs/kmem.c > +++ b/fs/xfs/kmem.c ... > @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) > return ptr; > } > > +/* > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > + * returned. vmalloc always returns an aligned region. 
> + */ > +void * > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > +{ > + void *ptr; > + > + trace_kmem_alloc_io(size, flags, _RET_IP_); > + > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > + if (ptr) { > + if (!((long)ptr & 511)) > + return ptr; > + kfree(ptr); > + } > + return __kmem_vmalloc(size, flags); > +} Even though it is unfortunate, this seems like a quite reasonable and isolated temporary solution to the problem to me. The one concern I have is if/how much this could affect performance under certain circumstances. I realize that these callsites are isolated in the common scenario. Less common scenarios like sub-page block sizes (whether due to explicit mkfs time format or default configurations on larger page size systems) can fall into this path much more frequently, however. Since this implies some kind of vm debug option is enabled, performance itself isn't critical when this solution is active. But how bad is it in those cases where we might depend on this more heavily? Have you confirmed that the end configuration is still "usable," at least? I ask because the repeated alloc/free behavior can easily be avoided via something like an mp flag (which may require a tweak to the kmem_alloc_io() interface) to skip further kmem_alloc() calls from this path once we see one unaligned allocation. That assumes this behavior is tied to functionality that isn't dynamically configured at runtime, of course. 
Brian > + > +void * > +kmem_alloc_large(size_t size, xfs_km_flags_t flags) > +{ > + void *ptr; > + > + trace_kmem_alloc_large(size, flags, _RET_IP_); > + > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > + if (ptr) > + return ptr; > + return __kmem_vmalloc(size, flags); > +} > + > void * > kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) > { > diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h > index 267655acd426..423a1fa0fcd6 100644 > --- a/fs/xfs/kmem.h > +++ b/fs/xfs/kmem.h > @@ -59,6 +59,7 @@ kmem_flags_convert(xfs_km_flags_t flags) > } > > extern void *kmem_alloc(size_t, xfs_km_flags_t); > +extern void *kmem_alloc_io(size_t, xfs_km_flags_t); > extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); > extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); > static inline void kmem_free(const void *ptr) > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index ca0849043f54..7bd1f31febfc 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -353,7 +353,7 @@ xfs_buf_allocate_memory( > */ > size = BBTOB(bp->b_length); > if (size < PAGE_SIZE) { > - bp->b_addr = kmem_alloc(size, KM_NOFS); > + bp->b_addr = kmem_alloc_io(size, KM_NOFS); > if (!bp->b_addr) { > /* low memory - use alloc_page loop instead */ > goto use_alloc_page; > @@ -368,7 +368,7 @@ xfs_buf_allocate_memory( > } > bp->b_offset = offset_in_page(bp->b_addr); > bp->b_pages = bp->b_page_array; > - bp->b_pages[0] = virt_to_page(bp->b_addr); > + bp->b_pages[0] = kmem_to_page(bp->b_addr); > bp->b_page_count = 1; > bp->b_flags |= _XBF_KMEM; > return 0; > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > index 7fc3c1ad36bc..1830d185d7fc 100644 > --- a/fs/xfs/xfs_log.c > +++ b/fs/xfs/xfs_log.c > @@ -1415,7 +1415,7 @@ xlog_alloc_log( > iclog->ic_prev = prev_iclog; > prev_iclog = iclog; > > - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, > + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, > KM_MAYFAIL); > if (!iclog->ic_data) > goto out_free_iclog; > diff --git 
a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index 13d1d3e95b88..b4a6a008986b 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -125,7 +125,7 @@ xlog_alloc_buffer( > if (nbblks > 1 && log->l_sectBBsize > 1) > nbblks += log->l_sectBBsize; > nbblks = round_up(nbblks, log->l_sectBBsize); > - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); > + return kmem_alloc_io(BBTOB(nbblks), KM_MAYFAIL); > } > > /* > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 8bb8b4704a00..eaae275ed430 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -3604,6 +3604,7 @@ DEFINE_EVENT(xfs_kmem_class, name, \ > TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ > TP_ARGS(size, flags, caller_ip)) > DEFINE_KMEM_EVENT(kmem_alloc); > +DEFINE_KMEM_EVENT(kmem_alloc_io); > DEFINE_KMEM_EVENT(kmem_alloc_large); > DEFINE_KMEM_EVENT(kmem_realloc); > DEFINE_KMEM_EVENT(kmem_zone_alloc); > -- > 2.23.0.rc1 >
On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Memory we use to submit for IO needs strict alignment to the > > underlying driver contraints. Worst case, this is 512 bytes. Given > > that all allocations for IO are always a power of 2 multiple of 512 > > bytes, the kernel heap provides natural alignment for objects of > > these sizes and that suffices. > > > > Until, of course, memory debugging of some kind is turned on (e.g. > > red zones, poisoning, KASAN) and then the alignment of the heap > > objects is thrown out the window. Then we get weird IO errors and > > data corruption problems because drivers don't validate alignment > > and do the wrong thing when passed unaligned memory buffers in bios. > > > > TO fix this, introduce kmem_alloc_io(), which will guaranteeat least > > s/TO/To/ > > > 512 byte alignment of buffers for IO, even if memory debugging > > options are turned on. It is assumed that the minimum allocation > > size will be 512 bytes, and that sizes will be power of 2 mulitples > > of 512 bytes. > > > > Use this everywhere we allocate buffers for IO. > > > > This no longer fails with log recovery errors when KASAN is enabled > > due to the brd driver not handling unaligned memory buffers: > > > > # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > --- > > fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- > > fs/xfs/kmem.h | 1 + > > fs/xfs/xfs_buf.c | 4 +-- > > fs/xfs/xfs_log.c | 2 +- > > fs/xfs/xfs_log_recover.c | 2 +- > > fs/xfs/xfs_trace.h | 1 + > > 6 files changed, 50 insertions(+), 21 deletions(-) > > > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > > index edcf393c8fd9..ec693c0fdcff 100644 > > --- a/fs/xfs/kmem.c > > +++ b/fs/xfs/kmem.c > ... 
> > @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) > > return ptr; > > } > > > > +/* > > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > > + * returned. vmalloc always returns an aligned region. > > + */ > > +void * > > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > > +{ > > + void *ptr; > > + > > + trace_kmem_alloc_io(size, flags, _RET_IP_); > > + > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > + if (ptr) { > > + if (!((long)ptr & 511)) Er... didn't you say "it needs to grab the alignment from [blk_]queue_dma_alignment(), not use a hard coded value of 511"? How is this different? If this buffer is really for IO then shouldn't we pass in the buftarg or something so that we find the real alignment? Or check it down in the xfs_buf code that associates a page to a buffer? Even if all that logic is hidden behind CONFIG_XFS_DEBUG? > > + return ptr; > > + kfree(ptr); > > + } > > + return __kmem_vmalloc(size, flags); How about checking the vmalloc alignment too? If we're going to be this paranoid we might as well go all the way. :) --D > > +} > > Even though it is unfortunate, this seems like a quite reasonable and > isolated temporary solution to the problem to me. The one concern I have > is if/how much this could affect performance under certain > circumstances. I realize that these callsites are isolated in the common > scenario. Less common scenarios like sub-page block sizes (whether due > to explicit mkfs time format or default configurations on larger page > size systems) can fall into this path much more frequently, however. > > Since this implies some kind of vm debug option is enabled, performance > itself isn't critical when this solution is active. But how bad is it in > those cases where we might depend on this more heavily? Have you > confirmed that the end configuration is still "usable," at least? 
> > I ask because the repeated alloc/free behavior can easily be avoided via > something like an mp flag (which may require a tweak to the > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > path once we see one unaligned allocation. That assumes this behavior is > tied to functionality that isn't dynamically configured at runtime, of > course. > > Brian > > > + > > +void * > > +kmem_alloc_large(size_t size, xfs_km_flags_t flags) > > +{ > > + void *ptr; > > + > > + trace_kmem_alloc_large(size, flags, _RET_IP_); > > + > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > + if (ptr) > > + return ptr; > > + return __kmem_vmalloc(size, flags); > > +} > > + > > void * > > kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) > > { > > diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h > > index 267655acd426..423a1fa0fcd6 100644 > > --- a/fs/xfs/kmem.h > > +++ b/fs/xfs/kmem.h > > @@ -59,6 +59,7 @@ kmem_flags_convert(xfs_km_flags_t flags) > > } > > > > extern void *kmem_alloc(size_t, xfs_km_flags_t); > > +extern void *kmem_alloc_io(size_t, xfs_km_flags_t); > > extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); > > extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); > > static inline void kmem_free(const void *ptr) > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > > index ca0849043f54..7bd1f31febfc 100644 > > --- a/fs/xfs/xfs_buf.c > > +++ b/fs/xfs/xfs_buf.c > > @@ -353,7 +353,7 @@ xfs_buf_allocate_memory( > > */ > > size = BBTOB(bp->b_length); > > if (size < PAGE_SIZE) { > > - bp->b_addr = kmem_alloc(size, KM_NOFS); > > + bp->b_addr = kmem_alloc_io(size, KM_NOFS); > > if (!bp->b_addr) { > > /* low memory - use alloc_page loop instead */ > > goto use_alloc_page; > > @@ -368,7 +368,7 @@ xfs_buf_allocate_memory( > > } > > bp->b_offset = offset_in_page(bp->b_addr); > > bp->b_pages = bp->b_page_array; > > - bp->b_pages[0] = virt_to_page(bp->b_addr); > > + bp->b_pages[0] = kmem_to_page(bp->b_addr); > > bp->b_page_count = 
1; > > bp->b_flags |= _XBF_KMEM; > > return 0; > > diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c > > index 7fc3c1ad36bc..1830d185d7fc 100644 > > --- a/fs/xfs/xfs_log.c > > +++ b/fs/xfs/xfs_log.c > > @@ -1415,7 +1415,7 @@ xlog_alloc_log( > > iclog->ic_prev = prev_iclog; > > prev_iclog = iclog; > > > > - iclog->ic_data = kmem_alloc_large(log->l_iclog_size, > > + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, > > KM_MAYFAIL); > > if (!iclog->ic_data) > > goto out_free_iclog; > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > > index 13d1d3e95b88..b4a6a008986b 100644 > > --- a/fs/xfs/xfs_log_recover.c > > +++ b/fs/xfs/xfs_log_recover.c > > @@ -125,7 +125,7 @@ xlog_alloc_buffer( > > if (nbblks > 1 && log->l_sectBBsize > 1) > > nbblks += log->l_sectBBsize; > > nbblks = round_up(nbblks, log->l_sectBBsize); > > - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); > > + return kmem_alloc_io(BBTOB(nbblks), KM_MAYFAIL); > > } > > > > /* > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > index 8bb8b4704a00..eaae275ed430 100644 > > --- a/fs/xfs/xfs_trace.h > > +++ b/fs/xfs/xfs_trace.h > > @@ -3604,6 +3604,7 @@ DEFINE_EVENT(xfs_kmem_class, name, \ > > TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ > > TP_ARGS(size, flags, caller_ip)) > > DEFINE_KMEM_EVENT(kmem_alloc); > > +DEFINE_KMEM_EVENT(kmem_alloc_io); > > DEFINE_KMEM_EVENT(kmem_alloc_large); > > DEFINE_KMEM_EVENT(kmem_realloc); > > DEFINE_KMEM_EVENT(kmem_zone_alloc); > > -- > > 2.23.0.rc1 > >
On 8/21/19 8:35 AM, Brian Foster wrote: > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: >> From: Dave Chinner <dchinner@redhat.com> >> >> Memory we use to submit for IO needs strict alignment to the >> underlying driver contraints. Worst case, this is 512 bytes. Given >> that all allocations for IO are always a power of 2 multiple of 512 >> bytes, the kernel heap provides natural alignment for objects of >> these sizes and that suffices. >> >> Until, of course, memory debugging of some kind is turned on (e.g. >> red zones, poisoning, KASAN) and then the alignment of the heap >> objects is thrown out the window. Then we get weird IO errors and >> data corruption problems because drivers don't validate alignment >> and do the wrong thing when passed unaligned memory buffers in bios. >> >> TO fix this, introduce kmem_alloc_io(), which will guaranteeat least > > s/TO/To/ > >> 512 byte alignment of buffers for IO, even if memory debugging >> options are turned on. It is assumed that the minimum allocation >> size will be 512 bytes, and that sizes will be power of 2 mulitples >> of 512 bytes. >> >> Use this everywhere we allocate buffers for IO. >> >> This no longer fails with log recovery errors when KASAN is enabled >> due to the brd driver not handling unaligned memory buffers: >> >> # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test >> >> Signed-off-by: Dave Chinner <dchinner@redhat.com> >> --- >> fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- >> fs/xfs/kmem.h | 1 + >> fs/xfs/xfs_buf.c | 4 +-- >> fs/xfs/xfs_log.c | 2 +- >> fs/xfs/xfs_log_recover.c | 2 +- >> fs/xfs/xfs_trace.h | 1 + >> 6 files changed, 50 insertions(+), 21 deletions(-) >> >> diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c >> index edcf393c8fd9..ec693c0fdcff 100644 >> --- a/fs/xfs/kmem.c >> +++ b/fs/xfs/kmem.c > ... 
>> @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) >> return ptr; >> } >> >> +/* >> + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is >> + * returned. vmalloc always returns an aligned region. >> + */ >> +void * >> +kmem_alloc_io(size_t size, xfs_km_flags_t flags) >> +{ >> + void *ptr; >> + >> + trace_kmem_alloc_io(size, flags, _RET_IP_); >> + >> + ptr = kmem_alloc(size, flags | KM_MAYFAIL); >> + if (ptr) { >> + if (!((long)ptr & 511)) >> + return ptr; >> + kfree(ptr); >> + } >> + return __kmem_vmalloc(size, flags); >> +} > > Even though it is unfortunate, this seems like a quite reasonable and > isolated temporary solution to the problem to me. The one concern I have > is if/how much this could affect performance under certain > circumstances. I realize that these callsites are isolated in the common > scenario. Less common scenarios like sub-page block sizes (whether due > to explicit mkfs time format or default configurations on larger page > size systems) can fall into this path much more frequently, however. > > Since this implies some kind of vm debug option is enabled, performance > itself isn't critical when this solution is active. But how bad is it in > those cases where we might depend on this more heavily? Have you > confirmed that the end configuration is still "usable," at least? > > I ask because the repeated alloc/free behavior can easily be avoided via > something like an mp flag (which may require a tweak to the > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > path once we see one unaligned allocation. That assumes this behavior is > tied to functionality that isn't dynamically configured at runtime, of > course. This seems like a good idea to me. -Eric
On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > From: Dave Chinner <dchinner@redhat.com> > > > > Memory we use to submit for IO needs strict alignment to the > > underlying driver contraints. Worst case, this is 512 bytes. Given > > that all allocations for IO are always a power of 2 multiple of 512 > > bytes, the kernel heap provides natural alignment for objects of > > these sizes and that suffices. > > > > Until, of course, memory debugging of some kind is turned on (e.g. > > red zones, poisoning, KASAN) and then the alignment of the heap > > objects is thrown out the window. Then we get weird IO errors and > > data corruption problems because drivers don't validate alignment > > and do the wrong thing when passed unaligned memory buffers in bios. > > > > TO fix this, introduce kmem_alloc_io(), which will guaranteeat least > > s/TO/To/ > > > 512 byte alignment of buffers for IO, even if memory debugging > > options are turned on. It is assumed that the minimum allocation > > size will be 512 bytes, and that sizes will be power of 2 mulitples > > of 512 bytes. > > > > Use this everywhere we allocate buffers for IO. > > > > This no longer fails with log recovery errors when KASAN is enabled > > due to the brd driver not handling unaligned memory buffers: > > > > # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > --- > > fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- > > fs/xfs/kmem.h | 1 + > > fs/xfs/xfs_buf.c | 4 +-- > > fs/xfs/xfs_log.c | 2 +- > > fs/xfs/xfs_log_recover.c | 2 +- > > fs/xfs/xfs_trace.h | 1 + > > 6 files changed, 50 insertions(+), 21 deletions(-) > > > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > > index edcf393c8fd9..ec693c0fdcff 100644 > > --- a/fs/xfs/kmem.c > > +++ b/fs/xfs/kmem.c > ... 
> > @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) > > return ptr; > > } > > > > +/* > > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > > + * returned. vmalloc always returns an aligned region. > > + */ > > +void * > > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > > +{ > > + void *ptr; > > + > > + trace_kmem_alloc_io(size, flags, _RET_IP_); > > + > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > + if (ptr) { > > + if (!((long)ptr & 511)) > > + return ptr; > > + kfree(ptr); > > + } > > + return __kmem_vmalloc(size, flags); > > +} > > Even though it is unfortunate, this seems like a quite reasonable and > isolated temporary solution to the problem to me. The one concern I have > is if/how much this could affect performance under certain > circumstances. Can't measure a difference on 4k block size filesystems. It's only used for log recovery and then for allocation AGF/AGI buffers on 512 byte sector devices. Anything using 4k sectors only hits it during mount. So for default configs with memory posioning/KASAN enabled, the massive overhead of poisoning/tracking makes this disappear in the noise. For 1k block size filesystems, it gets hit much harder, but there's no noticable increase in runtime of xfstests vs 4k block size with KASAN enabled. The big increase in overhead comes from enabling KASAN (takes 3x longer than without), not doing one extra allocation/free pair. > I realize that these callsites are isolated in the common > scenario. Less common scenarios like sub-page block sizes (whether due > to explicit mkfs time format or default configurations on larger page > size systems) can fall into this path much more frequently, however. *nod* > Since this implies some kind of vm debug option is enabled, performance > itself isn't critical when this solution is active. But how bad is it in > those cases where we might depend on this more heavily? 
Have you > confirmed that the end configuration is still "usable," at least? No noticeable difference, most definitely still usable. > I ask because the repeated alloc/free behavior can easily be avoided via > something like an mp flag (which may require a tweak to the What's an "mp flag"? > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > path once we see one unaligned allocation. That assumes this behavior is > tied to functionality that isn't dynamically configured at runtime, of > course. vmalloc() has a _lot_ more overhead than kmalloc (especially when vmalloc has to do multiple allocations itself to set up page table entries) so there is still an overall gain to be had by trying kmalloc even if 50% of allocations are unaligned. Cheers, Dave.
On Wed, Aug 21, 2019 at 08:08:01AM -0700, Darrick J. Wong wrote: > On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > > From: Dave Chinner <dchinner@redhat.com> > > > > > > Memory we use to submit for IO needs strict alignment to the > > > underlying driver contraints. Worst case, this is 512 bytes. Given > > > that all allocations for IO are always a power of 2 multiple of 512 > > > bytes, the kernel heap provides natural alignment for objects of > > > these sizes and that suffices. > > > > > > Until, of course, memory debugging of some kind is turned on (e.g. > > > red zones, poisoning, KASAN) and then the alignment of the heap > > > objects is thrown out the window. Then we get weird IO errors and > > > data corruption problems because drivers don't validate alignment > > > and do the wrong thing when passed unaligned memory buffers in bios. > > > > > > TO fix this, introduce kmem_alloc_io(), which will guaranteeat least > > > > s/TO/To/ > > > > > 512 byte alignment of buffers for IO, even if memory debugging > > > options are turned on. It is assumed that the minimum allocation > > > size will be 512 bytes, and that sizes will be power of 2 mulitples > > > of 512 bytes. > > > > > > Use this everywhere we allocate buffers for IO. 
> > > > > > This no longer fails with log recovery errors when KASAN is enabled > > > due to the brd driver not handling unaligned memory buffers: > > > > > > # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test > > > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > > --- > > > fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- > > > fs/xfs/kmem.h | 1 + > > > fs/xfs/xfs_buf.c | 4 +-- > > > fs/xfs/xfs_log.c | 2 +- > > > fs/xfs/xfs_log_recover.c | 2 +- > > > fs/xfs/xfs_trace.h | 1 + > > > 6 files changed, 50 insertions(+), 21 deletions(-) > > > > > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > > > index edcf393c8fd9..ec693c0fdcff 100644 > > > --- a/fs/xfs/kmem.c > > > +++ b/fs/xfs/kmem.c > > ... > > > @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) > > > return ptr; > > > } > > > > > > +/* > > > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > > > + * returned. vmalloc always returns an aligned region. > > > + */ > > > +void * > > > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > > > +{ > > > + void *ptr; > > > + > > > + trace_kmem_alloc_io(size, flags, _RET_IP_); > > > + > > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > > + if (ptr) { > > > + if (!((long)ptr & 511)) > > Er... didn't you say "it needs to grab the alignment from > [blk_]queue_dma_alignment(), not use a hard coded value of 511"? That's fine for the bio, which has a direct pointer to the request queue. Here the allocation may be a long way away from the IO itself, and we migh actually be slicing and dicing an allocated region into a bio and not just the whole region itself. So I've just taken the worst case - queue_dma_alignment() returns 511 if no queue is supplied, so this is the worst case alignment that the block device will require. We don't need any more to fix the problem right now, and getting alignment into this function in all cases makes it a bit more complicated... > How is this different? 
If this buffer is really for IO then shouldn't > we pass in the buftarg or something so that we find the real alignment? > Or check it down in the xfs_buf code that associates a page to a buffer? No, at worst we should pass in the alignment - there is no good reason to be passing buftargs, block devices or request queues into memory allocation APIs. I'll have a look at this. > Even if all that logic is hidden behind CONFIG_XFS_DEBUG? That's no good because memory debugging can be turned on without CONFIG_XFS_DEBUG (how we tripped over the xenblk case). > > > > + return ptr; > > > + kfree(ptr); > > > + } > > > + return __kmem_vmalloc(size, flags); > How about checking the vmalloc alignment too? If we're going to be this > paranoid we might as well go all the way. :) if vmalloc is returning unaligned regions, lots of stuff is going to break everywhere. It has to be page aligned because of the pte mappings it requires. Memory debugging puts guard pages around vmalloc, it doesn't change the alignment. Cheers, Dave.
> + > +/* > + * __vmalloc() will allocate data pages and auxillary structures (e.g. > + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence > + * we need to tell memory reclaim that we are in such a context via > + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here > + * and potentially deadlocking. > + */ Btw, I think we should eventually kill off KM_NOFS and just use PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. But that's something for the future. > +/* > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > + * returned. vmalloc always returns an aligned region. > + */ > +void * > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > +{ > + void *ptr; > + > + trace_kmem_alloc_io(size, flags, _RET_IP_); > + > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > + if (ptr) { > + if (!((long)ptr & 511)) Please use unsigned long (or uintptr_t if you want to be fancy), and (SECTOR_SIZE - 1). As said elsewhere if we want to be fancy we should probably pass a request queue or something pointing to it. But then again I don't think it really matters much, it would save us the reallocation with slub debug for a bunch of scsi adapters that support dword aligned I/O. But last least the interface would be a little more obvious.
On Wed, Aug 21, 2019 at 04:24:40PM -0700, Christoph Hellwig wrote: > > + > > +/* > > + * __vmalloc() will allocate data pages and auxillary structures (e.g. > > + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence > > + * we need to tell memory reclaim that we are in such a context via > > + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here > > + * and potentially deadlocking. > > + */ > > Btw, I think we should eventually kill off KM_NOFS and just use > PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. > But that's something for the future. Yeah, and it's not quite as simple as just using PF_MEMALLOC_NOFS at high levels - we'll still need to annotate callers that use KM_NOFS to avoid lockdep false positives. i.e. any code that can be called from GFP_KERNEL and reclaim context will throw false positives from lockdep if we don't annotate tehm correctly.... > > +/* > > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > > + * returned. vmalloc always returns an aligned region. > > + */ > > +void * > > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > > +{ > > + void *ptr; > > + > > + trace_kmem_alloc_io(size, flags, _RET_IP_); > > + > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > + if (ptr) { > > + if (!((long)ptr & 511)) > > Please use unsigned long (or uintptr_t if you want to be fancy), > and (SECTOR_SIZE - 1). Already changed it to uintptr_t when I did... > > As said elsewhere if we want to be fancy we should probably pass a > request queue or something pointing to it. .... this. Well, not exactly this - I pass in the alignment required as an int, and the callers get it from the request queue.... > But then again I don't think > it really matters much, it would save us the reallocation with slub debug > for a bunch of scsi adapters that support dword aligned I/O. But last > least the interface would be a little more obvious. 
Yup, just smoke testing it now before I resend. Cheers, Dave.
On Thu, Aug 22, 2019 at 10:31:32AM +1000, Dave Chinner wrote: > > Btw, I think we should eventually kill off KM_NOFS and just use > > PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. > > But that's something for the future. > > Yeah, and it's not quite as simple as just using PF_MEMALLOC_NOFS > at high levels - we'll still need to annotate callers that use KM_NOFS > to avoid lockdep false positives. i.e. any code that can be called from > GFP_KERNEL and reclaim context will throw false positives from > lockdep if we don't annotate tehm correctly.... Oh well. For now we have the XFS kmem_wrappers to turn that into GFP_NOFS so we shouldn't be too worried, but I think that is something we should fix in lockdep to ensure it is generally useful. I've added the maintainers and relevant lists to kick off a discussion.
On Thu, Aug 22, 2019 at 12:59:48AM -0700, Christoph Hellwig wrote: > On Thu, Aug 22, 2019 at 10:31:32AM +1000, Dave Chinner wrote: > > > Btw, I think we should eventually kill off KM_NOFS and just use > > > PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. > > > But that's something for the future. > > > > Yeah, and it's not quite as simple as just using PF_MEMALLOC_NOFS > > at high levels - we'll still need to annotate callers that use KM_NOFS > > to avoid lockdep false positives. i.e. any code that can be called from > > GFP_KERNEL and reclaim context will throw false positives from > > lockdep if we don't annotate tehm correctly.... > > Oh well. For now we have the XFS kmem_wrappers to turn that into > GFP_NOFS so we shouldn't be too worried, but I think that is something > we should fix in lockdep to ensure it is generally useful. I've added > the maintainers and relevant lists to kick off a discussion. Strictly speaking the fs_reclaim annotation is no longer part of the lockdep core, but is simply a fake lock in page_alloc.c and thus falls under the mm people's purview. That said; it should be fairly straight forward to teach __need_fs_reclaim() about PF_MEMALLOC_NOFS, much like how it already knows about PF_MEMALLOC.
On Thu, Aug 22, 2019 at 10:51:30AM +0200, Peter Zijlstra wrote: > On Thu, Aug 22, 2019 at 12:59:48AM -0700, Christoph Hellwig wrote: > > On Thu, Aug 22, 2019 at 10:31:32AM +1000, Dave Chinner wrote: > > > > Btw, I think we should eventually kill off KM_NOFS and just use > > > > PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. > > > > But that's something for the future. > > > > > > Yeah, and it's not quite as simple as just using PF_MEMALLOC_NOFS > > > at high levels - we'll still need to annotate callers that use KM_NOFS > > > to avoid lockdep false positives. i.e. any code that can be called from > > > GFP_KERNEL and reclaim context will throw false positives from > > > lockdep if we don't annotate tehm correctly.... > > > > Oh well. For now we have the XFS kmem_wrappers to turn that into > > GFP_NOFS so we shouldn't be too worried, but I think that is something > > we should fix in lockdep to ensure it is generally useful. I've added > > the maintainers and relevant lists to kick off a discussion. > > Strictly speaking the fs_reclaim annotation is no longer part of the > lockdep core, but is simply a fake lock in page_alloc.c and thus falls > under the mm people's purview. > > That said; it should be fairly straight forward to teach > __need_fs_reclaim() about PF_MEMALLOC_NOFS, much like how it already > knows about PF_MEMALLOC. Ah, current_gfp_context() already seems to transfer PF_MEMALLOC_NOFS into the GFP flags. So are we sure it is broken and needs mending?
On Thu, Aug 22, 2019 at 11:10:57AM +0200, Peter Zijlstra wrote: > On Thu, Aug 22, 2019 at 10:51:30AM +0200, Peter Zijlstra wrote: > > On Thu, Aug 22, 2019 at 12:59:48AM -0700, Christoph Hellwig wrote: > > > On Thu, Aug 22, 2019 at 10:31:32AM +1000, Dave Chinner wrote: > > > > > Btw, I think we should eventually kill off KM_NOFS and just use > > > > > PF_MEMALLOC_NOFS in XFS, as the interface makes so much more sense. > > > > > But that's something for the future. > > > > > > > > Yeah, and it's not quite as simple as just using PF_MEMALLOC_NOFS > > > > at high levels - we'll still need to annotate callers that use KM_NOFS > > > > to avoid lockdep false positives. i.e. any code that can be called from > > > > GFP_KERNEL and reclaim context will throw false positives from > > > > lockdep if we don't annotate tehm correctly.... > > > > > > Oh well. For now we have the XFS kmem_wrappers to turn that into > > > GFP_NOFS so we shouldn't be too worried, but I think that is something > > > we should fix in lockdep to ensure it is generally useful. I've added > > > the maintainers and relevant lists to kick off a discussion. > > > > Strictly speaking the fs_reclaim annotation is no longer part of the > > lockdep core, but is simply a fake lock in page_alloc.c and thus falls > > under the mm people's purview. > > > > That said; it should be fairly straight forward to teach > > __need_fs_reclaim() about PF_MEMALLOC_NOFS, much like how it already > > knows about PF_MEMALLOC. > > Ah, current_gfp_context() already seems to transfer PF_MEMALLOC_NOFS > into the GFP flags. > > So are we sure it is broken and needs mending? Well, that's what we are trying to work out. The problem is that we have code that takes locks and does allocations that is called both above and below the reclaim "lock" context. Once it's been seen below the reclaim lock context, calling it with GFP_KERNEL context above the reclaim lock context throws a deadlock warning. 
The only way around that was to mark these allocation sites as GFP_NOFS so lockdep is never allowed to see that recursion through reclaim occurs. Even though it isn't a deadlock vector. What we're looking at is whether PF_MEMALLOC_NOFS changes this - I don't think it does solve this problem. i.e. if we define the allocation as GFP_KERNEL and then use PF_MEMALLOC_NOFS where reclaim is not allowed, we still have GFP_KERNEL allocations in code above reclaim that has also been seen below reclaim. And so we'll get false positive warnings again. What I think we are going to have to do here is manually audit each of the KM_NOFS call sites as we remove the NOFS from them and determine if ___GFP_NOLOCKDEP is needed to stop lockdep from trying to track these allocation sites. We've never used this tag because we'd already fixed most of these false positives with explicit GFP_NOFS tags long before ___GFP_NOLOCKDEP was created. But until someone starts doing the work, I don't know if it will work or even whether converting to PF_MEMALLOC_NOFS is going to introduce a bunch of new ways to get false positives from lockdep... Cheers, Dave.
On 8/22/19 12:14 PM, Dave Chinner wrote: > On Thu, Aug 22, 2019 at 11:10:57AM +0200, Peter Zijlstra wrote: >> >> Ah, current_gfp_context() already seems to transfer PF_MEMALLOC_NOFS >> into the GFP flags. >> >> So are we sure it is broken and needs mending? > > Well, that's what we are trying to work out. The problem is that we > have code that takes locks and does allocations that is called both > above and below the reclaim "lock" context. Once it's been seen > below the reclaim lock context, calling it with GFP_KERNEL context > above the reclaim lock context throws a deadlock warning. > > The only way around that was to mark these allocation sites as > GFP_NOFS so lockdep is never allowed to see that recursion through > reclaim occur. Even though it isn't a deadlock vector. > > What we're looking at is whether PF_MEMALLOC_NOFS changes this - I > don't think it does solve this problem. i.e. if we define the > allocation as GFP_KERNEL and then use PF_MEMALLOC_NOFS where reclaim > is not allowed, we still have GFP_KERNEL allocations in code above > reclaim that has also been seen below relcaim. And so we'll get > false positive warnings again. If I understand both you and the code directly, the code sites won't call __fs_reclaim_acquire when called with current->flags including PF_MEMALLOC_NOFS. So that would mean they "won't be seen below the reclaim" and all would be fine, right? > What I think we are going to have to do here is manually audit > each of the KM_NOFS call sites as we remove the NOFS from them and > determine if ___GFP_NOLOCKDEP is needed to stop lockdep from trying > to track these allocation sites. We've never used this tag because > we'd already fixed most of these false positives with explicit > GFP_NOFS tags long before ___GFP_NOLOCKDEP was created. 
> > But until someone starts doing the work, I don't know if it will > work or even whether conversion PF_MEMALLOC_NOFS is going to > introduce a bunch of new ways to get false positives from lockdep... > > Cheers, > > Dave. >
On Thu, Aug 22, 2019 at 01:14:30PM +0200, Vlastimil Babka wrote: > On 8/22/19 12:14 PM, Dave Chinner wrote: > > On Thu, Aug 22, 2019 at 11:10:57AM +0200, Peter Zijlstra wrote: > >> > >> Ah, current_gfp_context() already seems to transfer PF_MEMALLOC_NOFS > >> into the GFP flags. > >> > >> So are we sure it is broken and needs mending? > > > > Well, that's what we are trying to work out. The problem is that we > > have code that takes locks and does allocations that is called both > > above and below the reclaim "lock" context. Once it's been seen > > below the reclaim lock context, calling it with GFP_KERNEL context > > above the reclaim lock context throws a deadlock warning. > > > > The only way around that was to mark these allocation sites as > > GFP_NOFS so lockdep is never allowed to see that recursion through > > reclaim occur. Even though it isn't a deadlock vector. > > > > What we're looking at is whether PF_MEMALLOC_NOFS changes this - I > > don't think it does solve this problem. i.e. if we define the > > allocation as GFP_KERNEL and then use PF_MEMALLOC_NOFS where reclaim > > is not allowed, we still have GFP_KERNEL allocations in code above > > reclaim that has also been seen below relcaim. And so we'll get > > false positive warnings again. > > If I understand both you and the code directly, the code sites won't call > __fs_reclaim_acquire when called with current->flags including PF_MEMALLOC_NOFS. > So that would mean they "won't be seen below the reclaim" and all would be fine, > right? No, the problem is this (using kmalloc as a general term for allocation, whether it be kmalloc, kmem_cache_alloc, alloc_page, etc) some random kernel code kmalloc(GFP_KERNEL) reclaim PF_MEMALLOC shrink_slab xfs_inode_shrink XFS_ILOCK xfs_buf_allocate_memory() kmalloc(GFP_KERNEL) And so locks on inodes in reclaim are seen below reclaim. 
Then somewhere else we have: some high level read-only xfs code like readdir XFS_ILOCK xfs_buf_allocate_memory() kmalloc(GFP_KERNEL) reclaim And this one throws false positive lockdep warnings because we called into reclaim with XFS_ILOCK held and GFP_KERNEL alloc context. So the only solution we had at the time to shut it up was: some high level read-only xfs code like readdir XFS_ILOCK xfs_buf_allocate_memory() kmalloc(GFP_NOFS) So that lockdep sees it's not going to recurse into reclaim and doesn't throw a warning... Cheers, Dave.
On 8/22/19 2:07 PM, Dave Chinner wrote: > On Thu, Aug 22, 2019 at 01:14:30PM +0200, Vlastimil Babka wrote: > > No, the problem is this (using kmalloc as a general term for > allocation, whether it be kmalloc, kmem_cache_alloc, alloc_page, etc) > > some random kernel code > kmalloc(GFP_KERNEL) > reclaim > PF_MEMALLOC > shrink_slab > xfs_inode_shrink > XFS_ILOCK > xfs_buf_allocate_memory() > kmalloc(GFP_KERNEL) > > And so locks on inodes in reclaim are seen below reclaim. Then > somewhere else we have: > > some high level read-only xfs code like readdir > XFS_ILOCK > xfs_buf_allocate_memory() > kmalloc(GFP_KERNEL) > reclaim > > And this one throws false positive lockdep warnings because we > called into reclaim with XFS_ILOCK held and GFP_KERNEL alloc OK, and what exactly makes this positive a false one? Why can't it continue like the first example where reclaim leads to another XFS_ILOCK, thus deadlock? > context. So the only solution we had at the tiem to shut it up was: > > some high level read-only xfs code like readdir > XFS_ILOCK > xfs_buf_allocate_memory() > kmalloc(GFP_NOFS) > > So that lockdep sees it's not going to recurse into reclaim and > doesn't throw a warning... AFAICS that GFP_NOFS would fix not only a warning but also a real deadlock (depending on the answer to my previous question). > Cheers, > > Dave. >
On Thu, Aug 22, 2019 at 02:19:04PM +0200, Vlastimil Babka wrote: > On 8/22/19 2:07 PM, Dave Chinner wrote: > > On Thu, Aug 22, 2019 at 01:14:30PM +0200, Vlastimil Babka wrote: > > > > No, the problem is this (using kmalloc as a general term for > > allocation, whether it be kmalloc, kmem_cache_alloc, alloc_page, etc) > > > > some random kernel code > > kmalloc(GFP_KERNEL) > > reclaim > > PF_MEMALLOC > > shrink_slab > > xfs_inode_shrink > > XFS_ILOCK > > xfs_buf_allocate_memory() > > kmalloc(GFP_KERNEL) > > > > And so locks on inodes in reclaim are seen below reclaim. Then > > somewhere else we have: > > > > some high level read-only xfs code like readdir > > XFS_ILOCK > > xfs_buf_allocate_memory() > > kmalloc(GFP_KERNEL) > > reclaim > > > > And this one throws false positive lockdep warnings because we > > called into reclaim with XFS_ILOCK held and GFP_KERNEL alloc > > OK, and what exactly makes this positive a false one? Why can't it continue like > the first example where reclaim leads to another XFS_ILOCK, thus deadlock? Because above reclaim we only have operations being done on referenced inodes, and below reclaim we only have unreferenced inodes. We never lock the same inode both above and below reclaim at the same time. IOWs, an operation above reclaim cannot see, access or lock unreferenced inodes, except in inode write clustering, and that uses trylocks so cannot deadlock with reclaim. An operation below reclaim cannot see, access or lock referenced inodes except during inode write clustering, and that uses trylocks so cannot deadlock with code above reclaim. FWIW, I'm trying to make the inode writeback clustering go away from reclaim at the moment, so even that possibility is going away soon. That will change everything to trylocks in reclaim context, so lockdep is going to stop tracking it entirely. Hmmm - maybe we're getting to the point where we actually don't need GFP_NOFS/PF_MEMALLOC_NOFS at all in XFS anymore..... Cheers, Dave.
On Thu, Aug 22, 2019 at 07:14:52AM +1000, Dave Chinner wrote: > On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > > From: Dave Chinner <dchinner@redhat.com> > > > > > > Memory we use to submit for IO needs strict alignment to the > > > underlying driver contraints. Worst case, this is 512 bytes. Given > > > that all allocations for IO are always a power of 2 multiple of 512 > > > bytes, the kernel heap provides natural alignment for objects of > > > these sizes and that suffices. > > > > > > Until, of course, memory debugging of some kind is turned on (e.g. > > > red zones, poisoning, KASAN) and then the alignment of the heap > > > objects is thrown out the window. Then we get weird IO errors and > > > data corruption problems because drivers don't validate alignment > > > and do the wrong thing when passed unaligned memory buffers in bios. > > > > > > TO fix this, introduce kmem_alloc_io(), which will guaranteeat least > > > > s/TO/To/ > > > > > 512 byte alignment of buffers for IO, even if memory debugging > > > options are turned on. It is assumed that the minimum allocation > > > size will be 512 bytes, and that sizes will be power of 2 mulitples > > > of 512 bytes. > > > > > > Use this everywhere we allocate buffers for IO. 
> > > > > > This no longer fails with log recovery errors when KASAN is enabled > > > due to the brd driver not handling unaligned memory buffers: > > > > > > # mkfs.xfs -f /dev/ram0 ; mount /dev/ram0 /mnt/test > > > > > > Signed-off-by: Dave Chinner <dchinner@redhat.com> > > > --- > > > fs/xfs/kmem.c | 61 +++++++++++++++++++++++++++++----------- > > > fs/xfs/kmem.h | 1 + > > > fs/xfs/xfs_buf.c | 4 +-- > > > fs/xfs/xfs_log.c | 2 +- > > > fs/xfs/xfs_log_recover.c | 2 +- > > > fs/xfs/xfs_trace.h | 1 + > > > 6 files changed, 50 insertions(+), 21 deletions(-) > > > > > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > > > index edcf393c8fd9..ec693c0fdcff 100644 > > > --- a/fs/xfs/kmem.c > > > +++ b/fs/xfs/kmem.c > > ... > > > @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) > > > return ptr; > > > } > > > > > > +/* > > > + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is > > > + * returned. vmalloc always returns an aligned region. > > > + */ > > > +void * > > > +kmem_alloc_io(size_t size, xfs_km_flags_t flags) > > > +{ > > > + void *ptr; > > > + > > > + trace_kmem_alloc_io(size, flags, _RET_IP_); > > > + > > > + ptr = kmem_alloc(size, flags | KM_MAYFAIL); > > > + if (ptr) { > > > + if (!((long)ptr & 511)) > > > + return ptr; > > > + kfree(ptr); > > > + } > > > + return __kmem_vmalloc(size, flags); > > > +} > > > > Even though it is unfortunate, this seems like a quite reasonable and > > isolated temporary solution to the problem to me. The one concern I have > > is if/how much this could affect performance under certain > > circumstances. > > Can't measure a difference on 4k block size filesystems. It's only > used for log recovery and then for allocation AGF/AGI buffers on > 512 byte sector devices. Anything using 4k sectors only hits it > during mount. So for default configs with memory posioning/KASAN > enabled, the massive overhead of poisoning/tracking makes this > disappear in the noise. 
> > For 1k block size filesystems, it gets hit much harder, but > there's no noticable increase in runtime of xfstests vs 4k block > size with KASAN enabled. The big increase in overhead comes from > enabling KASAN (takes 3x longer than without), not doing one extra > allocation/free pair. > > > I realize that these callsites are isolated in the common > > scenario. Less common scenarios like sub-page block sizes (whether due > > to explicit mkfs time format or default configurations on larger page > > size systems) can fall into this path much more frequently, however. > > *nod* > > > Since this implies some kind of vm debug option is enabled, performance > > itself isn't critical when this solution is active. But how bad is it in > > those cases where we might depend on this more heavily? Have you > > confirmed that the end configuration is still "usable," at least? > > No noticable difference, most definitely still usable. > Ok, thanks. > > I ask because the repeated alloc/free behavior can easily be avoided via > > something like an mp flag (which may require a tweak to the > > What's an "mp flag"? > A bool or something similar added to xfs_mount to control further slab allocation attempts for I/O allocations for this mount. I was just throwing this out there if the performance hit happened to be bad (on top of whatever vm debug option is enabled) in those configurations where slab based buffer allocations are more common. If the performance hit is negligible in practice, then I'm not worried about it. > > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > > path once we see one unaligned allocation. That assumes this behavior is > > tied to functionality that isn't dynamically configured at runtime, of > > course. 
> > vmalloc() has a _lot_ more overhead than kmalloc (especially when > vmalloc has to do multiple allocations itself to set up page table > entries) so there is still an overall gain to be had by trying > kmalloc even if 50% of allocations are unaligned. > I had the impression that this unaligned allocation behavior is tied to enablement of debug options that otherwise aren't enabled/disabled dynamically. Hence, the unaligned allocation behavior is persistent for a particular mount and repeated attempts are pointless once we see at least one such result. Is that not the case? Again, I don't think performance is a huge deal so long as testing shows that an fs is still usable with XFS running this kind of allocation pattern. In thinking further about it, aren't we essentially bypassing these tools for associated allocations if they don't offer similar functionality for vmalloc allocations? It might be worth 1.) noting that as a consequence of this change in the commit log and 2.) having a oneshot warning somewhere when we initially hit this problem so somebody actually using one of these tools realizes that enabling it actually changes allocation behavior. For example: XFS ...: WARNING: Unaligned I/O memory allocation. VM debug enabled? Disabling slab allocations for I/O. ... or alternatively just add a WARN_ON_ONCE() or something with a similar comment in the code. Brian > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
On 8/22/19 3:17 PM, Dave Chinner wrote: > On Thu, Aug 22, 2019 at 02:19:04PM +0200, Vlastimil Babka wrote: >> On 8/22/19 2:07 PM, Dave Chinner wrote: >> > On Thu, Aug 22, 2019 at 01:14:30PM +0200, Vlastimil Babka wrote: >> > >> > No, the problem is this (using kmalloc as a general term for >> > allocation, whether it be kmalloc, kmem_cache_alloc, alloc_page, etc) >> > >> > some random kernel code >> > kmalloc(GFP_KERNEL) >> > reclaim >> > PF_MEMALLOC >> > shrink_slab >> > xfs_inode_shrink >> > XFS_ILOCK >> > xfs_buf_allocate_memory() >> > kmalloc(GFP_KERNEL) >> > >> > And so locks on inodes in reclaim are seen below reclaim. Then >> > somewhere else we have: >> > >> > some high level read-only xfs code like readdir >> > XFS_ILOCK >> > xfs_buf_allocate_memory() >> > kmalloc(GFP_KERNEL) >> > reclaim >> > >> > And this one throws false positive lockdep warnings because we >> > called into reclaim with XFS_ILOCK held and GFP_KERNEL alloc >> >> OK, and what exactly makes this positive a false one? Why can't it continue like >> the first example where reclaim leads to another XFS_ILOCK, thus deadlock? > > Because above reclaim we only have operations being done on > referenced inodes, and below reclaim we only have unreferenced > inodes. We never lock the same inode both above and below reclaim > at the same time. > > IOWs, an operation above reclaim cannot see, access or lock > unreferenced inodes, except in inode write clustering, and that uses > trylocks so cannot deadlock with reclaim. > > An operation below reclaim cannot see, access or lock referenced > inodes except during inode write clustering, and that uses trylocks > so cannot deadlock with code above reclaim. Thanks for elaborating. Perhaps lockdep experts (not me) would know how to express that. If not possible, then replacing GFP_NOFS with __GFP_NOLOCKDEP should indeed suppress the warning, while allowing FS reclaim. 
> FWIW, I'm trying to make the inode writeback clustering go away from > reclaim at the moment, so even that possibility is going away soon. > That will change everything to trylocks in reclaim context, so > lockdep is going to stop tracking it entirely. That's also a nice solution :) > Hmmm - maybe we're getting to the point where we actually > don't need GFP_NOFS/PF_MEMALLOC_NOFS at all in XFS anymore..... > > Cheers, > > Dave. >
On Thu, Aug 22, 2019 at 09:40:17AM -0400, Brian Foster wrote: > On Thu, Aug 22, 2019 at 07:14:52AM +1000, Dave Chinner wrote: > > On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > > > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > > > path once we see one unaligned allocation. That assumes this behavior is > > > tied to functionality that isn't dynamically configured at runtime, of > > > course. > > > > vmalloc() has a _lot_ more overhead than kmalloc (especially when > > vmalloc has to do multiple allocations itself to set up page table > > entries) so there is still an overall gain to be had by trying > > kmalloc even if 50% of allocations are unaligned. > > > > I had the impression that this unaligned allocation behavior is tied to > enablement of debug options that otherwise aren't enabled/disabled > dynamically. Hence, the unaligned allocation behavior is persistent for > a particular mount and repeated attempts are pointless once we see at > least one such result. Is that not the case? The alignment for a given slab is likely to be persistent, but it will be different for different sized slabs. e.g. I saw 128 offsets for 512 slabs, and 1024 byte offsets for 4k slabs. The 1024 byte offsets worked just fine (because multiple of 512 bytes!) but the 128 byte ones didn't. Hence it's not a black and white "everythign is unaligned and unsupportable" situation, nor is the alignment necessarily an issue for the underlying driver. e.g. most scsi and nvme handle 8 byte alignment of buffers, and if the buffer is not aligned they bounce it (detected via the same blk_rq_alignment() check I added) and can still do the IO anyway. So a large amount of the IO stack just doesn't care about user buffers being unaligned.... 
> Again, I don't think performance is a huge deal so long as testing shows > that an fs is still usable with XFS running this kind of allocation > pattern. It's added 10 minutes to the runtime of a full auto run with KASAN enabled on pmem. To put that in context, the entire run took: real 461m36.831s user 44m31.779s sys 708m37.467s More than 7.5 hours to complete, so ten minutes here or there is noise. > In thinking further about it, aren't we essentially bypassing > these tools for associated allocations if they don't offer similar > functionality for vmalloc allocations? kasan uses guard pages around vmalloc allocations to detect out of bound accesses. It still tracks the page allocations, etc, so we still get use after free tracking, etc. i.e. AFAICT we don't actually lose any debugging functionality by using vmalloc here. > It might be worth 1.) noting that > as a consequence of this change in the commit log and 2.) having a > oneshot warning somewhere when we initially hit this problem so somebody > actually using one of these tools realizes that enabling it actually > changes allocation behavior. For example: > > XFS ...: WARNING: Unaligned I/O memory allocation. VM debug enabled? > Disabling slab allocations for I/O. > > ... or alternatively just add a WARN_ON_ONCE() or something with a > similar comment in the code. Well, the WARN_ON_ONCE is in xfs_bio_add_page() when it detects an invalid alignment. So we'll get this warning on production systems as well as debug/test systems. I think that's the important case to catch, because misalignment will result in silent data corruption... Cheers, Dave.
On Fri, Aug 23, 2019 at 08:39:59AM +1000, Dave Chinner wrote: > On Thu, Aug 22, 2019 at 09:40:17AM -0400, Brian Foster wrote: > > On Thu, Aug 22, 2019 at 07:14:52AM +1000, Dave Chinner wrote: > > > On Wed, Aug 21, 2019 at 09:35:33AM -0400, Brian Foster wrote: > > > > On Wed, Aug 21, 2019 at 06:38:19PM +1000, Dave Chinner wrote: > > > > kmem_alloc_io() interface) to skip further kmem_alloc() calls from this > > > > path once we see one unaligned allocation. That assumes this behavior is > > > > tied to functionality that isn't dynamically configured at runtime, of > > > > course. > > > > > > vmalloc() has a _lot_ more overhead than kmalloc (especially when > > > vmalloc has to do multiple allocations itself to set up page table > > > entries) so there is still an overall gain to be had by trying > > > kmalloc even if 50% of allocations are unaligned. > > > > > > > I had the impression that this unaligned allocation behavior is tied to > > enablement of debug options that otherwise aren't enabled/disabled > > dynamically. Hence, the unaligned allocation behavior is persistent for > > a particular mount and repeated attempts are pointless once we see at > > least one such result. Is that not the case? > > The alignment for a given slab is likely to be persistent, but it > will be different for different sized slabs. e.g. I saw 128 offsets > for 512 slabs, and 1024 byte offsets for 4k slabs. The 1024 byte > offsets worked just fine (because multiple of 512 bytes!) but the > 128 byte ones didn't. > Interesting. I wasn't aware offsets could be large enough to maintain alignment requirements. > Hence it's not a black and white "everythign is unaligned and > unsupportable" situation, nor is the alignment necessarily an issue > for the underlying driver. e.g. most scsi and nvme handle 8 > byte alignment of buffers, and if the buffer is not aligned they > bounce it (detected via the same blk_rq_alignment() check I added) > and can still do the IO anyway. 
So a large amount of the IO stack > just doesn't care about user buffers being unaligned.... > > > Again, I don't think performance is a huge deal so long as testing shows > > that an fs is still usable with XFS running this kind of allocation > > pattern. > > It's added 10 minutes to the runtime of a full auto run with KASAN > enabled on pmem. To put that in context, the entire run took: > > real 461m36.831s > user 44m31.779s > sys 708m37.467s > > More than 7.5 hours to complete, so ten minutes here or there is > noise. > Agreed. Thanks for the data. > > In thinking further about it, aren't we essentially bypassing > > these tools for associated allocations if they don't offer similar > > functionality for vmalloc allocations? > > kasan uses guard pages around vmalloc allocations to detect out of > bound accesses. It still tracks the page allocations, etc, so we > still get use after free tracking, etc. i.e. AFAICT we don't > actually lose any debugging functonality by using vmalloc here. > Ok. > > It might be worth 1.) noting that > > as a consequence of this change in the commit log and 2.) having a > > oneshot warning somewhere when we initially hit this problem so somebody > > actually using one of these tools realizes that enabling it actually > > changes allocation behavior. For example: > > > > XFS ...: WARNING: Unaligned I/O memory allocation. VM debug enabled? > > Disabling slab allocations for I/O. > > > > ... or alternatively just add a WARN_ON_ONCE() or something with a > > similar comment in the code. > > Well, the WARN_ON_ONCE is in xfs_bio_add_page() when it detects an > invalid alignment. So we'll get this warning on production systems > as well as debug/test systems. I think that's the important case to > catch, because misalignment will result in silent data corruption... > That's in the subsequent patch that I thought was being dropped (or more accurately, deferred until Christoph has a chance to try and get it fixed in the block layer..)..? 
Brian > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com
On Thu 22-08-19 16:26:42, Vlastimil Babka wrote: > On 8/22/19 3:17 PM, Dave Chinner wrote: > > On Thu, Aug 22, 2019 at 02:19:04PM +0200, Vlastimil Babka wrote: > >> On 8/22/19 2:07 PM, Dave Chinner wrote: > >> > On Thu, Aug 22, 2019 at 01:14:30PM +0200, Vlastimil Babka wrote: > >> > > >> > No, the problem is this (using kmalloc as a general term for > >> > allocation, whether it be kmalloc, kmem_cache_alloc, alloc_page, etc) > >> > > >> > some random kernel code > >> > kmalloc(GFP_KERNEL) > >> > reclaim > >> > PF_MEMALLOC > >> > shrink_slab > >> > xfs_inode_shrink > >> > XFS_ILOCK > >> > xfs_buf_allocate_memory() > >> > kmalloc(GFP_KERNEL) > >> > > >> > And so locks on inodes in reclaim are seen below reclaim. Then > >> > somewhere else we have: > >> > > >> > some high level read-only xfs code like readdir > >> > XFS_ILOCK > >> > xfs_buf_allocate_memory() > >> > kmalloc(GFP_KERNEL) > >> > reclaim > >> > > >> > And this one throws false positive lockdep warnings because we > >> > called into reclaim with XFS_ILOCK held and GFP_KERNEL alloc > >> > >> OK, and what exactly makes this positive a false one? Why can't it continue like > >> the first example where reclaim leads to another XFS_ILOCK, thus deadlock? > > > > Because above reclaim we only have operations being done on > > referenced inodes, and below reclaim we only have unreferenced > > inodes. We never lock the same inode both above and below reclaim > > at the same time. > > > > IOWs, an operation above reclaim cannot see, access or lock > > unreferenced inodes, except in inode write clustering, and that uses > > trylocks so cannot deadlock with reclaim. > > > > An operation below reclaim cannot see, access or lock referenced > > inodes except during inode write clustering, and that uses trylocks > > so cannot deadlock with code above reclaim. > > Thanks for elaborating. Perhaps lockdep experts (not me) would know how to > express that. 
If not possible, then replacing GFP_NOFS with __GFP_NOLOCKDEP > should indeed suppress the warning, while allowing FS reclaim. This was certainly my hope to happen when introducing __GFP_NOLOCKDEP. I couldn't have done the second step because that requires a deep understanding of the code in question which is beyond my capacity. It seems we still haven't found a brave soul to start converting GFP_NOFS to __GFP_NOLOCKDEP. And it would be really appreciated. Thanks.
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index edcf393c8fd9..ec693c0fdcff 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -30,30 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } while (1); } -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) + +/* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence + * we need to tell memory reclaim that we are in such a context via + * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here + * and potentially deadlocking. + */ +static void * +__kmem_vmalloc(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; - gfp_t lflags; - - trace_kmem_alloc_large(size, flags, _RET_IP_); - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; + gfp_t lflags = kmem_flags_convert(flags); - /* - * __vmalloc() will allocate data pages and auxillary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context - * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering - * the filesystem here and potentially deadlocking. - */ if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) @@ -62,6 +56,39 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags) return ptr; } +/* + * Same as kmem_alloc_large, except we guarantee a 512 byte aligned buffer is + * returned. vmalloc always returns an aligned region. 
+ */ +void * +kmem_alloc_io(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_io(size, flags, _RET_IP_); + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) { + if (!((long)ptr & 511)) + return ptr; + kfree(ptr); + } + return __kmem_vmalloc(size, flags); +} + +void * +kmem_alloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + trace_kmem_alloc_large(size, flags, _RET_IP_); + + ptr = kmem_alloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return __kmem_vmalloc(size, flags); +} + void * kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 267655acd426..423a1fa0fcd6 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -59,6 +59,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_alloc_io(size_t, xfs_km_flags_t); extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca0849043f54..7bd1f31febfc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -353,7 +353,7 @@ xfs_buf_allocate_memory( */ size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { - bp->b_addr = kmem_alloc(size, KM_NOFS); + bp->b_addr = kmem_alloc_io(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -368,7 +368,7 @@ xfs_buf_allocate_memory( } bp->b_offset = offset_in_page(bp->b_addr); bp->b_pages = bp->b_page_array; - bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_pages[0] = kmem_to_page(bp->b_addr); bp->b_page_count = 1; bp->b_flags |= _XBF_KMEM; return 0; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7fc3c1ad36bc..1830d185d7fc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1415,7 +1415,7 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = 
kmem_alloc_large(log->l_iclog_size, + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, KM_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13d1d3e95b88..b4a6a008986b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -125,7 +125,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), KM_MAYFAIL); } /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8bb8b4704a00..eaae275ed430 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3604,6 +3604,7 @@ DEFINE_EVENT(xfs_kmem_class, name, \ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ TP_ARGS(size, flags, caller_ip)) DEFINE_KMEM_EVENT(kmem_alloc); +DEFINE_KMEM_EVENT(kmem_alloc_io); DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc);