
[v3,6/7] 9p/trans_virtio: support larger msize values

Message ID 8119d4d93a39758075bb83730dcb571f5d071af6.1632327421.git.linux_oss@crudebyte.com (mailing list archive)
State Not Applicable
Delegated to: Netdev Maintainers
Series net/9p: remove msize limit in virtio transport

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers warning 2 maintainers not CCed: kuba@kernel.org davem@davemloft.net
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()
WARNING: networking block comments don't use an empty /* line, use /* Comment...
WARNING: quoted string split across lines
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Christian Schoenebeck Sept. 22, 2021, 4 p.m. UTC
The virtio transport supports by default a 9p 'msize' of up to
approximately 500 kB. This patch adds support for larger 'msize'
values by resizing the amount of scatter/gather lists if required.

Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com>
---
 net/9p/trans_virtio.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

Comments

Nikolay Kichukov Nov. 20, 2021, 11:20 a.m. UTC | #1
Thanks for the patches and sorry for top-posting.

I've tested them on Gentoo GNU/Linux, kernel 5.15.3, amd64 architecture
on both the guest and the KVM host.

The v3 patches from this series have been applied to both the host and
the guest kernel. The guest kernel is compiled with clang and the host
kernel with gcc-11.

The host also received the qemu patches:
https://github.com/cschoenebeck/qemu/commit/04a7f9e55e0930b87805f7c97851eea4610e78fc
https://github.com/cschoenebeck/qemu/commit/b565bccb00afe8b73d529bbc3a38682996dac5c7
https://github.com/cschoenebeck/qemu/commit/669ced09b3b6070d478acce51810591b78ab0ccd

Qemu version on the host is 6.0.0-r54.

When the client mounts the share via virtio, the requested msize is
10485760 or 104857600; however, the mount ends up with msize=507904 as
reported by the /proc filesystem. This is less than the previous
maximum value.

However, I do see an improvement in throughput, perhaps due to the QEMU
patches or some combination of the two.

In addition to the above, when the kernel on the guest boots and loads
9pfs support, the attached memory allocation failure trace is generated.

Is anyone else seeing similar and was anybody able to get msize set to
10MB via virtio protocol with these patches?

Thank you,
-Nikolay

On Wed, 2021-09-22 at 18:00 +0200, Christian Schoenebeck wrote:
> The virtio transport supports by default a 9p 'msize' of up to
> approximately 500 kB. This patch adds support for larger 'msize'
> values by resizing the amount of scatter/gather lists if required.
> 
> Signed-off-by: Christian Schoenebeck <linux_oss@crudebyte.com>
> ---
>  net/9p/trans_virtio.c | 61 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
> index e478a34326f1..147ebf647a95 100644
> --- a/net/9p/trans_virtio.c
> +++ b/net/9p/trans_virtio.c
> @@ -203,6 +203,31 @@ static struct virtqueue_sg *vq_sg_alloc(unsigned int nsgl)
>         return vq_sg;
>  }
>  
> +/**
> + * vq_sg_resize - resize passed virtqueue scatter/gather lists to the passed
> + * amount of lists
> + * @_vq_sg: scatter/gather lists to be resized
> + * @nsgl: new amount of scatter/gather lists
> + */
> +static int vq_sg_resize(struct virtqueue_sg **_vq_sg, unsigned int nsgl)
> +{
> +       struct virtqueue_sg *vq_sg;
> +
> +       BUG_ON(!_vq_sg || !nsgl);
> +       vq_sg = *_vq_sg;
> +       if (vq_sg->nsgl == nsgl)
> +               return 0;
> +
> +       /* lazy resize implementation for now */
> +       vq_sg = vq_sg_alloc(nsgl);
> +       if (!vq_sg)
> +               return -ENOMEM;
> +
> +       kfree(*_vq_sg);
> +       *_vq_sg = vq_sg;
> +       return 0;
> +}
> +
>  /**
>   * p9_virtio_close - reclaim resources of a channel
>   * @client: client instance
> @@ -774,6 +799,10 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
>         struct virtio_chan *chan;
>         int ret = -ENOENT;
>         int found = 0;
> +#if !defined(CONFIG_ARCH_NO_SG_CHAIN)
> +       size_t npages;
> +       size_t nsgl;
> +#endif
>  
>         if (devname == NULL)
>                 return -EINVAL;
> @@ -796,6 +825,38 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
>                 return ret;
>         }
>  
> +       /*
> +        * if user supplied an 'msize' option that's larger than what this
> +        * transport supports by default, then try to allocate more sg lists
> +        */
> +       if (client->msize > client->trans_maxsize) {
> +#ifdef CONFIG_ARCH_NO_SG_CHAIN
> +               pr_info("limiting 'msize' to %d because architecture does not "
> +                       "support chained scatter gather lists\n",
> +                       client->trans_maxsize);
> +#else
> +               npages = DIV_ROUND_UP(client->msize, PAGE_SIZE);
> +               if (npages > chan->p9_max_pages) {
> +                       npages = chan->p9_max_pages;
> +                       pr_info("limiting 'msize' as it would exceed the max. "
> +                               "of %lu pages allowed on this system\n",
> +                               chan->p9_max_pages);
> +               }
> +               nsgl = DIV_ROUND_UP(npages, SG_USER_PAGES_PER_LIST);
> +               if (nsgl > chan->vq_sg->nsgl) {
> +                       /*
> +                        * if resize fails, no big deal, then just
> +                        * continue with default msize instead
> +                        */
> +                       if (!vq_sg_resize(&chan->vq_sg, nsgl)) {
> +                               client->trans_maxsize =
> +                                       PAGE_SIZE *
> +                                       ((nsgl * SG_USER_PAGES_PER_LIST) - 3);
> +                       }
> +               }
> +#endif /* CONFIG_ARCH_NO_SG_CHAIN */
> +       }
> +
>         client->trans = (void *)chan;
>         client->status = Connected;
>         chan->client = client;
[    1.527981] 9p: Installing v9fs 9p2000 file system support
[    1.528173] ------------[ cut here ]------------
[    1.528174] WARNING: CPU: 1 PID: 791 at mm/page_alloc.c:5356 __alloc_pages+0x1ed/0x290
[    1.528179] Modules linked in: 9p 9pnet_virtio virtio_net net_failover failover virtio_console 9pnet virtio_balloon efivarfs
[    1.528182] CPU: 1 PID: 791 Comm: mount Not tainted 5.15.3-gentoo-x86_64 #1
[    1.528184] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[    1.528185] RIP: 0010:__alloc_pages+0x1ed/0x290
[    1.528187] Code: ef 44 89 f6 e8 34 13 00 00 31 ed e9 5b ff ff ff 81 e3 3f ff ff ff 89 d9 89 cb 83 e3 f7 a9 00 00 00 10 0f 44 d9 e9 64 fe ff ff <0f> 0b 31 ed e9 42 ff ff ff 41 89 df 65 8b 05 08 65 9f 69 41 81 cf
[    1.528188] RSP: 0018:ffffb666405039e8 EFLAGS: 00010246
[    1.528189] RAX: c491259349f3ef00 RBX: 0000000000040c40 RCX: 0000000000000000
[    1.528190] RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000040c40
[    1.528191] RBP: 000000000000000c R08: 0000000000000090 R09: fffffa3cc0111640
[    1.528192] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000c40
[    1.528192] R13: ffff9fc944459c88 R14: 000000000000000c R15: 0000000000000c40
[    1.528194] FS:  00007ff620e57740(0000) GS:ffff9fc9bbc40000(0000) knlGS:0000000000000000
[    1.528195] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    1.528196] CR2: 00007fcfc14e8000 CR3: 0000000003358000 CR4: 0000000000350ea0
[    1.528198] Call Trace:
[    1.528201]  <TASK>
[    1.528202]  kmalloc_order+0x39/0xf0
[    1.528204]  kmalloc_order_trace+0x13/0x70
[    1.528205]  __kmalloc+0x1fc/0x2b0
[    1.528208]  p9_fcall_init+0x3d/0x60 [9pnet]
[    1.528210]  p9_client_prepare_req+0x82/0x2b0 [9pnet]
[    1.528212]  p9_client_rpc+0x80/0x350 [9pnet]
[    1.528214]  ? p9_virtio_create+0x253/0x2b0 [9pnet_virtio]
[    1.528216]  ? kfree+0x260/0x350
[    1.528217]  p9_client_version+0x60/0x1d0 [9pnet]
[    1.528219]  p9_client_create+0x3b4/0x460 [9pnet]
[    1.528221]  v9fs_session_init+0xab/0x760 [9p]
[    1.528222]  ? user_path_at_empty+0x7b/0x90
[    1.528224]  ? kmem_cache_alloc_trace+0x188/0x260
[    1.528226]  v9fs_mount+0x43/0x2d0 [9p]
[    1.528227]  legacy_get_tree.llvm.612377983374665620+0x22/0x40
[    1.528230]  vfs_get_tree+0x21/0xb0
[    1.528232]  path_mount+0x70d/0xcd0
[    1.528234]  __x64_sys_mount+0x148/0x1c0
[    1.528236]  do_syscall_64+0x4a/0xb0
[    1.528238]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[    1.528240] RIP: 0033:0x7ff620f92d5a
[    1.528241] Code: 48 8b 0d 11 c1 0b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d de c0 0b 00 f7 d8 64 89 01 48
[    1.528242] RSP: 002b:00007ffe4675f9f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
[    1.528244] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ff620f92d5a
[    1.528244] RDX: 0000000000541f90 RSI: 0000000000541f40 RDI: 0000000000541f20
[    1.528245] RBP: 0000000000000c00 R08: 00000000005421f0 R09: 00007ffe4675e790
[    1.528246] R10: 0000000000000c00 R11: 0000000000000246 R12: 000000000053d4f0
[    1.528246] R13: 0000000000541f20 R14: 0000000000541f90 R15: 00007ff62109dcf4
[    1.528247]  </TASK>
[    1.528248] ---[ end trace 84f05b2aa35f19b3 ]---

[   90.894853] 9pnet: Limiting 'msize' to 507904 as this is the maximum supported by transport virtio
Dominique Martinet Nov. 20, 2021, 11:45 a.m. UTC | #2
(Thanks for restarting this thread, this had totally slipped out of my
mind...)

Nikolay Kichukov wrote on Sat, Nov 20, 2021 at 12:20:35PM +0100:
> When the client mounts the share via virtio, requested msize is:
> 10485760 or 104857600
> 
> however the mount succeeds with:
> msize=507904 in the end as per the /proc filesystem. This is less than
> the previous maximum value.

(Not sure about this, I'll test these patches tomorrow, but since
something failed I'm not surprised you have less than what you could
have here: what do you get with a more reasonable value like 1M first?)

> In addition to the above, when the kernel on the guest boots and loads
> 9pfs support, the attached memory allocation failure trace is generated.
> 
> Is anyone else seeing similar and was anybody able to get msize set to
> 10MB via virtio protocol with these patches?

I don't think the kernel would ever allow this with the given code, as
the "common" part of 9p is not smart enough to use scatter-gather and
tries to do a big allocation (almost) the size of the msize:

---
        clnt->fcall_cache =
                kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize,
                                           0, 0, P9_HDRSZ + 4,
                                           clnt->msize - (P9_HDRSZ + 4),
                                           NULL);

...
		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
---
So in practice, you will be capped at 2MB as that is the biggest the
slab will be able to hand over in a single chunk.

It'll also make allocation failures quite likely as soon as the system
has had some uptime (depending on your workload; look at /proc/buddyinfo
to see whether your machines normally have 2MB chunks available), so I
would really not recommend running with buffers bigger than e.g. 512k on
real workloads -- it looks great in benchmarks, especially as it's on its
own slab, so as long as you're doing a lot of requests it will dish out
buffers fast, but it'll likely be unreliable over time.
(Oh, and we allocate two of these per request...)
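
To make that last point concrete, here is a rough sketch -- not verbatim
kernel code, the helper name is made up -- of what every request
effectively does today: both the TX and the RX buffer come from the same
msize-sized fcall cache.

---
/*
 * Sketch only: illustrates "two msize-sized buffers per request",
 * the helper name is invented and details differ from the real code.
 */
static int p9_req_buffers_init(struct p9_client *c, struct p9_req_t *req)
{
	req->tc.sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); /* TX */
	if (!req->tc.sdata)
		return -ENOMEM;
	req->rc.sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); /* RX */
	if (!req->rc.sdata) {
		kmem_cache_free(c->fcall_cache, req->tc.sdata);
		return -ENOMEM;
	}
	req->tc.capacity = c->msize;
	req->rc.capacity = c->msize;
	return 0;
}
---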


The next step to support large buffers really would be splitting that
buffer to:
 1/ not allocate huge buffers for small metadata ops, e.g. an open call
certainly doesn't need to allocate 10MB of memory
 2/ support splitting the buffer into some scatter-gather array

Ideally we'd only allocate on an as-needed basis: most of the protocol
calls bound how much data is supposed to come back, and we know how much
we want to send (it's a format string actually, but we can put an upper
bound on it quite easily), so one would need to adjust all protocol calls
to pass this info to p9_client_rpc/p9_client_zc_rpc so it only allocates
buffers as required, if necessary in multiple reasonably-sized segments
(I'd love 2MB hugepages-backed folios...), and have all transports use
these buffers.
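
As a very rough sketch of that idea (the function and the string cap
below are made up for illustration, this is not existing 9p code): walk
the request's format string once to get a byte bound, and let
p9_client_rpc()/p9_client_zc_rpc() allocate only that much.

---
/* Sketch only: p9_fmt_upper_bound() and P9_ILLUSTRATIVE_STR_MAX are
 * invented names; they just show how a format string could be turned
 * into an allocation bound.
 */
#define P9_ILLUSTRATIVE_STR_MAX	256	/* arbitrary cap for 's' strings */

static size_t p9_fmt_upper_bound(const char *fmt)
{
	size_t n = P9_HDRSZ;			/* size[4] type[1] tag[2] */

	for (; *fmt; fmt++) {
		switch (*fmt) {
		case 'b': n += 1; break;	/* u8  */
		case 'w': n += 2; break;	/* u16 */
		case 'd': n += 4; break;	/* u32 */
		case 'q': n += 8; break;	/* u64 */
		case 's': n += 2 + P9_ILLUSTRATIVE_STR_MAX; break;
		default:  return 0;		/* unknown: fall back to msize */
		}
	}
	return n;
}
---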

I have a rough idea of how to do all this but honestly less than 0 time
for that, so I'm happy to give advice or review any patch, but it's going
to be a lot of work that stands in the way of really big IOs.



> [    1.527981] 9p: Installing v9fs 9p2000 file system support
> [    1.528173] ------------[ cut here ]------------
> [    1.528174] WARNING: CPU: 1 PID: 791 at mm/page_alloc.c:5356 __alloc_pages+0x1ed/0x290


This warning is exactly what I was saying about the allocation cap:
you've requested an allocation that was bigger than the maximum
__alloc_pages is willing to provide (MAX_ORDER, 11, so 2MB as I was saying).
Christian Schoenebeck Nov. 20, 2021, 2:46 p.m. UTC | #3
On Saturday, 20 November 2021 12:45:09 CET Dominique Martinet wrote:
> (Thanks for restarting this thread, this had totally slipped out of my
> mind...)

Hi guys,

I have (more or less) silently been working on these patches on all ends in 
the meantime. If desired, I'll try to find some time next week to summarize 
the current status of this overall effort and what still needs to be done.

> Nikolay Kichukov wrote on Sat, Nov 20, 2021 at 12:20:35PM +0100:
> > When the client mounts the share via virtio, requested msize is:
> > 10485760 or 104857600
> > 
> > however the mount succeeds with:
> > msize=507904 in the end as per the /proc filesystem. This is less than
> > the previous maximum value.
> 
> (Not sure about this, I'll test these patches tomorrow, but since
> something failed I'm not surprised you have less than what you could
> have here: what do you get with a more reasonable value like 1M first?)

The highest 'msize' value possible for me with this particular version of the 
kernel patches is 4186212 (so slightly below 4MB).

Some benchmarks, linear reading a 12 GB file:

msize    average      notes

8 kB     52.0 MB/s    default msize of Linux kernel <v5.15
128 kB   624.8 MB/s   default msize of Linux kernel >=v5.15
512 kB   1961 MB/s    current max. msize with any Linux kernel <=v5.15
1 MB     2551 MB/s    this msize would violate current virtio specs
2 MB     2521 MB/s    this msize would violate current virtio specs
4 MB     2628 MB/s    planned milestone

That does not mean it already makes sense to use these patches in this version 
as is to increase overall performance, but the numbers already suggest that 
increasing msize can improve performance on large sequential I/O. There are 
still some things to do though to keep other usage patterns from slowing down 
with rising msize, which I will describe in a separate email.

I also have an experimental version of kernel patches and QEMU that goes as 
high as msize=128MB. But that's a highly hacked version that copies buffers 
back and forth between the 9p client level and the virtio level, with 
intentionally large buffer sizes on every 9p message type. This was solely 
meant as a stress test, i.e. to see whether it was possible to go as high as 
virtio's theoretical protocol limit in the first place, and whether it was 
stable. This stress test was not about performance at all. And yes, I had it 
running with 128MB for weeks without issues (except for being very slow, of 
course, due to the hacks used).

> > In addition to the above, when the kernel on the guest boots and loads
> > 9pfs support, the attached memory allocation failure trace is generated.
> > 
> > Is anyone else seeing similar and was anybody able to get msize set to
> > 10MB via virtio protocol with these patches?
> 
> I don't think the kernel would ever allow this with the given code, as
> the "common" part of 9p is not smart enough to use scatter-gather and
> tries to do a big allocation (almost) the size of the msize:
> 
> ---
>         clnt->fcall_cache =
>                 kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize,
>                                            0, 0, P9_HDRSZ + 4,
>                                            clnt->msize - (P9_HDRSZ + 4),
>                                            NULL);
> 
> ...
> 		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> ---
> So in practice, you will be capped at 2MB as that is the biggest the
> slab will be able to hand over in a single chunk.

I did not encounter a 2MB limit here. But kmalloc() clearly has a 4MB limit, 
so trying an msize larger than 4MB inevitably causes a memory allocation 
error. In my tests this allocation error would always happen immediately at 
mount time, causing an instant kernel oops.
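
For reference, a back-of-the-envelope calculation of that kmalloc() ceiling
(the constants are assumptions for a typical x86-64 build, check the actual
kernel config):

---
/*
 * Assuming PAGE_SHIFT = 12 and MAX_ORDER = 11 (usual x86-64 defaults):
 *
 *   KMALLOC_MAX_SIZE = 1 << (PAGE_SHIFT + MAX_ORDER - 1)
 *                    = 1 << 22
 *                    = 4 MiB
 *
 * which lines up with the ~4 MB msize ceiling observed here.
 */
---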

> It'll also make allocation failures quite likely as soon as the system
> has had some uptime (depending on your workload, look at /proc/buddyinfo
> if your machines normally have 2MB chunks available), so I would really
> not recommend running with buffers bigger than e.g. 512k on real
> workloads -- it looks great on benchmarks, especially as it's on its own
> slab so as long as you're doing a lot of requests it will dish out
> buffers fast, but it'll likely be unreliable over time.
> (oh, and we allocate two of these per request...)
> 
> 
> The next step to support large buffers really would be splitting htat
> buffer to:
>  1/ not allocate huge buffers for small metadata ops, e.g. an open call
> certainly doesn't need to allocate 10MB of memory
>  2/ support splitting the buffer in some scattergather array
> 
> Ideally we'd only allocate on an as-need basis, most of the protocol
> calls bound how much data is supposed to come back and we know how much
> we want to send (it's a format string actually, but we can majorate it
> quite easily), so one would need to adjust all protocol calls to pass
> this info to p9_client_rpc/p9_client_zc_rpc so it only allocates buffers
> as required, if necessary in multiple reasonably-sized segments (I'd
> love 2MB hugepages-backed folios...), and have all transports use these
> buffers.

It is not that bad in terms of pending work. One major thing that needs to be 
done is to cap the majority of 9p message types to allocate only as much as 
they need, which for most message types is <8k. Right now they always simply 
kmalloc(msize), which hurts with increasing msize values. That task does not 
need many changes though.
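
A minimal sketch of that capping idea (the helper and the exact cap are made
up for illustration; only the bulk I/O messages keep the full msize):

---
/* Hypothetical helper, not existing code: */
static size_t p9_msg_alloc_size(struct p9_client *c, int8_t type)
{
	switch (type) {
	case P9_TREAD:
	case P9_RREAD:
	case P9_TWRITE:
	case P9_RWRITE:
		return c->msize;	/* bulk I/O: full negotiated msize */
	default:
		return 8 * 1024;	/* metadata ops: small fixed cap */
	}
}
---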

> I have a rough idea on how to do all this but honestly less than 0 time
> for that, so happy to give advices or review any patch, but it's going
> to be a lot of work that stand in the way of really big IOs.

Reviews of the patches would actually help a lot to bring this overall effort 
forward, though probably starting with the upcoming next version of the 
kernel patches rather than this old v3.

> > [    1.527981] 9p: Installing v9fs 9p2000 file system support
> > [    1.528173] ------------[ cut here ]------------
> > [    1.528174] WARNING: CPU: 1 PID: 791 at mm/page_alloc.c:5356
> > __alloc_pages+0x1ed/0x290
> This warning is exactly what I was saying about the allocation cap:
> you've requested an allocation that was bigger than the max __alloc_page
> is willing to provide (MAX_ORDER, 11, so 2MB as I was saying)

Best regards,
Christian Schoenebeck
Nikolay Kichukov Nov. 20, 2021, 9:28 p.m. UTC | #4
On Sat, 2021-11-20 at 15:46 +0100, Christian Schoenebeck wrote:
On Saturday, 20 November 2021 12:45:09 CET Dominique Martinet wrote:
> > (Thanks for restarting this thread, this had totally slipped out of
> > my
> > mind...)
> 
> Hi guys,
> 
> I have (more or less) silently been working on these patches on all
> ends in 
> the meantime. If desired I try to find some time next week to
> summarize 
> current status of this overall effort and what still needs to be done.

Great, I would be more than happy to test the next version of these patches.

> 
> > Nikolay Kichukov wrote on Sat, Nov 20, 2021 at 12:20:35PM +0100:
> > > When the client mounts the share via virtio, requested msize is:
> > > 10485760 or 104857600
> > > 
> > > however the mount succeeds with:
> > > msize=507904 in the end as per the /proc filesystem. This is less
> > > than
> > > the previous maximum value.
> > 
> > (Not sure about this, I'll test these patches tomorrow, but since
> > something failed I'm not surprised you have less than what you could
> > have here: what do you get with a more reasonable value like 1M
> > first?)

It worked with 1MB, I can stick to this for the time being.

Are the kernel patches supposed to be included in the KVM host kernel or
would the guest kernel suffice?

> 
> The highest 'msize' value possible for me with this particular version
> of the 
> kernel patches is 4186212 (so slightly below 4MB).
> 
> Some benchmarks, linear reading a 12 GB file:
> 
> msize    average      notes
> 
> 8 kB     52.0 MB/s    default msize of Linux kernel <v5.15
> 128 kB   624.8 MB/s   default msize of Linux kernel >=v5.15
> 512 kB   1961 MB/s    current max. msize with any Linux kernel <=v5.15
> 1 MB     2551 MB/s    this msize would violate current virtio specs
> 2 MB     2521 MB/s    this msize would violate current virtio specs
> 4 MB     2628 MB/s    planned milestone
> 
> That does not mean it already makes sense to use these patches in this
> version 
> as is to increase overall performance yet, but the numbers already
> suggest 
> that increasing msize can improve performance on large sequential I/O.
> There 
> are still some things to do though to fix other use patterns from
> slowing down 
> with rising msize, which I will describe in a separate email.
> 
> I also have an experimental version of kernel patches and QEMU that
> goes as 
> high as msize=128MB. But that's a highly hacked version that copies
> buffers 
> between 9p client level and virtio level back and forth and with
> intentional 
> large buffer sizes on every 9p message type. This was solely meant as
> a stress 
> test, i.e. whether it was possible to go as high as virtio's
> theoretical 
> protocol limit in the first place, and whether it was stable. This
> stress test 
> was not about performance at all. And yes, I had it running with 128MB
> for 
> weeks without issues (except of being very slow of course due to hacks
> used).
> 
> > > In addition to the above, when the kernel on the guest boots and
> > > loads
> > > 9pfs support, the attached memory allocation failure trace is
> > > generated.
> > > 
> > > Is anyone else seeing similar and was anybody able to get msize
> > > set to
> > > 10MB via virtio protocol with these patches?
> > 
> > I don't think the kernel would ever allow this with the given code,
> > as
> > the "common" part of 9p is not smart enough to use scatter-gather
> > and
> > tries to do a big allocation (almost) the size of the msize:
> > 
> > ---
> >         clnt->fcall_cache =
> >                 kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize,
> >                                            0, 0, P9_HDRSZ + 4,
> >                                            clnt->msize - (P9_HDRSZ + 4),
> >                                            NULL);
> > 
> > ...
> >                 fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> > ---
> > So in practice, you will be capped at 2MB as that is the biggest the
> > slab will be able to hand over in a single chunk.
> 
> I did not encounter a 2MB limit here. But kmalloc() clearly has a 4MB
> limit, 
> so when trying an msize larger than 4MB it inevitably causes a memory 
> allocation error. In my tests this allocation error would always
> happen 
> immediately at mount time causing an instant kernel oops.
> 
> > It'll also make allocation failures quite likely as soon as the
> > system
> > has had some uptime (depending on your workload, look at
> > /proc/buddyinfo
> > if your machines normally have 2MB chunks available), so I would
> > really
> > not recommend running with buffers bigger than e.g. 512k on real
> > workloads -- it looks great on benchmarks, especially as it's on its
> > own
> > slab so as long as you're doing a lot of requests it will dish out
> > buffers fast, but it'll likely be unreliable over time.
> > (oh, and we allocate two of these per request...)
> > 
> > 
> > The next step to support large buffers really would be splitting
> > htat
> > buffer to:
> >  1/ not allocate huge buffers for small metadata ops, e.g. an open
> > call
> > certainly doesn't need to allocate 10MB of memory
> >  2/ support splitting the buffer in some scattergather array
> > 
> > Ideally we'd only allocate on an as-need basis, most of the protocol
> > calls bound how much data is supposed to come back and we know how
> > much
> > we want to send (it's a format string actually, but we can majorate
> > it
> > quite easily), so one would need to adjust all protocol calls to
> > pass
> > this info to p9_client_rpc/p9_client_zc_rpc so it only allocates
> > buffers
> > as required, if necessary in multiple reasonably-sized segments (I'd
> > love 2MB hugepages-backed folios...), and have all transports use
> > these
> > buffers.
> 
> It is not that bad in sense of pending work. One major thing that
> needs to be 
> done is to cap the majority of 9p message types to allocate only as
> much as 
> they need, which is for most message types <8k. Right now they always
> simply 
> kmalloc(msize), which hurts with increasing msize values. That task
> does not 
> need many changes though.
> 
> > I have a rough idea on how to do all this but honestly less than 0
> > time
> > for that, so happy to give advices or review any patch, but it's
> > going
> > to be a lot of work that stand in the way of really big IOs.
> 
> Reviews of the patches would actually help a lot to bring this overall
> effort 
> forward, but probably rather starting with the upcoming next version
> of the 
> kernel patches, not this old v3.
> 
> > > [    1.527981] 9p: Installing v9fs 9p2000 file system support
> > > [    1.528173] ------------[ cut here ]------------
> > > [    1.528174] WARNING: CPU: 1 PID: 791 at mm/page_alloc.c:5356
> > > __alloc_pages+0x1ed/0x290
> > This warning is exactly what I was saying about the allocation cap:
> > you've requested an allocation that was bigger than the max
> > __alloc_page
> > is willing to provide (MAX_ORDER, 11, so 2MB as I was saying)

Yes, this no longer happens now that I have lowered the value of the msize
parameter.

> 
> Best regards,
> Christian Schoenebeck
> 
>

Patch

diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index e478a34326f1..147ebf647a95 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -203,6 +203,31 @@  static struct virtqueue_sg *vq_sg_alloc(unsigned int nsgl)
 	return vq_sg;
 }
 
+/**
+ * vq_sg_resize - resize passed virtqueue scatter/gather lists to the passed
+ * amount of lists
+ * @_vq_sg: scatter/gather lists to be resized
+ * @nsgl: new amount of scatter/gather lists
+ */
+static int vq_sg_resize(struct virtqueue_sg **_vq_sg, unsigned int nsgl)
+{
+	struct virtqueue_sg *vq_sg;
+
+	BUG_ON(!_vq_sg || !nsgl);
+	vq_sg = *_vq_sg;
+	if (vq_sg->nsgl == nsgl)
+		return 0;
+
+	/* lazy resize implementation for now */
+	vq_sg = vq_sg_alloc(nsgl);
+	if (!vq_sg)
+		return -ENOMEM;
+
+	kfree(*_vq_sg);
+	*_vq_sg = vq_sg;
+	return 0;
+}
+
 /**
  * p9_virtio_close - reclaim resources of a channel
  * @client: client instance
@@ -774,6 +799,10 @@  p9_virtio_create(struct p9_client *client, const char *devname, char *args)
 	struct virtio_chan *chan;
 	int ret = -ENOENT;
 	int found = 0;
+#if !defined(CONFIG_ARCH_NO_SG_CHAIN)
+	size_t npages;
+	size_t nsgl;
+#endif
 
 	if (devname == NULL)
 		return -EINVAL;
@@ -796,6 +825,38 @@  p9_virtio_create(struct p9_client *client, const char *devname, char *args)
 		return ret;
 	}
 
+	/*
+	 * if user supplied an 'msize' option that's larger than what this
+	 * transport supports by default, then try to allocate more sg lists
+	 */
+	if (client->msize > client->trans_maxsize) {
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+		pr_info("limiting 'msize' to %d because architecture does not "
+			"support chained scatter gather lists\n",
+			client->trans_maxsize);
+#else
+		npages = DIV_ROUND_UP(client->msize, PAGE_SIZE);
+		if (npages > chan->p9_max_pages) {
+			npages = chan->p9_max_pages;
+			pr_info("limiting 'msize' as it would exceed the max. "
+				"of %lu pages allowed on this system\n",
+				chan->p9_max_pages);
+		}
+		nsgl = DIV_ROUND_UP(npages, SG_USER_PAGES_PER_LIST);
+		if (nsgl > chan->vq_sg->nsgl) {
+			/*
+			 * if resize fails, no big deal, then just
+			 * continue with default msize instead
+			 */
+			if (!vq_sg_resize(&chan->vq_sg, nsgl)) {
+				client->trans_maxsize =
+					PAGE_SIZE *
+					((nsgl * SG_USER_PAGES_PER_LIST) - 3);
+			}
+		}
+#endif /* CONFIG_ARCH_NO_SG_CHAIN */
+	}
+
 	client->trans = (void *)chan;
 	client->status = Connected;
 	chan->client = client;