[v2,5/6] 9p: Use a slab for allocating requests

Message ID 20180711210225.19730-6-willy@infradead.org (mailing list archive)
State New, archived

Commit Message

Matthew Wilcox July 11, 2018, 9:02 p.m. UTC
Replace the custom batch allocation with a slab.  Use an IDR to store
pointers to the active requests instead of an array.  We don't try to
handle P9_NOTAG specially; the IDR will happily shrink all the way back
once the TVERSION call has completed.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 include/net/9p/client.h |  51 ++-------
 net/9p/client.c         | 239 +++++++++++++++-------------------------
 net/9p/mod.c            |   7 +-
 3 files changed, 101 insertions(+), 196 deletions(-)

Comments

Dominique Martinet July 18, 2018, 10:05 a.m. UTC | #1
+Cc Greg, I could use your opinion on this if you have a moment.

Matthew Wilcox wrote on Wed, Jul 11, 2018:
> Replace the custom batch allocation with a slab.  Use an IDR to store
> pointers to the active requests instead of an array.  We don't try to
> handle P9_NOTAG specially; the IDR will happily shrink all the way back
> once the TVERSION call has completed.

Sorry for coming back to this patch now, I just noticed something that's
actually probably a fairly big hit on performance...

While the slab is just as good as the array for the request itself, this
makes every single request allocate "fcalls" every time instead of
reusing a cached allocation.
The default msize is 8k and these allocs probably are fairly efficient,
but some transports like RDMA allow increasing this up to 1MB... And
doing this kind of allocation twice for every packet is going to be very
slow.
(not that hogging megabytes of memory was a great practice either!)


One thing is that the buffers are all going to be the same size for a
given client (.... except virtio zc buffers, I wonder what I'm missing
or why that didn't blow up before?)
Err, that aside I was going to ask if we couldn't find a way to keep a
pool of these somehow.
Ideally we'd put them in another slab so they could be reclaimed if
necessary, but the size can vary from one client to another. Can we
create a kmem_cache object per client? The KMEM_CACHE macro is not very
flexible, so I don't think that is encouraged... :)
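
Something like this minimal sketch is roughly what I have in mind:
kmem_cache_create() does take an explicit size, unlike the KMEM_CACHE()
macro which derives it from a struct type (fcall_cache would be a
hypothetical new field in struct p9_client):

static int p9_fcall_cache_create(struct p9_client *c)
{
	char name[32];

	/* One cache per client, sized to the negotiated msize; the slab
	 * core duplicates the name string on current kernels. */
	snprintf(name, sizeof(name), "9p-fcall-%u", c->msize);
	c->fcall_cache = kmem_cache_create(name, c->msize, 0, 0, NULL);
	return c->fcall_cache ? 0 : -ENOMEM;
}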


It's a shame because I really like that patch. I'll try to find time to
run some light benchmarks with varying msizes eventually, but I'm not sure
when I'll find time for that... Hopefully before the 4.19 merge window!


>  /**
> - * p9_tag_alloc - lookup/allocate a request by tag
> - * @c: client session to lookup tag within
> - * @tag: numeric id for transaction
> - *
> - * this is a simple array lookup, but will grow the
> - * request_slots as necessary to accommodate transaction
> - * ids which did not previously have a slot.
> - *
> - * this code relies on the client spinlock to manage locks, its
> - * possible we should switch to something else, but I'd rather
> - * stick with something low-overhead for the common case.
> + * p9_req_alloc - Allocate a new request.
> + * @c: Client session.
> + * @type: Transaction type.
> + * @max_size: Maximum packet size for this request.
>   *
> + * Context: Process context.
> + * Return: Pointer to new request.
>   */
> -
>  static struct p9_req_t *
> -p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
> +p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  {
> -	unsigned long flags;
> -	int row, col;
> -	struct p9_req_t *req;
> +	struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
>  	int alloc_msize = min(c->msize, max_size);
> +	int tag;
>  
> -	/* This looks up the original request by tag so we know which
> -	 * buffer to read the data into */
> -	tag++;
> -
> -	if (tag >= c->max_tag) {
> -		spin_lock_irqsave(&c->lock, flags);
> -		/* check again since original check was outside of lock */
> -		while (tag >= c->max_tag) {
> -			row = (tag / P9_ROW_MAXTAG);
> -			c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
> -					sizeof(struct p9_req_t), GFP_ATOMIC);
> -
> -			if (!c->reqs[row]) {
> -				pr_err("Couldn't grow tag array\n");
> -				spin_unlock_irqrestore(&c->lock, flags);
> -				return ERR_PTR(-ENOMEM);
> -			}
> -			for (col = 0; col < P9_ROW_MAXTAG; col++) {
> -				req = &c->reqs[row][col];
> -				req->status = REQ_STATUS_IDLE;
> -				init_waitqueue_head(&req->wq);
> -			}
> -			c->max_tag += P9_ROW_MAXTAG;
> -		}
> -		spin_unlock_irqrestore(&c->lock, flags);
> -	}
> -	row = tag / P9_ROW_MAXTAG;
> -	col = tag % P9_ROW_MAXTAG;
> +	if (!req)
> +		return NULL;
>  
> -	req = &c->reqs[row][col];
> -	if (!req->tc)
> -		req->tc = p9_fcall_alloc(alloc_msize);
> -	if (!req->rc)
> -		req->rc = p9_fcall_alloc(alloc_msize);
> +	req->tc = p9_fcall_alloc(alloc_msize);
> +	req->rc = p9_fcall_alloc(alloc_msize);
>  	if (!req->tc || !req->rc)
> -		goto grow_failed;
> +		goto free;
>  
>  	p9pdu_reset(req->tc);
>  	p9pdu_reset(req->rc);
> -
> -	req->tc->tag = tag-1;
>  	req->status = REQ_STATUS_ALLOC;
> +	init_waitqueue_head(&req->wq);
> +	INIT_LIST_HEAD(&req->req_list);
> +
> +	idr_preload(GFP_NOFS);
> +	spin_lock_irq(&c->lock);
> +	if (type == P9_TVERSION)
> +		tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
> +				GFP_NOWAIT);
> +	else
> +		tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
> +	req->tc->tag = tag;
> +	spin_unlock_irq(&c->lock);
> +	idr_preload_end();
> +	if (tag < 0)
> +		goto free;
Greg Kurz July 23, 2018, 11:52 a.m. UTC | #2
On Wed, 18 Jul 2018 12:05:54 +0200
Dominique Martinet <asmadeus@codewreck.org> wrote:

> +Cc Greg, I could use your opinion on this if you have a moment.
> 

Hi Dominique,

The patch is quite big and I'm not sure I can find time to review it
carefully, but I'll try to help anyway.

> Matthew Wilcox wrote on Wed, Jul 11, 2018:
> > Replace the custom batch allocation with a slab.  Use an IDR to store
> > pointers to the active requests instead of an array.  We don't try to
> > handle P9_NOTAG specially; the IDR will happily shrink all the way back
> > once the TVERSION call has completed.  
> 
> Sorry for coming back to this patch now, I just noticed something that's
> actually probably a fairly big hit on performance...
> 
> While the slab is just as good as the array for the request itself, this
> makes every single request allocate "fcalls" every time instead of
> reusing a cached allocation.
> The default msize is 8k and these allocs probably are fairly efficient,
> but some transports like RDMA allow increasing this up to 1MB... And

It can be even bigger with virtio:

#define VIRTQUEUE_NUM	128

	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),

On a typical ppc64 server class setup with 64KB pages, this is nearly 8MB.

> doing this kind of allocation twice for every packet is going to be very
> slow.
> (not that hogging megabytes of memory was a great practice either!)
> 
> 
> One thing is that the buffers are all going to be the same size for a
> given client (.... except virtio zc buffers, I wonder what I'm missing
> or why that didn't blow up before?)

ZC allocates a 4KB buffer, which is more than enough to hold the 7-byte 9P
header and the "dqd" part of all messages that may use ZC, i.e., 16 bytes.
So I'm not sure I see what could blow up.
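
For reference, that 4KB is P9_ZC_HDR_SZ in net/9p/client.c; the
arithmetic, as a minimal sketch (P9_ZC_DQD_SZ is a name made up for
illustration):

#include <linux/build_bug.h>

#define P9_HDRSZ	7	/* size[4] type[1] tag[2] */
#define P9_ZC_DQD_SZ	16	/* fid[4] offset[8] count[4] */
#define P9_ZC_HDR_SZ	4096	/* fixed ZC buffer size */

static inline void p9_zc_size_check(void)
{
	/* everything a ZC message marshals fits with plenty of headroom */
	BUILD_BUG_ON(P9_HDRSZ + P9_ZC_DQD_SZ > P9_ZC_HDR_SZ);
}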

> Err, that aside I was going to ask if we couldn't find a way to keep a
> pool of these somehow.
> Ideally we'd put them in another slab so they could be reclaimed if
> necessary, but the size can vary from one client to another. Can we
> create a kmem_cache object per client? The KMEM_CACHE macro is not very
> flexible, so I don't think that is encouraged... :)
> 
> 
> It's a shame because I really like that patch. I'll try to find time to
> run some light benchmarks with varying msizes eventually, but I'm not sure
> when I'll find time for that... Hopefully before the 4.19 merge window!
> 

Yeah, the open-coded cache we have now really obfuscates things.

Maybe have a per-client kmem_cache object for non-ZC requests with
size msize [*], and a global kmem_cache object for ZC requests with
fixed size P9_ZC_HDR_SZ.

[*] the server can require a smaller msize during version negotiation,
    so maybe we should change the kmem_cache object in this case.
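
In code, the allocation side of that scheme could be as simple as this
rough sketch (both fcall_cache and p9_zc_cache are hypothetical):

static void *p9_fcall_buf_alloc(struct p9_client *c, int alloc_msize)
{
	/* ZC requests always ask for the fixed P9_ZC_HDR_SZ... */
	if (alloc_msize == P9_ZC_HDR_SZ)
		return kmem_cache_alloc(p9_zc_cache, GFP_NOFS);
	/* ...everything else is capped at the negotiated msize */
	return kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
}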

Cheers,

--
Greg

> 
> >  /**
> > - * p9_tag_alloc - lookup/allocate a request by tag
> > - * @c: client session to lookup tag within
> > - * @tag: numeric id for transaction
> > - *
> > - * this is a simple array lookup, but will grow the
> > - * request_slots as necessary to accommodate transaction
> > - * ids which did not previously have a slot.
> > - *
> > - * this code relies on the client spinlock to manage locks, its
> > - * possible we should switch to something else, but I'd rather
> > - * stick with something low-overhead for the common case.
> > + * p9_req_alloc - Allocate a new request.
> > + * @c: Client session.
> > + * @type: Transaction type.
> > + * @max_size: Maximum packet size for this request.
> >   *
> > + * Context: Process context.
> > + * Return: Pointer to new request.
> >   */
> > -
> >  static struct p9_req_t *
> > -p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
> > +p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
> >  {
> > -	unsigned long flags;
> > -	int row, col;
> > -	struct p9_req_t *req;
> > +	struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
> >  	int alloc_msize = min(c->msize, max_size);
> > +	int tag;
> >  
> > -	/* This looks up the original request by tag so we know which
> > -	 * buffer to read the data into */
> > -	tag++;
> > -
> > -	if (tag >= c->max_tag) {
> > -		spin_lock_irqsave(&c->lock, flags);
> > -		/* check again since original check was outside of lock */
> > -		while (tag >= c->max_tag) {
> > -			row = (tag / P9_ROW_MAXTAG);
> > -			c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
> > -					sizeof(struct p9_req_t), GFP_ATOMIC);
> > -
> > -			if (!c->reqs[row]) {
> > -				pr_err("Couldn't grow tag array\n");
> > -				spin_unlock_irqrestore(&c->lock, flags);
> > -				return ERR_PTR(-ENOMEM);
> > -			}
> > -			for (col = 0; col < P9_ROW_MAXTAG; col++) {
> > -				req = &c->reqs[row][col];
> > -				req->status = REQ_STATUS_IDLE;
> > -				init_waitqueue_head(&req->wq);
> > -			}
> > -			c->max_tag += P9_ROW_MAXTAG;
> > -		}
> > -		spin_unlock_irqrestore(&c->lock, flags);
> > -	}
> > -	row = tag / P9_ROW_MAXTAG;
> > -	col = tag % P9_ROW_MAXTAG;
> > +	if (!req)
> > +		return NULL;
> >  
> > -	req = &c->reqs[row][col];
> > -	if (!req->tc)
> > -		req->tc = p9_fcall_alloc(alloc_msize);
> > -	if (!req->rc)
> > -		req->rc = p9_fcall_alloc(alloc_msize);
> > +	req->tc = p9_fcall_alloc(alloc_msize);
> > +	req->rc = p9_fcall_alloc(alloc_msize);
> >  	if (!req->tc || !req->rc)
> > -		goto grow_failed;
> > +		goto free;
> >  
> >  	p9pdu_reset(req->tc);
> >  	p9pdu_reset(req->rc);
> > -
> > -	req->tc->tag = tag-1;
> >  	req->status = REQ_STATUS_ALLOC;
> > +	init_waitqueue_head(&req->wq);
> > +	INIT_LIST_HEAD(&req->req_list);
> > +
> > +	idr_preload(GFP_NOFS);
> > +	spin_lock_irq(&c->lock);
> > +	if (type == P9_TVERSION)
> > +		tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
> > +				GFP_NOWAIT);
> > +	else
> > +		tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
> > +	req->tc->tag = tag;
> > +	spin_unlock_irq(&c->lock);
> > +	idr_preload_end();
> > +	if (tag < 0)
> > +		goto free;  
>
Dominique Martinet July 23, 2018, 12:25 p.m. UTC | #3
Greg Kurz wrote on Mon, Jul 23, 2018:
> The patch is quite big and I'm not sure I can find time to review it
> carefully, but I'll try to help anyway.

No worry, thanks for this already.

> > Sorry for coming back to this patch now, I just noticed something that's
> > actually probably a fairly big hit on performance...
> > 
> > While the slab is just as good as the array for the request itself, this
> > makes every single request allocate "fcalls" every time instead of
> > reusing a cached allocation.
> > The default msize is 8k and these allocs probably are fairly efficient,
> > but some transports like RDMA allow increasing this up to 1MB... And
> 
> It can be even bigger with virtio:
> 
> #define VIRTQUEUE_NUM	128
> 
> 	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
> 
> On a typical ppc64 server class setup with 64KB pages, this is nearly 8MB.

I don't think I'll be able to test 64KB pages, and it's "just" 500k with
4K pages so I'll go with IB.
I just finished reinstalling my IB-enabled VMs, now to get some iops
test running (dbench maybe) and I'll get some figures to be able to play
with different models and evaluate the impact of these.

> > One thing is that the buffers are all going to be the same size for a
> > given client (.... except virtio zc buffers, I wonder what I'm missing
> > or why that didn't blow up before?)
> 
> ZC allocates a 4KB buffer, which is more than enough to hold the 7-byte 9P
> header and the "dqd" part of all messages that may use ZC, i.e., 16 bytes.
> So I'm not sure I see what could blow up.

ZC requests won't blow up, but from what I can see with the current
(old) request cache array, if a ZC request has a not-yet-used tag it'll
allocate a new 4k buffer, and then if a normal request uses that tag it'll
get the 4k buffer instead of an msize-sized one.
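
A capacity check on reuse would have avoided that; something along these
lines against the removed code quoted above (a sketch, not a proposed
fix):

	req = &c->reqs[row][col];
	/* don't hand out a recycled buffer that is too small */
	if (req->rc && req->rc->capacity < alloc_msize) {
		kfree(req->rc);
		req->rc = NULL;
	}
	if (!req->rc)
		req->rc = p9_fcall_alloc(alloc_msize);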

On the client side the request would be posted with req->rc->capacity,
which would correctly be 4k, but I'm not sure what would happen if QEMU
tries to write more than the given size to that request?

> > > It's a shame because I really like that patch. I'll try to find time to
> > > run some light benchmarks with varying msizes eventually, but I'm not sure
> > > when I'll find time for that... Hopefully before the 4.19 merge window!
> > 
> 
> Yeah, the open-coded cache we have now really obfuscates things.
> 
> Maybe have a per-client kmem_cache object for non-ZC requests with
> size msize [*], and a global kmem_cache object for ZC requests with
> fixed size P9_ZC_HDR_SZ.
> 
> [*] the server can require a smaller msize during version negotiation,
>     so maybe we should change the kmem_cache object in this case.

Yeah, if we're going to want to accommodate non-power-of-two buffers, I
think we'll need a separate kmem_cache for them.
The ZC requests could be made exactly 4k and these could come from
regular kmalloc just fine; it looks like trying to create a cache of
that size would just return the same cache used by kmalloc anyway, so
it's probably easier to fall back to kmalloc if the requested alloc size
doesn't match what we were hoping for.
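
I.e. the buffer allocation would look roughly like this (sketch;
fcall_cache is the hypothetical per-client cache, fc the p9_fcall being
filled in):

	if (c->fcall_cache && alloc_msize == c->msize)
		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
	else
		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
	fc->capacity = alloc_msize;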

I'll try to get figures for various approaches before the merge window
for 4.19 starts; it's getting closer though...
Greg Kurz July 23, 2018, 2:24 p.m. UTC | #4
On Mon, 23 Jul 2018 14:25:31 +0200
Dominique Martinet <asmadeus@codewreck.org> wrote:

> Greg Kurz wrote on Mon, Jul 23, 2018:
> > The patch is quite big and I'm not sure I can find time to review it
> > carefully, but I'll try to help anyway.  
> 
> No worry, thanks for this already.
> 
> > > Sorry for coming back to this patch now, I just noticed something that's
> > > actually probably a fairly big hit on performance...
> > > 
> > > While the slab is just as good as the array for the request itself, this
> > > makes every single request allocate "fcalls" every time instead of
> > > reusing a cached allocation.
> > > The default msize is 8k and these allocs probably are fairly efficient,
> > > but some transports like RDMA allow increasing this up to 1MB... And
> > 
> > It can be even bigger with virtio:
> > 
> > #define VIRTQUEUE_NUM	128
> > 
> > 	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
> > 
> > On a typical ppc64 server class setup with 64KB pages, this is nearly 8MB.  
> 
> I don't think I'll be able to test 64KB pages, and it's "just" 500k with
> 4K pages so I'll go with IB.
> I just finished reinstalling my IB-enabled VMs, now to get some iops
> test running (dbench maybe) and I'll get some figures to be able to play
> with different models and evaluate the impact of these.
> 

Sounds like a good plan.

> > > One thing is that the buffers are all going to be the same size for a
> > > given client (.... except virtio zc buffers, I wonder what I'm missing
> > > or why that didn't blow up before?)  
> > 
> > ZC allocates a 4KB buffer, which is more than enough to hold the 7-byte 9P
> > header and the "dqd" part of all messages that may use ZC, i.e., 16 bytes.
> > So I'm not sure I see what could blow up.
> 
> ZC requests won't blow up, but from what I can see with the current
> (old) request cache array, if a ZC request has a not-yet-used tag it'll
> allocate a new 4k buffer, and then if a normal request uses that tag it'll
> get the 4k buffer instead of an msize-sized one.
> 

Indeed.

> On the client side the request would be posted with req->rc->capacity,
> which would correctly be 4k, but I'm not sure what would happen if QEMU
> tries to write more than the given size to that request?
> 

QEMU would detect that the sg list doesn't have enough capacity.
Old QEMUs used to return a RERROR or RLERROR message with ENOBUFS
in this case. But it didn't make sense to hijack the 9P protocol, which
is transport agnostic, to report misconfigured buffers. Especially since,
in the worst case, we might not even have enough space for the error
response... So, since QEMU 2.10, we put the virtio 9p device into a
broken state instead, i.e., inoperative until it gets reset.


I guess this situation was never hit because server responses mostly
need less than 4KB...

> > > It's a shame because I really like that patch. I'll try to find time to
> > > run some light benchmarks with varying msizes eventually, but I'm not sure
> > > when I'll find time for that... Hopefully before the 4.19 merge window!
> > >   
> > 
> > Yeah, the open-coded cache we have now really obfuscates things.
> > 
> > Maybe have a per-client kmem_cache object for non-ZC requests with
> > size msize [*], and a global kmem_cache object for ZC requests with
> > fixed size P9_ZC_HDR_SZ.
> > 
> > [*] the server can require a smaller msize during version negotiation,
> >     so maybe we should change the kmem_cache object in this case.  
> 
> Yeah, if we're going to want to accommodate non-power-of-two buffers, I
> think we'll need a separate kmem_cache for them.
> The ZC requests could be made exactly 4k and these could come from
> regular kmalloc just fine; it looks like trying to create a cache of
> that size would just return the same cache used by kmalloc anyway, so
> it's probably easier to fall back to kmalloc if the requested alloc size
> doesn't match what we were hoping for.
> 

You're right, ZC requests could rely on kmalloc() directly.

> I'll try to get figures for various approaches before the merge window
> for 4.19 starts; it's getting closer though...
> 

Great, thanks for your effort, but we've been living with this code
since the beginning. If this misses the 4.19 merge window, we'll have
more time to validate the approach and polish the fix for 4.20 :)

Cheers,

--
Greg
Dominique Martinet July 30, 2018, 9:31 a.m. UTC | #5
Dominique Martinet wrote on Mon, Jul 23, 2018:
> I'll try to get figures for various approaches before the merge window
> for 4.19 starts, it's getting closer though...

Here are some numbers, with v4.18-rc7 + current test tree (my 9p-next) as
a base.


For context, I'm running on VMs that bind their cores to CPUs on the
host (32 cores) and have a Mellanox Connect-IB card through SR-IOV.

The server is nfs-ganesha, serving a tmpfs filesystem on a second VM
(on a different host).

Mounting with msize=$((1024*1024))

My main problem with this test is that the client has way too much
memory and it's mostly pristine after a recent boot, so any kind of
memory pressure won't show up here.
If someone knows how to fragment memory quickly, I'll take that and rerun
the tests :)


I've changed my mind from mdtest to a simple ior; as I'm testing on
trans=rdma there's no difference, and I'm more familiar with ior options.

I ran two workloads:
 - 32 processes, file per process, 512k at a time writing a total of
32GB (1GB per file), repeated 10 times
 - 32 processes, file per process, 32 bytes at a time writing a total of
16MB (512k per file), repeated 10 times.

The first test gives a proper impression of the throughput the systems
can sustain, and the results are pretty much what I was expecting
for the setup; the second test is purely a latency test (how long does
it take to send 512k RPCs?).


I ran almost all of these tests with KASAN enabled in the VMs the first
time, so I'm leaving the KASAN results at the end for reference...


Overall I'm rather happy with the result. Without KASAN the overhead of
the patch isn't negligible (~6%), but I'd say it's acceptable for
correctness, and with an extra two patches making the suggested changes
(rounding down the alloc size to not include the struct overhead, and a
separate kmem cache) it gets down to 0.5%, which is quite good, I
think.
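
For reference, the shape of the "round alloc" change is to embed the
fcall headers in the request itself, so the only variable-size
allocation left is the data buffer at exactly msize bytes (a sketch, not
the exact patch):

struct p9_req_t {
	int status;
	int t_err;
	wait_queue_head_t wq;
	struct p9_fcall tc;	/* embedded, no longer a pointer */
	struct p9_fcall rc;
	void *aux;
	struct list_head req_list;
};

static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
			 int alloc_msize)
{
	fc->sdata = kmalloc(alloc_msize, GFP_NOFS); /* or a per-client cache */
	if (!fc->sdata)
		return -ENOMEM;
	fc->capacity = alloc_msize;
	return 0;
}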

I'll send the two patches to the list shortly. The first one is rather
huge even if it's logically a trivial change, so part of me wants to get
it merged quickly to not have to deal with rebases... ;)


With KASAN, well, it certainly does more things but I hope
performance-critical systems don't have it enabled in the first place.



Raw results:

 * Base = 4.18-rc7 + queued patches, without request cache rework
- "Big" I/Os:
Summary of all tests:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5842.40    5751.58    5793.53      23.93    5.65606 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6098.92    6018.63    6064.30      20.00    5.40348 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           2.10       1.91       2.00       0.05    8.01074 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.27       1.07       1.15       0.06   13.93901 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> 512k / 8.01074 = 65.4k req/s


 * Base + patch as submitted
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5844.84    5665.32    5787.15      48.94    5.66261 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6082.24    6039.62    6057.14      12.50    5.40983 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
                             
- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           1.95       1.82       1.88       0.04    8.50453 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.18       1.07       1.14       0.03   14.04634 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> 512k / 8.50453 = 61.6k req/s


 * Base + patch as submitted + moving the header into req so the
allocation is "round" as suggested by Matthew
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5861.79    5680.99    5795.71      48.84    5.65424 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6098.54    6037.55    6067.80      19.39    5.40036 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           1.98       1.81       1.90       0.06    8.43521 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.19       1.08       1.13       0.03   14.11709 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> 62.2k req/s

 * Base + patches submitted + round alloc + kmem cache in the client
struct
- "Big" I/Os
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5859.51    5747.64    5808.22      34.81    5.64186 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6087.90    6037.03    6063.98      15.14    5.40374 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           2.07       1.95       1.99       0.03    8.05362 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.22       1.11       1.16       0.04   13.75312 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> 65.1k req/s

 * Base + patches submitted + kmem cache in the client struct (kind of
similar to testing an 'odd' msize like 1.001MB)
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5883.03    5725.30    5811.58      45.22    5.63874 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6090.29    6015.23    6062.49      25.93    5.40514 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           2.07       1.89       1.98       0.05    8.10028 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.23       1.05       1.12       0.05   14.25607 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> 64.7k req/s




Raw results with KASAN:
 * Base = 4.18-rc7 + queued patches, without request cache rework
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5790.03    5705.32    5749.69      27.63    5.69922 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6095.11    6007.29    6066.50      26.26    5.40157 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           1.63       1.53       1.58       0.03   10.10286 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.43       1.19       1.31       0.07   12.27704 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0


 * Base + patch as submitted
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5773.60    5673.92    5729.01      29.63    5.71982 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6097.96    6006.50    6059.40      26.74    5.40790 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           1.15       1.08       1.12       0.02   14.32230 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.18       1.06       1.10       0.04   14.51172 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0


 * Base + patch as submitted + moving the header into req so the
allocation is "round" as suggested by Matthew
- "Big" I/Os:
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5878.75    5709.74    5798.96      57.12    5.65122 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6089.83    6039.75    6072.64      14.78    5.39604 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0

- "Small" I/Os
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           1.33       1.26       1.29       0.02   12.38185 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            1.18       1.08       1.15       0.03   13.90525 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0


 * Base + patches submitted + round alloc + kmem cache in the client
struct
- "Big" I/Os
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write        5816.89    5729.58    5775.02      26.71    5.67422 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0
read         6087.33    6032.62    6058.69      16.73    5.40847 0 32 32 10 1 0 1 0 0 1 1073741824 524288 34359738368 POSIX 0


- "Small" I/Os
Operation   Max(MiB)   Min(MiB)  Mean(MiB)     StdDev    Mean(s) Test# #Tasks tPN reps fPP reord reordoff reordrand seed segcnt blksiz xsize aggsize API RefNum
write           0.87       0.85       0.86       0.01   18.59584 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
read            0.89       0.86       0.88       0.01   18.26275 0 32 32 10 1 0 1 0 0 1 524288 32 16777216 POSIX 0
 -> I'm not sure why it's so different, actually; the cache doesn't turn
up in /proc/slabinfo, so I figure it got merged with kmalloc-1024, in
which case there should be no difference? And this turned out fine
without KASAN...

Patch

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 0fa0fbab33b0..a4dc42c53d18 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -64,22 +64,15 @@  enum p9_trans_status {
 
 /**
  * enum p9_req_status_t - status of a request
- * @REQ_STATUS_IDLE: request slot unused
  * @REQ_STATUS_ALLOC: request has been allocated but not sent
  * @REQ_STATUS_UNSENT: request waiting to be sent
  * @REQ_STATUS_SENT: request sent to server
  * @REQ_STATUS_RCVD: response received from server
  * @REQ_STATUS_FLSHD: request has been flushed
  * @REQ_STATUS_ERROR: request encountered an error on the client side
- *
- * The @REQ_STATUS_IDLE state is used to mark a request slot as unused
- * but use is actually tracked by the idpool structure which handles tag
- * id allocation.
- *
  */
 
 enum p9_req_status_t {
-	REQ_STATUS_IDLE,
 	REQ_STATUS_ALLOC,
 	REQ_STATUS_UNSENT,
 	REQ_STATUS_SENT,
@@ -92,24 +85,12 @@  enum p9_req_status_t {
  * struct p9_req_t - request slots
  * @status: status of this request slot
  * @t_err: transport error
- * @flush_tag: tag of request being flushed (for flush requests)
  * @wq: wait_queue for the client to block on for this request
  * @tc: the request fcall structure
  * @rc: the response fcall structure
  * @aux: transport specific data (provided for trans_fd migration)
  * @req_list: link for higher level objects to chain requests
- *
- * Transport use an array to track outstanding requests
- * instead of a list.  While this may incurr overhead during initial
- * allocation or expansion, it makes request lookup much easier as the
- * tag id is a index into an array.  (We use tag+1 so that we can accommodate
- * the -1 tag for the T_VERSION request).
- * This also has the nice effect of only having to allocate wait_queues
- * once, instead of constantly allocating and freeing them.  Its possible
- * other resources could benefit from this scheme as well.
- *
  */
-
 struct p9_req_t {
 	int status;
 	int t_err;
@@ -117,40 +98,26 @@  struct p9_req_t {
 	struct p9_fcall *tc;
 	struct p9_fcall *rc;
 	void *aux;
-
 	struct list_head req_list;
 };
 
 /**
  * struct p9_client - per client instance state
- * @lock: protect @fidlist
+ * @lock: protect @fids and @reqs
  * @msize: maximum data size negotiated by protocol
- * @dotu: extension flags negotiated by protocol
  * @proto_version: 9P protocol version to use
  * @trans_mod: module API instantiated with this client
+ * @status: connection state
  * @trans: tranport instance state and API
  * @fids: All active FID handles
- * @tagpool - transaction id accounting for session
- * @reqs - 2D array of requests
- * @max_tag - current maximum tag id allocated
- * @name - node name used as client id
+ * @reqs: All active requests.
+ * @name: node name used as client id
  *
  * The client structure is used to keep track of various per-client
  * state that has been instantiated.
- * In order to minimize per-transaction overhead we use a
- * simple array to lookup requests instead of a hash table
- * or linked list.  In order to support larger number of
- * transactions, we make this a 2D array, allocating new rows
- * when we need to grow the total number of the transactions.
- *
- * Each row is 256 requests and we'll support up to 256 rows for
- * a total of 64k concurrent requests per session.
- *
- * Bugs: duplicated data and potentially unnecessary elements.
  */
-
 struct p9_client {
-	spinlock_t lock; /* protect client structure */
+	spinlock_t lock;
 	unsigned int msize;
 	unsigned char proto_version;
 	struct p9_trans_module *trans_mod;
@@ -170,10 +137,7 @@  struct p9_client {
 	} trans_opts;
 
 	struct idr fids;
-
-	struct p9_idpool *tagpool;
-	struct p9_req_t *reqs[P9_ROW_MAXTAG];
-	int max_tag;
+	struct idr reqs;
 
 	char name[__NEW_UTS_LEN + 1];
 };
@@ -279,4 +243,7 @@  struct p9_fid *p9_client_xattrwalk(struct p9_fid *, const char *, u64 *);
 int p9_client_xattrcreate(struct p9_fid *, const char *, u64, int);
 int p9_client_readlink(struct p9_fid *fid, char **target);
 
+int p9_client_init(void);
+void p9_client_exit(void);
+
 #endif /* NET_9P_CLIENT_H */
diff --git a/net/9p/client.c b/net/9p/client.c
index bc8aba6b5ce0..ecf5dd269f4c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -241,132 +241,103 @@  static struct p9_fcall *p9_fcall_alloc(int alloc_msize)
 	return fc;
 }
 
+static struct kmem_cache *p9_req_cache;
+
 /**
- * p9_tag_alloc - lookup/allocate a request by tag
- * @c: client session to lookup tag within
- * @tag: numeric id for transaction
- *
- * this is a simple array lookup, but will grow the
- * request_slots as necessary to accommodate transaction
- * ids which did not previously have a slot.
- *
- * this code relies on the client spinlock to manage locks, its
- * possible we should switch to something else, but I'd rather
- * stick with something low-overhead for the common case.
+ * p9_req_alloc - Allocate a new request.
+ * @c: Client session.
+ * @type: Transaction type.
+ * @max_size: Maximum packet size for this request.
  *
+ * Context: Process context.
+ * Return: Pointer to new request.
  */
-
 static struct p9_req_t *
-p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
+p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
 {
-	unsigned long flags;
-	int row, col;
-	struct p9_req_t *req;
+	struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
 	int alloc_msize = min(c->msize, max_size);
+	int tag;
 
-	/* This looks up the original request by tag so we know which
-	 * buffer to read the data into */
-	tag++;
-
-	if (tag >= c->max_tag) {
-		spin_lock_irqsave(&c->lock, flags);
-		/* check again since original check was outside of lock */
-		while (tag >= c->max_tag) {
-			row = (tag / P9_ROW_MAXTAG);
-			c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
-					sizeof(struct p9_req_t), GFP_ATOMIC);
-
-			if (!c->reqs[row]) {
-				pr_err("Couldn't grow tag array\n");
-				spin_unlock_irqrestore(&c->lock, flags);
-				return ERR_PTR(-ENOMEM);
-			}
-			for (col = 0; col < P9_ROW_MAXTAG; col++) {
-				req = &c->reqs[row][col];
-				req->status = REQ_STATUS_IDLE;
-				init_waitqueue_head(&req->wq);
-			}
-			c->max_tag += P9_ROW_MAXTAG;
-		}
-		spin_unlock_irqrestore(&c->lock, flags);
-	}
-	row = tag / P9_ROW_MAXTAG;
-	col = tag % P9_ROW_MAXTAG;
+	if (!req)
+		return NULL;
 
-	req = &c->reqs[row][col];
-	if (!req->tc)
-		req->tc = p9_fcall_alloc(alloc_msize);
-	if (!req->rc)
-		req->rc = p9_fcall_alloc(alloc_msize);
+	req->tc = p9_fcall_alloc(alloc_msize);
+	req->rc = p9_fcall_alloc(alloc_msize);
 	if (!req->tc || !req->rc)
-		goto grow_failed;
+		goto free;
 
 	p9pdu_reset(req->tc);
 	p9pdu_reset(req->rc);
-
-	req->tc->tag = tag-1;
 	req->status = REQ_STATUS_ALLOC;
+	init_waitqueue_head(&req->wq);
+	INIT_LIST_HEAD(&req->req_list);
+
+	idr_preload(GFP_NOFS);
+	spin_lock_irq(&c->lock);
+	if (type == P9_TVERSION)
+		tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
+				GFP_NOWAIT);
+	else
+		tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
+	req->tc->tag = tag;
+	spin_unlock_irq(&c->lock);
+	idr_preload_end();
+	if (tag < 0)
+		goto free;
 
 	return req;
 
-grow_failed:
-	pr_err("Couldn't grow tag array\n");
+free:
 	kfree(req->tc);
 	kfree(req->rc);
-	req->tc = req->rc = NULL;
+	kmem_cache_free(p9_req_cache, req);
 	return ERR_PTR(-ENOMEM);
 }
 
 /**
- * p9_tag_lookup - lookup a request by tag
- * @c: client session to lookup tag within
- * @tag: numeric id for transaction
+ * p9_tag_lookup - Look up a request by tag.
+ * @c: Client session.
+ * @tag: Transaction ID.
  *
+ * Context: Any context.
+ * Return: A request, or %NULL if there is no request with that tag.
  */
-
 struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
 {
-	int row, col;
-
-	/* This looks up the original request by tag so we know which
-	 * buffer to read the data into */
-	tag++;
-
-	if(tag >= c->max_tag) 
-		return NULL;
+	struct p9_req_t *req;
 
-	row = tag / P9_ROW_MAXTAG;
-	col = tag % P9_ROW_MAXTAG;
+	rcu_read_lock();
+	req = idr_find(&c->reqs, tag);
+	/*
+	 * There's no refcount on the req; a malicious server could cause
+	 * us to dereference a NULL pointer
+	 */
+	rcu_read_unlock();
 
-	return &c->reqs[row][col];
+	return req;
 }
 EXPORT_SYMBOL(p9_tag_lookup);
 
 /**
- * p9_tag_init - setup tags structure and contents
- * @c:  v9fs client struct
- *
- * This initializes the tags structure for each client instance.
+ * p9_free_req - Free a request.
+ * @c: Client session.
+ * @r: Request to free.
  *
+ * Context: Any context.
  */
-
-static int p9_tag_init(struct p9_client *c)
+static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
 {
-	int err = 0;
+	unsigned long flags;
+	u16 tag = r->tc->tag;
 
-	c->tagpool = p9_idpool_create();
-	if (IS_ERR(c->tagpool)) {
-		err = PTR_ERR(c->tagpool);
-		goto error;
-	}
-	err = p9_idpool_get(c->tagpool); /* reserve tag 0 */
-	if (err < 0) {
-		p9_idpool_destroy(c->tagpool);
-		goto error;
-	}
-	c->max_tag = 0;
-error:
-	return err;
+	p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
+	spin_lock_irqsave(&c->lock, flags);
+	idr_remove(&c->reqs, tag);
+	spin_unlock_irqrestore(&c->lock, flags);
+	kfree(r->tc);
+	kfree(r->rc);
+	kmem_cache_free(p9_req_cache, r);
 }
 
 /**
@@ -378,52 +349,15 @@  static int p9_tag_init(struct p9_client *c)
  */
 static void p9_tag_cleanup(struct p9_client *c)
 {
-	int row, col;
-
-	/* check to insure all requests are idle */
-	for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
-		for (col = 0; col < P9_ROW_MAXTAG; col++) {
-			if (c->reqs[row][col].status != REQ_STATUS_IDLE) {
-				p9_debug(P9_DEBUG_MUX,
-					 "Attempting to cleanup non-free tag %d,%d\n",
-					 row, col);
-				/* TODO: delay execution of cleanup */
-				return;
-			}
-		}
-	}
-
-	if (c->tagpool) {
-		p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
-		p9_idpool_destroy(c->tagpool);
-	}
+	struct p9_req_t *req;
+	int id;
 
-	/* free requests associated with tags */
-	for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
-		for (col = 0; col < P9_ROW_MAXTAG; col++) {
-			kfree(c->reqs[row][col].tc);
-			kfree(c->reqs[row][col].rc);
-		}
-		kfree(c->reqs[row]);
+	rcu_read_lock();
+	idr_for_each_entry(&c->reqs, req, id) {
+		pr_info("Tag %d still in use\n", id);
+		p9_free_req(c, req);
 	}
-	c->max_tag = 0;
-}
-
-/**
- * p9_free_req - free a request and clean-up as necessary
- * c: client state
- * r: request to release
- *
- */
-
-static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
-{
-	int tag = r->tc->tag;
-	p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
-
-	r->status = REQ_STATUS_IDLE;
-	if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
-		p9_idpool_put(tag, c->tagpool);
+	rcu_read_unlock();
 }
 
 /**
@@ -690,7 +624,7 @@  static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
 					      int8_t type, int req_size,
 					      const char *fmt, va_list ap)
 {
-	int tag, err;
+	int err;
 	struct p9_req_t *req;
 
 	p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
@@ -703,24 +637,17 @@  static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
 	if ((c->status == BeginDisconnect) && (type != P9_TCLUNK))
 		return ERR_PTR(-EIO);
 
-	tag = P9_NOTAG;
-	if (type != P9_TVERSION) {
-		tag = p9_idpool_get(c->tagpool);
-		if (tag < 0)
-			return ERR_PTR(-ENOMEM);
-	}
-
-	req = p9_tag_alloc(c, tag, req_size);
+	req = p9_tag_alloc(c, type, req_size);
 	if (IS_ERR(req))
 		return req;
 
 	/* marshall the data */
-	p9pdu_prepare(req->tc, tag, type);
+	p9pdu_prepare(req->tc, req->tc->tag, type);
 	err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
 	if (err)
 		goto reterr;
 	p9pdu_finalize(c, req->tc);
-	trace_9p_client_req(c, type, tag);
+	trace_9p_client_req(c, type, req->tc->tag);
 	return req;
 reterr:
 	p9_free_req(c, req);
@@ -1018,14 +945,11 @@  struct p9_client *p9_client_create(const char *dev_name, char *options)
 
 	spin_lock_init(&clnt->lock);
 	idr_init(&clnt->fids);
-
-	err = p9_tag_init(clnt);
-	if (err < 0)
-		goto free_client;
+	idr_init(&clnt->reqs);
 
 	err = parse_opts(options, clnt);
 	if (err < 0)
-		goto destroy_tagpool;
+		goto free_client;
 
 	if (!clnt->trans_mod)
 		clnt->trans_mod = v9fs_get_default_trans();
@@ -1034,7 +958,7 @@  struct p9_client *p9_client_create(const char *dev_name, char *options)
 		err = -EPROTONOSUPPORT;
 		p9_debug(P9_DEBUG_ERROR,
 			 "No transport defined or default transport\n");
-		goto destroy_tagpool;
+		goto free_client;
 	}
 
 	p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
@@ -1057,8 +981,6 @@  struct p9_client *p9_client_create(const char *dev_name, char *options)
 	clnt->trans_mod->close(clnt);
 put_trans:
 	v9fs_put_trans(clnt->trans_mod);
-destroy_tagpool:
-	p9_idpool_destroy(clnt->tagpool);
 free_client:
 	kfree(clnt);
 	return ERR_PTR(err);
@@ -2274,3 +2196,14 @@  int p9_client_readlink(struct p9_fid *fid, char **target)
 	return err;
 }
 EXPORT_SYMBOL(p9_client_readlink);
+
+int __init p9_client_init(void)
+{
+	p9_req_cache = KMEM_CACHE(p9_req_t, 0);
+	return p9_req_cache ? 0 : -ENOMEM;
+}
+
+void __exit p9_client_exit(void)
+{
+	kmem_cache_destroy(p9_req_cache);
+}
diff --git a/net/9p/mod.c b/net/9p/mod.c
index eb9777f05755..0da56d6af73b 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -171,7 +171,11 @@  void v9fs_put_trans(struct p9_trans_module *m)
  */
 static int __init init_p9(void)
 {
-	int ret = 0;
+	int ret;
+
+	ret = p9_client_init();
+	if (ret)
+		return ret;
 
 	p9_error_init();
 	pr_info("Installing 9P2000 support\n");
@@ -190,6 +194,7 @@  static void __exit exit_p9(void)
 	pr_info("Unloading 9P2000 support\n");
 
 	p9_trans_fd_exit();
+	p9_client_exit();
 }
 
 module_init(init_p9)