diff mbox

target/user: Add daynmic growing data area feature support

Message ID 1487323472-20481-1-git-send-email-lixiubo@cmss.chinamobile.com (mailing list archive)
State Superseded
Headers show

Commit Message

Xiubo Li Feb. 17, 2017, 9:24 a.m. UTC
From: Xiubo Li <lixiubo@cmss.chinamobile.com>

Currently for the TCMU, the ring buffer size is fixed to 64K cmd
area + 1M data area, and this will be bottlenecks for high iops.

The struct tcmu_cmd_entry {} size is fixed about 44 bytes without
iovec[N], and the size of struct iovec[N] is about (16 * N) bytes.

The sizeof(cmd entry) will be [44B, N *16 + 44B], and corresponding
iovec[N] size will be [0, N * 4096], so the ratio of
sizeof(cmd entry) : sizeof(entry datas) ==
(N * 16 + 44)Bytes : (N * 4096)Bytes == (N * 16)/(N * 4096) +
44/(N*4096) == 4/1024 + 11/(N * 1024).

When N is bigger, the ratio will be smaller. If N >= 1, the ratio
will be [15/1024, 4/1024), for this the ratio 15 : 1024 will be
enough. But maybe some iscsi cmds has no datas, N == 0. So the ratio
should be bigger.

For now we will increase the data area size to 1G, and the cmd area
size to 128M. The tcmu-runner should mmap() about (128M + 1G) when
running and the TCMU will dynamically grows the data area from 0 to
max 1G size.

The cmd area memory will be allocated through vmalloc(), and the data
area's blocks will be allocated individually later when needed.

The allocated data area block memory will be managed via radix tree.
For now the bitmap still be the most efficient way to search and
manage the block index, this could be update later.

Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
Signed-off-by: Jianfei Hu <hujianfei@cmss.chinamobile.com>
---
 drivers/target/target_core_user.c | 241 ++++++++++++++++++++++++++------------
 1 file changed, 167 insertions(+), 74 deletions(-)

Comments

Andy Grover Feb. 22, 2017, 8:32 p.m. UTC | #1
On 02/17/2017 01:24 AM, lixiubo@cmss.chinamobile.com wrote:
> From: Xiubo Li <lixiubo@cmss.chinamobile.com>
> 
> Currently for the TCMU, the ring buffer size is fixed to 64K cmd
> area + 1M data area, and this will be bottlenecks for high iops.

Hi Xiubo, thanks for your work.

daynmic -> dynamic

Have you benchmarked this patch and determined what kind of iops
improvement it allows? Do you see the data area reaching its
fully-allocated size?

> 
> The struct tcmu_cmd_entry {} size is fixed about 44 bytes without
> iovec[N], and the size of struct iovec[N] is about (16 * N) bytes.
> 
> The sizeof(cmd entry) will be [44B, N *16 + 44B], and corresponding
> iovec[N] size will be [0, N * 4096], so the ratio of
> sizeof(cmd entry) : sizeof(entry datas) ==
> (N * 16 + 44)Bytes : (N * 4096)Bytes == (N * 16)/(N * 4096) +
> 44/(N*4096) == 4/1024 + 11/(N * 1024).
> 
> When N is bigger, the ratio will be smaller. If N >= 1, the ratio
> will be [15/1024, 4/1024), for this the ratio 15 : 1024 will be
> enough. But maybe some iscsi cmds has no datas, N == 0. So the ratio
> should be bigger.
> 
> For now we will increase the data area size to 1G, and the cmd area
> size to 128M. The tcmu-runner should mmap() about (128M + 1G) when
> running and the TCMU will dynamically grows the data area from 0 to
> max 1G size.

Cool. This is a good approach for an initial patch but this raises
concerns about efficiently managing kernel memory usage -- the data area
grows but never shrinks, and total possible usage increases per
backstore. (What if there are 1000?) Any ideas how we could also improve
these aspects of the design? (Global TCMU data area usage limit?)

> 
> The cmd area memory will be allocated through vmalloc(), and the data
> area's blocks will be allocated individually later when needed.
> 
> The allocated data area block memory will be managed via radix tree.
> For now the bitmap still be the most efficient way to search and
> manage the block index, this could be update later.
> 
> Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
> Signed-off-by: Jianfei Hu <hujianfei@cmss.chinamobile.com>
> ---
>  drivers/target/target_core_user.c | 241 ++++++++++++++++++++++++++------------
>  1 file changed, 167 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index 2e33100..f2e3769 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -2,6 +2,7 @@
>   * Copyright (C) 2013 Shaohua Li <shli@kernel.org>
>   * Copyright (C) 2014 Red Hat, Inc.
>   * Copyright (C) 2015 Arrikto, Inc.
> + * Copyright (C) 2017 Chinamobile, Inc.
>   *
>   * This program is free software; you can redistribute it and/or modify it
>   * under the terms and conditions of the GNU General Public License,
> @@ -25,6 +26,7 @@
>  #include <linux/parser.h>
>  #include <linux/vmalloc.h>
>  #include <linux/uio_driver.h>
> +#include <linux/radix-tree.h>
>  #include <linux/stringify.h>
>  #include <linux/bitops.h>
>  #include <linux/highmem.h>
> @@ -65,12 +67,15 @@
>  
>  #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
>  
> -#define DATA_BLOCK_BITS 256
> -#define DATA_BLOCK_SIZE 4096
> +/* For cmd area, the size is fixed 64M */
> +#define CMDR_SIZE (64 * 1024 * 1024)

Commitmsg mistakenly says 128M..?

> -#define CMDR_SIZE (16 * 4096)
> +/* For data area, the size is fixed 1G */
> +#define DATA_BLOCK_BITS (256 * 1024)
> +#define DATA_BLOCK_SIZE 4096
>  #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
>  
> +/* The ring buffer size is 64M + 1G */
>  #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
>  
>  static struct device *tcmu_root_device;
> @@ -103,6 +108,7 @@ struct tcmu_dev {
>  	size_t data_size;
>  
>  	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> +	struct radix_tree_root data_blocks;
>  
>  	wait_queue_head_t wait_cmdr;
>  	/* TODO should this be a mutex? */
> @@ -120,15 +126,22 @@ struct tcmu_dev {
>  
>  #define CMDR_OFF sizeof(struct tcmu_mailbox)
>  
> +struct tcmu_data_block_index {
> +	struct list_head list;
> +	uint32_t index;
> +};

is this used anywhere? leftover from an earlier version?

> +
>  struct tcmu_cmd {
>  	struct se_cmd *se_cmd;
>  	struct tcmu_dev *tcmu_dev;
>  
> -	uint16_t cmd_id;
> +	uint32_t cmd_id;

why this? tcmu_cmd_entry_hdr has a u16 cmd_id, so making this bigger
doesn't really help anything. (natural alignment? But, compiler should
be inserting padding as needed.)

>  
>  	/* Can't use se_cmd when cleaning up expired cmds, because if
>  	   cmd has been completed then accessing se_cmd is off limits */
> -	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> +	uint32_t dbi_cnt;
> +	uint32_t dbi_cur;
> +	uint32_t *dbi;

Ah, cool. Saves tcmu_cmd from being huge, now that data_bitmap is 32MiB.

<snip>

> @@ -919,7 +999,7 @@ static int tcmu_configure_device(struct se_device *dev)
>  
>  	info->name = str;
>  
> -	udev->mb_addr = vzalloc(TCMU_RING_SIZE);
> +	udev->mb_addr = vzalloc(CMDR_SIZE);

I'm still concerned about this -- 64M vmalloc'd for cmd ring seems like
too much. (again: what if there are 1000 tcmu backstores?) and is only
needed if data area grows to max size.

<snip>

Very promising, thanks again for doing this work.

Regards -- Andy

--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li Feb. 24, 2017, 2:07 a.m. UTC | #2
>> When N is bigger, the ratio will be smaller. If N >= 1, the ratio
>> will be [15/1024, 4/1024), for this the ratio 15 : 1024 will be
>> enough. But maybe some iscsi cmds has no datas, N == 0. So the ratio
>> should be bigger.
>>
>> For now we will increase the data area size to 1G, and the cmd area
>> size to 128M. The tcmu-runner should mmap() about (128M + 1G) when
>> running and the TCMU will dynamically grows the data area from 0 to
>> max 1G size.
> Cool. This is a good approach for an initial patch but this raises
> concerns about efficiently managing kernel memory usage -- the data area
> grows but never shrinks, and total possible usage increases per
> backstore. (What if there are 1000?) Any ideas how we could also improve
> these aspects of the design? (Global TCMU data area usage limit?)
Two ways in my mind:

The first:
How about by setting a threshold cmd(SHRINK cmd), something likes
the PAD cmd, to tell the userspace runner try to shrink the memories?

When the runner get the SHRINK cmd, it will try to remmap uio0's ring
buffer(?). Then the kernel will get chance to shrink the memories....

The second:
Try to extern the data area by using /dev/uio1, we could remmap the
uio1 device when need, so it will be easy to get a chance to shrink the
memories in uio1.

Maybe these are a little ugly, are there other more effective ways ?

Thanks,

BRs
Xiubo


>> The cmd area memory will be allocated through vmalloc(), and the data
>> area's blocks will be allocated individually later when needed.
>>
>>



--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Grover Feb. 24, 2017, 10:29 p.m. UTC | #3
On 02/23/2017 06:07 PM, Xiubo Li wrote:
>> Cool. This is a good approach for an initial patch but this raises
>> concerns about efficiently managing kernel memory usage -- the data area
>> grows but never shrinks, and total possible usage increases per
>> backstore. (What if there are 1000?) Any ideas how we could also improve
>> these aspects of the design? (Global TCMU data area usage limit?)
> Two ways in my mind:
> 
> The first:
> How about by setting a threshold cmd(SHRINK cmd), something likes
> the PAD cmd, to tell the userspace runner try to shrink the memories?

Why should userspace need to know if the kernel is shrinking memory
allocated to the data area? Userspace knows about memory described in
iovecs for in-flight cmds, we shouldn't need its cooperation to free
other allocated parts of the data area.

But, We likely don't want to release memory from the data area anyways
while active, in any case. How about if we set a timer when active
commands go to zero, and then reduce data area to some minimum if no new
cmds come in before timer expires?

> When the runner get the SHRINK cmd, it will try to remmap uio0's ring
> buffer(?). Then the kernel will get chance to shrink the memories....
> 
> The second:
> Try to extern the data area by using /dev/uio1, we could remmap the
> uio1 device when need, so it will be easy to get a chance to shrink the
> memories in uio1.

Userspace should not need to remap the region in order for the kernel to
free and unmap pages from the region.

The only thing we need to watch out for is if blocks are referenced by
in-flight cmds, we can't free them or userspace will segfault. So, if we
wait until there are no in-flight cmds, then it follows that the kernel
can free whatever it wants and userspace will not segfault.

-- Andy

--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li Feb. 27, 2017, 5:59 a.m. UTC | #4
>>> Cool. This is a good approach for an initial patch but this raises
>>> concerns about efficiently managing kernel memory usage -- the data area
>>> grows but never shrinks, and total possible usage increases per
>>> backstore. (What if there are 1000?) Any ideas how we could also improve
>>> these aspects of the design? (Global TCMU data area usage limit?)
>> Two ways in my mind:
>>
>> The first:
>> How about by setting a threshold cmd(SHRINK cmd), something likes
>> the PAD cmd, to tell the userspace runner try to shrink the memories?
> Why should userspace need to know if the kernel is shrinking memory
> allocated to the data area? Userspace knows about memory described in
> iovecs for in-flight cmds, we shouldn't need its cooperation to free
> other allocated parts of the data area.
Yes.

> But, We likely don't want to release memory from the data area anyways
> while active, in any case. How about if we set a timer when active
> commands go to zero, and then reduce data area to some minimum if no new
> cmds come in before timer expires?

If I understand correctly: for example, we have 1G(as the minimum)
data area and all blocks have been allocated and mapped to runner's
vma, then we extern it to 1G + 256M as needed. When there have no
active cmds and after the timer expires, will it reduce the data area
back to 1G ? And then should it release the reduced 256M data area's
memories ?

If so, after kfree()ed the blocks' memories, it should also try to remove
all the ptes which are mapping this page(like using the try_to_umap()),
but something like try_to_umap() doesn't export for the modules.

Without ummaping the kfree()ed pages' ptes mentioned above, then
the reduced 256M vma space couldn't be reused again for the runner
process, because the runner has already do the mapping for the reduced
vma space to some old physical pages(won't trigger new page fault
again). Then there will be a hole, and the hole will be bigger and bigger.

Without ummaping the kfree()ed pages' ptes mentioned above, the
pages' reference count (page_ref_dec(), which _inc()ed in page fault)
couldn't be reduced back too.

>> When the runner get the SHRINK cmd, it will try to remmap uio0's ring
>> buffer(?). Then the kernel will get chance to shrink the memories....
>>
>> The second:
>> Try to extern the data area by using /dev/uio1, we could remmap the
>> uio1 device when need, so it will be easy to get a chance to shrink the
>> memories in uio1.
> Userspace should not need to remap the region in order for the kernel to
> free and unmap pages from the region.
>
> The only thing we need to watch out for is if blocks are referenced by
> in-flight cmds, we can't free them or userspace will segfault.
Yes, agree.

> So, if we
> wait until there are no in-flight cmds, then it follows that the kernel
> can free whatever it wants and userspace will not segfault.
So, the problem is how to ummap the kfree()ed pages' ptes.


BRs
Xiubo

> -- Andy
>



--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Grover Feb. 27, 2017, 7:32 p.m. UTC | #5
On 02/26/2017 09:59 PM, Xiubo Li wrote:
>> But, We likely don't want to release memory from the data area anyways
>> while active, in any case. How about if we set a timer when active
>> commands go to zero, and then reduce data area to some minimum if no new
>> cmds come in before timer expires?
> 
> If I understand correctly: for example, we have 1G(as the minimum)
> data area and all blocks have been allocated and mapped to runner's
> vma, then we extern it to 1G + 256M as needed. When there have no
> active cmds and after the timer expires, will it reduce the data area
> back to 1G ? And then should it release the reduced 256M data area's
> memories ?
> 
> If so, after kfree()ed the blocks' memories, it should also try to remove
> all the ptes which are mapping this page(like using the try_to_umap()),
> but something like try_to_umap() doesn't export for the modules.
> 
> Without ummaping the kfree()ed pages' ptes mentioned above, then
> the reduced 256M vma space couldn't be reused again for the runner
> process, because the runner has already do the mapping for the reduced
> vma space to some old physical pages(won't trigger new page fault
> again). Then there will be a hole, and the hole will be bigger and bigger.
> 
> Without ummaping the kfree()ed pages' ptes mentioned above, the
> pages' reference count (page_ref_dec(), which _inc()ed in page fault)
> couldn't be reduced back too.

Let's ask people who will know...

Hi linux-mm,

TCM-User (drivers/target/target_core_user.c) currently uses vmalloc()ed
memory to back a ring buffer that is mmaped by userspace.

We want to move to dynamically mapping pages into this region, and also
we'd like to unmap/free pages when idle. What's the right way to unmap?
I see unmap_mapping_range() but that mentions an underlying file, which
TCMU doesn't have. Or maybe zap_page_range()? But it's not exported.

Any advice?

Thanks in advance -- Regards -- Andy
--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Christie Feb. 27, 2017, 11:56 p.m. UTC | #6
On 02/22/2017 02:32 PM, Andy Grover wrote:
> On 02/17/2017 01:24 AM, lixiubo@cmss.chinamobile.com wrote:
>> > From: Xiubo Li <lixiubo@cmss.chinamobile.com>
>> > 
>> > Currently for the TCMU, the ring buffer size is fixed to 64K cmd
>> > area + 1M data area, and this will be bottlenecks for high iops.
> Hi Xiubo, thanks for your work.
> 
> daynmic -> dynamic
> 
> Have you benchmarked this patch and determined what kind of iops
> improvement it allows? Do you see the data area reaching its
> fully-allocated size?
> 

I tested this patch with Venky's tcmu-runner rbd aio patches, with one
10 gig iscsi session, and for pretty basic fio direct io (64 -256K
read/writes with a queue depth of 64 numjobs between 1 and 4) tests read
throughput goes from about 80 to 500 MB/s. Write throughput is pretty
low at around 150 MB/s.

I did not hit the fully allocated size. I did not drive a lot of IO though.
--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li Feb. 28, 2017, 1:22 a.m. UTC | #7
Hi Mike

Thanks verrry much for your work and test cases.


>>>> From: Xiubo Li <lixiubo@cmss.chinamobile.com>
>>>>
>>>> Currently for the TCMU, the ring buffer size is fixed to 64K cmd
>>>> area + 1M data area, and this will be bottlenecks for high iops.
>> Hi Xiubo, thanks for your work.
>>
>> daynmic -> dynamic
>>
>> Have you benchmarked this patch and determined what kind of iops
>> improvement it allows? Do you see the data area reaching its
>> fully-allocated size?
>>
> I tested this patch with Venky's tcmu-runner rbd aio patches, with one
> 10 gig iscsi session, and for pretty basic fio direct io (64 -256K
> read/writes with a queue depth of 64 numjobs between 1 and 4) tests read
> throughput goes from about 80 to 500 MB/s.
Looks nice.

> Write throughput is pretty
> low at around 150 MB/s.
What's the original write throughput without this patch? Is it also
around 80 MB/s ?

> I did not hit the fully allocated size. I did not drive a lot of IO though.
Yes, if the cmd area won't hit the fully allocated size, the data area is
also very hard to hit the fully allocated size.

And for the 64MB cmd area size is a little larger for 1GB data area.

Next, I will down it to smaller as needed.

Thanks,

BRs
Xiubo




--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li Feb. 28, 2017, 9:13 a.m. UTC | #8
>> On 02/17/2017 01:24 AM, lixiubo@cmss.chinamobile.com wrote:
>>>> From: Xiubo Li <lixiubo@cmss.chinamobile.com>
>>>>
>>>> Currently for the TCMU, the ring buffer size is fixed to 64K cmd
>>>> area + 1M data area, and this will be bottlenecks for high iops.
>> Hi Xiubo, thanks for your work.
>>
>> daynmic -> dynamic
>>
>> Have you benchmarked this patch and determined what kind of iops
>> improvement it allows? Do you see the data area reaching its
>> fully-allocated size?
>>
> I tested this patch with Venky's tcmu-runner rbd aio patches, with one
> 10 gig iscsi session, and for pretty basic fio direct io (64 -256K
> read/writes with a queue depth of 64 numjobs between 1 and 4) tests read
> throughput goes from about 80 to 500 MB/s. Write throughput is pretty
> low at around 150 MB/s.
>
> I did not hit the fully allocated size. I did not drive a lot of IO though.

How about dealing with memories shrinking in patch series followed?

As the initial patch, we could set the cmd area size to 8MB and the
data area size to 512MB. And this could work fine for most cases
without using too much memories.

On my similar test case by using VMs(low iops case) using fio, -bs=[64K,
128K, 512K, 1M] -size=20G, -iodepth 1 -numjobs=10,  the bw of read
increases from about 5200KB/s to about 6100KB/s, and the bw of write
increases from about 3000KB/s to about 3300KB/s.

While bs < 64K(from the log, the maximum of the data length is 64K),
the smaller of it the two bws will be closer.

But for all my test cases, the allocated size is far away from the full size
too.

Thanks,

BRs
Xiubo




--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Christie Feb. 28, 2017, 6:13 p.m. UTC | #9
On 02/27/2017 07:22 PM, Xiubo Li wrote:
> Hi Mike
> 
> Thanks verrry much for your work and test cases.
> 
> 
>>>>> From: Xiubo Li <lixiubo@cmss.chinamobile.com>
>>>>>
>>>>> Currently for the TCMU, the ring buffer size is fixed to 64K cmd
>>>>> area + 1M data area, and this will be bottlenecks for high iops.
>>> Hi Xiubo, thanks for your work.
>>>
>>> daynmic -> dynamic
>>>
>>> Have you benchmarked this patch and determined what kind of iops
>>> improvement it allows? Do you see the data area reaching its
>>> fully-allocated size?
>>>
>> I tested this patch with Venky's tcmu-runner rbd aio patches, with one
>> 10 gig iscsi session, and for pretty basic fio direct io (64 -256K
>> read/writes with a queue depth of 64 numjobs between 1 and 4) tests read
>> throughput goes from about 80 to 500 MB/s.
> Looks nice.
> 
>> Write throughput is pretty
>> low at around 150 MB/s.
> What's the original write throughput without this patch? Is it also
> around 80 MB/s ?

It is around 20-30 MB/s. Same fio args except using --rw=write.
--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li March 1, 2017, 1:22 a.m. UTC | #10
>>> Write throughput is pretty
>>> low at around 150 MB/s.
>> What's the original write throughput without this patch? Is it also
>> around 80 MB/s ?
> It is around 20-30 MB/s. Same fio args except using --rw=write.
Got it.

Thanks.

BRs
Xiubo


--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li March 1, 2017, 10:53 a.m. UTC | #11
>> For now we will increase the data area size to 1G, and the cmd area
>> size to 128M. The tcmu-runner should mmap() about (128M + 1G) when
>> running and the TCMU will dynamically grows the data area from 0 to
>> max 1G size.
> Cool. This is a good approach for an initial patch but this raises
> concerns about efficiently managing kernel memory usage -- the data area
> grows but never shrinks, and total possible usage increases per
> backstore. (What if there are 1000?) Any ideas how we could also improve
> these aspects of the design? (Global TCMU data area usage limit?)

Sorry for misunderstanding about this on my part before.

If we couldn't get a feasible way from mm to deal with the memories
shrinking. Maybe a global TCMU data area usage limit is a good choice:

We can limit the global physical data area size to 2G as default, and export
one sysfs to configure this as needed(such as 10G size with possible 1000
targets).

Then use one global radix tree to manage all the 2G physical pages(will grow
from 0 to 2G). Each ring buffer will search it's own data area bitmaps, and
if the current block is reusing, then we should get the old page, which has
already mapped to the runner process, from the global radix tree, or should
we get one new page(from global radix tree or system MM). After getting the
page, tcmu in kernel will use it my kmap(). Free it with kumapp() and 
insert to
global radix tree.


BRs
Xiubo






--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiubo Li March 10, 2017, 1:45 a.m. UTC | #12
On 2017年02月28日 03:32, Andy Grover wrote:
> On 02/26/2017 09:59 PM, Xiubo Li wrote:
>>> But, We likely don't want to release memory from the data area anyways
>>> while active, in any case. How about if we set a timer when active
>>> commands go to zero, and then reduce data area to some minimum if no new
>>> cmds come in before timer expires?
>> If I understand correctly: for example, we have 1G(as the minimum)
>> data area and all blocks have been allocated and mapped to runner's
>> vma, then we extern it to 1G + 256M as needed. When there have no
>> active cmds and after the timer expires, will it reduce the data area
>> back to 1G ? And then should it release the reduced 256M data area's
>> memories ?
>>
>> If so, after kfree()ed the blocks' memories, it should also try to remove
>> all the ptes which are mapping this page(like using the try_to_umap()),
>> but something like try_to_umap() doesn't export for the modules.
>>
>> Without ummaping the kfree()ed pages' ptes mentioned above, then
>> the reduced 256M vma space couldn't be reused again for the runner
>> process, because the runner has already do the mapping for the reduced
>> vma space to some old physical pages(won't trigger new page fault
>> again). Then there will be a hole, and the hole will be bigger and bigger.
>>
>> Without ummaping the kfree()ed pages' ptes mentioned above, the
>> pages' reference count (page_ref_dec(), which _inc()ed in page fault)
>> couldn't be reduced back too.
> Let's ask people who will know...
>
> Hi linux-mm,
>
> TCM-User (drivers/target/target_core_user.c) currently uses vmalloc()ed
> memory to back a ring buffer that is mmaped by userspace.
>
> We want to move to dynamically mapping pages into this region, and also
> we'd like to unmap/free pages when idle. What's the right way to unmap?
> I see unmap_mapping_range() but that mentions an underlying file, which
> TCMU doesn't have. Or maybe zap_page_range()? But it's not exported.
Hi linux-mm

For the TCMU case, the vm is not anonymous mapping. And still has
device file desc:

mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, dev->fd, 0);

If using the unmap_mapping_range() to do the dynamically maping,
is it okay ? Any other potential risks ?

Or the mentioned 'underlying file' is must one desk file ?

Thanks very much,

BRs
Xiubo



> Any advice?
>
> Thanks in advance -- Regards -- Andy



--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 2e33100..f2e3769 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -2,6 +2,7 @@ 
  * Copyright (C) 2013 Shaohua Li <shli@kernel.org>
  * Copyright (C) 2014 Red Hat, Inc.
  * Copyright (C) 2015 Arrikto, Inc.
+ * Copyright (C) 2017 Chinamobile, Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -25,6 +26,7 @@ 
 #include <linux/parser.h>
 #include <linux/vmalloc.h>
 #include <linux/uio_driver.h>
+#include <linux/radix-tree.h>
 #include <linux/stringify.h>
 #include <linux/bitops.h>
 #include <linux/highmem.h>
@@ -65,12 +67,15 @@ 
 
 #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
 
-#define DATA_BLOCK_BITS 256
-#define DATA_BLOCK_SIZE 4096
+/* For cmd area, the size is fixed 64M */
+#define CMDR_SIZE (64 * 1024 * 1024)
 
-#define CMDR_SIZE (16 * 4096)
+/* For data area, the size is fixed 1G */
+#define DATA_BLOCK_BITS (256 * 1024)
+#define DATA_BLOCK_SIZE 4096
 #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
 
+/* The ring buffer size is 64M + 1G */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
 
 static struct device *tcmu_root_device;
@@ -103,6 +108,7 @@  struct tcmu_dev {
 	size_t data_size;
 
 	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
+	struct radix_tree_root data_blocks;
 
 	wait_queue_head_t wait_cmdr;
 	/* TODO should this be a mutex? */
@@ -120,15 +126,22 @@  struct tcmu_dev {
 
 #define CMDR_OFF sizeof(struct tcmu_mailbox)
 
+struct tcmu_data_block_index {
+	struct list_head list;
+	uint32_t index;
+};
+
 struct tcmu_cmd {
 	struct se_cmd *se_cmd;
 	struct tcmu_dev *tcmu_dev;
 
-	uint16_t cmd_id;
+	uint32_t cmd_id;
 
 	/* Can't use se_cmd when cleaning up expired cmds, because if
 	   cmd has been completed then accessing se_cmd is off limits */
-	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
+	uint32_t dbi_cnt;
+	uint32_t dbi_cur;
+	uint32_t *dbi;
 
 	unsigned long deadline;
 
@@ -159,10 +172,77 @@  enum tcmu_multicast_groups {
 	.netnsok = true,
 };
 
+static int tcmu_db_get_empty_block(struct tcmu_dev *udev, void **addr)
+{
+	void *p;
+	uint32_t block;
+	int ret;
+
+	block = find_first_zero_bit(udev->data_bitmap, DATA_BLOCK_BITS);
+	set_bit(block, udev->data_bitmap);
+
+	p = radix_tree_lookup(&udev->data_blocks, block);
+	if (!p) {
+		p = kzalloc(DATA_BLOCK_SIZE, GFP_ATOMIC);
+		if (!p) {
+			clear_bit(block, udev->data_bitmap);
+			return -ENOMEM;
+		}
+
+		ret = radix_tree_insert(&udev->data_blocks, block, p);
+		if (ret) {
+			kfree(p);
+			clear_bit(block, udev->data_bitmap);
+			return ret;
+		}
+	}
+
+	*addr = p;
+	return block;
+}
+
+static void *tcmu_db_get_block_addr(struct tcmu_dev *udev, uint32_t block)
+{
+	return radix_tree_lookup(&udev->data_blocks, block);
+}
+
+#define	tcmu_cmd_reset_dbi_cur(cmd) ((cmd)->dbi_cur = 0)
+#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
+#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
+
+static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd)
+{
+	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	uint32_t bi;
+
+	for (bi = 0; bi < tcmu_cmd->dbi_cnt; bi++)
+		clear_bit(tcmu_cmd->dbi[bi], udev->data_bitmap);
+}
+
+static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
+{
+	kfree(tcmu_cmd->dbi);
+	kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+}
+
+static inline uint32_t tcmu_cmd_get_dbi_count(struct se_cmd *se_cmd)
+{
+	size_t data_length = se_cmd->data_length;
+	uint32_t dbi_cnt;
+
+	if (se_cmd->se_cmd_flags & SCF_BIDI)
+		data_length += se_cmd->t_bidi_data_sg->length;
+
+	dbi_cnt = (data_length + DATA_BLOCK_SIZE - 1) / DATA_BLOCK_SIZE;
+
+	return dbi_cnt;
+}
+
 static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 {
 	struct se_device *se_dev = se_cmd->se_dev;
 	struct tcmu_dev *udev = TCMU_DEV(se_dev);
+	uint32_t dbi_cnt = tcmu_cmd_get_dbi_count(se_cmd);
 	struct tcmu_cmd *tcmu_cmd;
 	int cmd_id;
 
@@ -174,6 +254,14 @@  static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 	tcmu_cmd->tcmu_dev = udev;
 	tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT);
 
+	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
+	tcmu_cmd->dbi_cnt = dbi_cnt;
+	tcmu_cmd->dbi = kcalloc(dbi_cnt, sizeof(uint32_t), GFP_KERNEL);
+	if (!tcmu_cmd->dbi) {
+		kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+		return NULL;
+	}
+
 	idr_preload(GFP_KERNEL);
 	spin_lock_irq(&udev->commands_lock);
 	cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0,
@@ -182,7 +270,7 @@  static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 	idr_preload_end();
 
 	if (cmd_id < 0) {
-		kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+		tcmu_free_cmd(tcmu_cmd);
 		return NULL;
 	}
 	tcmu_cmd->cmd_id = cmd_id;
@@ -244,10 +332,10 @@  static inline void new_iov(struct iovec **iov, int *iov_cnt,
 #define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size)
 
 /* offset is relative to mb_addr */
-static inline size_t get_block_offset(struct tcmu_dev *dev,
-		int block, int remaining)
+static inline size_t get_block_offset_user(struct tcmu_dev *dev,
+		int bi, int remaining)
 {
-	return dev->data_off + block * DATA_BLOCK_SIZE +
+	return dev->data_off + bi * DATA_BLOCK_SIZE +
 		DATA_BLOCK_SIZE - remaining;
 }
 
@@ -257,13 +345,14 @@  static inline size_t iov_tail(struct tcmu_dev *udev, struct iovec *iov)
 }
 
 static void alloc_and_scatter_data_area(struct tcmu_dev *udev,
-	struct scatterlist *data_sg, unsigned int data_nents,
-	struct iovec **iov, int *iov_cnt, bool copy_data)
+	struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
+	unsigned int data_nents, struct iovec **iov,
+	int *iov_cnt, bool copy_data)
 {
-	int i, block;
+	int i, bi;
 	int block_remaining = 0;
-	void *from, *to;
-	size_t copy_bytes, to_offset;
+	void *from, *to = NULL;
+	size_t copy_bytes, to_offset, offset;
 	struct scatterlist *sg;
 
 	for_each_sg(data_sg, sg, data_nents, i) {
@@ -271,22 +360,24 @@  static void alloc_and_scatter_data_area(struct tcmu_dev *udev,
 		from = kmap_atomic(sg_page(sg)) + sg->offset;
 		while (sg_remaining > 0) {
 			if (block_remaining == 0) {
-				block = find_first_zero_bit(udev->data_bitmap,
-						DATA_BLOCK_BITS);
 				block_remaining = DATA_BLOCK_SIZE;
-				set_bit(block, udev->data_bitmap);
+				bi = tcmu_db_get_empty_block(udev, &to);
+				tcmu_cmd_set_dbi(tcmu_cmd, bi);
 			}
+
 			copy_bytes = min_t(size_t, sg_remaining,
 					block_remaining);
-			to_offset = get_block_offset(udev, block,
+			to_offset = get_block_offset_user(udev, bi,
 					block_remaining);
-			to = (void *)udev->mb_addr + to_offset;
+			offset = DATA_BLOCK_SIZE - block_remaining;
+			to = (void *)(unsigned long)to + offset;
+
 			if (*iov_cnt != 0 &&
 			    to_offset == iov_tail(udev, *iov)) {
 				(*iov)->iov_len += copy_bytes;
 			} else {
 				new_iov(iov, iov_cnt, udev);
-				(*iov)->iov_base = (void __user *) to_offset;
+				(*iov)->iov_base = (void __user *)to_offset;
 				(*iov)->iov_len = copy_bytes;
 			}
 			if (copy_data) {
@@ -301,19 +392,13 @@  static void alloc_and_scatter_data_area(struct tcmu_dev *udev,
 	}
 }
 
-static void free_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd)
-{
-	bitmap_xor(udev->data_bitmap, udev->data_bitmap, cmd->data_bitmap,
-		   DATA_BLOCK_BITS);
-}
-
-static void gather_data_area(struct tcmu_dev *udev, unsigned long *cmd_bitmap,
+static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
 		struct scatterlist *data_sg, unsigned int data_nents)
 {
-	int i, block;
+	int i, bi;
 	int block_remaining = 0;
 	void *from, *to;
-	size_t copy_bytes, from_offset;
+	size_t copy_bytes, offset;
 	struct scatterlist *sg;
 
 	for_each_sg(data_sg, sg, data_nents, i) {
@@ -321,16 +406,15 @@  static void gather_data_area(struct tcmu_dev *udev, unsigned long *cmd_bitmap,
 		to = kmap_atomic(sg_page(sg)) + sg->offset;
 		while (sg_remaining > 0) {
 			if (block_remaining == 0) {
-				block = find_first_bit(cmd_bitmap,
-						DATA_BLOCK_BITS);
 				block_remaining = DATA_BLOCK_SIZE;
-				clear_bit(block, cmd_bitmap);
+				bi = tcmu_cmd_get_dbi(tcmu_cmd);
+
+				from = tcmu_db_get_block_addr(udev, bi);
 			}
 			copy_bytes = min_t(size_t, sg_remaining,
 					block_remaining);
-			from_offset = get_block_offset(udev, block,
-					block_remaining);
-			from = (void *) udev->mb_addr + from_offset;
+			offset = DATA_BLOCK_SIZE - block_remaining;
+			from = (void *)(unsigned long)from + offset;
 			tcmu_flush_dcache_range(from, copy_bytes);
 			memcpy(to + sg->length - sg_remaining, from,
 					copy_bytes);
@@ -404,7 +488,6 @@  static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
 	uint64_t cdb_off;
 	bool copy_to_data_area;
 	size_t data_length;
-	DECLARE_BITMAP(old_bitmap, DATA_BLOCK_BITS);
 
 	if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags))
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
@@ -490,28 +573,22 @@  static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
 	entry->hdr.kflags = 0;
 	entry->hdr.uflags = 0;
 
-	bitmap_copy(old_bitmap, udev->data_bitmap, DATA_BLOCK_BITS);
-
 	/* Handle allocating space from the data area */
 	iov = &entry->req.iov[0];
 	iov_cnt = 0;
 	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
 		|| se_cmd->se_cmd_flags & SCF_BIDI);
-	alloc_and_scatter_data_area(udev, se_cmd->t_data_sg,
+	alloc_and_scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
 		se_cmd->t_data_nents, &iov, &iov_cnt, copy_to_data_area);
 	entry->req.iov_cnt = iov_cnt;
 	entry->req.iov_dif_cnt = 0;
 
 	/* Handle BIDI commands */
 	iov_cnt = 0;
-	alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg,
+	alloc_and_scatter_data_area(udev, tcmu_cmd, se_cmd->t_bidi_data_sg,
 		se_cmd->t_bidi_data_nents, &iov, &iov_cnt, false);
 	entry->req.iov_bidi_cnt = iov_cnt;
 
-	/* cmd's data_bitmap is what changed in process */
-	bitmap_xor(tcmu_cmd->data_bitmap, old_bitmap, udev->data_bitmap,
-			DATA_BLOCK_BITS);
-
 	/* All offsets relative to mb_addr, not start of entry! */
 	cdb_off = CMDR_OFF + cmd_head + base_command_size;
 	memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb));
@@ -551,7 +628,7 @@  static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
 		idr_remove(&udev->commands, tcmu_cmd->cmd_id);
 		spin_unlock_irq(&udev->commands_lock);
 
-		kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+		tcmu_free_cmd(tcmu_cmd);
 	}
 
 	return ret;
@@ -562,43 +639,38 @@  static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
 	struct se_cmd *se_cmd = cmd->se_cmd;
 	struct tcmu_dev *udev = cmd->tcmu_dev;
 
+	tcmu_cmd_reset_dbi_cur(cmd);
+
 	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
 		/*
 		 * cmd has been completed already from timeout, just reclaim
 		 * data area space and free cmd
 		 */
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 
-		kmem_cache_free(tcmu_cmd_cache, cmd);
+		tcmu_free_cmd(cmd);
 		return;
 	}
 
 	if (entry->hdr.uflags & TCMU_UFLAG_UNKNOWN_OP) {
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 		pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n",
 			cmd->se_cmd);
 		entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION;
 	} else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
 		memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer,
 			       se_cmd->scsi_sense_length);
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 	} else if (se_cmd->se_cmd_flags & SCF_BIDI) {
-		DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
-
-		/* Get Data-In buffer before clean up */
-		bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
-		gather_data_area(udev, bitmap,
+		gather_data_area(udev, cmd,
 			se_cmd->t_bidi_data_sg, se_cmd->t_bidi_data_nents);
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 	} else if (se_cmd->data_direction == DMA_FROM_DEVICE) {
-		DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
-
-		bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
-		gather_data_area(udev, bitmap,
+		gather_data_area(udev, cmd,
 			se_cmd->t_data_sg, se_cmd->t_data_nents);
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 	} else if (se_cmd->data_direction == DMA_TO_DEVICE) {
-		free_data_area(udev, cmd);
+		tcmu_cmd_free_data(cmd);
 	} else if (se_cmd->data_direction != DMA_NONE) {
 		pr_warn("TCMU: data direction was %d!\n",
 			se_cmd->data_direction);
@@ -607,7 +679,7 @@  static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
 	target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status);
 	cmd->se_cmd = NULL;
 
-	kmem_cache_free(tcmu_cmd_cache, cmd);
+	tcmu_free_cmd(cmd);
 }
 
 static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
@@ -801,11 +873,19 @@  static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	offset = (vmf->pgoff - mi) << PAGE_SHIFT;
 
-	addr = (void *)(unsigned long)info->mem[mi].addr + offset;
-	if (info->mem[mi].memtype == UIO_MEM_LOGICAL)
-		page = virt_to_page(addr);
-	else
+	if (offset < udev->data_off) {
+		/* For the vmalloc()ed cmd area pages */
+		addr = (void *)(unsigned long)info->mem[mi].addr + offset;
 		page = vmalloc_to_page(addr);
+	} else {
+		/* For the dynamically growing data area pages */
+		uint32_t block;
+
+		block = (offset - udev->data_off) / DATA_BLOCK_SIZE;
+		addr = tcmu_db_get_block_addr(udev, block);
+		page = virt_to_page(addr);
+	}
+
 	get_page(page);
 	vmf->page = page;
 	return 0;
@@ -919,7 +999,7 @@  static int tcmu_configure_device(struct se_device *dev)
 
 	info->name = str;
 
-	udev->mb_addr = vzalloc(TCMU_RING_SIZE);
+	udev->mb_addr = vzalloc(CMDR_SIZE);
 	if (!udev->mb_addr) {
 		ret = -ENOMEM;
 		goto err_vzalloc;
@@ -928,8 +1008,9 @@  static int tcmu_configure_device(struct se_device *dev)
 	/* mailbox fits in first part of CMDR space */
 	udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
 	udev->data_off = CMDR_SIZE;
-	udev->data_size = TCMU_RING_SIZE - CMDR_SIZE;
+	udev->data_size = DATA_SIZE;
 
+	/* Initialise the mailbox of the ring buffer */
 	mb = udev->mb_addr;
 	mb->version = TCMU_MAILBOX_VERSION;
 	mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC;
@@ -940,12 +1021,14 @@  static int tcmu_configure_device(struct se_device *dev)
 	WARN_ON(udev->data_size % PAGE_SIZE);
 	WARN_ON(udev->data_size % DATA_BLOCK_SIZE);
 
+	INIT_RADIX_TREE(&udev->data_blocks, GFP_ATOMIC);
+
 	info->version = __stringify(TCMU_MAILBOX_VERSION);
 
 	info->mem[0].name = "tcm-user command & data buffer";
 	info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr;
 	info->mem[0].size = TCMU_RING_SIZE;
-	info->mem[0].memtype = UIO_MEM_VIRTUAL;
+	info->mem[0].memtype = UIO_MEM_NONE;
 
 	info->irqcontrol = tcmu_irqcontrol;
 	info->irq = UIO_IRQ_CUSTOM;
@@ -982,13 +1065,23 @@  static int tcmu_configure_device(struct se_device *dev)
 	return ret;
 }
 
-static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
+static inline int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
 {
-	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
-		kmem_cache_free(tcmu_cmd_cache, cmd);
-		return 0;
+	struct tcmu_dev *udev = cmd->tcmu_dev;
+	void *addr;
+	int i, ret = 0;
+
+	for (i = 0; i < cmd->dbi_cnt; i++) {
+		addr = radix_tree_delete(&udev->data_blocks, cmd->dbi[i]);
+		kfree(addr);
 	}
-	return -EINVAL;
+
+	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags))
+		ret = -EINVAL;
+
+	tcmu_free_cmd(cmd);
+
+	return ret;
 }
 
 static void tcmu_dev_call_rcu(struct rcu_head *p)