diff mbox series

[V4,1/3] block: Allow mapping of vmalloc-ed buffers

Message ID 20190627024910.23987-2-damien.lemoal@wdc.com (mailing list archive)
State New, archived
Headers show
Series Fix zone revalidation memory allocation failures | expand

Commit Message

Damien Le Moal June 27, 2019, 2:49 a.m. UTC
To allow the SCSI subsystem scsi_execute_req() function to issue
requests using large buffers that are better allocated with vmalloc()
rather than kmalloc(), modify bio_map_kern() and bio_copy_kern() to
allow passing a buffer allocated with vmalloc(). To do so, detect
vmalloc-ed buffers using is_vmalloc_addr(). For vmalloc-ed buffers,
flush the buffer using flush_kernel_vmap_range(), use vmalloc_to_page()
instead of virt_to_page() to obtain the pages of the buffer, and
invalidate the buffer addresses with invalidate_kernel_vmap_range() on
completion of read BIOs. This last point is executed using the function
bio_invalidate_vmalloc_pages() which is defined only if the
architecture defines ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE, that is, if the
architecture actually needs the invalidation done.

Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
Fixes: e76239a3748c ("block: add a report_zones method")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 block/bio.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

Comments

Christoph Hellwig June 27, 2019, 7:28 a.m. UTC | #1
> +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE

That seems like an odd construct, as you don't call
flush_kernel_dcache_page.  From looking at who defines it, it seems
to be about the right set of architectures, but that might be
by a mix of chance and similar requirements for cache flushing.

> +static void bio_invalidate_vmalloc_pages(struct bio *bio)
> +{
> +	if (bio->bi_private) {
> +		struct bvec_iter_all iter_all;
> +		struct bio_vec *bvec;
> +		unsigned long len = 0;
> +
> +		bio_for_each_segment_all(bvec, bio, iter_all)
> +			len += bvec->bv_len;
> +             invalidate_kernel_vmap_range(bio->bi_private, len);

We control the bio here, so we can directly iterate over the
segments instead of doing the fairly expensive bio_for_each_segment_all
call that goes to each page and builds a bvec for it.

> +	struct page *page;
>  	int offset, i;
>  	struct bio *bio;
>  
> @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  	if (!bio)
>  		return ERR_PTR(-ENOMEM);
>  
> +	if (is_vmalloc) {
> +		flush_kernel_vmap_range(data, len);
> +		if ((!op_is_write(bio_op(bio))))
> +			bio->bi_private = data;
> +	}

We've just allocate the bio, so bio->bi_opf is not actually set at
this point unfortunately.
Ming Lei June 27, 2019, 7:47 a.m. UTC | #2
On Thu, Jun 27, 2019 at 11:49:08AM +0900, Damien Le Moal wrote:
> To allow the SCSI subsystem scsi_execute_req() function to issue
> requests using large buffers that are better allocated with vmalloc()
> rather than kmalloc(), modify bio_map_kern() and bio_copy_kern() to
> allow passing a buffer allocated with vmalloc(). To do so, detect
> vmalloc-ed buffers using is_vmalloc_addr(). For vmalloc-ed buffers,
> flush the buffer using flush_kernel_vmap_range(), use vmalloc_to_page()
> instead of virt_to_page() to obtain the pages of the buffer, and
> invalidate the buffer addresses with invalidate_kernel_vmap_range() on
> completion of read BIOs. This last point is executed using the function
> bio_invalidate_vmalloc_pages() which is defined only if the
> architecture defines ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE, that is, if the
> architecture actually needs the invalidation done.
> 
> Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation")
> Fixes: e76239a3748c ("block: add a report_zones method")
> Cc: stable@vger.kernel.org
> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
> ---
>  block/bio.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 42 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index ce797d73bb43..1c21d1e7f1b8 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -16,6 +16,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/cgroup.h>
>  #include <linux/blk-cgroup.h>
> +#include <linux/highmem.h>
>  
>  #include <trace/events/block.h>
>  #include "blk.h"
> @@ -1479,8 +1480,26 @@ void bio_unmap_user(struct bio *bio)
>  	bio_put(bio);
>  }
>  
> +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
> +static void bio_invalidate_vmalloc_pages(struct bio *bio)
> +{
> +	if (bio->bi_private) {
> +		struct bvec_iter_all iter_all;
> +		struct bio_vec *bvec;
> +		unsigned long len = 0;
> +
> +		bio_for_each_segment_all(bvec, bio, iter_all)
> +			len += bvec->bv_len;
> +		invalidate_kernel_vmap_range(bio->bi_private, len);
> +	}
> +}
> +#else
> +static void bio_invalidate_vmalloc_pages(struct bio *bio) {}
> +#endif
> +
>  static void bio_map_kern_endio(struct bio *bio)
>  {
> +	bio_invalidate_vmalloc_pages(bio);
>  	bio_put(bio);
>  }
>  
> @@ -1501,6 +1520,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  	unsigned long start = kaddr >> PAGE_SHIFT;
>  	const int nr_pages = end - start;
> +	bool is_vmalloc = is_vmalloc_addr(data);
> +	struct page *page;
>  	int offset, i;
>  	struct bio *bio;
>  
> @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  	if (!bio)
>  		return ERR_PTR(-ENOMEM);
>  
> +	if (is_vmalloc) {
> +		flush_kernel_vmap_range(data, len);
> +		if ((!op_is_write(bio_op(bio))))
> +			bio->bi_private = data;
> +	}
> +
>  	offset = offset_in_page(kaddr);
>  	for (i = 0; i < nr_pages; i++) {
>  		unsigned int bytes = PAGE_SIZE - offset;
> @@ -1518,7 +1545,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  		if (bytes > len)
>  			bytes = len;
>  
> -		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
> +		if (!is_vmalloc)
> +			page = virt_to_page(data);
> +		else
> +			page = vmalloc_to_page(data);
> +		if (bio_add_pc_page(q, bio, page, bytes,
>  				    offset) < bytes) {
>  			/* we don't support partial mappings */
>  			bio_put(bio);
> @@ -1531,6 +1562,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>  	}
>  
>  	bio->bi_end_io = bio_map_kern_endio;
> +
>  	return bio;
>  }
>  EXPORT_SYMBOL(bio_map_kern);
> @@ -1543,6 +1575,7 @@ static void bio_copy_kern_endio(struct bio *bio)
>  
>  static void bio_copy_kern_endio_read(struct bio *bio)
>  {
> +	unsigned long len = 0;
>  	char *p = bio->bi_private;
>  	struct bio_vec *bvec;
>  	struct bvec_iter_all iter_all;
> @@ -1550,8 +1583,12 @@ static void bio_copy_kern_endio_read(struct bio *bio)
>  	bio_for_each_segment_all(bvec, bio, iter_all) {
>  		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
>  		p += bvec->bv_len;
> +		len += bvec->bv_len;
>  	}
>  
> +	if (is_vmalloc_addr(bio->bi_private))
> +		invalidate_kernel_vmap_range(bio->bi_private, len);
> +
>  	bio_copy_kern_endio(bio);
>  }
>  
> @@ -1572,6 +1609,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
>  	unsigned long kaddr = (unsigned long)data;
>  	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
>  	unsigned long start = kaddr >> PAGE_SHIFT;
> +	bool is_vmalloc = is_vmalloc_addr(data);
>  	struct bio *bio;
>  	void *p = data;
>  	int nr_pages = 0;
> @@ -1587,6 +1625,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
>  	if (!bio)
>  		return ERR_PTR(-ENOMEM);
>  
> +	if (is_vmalloc)
> +		flush_kernel_vmap_range(data, len);
> +

Are you sure that invalidate[|flush]_kernel_vmap_range is needed for
bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only
accessed by CPU.

Thanks,
Ming
Damien Le Moal June 27, 2019, 8:14 a.m. UTC | #3
Christoph,

On 2019/06/27 16:28, Christoph Hellwig wrote:
>> +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
> 
> That seems like an odd construct, as you don't call
> flush_kernel_dcache_page.  From looking at who defines it, it seems
> to be about the right set of architectures, but that might be
> by a mix of chance and similar requirements for cache flushing.

This comes from include/linux/highmem.h:

#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
static inline void flush_kernel_dcache_page(struct page *page)
{
}
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif

which I guessed is for the architectures that do not need the flush/invalidate
vmap functions. I copied. Is there a better way ? The point was to avoid doing
the loop on the bvec for the range length on architectures that have an empty
definition of invalidate_kernel_vmap_range().

> 
>> +static void bio_invalidate_vmalloc_pages(struct bio *bio)
>> +{
>> +	if (bio->bi_private) {
>> +		struct bvec_iter_all iter_all;
>> +		struct bio_vec *bvec;
>> +		unsigned long len = 0;
>> +
>> +		bio_for_each_segment_all(bvec, bio, iter_all)
>> +			len += bvec->bv_len;
>> +             invalidate_kernel_vmap_range(bio->bi_private, len);
> 
> We control the bio here, so we can directly iterate over the
> segments instead of doing the fairly expensive bio_for_each_segment_all
> call that goes to each page and builds a bvec for it.

OK. Got it. Will update it.

> 
>> +	struct page *page;
>>  	int offset, i;
>>  	struct bio *bio;
>>  
>> @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
>>  	if (!bio)
>>  		return ERR_PTR(-ENOMEM);
>>  
>> +	if (is_vmalloc) {
>> +		flush_kernel_vmap_range(data, len);
>> +		if ((!op_is_write(bio_op(bio))))
>> +			bio->bi_private = data;
>> +	}
> 
> We've just allocate the bio, so bio->bi_opf is not actually set at
> this point unfortunately.

OK. I will move the check to the completion path then.

Thanks !
Damien Le Moal June 27, 2019, 8:17 a.m. UTC | #4
Hi Ming,

On 2019/06/27 16:47, Ming Lei wrote:
[...]
>>  static void bio_copy_kern_endio_read(struct bio *bio)
>>  {
>> +	unsigned long len = 0;
>>  	char *p = bio->bi_private;
>>  	struct bio_vec *bvec;
>>  	struct bvec_iter_all iter_all;
>> @@ -1550,8 +1583,12 @@ static void bio_copy_kern_endio_read(struct bio *bio)
>>  	bio_for_each_segment_all(bvec, bio, iter_all) {
>>  		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
>>  		p += bvec->bv_len;
>> +		len += bvec->bv_len;
>>  	}
>>  
>> +	if (is_vmalloc_addr(bio->bi_private))
>> +		invalidate_kernel_vmap_range(bio->bi_private, len);
>> +
>>  	bio_copy_kern_endio(bio);
>>  }
>>  
>> @@ -1572,6 +1609,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
>>  	unsigned long kaddr = (unsigned long)data;
>>  	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
>>  	unsigned long start = kaddr >> PAGE_SHIFT;
>> +	bool is_vmalloc = is_vmalloc_addr(data);
>>  	struct bio *bio;
>>  	void *p = data;
>>  	int nr_pages = 0;
>> @@ -1587,6 +1625,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
>>  	if (!bio)
>>  		return ERR_PTR(-ENOMEM);
>>  
>> +	if (is_vmalloc)
>> +		flush_kernel_vmap_range(data, len);
>> +
> 
> Are you sure that invalidate[|flush]_kernel_vmap_range is needed for
> bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only
> accessed by CPU.

Not sure at all. I am a little out of my league here.
Christoph mentioned that this is necessary.

Christoph, can you confirm ?

Thanks !
Christoph Hellwig June 27, 2019, 8:25 a.m. UTC | #5
On Thu, Jun 27, 2019 at 08:17:08AM +0000, Damien Le Moal wrote:
> > Are you sure that invalidate[|flush]_kernel_vmap_range is needed for
> > bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only
> > accessed by CPU.
> 
> Not sure at all. I am a little out of my league here.
> Christoph mentioned that this is necessary.
> 
> Christoph, can you confirm ?

Ming is right.  Sorry for misleading you.
Christoph Hellwig June 27, 2019, 8:25 a.m. UTC | #6
On Thu, Jun 27, 2019 at 08:14:56AM +0000, Damien Le Moal wrote:
> which I guessed is for the architectures that do not need the flush/invalidate
> vmap functions. I copied. Is there a better way ? The point was to avoid doing
> the loop on the bvec for the range length on architectures that have an empty
> definition of invalidate_kernel_vmap_range().

No, looks like what you did is right.  I blame my lack of attention
on the heat wave here and the resulting lack of sleep..
Damien Le Moal June 27, 2019, 8:36 a.m. UTC | #7
On 2019/06/27 17:25, Christoph Hellwig wrote:
> On Thu, Jun 27, 2019 at 08:17:08AM +0000, Damien Le Moal wrote:
>>> Are you sure that invalidate[|flush]_kernel_vmap_range is needed for
>>> bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only
>>> accessed by CPU.
>>
>> Not sure at all. I am a little out of my league here.
>> Christoph mentioned that this is necessary.
>>
>> Christoph, can you confirm ?
> 
> Ming is right.  Sorry for misleading you.

No problem. Sending a V5 with cleanups.

Thanks !
Damien Le Moal June 27, 2019, 8:37 a.m. UTC | #8
On 2019/06/27 17:26, Christoph Hellwig wrote:
> On Thu, Jun 27, 2019 at 08:14:56AM +0000, Damien Le Moal wrote:
>> which I guessed is for the architectures that do not need the flush/invalidate
>> vmap functions. I copied. Is there a better way ? The point was to avoid doing
>> the loop on the bvec for the range length on architectures that have an empty
>> definition of invalidate_kernel_vmap_range().
> 
> No, looks like what you did is right.  I blame my lack of attention
> on the heat wave here and the resulting lack of sleep..

No problem ! At least we checked twice now !
diff mbox series

Patch

diff --git a/block/bio.c b/block/bio.c
index ce797d73bb43..1c21d1e7f1b8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,7 @@ 
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/blk-cgroup.h>
+#include <linux/highmem.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -1479,8 +1480,26 @@  void bio_unmap_user(struct bio *bio)
 	bio_put(bio);
 }
 
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+	if (bio->bi_private) {
+		struct bvec_iter_all iter_all;
+		struct bio_vec *bvec;
+		unsigned long len = 0;
+
+		bio_for_each_segment_all(bvec, bio, iter_all)
+			len += bvec->bv_len;
+		invalidate_kernel_vmap_range(bio->bi_private, len);
+	}
+}
+#else
+static void bio_invalidate_vmalloc_pages(struct bio *bio) {}
+#endif
+
 static void bio_map_kern_endio(struct bio *bio)
 {
+	bio_invalidate_vmalloc_pages(bio);
 	bio_put(bio);
 }
 
@@ -1501,6 +1520,8 @@  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	unsigned long start = kaddr >> PAGE_SHIFT;
 	const int nr_pages = end - start;
+	bool is_vmalloc = is_vmalloc_addr(data);
+	struct page *page;
 	int offset, i;
 	struct bio *bio;
 
@@ -1508,6 +1529,12 @@  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
+	if (is_vmalloc) {
+		flush_kernel_vmap_range(data, len);
+		if ((!op_is_write(bio_op(bio))))
+			bio->bi_private = data;
+	}
+
 	offset = offset_in_page(kaddr);
 	for (i = 0; i < nr_pages; i++) {
 		unsigned int bytes = PAGE_SIZE - offset;
@@ -1518,7 +1545,11 @@  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 		if (bytes > len)
 			bytes = len;
 
-		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+		if (!is_vmalloc)
+			page = virt_to_page(data);
+		else
+			page = vmalloc_to_page(data);
+		if (bio_add_pc_page(q, bio, page, bytes,
 				    offset) < bytes) {
 			/* we don't support partial mappings */
 			bio_put(bio);
@@ -1531,6 +1562,7 @@  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	}
 
 	bio->bi_end_io = bio_map_kern_endio;
+
 	return bio;
 }
 EXPORT_SYMBOL(bio_map_kern);
@@ -1543,6 +1575,7 @@  static void bio_copy_kern_endio(struct bio *bio)
 
 static void bio_copy_kern_endio_read(struct bio *bio)
 {
+	unsigned long len = 0;
 	char *p = bio->bi_private;
 	struct bio_vec *bvec;
 	struct bvec_iter_all iter_all;
@@ -1550,8 +1583,12 @@  static void bio_copy_kern_endio_read(struct bio *bio)
 	bio_for_each_segment_all(bvec, bio, iter_all) {
 		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
 		p += bvec->bv_len;
+		len += bvec->bv_len;
 	}
 
+	if (is_vmalloc_addr(bio->bi_private))
+		invalidate_kernel_vmap_range(bio->bi_private, len);
+
 	bio_copy_kern_endio(bio);
 }
 
@@ -1572,6 +1609,7 @@  struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 	unsigned long kaddr = (unsigned long)data;
 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	unsigned long start = kaddr >> PAGE_SHIFT;
+	bool is_vmalloc = is_vmalloc_addr(data);
 	struct bio *bio;
 	void *p = data;
 	int nr_pages = 0;
@@ -1587,6 +1625,9 @@  struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
+	if (is_vmalloc)
+		flush_kernel_vmap_range(data, len);
+
 	while (len) {
 		struct page *page;
 		unsigned int bytes = PAGE_SIZE;