@@ -1651,6 +1651,21 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
}
}
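+/*
+ * Context for the page bulk allocation callback: the clone bio being
+ * populated and the number of bytes still to be covered by pages.
+ */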
+struct crypt_bulk_cb_data {
+ struct bio *bio;
+ unsigned int size;
+};
+
+static void crypt_bulk_alloc_cb(struct page *page, void *data)
+{
+ unsigned int len;
+ struct crypt_bulk_cb_data *b_data = (struct crypt_bulk_cb_data *)data;
+
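+	/* the last page of the bio may be partial */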
+ len = (b_data->size > PAGE_SIZE) ? PAGE_SIZE : b_data->size;
+ bio_add_page(b_data->bio, page, len, 0);
+ b_data->size -= len;
+}
+
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations (but only because
@@ -1674,8 +1689,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
- unsigned i, len, remaining_size;
- struct page *page;
+ struct crypt_bulk_cb_data data;
retry:
if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -1686,22 +1700,17 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
- remaining_size = size;
-
- for (i = 0; i < nr_iovecs; i++) {
- page = mempool_alloc(&cc->page_pool, gfp_mask);
- if (!page) {
- crypt_free_buffer_pages(cc, clone);
- bio_put(clone);
- gfp_mask |= __GFP_DIRECT_RECLAIM;
- goto retry;
- }
-
- len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
-
- bio_add_page(clone, page, len, 0);
+ data.bio = clone;
+ data.size = size;
- remaining_size -= len;
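+	/*
+	 * Allocate all pages for the clone in one bulk call;
+	 * crypt_bulk_alloc_cb() adds each page to the bio as it is
+	 * handed back.
+	 */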
+ if (!mempool_alloc_pages_bulk_cb(&cc->page_pool, gfp_mask, nr_iovecs,
+ crypt_bulk_alloc_cb, &data)) {
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ data.bio = NULL;
+ data.size = 0;
+ gfp_mask |= __GFP_DIRECT_RECLAIM;
+ goto retry;
}
/* Allocate space for integrity tags */
@@ -2655,10 +2664,14 @@ static void crypt_calculate_pages_per_client(void)
dm_crypt_pages_per_client = pages;
}
-static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data)
+static unsigned int crypt_alloc_pages_bulk(gfp_t gfp_mask, unsigned int nr,
+ void *pool_data,
+ struct page **page_array,
+ void (*cb)(struct page *, void *),
+ void *data)
{
struct crypt_config *cc = pool_data;
- struct page *page;
+ unsigned int ret;
/*
* Note, percpu_counter_read_positive() may over (and under) estimate
@@ -2667,13 +2680,13 @@ static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data)
*/
if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) &&
likely(gfp_mask & __GFP_NORETRY))
- return NULL;
+ return 0;
- page = alloc_page(gfp_mask);
- if (likely(page != NULL))
- percpu_counter_add(&cc->n_allocated_pages, 1);
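+	/*
+	 * Bulk-allocate up to nr pages, invoking the callback on each
+	 * page; ret is the number actually allocated and may be less
+	 * than nr.
+	 */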
+ ret = alloc_pages_bulk_cb(gfp_mask, nr, cb, data);
- return page;
+ percpu_counter_add(&cc->n_allocated_pages, ret);
+
+ return ret;
}
static void crypt_page_free(void *page, void *pool_data)
@@ -2705,11 +2718,16 @@ static void crypt_dtr(struct dm_target *ti)
bioset_exit(&cc->bs);
+	/*
+	 * With the mempool bulk allocator the pages sitting in the pool
+	 * are not counted in n_allocated_pages, so the counter must be
+	 * checked before mempool_exit() releases them.
+	 */
+ WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
+
mempool_exit(&cc->page_pool);
mempool_exit(&cc->req_pool);
mempool_exit(&cc->tag_pool);
- WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0);
percpu_counter_destroy(&cc->n_allocated_pages);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
@@ -3251,7 +3269,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
- ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
+ ret = mempool_init_pages_bulk(&cc->page_pool, BIO_MAX_VECS,
+ crypt_alloc_pages_bulk, crypt_page_free,
+ cc);
if (ret) {
ti->error = "Cannot allocate page mempool";
goto bad;
When using dm-crypt for full disk encryption, dm-crypt allocates an out
bio with the same number of pages as the in bio for encryption.  It
currently allocates one page at a time in a loop.  This is not
efficient.  So use the mempool page bulk allocator instead of allocating
one page at a time.

The mempool page bulk allocator improves the IOPS of 1M I/O by
approximately 6%.  The test was done on a machine with 80 vCPUs and
128GB memory with an encrypted ram device (so that the impact from the
storage hardware is minimized and the dm-crypt layer can be benchmarked
more accurately).

Before the patch:
Jobs: 1 (f=1): [w(1)][100.0%][w=1301MiB/s][w=1301 IOPS][eta 00m:00s]
crypt: (groupid=0, jobs=1): err= 0: pid=48512: Wed Feb 1 18:11:30 2023
  write: IOPS=1300, BW=1301MiB/s (1364MB/s)(76.2GiB/60001msec); 0 zone resets
    slat (usec): min=724, max=867, avg=765.71, stdev=19.27
    clat (usec): min=4, max=196297, avg=195688.86, stdev=6450.50
     lat (usec): min=801, max=197064, avg=196454.90, stdev=6450.35
    clat percentiles (msec):
     |  1.00th=[  197],  5.00th=[  197], 10.00th=[  197], 20.00th=[  197],
     | 30.00th=[  197], 40.00th=[  197], 50.00th=[  197], 60.00th=[  197],
     | 70.00th=[  197], 80.00th=[  197], 90.00th=[  197], 95.00th=[  197],
     | 99.00th=[  197], 99.50th=[  197], 99.90th=[  197], 99.95th=[  197],
     | 99.99th=[  197]
   bw (  MiB/s): min=  800, max= 1308, per=99.69%, avg=1296.94, stdev=46.02, samples=119
   iops        : min=  800, max= 1308, avg=1296.94, stdev=46.02, samples=119
  lat (usec)   : 10=0.01%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 10=0.01%, 20=0.02%, 50=0.05%
  lat (msec)   : 100=0.08%, 250=99.83%
  cpu          : usr=3.88%, sys=96.02%, ctx=69, majf=1, minf=9
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=0.1%, >=64=99.9%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.1%
     issued rwts: total=0,78060,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=256

Run status group 0 (all jobs):
  WRITE: bw=1301MiB/s (1364MB/s), 1301MiB/s-1301MiB/s (1364MB/s-1364MB/s), io=76.2GiB (81.9GB), run=60001-60001msec

After the patch:
Jobs: 1 (f=1): [w(1)][100.0%][w=1401MiB/s][w=1401 IOPS][eta 00m:00s]
crypt: (groupid=0, jobs=1): err= 0: pid=2171: Wed Feb 1 21:08:16 2023
  write: IOPS=1401, BW=1402MiB/s (1470MB/s)(82.1GiB/60001msec); 0 zone resets
    slat (usec): min=685, max=815, avg=710.77, stdev=13.24
    clat (usec): min=4, max=182206, avg=181658.31, stdev=5810.58
     lat (usec): min=709, max=182913, avg=182369.36, stdev=5810.67
    clat percentiles (msec):
     |  1.00th=[  182],  5.00th=[  182], 10.00th=[  182], 20.00th=[  182],
     | 30.00th=[  182], 40.00th=[  182], 50.00th=[  182], 60.00th=[  182],
     | 70.00th=[  182], 80.00th=[  182], 90.00th=[  182], 95.00th=[  182],
     | 99.00th=[  182], 99.50th=[  182], 99.90th=[  182], 99.95th=[  182],
     | 99.99th=[  182]
   bw (  MiB/s): min=  900, max= 1408, per=99.71%, avg=1397.60, stdev=46.04, samples=119
   iops        : min=  900, max= 1408, avg=1397.60, stdev=46.04, samples=119
  lat (usec)   : 10=0.01%, 750=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 10=0.01%, 20=0.02%, 50=0.05%
  lat (msec)   : 100=0.08%, 250=99.83%
  cpu          : usr=3.66%, sys=96.23%, ctx=76, majf=1, minf=9
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=0.1%, >=64=99.9%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.1%
     issued rwts: total=0,84098,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=256

Run status group 0 (all jobs):
  WRITE: bw=1402MiB/s (1470MB/s), 1402MiB/s-1402MiB/s (1470MB/s-1470MB/s), io=82.1GiB (88.2GB), run=60001-60001msec

Function tracing also shows that the time consumed by page allocations
is reduced significantly.  The test allocated a 1M (256-page) bio in the
same environment.

Before the patch it took approximately 600us, excluding the
bio_add_page() calls:
  2720.630754 |   56)  xfs_io-38859   |   2.571 us  |  mempool_alloc();
  2720.630757 |   56)  xfs_io-38859   |   0.937 us  |  bio_add_page();
  2720.630758 |   56)  xfs_io-38859   |   1.772 us  |  mempool_alloc();
  2720.630760 |   56)  xfs_io-38859   |   0.852 us  |  bio_add_page();
  ....
  2720.631559 |   56)  xfs_io-38859   |   2.058 us  |  mempool_alloc();
  2720.631561 |   56)  xfs_io-38859   |   0.717 us  |  bio_add_page();
  2720.631562 |   56)  xfs_io-38859   |   2.014 us  |  mempool_alloc();
  2720.631564 |   56)  xfs_io-38859   |   0.620 us  |  bio_add_page();

After the patch it took approximately 30us:
  11564.266385 |   22)  xfs_io-136183  | + 30.551 us  |  __alloc_pages_bulk();

The page allocation overhead is around 6% (600us/9853us) of the time
spent in the dm-crypt layer as shown by the function trace, which
matches the IOPS improvement reported by fio.  A benchmark with 4K I/O
doesn't show a measurable regression.

Signed-off-by: Yang Shi <shy828301@gmail.com>
---
 drivers/md/dm-crypt.c | 72 +++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 26 deletions(-)
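Note for reviewers reading this patch on its own: alloc_pages_bulk_cb()
and mempool_alloc_pages_bulk_cb() are introduced by the earlier patches
in this series, so their implementations are not visible here.  A
minimal sketch of the callback flavour, assuming it can be layered on
the existing alloc_pages_bulk_array() API (the helper name, batch size,
and partial-failure policy below are illustrative only, not the series'
actual implementation):

#include <linux/gfp.h>		/* alloc_pages_bulk_array() */
#include <linux/kernel.h>	/* ARRAY_SIZE(), min_t() */
#include <linux/string.h>	/* memset() */

static unsigned int sketch_alloc_pages_bulk_cb(gfp_t gfp, unsigned int nr,
					       void (*cb)(struct page *page,
							  void *data),
					       void *data)
{
	struct page *pages[16];		/* arbitrary batch size */
	unsigned int total = 0;

	while (nr) {
		unsigned int batch = min_t(unsigned int, nr,
					   ARRAY_SIZE(pages));
		unsigned int got, i;

		/* alloc_pages_bulk_array() only fills NULL slots */
		memset(pages, 0, sizeof(pages));
		got = alloc_pages_bulk_array(gfp, batch, pages);

		/* hand each allocated page to the caller's callback */
		for (i = 0; i < got; i++)
			cb(pages[i], data);
		total += got;

		if (got < batch)
			break;		/* allocator came up short */
		nr -= got;
	}

	return total;
}

crypt_alloc_buffer() treats anything short of nr_iovecs as a failure:
crypt_free_buffer_pages() releases whatever the callback already added
to the clone and the allocation is retried with __GFP_DIRECT_RECLAIM
set.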