Message ID | 20210830233500.51395-1-mgurtovoy@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/1] virtio-blk: avoid preallocating big SGL for data | expand |
Does this hurt the performance of virtio-blk? I think a fio result is needed here. On Tue, Aug 31, 2021 at 7:36 AM Max Gurtovoy <mgurtovoy@nvidia.com> wrote: > > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > has lots of deep queues, preallocation for the sg list can consume > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > can be 64 or 128 and each queue's depth might be 128. This means the > resulting preallocation for the data SGLs is big. > > Switch to runtime allocation for SGL for lists longer than 2 entries. > This is the approach used by NVMe drivers so it should be reasonable for > virtio block as well. Runtime SGL allocation has always been the case > for the legacy I/O path so this is nothing new. > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > support SG_CHAIN, use only runtime allocation for the SGL. > > Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> > Reviewed-by: Israel Rukshin <israelr@nvidia.com> > --- > drivers/block/virtio_blk.c | 37 ++++++++++++++++++++++--------------- > 1 file changed, 22 insertions(+), 15 deletions(-) > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 77e8468e8593..9a4c5d428b58 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -24,6 +24,12 @@ > /* The maximum number of sg elements that fit into a virtqueue */ > #define VIRTIO_BLK_MAX_SG_ELEMS 32768 > > +#ifdef CONFIG_ARCH_NO_SG_CHAIN > +#define VIRTIO_BLK_INLINE_SG_CNT 0 > +#else > +#define VIRTIO_BLK_INLINE_SG_CNT 2 > +#endif > + > static int virtblk_queue_count_set(const char *val, > const struct kernel_param *kp) > { > @@ -99,7 +105,7 @@ struct virtio_blk { > struct virtblk_req { > struct virtio_blk_outhdr out_hdr; > u8 status; > - struct scatterlist sg[]; > + struct sg_table sg_table; > }; > > static inline blk_status_t virtblk_result(struct virtblk_req *vbr) > @@ -188,6 +194,8 @@ static inline void virtblk_request_done(struct 
request *req) > { > struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); > > + sg_free_table_chained(&vbr->sg_table, VIRTIO_BLK_INLINE_SG_CNT); > + > if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { > kfree(page_address(req->special_vec.bv_page) + > req->special_vec.bv_offset); > @@ -291,7 +299,15 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > return BLK_STS_RESOURCE; > } > > - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > + vbr->sg_table.sgl = (struct scatterlist *)(vbr + 1); > + err = sg_alloc_table_chained(&vbr->sg_table, > + blk_rq_nr_phys_segments(req), > + vbr->sg_table.sgl, > + VIRTIO_BLK_INLINE_SG_CNT); > + if (err) > + return BLK_STS_RESOURCE; > + > + num = blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); > if (num) { > if (rq_data_dir(req) == WRITE) > vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); > @@ -300,7 +316,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > } > > spin_lock_irqsave(&vblk->vqs[qid].lock, flags); > - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); > + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); > if (err) { > virtqueue_kick(vblk->vqs[qid].vq); > /* Don't stop the queue if -ENOMEM: we may have failed to > @@ -309,6 +325,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > if (err == -ENOSPC) > blk_mq_stop_hw_queue(hctx); > spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); > + sg_free_table_chained(&vbr->sg_table, > + VIRTIO_BLK_INLINE_SG_CNT); > switch (err) { > case -ENOSPC: > return BLK_STS_DEV_RESOURCE; > @@ -687,16 +705,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { > NULL, > }; > > -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, > - unsigned int hctx_idx, unsigned int numa_node) > -{ > - struct virtio_blk *vblk = set->driver_data; > - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); > - > - sg_init_table(vbr->sg, vblk->sg_elems); > - return 0; > -} > - > 
static int virtblk_map_queues(struct blk_mq_tag_set *set) > { > struct virtio_blk *vblk = set->driver_data; > @@ -709,7 +717,6 @@ static const struct blk_mq_ops virtio_mq_ops = { > .queue_rq = virtio_queue_rq, > .commit_rqs = virtio_commit_rqs, > .complete = virtblk_request_done, > - .init_request = virtblk_init_request, > .map_queues = virtblk_map_queues, > }; > > @@ -805,7 +812,7 @@ static int virtblk_probe(struct virtio_device *vdev) > vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; > vblk->tag_set.cmd_size = > sizeof(struct virtblk_req) + > - sizeof(struct scatterlist) * sg_elems; > + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; > vblk->tag_set.driver_data = vblk; > vblk->tag_set.nr_hw_queues = vblk->num_vqs; > > -- > 2.18.1 >
On Tue, Aug 31, 2021 at 02:35:00AM +0300, Max Gurtovoy wrote: > No need to pre-allocate a big buffer for the IO SGL anymore. If a device > has lots of deep queues, preallocation for the sg list can consume > substantial amounts of memory. For HW virtio-blk device, nr_hw_queues > can be 64 or 128 and each queue's depth might be 128. This means the > resulting preallocation for the data SGLs is big. > > Switch to runtime allocation for SGL for lists longer than 2 entries. > This is the approach used by NVMe drivers so it should be reasonable for > virtio block as well. Runtime SGL allocation has always been the case > for the legacy I/O path so this is nothing new. > > The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't > support SG_CHAIN, use only runtime allocation for the SGL. > > Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> > Reviewed-by: Israel Rukshin <israelr@nvidia.com> > --- > drivers/block/virtio_blk.c | 37 ++++++++++++++++++++++--------------- > 1 file changed, 22 insertions(+), 15 deletions(-) > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 77e8468e8593..9a4c5d428b58 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -24,6 +24,12 @@ > /* The maximum number of sg elements that fit into a virtqueue */ > #define VIRTIO_BLK_MAX_SG_ELEMS 32768 > > +#ifdef CONFIG_ARCH_NO_SG_CHAIN > +#define VIRTIO_BLK_INLINE_SG_CNT 0 > +#else > +#define VIRTIO_BLK_INLINE_SG_CNT 2 > +#endif > + > static int virtblk_queue_count_set(const char *val, > const struct kernel_param *kp) > { > @@ -99,7 +105,7 @@ struct virtio_blk { > struct virtblk_req { > struct virtio_blk_outhdr out_hdr; > u8 status; > - struct scatterlist sg[]; > + struct sg_table sg_table; Please keep the sg flexible array member here instead of the pointer arithmetics that is added instead below. 
> + err = sg_alloc_table_chained(&vbr->sg_table, > + blk_rq_nr_phys_segments(req), > + vbr->sg_table.sgl, > + VIRTIO_BLK_INLINE_SG_CNT); > + if (err) > + return BLK_STS_RESOURCE; > + This will BUG() for requests without segments (flush and discard). You probably want a separate helper to actually map data in there, extending the big switch on the op. While we're at it, the blk_mq_start_request should also move as close as possible to the actual sending of the request to the host. You'll also need to select SG_POOL now that you're using these functions.
On 9/1/2021 6:38 AM, Feng Li wrote: > Does this hurt the performance of virtio-blk? > I think a fio result is needed here. No, we use this mechanism in NVMe/NVMf for few years already and didn't see any performance issues. Also with the fio tests I run with our NVIDIA's Virtio-blk SNAP devices showed same perf numbers. I can add it to v2. > > On Tue, Aug 31, 2021 at 7:36 AM Max Gurtovoy <mgurtovoy@nvidia.com> wrote: >> No need to pre-allocate a big buffer for the IO SGL anymore. If a device >> has lots of deep queues, preallocation for the sg list can consume >> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues >> can be 64 or 128 and each queue's depth might be 128. This means the >> resulting preallocation for the data SGLs is big. >> >> Switch to runtime allocation for SGL for lists longer than 2 entries. >> This is the approach used by NVMe drivers so it should be reasonable for >> virtio block as well. Runtime SGL allocation has always been the case >> for the legacy I/O path so this is nothing new. >> >> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't >> support SG_CHAIN, use only runtime allocation for the SGL. 
>> >> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >> Reviewed-by: Israel Rukshin <israelr@nvidia.com> >> --- >> drivers/block/virtio_blk.c | 37 ++++++++++++++++++++++--------------- >> 1 file changed, 22 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c >> index 77e8468e8593..9a4c5d428b58 100644 >> --- a/drivers/block/virtio_blk.c >> +++ b/drivers/block/virtio_blk.c >> @@ -24,6 +24,12 @@ >> /* The maximum number of sg elements that fit into a virtqueue */ >> #define VIRTIO_BLK_MAX_SG_ELEMS 32768 >> >> +#ifdef CONFIG_ARCH_NO_SG_CHAIN >> +#define VIRTIO_BLK_INLINE_SG_CNT 0 >> +#else >> +#define VIRTIO_BLK_INLINE_SG_CNT 2 >> +#endif >> + >> static int virtblk_queue_count_set(const char *val, >> const struct kernel_param *kp) >> { >> @@ -99,7 +105,7 @@ struct virtio_blk { >> struct virtblk_req { >> struct virtio_blk_outhdr out_hdr; >> u8 status; >> - struct scatterlist sg[]; >> + struct sg_table sg_table; >> }; >> >> static inline blk_status_t virtblk_result(struct virtblk_req *vbr) >> @@ -188,6 +194,8 @@ static inline void virtblk_request_done(struct request *req) >> { >> struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); >> >> + sg_free_table_chained(&vbr->sg_table, VIRTIO_BLK_INLINE_SG_CNT); >> + >> if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { >> kfree(page_address(req->special_vec.bv_page) + >> req->special_vec.bv_offset); >> @@ -291,7 +299,15 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> return BLK_STS_RESOURCE; >> } >> >> - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); >> + vbr->sg_table.sgl = (struct scatterlist *)(vbr + 1); >> + err = sg_alloc_table_chained(&vbr->sg_table, >> + blk_rq_nr_phys_segments(req), >> + vbr->sg_table.sgl, >> + VIRTIO_BLK_INLINE_SG_CNT); >> + if (err) >> + return BLK_STS_RESOURCE; >> + >> + num = blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); >> if (num) { >> if (rq_data_dir(req) == WRITE) >> vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, 
VIRTIO_BLK_T_OUT); >> @@ -300,7 +316,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> } >> >> spin_lock_irqsave(&vblk->vqs[qid].lock, flags); >> - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); >> + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); >> if (err) { >> virtqueue_kick(vblk->vqs[qid].vq); >> /* Don't stop the queue if -ENOMEM: we may have failed to >> @@ -309,6 +325,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> if (err == -ENOSPC) >> blk_mq_stop_hw_queue(hctx); >> spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); >> + sg_free_table_chained(&vbr->sg_table, >> + VIRTIO_BLK_INLINE_SG_CNT); >> switch (err) { >> case -ENOSPC: >> return BLK_STS_DEV_RESOURCE; >> @@ -687,16 +705,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { >> NULL, >> }; >> >> -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, >> - unsigned int hctx_idx, unsigned int numa_node) >> -{ >> - struct virtio_blk *vblk = set->driver_data; >> - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); >> - >> - sg_init_table(vbr->sg, vblk->sg_elems); >> - return 0; >> -} >> - >> static int virtblk_map_queues(struct blk_mq_tag_set *set) >> { >> struct virtio_blk *vblk = set->driver_data; >> @@ -709,7 +717,6 @@ static const struct blk_mq_ops virtio_mq_ops = { >> .queue_rq = virtio_queue_rq, >> .commit_rqs = virtio_commit_rqs, >> .complete = virtblk_request_done, >> - .init_request = virtblk_init_request, >> .map_queues = virtblk_map_queues, >> }; >> >> @@ -805,7 +812,7 @@ static int virtblk_probe(struct virtio_device *vdev) >> vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; >> vblk->tag_set.cmd_size = >> sizeof(struct virtblk_req) + >> - sizeof(struct scatterlist) * sg_elems; >> + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; >> vblk->tag_set.driver_data = vblk; >> vblk->tag_set.nr_hw_queues = vblk->num_vqs; >> >> -- >> 2.18.1 >>
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
On Mon, Sep 27, 2021 at 12:53:14PM +0100, Christoph Hellwig wrote: > Looks good, > > Reviewed-by: Christoph Hellwig <hch@lst.de> Err, sorry. This was supposed to go to the latest iteration, I'll add it there.
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 77e8468e8593..9a4c5d428b58 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -24,6 +24,12 @@ /* The maximum number of sg elements that fit into a virtqueue */ #define VIRTIO_BLK_MAX_SG_ELEMS 32768 +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define VIRTIO_BLK_INLINE_SG_CNT 0 +#else +#define VIRTIO_BLK_INLINE_SG_CNT 2 +#endif + static int virtblk_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -99,7 +105,7 @@ struct virtio_blk { struct virtblk_req { struct virtio_blk_outhdr out_hdr; u8 status; - struct scatterlist sg[]; + struct sg_table sg_table; }; static inline blk_status_t virtblk_result(struct virtblk_req *vbr) @@ -188,6 +194,8 @@ static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + sg_free_table_chained(&vbr->sg_table, VIRTIO_BLK_INLINE_SG_CNT); + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { kfree(page_address(req->special_vec.bv_page) + req->special_vec.bv_offset); @@ -291,7 +299,15 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_RESOURCE; } - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); + vbr->sg_table.sgl = (struct scatterlist *)(vbr + 1); + err = sg_alloc_table_chained(&vbr->sg_table, + blk_rq_nr_phys_segments(req), + vbr->sg_table.sgl, + VIRTIO_BLK_INLINE_SG_CNT); + if (err) + return BLK_STS_RESOURCE; + + num = blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); if (num) { if (rq_data_dir(req) == WRITE) vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); @@ -300,7 +316,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); + err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num); if (err) { virtqueue_kick(vblk->vqs[qid].vq); /* Don't stop the queue if -ENOMEM: we may have failed to @@ -309,6 
+325,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, if (err == -ENOSPC) blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + sg_free_table_chained(&vbr->sg_table, + VIRTIO_BLK_INLINE_SG_CNT); switch (err) { case -ENOSPC: return BLK_STS_DEV_RESOURCE; @@ -687,16 +705,6 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct virtio_blk *vblk = set->driver_data; - struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); - - sg_init_table(vbr->sg, vblk->sg_elems); - return 0; -} - static int virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; @@ -709,7 +717,6 @@ static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .commit_rqs = virtio_commit_rqs, .complete = virtblk_request_done, - .init_request = virtblk_init_request, .map_queues = virtblk_map_queues, }; @@ -805,7 +812,7 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + - sizeof(struct scatterlist) * sg_elems; + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; vblk->tag_set.driver_data = vblk; vblk->tag_set.nr_hw_queues = vblk->num_vqs;