@@ -2365,11 +2365,63 @@ static void blk_mq_release_rqs_page(struct page *page)
__free_pages(page, blk_mq_rqs_page_order(page));
}

+#define SHRINK_RQS_PAGE_DELAY (10 * HZ)
+
static void blk_mq_free_rqs_page(struct blk_mq_tag_set *set, struct page *page)
{
spin_lock(&set->free_page_list_lock);
list_add_tail(&page->lru, &set->free_page_list);
spin_unlock(&set->free_page_list_lock);
+
+ schedule_delayed_work(&set->rqs_page_shrink, SHRINK_RQS_PAGE_DELAY);
+}
+
+static bool blk_mq_can_shrink_rqs_page(struct blk_mq_tag_set *set,
+ struct page *pg)
+{
+ unsigned hctx_idx = blk_mq_rqs_page_hctx_idx(pg);
+ struct blk_mq_tags *tags = set->tags[hctx_idx];
+ unsigned long start = (unsigned long)page_address(pg);
+ unsigned long end = start + order_to_size(blk_mq_rqs_page_order(pg));
+ int i;
+
+ for (i = 0; i < set->queue_depth; i++) {
+ unsigned long rq_addr = (unsigned long)tags->rqs[i];
+ if (rq_addr >= start && rq_addr < end)
+ return false;
+ }
+ return true;
+}
+
+static void blk_mq_rqs_page_shrink_work(struct work_struct *work)
+{
+ struct blk_mq_tag_set *set =
+ container_of(work, struct blk_mq_tag_set, rqs_page_shrink.work);
+ LIST_HEAD(pg_list);
+ struct page *page, *tmp;
+ bool resched;
+
+ spin_lock(&set->free_page_list_lock);
+ list_splice_init(&set->free_page_list, &pg_list);
+ spin_unlock(&set->free_page_list_lock);
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry_safe(page, tmp, &pg_list, lru) {
+ if (blk_mq_can_shrink_rqs_page(set, page)) {
+ list_del_init(&page->lru);
+ blk_mq_release_rqs_page(page);
+ }
+ }
+ mutex_unlock(&set->tag_list_lock);
+
+ spin_lock(&set->free_page_list_lock);
+ list_splice_init(&pg_list, &set->free_page_list);
+ resched = !list_empty(&set->free_page_list);
+ spin_unlock(&set->free_page_list_lock);
+
+ if (resched)
+ schedule_delayed_work(&set->rqs_page_shrink,
+ SHRINK_RQS_PAGE_DELAY);
}

static void blk_mq_release_all_rqs_page(struct blk_mq_tag_set *set)
@@ -2377,6 +2429,8 @@ static void blk_mq_release_all_rqs_page(struct blk_mq_tag_set *set)
struct page *page;
LIST_HEAD(pg_list);
+ cancel_delayed_work_sync(&set->rqs_page_shrink);
+
spin_lock(&set->free_page_list_lock);
list_splice_init(&set->free_page_list, &pg_list);
spin_unlock(&set->free_page_list_lock);
@@ -3527,6 +3581,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
spin_lock_init(&set->free_page_list_lock);
INIT_LIST_HEAD(&set->free_page_list);
+ INIT_DELAYED_WORK(&set->rqs_page_shrink, blk_mq_rqs_page_shrink_work);
ret = blk_mq_alloc_map_and_requests(set);
if (ret)
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -250,6 +250,7 @@ struct blk_mq_tag_set {
spinlock_t free_page_list_lock;
struct list_head free_page_list;
+ struct delayed_work rqs_page_shrink;
};

/**
Request pool pages may take up quite a bit of space, and each request queue may hold at most one unused request pool, so the memory waste can be large when there are many request queues.

Schedule a delayed work to check whether tags->rqs[] may still refer to requests in a freed request pool page. If no request in tags->rqs[] refers to the freed request pool page, release the page right away; otherwise, re-schedule the delayed work to check and release the pages 10 seconds later.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c         | 55 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 56 insertions(+)
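
The core of the change is the address-range check in blk_mq_can_shrink_rqs_page(): a request pool page may only be released once no pointer in tags->rqs[] still falls inside it. Below is a rough, self-contained userspace sketch of that idea; it is not kernel code and not part of the patch, and every name in it (page_is_unreferenced(), old_pool, new_pool, QUEUE_DEPTH) is invented purely for illustration.

/*
 * Standalone illustration only -- not part of the patch. A "page" spans
 * [start, start + size); it can be released only when no entry in the
 * rqs[] pointer table points into that range, mirroring the test that
 * blk_mq_can_shrink_rqs_page() performs against tags->rqs[].
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define QUEUE_DEPTH 4	/* made-up depth for the example */

static bool page_is_unreferenced(void *page_start, size_t page_size,
				 void *rqs[], unsigned int depth)
{
	uintptr_t start = (uintptr_t)page_start;
	uintptr_t end = start + page_size;
	unsigned int i;

	for (i = 0; i < depth; i++) {
		uintptr_t rq = (uintptr_t)rqs[i];

		/* A request pointer still lands inside the page: keep it. */
		if (rq >= start && rq < end)
			return false;
	}
	return true;
}

int main(void)
{
	static char old_pool[256], new_pool[256];
	void *rqs[QUEUE_DEPTH];
	int i;

	/* Every rqs[] slot still points into the old pool page. */
	for (i = 0; i < QUEUE_DEPTH; i++)
		rqs[i] = &old_pool[i * 64];
	printf("old pool releasable: %d\n",
	       page_is_unreferenced(old_pool, sizeof(old_pool), rqs, QUEUE_DEPTH));

	/* Once the slots point into a new pool, the old page becomes freeable. */
	for (i = 0; i < QUEUE_DEPTH; i++)
		rqs[i] = &new_pool[i * 64];
	printf("old pool releasable: %d\n",
	       page_is_unreferenced(old_pool, sizeof(old_pool), rqs, QUEUE_DEPTH));

	return 0;
}

In the patch itself this check runs from blk_mq_rqs_page_shrink_work() under set->tag_list_lock; pages that still fail it are spliced back onto set->free_page_list, and the delayed work is re-armed so they get another chance 10 seconds later.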