[10/10] dmapool: improve scalability of dma_pool_free

Message ID: 801335ba-00f3-12ae-59e0-119d7d8fd8cd@cybernetics.com
State: New
Series: mpt3sas and dmapool scalability

Commit Message

Tony Battersby May 31, 2022, 6:23 p.m. UTC
dma_pool_free() scales poorly when the pool contains many pages because
pool_find_page() does a linear scan of all allocated pages.  Improve its
scalability by replacing the linear scan with a red-black tree lookup.
In big O notation, this improves the algorithm from O(n^2) to O(n * log n).

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
---
 mm/dmapool.c | 128 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 100 insertions(+), 28 deletions(-)
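
The core of the change is a keyed-range lookup: each allocated page owns the DMA range [dma, dma + allocation), so any block's DMA address identifies exactly one page. The minimal userspace sketch below illustrates that comparison logic; a plain unbalanced binary search tree stands in for the kernel's <linux/rbtree.h> (which additionally rebalances on insert), and the type alias, the 4096-byte allocation size, and the helper names are illustrative stand-ins rather than anything taken from the patch.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long dma_addr_t;		/* stand-in for the kernel type */

struct dma_page {
	struct dma_page *left, *right;
	dma_addr_t dma;				/* base DMA address of this page */
};

static const dma_addr_t allocation = 4096;	/* pool->allocation */

/*
 * Same three-way comparison as pool_find_page() in the patch: each
 * node owns the half-open range [dma, dma + allocation).
 */
static struct dma_page *find_page(struct dma_page *node, dma_addr_t dma)
{
	while (node) {
		if (dma < node->dma)
			node = node->left;
		else if (dma - node->dma >= allocation)
			node = node->right;
		else
			return node;		/* dma falls inside this page */
	}
	return NULL;
}

/* Descend to the correct leaf and attach a new page there. */
static int insert_page(struct dma_page **link, dma_addr_t dma)
{
	struct dma_page *page;

	while (*link) {
		if (dma < (*link)->dma)
			link = &(*link)->left;
		else if (dma - (*link)->dma >= allocation)
			link = &(*link)->right;
		else
			return -1;		/* overlapping range: caller bug */
	}
	page = calloc(1, sizeof(*page));
	if (!page)
		return -1;
	page->dma = dma;
	*link = page;
	return 0;
}

int main(void)
{
	struct dma_page *root = NULL;
	struct dma_page *found;
	dma_addr_t base;

	for (base = 0; base < 64 * allocation; base += allocation)
		insert_page(&root, base);

	/* Any address inside a page maps back to that page's base. */
	found = find_page(root, 5 * allocation + 100);
	printf("found page at %#lx\n", found ? found->dma : 0UL);
	return 0;
}

With the real <linux/rbtree.h> the descent is identical; the patch threads it through rb_link_node()/rb_insert_color() so the tree stays balanced, which is what bounds the per-free lookup at O(log n).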

Comments

Keith Busch May 31, 2022, 9:54 p.m. UTC | #1
On Tue, May 31, 2022 at 02:23:44PM -0400, Tony Battersby wrote:
> dma_pool_free() scales poorly when the pool contains many pages because
> pool_find_page() does a linear scan of all allocated pages.  Improve its
> scalability by replacing the linear scan with a red-black tree lookup.
> In big O notation, this improves the algorithm from O(n^2) to O(n * log n).

The improvement should say O(n) to O(log n), right?

Tony Battersby May 31, 2022, 10:10 p.m. UTC | #2
On 5/31/22 17:54, Keith Busch wrote:
> On Tue, May 31, 2022 at 02:23:44PM -0400, Tony Battersby wrote:
>> dma_pool_free() scales poorly when the pool contains many pages because
>> pool_find_page() does a linear scan of all allocated pages.  Improve its
>> scalability by replacing the linear scan with a red-black tree lookup.
>> In big O notation, this improves the algorithm from O(n^2) to O(n * log n).
> The improvement should say O(n) to O(log n), right?

That would be the improvement for a single call to dma_pool_alloc or
dma_pool_free, but I was going with the improvement for "n" calls
instead, which is consistent with the improvement for the example in the
cover letter for mpt3sas.  I would have to look up the convention to be
sure of the proper notation in a situation like this, but I usually
think "inserting N items takes N^2 time"; to me it makes less sense to
say "inserting 1 item takes N time", because the N seems to come out of
nowhere.

Tony

Robin Murphy June 1, 2022, 9:44 a.m. UTC | #3
On 2022-05-31 23:10, Tony Battersby wrote:
> On 5/31/22 17:54, Keith Busch wrote:
>> On Tue, May 31, 2022 at 02:23:44PM -0400, Tony Battersby wrote:
>>> dma_pool_free() scales poorly when the pool contains many pages because
>>> pool_find_page() does a linear scan of all allocated pages.  Improve its
>>> scalability by replacing the linear scan with a red-black tree lookup.
>>> In big O notation, this improves the algorithm from O(n^2) to O(n * log n).
>> The improvement should say O(n) to O(log n), right?
> 
> That would be the improvement for a single call to dma_pool_alloc or
> dma_pool_free, but I was going with the improvement for "n" calls
> instead, which is consistent with the improvement for the example in the
> cover letter for mpt3sas.  I would have to look up the convention to be
> sure of the proper notation in a situation like this, but I usually
> think "inserting N items takes N^2 time"; to me it makes less sense to
> say "inserting 1 item takes N time", because the N seems to come out of
> nowhere.

No, n represents the size of the set that the algorithm is inserting 
into (or removing from, searching within, etc.). Hence why constant time 
is represented by O(1), i.e. not involving the variable at all.

Robin.
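
The two framings reconcile once the counting is made explicit. A single dma_pool_free() against a pool holding n pages costs O(n) with the linear scan and O(log n) with the rb-tree; summed over a workload of n frees, as the commit message and the mpt3sas cover letter do, that becomes:

    sum_{k=1..n} O(k)     = O(n^2)        (linear scan)
    sum_{k=1..n} O(log k) = O(n log n)    (rb-tree lookup)

As a rough worked example with n = 10,000 pages: the linear scan visits about n/2 = 5,000 pages per free, roughly 5 x 10^7 page visits over 10,000 frees, while a balanced tree needs about log2(10000) ~ 14 comparisons per free, roughly 1.4 x 10^5 in total.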

Patch

diff --git a/mm/dmapool.c b/mm/dmapool.c
index b3dd2ace0d2a..24535483f781 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -12,11 +12,12 @@ 
  * Many older drivers still have their own code to do this.
  *
  * The current design of this allocator is fairly simple.  The pool is
- * represented by the 'struct dma_pool' which keeps a doubly-linked list of
- * allocated pages.  Each page in the page_list is split into blocks of at
- * least 'size' bytes.  Free blocks are tracked in an unsorted singly-linked
- * list of free blocks within the page.  Used blocks aren't tracked, but we
- * keep a count of how many are currently allocated from each page.
+ * represented by the 'struct dma_pool' which keeps a red-black tree of all
+ * allocated pages, keyed by DMA address for fast lookup when freeing.
+ * Each page in the page_tree is split into blocks of at least 'size' bytes.
+ * Free blocks are tracked in an unsorted singly-linked list of free blocks
+ * within the page.  Used blocks aren't tracked, but we keep a count of how
+ * many are currently allocated from each page.
  *
  * The avail_page_list keeps track of pages that have one or more free blocks
  * available to (re)allocate.  Pages are moved in and out of avail_page_list
@@ -36,6 +37,7 @@ 
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/spinlock.h>
+#include <linux/rbtree.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/wait.h>
@@ -45,7 +47,7 @@ 
 #endif
 
 struct dma_pool {		/* the pool */
-	struct list_head page_list;
+	struct rb_root page_tree;
 	struct list_head avail_page_list;
 	spinlock_t lock;
 	unsigned int size;
@@ -58,7 +60,7 @@  struct dma_pool {		/* the pool */
 };
 
 struct dma_page {		/* cacheable header for 'allocation' bytes */
-	struct list_head page_list;
+	struct rb_node page_node;
 	struct list_head avail_page_link;
 	void *vaddr;
 	dma_addr_t dma;
@@ -69,6 +71,11 @@  struct dma_page {		/* cacheable header for 'allocation' bytes */
 static DEFINE_MUTEX(pools_lock);
 static DEFINE_MUTEX(pools_reg_lock);
 
+static inline struct dma_page *rb_to_dma_page(struct rb_node *node)
+{
+	return rb_entry(node, struct dma_page, page_node);
+}
+
 static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	unsigned temp;
@@ -76,6 +83,7 @@  static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
 	char *next;
 	struct dma_page *page;
 	struct dma_pool *pool;
+	struct rb_node *node;
 
 	next = buf;
 	size = PAGE_SIZE;
@@ -90,7 +98,10 @@  static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
 		size_t blocks = 0;
 
 		spin_lock_irq(&pool->lock);
-		list_for_each_entry(page, &pool->page_list, page_list) {
+		for (node = rb_first(&pool->page_tree);
+		     node;
+		     node = rb_next(node)) {
+			page = rb_to_dma_page(node);
 			pages++;
 			blocks += page->in_use;
 		}
@@ -169,7 +180,7 @@  struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 
 	retval->dev = dev;
 
-	INIT_LIST_HEAD(&retval->page_list);
+	retval->page_tree = RB_ROOT;
 	INIT_LIST_HEAD(&retval->avail_page_list);
 	spin_lock_init(&retval->lock);
 	retval->size = size;
@@ -213,6 +224,63 @@  struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 }
 EXPORT_SYMBOL(dma_pool_create);
 
+/*
+ * Find the dma_page that manages the given DMA address.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+	struct rb_node *node = pool->page_tree.rb_node;
+
+	while (node) {
+		struct dma_page *page = rb_to_dma_page(node);
+
+		if (dma < page->dma)
+			node = node->rb_left;
+		else if ((dma - page->dma) >= pool->allocation)
+			node = node->rb_right;
+		else
+			return page;
+	}
+	return NULL;
+}
+
+/*
+ * Insert a dma_page into the page_tree.
+ */
+static int pool_insert_page(struct dma_pool *pool, struct dma_page *new_page)
+{
+	dma_addr_t dma = new_page->dma;
+	struct rb_node **node = &(pool->page_tree.rb_node), *parent = NULL;
+
+	while (*node) {
+		struct dma_page *this_page = rb_to_dma_page(*node);
+
+		parent = *node;
+		if (dma < this_page->dma)
+			node = &((*node)->rb_left);
+		else if (likely((dma - this_page->dma) >= pool->allocation))
+			node = &((*node)->rb_right);
+		else {
+			/*
+			 * A page that overlaps the new DMA range is already
+			 * present in the tree.  This should not happen.
+			 */
+			WARN(1,
+			     "%s: %s: DMA address overlap: old %pad new %pad len %u\n",
+			     dev_name(pool->dev),
+			     pool->name, &this_page->dma, &dma,
+			     pool->allocation);
+			return -1;
+		}
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&new_page->page_node, parent, node);
+	rb_insert_color(&new_page->page_node, &pool->page_tree);
+
+	return 0;
+}
+
 static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
 {
 	unsigned int offset = 0;
@@ -276,8 +344,16 @@  static void pool_free_page(struct dma_pool *pool,
 		dma_free_coherent(pool->dev, pool->allocation, vaddr, dma);
 	}
 
-	list_del(&page->page_list);
 	list_del(&page->avail_page_link);
+
+	/*
+	 * If the pool is being destroyed, it is not safe to modify the
+	 * page_tree while iterating over it, and it is also unnecessary since
+	 * the whole tree will be discarded anyway.
+	 */
+	if (!destroying_pool)
+		rb_erase(&page->page_node, &pool->page_tree);
+
 	kfree(page);
 }
 
@@ -291,7 +367,7 @@  static void pool_free_page(struct dma_pool *pool,
  */
 void dma_pool_destroy(struct dma_pool *pool)
 {
-	struct dma_page *page;
+	struct dma_page *page, *tmp;
 	bool empty = false;
 
 	if (unlikely(!pool))
@@ -307,9 +383,10 @@  void dma_pool_destroy(struct dma_pool *pool)
 		device_remove_file(pool->dev, &dev_attr_pools);
 	mutex_unlock(&pools_reg_lock);
 
-	while ((page = list_first_entry_or_null(&pool->page_list,
-						struct dma_page,
-						page_list))) {
+	rbtree_postorder_for_each_entry_safe(page,
+					     tmp,
+					     &pool->page_tree,
+					     page_node) {
 		pool_free_page(pool, page, true);
 	}
 
@@ -353,7 +430,15 @@  void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 
 	spin_lock_irqsave(&pool->lock, flags);
 
-	list_add(&page->page_list, &pool->page_list);
+	if (unlikely(pool_insert_page(pool, page))) {
+		/*
+		 * This should not happen, so something must have gone horribly
+		 * wrong.  Instead of crashing, intentionally leak the memory
+		 * and make for the exit.
+		 */
+		spin_unlock_irqrestore(&pool->lock, flags);
+		return NULL;
+	}
 	list_add(&page->avail_page_link, &pool->avail_page_list);
  ready:
 	page->in_use++;
@@ -395,19 +480,6 @@  void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 }
 EXPORT_SYMBOL(dma_pool_alloc);
 
-static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
-{
-	struct dma_page *page;
-
-	list_for_each_entry(page, &pool->page_list, page_list) {
-		if (dma < page->dma)
-			continue;
-		if ((dma - page->dma) < pool->allocation)
-			return page;
-	}
-	return NULL;
-}
-
 /**
  * dma_pool_free - put block back into dma pool
  * @pool: the dma pool holding the block