new file mode 100644
@@ -0,0 +1,18 @@
+#ifndef __NET_DCALLOC_H
+#define __NET_DCALLOC_H
+
+#include <linux/types.h>
+
+struct device;
+
+struct dma_cocoa;
+
+struct dma_cocoa *dma_cocoa_create(struct device *dev, gfp_t gfp);
+void dma_cocoa_destroy(struct dma_cocoa *cocoa);
+
+void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
+ dma_addr_t *dma, gfp_t gfp);
+void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
+ dma_addr_t dma);
+
+#endif
@@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
- netdev-genl.o netdev-genl-gen.o gso.o
+ netdev-genl.o netdev-genl-gen.o gso.o dcalloc.o
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
new file mode 100644
@@ -0,0 +1,390 @@
+#include "dcalloc.h"
+
+#include <linux/dma-mapping.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+/* Report whether @sal still has a non-zero user count. */
+static bool dma_sal_in_use(struct dma_slow_allocator *sal)
+{
+	unsigned int refs = refcount_read(&sal->user_cnt);
+
+	return refs != 0;
+}
+
+/*
+ * Set up @shu to describe a huge block and seed its buddy list with a single
+ * free entry covering all of @size.
+ *
+ * The tracking entry is allocated with @gfp. Returns 0 on success, or -ENOMEM
+ * if that allocation fails, in which case @shu is left untouched.
+ */
+int dma_slow_huge_init(struct dma_slow_huge *shu, void *addr,
+		       unsigned int size, dma_addr_t dma, gfp_t gfp)
+{
+	struct dma_slow_buddy *whole = kzalloc(sizeof(*whole), gfp);
+
+	if (!whole)
+		return -ENOMEM;
+
+	/* One free buddy spanning the block; offset stays 0 from kzalloc(). */
+	whole->free = true;
+	whole->size = size;
+
+	INIT_LIST_HEAD(&shu->buddy_list);
+	list_add(&whole->list, &shu->buddy_list);
+
+	shu->dma = dma;
+	shu->size = size;
+	shu->addr = addr;
+
+	return 0;
+}
+
+/*
+ * Halve @bud in place and insert a new free buddy describing the upper half
+ * directly after it on the list.
+ *
+ * Returns @bud (now the lower half), or NULL if the new tracking entry could
+ * not be allocated; @bud is unchanged in that case.
+ */
+static struct dma_slow_buddy *
+dma_slow_bud_split(struct dma_slow_buddy *bud, gfp_t gfp)
+{
+	struct dma_slow_buddy *upper;
+	unsigned int half;
+
+	upper = kzalloc(sizeof(*upper), gfp);
+	if (!upper)
+		return NULL;
+
+	half = bud->size / 2;
+	bud->size = half;
+
+	upper->free = true;
+	upper->size = half;
+	upper->offset = bud->offset + half;
+
+	list_add(&upper->list, &bud->list);
+
+	return bud;
+}
+
+/*
+ * Merge the first pair of neighbouring buddies in @shu that can be joined.
+ *
+ * Two list neighbours are merged when both are free, have the same size, and
+ * the left offset ANDed with the right offset equals the left offset. The
+ * left entry doubles in size and the right entry is unlinked and freed.
+ *
+ * Returns true if a merge happened, so the caller can call again until no
+ * more pairs are found; false otherwise.
+ */
+static bool dma_slow_bud_coalesce(struct dma_slow_huge *shu)
+{
+	struct dma_slow_buddy *cur, *prev = NULL;
+
+	list_for_each_entry(cur, &shu->buddy_list, list) {
+		if (prev && prev->free && cur->free &&
+		    prev->size == cur->size &&
+		    (prev->offset & cur->offset) == prev->offset) {
+			prev->size *= 2;
+			list_del(&cur->list);
+			kfree(cur);
+			return true;
+		}
+		prev = cur;
+	}
+
+	return false;
+}
+
+/*
+ * Try to carve @size out of a single huge block.
+ *
+ * Picks the smallest free buddy in @shu that is at least @size, then keeps
+ * halving it while half of it would still satisfy the request. On success the
+ * chosen buddy is marked busy, *@dma is set to its DMA address and the
+ * matching CPU address is returned.
+ *
+ * Returns NULL if @shu is smaller than @size, if no free buddy is large
+ * enough, or if a split fails to allocate its tracking entry (halves that
+ * were already split off stay on the list as free buddies).
+ *
+ * NOTE(review): with @size == 0 the split loop condition is always true;
+ * callers appear to be expected to pass a non-zero size - confirm.
+ */
+static void *
+__dma_sal_alloc_buddy(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+ unsigned int size, dma_addr_t *dma, gfp_t gfp)
+{
+ struct dma_slow_buddy *small_fit = NULL;
+ struct dma_slow_buddy *bud;
+
+ if (shu->size < size)
+ return NULL;
+
+ /* Best-fit scan: remember the smallest free buddy that is big enough. */
+ list_for_each_entry(bud, &shu->buddy_list, list) {
+ if (!bud->free || bud->size < size)
+ continue;
+
+ if (!small_fit || small_fit->size > bud->size)
+ small_fit = bud;
+ /* An exact match cannot be beaten, stop looking. */
+ if (bud->size == size)
+ break;
+ }
+ if (!small_fit)
+ return NULL;
+ bud = small_fit;
+
+ /* Split down until halving again would make the buddy too small. */
+ while (bud->size >= size * 2) {
+ bud = dma_slow_bud_split(bud, gfp);
+ if (!bud)
+ return NULL;
+ }
+
+ bud->free = false;
+ *dma = shu->dma + bud->offset;
+ /* The CPU-side offset is the buddy offset shifted right by ops->ptr_shf. */
+ return shu->addr + (bud->offset >> sal->ops->ptr_shf);
+}
+
+/*
+ * Allocate @size from the buddy-managed huge blocks of @sal.
+ *
+ * Every huge block already on the list is tried first. If none can satisfy
+ * the request and the ops provide alloc_huge, a new huge block is obtained,
+ * added to the list and tried. Returns the CPU address with *@dma filled in,
+ * or NULL on failure.
+ */
+static void *
+dma_sal_alloc_buddy(struct dma_slow_allocator *sal, unsigned int size,
+		    dma_addr_t *dma, gfp_t gfp)
+{
+	struct dma_slow_huge *fresh, *pos;
+	void *ret;
+
+	list_for_each_entry(pos, &sal->huge, huge) {
+		ret = __dma_sal_alloc_buddy(sal, pos, size, dma, gfp);
+		if (ret)
+			return ret;
+	}
+
+	/* Without alloc_huge the set of huge blocks cannot grow. */
+	if (!sal->ops->alloc_huge)
+		return NULL;
+
+	fresh = kzalloc(sizeof(*fresh), gfp);
+	if (!fresh)
+		return NULL;
+
+	if (sal->ops->alloc_huge(sal, fresh, size, gfp) != 0) {
+		kfree(fresh);
+		return NULL;
+	}
+
+	list_add(&fresh->huge, &sal->huge);
+	return __dma_sal_alloc_buddy(sal, fresh, size, dma, gfp);
+}
+
+/*
+ * Mark the buddy of @shu whose CPU address equals @addr as free.
+ *
+ * @size and @dma are only used for sanity checking: a mismatch against the
+ * buddy's recorded size or expected DMA address, or an already-free buddy,
+ * triggers a warning but the buddy is still marked free.
+ *
+ * Returns true if a buddy with a matching address was found in @shu, false
+ * otherwise.
+ *
+ * NOTE(review): a detected double free still returns true, so the caller
+ * treats it like a regular free - confirm this is intended.
+ */
+static bool
+__dma_sal_free_buddy(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+		     void *addr, unsigned int size, dma_addr_t dma)
+{
+	struct dma_slow_buddy *bud;
+	dma_addr_t exp_dma;
+	void *exp_addr;
+
+	list_for_each_entry(bud, &shu->buddy_list, list) {
+		exp_dma = shu->dma + bud->offset;
+		exp_addr = shu->addr + (bud->offset >> sal->ops->ptr_shf);
+
+		if (exp_addr != addr)
+			continue;
+
+		if (exp_dma != dma || bud->size != size)
+			pr_warn("mep param mismatch: %u %u, %lu %lu\n",
+				bud->size, size, (ulong)exp_dma, (ulong)dma);
+		/* @size is unsigned int, so print it with %u rather than %d. */
+		if (bud->free)
+			pr_warn("double free: %u %lu\n", size, (ulong)dma);
+		bud->free = true;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Release @shu if nothing is allocated from it any more.
+ *
+ * The block counts as idle when the first buddy on its list is free and
+ * spans the whole block. Nothing is released when the ops do not provide
+ * alloc_huge. Otherwise the last buddy entry is freed, the memory is handed
+ * to ops->free_huge, and @shu is unlinked and freed.
+ */
+static void
+dma_slow_maybe_free_huge(struct dma_slow_allocator *sal,
+			 struct dma_slow_huge *shu)
+{
+	struct dma_slow_buddy *first;
+	bool idle;
+
+	first = list_first_entry(&shu->buddy_list, typeof(*first), list);
+	idle = first->free && first->size == shu->size;
+	if (!idle || !sal->ops->alloc_huge)
+		return;
+
+	kfree(first);
+
+	sal->ops->free_huge(sal, shu);
+	list_del(&shu->huge);
+	kfree(shu);
+}
+
+/*
+ * Free @addr if it belongs to one of the huge blocks of @sal.
+ *
+ * Each huge block is searched in turn. On a hit the block's free buddies are
+ * merged as far as possible and the block itself is released if it became
+ * completely idle. @order is passed through as the size used for sanity
+ * checks. Returns true if @addr was found in a huge block.
+ */
+static bool
+dma_sal_free_buddy(struct dma_slow_allocator *sal, void *addr,
+		   unsigned int order, dma_addr_t dma)
+{
+	struct dma_slow_huge *shu;
+
+	list_for_each_entry(shu, &sal->huge, huge) {
+		if (!__dma_sal_free_buddy(sal, shu, addr, order, dma))
+			continue;
+
+		/* Not efficient, but all of SAL is on the config path. */
+		while (dma_slow_bud_coalesce(shu))
+			;
+		/* @shu may be freed here, so leave the loop right away. */
+		dma_slow_maybe_free_huge(sal, shu);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Allocate @size through the fallback path of @sal.
+ *
+ * A tracking entry is allocated with @gfp and handed to ops->alloc_fall,
+ * which is expected to fill in its addr and dma fields. On success the entry
+ * is put on the fallback list, *@dma is set and the CPU address is returned.
+ * Returns NULL if either allocation fails.
+ */
+static void *
+dma_sal_alloc_fb(struct dma_slow_allocator *sal, unsigned int size,
+		 dma_addr_t *dma, gfp_t gfp)
+{
+	struct dma_slow_fall *entry = kzalloc(sizeof(*entry), gfp);
+
+	if (!entry)
+		return NULL;
+
+	entry->size = size;
+	if (sal->ops->alloc_fall(sal, entry, size, gfp) != 0) {
+		kfree(entry);
+		return NULL;
+	}
+
+	list_add(&entry->fb, &sal->fallback);
+	*dma = entry->dma;
+	return entry->addr;
+}
+
+/*
+ * Free @addr if it was handed out by the fallback path of @sal.
+ *
+ * The fallback list is searched for an entry with a matching CPU address. If
+ * none exists a warning is printed and false is returned. Otherwise @size and
+ * @dma are checked against the entry (a mismatch only warns), the entry is
+ * unlinked, released through ops->free_fall and freed. Returns true in that
+ * case.
+ */
+static bool dma_sal_free_fb(struct dma_slow_allocator *sal, void *addr,
+			    unsigned int size, dma_addr_t dma)
+{
+	struct dma_slow_fall *iter, *match = NULL;
+
+	list_for_each_entry(iter, &sal->fallback, fb) {
+		if (iter->addr != addr)
+			continue;
+		match = iter;
+		break;
+	}
+
+	if (!match) {
+		pr_warn("free: address %px not found\n", addr);
+		return false;
+	}
+
+	if (match->size != size || match->dma != dma)
+		pr_warn("free: param mismatch: %u %u, %lu %lu\n",
+			match->size, size, (ulong)match->dma, (ulong)dma);
+
+	list_del(&match->fb);
+	sal->ops->free_fall(sal, match);
+	kfree(match);
+	return true;
+}
+
+/*
+ * Allocate @size from @sal.
+ *
+ * The buddy-managed huge blocks are tried first; if they cannot satisfy the
+ * request the fallback path is used. Each successful allocation takes a
+ * reference on @sal which is dropped again by dma_sal_free().
+ *
+ * Returns the CPU address with *@dma filled in, or NULL if @size is zero or
+ * both paths fail.
+ */
+void *dma_sal_alloc(struct dma_slow_allocator *sal, unsigned int size,
+		    dma_addr_t *dma, gfp_t gfp)
+{
+	void *ret;
+
+	/* A zero-size request would make the buddy split loop never end. */
+	if (!size)
+		return NULL;
+
+	ret = dma_sal_alloc_buddy(sal, size, dma, gfp);
+	if (!ret)
+		ret = dma_sal_alloc_fb(sal, size, dma, gfp);
+	if (!ret)
+		return NULL;
+
+	dma_slow_get(sal);
+	return ret;
+}
+
+/*
+ * Return an allocation to @sal.
+ *
+ * The huge blocks are searched for @addr first, then the fallback list. If
+ * either of them released the allocation, one reference on @sal is dropped;
+ * if neither did, nothing else happens.
+ */
+void dma_sal_free(struct dma_slow_allocator *sal, void *addr,
+		  unsigned int size, dma_addr_t dma)
+{
+	bool released;
+
+	released = dma_sal_free_buddy(sal, addr, size, dma);
+	if (!released)
+		released = dma_sal_free_fb(sal, addr, size, dma);
+
+	if (released)
+		dma_slow_put(sal);
+}
+
+/*
+ * Initialise @sal with the given @ops and @dev.
+ *
+ * Both block lists start out empty and the user count starts at 1.
+ */
+void dma_sal_init(struct dma_slow_allocator *sal,
+		  const struct dma_slow_allocator_ops *ops,
+		  struct device *dev)
+{
+	INIT_LIST_HEAD(&sal->fallback);
+	INIT_LIST_HEAD(&sal->huge);
+
+	refcount_set(&sal->user_cnt, 1);
+
+	sal->dev = dev;
+	sal->ops = ops;
+}
+
+/*****************************
+ *** DMA COCOA allocator ***
+ *****************************/
+/*
+ * alloc_huge callback: back @shu with a 2M coherent DMA allocation.
+ *
+ * Requests of 2M or more are refused with -ENOMEM. Returns 0 on success, or
+ * -ENOMEM if the coherent allocation or the buddy list setup fails; the
+ * coherent memory is released again in the latter case.
+ */
+static int
+dma_cocoa_alloc_huge(struct dma_slow_allocator *sal, struct dma_slow_huge *shu,
+		     unsigned int size, gfp_t gfp)
+{
+	if (size >= SZ_2M)
+		return -ENOMEM;
+
+	shu->addr = dma_alloc_coherent(sal->dev, SZ_2M, &shu->dma, gfp);
+	if (!shu->addr)
+		return -ENOMEM;
+
+	if (!dma_slow_huge_init(shu, shu->addr, SZ_2M, shu->dma, gfp))
+		return 0;
+
+	/* Buddy bookkeeping could not be set up, give the memory back. */
+	dma_free_coherent(sal->dev, SZ_2M, shu->addr, shu->dma);
+	return -ENOMEM;
+}
+
+/* free_huge callback: release the 2M coherent allocation behind @shu. */
+static void
+dma_cocoa_free_huge(struct dma_slow_allocator *sal, struct dma_slow_huge *shu)
+{
+	struct device *dev = sal->dev;
+
+	dma_free_coherent(dev, SZ_2M, shu->addr, shu->dma);
+}
+
+/*
+ * alloc_fall callback: satisfy a single request with its own coherent DMA
+ * allocation of @size. Fills in fb->addr and fb->dma. Returns 0 on success
+ * or -ENOMEM if the allocation fails.
+ */
+static int
+dma_cocoa_alloc_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb,
+		     unsigned int size, gfp_t gfp)
+{
+	void *cpu_addr = dma_alloc_coherent(sal->dev, size, &fb->dma, gfp);
+
+	fb->addr = cpu_addr;
+	return cpu_addr ? 0 : -ENOMEM;
+}
+
+/* free_fall callback: release the coherent allocation recorded in @fb. */
+static void
+dma_cocoa_free_fall(struct dma_slow_allocator *sal, struct dma_slow_fall *fb)
+{
+	struct device *dev = sal->dev;
+
+	dma_free_coherent(dev, fb->size, fb->addr, fb->dma);
+}
+
+/*
+ * Callbacks of the coherent ("cocoa") allocator: 2M coherent huge blocks with
+ * per-request coherent allocations as fallback. ptr_shf is left at 0 and no
+ * release callback is set.
+ *
+ * Only referenced from this file through a const pointer, so keep it
+ * file-local and read-only.
+ */
+static const struct dma_slow_allocator_ops dma_cocoa_ops = {
+	.alloc_huge	= dma_cocoa_alloc_huge,
+	.free_huge	= dma_cocoa_free_huge,
+	.alloc_fall	= dma_cocoa_alloc_fall,
+	.free_fall	= dma_cocoa_free_fall,
+};
+
+/*
+ * Coherent allocator instance. The public header only forward-declares this
+ * type; it is a thin wrapper around the generic slow allocator state.
+ */
+struct dma_cocoa {
+ struct dma_slow_allocator sal;
+};
+
+/*
+ * Create a coherent allocator for @dev.
+ *
+ * The instance is allocated with @gfp and initialised with the cocoa ops.
+ * Returns the new allocator, or NULL if the allocation fails.
+ */
+struct dma_cocoa *dma_cocoa_create(struct device *dev, gfp_t gfp)
+{
+	struct dma_cocoa *cocoa = kzalloc(sizeof(*cocoa), gfp);
+
+	if (cocoa)
+		dma_sal_init(&cocoa->sal, &dma_cocoa_ops, dev);
+
+	return cocoa;
+}
+
+/*
+ * Destroy @cocoa.
+ *
+ * Drops the reference taken at init time and warns if the user count is
+ * still non-zero afterwards. The instance is freed in either case.
+ */
+void dma_cocoa_destroy(struct dma_cocoa *cocoa)
+{
+	struct dma_slow_allocator *sal = &cocoa->sal;
+
+	dma_slow_put(sal);
+	WARN_ON(dma_sal_in_use(sal));
+	kfree(cocoa);
+}
+
+/*
+ * Allocate zeroed coherent memory from @cocoa.
+ *
+ * @size is rounded up to the next power of two before the request is passed
+ * to the slow allocator, and the returned memory is cleared over that rounded
+ * size. On success *@dma holds the DMA address.
+ *
+ * Returns the CPU address, or NULL if @size is zero, if the rounded size
+ * would not fit the allocator's unsigned int size, or if allocation fails.
+ */
+void *dma_cocoa_alloc(struct dma_cocoa *cocoa, unsigned long size,
+		      dma_addr_t *dma, gfp_t gfp)
+{
+	void *addr;
+
+	/*
+	 * roundup_pow_of_two(0) is undefined, and anything above 2^31 would
+	 * be truncated when narrowed to the allocator's unsigned int size
+	 * while the memset() below would still use the full value.
+	 */
+	if (!size || size > (1UL << 31))
+		return NULL;
+
+	size = roundup_pow_of_two(size);
+	addr = dma_sal_alloc(&cocoa->sal, size, dma, gfp);
+	if (!addr)
+		return NULL;
+	memset(addr, 0, size);
+	return addr;
+}
+
+/*
+ * Return memory obtained from dma_cocoa_alloc() to @cocoa.
+ *
+ * @size is rounded up to the next power of two, the same way the allocation
+ * path rounds it, before being passed on for the allocator's sanity checks.
+ */
+void dma_cocoa_free(struct dma_cocoa *cocoa, unsigned long size, void *addr,
+		    dma_addr_t dma)
+{
+	size = roundup_pow_of_two(size);
+	/* No "return": a void function must not return an expression. */
+	dma_sal_free(&cocoa->sal, addr, size, dma);
+}
new file mode 100644
@@ -0,0 +1,93 @@
+#ifndef __DCALLOC_H
+#define __DCALLOC_H
+
+#include <linux/dma-mapping.h>
+#include <net/dcalloc.h>
+
+struct device;
+
+/* struct dma_slow_huge - AKA @shu, large block which will get chopped up
+ *
+ * @addr: CPU address of the start of the block
+ * @size: total size of the block
+ * @dma: DMA address of the start of the block
+ * @huge: link on the owning allocator's list of huge blocks
+ * @buddy_list: pieces the block is currently split into
+ */
+struct dma_slow_huge {
+ void *addr;
+ unsigned int size;
+ dma_addr_t dma;
+
+ struct list_head huge;
+ struct list_head buddy_list; /* struct dma_slow_buddy */
+};
+
+/* Single allocation piece
+ *
+ * @offset: position of the piece relative to the start of its huge block
+ * @size: size of the piece
+ * @free: true while the piece is not handed out
+ * @list: link on the huge block's buddy_list
+ */
+struct dma_slow_buddy {
+ unsigned int offset;
+ unsigned int size;
+
+ bool free;
+
+ struct list_head list;
+};
+
+/* struct dma_slow_fall - AKA @fb, fallback when huge can't be allocated
+ *
+ * @addr: CPU address of the allocation, filled in by ops->alloc_fall
+ * @size: requested size, recorded before ops->alloc_fall is called
+ * @dma: DMA address of the allocation, filled in by ops->alloc_fall
+ * @fb: link on the owning allocator's fallback list
+ */
+struct dma_slow_fall {
+ void *addr;
+ unsigned int size;
+ dma_addr_t dma;
+
+ struct list_head fb;
+};
+
+/* struct dma_slow_allocator - AKA @sal, per device allocator
+ *
+ * @ops: callbacks used to obtain and release backing memory
+ * @dev: device set by dma_sal_init(), available to the callbacks
+ * @ptr_shf: NOTE(review): the allocator code reads ops->ptr_shf; no user of
+ *           this field is visible here - confirm whether it is needed
+ * @user_cnt: set to 1 by dma_sal_init(), raised by each successful
+ *            dma_sal_alloc() and dropped by each successful dma_sal_free()
+ * @huge: huge blocks that get split into buddies
+ * @fallback: individually tracked fallback allocations
+ */
+struct dma_slow_allocator {
+ const struct dma_slow_allocator_ops *ops;
+ struct device *dev;
+
+ unsigned int ptr_shf;
+ refcount_t user_cnt;
+
+ struct list_head huge; /* struct dma_slow_huge */
+ struct list_head fallback; /* struct dma_slow_fall */
+};
+
+/* struct dma_slow_allocator_ops - backend callbacks of a slow allocator
+ *
+ * @ptr_shf: right shift applied to a buddy offset when it is added to the
+ *           huge block's CPU address
+ * @alloc_huge: optional; obtain memory for a new huge block that must fit
+ *              @size. Returns 0 on success, non-zero on failure. When it is
+ *              NULL no new huge blocks are created and idle ones are not
+ *              released.
+ * @free_huge: release the memory of a huge block that is completely free
+ *             (the parameter is named @fb here but is a huge block)
+ * @alloc_fall: obtain memory for one fallback allocation of @size; the
+ *              allocator reads the entry's addr and dma fields afterwards.
+ *              Returns 0 on success, non-zero on failure.
+ * @free_fall: release the memory of one fallback allocation
+ * @release: optional; called when the allocator's user count drops to zero
+ */
+struct dma_slow_allocator_ops {
+ u8 ptr_shf;
+
+ int (*alloc_huge)(struct dma_slow_allocator *sal,
+ struct dma_slow_huge *shu,
+ unsigned int size, gfp_t gfp);
+ void (*free_huge)(struct dma_slow_allocator *sal,
+ struct dma_slow_huge *fb);
+ int (*alloc_fall)(struct dma_slow_allocator *sal,
+ struct dma_slow_fall *fb,
+ unsigned int size, gfp_t gfp);
+ void (*free_fall)(struct dma_slow_allocator *sal,
+ struct dma_slow_fall *fb);
+
+ void (*release)(struct dma_slow_allocator *sal);
+};
+
+int dma_slow_huge_init(struct dma_slow_huge *shu, void *addr,
+ unsigned int size, dma_addr_t dma, gfp_t gfp);
+
+void dma_sal_init(struct dma_slow_allocator *sal,
+ const struct dma_slow_allocator_ops *ops,
+ struct device *dev);
+
+void *dma_sal_alloc(struct dma_slow_allocator *sal, unsigned int size,
+ dma_addr_t *dma, gfp_t gfp);
+void dma_sal_free(struct dma_slow_allocator *sal, void *addr,
+ unsigned int size, dma_addr_t dma);
+
+/* Take one reference on @sal. */
+static inline void dma_slow_get(struct dma_slow_allocator *sal)
+{
+	refcount_t *users = &sal->user_cnt;
+
+	refcount_inc(users);
+}
+
+/*
+ * Drop one reference on @sal. When the count reaches zero and the ops
+ * provide a release callback, that callback is invoked.
+ */
+static inline void dma_slow_put(struct dma_slow_allocator *sal)
+{
+	if (refcount_dec_and_test(&sal->user_cnt) && sal->ops->release)
+		sal->ops->release(sal);
+}
+
+#endif
Implement a simple buddy allocator with a fallback. It will be used to split huge pages into smaller pools, and fall back to alloc_pages() if huge pages are exhausted. This code will be used exclusively on slow paths and is generally "not great" but it doesn't seem to immediately crash which is good enough for now? This patch contains a basic "coherent allocator" which splits 2M coherently mapped pages into smaller chunks. Certain drivers appear to allocate a few MB in single coherent pages which is not great for IOTLB pressure (simple iperf test on bnxt with Rx backed by huge pages goes from 170k IOTLB misses to 60k when using this). Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/net/dcalloc.h | 18 ++ net/core/Makefile | 2 +- net/core/dcalloc.c | 390 ++++++++++++++++++++++++++++++++++++++++++ net/core/dcalloc.h | 93 ++++++++++ 4 files changed, 502 insertions(+), 1 deletion(-) create mode 100644 include/net/dcalloc.h create mode 100644 net/core/dcalloc.c create mode 100644 net/core/dcalloc.h