@@ -4,3 +4,4 @@ obj-y += \
ioctl.o \
main.o
obj-$(CONFIG_X86_SGX_KVM) += virt.o
+obj-$(CONFIG_CGROUP_MISC) += epc_cgroup.o
new file mode 100644
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022-2024 Intel Corporation. */
+
+#include <linux/slab.h>
+#include "epc_cgroup.h"
+
+/* The root SGX EPC cgroup */
+static struct sgx_cgroup sgx_cg_root;
+
+/**
+ * sgx_cgroup_try_charge() - try to charge cgroup for a single EPC page
+ *
+ * @sgx_cg: The EPC cgroup to be charged for the page.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -errno - for failures.
+ */
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+{
+ return misc_cg_try_charge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
+}
+
+/**
+ * sgx_cgroup_uncharge() - uncharge a cgroup for an EPC page
+ * @sgx_cg: The charged sgx cgroup.
+ */
+void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg)
+{
+ misc_cg_uncharge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
+}
+
+static void sgx_cgroup_free(struct misc_cg *cg)
+{
+ struct sgx_cgroup *sgx_cg;
+
+ sgx_cg = sgx_cgroup_from_misc_cg(cg);
+ if (!sgx_cg)
+ return;
+
+ kfree(sgx_cg);
+}
+
+static void sgx_cgroup_misc_init(struct misc_cg *cg, struct sgx_cgroup *sgx_cg)
+{
+ cg->res[MISC_CG_RES_SGX_EPC].priv = sgx_cg;
+ sgx_cg->cg = cg;
+}
+
+static int sgx_cgroup_alloc(struct misc_cg *cg)
+{
+ struct sgx_cgroup *sgx_cg;
+
+ sgx_cg = kzalloc(sizeof(*sgx_cg), GFP_KERNEL);
+ if (!sgx_cg)
+ return -ENOMEM;
+
+ sgx_cgroup_misc_init(cg, sgx_cg);
+
+ return 0;
+}
+
+const struct misc_res_ops sgx_cgroup_ops = {
+ .alloc = sgx_cgroup_alloc,
+ .free = sgx_cgroup_free,
+};
+
+int __init sgx_cgroup_init(void)
+{
+ sgx_cgroup_misc_init(misc_cg_root(), &sgx_cg_root);
+
+ return 0;
+}
+
+/**
+ * sgx_cgroup_register() - Register capacity and ops for SGX cgroup.
+ * Only called at the end of sgx_init() when SGX is ready to handle the ops
+ * callbacks.
+ */
+void __init sgx_cgroup_register(void)
+{
+ unsigned int nid = first_node(sgx_numa_mask);
+ unsigned int first = nid;
+ u64 capacity = 0;
+
+ misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_cgroup_ops);
+
+ /* sgx_numa_mask is not empty when this is called */
+ do {
+ capacity += sgx_numa_nodes[nid].size;
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != first);
+ misc_cg_set_capacity(MISC_CG_RES_SGX_EPC, capacity);
+}
new file mode 100644
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SGX_EPC_CGROUP_H_
+#define _SGX_EPC_CGROUP_H_
+
+#include <asm/sgx.h>
+#include <linux/cgroup.h>
+#include <linux/misc_cgroup.h>
+
+#include "sgx.h"
+
+#ifndef CONFIG_CGROUP_MISC
+
+#define MISC_CG_RES_SGX_EPC MISC_CG_RES_TYPES
+struct sgx_cgroup;
+
+static inline struct sgx_cgroup *sgx_get_current_cg(void)
+{
+ return NULL;
+}
+
+static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg) { }
+
+static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+{
+ return 0;
+}
+
+static inline void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg) { }
+
+static inline int __init sgx_cgroup_init(void)
+{
+ return 0;
+}
+
+static inline void __init sgx_cgroup_register(void) { }
+
+#else /* CONFIG_CGROUP_MISC */
+
+struct sgx_cgroup {
+ struct misc_cg *cg;
+};
+
+static inline struct sgx_cgroup *sgx_cgroup_from_misc_cg(struct misc_cg *cg)
+{
+ return (struct sgx_cgroup *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
+}
+
+/**
+ * sgx_get_current_cg() - get the EPC cgroup of current process.
+ *
+ * Returned cgroup has its ref count increased by 1. Caller must call
+ * sgx_put_cg() to return the reference.
+ *
+ * Return: EPC cgroup to which the current task belongs to.
+ */
+static inline struct sgx_cgroup *sgx_get_current_cg(void)
+{
+ /* get_current_misc_cg() never returns NULL when Kconfig enabled */
+ return sgx_cgroup_from_misc_cg(get_current_misc_cg());
+}
+
+/**
+ * sgx_put_cg() - Put the EPC cgroup and reduce its ref count.
+ * @sgx_cg - EPC cgroup to put.
+ */
+static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg)
+{
+ put_misc_cg(sgx_cg->cg);
+}
+
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg);
+void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg);
+int __init sgx_cgroup_init(void);
+void __init sgx_cgroup_register(void);
+
+#endif /* CONFIG_CGROUP_MISC */
+
+#endif /* _SGX_EPC_CGROUP_H_ */
@@ -18,6 +18,7 @@
#include "driver.h"
#include "encl.h"
#include "encls.h"
+#include "epc_cgroup.h"
struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
@@ -35,14 +36,14 @@ static DEFINE_SPINLOCK(sgx_reclaimer_lock);
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
/* Nodes with one or more EPC sections. */
-static nodemask_t sgx_numa_mask;
+nodemask_t sgx_numa_mask;
/*
* Array with one list_head for each possible NUMA node. Each
* list contains all the sgx_epc_section's which are on that
* node.
*/
-static struct sgx_numa_node *sgx_numa_nodes;
+struct sgx_numa_node *sgx_numa_nodes;
static LIST_HEAD(sgx_dirty_page_list);
@@ -559,7 +560,16 @@ int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
*/
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
{
+ struct sgx_cgroup *sgx_cg;
struct sgx_epc_page *page;
+ int ret;
+
+ sgx_cg = sgx_get_current_cg();
+ ret = sgx_cgroup_try_charge(sgx_cg);
+ if (ret) {
+ sgx_put_cg(sgx_cg);
+ return ERR_PTR(ret);
+ }
for ( ; ; ) {
page = __sgx_alloc_epc_page();
@@ -568,8 +578,10 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
break;
}
- if (list_empty(&sgx_active_page_list))
- return ERR_PTR(-ENOMEM);
+ if (list_empty(&sgx_active_page_list)) {
+ page = ERR_PTR(-ENOMEM);
+ break;
+ }
if (reclaim == SGX_NO_RECLAIM) {
page = ERR_PTR(-EBUSY);
@@ -585,6 +597,15 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
cond_resched();
}
+ if (!IS_ERR(page)) {
+ WARN_ON_ONCE(sgx_epc_page_get_cgroup(page));
+ /* sgx_put_cg() in sgx_free_epc_page() */
+ sgx_epc_page_set_cgroup(page, sgx_cg);
+ } else {
+ sgx_cgroup_uncharge(sgx_cg);
+ sgx_put_cg(sgx_cg);
+ }
+
if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
wake_up(&ksgxd_waitq);
@@ -603,8 +624,16 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_cgroup *sgx_cg = sgx_epc_page_get_cgroup(page);
struct sgx_numa_node *node = section->node;
+ /* sgx_cg could be NULL if called from __sgx_sanitize_pages() */
+ if (sgx_cg) {
+ sgx_cgroup_uncharge(sgx_cg);
+ sgx_put_cg(sgx_cg);
+ sgx_epc_page_set_cgroup(page, NULL);
+ }
+
spin_lock(&node->lock);
page->owner = NULL;
@@ -644,6 +673,8 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
section->pages[i].poison = 0;
+ sgx_epc_page_set_cgroup(&section->pages[i], NULL);
+
 list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
@@ -930,6 +961,9 @@ static int __init sgx_init(void)
if (ret)
goto err_kthread;
+ ret = sgx_cgroup_init();
+ if (ret)
+ goto err_provision;
/*
* Always try to initialize the native *and* KVM drivers.
* The KVM driver is less picky than the native one and
@@ -943,6 +977,8 @@ static int __init sgx_init(void)
if (sgx_vepc_init() && ret)
goto err_provision;
+ sgx_cgroup_register();
+
return 0;
err_provision:
@@ -39,14 +39,35 @@ enum sgx_reclaim {
SGX_DO_RECLAIM
};
+struct sgx_cgroup;
+
struct sgx_epc_page {
unsigned int section;
u16 flags;
u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
+#ifdef CONFIG_CGROUP_MISC
+ struct sgx_cgroup *sgx_cg;
+#endif
};
+static inline void sgx_epc_page_set_cgroup(struct sgx_epc_page *page, struct sgx_cgroup *cg)
+{
+#ifdef CONFIG_CGROUP_MISC
+ page->sgx_cg = cg;
+#endif
+}
+
+static inline struct sgx_cgroup *sgx_epc_page_get_cgroup(struct sgx_epc_page *page)
+{
+#ifdef CONFIG_CGROUP_MISC
+ return page->sgx_cg;
+#else
+ return NULL;
+#endif
+}
+
/*
* Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
* the free page list local to the node is stored here.
@@ -58,6 +79,9 @@ struct sgx_numa_node {
spinlock_t lock;
};
+extern nodemask_t sgx_numa_mask;
+extern struct sgx_numa_node *sgx_numa_nodes;
+
/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
@@ -48,6 +48,7 @@ struct misc_res_ops {
* @watermark: Historical maximum usage of the resource.
* @usage: Current usage of the resource.
* @events: Number of times, the resource limit exceeded.
+ * @priv: resource specific data.
*/
struct misc_res {
u64 max;
@@ -55,6 +56,7 @@ struct misc_res {
atomic64_t usage;
atomic64_t events;
atomic64_t events_local;
+ void *priv;
};
/**