diff mbox series

[v12,12/14] x86/sgx: Turn on per-cgroup EPC reclamation

Message ID 20240416032011.58578-13-haitao.huang@linux.intel.com (mailing list archive)
State New
Headers show
Series Add Cgroup support for SGX EPC memory | expand

Commit Message

Haitao Huang April 16, 2024, 3:20 a.m. UTC
From: Kristen Carlson Accardi <kristen@linux.intel.com>

Previous patches have implemented all infrastructure needed for
per-cgroup EPC page tracking and reclaiming. But all reclaimable EPC
pages are still tracked in the global LRU as sgx_lru_list() returns hard
coded reference to the global LRU.

Change sgx_lru_list() to return the LRU of the cgroup in which the given
EPC page is allocated.

This makes all EPC pages tracked in per-cgroup LRUs and the global
reclaimer (ksgxd) will not be able to reclaim any pages from the global
LRU. However, in cases of over-committing, i.e., the sum of cgroup
limits greater than the total capacity, cgroups may never reclaim but
the total usage can still be near the capacity. Therefore a global
reclamation is still needed in those cases and it should be performed
from the root cgroup.

Modify sgx_reclaim_pages_global(), to reclaim from the root EPC cgroup
when cgroup is enabled, otherwise from the global LRU. Export
sgx_cgroup_reclaim_pages() in the header file so it can be reused for
this purpose.

Similarly, modify sgx_can_reclaim(), to check emptiness of LRUs of all
cgroups when EPC cgroup is enabled, otherwise only check the global LRU.
Export sgx_cgroup_lru_empty() so it can be reused for this purpose.

Finally, change sgx_reclaim_direct(), to check and ensure there are free
pages at cgroup level so forward progress can be made by the caller.
Export sgx_cgroup_should_reclaim() for reuse.

With these changes, the global reclamation and per-cgroup reclamation
both work properly with all pages tracked in per-cgroup LRUs.

Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Co-developed-by: Haitao Huang <haitao.huang@linux.intel.com>
Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
Tested-by: Mikko Ylinen <mikko.ylinen@linux.intel.com>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
---
V12:
- Remove CONFIG_CGROUP_SGX_EPC, conditional compile SGX Cgroup for
CONFIGCONFIG_CGROUPMISC. (Jarkko)

V11:
- Reword the comments for global reclamation for allocation failure
after passing cgroup charging. (Kai)
- Add stub functions to remove ifdefs in c file (Kai)
- Add more detailed comments to clarify each page belongs to one cgroup, or the
root. (Kai)

V10:
- Add comment to clarify each page belongs to one cgroup, or the root by
default. (Kai)
- Merge the changes that expose sgx_cgroup_* functions to this patch.
- Add changes for sgx_reclaim_direct() that was missed previously.

V7:
- Split this out from the big patch, #10 in V6. (Dave, Kai)
---
 arch/x86/kernel/cpu/sgx/epc_cgroup.c |  6 ++--
 arch/x86/kernel/cpu/sgx/epc_cgroup.h | 27 +++++++++++++++++
 arch/x86/kernel/cpu/sgx/main.c       | 43 ++++++++++++++++++++++++++--
 3 files changed, 71 insertions(+), 5 deletions(-)

Comments

Huang, Kai April 29, 2024, 10:49 a.m. UTC | #1
> +/*
> + * Get the per-cgroup or global LRU list that tracks the given reclaimable page.
> + */
>  static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_page)
>  {
> +#ifdef CONFIG_CGROUP_MISC
> +	/*
> +	 * epc_page->sgx_cg here is never NULL during a reclaimable epc_page's
> +	 * life between sgx_alloc_epc_page() and sgx_free_epc_page():
> +	 *
> +	 * In sgx_alloc_epc_page(), epc_page->sgx_cg is set to the return from
> +	 * sgx_get_current_cg() which is the misc cgroup of the current task, or
> +	 * the root by default even if the misc cgroup is disabled by kernel
> +	 * command line.
> +	 *
> +	 * epc_page->sgx_cg is only unset by sgx_free_epc_page().
> +	 *
> +	 * This function is never used before sgx_alloc_epc_page() or after
> +	 * sgx_free_epc_page().
> +	 */
> +	return &epc_page->sgx_cg->lru;
> +#else
>  	return &sgx_global_lru;
> +#endif
>  }
>  
>  /*
> @@ -42,7 +63,8 @@ static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_pag
>   */
>  static inline bool sgx_can_reclaim(void)
>  {
> -	return !list_empty(&sgx_global_lru.reclaimable);
> +	return !sgx_cgroup_lru_empty(misc_cg_root()) ||
> +	       !list_empty(&sgx_global_lru.reclaimable);
>  }

Shouldn't this be:

	if (IS_ENABLED(CONFIG_CGROUP_MISC))
		return !sgx_cgroup_lru_empty(misc_cg_root());
	else
		return !list_empty(&sgx_global_lru.reclaimable);
?

In this way, it is consistent with the sgx_reclaim_pages_global() below.

>  
>  static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
> @@ -404,7 +426,10 @@ static bool sgx_should_reclaim(unsigned long watermark)
>  
>  static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
>  {
> -	sgx_reclaim_pages(&sgx_global_lru, charge_mm);
> +	if (IS_ENABLED(CONFIG_CGROUP_MISC))
> +		sgx_cgroup_reclaim_pages(misc_cg_root(), charge_mm);
> +	else
> +		sgx_reclaim_pages(&sgx_global_lru, charge_mm);
>  }
>  
>  /*
> @@ -414,6 +439,14 @@ static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
>   */
>  void sgx_reclaim_direct(void)
>  {
> +	struct sgx_cgroup *sgx_cg = sgx_get_current_cg();
> +
> +	/* Make sure there are some free pages at cgroup level */
> +	if (sgx_cg && sgx_cgroup_should_reclaim(sgx_cg)) {
> +		sgx_cgroup_reclaim_pages(misc_from_sgx(sgx_cg), current->mm);
> +		sgx_put_cg(sgx_cg);
> +	}

Empty line.

> +	/* Make sure there are some free pages at global level */
>  	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))

Looking at the code, to me sgx_should_reclaim() is a little bit vague
because from the name we don't know whether it interally checks the
current cgroup or the global.  

It's better to rename to sgx_should_reclaim_global().

Ditto for sgx_can_reclaim() -> sgx_can_reclaim_global().

And I think you can do the renaming in the previous patch, because in the
changelog of your previous patch, it seems you have called out the two
functions are for global reclaim.

>  		sgx_reclaim_pages_global(current->mm);
>  }
> @@ -616,6 +649,12 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
>  			break;
>  		}
>  
> +		/*
> +		 * At this point, the usage within this cgroup is under its
> +		 * limit but there is no physical page left for allocation.
> +		 * Perform a global reclaim to get some pages released from any
> +		 * cgroup with reclaimable pages.
> +		 */
>  		sgx_reclaim_pages_global(current->mm);
>  		cond_resched();
>  	}
Haitao Huang April 29, 2024, 4:05 p.m. UTC | #2
On Mon, 29 Apr 2024 05:49:13 -0500, Huang, Kai <kai.huang@intel.com> wrote:

>
>> +/*
>> + * Get the per-cgroup or global LRU list that tracks the given  
>> reclaimable page.
>> + */
>>  static inline struct sgx_epc_lru_list *sgx_lru_list(struct  
>> sgx_epc_page *epc_page)
>>  {
>> +#ifdef CONFIG_CGROUP_MISC
>> +	/*
>> +	 * epc_page->sgx_cg here is never NULL during a reclaimable epc_page's
>> +	 * life between sgx_alloc_epc_page() and sgx_free_epc_page():
>> +	 *
>> +	 * In sgx_alloc_epc_page(), epc_page->sgx_cg is set to the return from
>> +	 * sgx_get_current_cg() which is the misc cgroup of the current task,  
>> or
>> +	 * the root by default even if the misc cgroup is disabled by kernel
>> +	 * command line.
>> +	 *
>> +	 * epc_page->sgx_cg is only unset by sgx_free_epc_page().
>> +	 *
>> +	 * This function is never used before sgx_alloc_epc_page() or after
>> +	 * sgx_free_epc_page().
>> +	 */
>> +	return &epc_page->sgx_cg->lru;
>> +#else
>>  	return &sgx_global_lru;
>> +#endif
>>  }
>>
>>  /*
>> @@ -42,7 +63,8 @@ static inline struct sgx_epc_lru_list  
>> *sgx_lru_list(struct sgx_epc_page *epc_pag
>>   */
>>  static inline bool sgx_can_reclaim(void)
>>  {
>> -	return !list_empty(&sgx_global_lru.reclaimable);
>> +	return !sgx_cgroup_lru_empty(misc_cg_root()) ||
>> +	       !list_empty(&sgx_global_lru.reclaimable);
>>  }
>
> Shouldn't this be:
>
> 	if (IS_ENABLED(CONFIG_CGROUP_MISC))
> 		return !sgx_cgroup_lru_empty(misc_cg_root());
> 	else
> 		return !list_empty(&sgx_global_lru.reclaimable);
> ?
>
> In this way, it is consistent with the sgx_reclaim_pages_global() below.
>

I changed to this way because sgx_cgroup_lru_empty() is now defined in  
both KConfig cases.
And it seems better to minimize use of the KConfig variables based on  
earlier feedback (some are yours).
Don't really have strong preference here. So let me know one way of the  
other.

>>
>>  static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
>> @@ -404,7 +426,10 @@ static bool sgx_should_reclaim(unsigned long  
>> watermark)
>>
>>  static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
>>  {
>> -	sgx_reclaim_pages(&sgx_global_lru, charge_mm);
>> +	if (IS_ENABLED(CONFIG_CGROUP_MISC))
>> +		sgx_cgroup_reclaim_pages(misc_cg_root(), charge_mm);
>> +	else
>> +		sgx_reclaim_pages(&sgx_global_lru, charge_mm);
>>  }
>>
>>  /*
>> @@ -414,6 +439,14 @@ static void sgx_reclaim_pages_global(struct  
>> mm_struct *charge_mm)
>>   */
>>  void sgx_reclaim_direct(void)
>>  {
>> +	struct sgx_cgroup *sgx_cg = sgx_get_current_cg();
>> +
>> +	/* Make sure there are some free pages at cgroup level */
>> +	if (sgx_cg && sgx_cgroup_should_reclaim(sgx_cg)) {
>> +		sgx_cgroup_reclaim_pages(misc_from_sgx(sgx_cg), current->mm);
>> +		sgx_put_cg(sgx_cg);
>> +	}
>
> Empty line.
>

Sure

>> +	/* Make sure there are some free pages at global level */
>>  	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
>
> Looking at the code, to me sgx_should_reclaim() is a little bit vague
> because from the name we don't know whether it interally checks the
> current cgroup or the global.  
> It's better to rename to sgx_should_reclaim_global().
>
> Ditto for sgx_can_reclaim() -> sgx_can_reclaim_global().
>
> And I think you can do the renaming in the previous patch, because in the
> changelog of your previous patch, it seems you have called out the two
> functions are for global reclaim.
>

ok

Thanks
Haitao
Huang, Kai April 29, 2024, 10:18 p.m. UTC | #3
>>>  /*
>>> @@ -42,7 +63,8 @@ static inline struct sgx_epc_lru_list 
>>> *sgx_lru_list(struct sgx_epc_page *epc_pag
>>>   */
>>>  static inline bool sgx_can_reclaim(void)
>>>  {
>>> -    return !list_empty(&sgx_global_lru.reclaimable);
>>> +    return !sgx_cgroup_lru_empty(misc_cg_root()) ||
>>> +           !list_empty(&sgx_global_lru.reclaimable);
>>>  }
>>
>> Shouldn't this be:
>>
>>     if (IS_ENABLED(CONFIG_CGROUP_MISC))
>>         return !sgx_cgroup_lru_empty(misc_cg_root());
>>     else
>>         return !list_empty(&sgx_global_lru.reclaimable);
>> ?
>>
>> In this way, it is consistent with the sgx_reclaim_pages_global() below.
>>
> 
> I changed to this way because sgx_cgroup_lru_empty() is now defined in 
> both KConfig cases.
> And it seems better to minimize use of the KConfig variables based on 
> earlier feedback (some are yours).
> Don't really have strong preference here. So let me know one way of the 
> other.
> 

But IMHO your code could be confusing, e.g., it can be interpreted as:

   The EPC pages can be managed by both the cgroup LRUs and the
   sgx_global_lru simultaneously at runtime when CONFIG_CGROUP_MISC
   is on.

Which is not true.

So we should make the code clearly reflect the true behaviour.
Haitao Huang April 30, 2024, 1:31 a.m. UTC | #4
On Mon, 29 Apr 2024 17:18:05 -0500, Huang, Kai <kai.huang@intel.com> wrote:

>
>>>>  /*
>>>> @@ -42,7 +63,8 @@ static inline struct sgx_epc_lru_list  
>>>> *sgx_lru_list(struct sgx_epc_page *epc_pag
>>>>   */
>>>>  static inline bool sgx_can_reclaim(void)
>>>>  {
>>>> -    return !list_empty(&sgx_global_lru.reclaimable);
>>>> +    return !sgx_cgroup_lru_empty(misc_cg_root()) ||
>>>> +           !list_empty(&sgx_global_lru.reclaimable);
>>>>  }
>>>
>>> Shouldn't this be:
>>>
>>>     if (IS_ENABLED(CONFIG_CGROUP_MISC))
>>>         return !sgx_cgroup_lru_empty(misc_cg_root());
>>>     else
>>>         return !list_empty(&sgx_global_lru.reclaimable);
>>> ?
>>>
>>> In this way, it is consistent with the sgx_reclaim_pages_global()  
>>> below.
>>>
>>  I changed to this way because sgx_cgroup_lru_empty() is now defined in  
>> both KConfig cases.
>> And it seems better to minimize use of the KConfig variables based on  
>> earlier feedback (some are yours).
>> Don't really have strong preference here. So let me know one way of the  
>> other.
>>
>
> But IMHO your code could be confusing, e.g., it can be interpreted as:
>
>    The EPC pages can be managed by both the cgroup LRUs and the
>    sgx_global_lru simultaneously at runtime when CONFIG_CGROUP_MISC
>    is on.
>
> Which is not true.
>
> So we should make the code clearly reflect the true behaviour.
>

Ok, I'll change back.
Thanks
Haitao
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
index 2efc33476b0b..16fe0e1574ec 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -68,7 +68,7 @@  static inline u64 sgx_cgroup_max_pages_to_root(struct sgx_cgroup *sgx_cg)
  *
  * Return: %true if all cgroups under the specified root have empty LRU lists.
  */
-static bool sgx_cgroup_lru_empty(struct misc_cg *root)
+bool sgx_cgroup_lru_empty(struct misc_cg *root)
 {
 	struct cgroup_subsys_state *css_root;
 	struct cgroup_subsys_state *pos;
@@ -116,7 +116,7 @@  static bool sgx_cgroup_lru_empty(struct misc_cg *root)
  * the LRUs are recently accessed, i.e., considered "too young" to reclaim, no
  * page will actually be reclaimed after walking the whole tree.
  */
-static void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
+void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
 {
 	struct cgroup_subsys_state *css_root;
 	struct cgroup_subsys_state *pos;
@@ -157,7 +157,7 @@  static void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *cha
  * threshold (%SGX_CG_MIN_FREE_PAGE) and there are reclaimable pages within the
  * cgroup.
  */
-static bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
+bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
 {
 	u64 cur, max;
 
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
index 2044e0d64076..9d69608eadf6 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
@@ -13,6 +13,11 @@ 
 #define MISC_CG_RES_SGX_EPC MISC_CG_RES_TYPES
 struct sgx_cgroup;
 
+static inline struct misc_cg *misc_from_sgx(struct sgx_cgroup *sgx_cg)
+{
+	return NULL;
+}
+
 static inline struct sgx_cgroup *sgx_get_current_cg(void)
 {
 	return NULL;
@@ -27,8 +32,22 @@  static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_recl
 
 static inline void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg) { }
 
+static inline bool sgx_cgroup_lru_empty(struct misc_cg *root)
+{
+	return true;
+}
+
+static inline bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg)
+{
+	return false;
+}
+
 static inline void sgx_cgroup_init(void) { }
 
+static inline void sgx_cgroup_reclaim_pages(struct misc_cg *root, struct mm_struct *charge_mm)
+{
+}
+
 #else /* CONFIG_CGROUP_MISC */
 
 struct sgx_cgroup {
@@ -37,6 +56,11 @@  struct sgx_cgroup {
 	struct work_struct reclaim_work;
 };
 
+static inline struct misc_cg *misc_from_sgx(struct sgx_cgroup *sgx_cg)
+{
+	return sgx_cg->cg;
+}
+
 static inline struct sgx_cgroup *sgx_cgroup_from_misc_cg(struct misc_cg *cg)
 {
 	return (struct sgx_cgroup *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
@@ -67,6 +91,9 @@  static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg)
 
 int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_reclaim reclaim);
 void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg);
+bool sgx_cgroup_lru_empty(struct misc_cg *root);
+bool sgx_cgroup_should_reclaim(struct sgx_cgroup *sgx_cg);
+void sgx_cgroup_reclaim_pages(struct misc_cg *root,  struct mm_struct *charge_mm);
 void sgx_cgroup_init(void);
 
 #endif /* CONFIG_CGROUP_MISC */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 11edbdb06782..9343f7d50649 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -32,9 +32,30 @@  static DEFINE_XARRAY(sgx_epc_address_space);
  */
 static struct sgx_epc_lru_list sgx_global_lru;
 
+/*
+ * Get the per-cgroup or global LRU list that tracks the given reclaimable page.
+ */
 static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_page)
 {
+#ifdef CONFIG_CGROUP_MISC
+	/*
+	 * epc_page->sgx_cg here is never NULL during a reclaimable epc_page's
+	 * life between sgx_alloc_epc_page() and sgx_free_epc_page():
+	 *
+	 * In sgx_alloc_epc_page(), epc_page->sgx_cg is set to the return from
+	 * sgx_get_current_cg() which is the misc cgroup of the current task, or
+	 * the root by default even if the misc cgroup is disabled by kernel
+	 * command line.
+	 *
+	 * epc_page->sgx_cg is only unset by sgx_free_epc_page().
+	 *
+	 * This function is never used before sgx_alloc_epc_page() or after
+	 * sgx_free_epc_page().
+	 */
+	return &epc_page->sgx_cg->lru;
+#else
 	return &sgx_global_lru;
+#endif
 }
 
 /*
@@ -42,7 +63,8 @@  static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_pag
  */
 static inline bool sgx_can_reclaim(void)
 {
-	return !list_empty(&sgx_global_lru.reclaimable);
+	return !sgx_cgroup_lru_empty(misc_cg_root()) ||
+	       !list_empty(&sgx_global_lru.reclaimable);
 }
 
 static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
@@ -404,7 +426,10 @@  static bool sgx_should_reclaim(unsigned long watermark)
 
 static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
 {
-	sgx_reclaim_pages(&sgx_global_lru, charge_mm);
+	if (IS_ENABLED(CONFIG_CGROUP_MISC))
+		sgx_cgroup_reclaim_pages(misc_cg_root(), charge_mm);
+	else
+		sgx_reclaim_pages(&sgx_global_lru, charge_mm);
 }
 
 /*
@@ -414,6 +439,14 @@  static void sgx_reclaim_pages_global(struct mm_struct *charge_mm)
  */
 void sgx_reclaim_direct(void)
 {
+	struct sgx_cgroup *sgx_cg = sgx_get_current_cg();
+
+	/* Make sure there are some free pages at cgroup level */
+	if (sgx_cg && sgx_cgroup_should_reclaim(sgx_cg)) {
+		sgx_cgroup_reclaim_pages(misc_from_sgx(sgx_cg), current->mm);
+		sgx_put_cg(sgx_cg);
+	}
+	/* Make sure there are some free pages at global level */
 	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
 		sgx_reclaim_pages_global(current->mm);
 }
@@ -616,6 +649,12 @@  struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
 			break;
 		}
 
+		/*
+		 * At this point, the usage within this cgroup is under its
+		 * limit but there is no physical page left for allocation.
+		 * Perform a global reclaim to get some pages released from any
+		 * cgroup with reclaimable pages.
+		 */
 		sgx_reclaim_pages_global(current->mm);
 		cond_resched();
 	}