
[v9,09/15] x86/sgx: Charge mem_cgroup for per-cgroup reclamation

Message ID 20240205210638.157741-10-haitao.huang@linux.intel.com (mailing list archive)
State New, archived
Series Add Cgroup support for SGX EPC memory

Commit Message

Haitao Huang Feb. 5, 2024, 9:06 p.m. UTC
Enclave Page Cache(EPC) memory can be swapped out to regular system
memory, and the consumed memory should be charged to a proper
mem_cgroup. Currently the selection of mem_cgroup to charge is done in
sgx_encl_get_mem_cgroup(). But it only considers two contexts in which
the swapping can be done: normal tasks and the ksgxd kthread.
With the new EPC cgroup implementation, the swapping can also happen in
EPC cgroup work-queue threads. In those cases, it improperly selects the
root mem_cgroup to charge for the RAM usage.

Change sgx_encl_get_mem_cgroup() to handle non-task contexts only and
return the mem_cgroup of an mm_struct associated with the enclave. The
return is used to charge for EPC backing pages in all kthread cases.

Pass a flag into the top level reclamation function,
sgx_reclaim_pages(), to explicitly indicate whether it is called from a
background kthread. Internally, if the flag is true, switch the active
mem_cgroup to the one returned from sgx_encl_get_mem_cgroup(), prior to
any backing page allocation, in order to ensure that shmem page
allocations are charged to the enclave's cgroup.

Remove current_is_ksgxd() as it is no longer needed.

Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
Reported-by: Mikko Ylinen <mikko.ylinen@linux.intel.com>
---
V9:
- Reduce number of if statements. (Tim)

V8:
- Limit text paragraphs to 80 characters wide. (Jarkko)
---
 arch/x86/kernel/cpu/sgx/encl.c       | 38 +++++++++++++---------------
 arch/x86/kernel/cpu/sgx/encl.h       |  3 +--
 arch/x86/kernel/cpu/sgx/epc_cgroup.c |  7 ++---
 arch/x86/kernel/cpu/sgx/main.c       | 27 +++++++++-----------
 arch/x86/kernel/cpu/sgx/sgx.h        |  3 ++-
 5 files changed, 36 insertions(+), 42 deletions(-)

Comments

Jarkko Sakkinen Feb. 12, 2024, 7:46 p.m. UTC | #1
On Mon Feb 5, 2024 at 11:06 PM EET, Haitao Huang wrote:
> Enclave Page Cache(EPC) memory can be swapped out to regular system

"Enclave Page Cache (EPC)"
                   ~

> memory, and the consumed memory should be charged to a proper
> mem_cgroup. Currently the selection of mem_cgroup to charge is done in
> sgx_encl_get_mem_cgroup(). But it only considers two contexts in which
> the swapping can be done: normal tasks and the ksgxd kthread.
> With the new EPC cgroup implementation, the swapping can also happen in
> EPC cgroup work-queue threads. In those cases, it improperly selects the
> root mem_cgroup to charge for the RAM usage.
>
> Change sgx_encl_get_mem_cgroup() to handle non-task contexts only and
> return the mem_cgroup of an mm_struct associated with the enclave. The
> return is used to charge for EPC backing pages in all kthread cases.
>
> Pass a flag into the top level reclamation function,
> sgx_reclaim_pages(), to explicitly indicate whether it is called from a
> background kthread. Internally, if the flag is true, switch the active
> mem_cgroup to the one returned from sgx_encl_get_mem_cgroup(), prior to
> any backing page allocation, in order to ensure that shmem page
> allocations are charged to the enclave's cgroup.
>
> Remove current_is_ksgxd() as it is no longer needed.
>
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> Reported-by: Mikko Ylinen <mikko.ylinen@linux.intel.com>
> ---
> V9:
> - Reduce number of if statements. (Tim)
>
> V8:
> - Limit text paragraphs to 80 characters wide. (Jarkko)
> ---
>  arch/x86/kernel/cpu/sgx/encl.c       | 38 +++++++++++++---------------
>  arch/x86/kernel/cpu/sgx/encl.h       |  3 +--
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c |  7 ++---
>  arch/x86/kernel/cpu/sgx/main.c       | 27 +++++++++-----------
>  arch/x86/kernel/cpu/sgx/sgx.h        |  3 ++-
>  5 files changed, 36 insertions(+), 42 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 279148e72459..4e5948362060 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -993,9 +993,7 @@ static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_inde
>  }
>  
>  /*
> - * When called from ksgxd, returns the mem_cgroup of a struct mm stored
> - * in the enclave's mm_list. When not called from ksgxd, just returns
> - * the mem_cgroup of the current task.
> + * Returns the mem_cgroup of a struct mm stored in the enclave's mm_list.
>   */
>  static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>  {
> @@ -1003,14 +1001,6 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>  	struct sgx_encl_mm *encl_mm;
>  	int idx;
>  
> -	/*
> -	 * If called from normal task context, return the mem_cgroup
> -	 * of the current task's mm. The remainder of the handling is for
> -	 * ksgxd.
> -	 */
> -	if (!current_is_ksgxd())
> -		return get_mem_cgroup_from_mm(current->mm);
> -
>  	/*
>  	 * Search the enclave's mm_list to find an mm associated with
>  	 * this enclave to charge the allocation to.
> @@ -1047,27 +1037,33 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>   * @encl:	an enclave pointer
>   * @page_index:	enclave page index
>   * @backing:	data for accessing backing storage for the page
> + * @indirect:	in ksgxd or EPC cgroup work queue context
> + *
> + * Create a backing page for loading data back into an EPC page with ELDU. This
> + * function takes a reference on a new backing page which must be dropped with a
> + * corresponding call to sgx_encl_put_backing().
>   *
> - * When called from ksgxd, sets the active memcg from one of the
> - * mms in the enclave's mm_list prior to any backing page allocation,
> - * in order to ensure that shmem page allocations are charged to the
> - * enclave.  Create a backing page for loading data back into an EPC page with
> - * ELDU.  This function takes a reference on a new backing page which
> - * must be dropped with a corresponding call to sgx_encl_put_backing().
> + * When @indirect is true, sets the active memcg from one of the mms in the
> + * enclave's mm_list prior to any backing page allocation, in order to ensure
> + * that shmem page allocations are charged to the enclave.
>   *
>   * Return:
>   *   0 on success,
>   *   -errno otherwise.
>   */
>  int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
> -			   struct sgx_backing *backing)
> +			   struct sgx_backing *backing, bool indirect)

Boolean parameters should be avoided when possible because they are
confusing at the call sites.
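
For example, at a call site in this patch the reader cannot tell what
the last argument means without looking up the prototype (illustrative
only):

	/* Is 'true' "indirect", "reclaim", something else entirely? */
	ret = sgx_encl_alloc_backing(encl_page->encl, page_index,
				     &backing[i], true);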

>  {
> -	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
> -	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
> +	struct mem_cgroup *encl_memcg;
> +	struct mem_cgroup *memcg;
>  	int ret;
>  
> -	ret = __sgx_encl_get_backing(encl, page_index, backing);
> +	if (!indirect)
> +		return  __sgx_encl_get_backing(encl, page_index, backing);

If a call is either at the head or tail of the code block, then the
obviously better option is to make __sgx_encl_get_backing()
non-static as sgx_encl_get_backing() and call it in those
call sites that would call this with "false".

I.e. you need a new patch where this preparation is done.
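
A rough sketch of that split, reusing the code already in this patch
(illustrative only, names up to you):

	/* encl.c: drop 'static' from __sgx_encl_get_backing() and export
	 * it as sgx_encl_get_backing() for the direct-reclaim call sites.
	 */
	int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
				 struct sgx_backing *backing);

	/* The wrapper then keeps only the kthread/memcg-switching path: */
	int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
				   struct sgx_backing *backing)
	{
		struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
		struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
		int ret;

		ret = sgx_encl_get_backing(encl, page_index, backing);
		set_active_memcg(memcg);
		mem_cgroup_put(encl_memcg);

		return ret;
	}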

>  
> +	encl_memcg = sgx_encl_get_mem_cgroup(encl);
> +	memcg = set_active_memcg(encl_memcg);
> +	ret = __sgx_encl_get_backing(encl, page_index, backing);
>  	set_active_memcg(memcg);
>  	mem_cgroup_put(encl_memcg);
>  
> diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
> index f94ff14c9486..549cd2e8d98b 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.h
> +++ b/arch/x86/kernel/cpu/sgx/encl.h
> @@ -103,12 +103,11 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
>  int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
>  		     unsigned long end, unsigned long vm_flags);
>  
> -bool current_is_ksgxd(void);
>  void sgx_encl_release(struct kref *ref);
>  int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
>  const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
>  int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
> -			   struct sgx_backing *backing);
> +			   struct sgx_backing *backing, bool indirect);
>  void sgx_encl_put_backing(struct sgx_backing *backing);
>  int sgx_encl_test_and_clear_young(struct mm_struct *mm,
>  				  struct sgx_encl_page *page);
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index 16b6d9f909eb..d399fda2b55e 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -93,9 +93,10 @@ bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
>  /**
>   * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
>   * @root:	Root of the tree to start walking from.
> + * @indirect:   In ksgxd or EPC cgroup work queue context.
>   * Return:	Number of pages reclaimed.
>   */
> -unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +static unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root, bool indirect)
>  {
>  	/*
>  	 * Attempting to reclaim only a few pages will often fail and is
> @@ -119,7 +120,7 @@ unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
>  		rcu_read_unlock();
>  
>  		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> -		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan, indirect);
>  
>  		rcu_read_lock();
>  		css_put(pos);
> @@ -176,7 +177,7 @@ static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
>  			break;
>  
>  		/* Keep reclaiming until above condition is met. */
> -		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
> +		sgx_epc_cgroup_reclaim_pages(epc_cg->cg, true);
>  	}
>  }
>  
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 4f5824c4751d..51904f191b97 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -254,7 +254,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
>  }
>  
>  static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
> -				struct sgx_backing *backing)
> +				struct sgx_backing *backing, bool indirect)
>  {
>  	struct sgx_encl_page *encl_page = epc_page->owner;
>  	struct sgx_encl *encl = encl_page->encl;
> @@ -270,7 +270,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
>  
>  	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
>  		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
> -					   &secs_backing);
> +					   &secs_backing, indirect);
>  		if (ret)
>  			goto out;
>  
> @@ -304,9 +304,11 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
>   * @lru:	The LRU from which pages are reclaimed.
>   * @nr_to_scan: Pointer to the target number of pages to scan, must be less than
>   *		SGX_NR_TO_SCAN.
> + * @indirect:	In ksgxd or EPC cgroup work queue contexts.
>   * Return:	Number of pages reclaimed.
>   */
> -unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan)
> +unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan,
> +			       bool indirect)
>  {
>  	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
>  	struct sgx_backing backing[SGX_NR_TO_SCAN];
> @@ -348,7 +350,7 @@ unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to
>  		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
>  
>  		mutex_lock(&encl_page->encl->lock);
> -		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
> +		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i], indirect);
>  		if (ret) {
>  			mutex_unlock(&encl_page->encl->lock);
>  			goto skip;
> @@ -381,7 +383,7 @@ unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to
>  			continue;
>  
>  		encl_page = epc_page->owner;
> -		sgx_reclaimer_write(epc_page, &backing[i]);
> +		sgx_reclaimer_write(epc_page, &backing[i], indirect);
>  
>  		kref_put(&encl_page->encl->refcount, sgx_encl_release);
>  		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
> @@ -399,11 +401,11 @@ static bool sgx_should_reclaim(unsigned long watermark)
>  	       !list_empty(&sgx_global_lru.reclaimable);
>  }
>  
> -static void sgx_reclaim_pages_global(void)
> +static void sgx_reclaim_pages_global(bool indirect)
>  {
>  	unsigned int nr_to_scan = SGX_NR_TO_SCAN;
>  
> -	sgx_reclaim_pages(&sgx_global_lru, &nr_to_scan);
> +	sgx_reclaim_pages(&sgx_global_lru, &nr_to_scan, indirect);
>  }
>  
>  /*
> @@ -414,7 +416,7 @@ static void sgx_reclaim_pages_global(void)
>  void sgx_reclaim_direct(void)
>  {
>  	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
> -		sgx_reclaim_pages_global();
> +		sgx_reclaim_pages_global(false);
>  }
>  
>  static int ksgxd(void *p)
> @@ -437,7 +439,7 @@ static int ksgxd(void *p)
>  				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));
>  
>  		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
> -			sgx_reclaim_pages_global();
> +			sgx_reclaim_pages_global(true);
>  
>  		cond_resched();
>  	}
> @@ -460,11 +462,6 @@ static bool __init sgx_page_reclaimer_init(void)
>  	return true;
>  }
>  
> -bool current_is_ksgxd(void)
> -{
> -	return current == ksgxd_tsk;
> -}
> -
>  static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
>  {
>  	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
> @@ -623,7 +620,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
>  		 * Need to do a global reclamation if cgroup was not full but free
>  		 * physical pages run out, causing __sgx_alloc_epc_page() to fail.
>  		 */
> -		sgx_reclaim_pages_global();
> +		sgx_reclaim_pages_global(false);
>  		cond_resched();
>  	}
>  
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index 2593c013d091..cfe906054d85 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -110,7 +110,8 @@ void sgx_reclaim_direct(void);
>  void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
>  int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
>  struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
> -unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan);
> +unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan,
> +			       bool indirect);
>  
>  void sgx_ipi_cb(void *info);
>  

Br, Jarkko
Haitao Huang Feb. 13, 2024, 3:21 a.m. UTC | #2
On Mon, 12 Feb 2024 13:46:06 -0600, Jarkko Sakkinen <jarkko@kernel.org>  
wrote:

> On Mon Feb 5, 2024 at 11:06 PM EET, Haitao Huang wrote:
>> Enclave Page Cache(EPC) memory can be swapped out to regular system
>
> "Enclave Page Cache (EPC)"
>                    ~
>
Will fix.

[...]
>>  int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
>> -			   struct sgx_backing *backing)
>> +			   struct sgx_backing *backing, bool indirect)
>
> Boolean parameters should be avoided when possible because they are
> confusing at the call sites.
>
>>  {
>> -	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
>> -	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
>> +	struct mem_cgroup *encl_memcg;
>> +	struct mem_cgroup *memcg;
>>  	int ret;
>>
>> -	ret = __sgx_encl_get_backing(encl, page_index, backing);
>> +	if (!indirect)
>> +		return  __sgx_encl_get_backing(encl, page_index, backing);
>
> If a call is either at the head or tail of the code block, then the
> obviously better option is to make __sgx_encl_get_backing()
> non-static as sgx_encl_get_backing() and call it in those
> call sites that would call this with "false".
>
> I.e. you need a new patch where this preparation is done.
>

This would actually require more intrusive changes to the call stack for  
global and cgroup reclaim:

{sgx_epc_cgroup_reclaim_pages(),sgx_reclaim_pages_global()}->sgx_reclaim_pages()[->sgx_reclaimer_write()]->sgx_encl_alloc_backing()

We would need to make two versions of each of those functions.
It'd be especially complicated to refactor sgx_reclaim_pages() into two
versions.

I now double-checked the history of current_is_ksgxd() [1]; it seems the
intent was to replace the "current->mm == NULL" check so it is more obvious
that we are running in ksgxd.

@Dave, @Kristen,

Can we restore the original check, like below, so it works for the cgroup
work-queues?

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

--->

bool current_is_kthread(void)
{
	return current->mm == NULL;	
}

I'm not experienced in this area and not sure how reliable it is to use
current->mm == NULL for kthreads and work-queues, but it would eliminate
the need for the boolean parameter.
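
If that check is reliable, the allocation path could then drop the
boolean entirely, e.g. (rough sketch only, with current_is_kthread() as
proposed above):

	int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
				   struct sgx_backing *backing)
	{
		struct mem_cgroup *encl_memcg;
		struct mem_cgroup *memcg;
		int ret;

		/* User task context: the allocation is charged to current->mm. */
		if (!current_is_kthread())
			return __sgx_encl_get_backing(encl, page_index, backing);

		/* kthread/work-queue: search mm_list, switch the active memcg. */
		encl_memcg = sgx_encl_get_mem_cgroup(encl);
		memcg = set_active_memcg(encl_memcg);
		ret = __sgx_encl_get_backing(encl, page_index, backing);
		set_active_memcg(memcg);
		mem_cgroup_put(encl_memcg);

		return ret;
	}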

Would appreciate the input.

Haitao

[1] https://lore.kernel.org/linux-sgx/9c269c70-35fe-a1a4-34c9-b1e62ab3bb3b@intel.com/
Dave Hansen Feb. 15, 2024, 11:43 p.m. UTC | #3
On 2/5/24 13:06, Haitao Huang wrote:
>  static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>  {
> @@ -1003,14 +1001,6 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>  	struct sgx_encl_mm *encl_mm;
>  	int idx;
>  
> -	/*
> -	 * If called from normal task context, return the mem_cgroup
> -	 * of the current task's mm. The remainder of the handling is for
> -	 * ksgxd.
> -	 */
> -	if (!current_is_ksgxd())
> -		return get_mem_cgroup_from_mm(current->mm);

Why is this being removed?

Searching the enclave mm list is a last resort.  It's expensive and
imprecise.

get_mem_cgroup_from_mm(current->mm), on the other hand is fast and precise.
Haitao Huang Feb. 16, 2024, 6:07 a.m. UTC | #4
Hi Dave,

On Thu, 15 Feb 2024 17:43:18 -0600, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 2/5/24 13:06, Haitao Huang wrote:
>>  static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>>  {
>> @@ -1003,14 +1001,6 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
>>  	struct sgx_encl_mm *encl_mm;
>>  	int idx;
>>
>> -	/*
>> -	 * If called from normal task context, return the mem_cgroup
>> -	 * of the current task's mm. The remainder of the handling is for
>> -	 * ksgxd.
>> -	 */
>> -	if (!current_is_ksgxd())
>> -		return get_mem_cgroup_from_mm(current->mm);
>
> Why is this being removed?
>
> Searching the enclave mm list is a last resort.  It's expensive and
> imprecise.
>
> get_mem_cgroup_from_mm(current->mm), on the other hand is fast and  
> precise.
>

I introduced a boolean flag to indicate the caller is in a kthread (ksgxd or
the cgroup workqueue), so sgx_encl_alloc_backing() only calls this function
if that flag is true, meaning a search through the mm_list is needed.

But now I think a more straightforward way is to just replace  
current_is_ksgxd() with (current->flags & PF_KTHREAD).

Thanks

Haitao
Dave Hansen Feb. 16, 2024, 3:15 p.m. UTC | #5
On 2/5/24 13:06, Haitao Huang wrote:
> @@ -414,7 +416,7 @@ static void sgx_reclaim_pages_global(void)
>  void sgx_reclaim_direct(void)
>  {
>  	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
> -		sgx_reclaim_pages_global();
> +		sgx_reclaim_pages_global(false);
>  }
>  
>  static int ksgxd(void *p)
> @@ -437,7 +439,7 @@ static int ksgxd(void *p)
>  				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));
>  
>  		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
> -			sgx_reclaim_pages_global();
> +			sgx_reclaim_pages_global(true);
>  
>  		cond_resched();
>  	}

First, I'm never a fan of random true/false or 0/1 arguments to
functions like this.  You end up having to go look at the called
function to make any sense of it.  You can either do an enum, or some
construct like this:

 		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) {
			bool indirect = true;
			sgx_reclaim_pages_global(indirect);
		}

Yeah, it takes a few more lines but it saves you having to comment the
thing.
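
Or with an enum (names purely illustrative):

	enum sgx_reclaim_context {
		SGX_RECLAIM_DIRECT,	/* called from a user task context */
		SGX_RECLAIM_INDIRECT,	/* called from ksgxd or a workqueue */
	};

	if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
		sgx_reclaim_pages_global(SGX_RECLAIM_INDIRECT);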

Does this 'indirect' change any behavior other than whether it does a
search for an mm to find a place to charge the backing storage?  Instead
of passing a flag around, why not just pass the mm?

This refactoring out of 'indirect' or passing the mm around really wants
to be in its own patch anyway.
Haitao Huang Feb. 16, 2024, 9:38 p.m. UTC | #6
On Fri, 16 Feb 2024 09:15:59 -0600, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 2/5/24 13:06, Haitao Huang wrote:
>> @@ -414,7 +416,7 @@ static void sgx_reclaim_pages_global(void)
>>  void sgx_reclaim_direct(void)
>>  {
>>  	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
>> -		sgx_reclaim_pages_global();
>> +		sgx_reclaim_pages_global(false);
>>  }
>>
>>  static int ksgxd(void *p)
>> @@ -437,7 +439,7 @@ static int ksgxd(void *p)
>>  				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));
>>
>>  		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
>> -			sgx_reclaim_pages_global();
>> +			sgx_reclaim_pages_global(true);
>>
>>  		cond_resched();
>>  	}
>
> First, I'm never a fan of random true/false or 0/1 arguments to
> functions like this.  You end up having to go look at the called
> function to make any sense of it.  You can either do an enum, or some
> construct like this:
>
>  		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) {
> 			bool indirect = true;
> 			sgx_reclaim_pages_global(indirect);
> 		}
>
> Yeah, it takes a few more lines but it saves you having to comment the
> thing.
>
> Does this 'indirect' change any behavior other than whether it does a
> search for an mm to find a place to charge the backing storage?

No.

> Instead of passing a flag around, why not just pass the mm?
>
There is no need to pass in the mm. We could just check if current->mm ==
NULL to decide whether the search in the enclave mm list is needed.

But you had a concern [1] that the purpose was not clear, and hence
suggested current_is_ksgxd().

Would it be OK if we replaced current_is_ksgxd() with (current->flags &
PF_KTHREAD)? That would express the real intent of checking whether the
calling context is not a user context.

> This refactoring out of 'indirect' or passing the mm around really wants
> to be in its own patch anyway.
>
Looks like I could do:
1) refactoring of the 'indirect' value/enum suggested above. This seems
the most straightforward, without depending on any assumptions about
other kernel code.
2) replace current_is_ksgxd() with current->mm == NULL. This assumes
kthreads have no mm.
3) replace current_is_ksgxd() with current->flags & PF_KTHREAD. This is
a direct use of the flag PF_KTHREAD, so it should be better than #2?

Any preference or further thoughts?

Thanks
Haitao

[1] https://lore.kernel.org/linux-sgx/9c269c70-35fe-a1a4-34c9-b1e62ab3bb3b@intel.com/
Dave Hansen Feb. 16, 2024, 9:55 p.m. UTC | #7
On 2/16/24 13:38, Haitao Huang wrote:
> On Fri, 16 Feb 2024 09:15:59 -0600, Dave Hansen <dave.hansen@intel.com>
> wrote:
...
>> Does this 'indirect' change any behavior other than whether it does a
>> search for an mm to find a place to charge the backing storage?
> 
> No.
> 
>> Instead of passing a flag around, why not just pass the mm?
>>
>> There is no need to pass in the mm. We could just check if current->mm ==
>> NULL to decide whether the search in the enclave mm list is needed.
> 
>> But you had a concern [1] that the purpose was not clear, and hence
>> suggested current_is_ksgxd().

Right, because there was only one possible way that mm could be NULL but
it wasn't obvious from the code what that way was.

>> Would it be OK if we replaced current_is_ksgxd() with (current->flags &
>> PF_KTHREAD)? That would express the real intent of checking whether the
>> calling context is not a user context.

No, I think that focuses on the symptom and not on the fundamental problem.

The fundamental problem is that you need an mm in order to charge your
allocations to the right group.  Indirect reclaim means you are not in a
context which is connected to the mm that should be charged while direct
reclaim is.

>> This refactoring out of 'indirect' or passing the mm around really wants
>> to be in its own patch anyway.
>>
> Looks like I could do:
> 1) refactoring of the 'indirect' value/enum suggested above. This seems
> the most straightforward, without depending on any assumptions about
> other kernel code.
> 2) replace current_is_ksgxd() with current->mm == NULL. This assumes
> kthreads have no mm.
> 3) replace current_is_ksgxd() with current->flags & PF_KTHREAD. This is
> a direct use of the flag PF_KTHREAD, so it should be better than #2?
> 
> Any preference or further thoughts?

Pass around a:

	struct mm_struct *charge_mm

Then, at the bottom do:

	/*
	 * Backing RAM allocations need to be charged to some mm and
	 * associated cgroup.  If this context does not have an mm to
	 * charge, search the enclave's mm_list to find some mm
	 * associated with this enclave.
	 */
	if (!charge_mm)
		... do slow mm lookup
	else
		return mm_to_cgroup_whatever(charge_mm);

Then just comment the call sites where the initial charge_mm comes in:

	
	/* Indirect SGX reclaim, no mm to charge, so NULL: */
	foo(..., NULL);


	/* Direct SGX reclaim, charge current mm for allocations: */
	foo(..., current->mm);
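
Putting it together, the bottom-level function would look something like
this (sketch only; the slow-lookup helper name is made up):

	static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl,
							  struct mm_struct *charge_mm)
	{
		/*
		 * Backing RAM allocations need to be charged to some mm and
		 * associated cgroup.  If this context does not have an mm to
		 * charge, fall back to the slow, imprecise search of the
		 * enclave's mm_list.
		 */
		if (charge_mm)
			return get_mem_cgroup_from_mm(charge_mm);

		return __sgx_encl_search_mem_cgroup(encl);
	}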
Haitao Huang Feb. 16, 2024, 11:33 p.m. UTC | #8
On Fri, 16 Feb 2024 15:55:10 -0600, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 2/16/24 13:38, Haitao Huang wrote:
>> On Fri, 16 Feb 2024 09:15:59 -0600, Dave Hansen <dave.hansen@intel.com>
>> wrote:
> ...
>>> Does this 'indirect' change any behavior other than whether it does a
>>> search for an mm to find a place to charge the backing storage?
>>
>> No.
>>
>>> Instead of passing a flag around, why not just pass the mm?
>>>
>> There is no need to pass in the mm. We could just check if current->mm ==
>> NULL to decide whether the search in the enclave mm list is needed.
>>
>> But you had a concern [1] that the purpose was not clear, and hence
>> suggested current_is_ksgxd().
>
> Right, because there was only one possible way that mm could be NULL but
> it wasn't obvious from the code what that way was.
>
>> Would it be OK if we replaced current_is_ksgxd() with (current->flags &
>> PF_KTHREAD)? That would express the real intent of checking whether the
>> calling context is not a user context.
>
> No, I think that focuses on the symptom and not on the fundamental  
> problem.
>
> The fundamental problem is that you need an mm in order to charge your
> allocations to the right group.  Indirect reclaim means you are not in a
> context which is connected to the mm that should be charged while direct
> reclaim is.
>
>>> This refactoring out of 'indirect' or passing the mm around really wants
>>> to be in its own patch anyway.
>>>
>> Looks like I could do:
>> 1) refactoring of the 'indirect' value/enum suggested above. This seems
>> the most straightforward, without depending on any assumptions about
>> other kernel code.
>> 2) replace current_is_ksgxd() with current->mm == NULL. This assumes
>> kthreads have no mm.
>> 3) replace current_is_ksgxd() with current->flags & PF_KTHREAD. This is
>> a direct use of the flag PF_KTHREAD, so it should be better than #2?
>>
>> Any preference or further thoughts?
>
> Pass around a:
>
> 	struct mm_struct *charge_mm
>
> Then, at the bottom do:
>
> 	/*
> 	 * Backing RAM allocations need to be charged to some mm and
> 	 * associated cgroup.  If this context does not have an mm to
> 	 * charge, search the enclave's mm_list to find some mm
> 	 * associated with this enclave.
> 	 */
> 	if (!charge_mm)
> 		... do slow mm lookup
> 	else
> 		return mm_to_cgroup_whatever(charge_mm);
>
> Then just comment the call sites where the initial charge_mm comes in:
>
> 	
> 	/* Indirect SGX reclaim, no mm to charge, so NULL: */
> 	foo(..., NULL);
>
>
> 	/* Direct SGX reclaim, charge current mm for allocations: */
> 	foo(..., current->mm);
>
>
Okay, got it now.

Thank you very much!

Haitao

Patch

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 279148e72459..4e5948362060 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -993,9 +993,7 @@  static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_inde
 }
 
 /*
- * When called from ksgxd, returns the mem_cgroup of a struct mm stored
- * in the enclave's mm_list. When not called from ksgxd, just returns
- * the mem_cgroup of the current task.
+ * Returns the mem_cgroup of a struct mm stored in the enclave's mm_list.
  */
 static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
 {
@@ -1003,14 +1001,6 @@  static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
 	struct sgx_encl_mm *encl_mm;
 	int idx;
 
-	/*
-	 * If called from normal task context, return the mem_cgroup
-	 * of the current task's mm. The remainder of the handling is for
-	 * ksgxd.
-	 */
-	if (!current_is_ksgxd())
-		return get_mem_cgroup_from_mm(current->mm);
-
 	/*
 	 * Search the enclave's mm_list to find an mm associated with
 	 * this enclave to charge the allocation to.
@@ -1047,27 +1037,33 @@  static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
  * @encl:	an enclave pointer
  * @page_index:	enclave page index
  * @backing:	data for accessing backing storage for the page
+ * @indirect:	in ksgxd or EPC cgroup work queue context
+ *
+ * Create a backing page for loading data back into an EPC page with ELDU. This
+ * function takes a reference on a new backing page which must be dropped with a
+ * corresponding call to sgx_encl_put_backing().
  *
- * When called from ksgxd, sets the active memcg from one of the
- * mms in the enclave's mm_list prior to any backing page allocation,
- * in order to ensure that shmem page allocations are charged to the
- * enclave.  Create a backing page for loading data back into an EPC page with
- * ELDU.  This function takes a reference on a new backing page which
- * must be dropped with a corresponding call to sgx_encl_put_backing().
+ * When @indirect is true, sets the active memcg from one of the mms in the
+ * enclave's mm_list prior to any backing page allocation, in order to ensure
+ * that shmem page allocations are charged to the enclave.
  *
  * Return:
  *   0 on success,
  *   -errno otherwise.
  */
 int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
-			   struct sgx_backing *backing)
+			   struct sgx_backing *backing, bool indirect)
 {
-	struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
-	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+	struct mem_cgroup *encl_memcg;
+	struct mem_cgroup *memcg;
 	int ret;
 
-	ret = __sgx_encl_get_backing(encl, page_index, backing);
+	if (!indirect)
+		return  __sgx_encl_get_backing(encl, page_index, backing);
 
+	encl_memcg = sgx_encl_get_mem_cgroup(encl);
+	memcg = set_active_memcg(encl_memcg);
+	ret = __sgx_encl_get_backing(encl, page_index, backing);
 	set_active_memcg(memcg);
 	mem_cgroup_put(encl_memcg);
 
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index f94ff14c9486..549cd2e8d98b 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -103,12 +103,11 @@  static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
 int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
 		     unsigned long end, unsigned long vm_flags);
 
-bool current_is_ksgxd(void);
 void sgx_encl_release(struct kref *ref);
 int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
 const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
 int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
-			   struct sgx_backing *backing);
+			   struct sgx_backing *backing, bool indirect);
 void sgx_encl_put_backing(struct sgx_backing *backing);
 int sgx_encl_test_and_clear_young(struct mm_struct *mm,
 				  struct sgx_encl_page *page);
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
index 16b6d9f909eb..d399fda2b55e 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -93,9 +93,10 @@  bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
 /**
  * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
  * @root:	Root of the tree to start walking from.
+ * @indirect:   In ksgxd or EPC cgroup work queue context.
  * Return:	Number of pages reclaimed.
  */
-unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
+static unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root, bool indirect)
 {
 	/*
 	 * Attempting to reclaim only a few pages will often fail and is
@@ -119,7 +120,7 @@  unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
 		rcu_read_unlock();
 
 		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
-		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
+		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan, indirect);
 
 		rcu_read_lock();
 		css_put(pos);
@@ -176,7 +177,7 @@  static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
 			break;
 
 		/* Keep reclaiming until above condition is met. */
-		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
+		sgx_epc_cgroup_reclaim_pages(epc_cg->cg, true);
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 4f5824c4751d..51904f191b97 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -254,7 +254,7 @@  static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
 }
 
 static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
-				struct sgx_backing *backing)
+				struct sgx_backing *backing, bool indirect)
 {
 	struct sgx_encl_page *encl_page = epc_page->owner;
 	struct sgx_encl *encl = encl_page->encl;
@@ -270,7 +270,7 @@  static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
 	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
 		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
-					   &secs_backing);
+					   &secs_backing, indirect);
 		if (ret)
 			goto out;
 
@@ -304,9 +304,11 @@  static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
  * @lru:	The LRU from which pages are reclaimed.
  * @nr_to_scan: Pointer to the target number of pages to scan, must be less than
  *		SGX_NR_TO_SCAN.
+ * @indirect:	In ksgxd or EPC cgroup work queue contexts.
  * Return:	Number of pages reclaimed.
  */
-unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan)
+unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan,
+			       bool indirect)
 {
 	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
 	struct sgx_backing backing[SGX_NR_TO_SCAN];
@@ -348,7 +350,7 @@  unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to
 		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
 
 		mutex_lock(&encl_page->encl->lock);
-		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
+		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i], indirect);
 		if (ret) {
 			mutex_unlock(&encl_page->encl->lock);
 			goto skip;
@@ -381,7 +383,7 @@  unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to
 			continue;
 
 		encl_page = epc_page->owner;
-		sgx_reclaimer_write(epc_page, &backing[i]);
+		sgx_reclaimer_write(epc_page, &backing[i], indirect);
 
 		kref_put(&encl_page->encl->refcount, sgx_encl_release);
 		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
@@ -399,11 +401,11 @@  static bool sgx_should_reclaim(unsigned long watermark)
 	       !list_empty(&sgx_global_lru.reclaimable);
 }
 
-static void sgx_reclaim_pages_global(void)
+static void sgx_reclaim_pages_global(bool indirect)
 {
 	unsigned int nr_to_scan = SGX_NR_TO_SCAN;
 
-	sgx_reclaim_pages(&sgx_global_lru, &nr_to_scan);
+	sgx_reclaim_pages(&sgx_global_lru, &nr_to_scan, indirect);
 }
 
 /*
@@ -414,7 +416,7 @@  static void sgx_reclaim_pages_global(void)
 void sgx_reclaim_direct(void)
 {
 	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
-		sgx_reclaim_pages_global();
+		sgx_reclaim_pages_global(false);
 }
 
 static int ksgxd(void *p)
@@ -437,7 +439,7 @@  static int ksgxd(void *p)
 				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));
 
 		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
-			sgx_reclaim_pages_global();
+			sgx_reclaim_pages_global(true);
 
 		cond_resched();
 	}
@@ -460,11 +462,6 @@  static bool __init sgx_page_reclaimer_init(void)
 	return true;
 }
 
-bool current_is_ksgxd(void)
-{
-	return current == ksgxd_tsk;
-}
-
 static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 {
 	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
@@ -623,7 +620,7 @@  struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
 		 * Need to do a global reclamation if cgroup was not full but free
 		 * physical pages run out, causing __sgx_alloc_epc_page() to fail.
 		 */
-		sgx_reclaim_pages_global();
+		sgx_reclaim_pages_global(false);
 		cond_resched();
 	}
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 2593c013d091..cfe906054d85 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -110,7 +110,8 @@  void sgx_reclaim_direct(void);
 void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
-unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan);
+unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru, unsigned int *nr_to_scan,
+			       bool indirect);
 
 void sgx_ipi_cb(void *info);