diff mbox series

[3/5] crypto: ccp: Play nice with vmalloc'd memory for SEV command structs

Message ID 20210402233702.3291792-4-seanjc@google.com (mailing list archive)
State Superseded
Delegated to: Herbert Xu
Headers show
Series ccp: KVM: SVM: Use stack for SEV command buffers | expand

Commit Message

Sean Christopherson April 2, 2021, 11:37 p.m. UTC
Copy vmalloc'd data to an internal buffer instead of rejecting outright
so that callers can put SEV command buffers on the stack without running
afoul of CONFIG_VMAP_STACK=y.  Currently, the largest supported command
takes a 68 byte buffer, i.e. pretty much every command can be put on the
stack.  Because sev_cmd_mutex is held for the entirety of a transaction,
only a single bounce buffer is required.

Use a flexible array for the buffer, sized to hold the largest known
command.   Alternatively, the buffer could be a union of all known
command structs, but that would incur a higher maintenance cost due to
the need to update the union for every command in addition to updating
the existing sev_cmd_buffer_len().

Align the buffer to an 8-byte boundary, mimicking the alignment that
would be provided by the compiler if any of the structs were embedded
directly.  Note, sizeof() correctly incorporates this alignment.

Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 drivers/crypto/ccp/sev-dev.c | 33 +++++++++++++++++++++++++++------
 drivers/crypto/ccp/sev-dev.h |  7 +++++++
 2 files changed, 34 insertions(+), 6 deletions(-)

Comments

Christophe Leroy April 3, 2021, 5:05 p.m. UTC | #1
Le 03/04/2021 à 01:37, Sean Christopherson a écrit :
> Copy vmalloc'd data to an internal buffer instead of rejecting outright
> so that callers can put SEV command buffers on the stack without running
> afoul of CONFIG_VMAP_STACK=y.  Currently, the largest supported command
> takes a 68 byte buffer, i.e. pretty much every command can be put on the
> stack.  Because sev_cmd_mutex is held for the entirety of a transaction,
> only a single bounce buffer is required.
> 
> Use a flexible array for the buffer, sized to hold the largest known
> command.   Alternatively, the buffer could be a union of all known
> command structs, but that would incur a higher maintenance cost due to
> the need to update the union for every command in addition to updating
> the existing sev_cmd_buffer_len().
> 
> Align the buffer to an 8-byte boundary, mimicking the alignment that
> would be provided by the compiler if any of the structs were embedded
> directly.  Note, sizeof() correctly incorporates this alignment.
> 
> Cc: Brijesh Singh <brijesh.singh@amd.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   drivers/crypto/ccp/sev-dev.c | 33 +++++++++++++++++++++++++++------
>   drivers/crypto/ccp/sev-dev.h |  7 +++++++
>   2 files changed, 34 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 4c513318f16a..6d5882290cfc 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -135,13 +135,14 @@ static int sev_cmd_buffer_len(int cmd)
>   	return 0;
>   }
>   
> -static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
> +static int __sev_do_cmd_locked(int cmd, void *__data, int *psp_ret)
>   {
>   	struct psp_device *psp = psp_master;
>   	struct sev_device *sev;
>   	unsigned int phys_lsb, phys_msb;
>   	unsigned int reg, ret = 0;
>   	int buf_len;
> +	void *data;
>   
>   	if (!psp || !psp->sev_data)
>   		return -ENODEV;
> @@ -152,11 +153,21 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	sev = psp->sev_data;
>   
>   	buf_len = sev_cmd_buffer_len(cmd);
> -	if (WARN_ON_ONCE(!!data != !!buf_len))
> +	if (WARN_ON_ONCE(!!__data != !!buf_len))
>   		return -EINVAL;
>   
> -	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
> -		return -EINVAL;
> +	if (__data && is_vmalloc_addr(__data)) {

I think you want to use !virt_addr_valid() here, because not only vmalloc addresses are a problem. 
For instance, module addresses are a problem as well.

> +		/*
> +		 * If the incoming buffer is virtually allocated, copy it to
> +		 * the driver's scratch buffer as __pa() will not work for such
> +		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
> +		 * and vmalloc'd data may not be physically contiguous.
> +		 */
> +		data = sev->cmd_buf;
> +		memcpy(data, __data, buf_len);
> +	} else {
> +		data = __data;
> +	}
>   
>   	/* Get the physical address of the command buffer */
>   	phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
> @@ -204,6 +215,13 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
>   			     buf_len, false);
>   
> +	/*
> +	 * Copy potential output from the PSP back to __data.  Do this even on
> +	 * failure in case the caller wants to glean something from the error.
> +	 */
> +	if (__data && data != __data)
> +		memcpy(__data, data, buf_len);
> +
>   	return ret;
>   }
>   
> @@ -978,9 +996,12 @@ int sev_dev_init(struct psp_device *psp)
>   {
>   	struct device *dev = psp->dev;
>   	struct sev_device *sev;
> -	int ret = -ENOMEM;
> +	int ret = -ENOMEM, cmd_buf_size = 0, i;
>   
> -	sev = devm_kzalloc(dev, sizeof(*sev), GFP_KERNEL);
> +	for (i = 0; i < SEV_CMD_MAX; i++)
> +		cmd_buf_size = max(cmd_buf_size, sev_cmd_buffer_len(i));
> +
> +	sev = devm_kzalloc(dev, sizeof(*sev) + cmd_buf_size, GFP_KERNEL);
>   	if (!sev)
>   		goto e_err;
>   
> diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
> index dd5c4fe82914..b43283ce2d73 100644
> --- a/drivers/crypto/ccp/sev-dev.h
> +++ b/drivers/crypto/ccp/sev-dev.h
> @@ -52,6 +52,13 @@ struct sev_device {
>   	u8 api_major;
>   	u8 api_minor;
>   	u8 build;
> +
> +	/*
> +	 * Buffer used for incoming commands whose physical address cannot be
> +	 * resolved via __pa(), e.g. stack pointers when CONFIG_VMAP_STACK=y.
> +	 * Note, alignment isn't strictly required.
> +	 */
> +	u8 cmd_buf[] __aligned(8);
>   };
>   
>   int sev_dev_init(struct psp_device *psp);
>
Christophe Leroy April 3, 2021, 5:13 p.m. UTC | #2
Le 03/04/2021 à 01:37, Sean Christopherson a écrit :
> Copy vmalloc'd data to an internal buffer instead of rejecting outright
> so that callers can put SEV command buffers on the stack without running
> afoul of CONFIG_VMAP_STACK=y.  Currently, the largest supported command
> takes a 68 byte buffer, i.e. pretty much every command can be put on the
> stack.  Because sev_cmd_mutex is held for the entirety of a transaction,
> only a single bounce buffer is required.
> 
> Use a flexible array for the buffer, sized to hold the largest known
> command.   Alternatively, the buffer could be a union of all known
> command structs, but that would incur a higher maintenance cost due to
> the need to update the union for every command in addition to updating
> the existing sev_cmd_buffer_len().
> 
> Align the buffer to an 8-byte boundary, mimicking the alignment that
> would be provided by the compiler if any of the structs were embedded
> directly.  Note, sizeof() correctly incorporates this alignment.
> 
> Cc: Brijesh Singh <brijesh.singh@amd.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   drivers/crypto/ccp/sev-dev.c | 33 +++++++++++++++++++++++++++------
>   drivers/crypto/ccp/sev-dev.h |  7 +++++++
>   2 files changed, 34 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 4c513318f16a..6d5882290cfc 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -135,13 +135,14 @@ static int sev_cmd_buffer_len(int cmd)
>   	return 0;
>   }
>   
> -static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
> +static int __sev_do_cmd_locked(int cmd, void *__data, int *psp_ret)
>   {
>   	struct psp_device *psp = psp_master;
>   	struct sev_device *sev;
>   	unsigned int phys_lsb, phys_msb;
>   	unsigned int reg, ret = 0;
>   	int buf_len;
> +	void *data;
>   
>   	if (!psp || !psp->sev_data)
>   		return -ENODEV;
> @@ -152,11 +153,21 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	sev = psp->sev_data;
>   
>   	buf_len = sev_cmd_buffer_len(cmd);
> -	if (WARN_ON_ONCE(!!data != !!buf_len))
> +	if (WARN_ON_ONCE(!!__data != !!buf_len))

Why do you need a double !! ?
I think !__data != !buf_len should be enough.

>   		return -EINVAL;
>   
> -	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
> -		return -EINVAL;
> +	if (__data && is_vmalloc_addr(__data)) {
> +		/*
> +		 * If the incoming buffer is virtually allocated, copy it to
> +		 * the driver's scratch buffer as __pa() will not work for such
> +		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
> +		 * and vmalloc'd data may not be physically contiguous.
> +		 */
> +		data = sev->cmd_buf;
> +		memcpy(data, __data, buf_len);
> +	} else {
> +		data = __data;
> +	}
>   
>   	/* Get the physical address of the command buffer */
>   	phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
> @@ -204,6 +215,13 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
>   			     buf_len, false);
>   
> +	/*
> +	 * Copy potential output from the PSP back to __data.  Do this even on
> +	 * failure in case the caller wants to glean something from the error.
> +	 */
> +	if (__data && data != __data)

IIUC, when __data is NULL, data is also NULL, so this double test is useless.

Checking data != __data should be enough

> +		memcpy(__data, data, buf_len);
> +
>   	return ret;
>   }
>   
> @@ -978,9 +996,12 @@ int sev_dev_init(struct psp_device *psp)
>   {
>   	struct device *dev = psp->dev;
>   	struct sev_device *sev;
> -	int ret = -ENOMEM;
> +	int ret = -ENOMEM, cmd_buf_size = 0, i;
>   
> -	sev = devm_kzalloc(dev, sizeof(*sev), GFP_KERNEL);
> +	for (i = 0; i < SEV_CMD_MAX; i++)
> +		cmd_buf_size = max(cmd_buf_size, sev_cmd_buffer_len(i));
> +
> +	sev = devm_kzalloc(dev, sizeof(*sev) + cmd_buf_size, GFP_KERNEL);
>   	if (!sev)
>   		goto e_err;
>   
> diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
> index dd5c4fe82914..b43283ce2d73 100644
> --- a/drivers/crypto/ccp/sev-dev.h
> +++ b/drivers/crypto/ccp/sev-dev.h
> @@ -52,6 +52,13 @@ struct sev_device {
>   	u8 api_major;
>   	u8 api_minor;
>   	u8 build;
> +
> +	/*
> +	 * Buffer used for incoming commands whose physical address cannot be
> +	 * resolved via __pa(), e.g. stack pointers when CONFIG_VMAP_STACK=y.
> +	 * Note, alignment isn't strictly required.
> +	 */
> +	u8 cmd_buf[] __aligned(8);
>   };
>   
>   int sev_dev_init(struct psp_device *psp);
>
Christophe Leroy April 4, 2021, 6:48 a.m. UTC | #3
Le 03/04/2021 à 01:37, Sean Christopherson a écrit :
> Copy vmalloc'd data to an internal buffer instead of rejecting outright
> so that callers can put SEV command buffers on the stack without running
> afoul of CONFIG_VMAP_STACK=y.  Currently, the largest supported command
> takes a 68 byte buffer, i.e. pretty much every command can be put on the
> stack.  Because sev_cmd_mutex is held for the entirety of a transaction,
> only a single bounce buffer is required.
> 
> Use a flexible array for the buffer, sized to hold the largest known
> command.   Alternatively, the buffer could be a union of all known
> command structs, but that would incur a higher maintenance cost due to
> the need to update the union for every command in addition to updating
> the existing sev_cmd_buffer_len().
> 
> Align the buffer to an 8-byte boundary, mimicking the alignment that
> would be provided by the compiler if any of the structs were embedded
> directly.  Note, sizeof() correctly incorporates this alignment.
> 
> Cc: Brijesh Singh <brijesh.singh@amd.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   drivers/crypto/ccp/sev-dev.c | 33 +++++++++++++++++++++++++++------
>   drivers/crypto/ccp/sev-dev.h |  7 +++++++
>   2 files changed, 34 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
> index 4c513318f16a..6d5882290cfc 100644
> --- a/drivers/crypto/ccp/sev-dev.c
> +++ b/drivers/crypto/ccp/sev-dev.c
> @@ -135,13 +135,14 @@ static int sev_cmd_buffer_len(int cmd)
>   	return 0;
>   }
>   
> -static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
> +static int __sev_do_cmd_locked(int cmd, void *__data, int *psp_ret)
>   {
>   	struct psp_device *psp = psp_master;
>   	struct sev_device *sev;
>   	unsigned int phys_lsb, phys_msb;
>   	unsigned int reg, ret = 0;
>   	int buf_len;
> +	void *data;
>   
>   	if (!psp || !psp->sev_data)
>   		return -ENODEV;
> @@ -152,11 +153,21 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	sev = psp->sev_data;
>   
>   	buf_len = sev_cmd_buffer_len(cmd);
> -	if (WARN_ON_ONCE(!!data != !!buf_len))
> +	if (WARN_ON_ONCE(!!__data != !!buf_len))
>   		return -EINVAL;
>   
> -	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
> -		return -EINVAL;
> +	if (__data && is_vmalloc_addr(__data)) {
> +		/*
> +		 * If the incoming buffer is virtually allocated, copy it to
> +		 * the driver's scratch buffer as __pa() will not work for such
> +		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
> +		 * and vmalloc'd data may not be physically contiguous.
> +		 */
> +		data = sev->cmd_buf;
> +		memcpy(data, __data, buf_len);
> +	} else {
> +		data = __data;
> +	}

I don't know how big commands are, but if they are small, it would probably be more efficient to 
inconditionnally copy them to the buffer rather then doing the test.

>   
>   	/* Get the physical address of the command buffer */
>   	phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
> @@ -204,6 +215,13 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>   	print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
>   			     buf_len, false);
>   
> +	/*
> +	 * Copy potential output from the PSP back to __data.  Do this even on
> +	 * failure in case the caller wants to glean something from the error.
> +	 */
> +	if (__data && data != __data)
> +		memcpy(__data, data, buf_len);
> +
>   	return ret;
>   }
>   
> @@ -978,9 +996,12 @@ int sev_dev_init(struct psp_device *psp)
>   {
>   	struct device *dev = psp->dev;
>   	struct sev_device *sev;
> -	int ret = -ENOMEM;
> +	int ret = -ENOMEM, cmd_buf_size = 0, i;
>   
> -	sev = devm_kzalloc(dev, sizeof(*sev), GFP_KERNEL);
> +	for (i = 0; i < SEV_CMD_MAX; i++)
> +		cmd_buf_size = max(cmd_buf_size, sev_cmd_buffer_len(i));
> +
> +	sev = devm_kzalloc(dev, sizeof(*sev) + cmd_buf_size, GFP_KERNEL);
>   	if (!sev)
>   		goto e_err;
>   
> diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
> index dd5c4fe82914..b43283ce2d73 100644
> --- a/drivers/crypto/ccp/sev-dev.h
> +++ b/drivers/crypto/ccp/sev-dev.h
> @@ -52,6 +52,13 @@ struct sev_device {
>   	u8 api_major;
>   	u8 api_minor;
>   	u8 build;
> +
> +	/*
> +	 * Buffer used for incoming commands whose physical address cannot be
> +	 * resolved via __pa(), e.g. stack pointers when CONFIG_VMAP_STACK=y.
> +	 * Note, alignment isn't strictly required.
> +	 */
> +	u8 cmd_buf[] __aligned(8);
>   };
>   
>   int sev_dev_init(struct psp_device *psp);
>
Sean Christopherson April 5, 2021, 3:06 p.m. UTC | #4
On Sun, Apr 04, 2021, Christophe Leroy wrote:
> 
> Le 03/04/2021 à 01:37, Sean Christopherson a écrit :
> > @@ -152,11 +153,21 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
> >   	sev = psp->sev_data;
> >   	buf_len = sev_cmd_buffer_len(cmd);
> > -	if (WARN_ON_ONCE(!!data != !!buf_len))
> > +	if (WARN_ON_ONCE(!!__data != !!buf_len))
> >   		return -EINVAL;
> > -	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
> > -		return -EINVAL;
> > +	if (__data && is_vmalloc_addr(__data)) {
> > +		/*
> > +		 * If the incoming buffer is virtually allocated, copy it to
> > +		 * the driver's scratch buffer as __pa() will not work for such
> > +		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
> > +		 * and vmalloc'd data may not be physically contiguous.
> > +		 */
> > +		data = sev->cmd_buf;
> > +		memcpy(data, __data, buf_len);
> > +	} else {
> > +		data = __data;
> > +	}
> 
> I don't know how big commands are, but if they are small, it would probably
> be more efficient to inconditionnally copy them to the buffer rather then
> doing the test.

Brijesh, I assume SNP support will need to copy the commands unconditionally? If
yes, it probably makes sense to do so now and avoid vmalloc dependencies
completely.  And I think that would allow for the removal of status_cmd_buf and
init_cmd_buf, or is there another reason those dedicated buffers exist?
Brijesh Singh April 5, 2021, 4:01 p.m. UTC | #5
On 4/5/21 10:06 AM, Sean Christopherson wrote:
> On Sun, Apr 04, 2021, Christophe Leroy wrote:
>> Le 03/04/2021 à 01:37, Sean Christopherson a écrit :
>>> @@ -152,11 +153,21 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
>>>   	sev = psp->sev_data;
>>>   	buf_len = sev_cmd_buffer_len(cmd);
>>> -	if (WARN_ON_ONCE(!!data != !!buf_len))
>>> +	if (WARN_ON_ONCE(!!__data != !!buf_len))
>>>   		return -EINVAL;
>>> -	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
>>> -		return -EINVAL;
>>> +	if (__data && is_vmalloc_addr(__data)) {
>>> +		/*
>>> +		 * If the incoming buffer is virtually allocated, copy it to
>>> +		 * the driver's scratch buffer as __pa() will not work for such
>>> +		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
>>> +		 * and vmalloc'd data may not be physically contiguous.
>>> +		 */
>>> +		data = sev->cmd_buf;
>>> +		memcpy(data, __data, buf_len);
>>> +	} else {
>>> +		data = __data;
>>> +	}
>> I don't know how big commands are, but if they are small, it would probably
>> be more efficient to inconditionnally copy them to the buffer rather then
>> doing the test.
> Brijesh, I assume SNP support will need to copy the commands unconditionally? If
> yes, it probably makes sense to do so now and avoid vmalloc dependencies
> completely.  And I think that would allow for the removal of status_cmd_buf and
> init_cmd_buf, or is there another reason those dedicated buffers exist?


Yes, we need to copy the commands unconditionally for the SNP support.
It makes sense to avoid the vmalloc dependencies. I can't think of any
reason why we would need the status_cmd_buf and init_cmd_buf after those
changes.
diff mbox series

Patch

diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 4c513318f16a..6d5882290cfc 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -135,13 +135,14 @@  static int sev_cmd_buffer_len(int cmd)
 	return 0;
 }
 
-static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
+static int __sev_do_cmd_locked(int cmd, void *__data, int *psp_ret)
 {
 	struct psp_device *psp = psp_master;
 	struct sev_device *sev;
 	unsigned int phys_lsb, phys_msb;
 	unsigned int reg, ret = 0;
 	int buf_len;
+	void *data;
 
 	if (!psp || !psp->sev_data)
 		return -ENODEV;
@@ -152,11 +153,21 @@  static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 	sev = psp->sev_data;
 
 	buf_len = sev_cmd_buffer_len(cmd);
-	if (WARN_ON_ONCE(!!data != !!buf_len))
+	if (WARN_ON_ONCE(!!__data != !!buf_len))
 		return -EINVAL;
 
-	if (WARN_ON_ONCE(data && is_vmalloc_addr(data)))
-		return -EINVAL;
+	if (__data && is_vmalloc_addr(__data)) {
+		/*
+		 * If the incoming buffer is virtually allocated, copy it to
+		 * the driver's scratch buffer as __pa() will not work for such
+		 * addresses, vmalloc_to_page() is not guaranteed to succeed,
+		 * and vmalloc'd data may not be physically contiguous.
+		 */
+		data = sev->cmd_buf;
+		memcpy(data, __data, buf_len);
+	} else {
+		data = __data;
+	}
 
 	/* Get the physical address of the command buffer */
 	phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
@@ -204,6 +215,13 @@  static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 	print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
 			     buf_len, false);
 
+	/*
+	 * Copy potential output from the PSP back to __data.  Do this even on
+	 * failure in case the caller wants to glean something from the error.
+	 */
+	if (__data && data != __data)
+		memcpy(__data, data, buf_len);
+
 	return ret;
 }
 
@@ -978,9 +996,12 @@  int sev_dev_init(struct psp_device *psp)
 {
 	struct device *dev = psp->dev;
 	struct sev_device *sev;
-	int ret = -ENOMEM;
+	int ret = -ENOMEM, cmd_buf_size = 0, i;
 
-	sev = devm_kzalloc(dev, sizeof(*sev), GFP_KERNEL);
+	for (i = 0; i < SEV_CMD_MAX; i++)
+		cmd_buf_size = max(cmd_buf_size, sev_cmd_buffer_len(i));
+
+	sev = devm_kzalloc(dev, sizeof(*sev) + cmd_buf_size, GFP_KERNEL);
 	if (!sev)
 		goto e_err;
 
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index dd5c4fe82914..b43283ce2d73 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -52,6 +52,13 @@  struct sev_device {
 	u8 api_major;
 	u8 api_minor;
 	u8 build;
+
+	/*
+	 * Buffer used for incoming commands whose physical address cannot be
+	 * resolved via __pa(), e.g. stack pointers when CONFIG_VMAP_STACK=y.
+	 * Note, alignment isn't strictly required.
+	 */
+	u8 cmd_buf[] __aligned(8);
 };
 
 int sev_dev_init(struct psp_device *psp);