diff mbox

[V3,6/6] crypto/nx: Add P9 NX support for 842 compression engine

Message ID 1500699702.23205.8.camel@hbabu-laptop (mailing list archive)
State Changes Requested
Delegated to: Herbert Xu
Headers show

Commit Message

Haren Myneni July 22, 2017, 5:01 a.m. UTC
This patch adds P9 NX support for 842 compression engine. Virtual
Accelerator Switchboard (VAS) is used to access 842 engine on P9.

For each NX engine per chip, setup receive window using
vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
pid and tid values. This unique (lpid, pid, tid) combination will
be used to identify the target engine.

For crypto open request, open send window on the NX engine for
the corresponding chip / cpu where the open request is executed.
This send window will be closed upon crypto close request.

NX provides high and normal priority FIFOs. For compression /
decompression requests, we use only hight priority FIFOs in kernel.

Each NX request will be communicated to VAS using copy/paste
instructions with vas_copy_crb() / vas_paste_crb() functions.

Signed-off-by: Haren Myneni <haren@us.ibm.com>
---
 drivers/crypto/nx/Kconfig          |   1 +
 drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
 drivers/crypto/nx/nx-842.c         |   2 +-
 3 files changed, 371 insertions(+), 7 deletions(-)

Comments

Ram Pai July 24, 2017, 4:46 p.m. UTC | #1
On Fri, Jul 21, 2017 at 10:01:42PM -0700, Haren Myneni wrote:
> 
> This patch adds P9 NX support for 842 compression engine. Virtual
> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
> 
> For each NX engine per chip, setup receive window using
> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
> pid and tid values. This unique (lpid, pid, tid) combination will
> be used to identify the target engine.
> 
> For crypto open request, open send window on the NX engine for
> the corresponding chip / cpu where the open request is executed.
> This send window will be closed upon crypto close request.
> 
> NX provides high and normal priority FIFOs. For compression /
> decompression requests, we use only hight priority FIFOs in kernel.
> 
> Each NX request will be communicated to VAS using copy/paste
> instructions with vas_copy_crb() / vas_paste_crb() functions.
> 

Reviewed-by: Ram Pai <linuxram@us.ibm.com>

> Signed-off-by: Haren Myneni <haren@us.ibm.com>
> ---
>  drivers/crypto/nx/Kconfig          |   1 +
>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>  drivers/crypto/nx/nx-842.c         |   2 +-
>  3 files changed, 371 insertions(+), 7 deletions(-)
Michael Ellerman Aug. 28, 2017, 11:25 p.m. UTC | #2
Hi Haren,

Some comments inline ...

Haren Myneni <haren@linux.vnet.ibm.com> writes:

> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
> index c0dd4c7e17d3..13089a0b9dfa 100644
> --- a/drivers/crypto/nx/nx-842-powernv.c
> +++ b/drivers/crypto/nx/nx-842-powernv.c
> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>  
>  #define WORKMEM_ALIGN	(CRB_ALIGN)
>  #define CSB_WAIT_MAX	(5000) /* ms */
> +#define VAS_RETRIES	(10)

Where does that number come from?

Do we have any idea what the trade off is between retrying vs just
falling back to doing the request in software?

> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
> +#define MAX_CREDITS_PER_RXFIFO	(1024)
>  
>  struct nx842_workmem {
>  	/* Below fields must be properly aligned */
> @@ -42,16 +46,27 @@ struct nx842_workmem {
>  
>  	ktime_t start;
>  
> +	struct vas_window *txwin;	/* Used with VAS function */

I don't understand how it makes sense to put txwin and start between the
fields above, and the padding.

If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
advance it and mean you end up writing over start and txwin.

That's probably not your bug, the code is already like that.

>  	char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>  } __packed __aligned(WORKMEM_ALIGN);

> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>  	list_add(&coproc->list, &nx842_coprocs);
>  }
>  
> +/*
> + * Identify chip ID for each CPU and save coprocesor adddress for the
> + * corresponding NX engine in percpu coproc_inst.
> + * coproc_inst is used in crypto_init to open send window on the NX instance
> + * for the corresponding CPU / chip where the open request is executed.
> + */
> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
> +{
> +	unsigned int i, chip_id;
> +
> +	for_each_possible_cpu(i) {
> +		chip_id = cpu_to_chip_id(i);
> +
> +		if (coproc->chip_id == chip_id)
> +			per_cpu(coproc_inst, i) = coproc;
> +	}
> +}
> +
> +
> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
> +{
> +	struct vas_window *txwin = NULL;
> +	struct vas_tx_win_attr txattr;
> +
> +	/*
> +	 * Kernel requests will be high priority. So open send
> +	 * windows only for high priority RxFIFO entries.
> +	 */
> +	vas_init_tx_win_attr(&txattr, coproc->ct);
> +	txattr.lpid = 0;	/* lpid is 0 for kernel requests */
> +	txattr.pid = mfspr(SPRN_PID);

Should we be using SPRN_PID here? That makes it appear as if it comes
from the current user process, which seems fishy.

> +	/*
> +	 * Open a VAS send window which is used to send request to NX.
> +	 */
> +	txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
> +	if (IS_ERR(txwin)) {
> +		pr_err("ibm,nx-842: Can not open TX window: %ld\n",
> +				PTR_ERR(txwin));
> +		return NULL;
> +	}
> +
> +	return txwin;
> +}
> +
> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
> +					int vasid)
> +{
> +	struct vas_window *rxwin = NULL;
> +	struct vas_rx_win_attr rxattr;
> +	struct nx842_coproc *coproc;
> +	u32 lpid, pid, tid, fifo_size;
> +	u64 rx_fifo;
> +	const char *priority;
> +	int ret;
> +
> +	ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
                                                          ^^^^^^^^
                                                          Unnecessary cast.

> +	if (ret) {
> +		pr_err("Missing rx-fifo-address property\n");
> +		return ret;
> +	}
> +
> +	ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
> +	if (ret) {
> +		pr_err("Missing rx-fifo-size property\n");
> +		return ret;
> +	}
> +
> +	ret = of_property_read_u32(dn, "lpid", &lpid);
> +	if (ret) {
> +		pr_err("Missing lpid property\n");
> +		return ret;
> +	}
> +
> +	ret = of_property_read_u32(dn, "pid", &pid);
> +	if (ret) {
> +		pr_err("Missing pid property\n");
> +		return ret;
> +	}
> +
> +	ret = of_property_read_u32(dn, "tid", &tid);
> +	if (ret) {
> +		pr_err("Missing tid property\n");
> +		return ret;
> +	}
> +
> +	ret = of_property_read_string(dn, "priority", &priority);
> +	if (ret) {
> +		pr_err("Missing priority property\n");
> +		return ret;
> +	}
> +
> +	coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
> +	if (!coproc)
> +		return -ENOMEM;
> +
> +	if (!strcmp(priority, "High"))
> +		coproc->ct = VAS_COP_TYPE_842_HIPRI;
> +	else if (!strcmp(priority, "Normal"))
> +		coproc->ct = VAS_COP_TYPE_842;
> +	else {
> +		pr_err("Invalid RxFIFO priority value\n");
> +		ret =  -EINVAL;
> +		goto err_out;
> +	}
> +
> +	vas_init_rx_win_attr(&rxattr, coproc->ct);
> +	rxattr.rx_fifo = (void *)rx_fifo;
> +	rxattr.rx_fifo_size = fifo_size;
> +	rxattr.lnotify_lpid = lpid;
> +	rxattr.lnotify_pid = pid;
> +	rxattr.lnotify_tid = tid;
> +	rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
> +
> +	/*
> +	 * Open a VAS receice window which is used to configure RxFIFO
> +	 * for NX.
> +	 */
> +	rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
> +	if (IS_ERR(rxwin)) {
> +		ret = PTR_ERR(rxwin);
> +		pr_err("setting RxFIFO with VAS failed: %d\n",
> +			ret);
> +		goto err_out;
> +	}
> +
> +	coproc->vas.rxwin = rxwin;
> +	coproc->vas.id = vasid;
> +	nx842_add_coprocs_list(coproc, chip_id);
> +
> +	/*
> +	 * Kernel requests use only high priority FIFOs. So save coproc
> +	 * info in percpu coproc_inst which will be used to open send
> +	 * windows for crypto open requests later.
> +	 */
> +	if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
> +		nx842_set_per_cpu_coproc(coproc);
> +
> +	return 0;
> +
> +err_out:
> +	kfree(coproc);
> +	return ret;
> +}
> +
> +
> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
> +{
> +	struct device_node *dn;
> +	int chip_id, vasid, ret = 0;
> +	int nx_fifo_found = 0;
> +
> +	chip_id = of_get_ibm_chip_id(pn);
> +	if (chip_id < 0) {
> +		pr_err("ibm,chip-id missing\n");
> +		return -EINVAL;
> +	}
> +
> +	dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");

That's the wrong compatible, that will find the raw node, not the one
that's intended for Linux use.

It's also not really the NX drivers business to be looking for VAS
nodes. You should just look for the P9 NX nodes, and if VAS wasn't
configured for some reason then the VAS window open will fail and you
detect it then.

> +	if (!dn) {
> +		pr_err("Missing VAS device node\n");
> +		return -EINVAL;
> +	}
> +
> +	if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
> +		pr_err("Missing ibm,vas-id device property\n");
> +		of_node_put(dn);
> +		return -EINVAL;
> +	}
> +
> +	of_node_put(dn);

So basically just drop all of that above.

> diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
> index d94e25df503b..da3cb8c35ec7 100644
> --- a/drivers/crypto/nx/nx-842.c
> +++ b/drivers/crypto/nx/nx-842.c
> @@ -116,7 +116,7 @@ int nx842_crypto_init(struct crypto_tfm *tfm, struct nx842_driver *driver)
>  
>  	spin_lock_init(&ctx->lock);
>  	ctx->driver = driver;
> -	ctx->wmem = kmalloc(driver->workmem_size, GFP_KERNEL);
> +	ctx->wmem = kzalloc(driver->workmem_size, GFP_KERNEL);
>  	ctx->sbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>  	ctx->dbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>  	if (!ctx->wmem || !ctx->sbounce || !ctx->dbounce) {

That looks OK but should be split into a separate patch because it
affects the Power8 code as well as the pseries code.

cheers
Dan Streetman Aug. 29, 2017, 1:32 p.m. UTC | #3
On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> Hi Haren,
>
> Some comments inline ...
>
> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>
>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>> index c0dd4c7e17d3..13089a0b9dfa 100644
>> --- a/drivers/crypto/nx/nx-842-powernv.c
>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>
>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>  #define CSB_WAIT_MAX (5000) /* ms */
>> +#define VAS_RETRIES  (10)
>
> Where does that number come from?
>
> Do we have any idea what the trade off is between retrying vs just
> falling back to doing the request in software?
>
>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>> +#define MAX_CREDITS_PER_RXFIFO       (1024)
>>
>>  struct nx842_workmem {
>>       /* Below fields must be properly aligned */
>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>
>>       ktime_t start;
>>
>> +     struct vas_window *txwin;       /* Used with VAS function */
>
> I don't understand how it makes sense to put txwin and start between the
> fields above, and the padding.

workmem is a scratch buffer and shouldn't be used for something
persistent like this.

>
> If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
> advance it and mean you end up writing over start and txwin.
>
> That's probably not your bug, the code is already like that.

no, it's a bug in this patch, because workmem is scratch whose
contents are only valid for the duration of each operation (compress
or decompress).

>
>>       char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>  } __packed __aligned(WORKMEM_ALIGN);
>
>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>>       list_add(&coproc->list, &nx842_coprocs);
>>  }
>>
>> +/*
>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>> + * corresponding NX engine in percpu coproc_inst.
>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>> + * for the corresponding CPU / chip where the open request is executed.
>> + */
>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>> +{
>> +     unsigned int i, chip_id;
>> +
>> +     for_each_possible_cpu(i) {
>> +             chip_id = cpu_to_chip_id(i);
>> +
>> +             if (coproc->chip_id == chip_id)
>> +                     per_cpu(coproc_inst, i) = coproc;
>> +     }
>> +}
>> +
>> +
>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>> +{
>> +     struct vas_window *txwin = NULL;
>> +     struct vas_tx_win_attr txattr;
>> +
>> +     /*
>> +      * Kernel requests will be high priority. So open send
>> +      * windows only for high priority RxFIFO entries.
>> +      */
>> +     vas_init_tx_win_attr(&txattr, coproc->ct);
>> +     txattr.lpid = 0;        /* lpid is 0 for kernel requests */
>> +     txattr.pid = mfspr(SPRN_PID);
>
> Should we be using SPRN_PID here? That makes it appear as if it comes
> from the current user process, which seems fishy.
>
>> +     /*
>> +      * Open a VAS send window which is used to send request to NX.
>> +      */
>> +     txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>> +     if (IS_ERR(txwin)) {
>> +             pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>> +                             PTR_ERR(txwin));
>> +             return NULL;
>> +     }
>> +
>> +     return txwin;
>> +}
>> +
>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>> +                                     int vasid)
>> +{
>> +     struct vas_window *rxwin = NULL;
>> +     struct vas_rx_win_attr rxattr;
>> +     struct nx842_coproc *coproc;
>> +     u32 lpid, pid, tid, fifo_size;
>> +     u64 rx_fifo;
>> +     const char *priority;
>> +     int ret;
>> +
>> +     ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>                                                           ^^^^^^^^
>                                                           Unnecessary cast.
>
>> +     if (ret) {
>> +             pr_err("Missing rx-fifo-address property\n");
>> +             return ret;
>> +     }
>> +
>> +     ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>> +     if (ret) {
>> +             pr_err("Missing rx-fifo-size property\n");
>> +             return ret;
>> +     }
>> +
>> +     ret = of_property_read_u32(dn, "lpid", &lpid);
>> +     if (ret) {
>> +             pr_err("Missing lpid property\n");
>> +             return ret;
>> +     }
>> +
>> +     ret = of_property_read_u32(dn, "pid", &pid);
>> +     if (ret) {
>> +             pr_err("Missing pid property\n");
>> +             return ret;
>> +     }
>> +
>> +     ret = of_property_read_u32(dn, "tid", &tid);
>> +     if (ret) {
>> +             pr_err("Missing tid property\n");
>> +             return ret;
>> +     }
>> +
>> +     ret = of_property_read_string(dn, "priority", &priority);
>> +     if (ret) {
>> +             pr_err("Missing priority property\n");
>> +             return ret;
>> +     }
>> +
>> +     coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
>> +     if (!coproc)
>> +             return -ENOMEM;
>> +
>> +     if (!strcmp(priority, "High"))
>> +             coproc->ct = VAS_COP_TYPE_842_HIPRI;
>> +     else if (!strcmp(priority, "Normal"))
>> +             coproc->ct = VAS_COP_TYPE_842;
>> +     else {
>> +             pr_err("Invalid RxFIFO priority value\n");
>> +             ret =  -EINVAL;
>> +             goto err_out;
>> +     }
>> +
>> +     vas_init_rx_win_attr(&rxattr, coproc->ct);
>> +     rxattr.rx_fifo = (void *)rx_fifo;
>> +     rxattr.rx_fifo_size = fifo_size;
>> +     rxattr.lnotify_lpid = lpid;
>> +     rxattr.lnotify_pid = pid;
>> +     rxattr.lnotify_tid = tid;
>> +     rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
>> +
>> +     /*
>> +      * Open a VAS receice window which is used to configure RxFIFO
>> +      * for NX.
>> +      */
>> +     rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
>> +     if (IS_ERR(rxwin)) {
>> +             ret = PTR_ERR(rxwin);
>> +             pr_err("setting RxFIFO with VAS failed: %d\n",
>> +                     ret);
>> +             goto err_out;
>> +     }
>> +
>> +     coproc->vas.rxwin = rxwin;
>> +     coproc->vas.id = vasid;
>> +     nx842_add_coprocs_list(coproc, chip_id);
>> +
>> +     /*
>> +      * Kernel requests use only high priority FIFOs. So save coproc
>> +      * info in percpu coproc_inst which will be used to open send
>> +      * windows for crypto open requests later.
>> +      */
>> +     if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
>> +             nx842_set_per_cpu_coproc(coproc);
>> +
>> +     return 0;
>> +
>> +err_out:
>> +     kfree(coproc);
>> +     return ret;
>> +}
>> +
>> +
>> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
>> +{
>> +     struct device_node *dn;
>> +     int chip_id, vasid, ret = 0;
>> +     int nx_fifo_found = 0;
>> +
>> +     chip_id = of_get_ibm_chip_id(pn);
>> +     if (chip_id < 0) {
>> +             pr_err("ibm,chip-id missing\n");
>> +             return -EINVAL;
>> +     }
>> +
>> +     dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
>
> That's the wrong compatible, that will find the raw node, not the one
> that's intended for Linux use.
>
> It's also not really the NX drivers business to be looking for VAS
> nodes. You should just look for the P9 NX nodes, and if VAS wasn't
> configured for some reason then the VAS window open will fail and you
> detect it then.
>
>> +     if (!dn) {
>> +             pr_err("Missing VAS device node\n");
>> +             return -EINVAL;
>> +     }
>> +
>> +     if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
>> +             pr_err("Missing ibm,vas-id device property\n");
>> +             of_node_put(dn);
>> +             return -EINVAL;
>> +     }
>> +
>> +     of_node_put(dn);
>
> So basically just drop all of that above.
>
>> diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
>> index d94e25df503b..da3cb8c35ec7 100644
>> --- a/drivers/crypto/nx/nx-842.c
>> +++ b/drivers/crypto/nx/nx-842.c
>> @@ -116,7 +116,7 @@ int nx842_crypto_init(struct crypto_tfm *tfm, struct nx842_driver *driver)
>>
>>       spin_lock_init(&ctx->lock);
>>       ctx->driver = driver;
>> -     ctx->wmem = kmalloc(driver->workmem_size, GFP_KERNEL);
>> +     ctx->wmem = kzalloc(driver->workmem_size, GFP_KERNEL);

don't put persistent fields in the workmem, and you don't need to zero it.

>>       ctx->sbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>>       ctx->dbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>>       if (!ctx->wmem || !ctx->sbounce || !ctx->dbounce) {
>
> That looks OK but should be split into a separate patch because it
> affects the Power8 code as well as the pseries code.
>
> cheers
Dan Streetman Aug. 29, 2017, 1:58 p.m. UTC | #4
On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>
> This patch adds P9 NX support for 842 compression engine. Virtual
> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>
> For each NX engine per chip, setup receive window using
> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
> pid and tid values. This unique (lpid, pid, tid) combination will
> be used to identify the target engine.
>
> For crypto open request, open send window on the NX engine for
> the corresponding chip / cpu where the open request is executed.
> This send window will be closed upon crypto close request.
>
> NX provides high and normal priority FIFOs. For compression /
> decompression requests, we use only hight priority FIFOs in kernel.
>
> Each NX request will be communicated to VAS using copy/paste
> instructions with vas_copy_crb() / vas_paste_crb() functions.
>
> Signed-off-by: Haren Myneni <haren@us.ibm.com>
> ---
>  drivers/crypto/nx/Kconfig          |   1 +
>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>  drivers/crypto/nx/nx-842.c         |   2 +-
>  3 files changed, 371 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
> index ad7552a6998c..cd5dda9c48f4 100644
> --- a/drivers/crypto/nx/Kconfig
> +++ b/drivers/crypto/nx/Kconfig
> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
>         tristate "Compression acceleration support on PowerNV platform"
>         depends on PPC_POWERNV
> +       depends on PPC_VAS
>         default y
>         help
>           Support for PowerPC Nest (NX) compression acceleration. This
> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
> index c0dd4c7e17d3..13089a0b9dfa 100644
> --- a/drivers/crypto/nx/nx-842-powernv.c
> +++ b/drivers/crypto/nx/nx-842-powernv.c
> @@ -23,6 +23,7 @@
>  #include <asm/prom.h>
>  #include <asm/icswx.h>
>  #include <asm/vas.h>
> +#include <asm/reg.h>
>
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>
>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>  #define CSB_WAIT_MAX   (5000) /* ms */
> +#define VAS_RETRIES    (10)
> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
> +#define MAX_CREDITS_PER_RXFIFO (1024)
>
>  struct nx842_workmem {
>         /* Below fields must be properly aligned */
> @@ -42,16 +46,27 @@ struct nx842_workmem {
>
>         ktime_t start;
>
> +       struct vas_window *txwin;       /* Used with VAS function */
>         char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>  } __packed __aligned(WORKMEM_ALIGN);
>
>  struct nx842_coproc {
>         unsigned int chip_id;
>         unsigned int ct;
> -       unsigned int ci;
> +       unsigned int ci;        /* Coprocessor instance, used with icswx */
> +       struct {
> +               struct vas_window *rxwin;
> +               int id;
> +       } vas;
>         struct list_head list;
>  };
>
> +/*
> + * Send the request to NX engine on the chip for the corresponding CPU
> + * where the process is executing. Use with VAS function.
> + */
> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
> +
>  /* no cpu hotplug on powernv, so this list never changes after init */
>  static LIST_HEAD(nx842_coprocs);
>  static unsigned int nx842_ct;  /* used in icswx function */
> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
>  }
>
>  /**
> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
> + *
> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
> + * This compresses or decompresses the provided input buffer into the provided
> + * output buffer.
> + *
> + * Upon return from this function @outlen contains the length of the
> + * output data.  If there is an error then @outlen will be 0 and an
> + * error will be specified by the return code from this function.
> + *
> + * The @workmem buffer should only be used by one function call at a time.
> + *
> + * @in: input buffer pointer
> + * @inlen: input buffer size
> + * @out: output buffer pointer
> + * @outlenp: output buffer size pointer
> + * @workmem: working memory buffer pointer, size determined by
> + *           nx842_powernv_driver.workmem_size
> + * @fc: function code, see CCW Function Codes in nx-842.h
> + *
> + * Returns:
> + *   0         Success, output of length @outlenp stored in the buffer
> + *             at @out
> + *   -ENODEV   Hardware unavailable
> + *   -ENOSPC   Output buffer is to small
> + *   -EMSGSIZE Input buffer too large
> + *   -EINVAL   buffer constraints do not fix nx842_constraints
> + *   -EPROTO   hardware error during operation
> + *   -ETIMEDOUT        hardware did not complete operation in reasonable time
> + *   -EINTR    operation was aborted
> + */
> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
> +                                 unsigned char *out, unsigned int *outlenp,
> +                                 void *workmem, int fc)
> +{
> +       struct coprocessor_request_block *crb;
> +       struct coprocessor_status_block *csb;
> +       struct nx842_workmem *wmem;
> +       struct vas_window *txwin;
> +       int ret, i = 0;
> +       u32 ccw;
> +       unsigned int outlen = *outlenp;
> +
> +       wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
> +
> +       *outlenp = 0;
> +
> +       crb = &wmem->crb;
> +       csb = &crb->csb;
> +
> +       ret = nx842_config_crb(in, inlen, out, outlen, wmem);
> +       if (ret)
> +               return ret;
> +
> +       ccw = 0;
> +       ccw = SET_FIELD(CCW_FC_842, ccw, fc);
> +       crb->ccw = cpu_to_be32(ccw);
> +
> +       txwin = wmem->txwin;
> +       /* shoudn't happen, we don't load without a coproc */
> +       if (!txwin) {
> +               pr_err_ratelimited("NX-842 coprocessor is not available");
> +               return -ENODEV;
> +       }
> +
> +       do {
> +               wmem->start = ktime_get();
> +               preempt_disable();
> +               /*
> +                * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
> +                * @crb, @offset and @first (must be true)
> +                */
> +               vas_copy_crb(crb, 0, 1);
> +
> +               /*
> +                * VAS paste previously copied CRB to NX.
> +                * @txwin, @offset, @last (must be true) and @re is
> +                * expected/assumed to be true for NX windows.
> +                */
> +               ret = vas_paste_crb(txwin, 0, 1, 1);
> +               preempt_enable();
> +               /*
> +                * Retry copy/paste function for VAS failures.
> +                */
> +       } while (ret && (i++ < VAS_RETRIES));
> +
> +       if (ret) {
> +               pr_err_ratelimited("VAS copy/paste failed\n");
> +               return ret;
> +       }
> +
> +       ret = wait_for_csb(wmem, csb);
> +       if (!ret)
> +               *outlenp = be32_to_cpu(csb->count);
> +
> +       return ret;
> +}
> +
> +/**
>   * nx842_powernv_compress - Compress data using the 842 algorithm
>   *
>   * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>         list_add(&coproc->list, &nx842_coprocs);
>  }
>
> +/*
> + * Identify chip ID for each CPU and save coprocesor adddress for the
> + * corresponding NX engine in percpu coproc_inst.
> + * coproc_inst is used in crypto_init to open send window on the NX instance
> + * for the corresponding CPU / chip where the open request is executed.
> + */
> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
> +{
> +       unsigned int i, chip_id;
> +
> +       for_each_possible_cpu(i) {
> +               chip_id = cpu_to_chip_id(i);
> +
> +               if (coproc->chip_id == chip_id)
> +                       per_cpu(coproc_inst, i) = coproc;
> +       }
> +}
> +
> +
> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
> +{
> +       struct vas_window *txwin = NULL;
> +       struct vas_tx_win_attr txattr;
> +
> +       /*
> +        * Kernel requests will be high priority. So open send
> +        * windows only for high priority RxFIFO entries.
> +        */
> +       vas_init_tx_win_attr(&txattr, coproc->ct);
> +       txattr.lpid = 0;        /* lpid is 0 for kernel requests */
> +       txattr.pid = mfspr(SPRN_PID);
> +
> +       /*
> +        * Open a VAS send window which is used to send request to NX.
> +        */
> +       txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
> +       if (IS_ERR(txwin)) {
> +               pr_err("ibm,nx-842: Can not open TX window: %ld\n",
> +                               PTR_ERR(txwin));
> +               return NULL;
> +       }
> +
> +       return txwin;
> +}
> +
> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
> +                                       int vasid)
> +{
> +       struct vas_window *rxwin = NULL;
> +       struct vas_rx_win_attr rxattr;
> +       struct nx842_coproc *coproc;
> +       u32 lpid, pid, tid, fifo_size;
> +       u64 rx_fifo;
> +       const char *priority;
> +       int ret;
> +
> +       ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
> +       if (ret) {
> +               pr_err("Missing rx-fifo-address property\n");
> +               return ret;
> +       }
> +
> +       ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
> +       if (ret) {
> +               pr_err("Missing rx-fifo-size property\n");
> +               return ret;
> +       }
> +
> +       ret = of_property_read_u32(dn, "lpid", &lpid);
> +       if (ret) {
> +               pr_err("Missing lpid property\n");
> +               return ret;
> +       }
> +
> +       ret = of_property_read_u32(dn, "pid", &pid);
> +       if (ret) {
> +               pr_err("Missing pid property\n");
> +               return ret;
> +       }
> +
> +       ret = of_property_read_u32(dn, "tid", &tid);
> +       if (ret) {
> +               pr_err("Missing tid property\n");
> +               return ret;
> +       }
> +
> +       ret = of_property_read_string(dn, "priority", &priority);
> +       if (ret) {
> +               pr_err("Missing priority property\n");
> +               return ret;
> +       }
> +
> +       coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
> +       if (!coproc)
> +               return -ENOMEM;
> +
> +       if (!strcmp(priority, "High"))
> +               coproc->ct = VAS_COP_TYPE_842_HIPRI;
> +       else if (!strcmp(priority, "Normal"))
> +               coproc->ct = VAS_COP_TYPE_842;
> +       else {
> +               pr_err("Invalid RxFIFO priority value\n");
> +               ret =  -EINVAL;
> +               goto err_out;
> +       }
> +
> +       vas_init_rx_win_attr(&rxattr, coproc->ct);
> +       rxattr.rx_fifo = (void *)rx_fifo;
> +       rxattr.rx_fifo_size = fifo_size;
> +       rxattr.lnotify_lpid = lpid;
> +       rxattr.lnotify_pid = pid;
> +       rxattr.lnotify_tid = tid;
> +       rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
> +
> +       /*
> +        * Open a VAS receice window which is used to configure RxFIFO
> +        * for NX.
> +        */
> +       rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
> +       if (IS_ERR(rxwin)) {
> +               ret = PTR_ERR(rxwin);
> +               pr_err("setting RxFIFO with VAS failed: %d\n",
> +                       ret);
> +               goto err_out;
> +       }
> +
> +       coproc->vas.rxwin = rxwin;
> +       coproc->vas.id = vasid;
> +       nx842_add_coprocs_list(coproc, chip_id);
> +
> +       /*
> +        * Kernel requests use only high priority FIFOs. So save coproc
> +        * info in percpu coproc_inst which will be used to open send
> +        * windows for crypto open requests later.
> +        */
> +       if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
> +               nx842_set_per_cpu_coproc(coproc);
> +
> +       return 0;
> +
> +err_out:
> +       kfree(coproc);
> +       return ret;
> +}
> +
> +
> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
> +{
> +       struct device_node *dn;
> +       int chip_id, vasid, ret = 0;
> +       int nx_fifo_found = 0;
> +
> +       chip_id = of_get_ibm_chip_id(pn);
> +       if (chip_id < 0) {
> +               pr_err("ibm,chip-id missing\n");
> +               return -EINVAL;
> +       }
> +
> +       dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
> +
> +       if (!dn) {
> +               pr_err("Missing VAS device node\n");
> +               return -EINVAL;
> +       }
> +
> +       if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
> +               pr_err("Missing ibm,vas-id device property\n");
> +               of_node_put(dn);
> +               return -EINVAL;
> +       }
> +
> +       of_node_put(dn);
> +
> +       for_each_child_of_node(pn, dn) {
> +               if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
> +                       ret = vas_cfg_coproc_info(dn, chip_id, vasid);
> +                       if (ret) {
> +                               of_node_put(dn);
> +                               return ret;
> +                       }
> +                       nx_fifo_found++;
> +               }
> +       }
> +
> +       if (!nx_fifo_found) {
> +               pr_err("NX842 FIFO nodes are missing\n");
> +               ret = -EINVAL;
> +       }
> +
> +       return ret;
> +}
> +
>  static int __init nx842_powernv_probe(struct device_node *dn)
>  {
>         struct nx842_coproc *coproc;
> @@ -622,6 +928,9 @@ static void nx842_delete_coprocs(void)
>         struct nx842_coproc *coproc, *n;
>
>         list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
> +               if (coproc->vas.rxwin)
> +                       vas_win_close(coproc->vas.rxwin);
> +
>                 list_del(&coproc->list);
>                 kfree(coproc);
>         }
> @@ -643,6 +952,46 @@ static struct nx842_driver nx842_powernv_driver = {
>         .decompress =   nx842_powernv_decompress,
>  };
>
> +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
> +{
> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
> +       struct nx842_workmem *wmem;
> +       struct nx842_coproc *coproc;
> +       int ret;
> +
> +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
> +
> +       if (ret)
> +               return ret;
> +
> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
> +       coproc = per_cpu(coproc_inst, smp_processor_id());

this is wrong.  the crypto transform init function is not guaranteed
to be called by the same processor that later uses it.  Just because
that happens to be how zswap operates doesn't guarantee other crypto
users will do the same.

> +
> +       ret = -EINVAL;
> +       if (coproc && coproc->vas.rxwin) {
> +               wmem->txwin = nx842_alloc_txwin(coproc);

this is wrong.  the workmem is scratch memory that's valid only for
the duration of a single operation.

do you actually need a txwin per crypto transform?  or do you need a
txwin per coprocessor?  or txwin per processor?  either per-coproc or
per-cpu should be created at driver init and held separately
(globally) instead of a per-transform txwin.  I really don't see why
you would need a txwin per transform, because the coproc should not
care how many different transforms there are.


> +               if (!IS_ERR(wmem->txwin))
> +                       return 0;
> +
> +               ret = PTR_ERR(wmem->txwin);
> +       }
> +
> +       return ret;
> +}
> +
> +void nx842_powernv_crypto_exit_vas(struct crypto_tfm *tfm)
> +{
> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
> +       struct nx842_workmem *wmem;
> +
> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
> +
> +       if (wmem && wmem->txwin)
> +               vas_win_close(wmem->txwin);
> +
> +       nx842_crypto_exit(tfm);
> +}
> +
>  static int nx842_powernv_crypto_init(struct crypto_tfm *tfm)
>  {
>         return nx842_crypto_init(tfm, &nx842_powernv_driver);
> @@ -676,13 +1025,27 @@ static __init int nx842_powernv_init(void)
>         BUILD_BUG_ON(DDE_BUFFER_ALIGN % DDE_BUFFER_SIZE_MULT);
>         BUILD_BUG_ON(DDE_BUFFER_SIZE_MULT % DDE_BUFFER_LAST_MULT);
>
> -       for_each_compatible_node(dn, NULL, "ibm,power-nx")
> -               nx842_powernv_probe(dn);
> +       for_each_compatible_node(dn, NULL, "ibm,power9-nx") {
> +               ret = nx842_powernv_probe_vas(dn);
> +               if (ret) {
> +                       nx842_delete_coprocs();
> +                       return ret;
> +               }
> +       }
>
> -       if (!nx842_ct)
> -               return -ENODEV;
> +       if (list_empty(&nx842_coprocs)) {
> +               for_each_compatible_node(dn, NULL, "ibm,power-nx")
> +                       nx842_powernv_probe(dn);
> +
> +               if (!nx842_ct)
> +                       return -ENODEV;
>
> -       nx842_powernv_exec = nx842_exec_icswx;
> +               nx842_powernv_exec = nx842_exec_icswx;
> +       } else {
> +               nx842_powernv_exec = nx842_exec_vas;
> +               nx842_powernv_alg.cra_init = nx842_powernv_crypto_init_vas;
> +               nx842_powernv_alg.cra_exit = nx842_powernv_crypto_exit_vas;
> +       }
>
>         ret = crypto_register_alg(&nx842_powernv_alg);
>         if (ret) {
> diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
> index d94e25df503b..da3cb8c35ec7 100644
> --- a/drivers/crypto/nx/nx-842.c
> +++ b/drivers/crypto/nx/nx-842.c
> @@ -116,7 +116,7 @@ int nx842_crypto_init(struct crypto_tfm *tfm, struct nx842_driver *driver)
>
>         spin_lock_init(&ctx->lock);
>         ctx->driver = driver;
> -       ctx->wmem = kmalloc(driver->workmem_size, GFP_KERNEL);
> +       ctx->wmem = kzalloc(driver->workmem_size, GFP_KERNEL);
>         ctx->sbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>         ctx->dbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
>         if (!ctx->wmem || !ctx->sbounce || !ctx->dbounce) {
> --
> 2.11.0
>
>
>
Benjamin Herrenschmidt Aug. 29, 2017, 9:23 p.m. UTC | #5
On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
> > +
> > +       ret = -EINVAL;
> > +       if (coproc && coproc->vas.rxwin) {
> > +               wmem->txwin = nx842_alloc_txwin(coproc);
> 
> this is wrong.  the workmem is scratch memory that's valid only for
> the duration of a single operation.
> 
> do you actually need a txwin per crypto transform?  or do you need a
> txwin per coprocessor?  or txwin per processor?  either per-coproc or
> per-cpu should be created at driver init and held separately
> (globally) instead of a per-transform txwin.  I really don't see why
> you would need a txwin per transform, because the coproc should not
> care how many different transforms there are.

We should only need a single window for the whole kernel really, plus
one per user process who wants direct access but that's not relevant
here.

Cheers,
Ben.
Haren Myneni Aug. 29, 2017, 9:54 p.m. UTC | #6
On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>> +
>>> +       ret = -EINVAL;
>>> +       if (coproc && coproc->vas.rxwin) {
>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>
>> this is wrong.  the workmem is scratch memory that's valid only for
>> the duration of a single operation.

Correct, workmem is used until crypto_free is called. 
>>
>> do you actually need a txwin per crypto transform?  or do you need a
>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>> per-cpu should be created at driver init and held separately
>> (globally) instead of a per-transform txwin.  I really don't see why
>> you would need a txwin per transform, because the coproc should not
>> care how many different transforms there are.
> 
> We should only need a single window for the whole kernel really, plus
> one per user process who wants direct access but that's not relevant
> here.

Opening send window for each crypto transform (crypto_alloc, compression/decompression, ..., crypto_free) so that does not have to wait for the previous copy/paste complete. VAS will map send and receive windows, and can cache in send windows (up to 128). So I thought using the same send window (per chip) for more requests (say 1000) may be adding overhead.

I will make changes if you prefer using 1 send window per chip.  

> 
> Cheers,
> Ben.
>
Benjamin Herrenschmidt Aug. 29, 2017, 9:57 p.m. UTC | #7
On Tue, 2017-08-29 at 14:54 -0700, Haren Myneni wrote:
> Opening send window for each crypto transform (crypto_alloc,
> compression/decompression, ..., crypto_free) so that does not have to
> wait for the previous copy/paste complete. VAS will map send and
> receive windows, and can cache in send windows (up to 128). So I
> thought using the same send window (per chip) for more requests (say
> 1000) may be adding overhead.
> 
> I will make changes if you prefer using 1 send window per chip.  

Did you check the cost of opening/closing a window ?

Cheers,
Ben.
Haren Myneni Aug. 30, 2017, 1:02 a.m. UTC | #8
On 08/29/2017 02:57 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2017-08-29 at 14:54 -0700, Haren Myneni wrote:
>> Opening send window for each crypto transform (crypto_alloc,
>> compression/decompression, ..., crypto_free) so that does not have to
>> wait for the previous copy/paste complete. VAS will map send and
>> receive windows, and can cache in send windows (up to 128). So I
>> thought using the same send window (per chip) for more requests (say
>> 1000) may be adding overhead.
>>
>> I will make changes if you prefer using 1 send window per chip.  
> 
> Did you check the cost of opening/closing a window ?

No, Not yet. opening / closing happens only during alloc/free, but not for each compression/decompression. Hence used separate send windows. 

Thanks
Haren  
 
> 
> Cheers,
> Ben.
>
Haren Myneni Aug. 31, 2017, 7:44 a.m. UTC | #9
Thanks MIchael and Dan for your review comments.


On 08/29/2017 06:32 AM, Dan Streetman wrote:
> On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Hi Haren,
>>
>> Some comments inline ...
>>
>> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>
>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>
>>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>>  #define CSB_WAIT_MAX (5000) /* ms */
>>> +#define VAS_RETRIES  (10)
>>
>> Where does that number come from?

Sometimes HW returns copy/paste failures. So we should retry the request again. With 10 retries, Test running 12 hours was successful for repeated compression/decompression requests with 1024 threads. 

>>
>> Do we have any idea what the trade off is between retrying vs just
>> falling back to doing the request in software?

Not checked the overhead with falling back to SW compression. 

>>
>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>> +#define MAX_CREDITS_PER_RXFIFO       (1024)
>>>
>>>  struct nx842_workmem {
>>>       /* Below fields must be properly aligned */
>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>
>>>       ktime_t start;
>>>
>>> +     struct vas_window *txwin;       /* Used with VAS function */
>>
>> I don't understand how it makes sense to put txwin and start between the
>> fields above, and the padding.
> 
> workmem is a scratch buffer and shouldn't be used for something
> persistent like this.
> 
>>
>> If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
>> advance it and mean you end up writing over start and txwin.

We always access workmem with PTR_ALIGN even when assigning txwin (nx842_powernv_crypto_init/exit_vas).
So we should not overwrite start and txwin,

We can add txwin in nx842_crypto_ctx instead of workmem. But nx842_crypto_ctx is used for both powernv and pseries. Hence used workmem. But if nx842_crypto_ctx is preferred, I will send new patch soon. 

>>
>> That's probably not your bug, the code is already like that.
> 
> no, it's a bug in this patch, because workmem is scratch whose
> contents are only valid for the duration of each operation (compress
> or decompress).
> 
>>
>>>       char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>  } __packed __aligned(WORKMEM_ALIGN);
>>

Thanks
Haren
Dan Streetman Aug. 31, 2017, 1:31 p.m. UTC | #10
On Tue, Aug 29, 2017 at 5:54 PM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
> On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
>> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>>> +
>>>> +       ret = -EINVAL;
>>>> +       if (coproc && coproc->vas.rxwin) {
>>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>>
>>> this is wrong.  the workmem is scratch memory that's valid only for
>>> the duration of a single operation.
>
> Correct, workmem is used until crypto_free is called.

that's not a 'single operation'.  a single operation is compress() or
decompress().

>>>
>>> do you actually need a txwin per crypto transform?  or do you need a
>>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>>> per-cpu should be created at driver init and held separately
>>> (globally) instead of a per-transform txwin.  I really don't see why
>>> you would need a txwin per transform, because the coproc should not
>>> care how many different transforms there are.
>>
>> We should only need a single window for the whole kernel really, plus
>> one per user process who wants direct access but that's not relevant
>> here.
>
> Opening send window for each crypto transform (crypto_alloc,
> compression/decompression, ..., crypto_free) so that does not
> have to wait for the previous copy/paste complete.
> VAS will map send and receive windows, and can cache in send
> windows (up to 128). So I thought using the same send window
> (per chip) for more requests (say 1000) may be adding overhead.
>
> I will make changes if you prefer using 1 send window per chip.

i don't have the spec, so i shouldn't be making the decision on it,
but i do know putting a persistent field into the workmem is the wrong
location.  If it's valid for the life of the transform, put it into
the transform context.  The workmem buffer is intended to be used only
during a single operation - it's "working memory" to perform each
individual crypto transformation.

>
>>
>> Cheers,
>> Ben.
>>
>
Dan Streetman Aug. 31, 2017, 1:40 p.m. UTC | #11
On Thu, Aug 31, 2017 at 3:44 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
> Thanks MIchael and Dan for your review comments.
>
>
> On 08/29/2017 06:32 AM, Dan Streetman wrote:
>> On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>> Hi Haren,
>>>
>>> Some comments inline ...
>>>
>>> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>>
>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>
>>>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>>>  #define CSB_WAIT_MAX (5000) /* ms */
>>>> +#define VAS_RETRIES  (10)
>>>
>>> Where does that number come from?
>
> Sometimes HW returns copy/paste failures.

why?  what is causing the failure?

> So we should retry the request again. With 10 retries, Test running
> 12 hours was successful for repeated compression/decompression
> requests with 1024 threads.
>
>>>
>>> Do we have any idea what the trade off is between retrying vs just
>>> falling back to doing the request in software?
>
> Not checked the overhead with falling back to SW compression.

SW is very, very, very slow, due to 842 being an unaligned compression format.

>
>>>
>>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>>> +#define MAX_CREDITS_PER_RXFIFO       (1024)
>>>>
>>>>  struct nx842_workmem {
>>>>       /* Below fields must be properly aligned */
>>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>>
>>>>       ktime_t start;
>>>>
>>>> +     struct vas_window *txwin;       /* Used with VAS function */
>>>
>>> I don't understand how it makes sense to put txwin and start between the
>>> fields above, and the padding.
>>
>> workmem is a scratch buffer and shouldn't be used for something
>> persistent like this.
>>
>>>
>>> If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
>>> advance it and mean you end up writing over start and txwin.
>
> We always access workmem with PTR_ALIGN even when assigning txwin (nx842_powernv_crypto_init/exit_vas).
> So we should not overwrite start and txwin,
>
> We can add txwin in nx842_crypto_ctx instead of workmem. But nx842_crypto_ctx is used for both powernv and pseries. Hence used workmem. But if nx842_crypto_ctx is preferred, I will send new patch soon.
>
>>>
>>> That's probably not your bug, the code is already like that.
>>
>> no, it's a bug in this patch, because workmem is scratch whose
>> contents are only valid for the duration of each operation (compress
>> or decompress).
>>
>>>
>>>>       char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>>  } __packed __aligned(WORKMEM_ALIGN);
>>>
>
> Thanks
> Haren
>
Haren Myneni Aug. 31, 2017, 7:03 p.m. UTC | #12
On 08/31/2017 06:40 AM, Dan Streetman wrote:
> On Thu, Aug 31, 2017 at 3:44 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>> Thanks MIchael and Dan for your review comments.
>>
>>
>> On 08/29/2017 06:32 AM, Dan Streetman wrote:
>>> On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>> Hi Haren,
>>>>
>>>> Some comments inline ...
>>>>
>>>> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>>>
>>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>>
>>>>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>>>>  #define CSB_WAIT_MAX (5000) /* ms */
>>>>> +#define VAS_RETRIES  (10)
>>>>
>>>> Where does that number come from?
>>
>> Sometimes HW returns copy/paste failures.
> 
> why?  what is causing the failure?

Vas can return RMA_busy for several reasons - receive / send windows does not have credits or cached and etc.
 
> 
>> So we should retry the request again. With 10 retries, Test running
>> 12 hours was successful for repeated compression/decompression
>> requests with 1024 threads.
>>
>>>>
>>>> Do we have any idea what the trade off is between retrying vs just
>>>> falling back to doing the request in software?
>>
>> Not checked the overhead with falling back to SW compression.
> 
> SW is very, very, very slow, due to 842 being an unaligned compression format.
> 
>>
>>>>
>>>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>>>> +#define MAX_CREDITS_PER_RXFIFO       (1024)
>>>>>
>>>>>  struct nx842_workmem {
>>>>>       /* Below fields must be properly aligned */
>>>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>>>
>>>>>       ktime_t start;
>>>>>
>>>>> +     struct vas_window *txwin;       /* Used with VAS function */
>>>>
>>>> I don't understand how it makes sense to put txwin and start between the
>>>> fields above, and the padding.
>>>
>>> workmem is a scratch buffer and shouldn't be used for something
>>> persistent like this.
>>>
>>>>
>>>> If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
>>>> advance it and mean you end up writing over start and txwin.
>>
>> We always access workmem with PTR_ALIGN even when assigning txwin (nx842_powernv_crypto_init/exit_vas).
>> So we should not overwrite start and txwin,
>>
>> We can add txwin in nx842_crypto_ctx instead of workmem. But nx842_crypto_ctx is used for both powernv and pseries. Hence used workmem. But if nx842_crypto_ctx is preferred, I will send new patch soon.
>>
>>>>
>>>> That's probably not your bug, the code is already like that.
>>>
>>> no, it's a bug in this patch, because workmem is scratch whose
>>> contents are only valid for the duration of each operation (compress
>>> or decompress).
>>>
>>>>
>>>>>       char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>>>  } __packed __aligned(WORKMEM_ALIGN);
>>>>
>>
>> Thanks
>> Haren
>>
>
Haren Myneni Aug. 31, 2017, 7:09 p.m. UTC | #13
On 08/31/2017 06:31 AM, Dan Streetman wrote:
> On Tue, Aug 29, 2017 at 5:54 PM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>> On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
>>> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>>>> +
>>>>> +       ret = -EINVAL;
>>>>> +       if (coproc && coproc->vas.rxwin) {
>>>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>>>
>>>> this is wrong.  the workmem is scratch memory that's valid only for
>>>> the duration of a single operation.
>>
>> Correct, workmem is used until crypto_free is called.
> 
> that's not a 'single operation'.  a single operation is compress() or
> decompress().

workmem is allocated in nx842_crypto_init (called from crypto_alloc) and freed in crypto_free(). We can have single compression / decompression() operation or multiple within this crypto session. In case of single operation, we will end up workmem, ctx->sbounce/dbounce alloc/free for each request.  

> 
>>>>
>>>> do you actually need a txwin per crypto transform?  or do you need a
>>>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>>>> per-cpu should be created at driver init and held separately
>>>> (globally) instead of a per-transform txwin.  I really don't see why
>>>> you would need a txwin per transform, because the coproc should not
>>>> care how many different transforms there are.
>>>
>>> We should only need a single window for the whole kernel really, plus
>>> one per user process who wants direct access but that's not relevant
>>> here.
>>
>> Opening send window for each crypto transform (crypto_alloc,
>> compression/decompression, ..., crypto_free) so that does not
>> have to wait for the previous copy/paste complete.
>> VAS will map send and receive windows, and can cache in send
>> windows (up to 128). So I thought using the same send window
>> (per chip) for more requests (say 1000) may be adding overhead.
>>
>> I will make changes if you prefer using 1 send window per chip.
> 
> i don't have the spec, so i shouldn't be making the decision on it,
> but i do know putting a persistent field into the workmem is the wrong
> location.  If it's valid for the life of the transform, put it into
> the transform context.  The workmem buffer is intended to be used only
> during a single operation - it's "working memory" to perform each
> individual crypto transformation.
> 
>>
>>>
>>> Cheers,
>>> Ben.
>>>
>>
>
Michael Ellerman Sept. 1, 2017, 11:29 a.m. UTC | #14
Hi Dan,

Thanks for reviewing this series.

Dan Streetman <ddstreet@ieee.org> writes:
> On Tue, Aug 29, 2017 at 5:54 PM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>> On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
>>> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>>>> +
>>>>> +       ret = -EINVAL;
>>>>> +       if (coproc && coproc->vas.rxwin) {
>>>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>>>
>>>> this is wrong.  the workmem is scratch memory that's valid only for
>>>> the duration of a single operation.
>>
>> Correct, workmem is used until crypto_free is called.
>
> that's not a 'single operation'.  a single operation is compress() or
> decompress().
>
>>>>
>>>> do you actually need a txwin per crypto transform?  or do you need a
>>>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>>>> per-cpu should be created at driver init and held separately
>>>> (globally) instead of a per-transform txwin.  I really don't see why
>>>> you would need a txwin per transform, because the coproc should not
>>>> care how many different transforms there are.
>>>
>>> We should only need a single window for the whole kernel really, plus
>>> one per user process who wants direct access but that's not relevant
>>> here.
>>
>> Opening send window for each crypto transform (crypto_alloc,
>> compression/decompression, ..., crypto_free) so that does not
>> have to wait for the previous copy/paste complete.
>> VAS will map send and receive windows, and can cache in send
>> windows (up to 128). So I thought using the same send window
>> (per chip) for more requests (say 1000) may be adding overhead.
>>
>> I will make changes if you prefer using 1 send window per chip.
>
> i don't have the spec, so i shouldn't be making the decision on it,
> but i do know putting a persistent field into the workmem is the wrong
> location.  If it's valid for the life of the transform, put it into
> the transform context.  The workmem buffer is intended to be used only
> during a single operation - it's "working memory" to perform each
> individual crypto transformation.

I agree workmem isn't the right place for the txwin. But I don't believe
it actually breaks anything to put txwin there.

So for now I'm going to merge this series as-is and I've asked Haren to
send fixes as soon as he can to clean it up.

cheers
Michael Ellerman Sept. 1, 2017, 11:34 a.m. UTC | #15
Haren Myneni <haren@linux.vnet.ibm.com> writes:
>> On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>> Hi Haren,
>>>
>>> Some comments inline ...
>>>
>>> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>>
>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>
>>>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>>>  #define CSB_WAIT_MAX (5000) /* ms */
>>>> +#define VAS_RETRIES  (10)
>>>
>>> Where does that number come from?
>
> Sometimes HW returns copy/paste failures. So we should retry the
> request again. With 10 retries, Test running 12 hours was successful
> for repeated compression/decompression requests with 1024 threads.

But why 10. Why not 5, or 100, or 1, or 10,000?

Presumably when we have to retry it means the NX is too busy to service the
request?

Do we have any way to find out how long it might be busy for?

Should we try an NX on another chip?

We should also take into account the size of our request, ie. are we
asking the NX to compress one page, or 1GB ?

If it's just one page maybe we should fall back to software immediately.

cheers
Haren Myneni Sept. 2, 2017, 3:27 a.m. UTC | #16
On 09/01/2017 04:29 AM, Michael Ellerman wrote:
> Hi Dan,
> 
> Thanks for reviewing this series.
> 
> Dan Streetman <ddstreet@ieee.org> writes:
>> On Tue, Aug 29, 2017 at 5:54 PM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>> On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
>>>> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>>>>> +
>>>>>> +       ret = -EINVAL;
>>>>>> +       if (coproc && coproc->vas.rxwin) {
>>>>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>>>>
>>>>> this is wrong.  the workmem is scratch memory that's valid only for
>>>>> the duration of a single operation.
>>>
>>> Correct, workmem is used until crypto_free is called.
>>
>> that's not a 'single operation'.  a single operation is compress() or
>> decompress().
>>
>>>>>
>>>>> do you actually need a txwin per crypto transform?  or do you need a
>>>>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>>>>> per-cpu should be created at driver init and held separately
>>>>> (globally) instead of a per-transform txwin.  I really don't see why
>>>>> you would need a txwin per transform, because the coproc should not
>>>>> care how many different transforms there are.
>>>>
>>>> We should only need a single window for the whole kernel really, plus
>>>> one per user process who wants direct access but that's not relevant
>>>> here.
>>>
>>> Opening send window for each crypto transform (crypto_alloc,
>>> compression/decompression, ..., crypto_free) so that does not
>>> have to wait for the previous copy/paste complete.
>>> VAS will map send and receive windows, and can cache in send
>>> windows (up to 128). So I thought using the same send window
>>> (per chip) for more requests (say 1000) may be adding overhead.
>>>
>>> I will make changes if you prefer using 1 send window per chip.
>>
>> i don't have the spec, so i shouldn't be making the decision on it,
>> but i do know putting a persistent field into the workmem is the wrong
>> location.  If it's valid for the life of the transform, put it into
>> the transform context.  The workmem buffer is intended to be used only
>> during a single operation - it's "working memory" to perform each
>> individual crypto transformation.
> 
> I agree workmem isn't the right place for the txwin. But I don't believe
> it actually breaks anything to put txwin there.
> 
> So for now I'm going to merge this series as-is and I've asked Haren to
> send fixes as soon as he can to clean it up.

I will move txwin to nx842_crypto_ctx and send patch soon.  Even though nx842_crypto_ctx is also used for pseries, we can ignore txwin. 

Thanks
Haren


> 
> cheers
>
Haren Myneni Sept. 2, 2017, 4:11 a.m. UTC | #17
On 09/01/2017 04:34 AM, Michael Ellerman wrote:
> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>> On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>> Hi Haren,
>>>>
>>>> Some comments inline ...
>>>>
>>>> Haren Myneni <haren@linux.vnet.ibm.com> writes:
>>>>
>>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>>
>>>>>  #define WORKMEM_ALIGN        (CRB_ALIGN)
>>>>>  #define CSB_WAIT_MAX (5000) /* ms */
>>>>> +#define VAS_RETRIES  (10)
>>>>
>>>> Where does that number come from?
>>
>> Sometimes HW returns copy/paste failures. So we should retry the
>> request again. With 10 retries, Test running 12 hours was successful
>> for repeated compression/decompression requests with 1024 threads.
> 
> But why 10. Why not 5, or 100, or 1, or 10,000?

VAS spec says small number of retries. During my 12 hour test with 1024 threads - doing continuous compression/decompression requests, noticed around 6 or 7 retries needed. Hence used 10 retries.  

> 
> Presumably when we have to retry it means the NX is too busy to service the
> request?

One possible case. We can also see failures when receive/send credit are exhausted or reached the cached windows limit.

> 
> Do we have any way to find out how long it might be busy for?
Hard to know the reason of failure. VAS simply returns success or failure without providing the actual reason.

> 
> Should we try an NX on another chip?
Hard to decide whether to fall back on other NX engine since no way to know the actual failure reason on the current NX or no idea whether other NX is free. 

> 
> We should also take into account the size of our request, ie. are we
> asking the NX to compress one page, or 1GB ?

842 compression/decompression request size for NX is always fixed. So divide in to smaller requests for large buffer. Whereas NX gzip engine is different - can be configurable request size. We can look at this optimization when gzip support is added.   
   
> 
> If it's just one page maybe we should fall back to software immediately.

Right now falls back to SW decompression after 10 retries. Whereas user can use SW 842 compression upon failures.

We are planning to look in to performance analysis as part of VAS/NX optimizatiion and make necessary changes.

Thanks
Haren
> 
> cheers
>
Haren Myneni Sept. 2, 2017, 8:40 a.m. UTC | #18
On 08/29/2017 06:58 AM, Dan Streetman wrote:
> On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>
>> This patch adds P9 NX support for 842 compression engine. Virtual
>> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>>
>> For each NX engine per chip, setup receive window using
>> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
>> pid and tid values. This unique (lpid, pid, tid) combination will
>> be used to identify the target engine.
>>
>> For crypto open request, open send window on the NX engine for
>> the corresponding chip / cpu where the open request is executed.
>> This send window will be closed upon crypto close request.
>>
>> NX provides high and normal priority FIFOs. For compression /
>> decompression requests, we use only hight priority FIFOs in kernel.
>>
>> Each NX request will be communicated to VAS using copy/paste
>> instructions with vas_copy_crb() / vas_paste_crb() functions.
>>
>> Signed-off-by: Haren Myneni <haren@us.ibm.com>
>> ---
>>  drivers/crypto/nx/Kconfig          |   1 +
>>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>>  drivers/crypto/nx/nx-842.c         |   2 +-
>>  3 files changed, 371 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
>> index ad7552a6998c..cd5dda9c48f4 100644
>> --- a/drivers/crypto/nx/Kconfig
>> +++ b/drivers/crypto/nx/Kconfig
>> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
>>         tristate "Compression acceleration support on PowerNV platform"
>>         depends on PPC_POWERNV
>> +       depends on PPC_VAS
>>         default y
>>         help
>>           Support for PowerPC Nest (NX) compression acceleration. This
>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>> index c0dd4c7e17d3..13089a0b9dfa 100644
>> --- a/drivers/crypto/nx/nx-842-powernv.c
>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>> @@ -23,6 +23,7 @@
>>  #include <asm/prom.h>
>>  #include <asm/icswx.h>
>>  #include <asm/vas.h>
>> +#include <asm/reg.h>
>>
>>  MODULE_LICENSE("GPL");
>>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>
>>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>>  #define CSB_WAIT_MAX   (5000) /* ms */
>> +#define VAS_RETRIES    (10)
>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>> +#define MAX_CREDITS_PER_RXFIFO (1024)
>>
>>  struct nx842_workmem {
>>         /* Below fields must be properly aligned */
>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>
>>         ktime_t start;
>>
>> +       struct vas_window *txwin;       /* Used with VAS function */
>>         char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>  } __packed __aligned(WORKMEM_ALIGN);
>>
>>  struct nx842_coproc {
>>         unsigned int chip_id;
>>         unsigned int ct;
>> -       unsigned int ci;
>> +       unsigned int ci;        /* Coprocessor instance, used with icswx */
>> +       struct {
>> +               struct vas_window *rxwin;
>> +               int id;
>> +       } vas;
>>         struct list_head list;
>>  };
>>
>> +/*
>> + * Send the request to NX engine on the chip for the corresponding CPU
>> + * where the process is executing. Use with VAS function.
>> + */
>> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
>> +
>>  /* no cpu hotplug on powernv, so this list never changes after init */
>>  static LIST_HEAD(nx842_coprocs);
>>  static unsigned int nx842_ct;  /* used in icswx function */
>> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
>>  }
>>
>>  /**
>> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
>> + *
>> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
>> + * This compresses or decompresses the provided input buffer into the provided
>> + * output buffer.
>> + *
>> + * Upon return from this function @outlen contains the length of the
>> + * output data.  If there is an error then @outlen will be 0 and an
>> + * error will be specified by the return code from this function.
>> + *
>> + * The @workmem buffer should only be used by one function call at a time.
>> + *
>> + * @in: input buffer pointer
>> + * @inlen: input buffer size
>> + * @out: output buffer pointer
>> + * @outlenp: output buffer size pointer
>> + * @workmem: working memory buffer pointer, size determined by
>> + *           nx842_powernv_driver.workmem_size
>> + * @fc: function code, see CCW Function Codes in nx-842.h
>> + *
>> + * Returns:
>> + *   0         Success, output of length @outlenp stored in the buffer
>> + *             at @out
>> + *   -ENODEV   Hardware unavailable
>> + *   -ENOSPC   Output buffer is to small
>> + *   -EMSGSIZE Input buffer too large
>> + *   -EINVAL   buffer constraints do not fix nx842_constraints
>> + *   -EPROTO   hardware error during operation
>> + *   -ETIMEDOUT        hardware did not complete operation in reasonable time
>> + *   -EINTR    operation was aborted
>> + */
>> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
>> +                                 unsigned char *out, unsigned int *outlenp,
>> +                                 void *workmem, int fc)
>> +{
>> +       struct coprocessor_request_block *crb;
>> +       struct coprocessor_status_block *csb;
>> +       struct nx842_workmem *wmem;
>> +       struct vas_window *txwin;
>> +       int ret, i = 0;
>> +       u32 ccw;
>> +       unsigned int outlen = *outlenp;
>> +
>> +       wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
>> +
>> +       *outlenp = 0;
>> +
>> +       crb = &wmem->crb;
>> +       csb = &crb->csb;
>> +
>> +       ret = nx842_config_crb(in, inlen, out, outlen, wmem);
>> +       if (ret)
>> +               return ret;
>> +
>> +       ccw = 0;
>> +       ccw = SET_FIELD(CCW_FC_842, ccw, fc);
>> +       crb->ccw = cpu_to_be32(ccw);
>> +
>> +       txwin = wmem->txwin;
>> +       /* shoudn't happen, we don't load without a coproc */
>> +       if (!txwin) {
>> +               pr_err_ratelimited("NX-842 coprocessor is not available");
>> +               return -ENODEV;
>> +       }
>> +
>> +       do {
>> +               wmem->start = ktime_get();
>> +               preempt_disable();
>> +               /*
>> +                * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
>> +                * @crb, @offset and @first (must be true)
>> +                */
>> +               vas_copy_crb(crb, 0, 1);
>> +
>> +               /*
>> +                * VAS paste previously copied CRB to NX.
>> +                * @txwin, @offset, @last (must be true) and @re is
>> +                * expected/assumed to be true for NX windows.
>> +                */
>> +               ret = vas_paste_crb(txwin, 0, 1, 1);
>> +               preempt_enable();
>> +               /*
>> +                * Retry copy/paste function for VAS failures.
>> +                */
>> +       } while (ret && (i++ < VAS_RETRIES));
>> +
>> +       if (ret) {
>> +               pr_err_ratelimited("VAS copy/paste failed\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = wait_for_csb(wmem, csb);
>> +       if (!ret)
>> +               *outlenp = be32_to_cpu(csb->count);
>> +
>> +       return ret;
>> +}
>> +
>> +/**
>>   * nx842_powernv_compress - Compress data using the 842 algorithm
>>   *
>>   * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>>         list_add(&coproc->list, &nx842_coprocs);
>>  }
>>
>> +/*
>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>> + * corresponding NX engine in percpu coproc_inst.
>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>> + * for the corresponding CPU / chip where the open request is executed.
>> + */
>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>> +{
>> +       unsigned int i, chip_id;
>> +
>> +       for_each_possible_cpu(i) {
>> +               chip_id = cpu_to_chip_id(i);
>> +
>> +               if (coproc->chip_id == chip_id)
>> +                       per_cpu(coproc_inst, i) = coproc;
>> +       }
>> +}
>> +
>> +
>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>> +{
>> +       struct vas_window *txwin = NULL;
>> +       struct vas_tx_win_attr txattr;
>> +
>> +       /*
>> +        * Kernel requests will be high priority. So open send
>> +        * windows only for high priority RxFIFO entries.
>> +        */
>> +       vas_init_tx_win_attr(&txattr, coproc->ct);
>> +       txattr.lpid = 0;        /* lpid is 0 for kernel requests */
>> +       txattr.pid = mfspr(SPRN_PID);
>> +
>> +       /*
>> +        * Open a VAS send window which is used to send request to NX.
>> +        */
>> +       txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>> +       if (IS_ERR(txwin)) {
>> +               pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>> +                               PTR_ERR(txwin));
>> +               return NULL;
>> +       }
>> +
>> +       return txwin;
>> +}
>> +
>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>> +                                       int vasid)
>> +{
>> +       struct vas_window *rxwin = NULL;
>> +       struct vas_rx_win_attr rxattr;
>> +       struct nx842_coproc *coproc;
>> +       u32 lpid, pid, tid, fifo_size;
>> +       u64 rx_fifo;
>> +       const char *priority;
>> +       int ret;
>> +
>> +       ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>> +       if (ret) {
>> +               pr_err("Missing rx-fifo-address property\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>> +       if (ret) {
>> +               pr_err("Missing rx-fifo-size property\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = of_property_read_u32(dn, "lpid", &lpid);
>> +       if (ret) {
>> +               pr_err("Missing lpid property\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = of_property_read_u32(dn, "pid", &pid);
>> +       if (ret) {
>> +               pr_err("Missing pid property\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = of_property_read_u32(dn, "tid", &tid);
>> +       if (ret) {
>> +               pr_err("Missing tid property\n");
>> +               return ret;
>> +       }
>> +
>> +       ret = of_property_read_string(dn, "priority", &priority);
>> +       if (ret) {
>> +               pr_err("Missing priority property\n");
>> +               return ret;
>> +       }
>> +
>> +       coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
>> +       if (!coproc)
>> +               return -ENOMEM;
>> +
>> +       if (!strcmp(priority, "High"))
>> +               coproc->ct = VAS_COP_TYPE_842_HIPRI;
>> +       else if (!strcmp(priority, "Normal"))
>> +               coproc->ct = VAS_COP_TYPE_842;
>> +       else {
>> +               pr_err("Invalid RxFIFO priority value\n");
>> +               ret =  -EINVAL;
>> +               goto err_out;
>> +       }
>> +
>> +       vas_init_rx_win_attr(&rxattr, coproc->ct);
>> +       rxattr.rx_fifo = (void *)rx_fifo;
>> +       rxattr.rx_fifo_size = fifo_size;
>> +       rxattr.lnotify_lpid = lpid;
>> +       rxattr.lnotify_pid = pid;
>> +       rxattr.lnotify_tid = tid;
>> +       rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
>> +
>> +       /*
>> +        * Open a VAS receice window which is used to configure RxFIFO
>> +        * for NX.
>> +        */
>> +       rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
>> +       if (IS_ERR(rxwin)) {
>> +               ret = PTR_ERR(rxwin);
>> +               pr_err("setting RxFIFO with VAS failed: %d\n",
>> +                       ret);
>> +               goto err_out;
>> +       }
>> +
>> +       coproc->vas.rxwin = rxwin;
>> +       coproc->vas.id = vasid;
>> +       nx842_add_coprocs_list(coproc, chip_id);
>> +
>> +       /*
>> +        * Kernel requests use only high priority FIFOs. So save coproc
>> +        * info in percpu coproc_inst which will be used to open send
>> +        * windows for crypto open requests later.
>> +        */
>> +       if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
>> +               nx842_set_per_cpu_coproc(coproc);
>> +
>> +       return 0;
>> +
>> +err_out:
>> +       kfree(coproc);
>> +       return ret;
>> +}
>> +
>> +
>> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
>> +{
>> +       struct device_node *dn;
>> +       int chip_id, vasid, ret = 0;
>> +       int nx_fifo_found = 0;
>> +
>> +       chip_id = of_get_ibm_chip_id(pn);
>> +       if (chip_id < 0) {
>> +               pr_err("ibm,chip-id missing\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
>> +
>> +       if (!dn) {
>> +               pr_err("Missing VAS device node\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
>> +               pr_err("Missing ibm,vas-id device property\n");
>> +               of_node_put(dn);
>> +               return -EINVAL;
>> +       }
>> +
>> +       of_node_put(dn);
>> +
>> +       for_each_child_of_node(pn, dn) {
>> +               if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
>> +                       ret = vas_cfg_coproc_info(dn, chip_id, vasid);
>> +                       if (ret) {
>> +                               of_node_put(dn);
>> +                               return ret;
>> +                       }
>> +                       nx_fifo_found++;
>> +               }
>> +       }
>> +
>> +       if (!nx_fifo_found) {
>> +               pr_err("NX842 FIFO nodes are missing\n");
>> +               ret = -EINVAL;
>> +       }
>> +
>> +       return ret;
>> +}
>> +
>>  static int __init nx842_powernv_probe(struct device_node *dn)
>>  {
>>         struct nx842_coproc *coproc;
>> @@ -622,6 +928,9 @@ static void nx842_delete_coprocs(void)
>>         struct nx842_coproc *coproc, *n;
>>
>>         list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
>> +               if (coproc->vas.rxwin)
>> +                       vas_win_close(coproc->vas.rxwin);
>> +
>>                 list_del(&coproc->list);
>>                 kfree(coproc);
>>         }
>> @@ -643,6 +952,46 @@ static struct nx842_driver nx842_powernv_driver = {
>>         .decompress =   nx842_powernv_decompress,
>>  };
>>
>> +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
>> +{
>> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
>> +       struct nx842_workmem *wmem;
>> +       struct nx842_coproc *coproc;
>> +       int ret;
>> +
>> +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
>> +
>> +       if (ret)
>> +               return ret;
>> +
>> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
>> +       coproc = per_cpu(coproc_inst, smp_processor_id());
> 
> this is wrong.  the crypto transform init function is not guaranteed
> to be called by the same processor that later uses it.  Just because
> that happens to be how zswap operates doesn't guarantee other crypto
> users will do the same.

Dan, Sorry missed this comment. 

Right, The actual crypto request can be executed on other processor than the CPU when the init is executed. The main goal is open send window on the NX engine which is on the same chip for the corresponding CPU. So we are OK if the request is scheduled on other CPU as long as it belongs to same chip. Otherwise in the worst case we will end up using remote NX.  

Thanks
Haren
Michael Neuling Sept. 2, 2017, 1:42 p.m. UTC | #19
> > > +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
> > > +{
> > > +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
> > > +       struct nx842_workmem *wmem;
> > > +       struct nx842_coproc *coproc;
> > > +       int ret;
> > > +
> > > +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
> > > +
> > > +       if (ret)
> > > +               return ret;
> > > +
> > > +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem,
> > > WORKMEM_ALIGN);
> > > +       coproc = per_cpu(coproc_inst, smp_processor_id());
> > 
> > this is wrong.  the crypto transform init function is not guaranteed
> > to be called by the same processor that later uses it.  Just because
> > that happens to be how zswap operates doesn't guarantee other crypto
> > users will do the same.
> 
> Dan, Sorry missed this comment. 
> 
> Right, The actual crypto request can be executed on other processor than the
> CPU when the init is executed. The main goal is open send window on the NX
> engine which is on the same chip for the corresponding CPU. So we are OK if
> the request is scheduled on other CPU as long as it belongs to same chip.
> Otherwise in the worst case we will end up using remote NX.  

You want the NX to be close to the requester CPU, but probably more importantly
you want the NX close to the memory it's going to be operating on. 

Preferably they would all be on the same node.

Mikey
Dan Streetman Sept. 2, 2017, 4:14 p.m. UTC | #20
On Fri, Sep 1, 2017 at 7:29 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> Hi Dan,
>
> Thanks for reviewing this series.
>
> Dan Streetman <ddstreet@ieee.org> writes:
>> On Tue, Aug 29, 2017 at 5:54 PM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>> On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
>>>> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>>>>> +
>>>>>> +       ret = -EINVAL;
>>>>>> +       if (coproc && coproc->vas.rxwin) {
>>>>>> +               wmem->txwin = nx842_alloc_txwin(coproc);
>>>>>
>>>>> this is wrong.  the workmem is scratch memory that's valid only for
>>>>> the duration of a single operation.
>>>
>>> Correct, workmem is used until crypto_free is called.
>>
>> that's not a 'single operation'.  a single operation is compress() or
>> decompress().
>>
>>>>>
>>>>> do you actually need a txwin per crypto transform?  or do you need a
>>>>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>>>>> per-cpu should be created at driver init and held separately
>>>>> (globally) instead of a per-transform txwin.  I really don't see why
>>>>> you would need a txwin per transform, because the coproc should not
>>>>> care how many different transforms there are.
>>>>
>>>> We should only need a single window for the whole kernel really, plus
>>>> one per user process who wants direct access but that's not relevant
>>>> here.
>>>
>>> Opening send window for each crypto transform (crypto_alloc,
>>> compression/decompression, ..., crypto_free) so that does not
>>> have to wait for the previous copy/paste complete.
>>> VAS will map send and receive windows, and can cache in send
>>> windows (up to 128). So I thought using the same send window
>>> (per chip) for more requests (say 1000) may be adding overhead.
>>>
>>> I will make changes if you prefer using 1 send window per chip.
>>
>> i don't have the spec, so i shouldn't be making the decision on it,
>> but i do know putting a persistent field into the workmem is the wrong
>> location.  If it's valid for the life of the transform, put it into
>> the transform context.  The workmem buffer is intended to be used only
>> during a single operation - it's "working memory" to perform each
>> individual crypto transformation.
>
> I agree workmem isn't the right place for the txwin. But I don't believe
> it actually breaks anything to put txwin there.

it doesn't currently no, but workmem should be able to be memset(0) at
the start of each compress/decompress operation without breaking
anything.

Otherwise, the workmem fields should just go directly into the
nx842_crypto_ctx, which contains other persistent fields.

My concern isn't about breaking anything right now, it's about
misusing the design causing obscure breakage later.

>
> So for now I'm going to merge this series as-is and I've asked Haren to
> send fixes as soon as he can to clean it up.

sure, as i said i've been out of the 842 area for years now so i was
going to just stay out of it...I just happened to notice things i
thought i should comment on :-)

>
> cheers
Dan Streetman Sept. 2, 2017, 4:17 p.m. UTC | #21
On Sat, Sep 2, 2017 at 4:40 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
> On 08/29/2017 06:58 AM, Dan Streetman wrote:
>> On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>>
>>> This patch adds P9 NX support for 842 compression engine. Virtual
>>> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>>>
>>> For each NX engine per chip, setup receive window using
>>> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
>>> pid and tid values. This unique (lpid, pid, tid) combination will
>>> be used to identify the target engine.
>>>
>>> For crypto open request, open send window on the NX engine for
>>> the corresponding chip / cpu where the open request is executed.
>>> This send window will be closed upon crypto close request.
>>>
>>> NX provides high and normal priority FIFOs. For compression /
>>> decompression requests, we use only hight priority FIFOs in kernel.
>>>
>>> Each NX request will be communicated to VAS using copy/paste
>>> instructions with vas_copy_crb() / vas_paste_crb() functions.
>>>
>>> Signed-off-by: Haren Myneni <haren@us.ibm.com>
>>> ---
>>>  drivers/crypto/nx/Kconfig          |   1 +
>>>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>>>  drivers/crypto/nx/nx-842.c         |   2 +-
>>>  3 files changed, 371 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
>>> index ad7552a6998c..cd5dda9c48f4 100644
>>> --- a/drivers/crypto/nx/Kconfig
>>> +++ b/drivers/crypto/nx/Kconfig
>>> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>>>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
>>>         tristate "Compression acceleration support on PowerNV platform"
>>>         depends on PPC_POWERNV
>>> +       depends on PPC_VAS
>>>         default y
>>>         help
>>>           Support for PowerPC Nest (NX) compression acceleration. This
>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>> @@ -23,6 +23,7 @@
>>>  #include <asm/prom.h>
>>>  #include <asm/icswx.h>
>>>  #include <asm/vas.h>
>>> +#include <asm/reg.h>
>>>
>>>  MODULE_LICENSE("GPL");
>>>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>
>>>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>>>  #define CSB_WAIT_MAX   (5000) /* ms */
>>> +#define VAS_RETRIES    (10)
>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>> +#define MAX_CREDITS_PER_RXFIFO (1024)
>>>
>>>  struct nx842_workmem {
>>>         /* Below fields must be properly aligned */
>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>
>>>         ktime_t start;
>>>
>>> +       struct vas_window *txwin;       /* Used with VAS function */
>>>         char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>  } __packed __aligned(WORKMEM_ALIGN);
>>>
>>>  struct nx842_coproc {
>>>         unsigned int chip_id;
>>>         unsigned int ct;
>>> -       unsigned int ci;
>>> +       unsigned int ci;        /* Coprocessor instance, used with icswx */
>>> +       struct {
>>> +               struct vas_window *rxwin;
>>> +               int id;
>>> +       } vas;
>>>         struct list_head list;
>>>  };
>>>
>>> +/*
>>> + * Send the request to NX engine on the chip for the corresponding CPU
>>> + * where the process is executing. Use with VAS function.
>>> + */
>>> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
>>> +
>>>  /* no cpu hotplug on powernv, so this list never changes after init */
>>>  static LIST_HEAD(nx842_coprocs);
>>>  static unsigned int nx842_ct;  /* used in icswx function */
>>> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
>>>  }
>>>
>>>  /**
>>> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
>>> + *
>>> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>> + * This compresses or decompresses the provided input buffer into the provided
>>> + * output buffer.
>>> + *
>>> + * Upon return from this function @outlen contains the length of the
>>> + * output data.  If there is an error then @outlen will be 0 and an
>>> + * error will be specified by the return code from this function.
>>> + *
>>> + * The @workmem buffer should only be used by one function call at a time.
>>> + *
>>> + * @in: input buffer pointer
>>> + * @inlen: input buffer size
>>> + * @out: output buffer pointer
>>> + * @outlenp: output buffer size pointer
>>> + * @workmem: working memory buffer pointer, size determined by
>>> + *           nx842_powernv_driver.workmem_size
>>> + * @fc: function code, see CCW Function Codes in nx-842.h
>>> + *
>>> + * Returns:
>>> + *   0         Success, output of length @outlenp stored in the buffer
>>> + *             at @out
>>> + *   -ENODEV   Hardware unavailable
>>> + *   -ENOSPC   Output buffer is to small
>>> + *   -EMSGSIZE Input buffer too large
>>> + *   -EINVAL   buffer constraints do not fix nx842_constraints
>>> + *   -EPROTO   hardware error during operation
>>> + *   -ETIMEDOUT        hardware did not complete operation in reasonable time
>>> + *   -EINTR    operation was aborted
>>> + */
>>> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
>>> +                                 unsigned char *out, unsigned int *outlenp,
>>> +                                 void *workmem, int fc)
>>> +{
>>> +       struct coprocessor_request_block *crb;
>>> +       struct coprocessor_status_block *csb;
>>> +       struct nx842_workmem *wmem;
>>> +       struct vas_window *txwin;
>>> +       int ret, i = 0;
>>> +       u32 ccw;
>>> +       unsigned int outlen = *outlenp;
>>> +
>>> +       wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
>>> +
>>> +       *outlenp = 0;
>>> +
>>> +       crb = &wmem->crb;
>>> +       csb = &crb->csb;
>>> +
>>> +       ret = nx842_config_crb(in, inlen, out, outlen, wmem);
>>> +       if (ret)
>>> +               return ret;
>>> +
>>> +       ccw = 0;
>>> +       ccw = SET_FIELD(CCW_FC_842, ccw, fc);
>>> +       crb->ccw = cpu_to_be32(ccw);
>>> +
>>> +       txwin = wmem->txwin;
>>> +       /* shoudn't happen, we don't load without a coproc */
>>> +       if (!txwin) {
>>> +               pr_err_ratelimited("NX-842 coprocessor is not available");
>>> +               return -ENODEV;
>>> +       }
>>> +
>>> +       do {
>>> +               wmem->start = ktime_get();
>>> +               preempt_disable();
>>> +               /*
>>> +                * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
>>> +                * @crb, @offset and @first (must be true)
>>> +                */
>>> +               vas_copy_crb(crb, 0, 1);
>>> +
>>> +               /*
>>> +                * VAS paste previously copied CRB to NX.
>>> +                * @txwin, @offset, @last (must be true) and @re is
>>> +                * expected/assumed to be true for NX windows.
>>> +                */
>>> +               ret = vas_paste_crb(txwin, 0, 1, 1);
>>> +               preempt_enable();
>>> +               /*
>>> +                * Retry copy/paste function for VAS failures.
>>> +                */
>>> +       } while (ret && (i++ < VAS_RETRIES));
>>> +
>>> +       if (ret) {
>>> +               pr_err_ratelimited("VAS copy/paste failed\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = wait_for_csb(wmem, csb);
>>> +       if (!ret)
>>> +               *outlenp = be32_to_cpu(csb->count);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>> +/**
>>>   * nx842_powernv_compress - Compress data using the 842 algorithm
>>>   *
>>>   * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>>>         list_add(&coproc->list, &nx842_coprocs);
>>>  }
>>>
>>> +/*
>>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>>> + * corresponding NX engine in percpu coproc_inst.
>>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>>> + * for the corresponding CPU / chip where the open request is executed.
>>> + */
>>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>>> +{
>>> +       unsigned int i, chip_id;
>>> +
>>> +       for_each_possible_cpu(i) {
>>> +               chip_id = cpu_to_chip_id(i);
>>> +
>>> +               if (coproc->chip_id == chip_id)
>>> +                       per_cpu(coproc_inst, i) = coproc;
>>> +       }
>>> +}
>>> +
>>> +
>>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>>> +{
>>> +       struct vas_window *txwin = NULL;
>>> +       struct vas_tx_win_attr txattr;
>>> +
>>> +       /*
>>> +        * Kernel requests will be high priority. So open send
>>> +        * windows only for high priority RxFIFO entries.
>>> +        */
>>> +       vas_init_tx_win_attr(&txattr, coproc->ct);
>>> +       txattr.lpid = 0;        /* lpid is 0 for kernel requests */
>>> +       txattr.pid = mfspr(SPRN_PID);
>>> +
>>> +       /*
>>> +        * Open a VAS send window which is used to send request to NX.
>>> +        */
>>> +       txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>>> +       if (IS_ERR(txwin)) {
>>> +               pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>>> +                               PTR_ERR(txwin));
>>> +               return NULL;
>>> +       }
>>> +
>>> +       return txwin;
>>> +}
>>> +
>>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>>> +                                       int vasid)
>>> +{
>>> +       struct vas_window *rxwin = NULL;
>>> +       struct vas_rx_win_attr rxattr;
>>> +       struct nx842_coproc *coproc;
>>> +       u32 lpid, pid, tid, fifo_size;
>>> +       u64 rx_fifo;
>>> +       const char *priority;
>>> +       int ret;
>>> +
>>> +       ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>>> +       if (ret) {
>>> +               pr_err("Missing rx-fifo-address property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>>> +       if (ret) {
>>> +               pr_err("Missing rx-fifo-size property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = of_property_read_u32(dn, "lpid", &lpid);
>>> +       if (ret) {
>>> +               pr_err("Missing lpid property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = of_property_read_u32(dn, "pid", &pid);
>>> +       if (ret) {
>>> +               pr_err("Missing pid property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = of_property_read_u32(dn, "tid", &tid);
>>> +       if (ret) {
>>> +               pr_err("Missing tid property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       ret = of_property_read_string(dn, "priority", &priority);
>>> +       if (ret) {
>>> +               pr_err("Missing priority property\n");
>>> +               return ret;
>>> +       }
>>> +
>>> +       coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
>>> +       if (!coproc)
>>> +               return -ENOMEM;
>>> +
>>> +       if (!strcmp(priority, "High"))
>>> +               coproc->ct = VAS_COP_TYPE_842_HIPRI;
>>> +       else if (!strcmp(priority, "Normal"))
>>> +               coproc->ct = VAS_COP_TYPE_842;
>>> +       else {
>>> +               pr_err("Invalid RxFIFO priority value\n");
>>> +               ret =  -EINVAL;
>>> +               goto err_out;
>>> +       }
>>> +
>>> +       vas_init_rx_win_attr(&rxattr, coproc->ct);
>>> +       rxattr.rx_fifo = (void *)rx_fifo;
>>> +       rxattr.rx_fifo_size = fifo_size;
>>> +       rxattr.lnotify_lpid = lpid;
>>> +       rxattr.lnotify_pid = pid;
>>> +       rxattr.lnotify_tid = tid;
>>> +       rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
>>> +
>>> +       /*
>>> +        * Open a VAS receice window which is used to configure RxFIFO
>>> +        * for NX.
>>> +        */
>>> +       rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
>>> +       if (IS_ERR(rxwin)) {
>>> +               ret = PTR_ERR(rxwin);
>>> +               pr_err("setting RxFIFO with VAS failed: %d\n",
>>> +                       ret);
>>> +               goto err_out;
>>> +       }
>>> +
>>> +       coproc->vas.rxwin = rxwin;
>>> +       coproc->vas.id = vasid;
>>> +       nx842_add_coprocs_list(coproc, chip_id);
>>> +
>>> +       /*
>>> +        * Kernel requests use only high priority FIFOs. So save coproc
>>> +        * info in percpu coproc_inst which will be used to open send
>>> +        * windows for crypto open requests later.
>>> +        */
>>> +       if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
>>> +               nx842_set_per_cpu_coproc(coproc);
>>> +
>>> +       return 0;
>>> +
>>> +err_out:
>>> +       kfree(coproc);
>>> +       return ret;
>>> +}
>>> +
>>> +
>>> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
>>> +{
>>> +       struct device_node *dn;
>>> +       int chip_id, vasid, ret = 0;
>>> +       int nx_fifo_found = 0;
>>> +
>>> +       chip_id = of_get_ibm_chip_id(pn);
>>> +       if (chip_id < 0) {
>>> +               pr_err("ibm,chip-id missing\n");
>>> +               return -EINVAL;
>>> +       }
>>> +
>>> +       dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
>>> +
>>> +       if (!dn) {
>>> +               pr_err("Missing VAS device node\n");
>>> +               return -EINVAL;
>>> +       }
>>> +
>>> +       if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
>>> +               pr_err("Missing ibm,vas-id device property\n");
>>> +               of_node_put(dn);
>>> +               return -EINVAL;
>>> +       }
>>> +
>>> +       of_node_put(dn);
>>> +
>>> +       for_each_child_of_node(pn, dn) {
>>> +               if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
>>> +                       ret = vas_cfg_coproc_info(dn, chip_id, vasid);
>>> +                       if (ret) {
>>> +                               of_node_put(dn);
>>> +                               return ret;
>>> +                       }
>>> +                       nx_fifo_found++;
>>> +               }
>>> +       }
>>> +
>>> +       if (!nx_fifo_found) {
>>> +               pr_err("NX842 FIFO nodes are missing\n");
>>> +               ret = -EINVAL;
>>> +       }
>>> +
>>> +       return ret;
>>> +}
>>> +
>>>  static int __init nx842_powernv_probe(struct device_node *dn)
>>>  {
>>>         struct nx842_coproc *coproc;
>>> @@ -622,6 +928,9 @@ static void nx842_delete_coprocs(void)
>>>         struct nx842_coproc *coproc, *n;
>>>
>>>         list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
>>> +               if (coproc->vas.rxwin)
>>> +                       vas_win_close(coproc->vas.rxwin);
>>> +
>>>                 list_del(&coproc->list);
>>>                 kfree(coproc);
>>>         }
>>> @@ -643,6 +952,46 @@ static struct nx842_driver nx842_powernv_driver = {
>>>         .decompress =   nx842_powernv_decompress,
>>>  };
>>>
>>> +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
>>> +{
>>> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
>>> +       struct nx842_workmem *wmem;
>>> +       struct nx842_coproc *coproc;
>>> +       int ret;
>>> +
>>> +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
>>> +
>>> +       if (ret)
>>> +               return ret;
>>> +
>>> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
>>> +       coproc = per_cpu(coproc_inst, smp_processor_id());
>>
>> this is wrong.  the crypto transform init function is not guaranteed
>> to be called by the same processor that later uses it.  Just because
>> that happens to be how zswap operates doesn't guarantee other crypto
>> users will do the same.
>
> Dan, Sorry missed this comment.
>
> Right, The actual crypto request can be executed on other processor than the CPU when the init is executed. The main goal is open send window on the NX engine which is on the same chip for the corresponding CPU. So we are OK if the request is scheduled on other CPU as long as it belongs to same chip. Otherwise in the worst case we will end up using remote NX.

ok, but there's no guarantee of future crypto calls being on the same
chip either, so i still don't understand why you're doing this.  if
you want each crypto comp/decomp call to be cpu-local or node-local,
then choose which corpco/txwin to use at comp/decomp time, not
transform init time.


>
> Thanks
> Haren
>
>
>
>
Haren Myneni Sept. 3, 2017, 8:32 a.m. UTC | #22
On 09/02/2017 09:17 AM, Dan Streetman wrote:
> On Sat, Sep 2, 2017 at 4:40 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>> On 08/29/2017 06:58 AM, Dan Streetman wrote:
>>> On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>>>
>>>> This patch adds P9 NX support for 842 compression engine. Virtual
>>>> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>>>>
>>>> For each NX engine per chip, setup receive window using
>>>> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
>>>> pid and tid values. This unique (lpid, pid, tid) combination will
>>>> be used to identify the target engine.
>>>>
>>>> For crypto open request, open send window on the NX engine for
>>>> the corresponding chip / cpu where the open request is executed.
>>>> This send window will be closed upon crypto close request.
>>>>
>>>> NX provides high and normal priority FIFOs. For compression /
>>>> decompression requests, we use only hight priority FIFOs in kernel.
>>>>
>>>> Each NX request will be communicated to VAS using copy/paste
>>>> instructions with vas_copy_crb() / vas_paste_crb() functions.
>>>>
>>>> Signed-off-by: Haren Myneni <haren@us.ibm.com>
>>>> ---
>>>>  drivers/crypto/nx/Kconfig          |   1 +
>>>>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>>>>  drivers/crypto/nx/nx-842.c         |   2 +-
>>>>  3 files changed, 371 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
>>>> index ad7552a6998c..cd5dda9c48f4 100644
>>>> --- a/drivers/crypto/nx/Kconfig
>>>> +++ b/drivers/crypto/nx/Kconfig
>>>> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>>>>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
>>>>         tristate "Compression acceleration support on PowerNV platform"
>>>>         depends on PPC_POWERNV
>>>> +       depends on PPC_VAS
>>>>         default y
>>>>         help
>>>>           Support for PowerPC Nest (NX) compression acceleration. This
>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>> @@ -23,6 +23,7 @@
>>>>  #include <asm/prom.h>
>>>>  #include <asm/icswx.h>
>>>>  #include <asm/vas.h>
>>>> +#include <asm/reg.h>
>>>>
>>>>  MODULE_LICENSE("GPL");
>>>>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>
>>>>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>>>>  #define CSB_WAIT_MAX   (5000) /* ms */
>>>> +#define VAS_RETRIES    (10)
>>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>>> +#define MAX_CREDITS_PER_RXFIFO (1024)
>>>>
>>>>  struct nx842_workmem {
>>>>         /* Below fields must be properly aligned */
>>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>>
>>>>         ktime_t start;
>>>>
>>>> +       struct vas_window *txwin;       /* Used with VAS function */
>>>>         char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>>  } __packed __aligned(WORKMEM_ALIGN);
>>>>
>>>>  struct nx842_coproc {
>>>>         unsigned int chip_id;
>>>>         unsigned int ct;
>>>> -       unsigned int ci;
>>>> +       unsigned int ci;        /* Coprocessor instance, used with icswx */
>>>> +       struct {
>>>> +               struct vas_window *rxwin;
>>>> +               int id;
>>>> +       } vas;
>>>>         struct list_head list;
>>>>  };
>>>>
>>>> +/*
>>>> + * Send the request to NX engine on the chip for the corresponding CPU
>>>> + * where the process is executing. Use with VAS function.
>>>> + */
>>>> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
>>>> +
>>>>  /* no cpu hotplug on powernv, so this list never changes after init */
>>>>  static LIST_HEAD(nx842_coprocs);
>>>>  static unsigned int nx842_ct;  /* used in icswx function */
>>>> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
>>>>  }
>>>>
>>>>  /**
>>>> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
>>>> + *
>>>> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>>> + * This compresses or decompresses the provided input buffer into the provided
>>>> + * output buffer.
>>>> + *
>>>> + * Upon return from this function @outlen contains the length of the
>>>> + * output data.  If there is an error then @outlen will be 0 and an
>>>> + * error will be specified by the return code from this function.
>>>> + *
>>>> + * The @workmem buffer should only be used by one function call at a time.
>>>> + *
>>>> + * @in: input buffer pointer
>>>> + * @inlen: input buffer size
>>>> + * @out: output buffer pointer
>>>> + * @outlenp: output buffer size pointer
>>>> + * @workmem: working memory buffer pointer, size determined by
>>>> + *           nx842_powernv_driver.workmem_size
>>>> + * @fc: function code, see CCW Function Codes in nx-842.h
>>>> + *
>>>> + * Returns:
>>>> + *   0         Success, output of length @outlenp stored in the buffer
>>>> + *             at @out
>>>> + *   -ENODEV   Hardware unavailable
>>>> + *   -ENOSPC   Output buffer is to small
>>>> + *   -EMSGSIZE Input buffer too large
>>>> + *   -EINVAL   buffer constraints do not fix nx842_constraints
>>>> + *   -EPROTO   hardware error during operation
>>>> + *   -ETIMEDOUT        hardware did not complete operation in reasonable time
>>>> + *   -EINTR    operation was aborted
>>>> + */
>>>> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
>>>> +                                 unsigned char *out, unsigned int *outlenp,
>>>> +                                 void *workmem, int fc)
>>>> +{
>>>> +       struct coprocessor_request_block *crb;
>>>> +       struct coprocessor_status_block *csb;
>>>> +       struct nx842_workmem *wmem;
>>>> +       struct vas_window *txwin;
>>>> +       int ret, i = 0;
>>>> +       u32 ccw;
>>>> +       unsigned int outlen = *outlenp;
>>>> +
>>>> +       wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
>>>> +
>>>> +       *outlenp = 0;
>>>> +
>>>> +       crb = &wmem->crb;
>>>> +       csb = &crb->csb;
>>>> +
>>>> +       ret = nx842_config_crb(in, inlen, out, outlen, wmem);
>>>> +       if (ret)
>>>> +               return ret;
>>>> +
>>>> +       ccw = 0;
>>>> +       ccw = SET_FIELD(CCW_FC_842, ccw, fc);
>>>> +       crb->ccw = cpu_to_be32(ccw);
>>>> +
>>>> +       txwin = wmem->txwin;
>>>> +       /* shoudn't happen, we don't load without a coproc */
>>>> +       if (!txwin) {
>>>> +               pr_err_ratelimited("NX-842 coprocessor is not available");
>>>> +               return -ENODEV;
>>>> +       }
>>>> +
>>>> +       do {
>>>> +               wmem->start = ktime_get();
>>>> +               preempt_disable();
>>>> +               /*
>>>> +                * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
>>>> +                * @crb, @offset and @first (must be true)
>>>> +                */
>>>> +               vas_copy_crb(crb, 0, 1);
>>>> +
>>>> +               /*
>>>> +                * VAS paste previously copied CRB to NX.
>>>> +                * @txwin, @offset, @last (must be true) and @re is
>>>> +                * expected/assumed to be true for NX windows.
>>>> +                */
>>>> +               ret = vas_paste_crb(txwin, 0, 1, 1);
>>>> +               preempt_enable();
>>>> +               /*
>>>> +                * Retry copy/paste function for VAS failures.
>>>> +                */
>>>> +       } while (ret && (i++ < VAS_RETRIES));
>>>> +
>>>> +       if (ret) {
>>>> +               pr_err_ratelimited("VAS copy/paste failed\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = wait_for_csb(wmem, csb);
>>>> +       if (!ret)
>>>> +               *outlenp = be32_to_cpu(csb->count);
>>>> +
>>>> +       return ret;
>>>> +}
>>>> +
>>>> +/**
>>>>   * nx842_powernv_compress - Compress data using the 842 algorithm
>>>>   *
>>>>   * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>>>>         list_add(&coproc->list, &nx842_coprocs);
>>>>  }
>>>>
>>>> +/*
>>>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>>>> + * corresponding NX engine in percpu coproc_inst.
>>>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>>>> + * for the corresponding CPU / chip where the open request is executed.
>>>> + */
>>>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>>>> +{
>>>> +       unsigned int i, chip_id;
>>>> +
>>>> +       for_each_possible_cpu(i) {
>>>> +               chip_id = cpu_to_chip_id(i);
>>>> +
>>>> +               if (coproc->chip_id == chip_id)
>>>> +                       per_cpu(coproc_inst, i) = coproc;
>>>> +       }
>>>> +}
>>>> +
>>>> +
>>>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>>>> +{
>>>> +       struct vas_window *txwin = NULL;
>>>> +       struct vas_tx_win_attr txattr;
>>>> +
>>>> +       /*
>>>> +        * Kernel requests will be high priority. So open send
>>>> +        * windows only for high priority RxFIFO entries.
>>>> +        */
>>>> +       vas_init_tx_win_attr(&txattr, coproc->ct);
>>>> +       txattr.lpid = 0;        /* lpid is 0 for kernel requests */
>>>> +       txattr.pid = mfspr(SPRN_PID);
>>>> +
>>>> +       /*
>>>> +        * Open a VAS send window which is used to send request to NX.
>>>> +        */
>>>> +       txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>>>> +       if (IS_ERR(txwin)) {
>>>> +               pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>>>> +                               PTR_ERR(txwin));
>>>> +               return NULL;
>>>> +       }
>>>> +
>>>> +       return txwin;
>>>> +}
>>>> +
>>>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>>>> +                                       int vasid)
>>>> +{
>>>> +       struct vas_window *rxwin = NULL;
>>>> +       struct vas_rx_win_attr rxattr;
>>>> +       struct nx842_coproc *coproc;
>>>> +       u32 lpid, pid, tid, fifo_size;
>>>> +       u64 rx_fifo;
>>>> +       const char *priority;
>>>> +       int ret;
>>>> +
>>>> +       ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>>>> +       if (ret) {
>>>> +               pr_err("Missing rx-fifo-address property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>>>> +       if (ret) {
>>>> +               pr_err("Missing rx-fifo-size property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = of_property_read_u32(dn, "lpid", &lpid);
>>>> +       if (ret) {
>>>> +               pr_err("Missing lpid property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = of_property_read_u32(dn, "pid", &pid);
>>>> +       if (ret) {
>>>> +               pr_err("Missing pid property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = of_property_read_u32(dn, "tid", &tid);
>>>> +       if (ret) {
>>>> +               pr_err("Missing tid property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       ret = of_property_read_string(dn, "priority", &priority);
>>>> +       if (ret) {
>>>> +               pr_err("Missing priority property\n");
>>>> +               return ret;
>>>> +       }
>>>> +
>>>> +       coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
>>>> +       if (!coproc)
>>>> +               return -ENOMEM;
>>>> +
>>>> +       if (!strcmp(priority, "High"))
>>>> +               coproc->ct = VAS_COP_TYPE_842_HIPRI;
>>>> +       else if (!strcmp(priority, "Normal"))
>>>> +               coproc->ct = VAS_COP_TYPE_842;
>>>> +       else {
>>>> +               pr_err("Invalid RxFIFO priority value\n");
>>>> +               ret =  -EINVAL;
>>>> +               goto err_out;
>>>> +       }
>>>> +
>>>> +       vas_init_rx_win_attr(&rxattr, coproc->ct);
>>>> +       rxattr.rx_fifo = (void *)rx_fifo;
>>>> +       rxattr.rx_fifo_size = fifo_size;
>>>> +       rxattr.lnotify_lpid = lpid;
>>>> +       rxattr.lnotify_pid = pid;
>>>> +       rxattr.lnotify_tid = tid;
>>>> +       rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
>>>> +
>>>> +       /*
>>>> +        * Open a VAS receice window which is used to configure RxFIFO
>>>> +        * for NX.
>>>> +        */
>>>> +       rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
>>>> +       if (IS_ERR(rxwin)) {
>>>> +               ret = PTR_ERR(rxwin);
>>>> +               pr_err("setting RxFIFO with VAS failed: %d\n",
>>>> +                       ret);
>>>> +               goto err_out;
>>>> +       }
>>>> +
>>>> +       coproc->vas.rxwin = rxwin;
>>>> +       coproc->vas.id = vasid;
>>>> +       nx842_add_coprocs_list(coproc, chip_id);
>>>> +
>>>> +       /*
>>>> +        * Kernel requests use only high priority FIFOs. So save coproc
>>>> +        * info in percpu coproc_inst which will be used to open send
>>>> +        * windows for crypto open requests later.
>>>> +        */
>>>> +       if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
>>>> +               nx842_set_per_cpu_coproc(coproc);
>>>> +
>>>> +       return 0;
>>>> +
>>>> +err_out:
>>>> +       kfree(coproc);
>>>> +       return ret;
>>>> +}
>>>> +
>>>> +
>>>> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
>>>> +{
>>>> +       struct device_node *dn;
>>>> +       int chip_id, vasid, ret = 0;
>>>> +       int nx_fifo_found = 0;
>>>> +
>>>> +       chip_id = of_get_ibm_chip_id(pn);
>>>> +       if (chip_id < 0) {
>>>> +               pr_err("ibm,chip-id missing\n");
>>>> +               return -EINVAL;
>>>> +       }
>>>> +
>>>> +       dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
>>>> +
>>>> +       if (!dn) {
>>>> +               pr_err("Missing VAS device node\n");
>>>> +               return -EINVAL;
>>>> +       }
>>>> +
>>>> +       if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
>>>> +               pr_err("Missing ibm,vas-id device property\n");
>>>> +               of_node_put(dn);
>>>> +               return -EINVAL;
>>>> +       }
>>>> +
>>>> +       of_node_put(dn);
>>>> +
>>>> +       for_each_child_of_node(pn, dn) {
>>>> +               if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
>>>> +                       ret = vas_cfg_coproc_info(dn, chip_id, vasid);
>>>> +                       if (ret) {
>>>> +                               of_node_put(dn);
>>>> +                               return ret;
>>>> +                       }
>>>> +                       nx_fifo_found++;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       if (!nx_fifo_found) {
>>>> +               pr_err("NX842 FIFO nodes are missing\n");
>>>> +               ret = -EINVAL;
>>>> +       }
>>>> +
>>>> +       return ret;
>>>> +}
>>>> +
>>>>  static int __init nx842_powernv_probe(struct device_node *dn)
>>>>  {
>>>>         struct nx842_coproc *coproc;
>>>> @@ -622,6 +928,9 @@ static void nx842_delete_coprocs(void)
>>>>         struct nx842_coproc *coproc, *n;
>>>>
>>>>         list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
>>>> +               if (coproc->vas.rxwin)
>>>> +                       vas_win_close(coproc->vas.rxwin);
>>>> +
>>>>                 list_del(&coproc->list);
>>>>                 kfree(coproc);
>>>>         }
>>>> @@ -643,6 +952,46 @@ static struct nx842_driver nx842_powernv_driver = {
>>>>         .decompress =   nx842_powernv_decompress,
>>>>  };
>>>>
>>>> +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
>>>> +{
>>>> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
>>>> +       struct nx842_workmem *wmem;
>>>> +       struct nx842_coproc *coproc;
>>>> +       int ret;
>>>> +
>>>> +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
>>>> +
>>>> +       if (ret)
>>>> +               return ret;
>>>> +
>>>> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
>>>> +       coproc = per_cpu(coproc_inst, smp_processor_id());
>>>
>>> this is wrong.  the crypto transform init function is not guaranteed
>>> to be called by the same processor that later uses it.  Just because
>>> that happens to be how zswap operates doesn't guarantee other crypto
>>> users will do the same.
>>
>> Dan, Sorry missed this comment.
>>
>> Right, The actual crypto request can be executed on other processor than the CPU when the init is executed. The main goal is open send window on the NX engine which is on the same chip for the corresponding CPU. So we are OK if the request is scheduled on other CPU as long as it belongs to same chip. Otherwise in the worst case we will end up using remote NX.
> 
> ok, but there's no guarantee of future crypto calls being on the same
> chip either, so i still don't understand why you're doing this.  if
> you want each crypto comp/decomp call to be cpu-local or node-local,
> then choose which corpco/txwin to use at comp/decomp time, not
> transform init time.

On P8, requests are always send to same NX engine (nx842_ct) even though if the system has multiple NX coprocessors. Whereas on P9, requests will be forwarded to different NX engines, preferable to the local NX for the corresponding chip. 

One way is open 1 send window for each NX during device probe and use the corresponding window for each request. But in this case we might have overhead with parallel requests (ex: 1024). 

We can open limited number of send windows on each NX (including for 842, gzip - kernel and user space requests). So we can not keep send windows open forever. The current code is opening TX window for each crypto session during crpto alloc and closing it during crypto free. We do not want window open / close for each request (copy/paste). With this approach, the compression/decompression request can be scheduled on different chip than the one used during window open. So in the worst case, requests will be processed on the remote NX engine.

One other way is reserve few windows (say 100) for each engine type and use them - open 100 windows during probe and manage these windows for different requests depends on the cpu / chip where these requests are executed. Our next plan is look at performance analysis as part of VAS/NX optimization once NX gzip support is added and make necessary changes.     

 
> 
> 
>>
>> Thanks
>> Haren
>>
>>
>>
>>
>
Dan Streetman Sept. 3, 2017, 2:12 p.m. UTC | #23
On Sun, Sep 3, 2017 at 4:32 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
> On 09/02/2017 09:17 AM, Dan Streetman wrote:
>> On Sat, Sep 2, 2017 at 4:40 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>> On 08/29/2017 06:58 AM, Dan Streetman wrote:
>>>> On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni <haren@linux.vnet.ibm.com> wrote:
>>>>>
>>>>> This patch adds P9 NX support for 842 compression engine. Virtual
>>>>> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>>>>>
>>>>> For each NX engine per chip, setup receive window using
>>>>> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
>>>>> pid and tid values. This unique (lpid, pid, tid) combination will
>>>>> be used to identify the target engine.
>>>>>
>>>>> For crypto open request, open send window on the NX engine for
>>>>> the corresponding chip / cpu where the open request is executed.
>>>>> This send window will be closed upon crypto close request.
>>>>>
>>>>> NX provides high and normal priority FIFOs. For compression /
>>>>> decompression requests, we use only hight priority FIFOs in kernel.
>>>>>
>>>>> Each NX request will be communicated to VAS using copy/paste
>>>>> instructions with vas_copy_crb() / vas_paste_crb() functions.
>>>>>
>>>>> Signed-off-by: Haren Myneni <haren@us.ibm.com>
>>>>> ---
>>>>>  drivers/crypto/nx/Kconfig          |   1 +
>>>>>  drivers/crypto/nx/nx-842-powernv.c | 375 ++++++++++++++++++++++++++++++++++++-
>>>>>  drivers/crypto/nx/nx-842.c         |   2 +-
>>>>>  3 files changed, 371 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
>>>>> index ad7552a6998c..cd5dda9c48f4 100644
>>>>> --- a/drivers/crypto/nx/Kconfig
>>>>> +++ b/drivers/crypto/nx/Kconfig
>>>>> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>>>>>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
>>>>>         tristate "Compression acceleration support on PowerNV platform"
>>>>>         depends on PPC_POWERNV
>>>>> +       depends on PPC_VAS
>>>>>         default y
>>>>>         help
>>>>>           Support for PowerPC Nest (NX) compression acceleration. This
>>>>> diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
>>>>> index c0dd4c7e17d3..13089a0b9dfa 100644
>>>>> --- a/drivers/crypto/nx/nx-842-powernv.c
>>>>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>>>>> @@ -23,6 +23,7 @@
>>>>>  #include <asm/prom.h>
>>>>>  #include <asm/icswx.h>
>>>>>  #include <asm/vas.h>
>>>>> +#include <asm/reg.h>
>>>>>
>>>>>  MODULE_LICENSE("GPL");
>>>>>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
>>>>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>>>>
>>>>>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>>>>>  #define CSB_WAIT_MAX   (5000) /* ms */
>>>>> +#define VAS_RETRIES    (10)
>>>>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>>>>> +#define MAX_CREDITS_PER_RXFIFO (1024)
>>>>>
>>>>>  struct nx842_workmem {
>>>>>         /* Below fields must be properly aligned */
>>>>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>>>>
>>>>>         ktime_t start;
>>>>>
>>>>> +       struct vas_window *txwin;       /* Used with VAS function */
>>>>>         char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>>>>  } __packed __aligned(WORKMEM_ALIGN);
>>>>>
>>>>>  struct nx842_coproc {
>>>>>         unsigned int chip_id;
>>>>>         unsigned int ct;
>>>>> -       unsigned int ci;
>>>>> +       unsigned int ci;        /* Coprocessor instance, used with icswx */
>>>>> +       struct {
>>>>> +               struct vas_window *rxwin;
>>>>> +               int id;
>>>>> +       } vas;
>>>>>         struct list_head list;
>>>>>  };
>>>>>
>>>>> +/*
>>>>> + * Send the request to NX engine on the chip for the corresponding CPU
>>>>> + * where the process is executing. Use with VAS function.
>>>>> + */
>>>>> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
>>>>> +
>>>>>  /* no cpu hotplug on powernv, so this list never changes after init */
>>>>>  static LIST_HEAD(nx842_coprocs);
>>>>>  static unsigned int nx842_ct;  /* used in icswx function */
>>>>> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
>>>>>  }
>>>>>
>>>>>  /**
>>>>> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
>>>>> + *
>>>>> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>>>> + * This compresses or decompresses the provided input buffer into the provided
>>>>> + * output buffer.
>>>>> + *
>>>>> + * Upon return from this function @outlen contains the length of the
>>>>> + * output data.  If there is an error then @outlen will be 0 and an
>>>>> + * error will be specified by the return code from this function.
>>>>> + *
>>>>> + * The @workmem buffer should only be used by one function call at a time.
>>>>> + *
>>>>> + * @in: input buffer pointer
>>>>> + * @inlen: input buffer size
>>>>> + * @out: output buffer pointer
>>>>> + * @outlenp: output buffer size pointer
>>>>> + * @workmem: working memory buffer pointer, size determined by
>>>>> + *           nx842_powernv_driver.workmem_size
>>>>> + * @fc: function code, see CCW Function Codes in nx-842.h
>>>>> + *
>>>>> + * Returns:
>>>>> + *   0         Success, output of length @outlenp stored in the buffer
>>>>> + *             at @out
>>>>> + *   -ENODEV   Hardware unavailable
>>>>> + *   -ENOSPC   Output buffer is to small
>>>>> + *   -EMSGSIZE Input buffer too large
>>>>> + *   -EINVAL   buffer constraints do not fix nx842_constraints
>>>>> + *   -EPROTO   hardware error during operation
>>>>> + *   -ETIMEDOUT        hardware did not complete operation in reasonable time
>>>>> + *   -EINTR    operation was aborted
>>>>> + */
>>>>> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
>>>>> +                                 unsigned char *out, unsigned int *outlenp,
>>>>> +                                 void *workmem, int fc)
>>>>> +{
>>>>> +       struct coprocessor_request_block *crb;
>>>>> +       struct coprocessor_status_block *csb;
>>>>> +       struct nx842_workmem *wmem;
>>>>> +       struct vas_window *txwin;
>>>>> +       int ret, i = 0;
>>>>> +       u32 ccw;
>>>>> +       unsigned int outlen = *outlenp;
>>>>> +
>>>>> +       wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
>>>>> +
>>>>> +       *outlenp = 0;
>>>>> +
>>>>> +       crb = &wmem->crb;
>>>>> +       csb = &crb->csb;
>>>>> +
>>>>> +       ret = nx842_config_crb(in, inlen, out, outlen, wmem);
>>>>> +       if (ret)
>>>>> +               return ret;
>>>>> +
>>>>> +       ccw = 0;
>>>>> +       ccw = SET_FIELD(CCW_FC_842, ccw, fc);
>>>>> +       crb->ccw = cpu_to_be32(ccw);
>>>>> +
>>>>> +       txwin = wmem->txwin;
>>>>> +       /* shoudn't happen, we don't load without a coproc */
>>>>> +       if (!txwin) {
>>>>> +               pr_err_ratelimited("NX-842 coprocessor is not available");
>>>>> +               return -ENODEV;
>>>>> +       }
>>>>> +
>>>>> +       do {
>>>>> +               wmem->start = ktime_get();
>>>>> +               preempt_disable();
>>>>> +               /*
>>>>> +                * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
>>>>> +                * @crb, @offset and @first (must be true)
>>>>> +                */
>>>>> +               vas_copy_crb(crb, 0, 1);
>>>>> +
>>>>> +               /*
>>>>> +                * VAS paste previously copied CRB to NX.
>>>>> +                * @txwin, @offset, @last (must be true) and @re is
>>>>> +                * expected/assumed to be true for NX windows.
>>>>> +                */
>>>>> +               ret = vas_paste_crb(txwin, 0, 1, 1);
>>>>> +               preempt_enable();
>>>>> +               /*
>>>>> +                * Retry copy/paste function for VAS failures.
>>>>> +                */
>>>>> +       } while (ret && (i++ < VAS_RETRIES));
>>>>> +
>>>>> +       if (ret) {
>>>>> +               pr_err_ratelimited("VAS copy/paste failed\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = wait_for_csb(wmem, csb);
>>>>> +       if (!ret)
>>>>> +               *outlenp = be32_to_cpu(csb->count);
>>>>> +
>>>>> +       return ret;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>>   * nx842_powernv_compress - Compress data using the 842 algorithm
>>>>>   *
>>>>>   * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
>>>>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
>>>>>         list_add(&coproc->list, &nx842_coprocs);
>>>>>  }
>>>>>
>>>>> +/*
>>>>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>>>>> + * corresponding NX engine in percpu coproc_inst.
>>>>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>>>>> + * for the corresponding CPU / chip where the open request is executed.
>>>>> + */
>>>>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>>>>> +{
>>>>> +       unsigned int i, chip_id;
>>>>> +
>>>>> +       for_each_possible_cpu(i) {
>>>>> +               chip_id = cpu_to_chip_id(i);
>>>>> +
>>>>> +               if (coproc->chip_id == chip_id)
>>>>> +                       per_cpu(coproc_inst, i) = coproc;
>>>>> +       }
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>>>>> +{
>>>>> +       struct vas_window *txwin = NULL;
>>>>> +       struct vas_tx_win_attr txattr;
>>>>> +
>>>>> +       /*
>>>>> +        * Kernel requests will be high priority. So open send
>>>>> +        * windows only for high priority RxFIFO entries.
>>>>> +        */
>>>>> +       vas_init_tx_win_attr(&txattr, coproc->ct);
>>>>> +       txattr.lpid = 0;        /* lpid is 0 for kernel requests */
>>>>> +       txattr.pid = mfspr(SPRN_PID);
>>>>> +
>>>>> +       /*
>>>>> +        * Open a VAS send window which is used to send request to NX.
>>>>> +        */
>>>>> +       txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>>>>> +       if (IS_ERR(txwin)) {
>>>>> +               pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>>>>> +                               PTR_ERR(txwin));
>>>>> +               return NULL;
>>>>> +       }
>>>>> +
>>>>> +       return txwin;
>>>>> +}
>>>>> +
>>>>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>>>>> +                                       int vasid)
>>>>> +{
>>>>> +       struct vas_window *rxwin = NULL;
>>>>> +       struct vas_rx_win_attr rxattr;
>>>>> +       struct nx842_coproc *coproc;
>>>>> +       u32 lpid, pid, tid, fifo_size;
>>>>> +       u64 rx_fifo;
>>>>> +       const char *priority;
>>>>> +       int ret;
>>>>> +
>>>>> +       ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing rx-fifo-address property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing rx-fifo-size property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = of_property_read_u32(dn, "lpid", &lpid);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing lpid property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = of_property_read_u32(dn, "pid", &pid);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing pid property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = of_property_read_u32(dn, "tid", &tid);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing tid property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       ret = of_property_read_string(dn, "priority", &priority);
>>>>> +       if (ret) {
>>>>> +               pr_err("Missing priority property\n");
>>>>> +               return ret;
>>>>> +       }
>>>>> +
>>>>> +       coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
>>>>> +       if (!coproc)
>>>>> +               return -ENOMEM;
>>>>> +
>>>>> +       if (!strcmp(priority, "High"))
>>>>> +               coproc->ct = VAS_COP_TYPE_842_HIPRI;
>>>>> +       else if (!strcmp(priority, "Normal"))
>>>>> +               coproc->ct = VAS_COP_TYPE_842;
>>>>> +       else {
>>>>> +               pr_err("Invalid RxFIFO priority value\n");
>>>>> +               ret =  -EINVAL;
>>>>> +               goto err_out;
>>>>> +       }
>>>>> +
>>>>> +       vas_init_rx_win_attr(&rxattr, coproc->ct);
>>>>> +       rxattr.rx_fifo = (void *)rx_fifo;
>>>>> +       rxattr.rx_fifo_size = fifo_size;
>>>>> +       rxattr.lnotify_lpid = lpid;
>>>>> +       rxattr.lnotify_pid = pid;
>>>>> +       rxattr.lnotify_tid = tid;
>>>>> +       rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
>>>>> +
>>>>> +       /*
>>>>> +        * Open a VAS receice window which is used to configure RxFIFO
>>>>> +        * for NX.
>>>>> +        */
>>>>> +       rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
>>>>> +       if (IS_ERR(rxwin)) {
>>>>> +               ret = PTR_ERR(rxwin);
>>>>> +               pr_err("setting RxFIFO with VAS failed: %d\n",
>>>>> +                       ret);
>>>>> +               goto err_out;
>>>>> +       }
>>>>> +
>>>>> +       coproc->vas.rxwin = rxwin;
>>>>> +       coproc->vas.id = vasid;
>>>>> +       nx842_add_coprocs_list(coproc, chip_id);
>>>>> +
>>>>> +       /*
>>>>> +        * Kernel requests use only high priority FIFOs. So save coproc
>>>>> +        * info in percpu coproc_inst which will be used to open send
>>>>> +        * windows for crypto open requests later.
>>>>> +        */
>>>>> +       if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
>>>>> +               nx842_set_per_cpu_coproc(coproc);
>>>>> +
>>>>> +       return 0;
>>>>> +
>>>>> +err_out:
>>>>> +       kfree(coproc);
>>>>> +       return ret;
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static int __init nx842_powernv_probe_vas(struct device_node *pn)
>>>>> +{
>>>>> +       struct device_node *dn;
>>>>> +       int chip_id, vasid, ret = 0;
>>>>> +       int nx_fifo_found = 0;
>>>>> +
>>>>> +       chip_id = of_get_ibm_chip_id(pn);
>>>>> +       if (chip_id < 0) {
>>>>> +               pr_err("ibm,chip-id missing\n");
>>>>> +               return -EINVAL;
>>>>> +       }
>>>>> +
>>>>> +       dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
>>>>> +
>>>>> +       if (!dn) {
>>>>> +               pr_err("Missing VAS device node\n");
>>>>> +               return -EINVAL;
>>>>> +       }
>>>>> +
>>>>> +       if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
>>>>> +               pr_err("Missing ibm,vas-id device property\n");
>>>>> +               of_node_put(dn);
>>>>> +               return -EINVAL;
>>>>> +       }
>>>>> +
>>>>> +       of_node_put(dn);
>>>>> +
>>>>> +       for_each_child_of_node(pn, dn) {
>>>>> +               if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
>>>>> +                       ret = vas_cfg_coproc_info(dn, chip_id, vasid);
>>>>> +                       if (ret) {
>>>>> +                               of_node_put(dn);
>>>>> +                               return ret;
>>>>> +                       }
>>>>> +                       nx_fifo_found++;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       if (!nx_fifo_found) {
>>>>> +               pr_err("NX842 FIFO nodes are missing\n");
>>>>> +               ret = -EINVAL;
>>>>> +       }
>>>>> +
>>>>> +       return ret;
>>>>> +}
>>>>> +
>>>>>  static int __init nx842_powernv_probe(struct device_node *dn)
>>>>>  {
>>>>>         struct nx842_coproc *coproc;
>>>>> @@ -622,6 +928,9 @@ static void nx842_delete_coprocs(void)
>>>>>         struct nx842_coproc *coproc, *n;
>>>>>
>>>>>         list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
>>>>> +               if (coproc->vas.rxwin)
>>>>> +                       vas_win_close(coproc->vas.rxwin);
>>>>> +
>>>>>                 list_del(&coproc->list);
>>>>>                 kfree(coproc);
>>>>>         }
>>>>> @@ -643,6 +952,46 @@ static struct nx842_driver nx842_powernv_driver = {
>>>>>         .decompress =   nx842_powernv_decompress,
>>>>>  };
>>>>>
>>>>> +static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
>>>>> +{
>>>>> +       struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
>>>>> +       struct nx842_workmem *wmem;
>>>>> +       struct nx842_coproc *coproc;
>>>>> +       int ret;
>>>>> +
>>>>> +       ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
>>>>> +
>>>>> +       if (ret)
>>>>> +               return ret;
>>>>> +
>>>>> +       wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
>>>>> +       coproc = per_cpu(coproc_inst, smp_processor_id());
>>>>
>>>> this is wrong.  the crypto transform init function is not guaranteed
>>>> to be called by the same processor that later uses it.  Just because
>>>> that happens to be how zswap operates doesn't guarantee other crypto
>>>> users will do the same.
>>>
>>> Dan, Sorry missed this comment.
>>>
>>> Right, The actual crypto request can be executed on other processor than the CPU when the init is executed. The main goal is open send window on the NX engine which is on the same chip for the corresponding CPU. So we are OK if the request is scheduled on other CPU as long as it belongs to same chip. Otherwise in the worst case we will end up using remote NX.
>>
>> ok, but there's no guarantee of future crypto calls being on the same
>> chip either, so i still don't understand why you're doing this.  if
>> you want each crypto comp/decomp call to be cpu-local or node-local,
>> then choose which corpco/txwin to use at comp/decomp time, not
>> transform init time.
>
> On P8, requests are always send to same NX engine (nx842_ct) even though if the system has multiple NX coprocessors.

Um, no - read RFC02167 icswx spec again.  CT is the coprocessor
*type*, and is the same value for all 842 coprocessor engines in the
system.  It's the CI which is the coprocessor *instance*, which is
different for each coprocessor instance (of the same type) in the
system.  When setting up the icswx call, using the CT must be the type
assigned to 842 coprocessors, and the CI can correspond to any
specific coprocessor instance, or it can be the reserved value 0 to
indicate the system should choose the "best" coprocessor to use.
That's what the driver does, lets the system choose the "best" 842
engine instance to use:

        ccw = SET_FIELD(ccw, CCW_CI_842, 0); /* use 0 for hw
auto-selection */

Whether or not the underlying P8 firmware actually does pick the
"best" instance, or even load balances the engines at all, I don't
know.  The nx842 driver certainly could be updated to try to choose
the "best" coproc engine to use itself, if the coproc firmware isn't
doing it.  From the last version of the spec that I have, however, it
sounds like it should be working:

"8.3.2 CI Default Selection

When a CI of zero is specified, implementation-depen-
dent default selection of a targeted coprocessor is per-
formed. The default selection is made in one of the
following ways:

 The default CI is determined in the processor dur-
ing execution of icswx. The applicable nonzero CI
is substituted for the originally-specified zero CI in
order to designate the default coprocessor.

The zero CI is observed which causes the default
coprocessor to identify the issuer by implementa-
tion-dependent means and accept the request.

The use of the default CI reduces pressure on the
CI space. Any coprocessor type that does not
require predictable selection can use default CI
selection. Implementation-dependent configuration
controls can be used to balance and rebalance
coprocessor availability for the set of logical parti-
tions."


> Whereas on P9, requests will be forwarded to different NX engines, preferable to the local NX for the corresponding chip.
>
> One way is open 1 send window for each NX during device probe and use the corresponding window for each request. But in this case we might have overhead with parallel requests (ex: 1024).
>
> We can open limited number of send windows on each NX (including for 842, gzip - kernel and user space requests). So we can not keep send windows open forever. The current code is opening TX window for each crypto session during crpto alloc and closing it during crypto free. We do not want window open / close for each request (copy/paste). With this approach, the compression/decompression request can be scheduled on different chip than the one used during window open. So in the worst case, requests will be processed on the remote NX engine.
>
> One other way is reserve few windows (say 100) for each engine type and use them - open 100 windows during probe and manage these windows for different requests depends on the cpu / chip where these requests are executed. Our next plan is look at performance analysis as part of VAS/NX optimization once NX gzip support is added and make necessary changes.

I'm not sure if you are using 'engine type' when you mean 'engine
instance', but yes i would think it makes much more sense to open N
txwins per engine instance and keep them in a queue/list, then for
each comp/decomp operation choose the 'best' (considering cpu-local
and coproc available txwin queue len) coproc to use and take the top
txwin from that coproc's queue.

But again, since the P9 spec isn't public and the last I saw was a P8
spec over 2 years ago, I probably don't need to comment on this much
more ;-)

>
>
>>
>>
>>>
>>> Thanks
>>> Haren
>>>
>>>
>>>
>>>
>>
>
diff mbox

Patch

diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
index ad7552a6998c..cd5dda9c48f4 100644
--- a/drivers/crypto/nx/Kconfig
+++ b/drivers/crypto/nx/Kconfig
@@ -38,6 +38,7 @@  config CRYPTO_DEV_NX_COMPRESS_PSERIES
 config CRYPTO_DEV_NX_COMPRESS_POWERNV
 	tristate "Compression acceleration support on PowerNV platform"
 	depends on PPC_POWERNV
+	depends on PPC_VAS
 	default y
 	help
 	  Support for PowerPC Nest (NX) compression acceleration. This
diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
index c0dd4c7e17d3..13089a0b9dfa 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -23,6 +23,7 @@ 
 #include <asm/prom.h>
 #include <asm/icswx.h>
 #include <asm/vas.h>
+#include <asm/reg.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
@@ -32,6 +33,9 @@  MODULE_ALIAS_CRYPTO("842-nx");
 
 #define WORKMEM_ALIGN	(CRB_ALIGN)
 #define CSB_WAIT_MAX	(5000) /* ms */
+#define VAS_RETRIES	(10)
+/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
+#define MAX_CREDITS_PER_RXFIFO	(1024)
 
 struct nx842_workmem {
 	/* Below fields must be properly aligned */
@@ -42,16 +46,27 @@  struct nx842_workmem {
 
 	ktime_t start;
 
+	struct vas_window *txwin;	/* Used with VAS function */
 	char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
 } __packed __aligned(WORKMEM_ALIGN);
 
 struct nx842_coproc {
 	unsigned int chip_id;
 	unsigned int ct;
-	unsigned int ci;
+	unsigned int ci;	/* Coprocessor instance, used with icswx */
+	struct {
+		struct vas_window *rxwin;
+		int id;
+	} vas;
 	struct list_head list;
 };
 
+/*
+ * Send the request to NX engine on the chip for the corresponding CPU
+ * where the process is executing. Use with VAS function.
+ */
+static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
+
 /* no cpu hotplug on powernv, so this list never changes after init */
 static LIST_HEAD(nx842_coprocs);
 static unsigned int nx842_ct;	/* used in icswx function */
@@ -513,6 +528,105 @@  static int nx842_exec_icswx(const unsigned char *in, unsigned int inlen,
 }
 
 /**
+ * nx842_exec_vas - compress/decompress data using the 842 algorithm
+ *
+ * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
+ * This compresses or decompresses the provided input buffer into the provided
+ * output buffer.
+ *
+ * Upon return from this function @outlen contains the length of the
+ * output data.  If there is an error then @outlen will be 0 and an
+ * error will be specified by the return code from this function.
+ *
+ * The @workmem buffer should only be used by one function call at a time.
+ *
+ * @in: input buffer pointer
+ * @inlen: input buffer size
+ * @out: output buffer pointer
+ * @outlenp: output buffer size pointer
+ * @workmem: working memory buffer pointer, size determined by
+ *           nx842_powernv_driver.workmem_size
+ * @fc: function code, see CCW Function Codes in nx-842.h
+ *
+ * Returns:
+ *   0		Success, output of length @outlenp stored in the buffer
+ *		at @out
+ *   -ENODEV	Hardware unavailable
+ *   -ENOSPC	Output buffer is to small
+ *   -EMSGSIZE	Input buffer too large
+ *   -EINVAL	buffer constraints do not fix nx842_constraints
+ *   -EPROTO	hardware error during operation
+ *   -ETIMEDOUT	hardware did not complete operation in reasonable time
+ *   -EINTR	operation was aborted
+ */
+static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
+				  unsigned char *out, unsigned int *outlenp,
+				  void *workmem, int fc)
+{
+	struct coprocessor_request_block *crb;
+	struct coprocessor_status_block *csb;
+	struct nx842_workmem *wmem;
+	struct vas_window *txwin;
+	int ret, i = 0;
+	u32 ccw;
+	unsigned int outlen = *outlenp;
+
+	wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
+
+	*outlenp = 0;
+
+	crb = &wmem->crb;
+	csb = &crb->csb;
+
+	ret = nx842_config_crb(in, inlen, out, outlen, wmem);
+	if (ret)
+		return ret;
+
+	ccw = 0;
+	ccw = SET_FIELD(CCW_FC_842, ccw, fc);
+	crb->ccw = cpu_to_be32(ccw);
+
+	txwin = wmem->txwin;
+	/* shoudn't happen, we don't load without a coproc */
+	if (!txwin) {
+		pr_err_ratelimited("NX-842 coprocessor is not available");
+		return -ENODEV;
+	}
+
+	do {
+		wmem->start = ktime_get();
+		preempt_disable();
+		/*
+		 * VAS copy CRB into L2 cache. Refer <asm/vas.h>.
+		 * @crb, @offset and @first (must be true)
+		 */
+		vas_copy_crb(crb, 0, 1);
+
+		/*
+		 * VAS paste previously copied CRB to NX.
+		 * @txwin, @offset, @last (must be true) and @re is 
+		 * expected/assumed to be true for NX windows.
+		 */
+		ret = vas_paste_crb(txwin, 0, 1, 1);
+		preempt_enable();
+		/*
+		 * Retry copy/paste function for VAS failures.
+		 */
+	} while (ret && (i++ < VAS_RETRIES));
+
+	if (ret) {
+		pr_err_ratelimited("VAS copy/paste failed\n");
+		return ret;
+	}
+
+	ret = wait_for_csb(wmem, csb);
+	if (!ret)
+		*outlenp = be32_to_cpu(csb->count);
+
+	return ret;
+}
+
+/**
  * nx842_powernv_compress - Compress data using the 842 algorithm
  *
  * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
@@ -576,6 +690,198 @@  static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
 	list_add(&coproc->list, &nx842_coprocs);
 }
 
+/*
+ * Identify chip ID for each CPU and save coprocesor adddress for the
+ * corresponding NX engine in percpu coproc_inst.
+ * coproc_inst is used in crypto_init to open send window on the NX instance
+ * for the corresponding CPU / chip where the open request is executed.
+ */
+static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
+{
+	unsigned int i, chip_id;
+
+	for_each_possible_cpu(i) {
+		chip_id = cpu_to_chip_id(i);
+
+		if (coproc->chip_id == chip_id)
+			per_cpu(coproc_inst, i) = coproc;
+	}
+}
+
+
+static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
+{
+	struct vas_window *txwin = NULL;
+	struct vas_tx_win_attr txattr;
+
+	/*
+	 * Kernel requests will be high priority. So open send
+	 * windows only for high priority RxFIFO entries.
+	 */
+	vas_init_tx_win_attr(&txattr, coproc->ct);
+	txattr.lpid = 0;	/* lpid is 0 for kernel requests */
+	txattr.pid = mfspr(SPRN_PID);
+
+	/*
+	 * Open a VAS send window which is used to send request to NX.
+	 */
+	txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
+	if (IS_ERR(txwin)) {
+		pr_err("ibm,nx-842: Can not open TX window: %ld\n",
+				PTR_ERR(txwin));
+		return NULL;
+	}
+
+	return txwin;
+}
+
+static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
+					int vasid)
+{
+	struct vas_window *rxwin = NULL;
+	struct vas_rx_win_attr rxattr;
+	struct nx842_coproc *coproc;
+	u32 lpid, pid, tid, fifo_size;
+	u64 rx_fifo;
+	const char *priority;
+	int ret;
+
+	ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
+	if (ret) {
+		pr_err("Missing rx-fifo-address property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
+	if (ret) {
+		pr_err("Missing rx-fifo-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(dn, "lpid", &lpid);
+	if (ret) {
+		pr_err("Missing lpid property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(dn, "pid", &pid);
+	if (ret) {
+		pr_err("Missing pid property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(dn, "tid", &tid);
+	if (ret) {
+		pr_err("Missing tid property\n");
+		return ret;
+	}
+
+	ret = of_property_read_string(dn, "priority", &priority);
+	if (ret) {
+		pr_err("Missing priority property\n");
+		return ret;
+	}
+
+	coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
+	if (!coproc)
+		return -ENOMEM;
+
+	if (!strcmp(priority, "High"))
+		coproc->ct = VAS_COP_TYPE_842_HIPRI;
+	else if (!strcmp(priority, "Normal"))
+		coproc->ct = VAS_COP_TYPE_842;
+	else {
+		pr_err("Invalid RxFIFO priority value\n");
+		ret =  -EINVAL;
+		goto err_out;
+	}
+
+	vas_init_rx_win_attr(&rxattr, coproc->ct);
+	rxattr.rx_fifo = (void *)rx_fifo;
+	rxattr.rx_fifo_size = fifo_size;
+	rxattr.lnotify_lpid = lpid;
+	rxattr.lnotify_pid = pid;
+	rxattr.lnotify_tid = tid;
+	rxattr.wcreds_max = MAX_CREDITS_PER_RXFIFO;
+
+	/*
+	 * Open a VAS receice window which is used to configure RxFIFO
+	 * for NX.
+	 */
+	rxwin = vas_rx_win_open(vasid, coproc->ct, &rxattr);
+	if (IS_ERR(rxwin)) {
+		ret = PTR_ERR(rxwin);
+		pr_err("setting RxFIFO with VAS failed: %d\n",
+			ret);
+		goto err_out;
+	}
+
+	coproc->vas.rxwin = rxwin;
+	coproc->vas.id = vasid;
+	nx842_add_coprocs_list(coproc, chip_id);
+
+	/*
+	 * Kernel requests use only high priority FIFOs. So save coproc
+	 * info in percpu coproc_inst which will be used to open send
+	 * windows for crypto open requests later.
+	 */
+	if (coproc->ct == VAS_COP_TYPE_842_HIPRI)
+		nx842_set_per_cpu_coproc(coproc);
+
+	return 0;
+
+err_out:
+	kfree(coproc);
+	return ret;
+}
+
+
+static int __init nx842_powernv_probe_vas(struct device_node *pn)
+{
+	struct device_node *dn;
+	int chip_id, vasid, ret = 0;
+	int nx_fifo_found = 0;
+
+	chip_id = of_get_ibm_chip_id(pn);
+	if (chip_id < 0) {
+		pr_err("ibm,chip-id missing\n");
+		return -EINVAL;
+	}
+
+	dn = of_find_compatible_node(pn, NULL, "ibm,power9-vas-x");
+
+	if (!dn) {
+		pr_err("Missing VAS device node\n");
+		return -EINVAL;
+	}
+
+	if (of_property_read_u32(dn, "ibm,vas-id", &vasid)) {
+		pr_err("Missing ibm,vas-id device property\n");
+		of_node_put(dn);
+		return -EINVAL;
+	}
+
+	of_node_put(dn);
+
+	for_each_child_of_node(pn, dn) {
+		if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
+			ret = vas_cfg_coproc_info(dn, chip_id, vasid);
+			if (ret) {
+				of_node_put(dn);
+				return ret;
+			}
+			nx_fifo_found++;
+		}
+	}
+
+	if (!nx_fifo_found) {
+		pr_err("NX842 FIFO nodes are missing\n");
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 static int __init nx842_powernv_probe(struct device_node *dn)
 {
 	struct nx842_coproc *coproc;
@@ -622,6 +928,9 @@  static void nx842_delete_coprocs(void)
 	struct nx842_coproc *coproc, *n;
 
 	list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
+		if (coproc->vas.rxwin)
+			vas_win_close(coproc->vas.rxwin);
+
 		list_del(&coproc->list);
 		kfree(coproc);
 	}
@@ -643,6 +952,46 @@  static struct nx842_driver nx842_powernv_driver = {
 	.decompress =	nx842_powernv_decompress,
 };
 
+static int nx842_powernv_crypto_init_vas(struct crypto_tfm *tfm)
+{
+	struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct nx842_workmem *wmem;
+	struct nx842_coproc *coproc;
+	int ret;
+
+	ret = nx842_crypto_init(tfm, &nx842_powernv_driver);
+
+	if (ret)
+		return ret;
+
+	wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
+	coproc = per_cpu(coproc_inst, smp_processor_id());
+
+	ret = -EINVAL;
+	if (coproc && coproc->vas.rxwin) {
+		wmem->txwin = nx842_alloc_txwin(coproc);
+		if (!IS_ERR(wmem->txwin))
+			return 0;
+
+		ret = PTR_ERR(wmem->txwin);
+	}
+
+	return ret;
+}
+
+void nx842_powernv_crypto_exit_vas(struct crypto_tfm *tfm)
+{
+	struct nx842_crypto_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct nx842_workmem *wmem;
+
+	wmem = PTR_ALIGN((struct nx842_workmem *)ctx->wmem, WORKMEM_ALIGN);
+
+	if (wmem && wmem->txwin)
+		vas_win_close(wmem->txwin);
+
+	nx842_crypto_exit(tfm);
+}
+
 static int nx842_powernv_crypto_init(struct crypto_tfm *tfm)
 {
 	return nx842_crypto_init(tfm, &nx842_powernv_driver);
@@ -676,13 +1025,27 @@  static __init int nx842_powernv_init(void)
 	BUILD_BUG_ON(DDE_BUFFER_ALIGN % DDE_BUFFER_SIZE_MULT);
 	BUILD_BUG_ON(DDE_BUFFER_SIZE_MULT % DDE_BUFFER_LAST_MULT);
 
-	for_each_compatible_node(dn, NULL, "ibm,power-nx")
-		nx842_powernv_probe(dn);
+	for_each_compatible_node(dn, NULL, "ibm,power9-nx") {
+		ret = nx842_powernv_probe_vas(dn);
+		if (ret) {
+			nx842_delete_coprocs();
+			return ret;
+		}
+	}
 
-	if (!nx842_ct)
-		return -ENODEV;
+	if (list_empty(&nx842_coprocs)) {
+		for_each_compatible_node(dn, NULL, "ibm,power-nx")
+			nx842_powernv_probe(dn);
+
+		if (!nx842_ct)
+			return -ENODEV;
 
-	nx842_powernv_exec = nx842_exec_icswx;
+		nx842_powernv_exec = nx842_exec_icswx;
+	} else {
+		nx842_powernv_exec = nx842_exec_vas;
+		nx842_powernv_alg.cra_init = nx842_powernv_crypto_init_vas;
+		nx842_powernv_alg.cra_exit = nx842_powernv_crypto_exit_vas;
+	}
 
 	ret = crypto_register_alg(&nx842_powernv_alg);
 	if (ret) {
diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index d94e25df503b..da3cb8c35ec7 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -116,7 +116,7 @@  int nx842_crypto_init(struct crypto_tfm *tfm, struct nx842_driver *driver)
 
 	spin_lock_init(&ctx->lock);
 	ctx->driver = driver;
-	ctx->wmem = kmalloc(driver->workmem_size, GFP_KERNEL);
+	ctx->wmem = kzalloc(driver->workmem_size, GFP_KERNEL);
 	ctx->sbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
 	ctx->dbounce = (u8 *)__get_free_pages(GFP_KERNEL, BOUNCE_BUFFER_ORDER);
 	if (!ctx->wmem || !ctx->sbounce || !ctx->dbounce) {