diff mbox series

[16/17] fpga: dfl: fme: add global error reporting support

Message ID 1553483264-5379-17-git-send-email-hao.wu@intel.com (mailing list archive)
State Superseded, archived
Headers show
Series add new features for FPGA DFL drivers | expand

Commit Message

Wu, Hao March 25, 2019, 3:07 a.m. UTC
This patch adds support for global error reporting for FPGA
Management Engine (FME), it introduces sysfs interfaces to
report different error detected by the hardware, and allow
user to clear errors or inject error for testing purpose.

Signed-off-by: Luwei Kang <luwei.kang@intel.com>
Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
---
 Documentation/ABI/testing/sysfs-platform-dfl-fme |  58 ++++
 drivers/fpga/Makefile                            |   2 +-
 drivers/fpga/dfl-fme-error.c                     | 390 +++++++++++++++++++++++
 drivers/fpga/dfl-fme-main.c                      |   4 +
 drivers/fpga/dfl-fme.h                           |   2 +
 drivers/fpga/dfl.h                               |   2 +
 6 files changed, 457 insertions(+), 1 deletion(-)
 create mode 100644 drivers/fpga/dfl-fme-error.c

Comments

Alan Tull April 9, 2019, 9:35 p.m. UTC | #1
On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@intel.com> wrote:

Hi Hao,

>
> This patch adds support for global error reporting for FPGA
> Management Engine (FME), it introduces sysfs interfaces to
> report different error detected by the hardware, and allow
> user to clear errors or inject error for testing purpose.
>
> Signed-off-by: Luwei Kang <luwei.kang@intel.com>
> Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> Signed-off-by: Wu Hao <hao.wu@intel.com>
> ---
>  Documentation/ABI/testing/sysfs-platform-dfl-fme |  58 ++++
>  drivers/fpga/Makefile                            |   2 +-
>  drivers/fpga/dfl-fme-error.c                     | 390 +++++++++++++++++++++++
>  drivers/fpga/dfl-fme-main.c                      |   4 +
>  drivers/fpga/dfl-fme.h                           |   2 +
>  drivers/fpga/dfl.h                               |   2 +
>  6 files changed, 457 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/fpga/dfl-fme-error.c
>
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> index 4b6448f..38f9cdd 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> @@ -156,3 +156,61 @@ KernelVersion:  5.2
>  Contact:       Wu Hao <hao.wu@intel.com>
>  Description:   Read-only. Read this file to get power limit for fpga, it
>                 is only valid for integrated solution.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. Read this file to get errors detected by hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. Read this file to get the first error detected by
> +               hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. Read this file to get the second error detected by
> +               hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Write-only. Write error code to this file to clear errors. If
> +               the input error code doesn't match, it returns -EBUSY.

As with the afu errors patch, seems like -EINVAL would be better.

> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. It returns errors detected on pcie0 link.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. It returns errors detected on pcie1 link.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. It returns non-fatal errors detected.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-only. It returns catastrophic and fatal errors detected.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@intel.com>
> +Description:   Read-Write. Write this file to inject errors for testing
> +               purpose. Read this file to check errors injected.
> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index f1f0af7..1a9fa3d 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE)     += dfl-fme-br.o
>  obj-$(CONFIG_FPGA_DFL_FME_REGION)      += dfl-fme-region.o
>  obj-$(CONFIG_FPGA_DFL_AFU)             += dfl-afu.o
>
> -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
>  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
>  dfl-afu-objs += dfl-afu-error.o
>
> diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
> new file mode 100644
> index 0000000..f2bd5f8
> --- /dev/null
> +++ b/drivers/fpga/dfl-fme-error.c
> @@ -0,0 +1,390 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Management Engine Error Management
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + *   Kang Luwei <luwei.kang@intel.com>
> + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> + *   Wu Hao <hao.wu@intel.com>
> + *   Joseph Grecco <joe.grecco@intel.com>
> + *   Enno Luebbers <enno.luebbers@intel.com>
> + *   Tim Whisonant <tim.whisonant@intel.com>
> + *   Ananda Ravuri <ananda.ravuri@intel.com>
> + *   Mitchel, Henry <henry.mitchel@intel.com>
> + */
> +
> +#include <linux/uaccess.h>
> +
> +#include "dfl.h"
> +#include "dfl-fme.h"
> +
> +#define FME_ERROR_MASK         0x8
> +#define FME_ERROR              0x10
> +#define MBP_ERROR              BIT_ULL(6)
> +#define PCIE0_ERROR_MASK       0x18
> +#define PCIE0_ERROR            0x20
> +#define PCIE1_ERROR_MASK       0x28
> +#define PCIE1_ERROR            0x30
> +#define FME_FIRST_ERROR                0x38
> +#define FME_NEXT_ERROR         0x40
> +#define RAS_NONFAT_ERROR_MASK  0x48
> +#define RAS_NONFAT_ERROR       0x50
> +#define RAS_CATFAT_ERROR_MASK  0x58
> +#define RAS_CATFAT_ERROR       0x60
> +#define RAS_ERROR_INJECT       0x68
> +#define INJECT_ERROR_MASK      GENMASK_ULL(2, 0)
> +
> +static ssize_t errors_show(struct device *dev,
> +                          struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_ERROR));
> +}
> +static DEVICE_ATTR_RO(errors);
> +
> +static ssize_t first_error_show(struct device *dev,
> +                               struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_FIRST_ERROR));
> +}
> +static DEVICE_ATTR_RO(first_error);
> +
> +static ssize_t next_error_show(struct device *dev,
> +                              struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_NEXT_ERROR));
> +}
> +static DEVICE_ATTR_RO(next_error);
> +
> +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> +                          const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       struct dfl_feature *feature;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;

from kstrtox.c:
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.

Your sysfs doc could be updated to explain these return codes.

> +
> +       feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +       base = feature->ioaddr;
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
> +
> +       v = readq(base + FME_ERROR);
> +       if (val != v) {
> +               ret = -EINVAL;

Oh wait, that's what I thought it should be ;)  so the doc just needs to change.

> +               goto done;

It would be easy to avoid using 'goto' here.

> +       }
> +
> +       writeq(v, base + FME_ERROR);
> +       v = readq(base + FME_FIRST_ERROR);
> +       writeq(v, base + FME_FIRST_ERROR);
> +       v = readq(base + FME_NEXT_ERROR);
> +       writeq(v, base + FME_NEXT_ERROR);
> +
> +done:
> +       /* Workaround: disable MBP_ERROR if feature revision is 0 */
> +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> +              base + FME_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_WO(clear);
> +
> +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> +                            char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);

The revision attr to be documented in the sysfs doc.

> +
> +static ssize_t pcie0_errors_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + PCIE0_ERROR));
> +}
> +
> +static ssize_t pcie0_errors_store(struct device *dev,

The sysfs doc shows this as read-only.

> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
> +
> +       v = readq(base + PCIE0_ERROR);
> +       if (val != v)
> +               ret = -EBUSY;
> +       else
> +               writeq(v, base + PCIE0_ERROR);
> +
> +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie0_errors);
> +
> +static ssize_t pcie1_errors_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + PCIE1_ERROR));
> +}
> +
> +static ssize_t pcie1_errors_store(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)

Same here, documentation needs to show as R/W and explain what happens
when written.

> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
> +
> +       v = readq(base + PCIE1_ERROR);
> +       if (val != v)
> +               ret = -EBUSY;
> +       else
> +               writeq(v, base + PCIE1_ERROR);
> +
> +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie1_errors);
> +
> +static ssize_t nonfatal_errors_show(struct device *dev,
> +                                   struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + RAS_NONFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(nonfatal_errors);
> +
> +static ssize_t catfatal_errors_show(struct device *dev,
> +                                   struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + RAS_CATFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(catfatal_errors);
> +
> +static ssize_t inject_error_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       v = readq(base + RAS_ERROR_INJECT);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
> +}
> +
> +static ssize_t inject_error_store(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u8 inject_error;
> +       int ret;
> +       u64 v;
> +
> +       ret = kstrtou8(buf, 0, &inject_error);
> +       if (ret)
> +               return ret;
> +
> +       if (inject_error & ~INJECT_ERROR_MASK)
> +               return -EINVAL;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       v = readq(base + RAS_ERROR_INJECT);
> +       v &= ~INJECT_ERROR_MASK;
> +       v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
> +       writeq(v, base + RAS_ERROR_INJECT);
> +       mutex_unlock(&pdata->lock);
> +
> +       return count;
> +}
> +static DEVICE_ATTR_RW(inject_error);
> +
> +static struct attribute *fme_errors_attrs[] = {
> +       &dev_attr_errors.attr,
> +       &dev_attr_first_error.attr,
> +       &dev_attr_next_error.attr,
> +       &dev_attr_clear.attr,
> +       NULL,
> +};
> +
> +static struct attribute_group fme_errors_attr_group = {
> +       .attrs  = fme_errors_attrs,
> +       .name   = "fme-errors",
> +};
> +
> +static struct attribute *errors_attrs[] = {
> +       &dev_attr_revision.attr,
> +       &dev_attr_pcie0_errors.attr,
> +       &dev_attr_pcie1_errors.attr,
> +       &dev_attr_nonfatal_errors.attr,
> +       &dev_attr_catfatal_errors.attr,
> +       &dev_attr_inject_error.attr,
> +       NULL,
> +};
> +
> +static struct attribute_group errors_attr_group = {
> +       .attrs  = errors_attrs,
> +};
> +
> +static const struct attribute_group *error_groups[] = {
> +       &fme_errors_attr_group,
> +       &errors_attr_group,
> +       NULL
> +};
> +
> +static void fme_error_enable(struct dfl_feature *feature)
> +{
> +       void __iomem *base = feature->ioaddr;
> +
> +       /* Workaround: disable MBP_ERROR if revision is 0 */
> +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> +              base + FME_ERROR_MASK);
> +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> +       writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
> +       writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
> +}
> +
> +static void err_dev_release(struct device *dev)
> +{
> +       kfree(dev);
> +}
> +
> +static int fme_global_err_init(struct platform_device *pdev,
> +                              struct dfl_feature *feature)
> +{
> +       struct device *dev;
> +       int ret = 0;
> +
> +       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +       if (!dev)
> +               return -ENOMEM;
> +
> +       dev->parent = &pdev->dev;
> +       dev->release = err_dev_release;
> +       dev_set_name(dev, "errors");
> +
> +       fme_error_enable(feature);
> +
> +       ret = device_register(dev);
> +       if (ret) {
> +               put_device(dev);
> +               return ret;
> +       }
> +
> +       ret = sysfs_create_groups(&dev->kobj, error_groups);
> +       if (ret) {
> +               device_unregister(dev);
> +               return ret;
> +       }
> +
> +       feature->priv = dev;
> +
> +       return ret;
> +}
> +
> +static void fme_global_err_uinit(struct platform_device *pdev,
> +                                struct dfl_feature *feature)
> +{
> +       struct device *dev = feature->priv;
> +
> +       sysfs_remove_groups(&dev->kobj, error_groups);
> +       device_unregister(dev);
> +}
> +
> +const struct dfl_feature_id fme_global_err_id_table[] = {
> +       {.id = FME_FEATURE_ID_GLOBAL_ERR,},
> +       {0,}
> +};
> +
> +const struct dfl_feature_ops fme_global_err_ops = {
> +       .init = fme_global_err_init,
> +       .uinit = fme_global_err_uinit,
> +};
> diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> index dafa6580..76cb112 100644
> --- a/drivers/fpga/dfl-fme-main.c
> +++ b/drivers/fpga/dfl-fme-main.c
> @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
>                 .ops = &fme_power_mgmt_ops,
>         },
>         {
> +               .id_table = fme_global_err_id_table,
> +               .ops = &fme_global_err_ops,
> +       },
> +       {
>                 .ops = NULL,
>         },
>  };
> diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> index 7a021c4..5fbe3f5 100644
> --- a/drivers/fpga/dfl-fme.h
> +++ b/drivers/fpga/dfl-fme.h
> @@ -37,5 +37,7 @@ struct dfl_fme {
>
>  extern const struct dfl_feature_ops fme_pr_mgmt_ops;
>  extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> +extern const struct dfl_feature_ops fme_global_err_ops;
> +extern const struct dfl_feature_id fme_global_err_id_table[];
>
>  #endif /* __DFL_FME_H */
> diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> index fbc57f0..6c32080 100644
> --- a/drivers/fpga/dfl.h
> +++ b/drivers/fpga/dfl.h
> @@ -197,12 +197,14 @@ struct dfl_feature_driver {
>   *                 feature dev (platform device)'s reources.
>   * @ioaddr: mapped mmio resource address.
>   * @ops: ops of this sub feature.
> + * @priv: priv data of this feature.
>   */
>  struct dfl_feature {
>         u64 id;
>         int resource_index;
>         void __iomem *ioaddr;
>         const struct dfl_feature_ops *ops;
> +       void *priv;
>  };
>
>  #define DEV_STATUS_IN_USE      0
> --
> 2.7.4
>

Thanks,
Alan
Wu, Hao April 10, 2019, 1:34 a.m. UTC | #2
On Tue, Apr 09, 2019 at 04:35:25PM -0500, Alan Tull wrote:
> On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@intel.com> wrote:
> 
> Hi Hao,
> 
> >
> > This patch adds support for global error reporting for FPGA
> > Management Engine (FME), it introduces sysfs interfaces to
> > report different error detected by the hardware, and allow
> > user to clear errors or inject error for testing purpose.
> >
> > Signed-off-by: Luwei Kang <luwei.kang@intel.com>
> > Signed-off-by: Ananda Ravuri <ananda.ravuri@intel.com>
> > Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> > Signed-off-by: Wu Hao <hao.wu@intel.com>
> > ---
> >  Documentation/ABI/testing/sysfs-platform-dfl-fme |  58 ++++
> >  drivers/fpga/Makefile                            |   2 +-
> >  drivers/fpga/dfl-fme-error.c                     | 390 +++++++++++++++++++++++
> >  drivers/fpga/dfl-fme-main.c                      |   4 +
> >  drivers/fpga/dfl-fme.h                           |   2 +
> >  drivers/fpga/dfl.h                               |   2 +
> >  6 files changed, 457 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/fpga/dfl-fme-error.c
> >
> > diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > index 4b6448f..38f9cdd 100644
> > --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > @@ -156,3 +156,61 @@ KernelVersion:  5.2
> >  Contact:       Wu Hao <hao.wu@intel.com>
> >  Description:   Read-only. Read this file to get power limit for fpga, it
> >                 is only valid for integrated solution.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. Read this file to get errors detected by hardware.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. Read this file to get the first error detected by
> > +               hardware.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. Read this file to get the second error detected by
> > +               hardware.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Write-only. Write error code to this file to clear errors. If
> > +               the input error code doesn't match, it returns -EBUSY.
> 
> As with the afu errors patch, seems like -EINVAL would be better.
> 
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. It returns errors detected on pcie0 link.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. It returns errors detected on pcie1 link.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. It returns non-fatal errors detected.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-only. It returns catastrophic and fatal errors detected.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
> > +Date:          March 2019
> > +KernelVersion:  5.2
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Write this file to inject errors for testing
> > +               purpose. Read this file to check errors injected.
> > diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> > index f1f0af7..1a9fa3d 100644
> > --- a/drivers/fpga/Makefile
> > +++ b/drivers/fpga/Makefile
> > @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE)     += dfl-fme-br.o
> >  obj-$(CONFIG_FPGA_DFL_FME_REGION)      += dfl-fme-region.o
> >  obj-$(CONFIG_FPGA_DFL_AFU)             += dfl-afu.o
> >
> > -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> > +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
> >  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> >  dfl-afu-objs += dfl-afu-error.o
> >
> > diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
> > new file mode 100644
> > index 0000000..f2bd5f8
> > --- /dev/null
> > +++ b/drivers/fpga/dfl-fme-error.c
> > @@ -0,0 +1,390 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Driver for FPGA Management Engine Error Management
> > + *
> > + * Copyright 2019 Intel Corporation, Inc.
> > + *
> > + * Authors:
> > + *   Kang Luwei <luwei.kang@intel.com>
> > + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> > + *   Wu Hao <hao.wu@intel.com>
> > + *   Joseph Grecco <joe.grecco@intel.com>
> > + *   Enno Luebbers <enno.luebbers@intel.com>
> > + *   Tim Whisonant <tim.whisonant@intel.com>
> > + *   Ananda Ravuri <ananda.ravuri@intel.com>
> > + *   Mitchel, Henry <henry.mitchel@intel.com>
> > + */
> > +
> > +#include <linux/uaccess.h>
> > +
> > +#include "dfl.h"
> > +#include "dfl-fme.h"
> > +
> > +#define FME_ERROR_MASK         0x8
> > +#define FME_ERROR              0x10
> > +#define MBP_ERROR              BIT_ULL(6)
> > +#define PCIE0_ERROR_MASK       0x18
> > +#define PCIE0_ERROR            0x20
> > +#define PCIE1_ERROR_MASK       0x28
> > +#define PCIE1_ERROR            0x30
> > +#define FME_FIRST_ERROR                0x38
> > +#define FME_NEXT_ERROR         0x40
> > +#define RAS_NONFAT_ERROR_MASK  0x48
> > +#define RAS_NONFAT_ERROR       0x50
> > +#define RAS_CATFAT_ERROR_MASK  0x58
> > +#define RAS_CATFAT_ERROR       0x60
> > +#define RAS_ERROR_INJECT       0x68
> > +#define INJECT_ERROR_MASK      GENMASK_ULL(2, 0)
> > +
> > +static ssize_t errors_show(struct device *dev,
> > +                          struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + FME_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(errors);
> > +
> > +static ssize_t first_error_show(struct device *dev,
> > +                               struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + FME_FIRST_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(first_error);
> > +
> > +static ssize_t next_error_show(struct device *dev,
> > +                              struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + FME_NEXT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(next_error);
> > +
> > +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> > +                          const char *buf, size_t count)
> > +{
> > +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > +       struct device *err_dev = dev->parent;
> > +       struct dfl_feature *feature;
> > +       void __iomem *base;
> > +       u64 v, val;
> > +       int ret;
> > +
> > +       ret = kstrtou64(buf, 0, &val);
> > +       if (ret)
> > +               return ret;
> 
> from kstrtox.c:
> * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
> 
> Your sysfs doc could be updated to explain these return codes.

Sure, will fix this.

> 
> > +
> > +       feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +       base = feature->ioaddr;
> > +
> > +       mutex_lock(&pdata->lock);
> > +       writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
> > +
> > +       v = readq(base + FME_ERROR);
> > +       if (val != v) {
> > +               ret = -EINVAL;
> 
> Oh wait, that's what I thought it should be ;)  so the doc just needs to change.

Sure, will capture the detailed return values in doc.

> 
> > +               goto done;
> 
> It would be easy to avoid using 'goto' here.

Yes, agree.

> 
> > +       }
> > +
> > +       writeq(v, base + FME_ERROR);
> > +       v = readq(base + FME_FIRST_ERROR);
> > +       writeq(v, base + FME_FIRST_ERROR);
> > +       v = readq(base + FME_NEXT_ERROR);
> > +       writeq(v, base + FME_NEXT_ERROR);
> > +
> > +done:
> > +       /* Workaround: disable MBP_ERROR if feature revision is 0 */
> > +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> > +              base + FME_ERROR_MASK);
> > +       mutex_unlock(&pdata->lock);
> > +       return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_WO(clear);
> > +
> > +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> > +                            char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> > +}
> > +static DEVICE_ATTR_RO(revision);
> 
> The revision attr to be documented in the sysfs doc.

Will fix this.

> 
> > +
> > +static ssize_t pcie0_errors_show(struct device *dev,
> > +                                struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + PCIE0_ERROR));
> > +}
> > +
> > +static ssize_t pcie0_errors_store(struct device *dev,
> 
> The sysfs doc shows this as read-only.
> 
> > +                                 struct device_attribute *attr,
> > +                                 const char *buf, size_t count)
> > +{
> > +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +       u64 v, val;
> > +       int ret;
> > +
> > +       ret = kstrtou64(buf, 0, &val);
> > +       if (ret)
> > +               return ret;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
> > +
> > +       v = readq(base + PCIE0_ERROR);
> > +       if (val != v)
> > +               ret = -EBUSY;
> > +       else
> > +               writeq(v, base + PCIE0_ERROR);
> > +
> > +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> > +       mutex_unlock(&pdata->lock);
> > +       return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_RW(pcie0_errors);
> > +
> > +static ssize_t pcie1_errors_show(struct device *dev,
> > +                                struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + PCIE1_ERROR));
> > +}
> > +
> > +static ssize_t pcie1_errors_store(struct device *dev,
> > +                                 struct device_attribute *attr,
> > +                                 const char *buf, size_t count)
> 
> Same here, documentation needs to show as R/W and explain what happens
> when written.

Sorry, will fix them in doc.

> 
> > +{
> > +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +       u64 v, val;
> > +       int ret;
> > +
> > +       ret = kstrtou64(buf, 0, &val);
> > +       if (ret)
> > +               return ret;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
> > +
> > +       v = readq(base + PCIE1_ERROR);
> > +       if (val != v)
> > +               ret = -EBUSY;
> > +       else
> > +               writeq(v, base + PCIE1_ERROR);
> > +
> > +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> > +       mutex_unlock(&pdata->lock);
> > +       return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_RW(pcie1_errors);
> > +
> > +static ssize_t nonfatal_errors_show(struct device *dev,
> > +                                   struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + RAS_NONFAT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(nonfatal_errors);
> > +
> > +static ssize_t catfatal_errors_show(struct device *dev,
> > +                                   struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)readq(base + RAS_CATFAT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(catfatal_errors);
> > +
> > +static ssize_t inject_error_show(struct device *dev,
> > +                                struct device_attribute *attr, char *buf)
> > +{
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +       u64 v;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       v = readq(base + RAS_ERROR_INJECT);
> > +
> > +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > +                        (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
> > +}
> > +
> > +static ssize_t inject_error_store(struct device *dev,
> > +                                 struct device_attribute *attr,
> > +                                 const char *buf, size_t count)
> > +{
> > +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > +       struct device *err_dev = dev->parent;
> > +       void __iomem *base;
> > +       u8 inject_error;
> > +       int ret;
> > +       u64 v;
> > +
> > +       ret = kstrtou8(buf, 0, &inject_error);
> > +       if (ret)
> > +               return ret;
> > +
> > +       if (inject_error & ~INJECT_ERROR_MASK)
> > +               return -EINVAL;
> > +
> > +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + RAS_ERROR_INJECT);
> > +       v &= ~INJECT_ERROR_MASK;
> > +       v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
> > +       writeq(v, base + RAS_ERROR_INJECT);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return count;
> > +}
> > +static DEVICE_ATTR_RW(inject_error);
> > +
> > +static struct attribute *fme_errors_attrs[] = {
> > +       &dev_attr_errors.attr,
> > +       &dev_attr_first_error.attr,
> > +       &dev_attr_next_error.attr,
> > +       &dev_attr_clear.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group fme_errors_attr_group = {
> > +       .attrs  = fme_errors_attrs,
> > +       .name   = "fme-errors",
> > +};
> > +
> > +static struct attribute *errors_attrs[] = {
> > +       &dev_attr_revision.attr,
> > +       &dev_attr_pcie0_errors.attr,
> > +       &dev_attr_pcie1_errors.attr,
> > +       &dev_attr_nonfatal_errors.attr,
> > +       &dev_attr_catfatal_errors.attr,
> > +       &dev_attr_inject_error.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group errors_attr_group = {
> > +       .attrs  = errors_attrs,
> > +};
> > +
> > +static const struct attribute_group *error_groups[] = {
> > +       &fme_errors_attr_group,
> > +       &errors_attr_group,
> > +       NULL
> > +};
> > +
> > +static void fme_error_enable(struct dfl_feature *feature)
> > +{
> > +       void __iomem *base = feature->ioaddr;
> > +
> > +       /* Workaround: disable MBP_ERROR if revision is 0 */
> > +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> > +              base + FME_ERROR_MASK);
> > +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> > +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> > +       writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
> > +       writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
> > +}
> > +
> > +static void err_dev_release(struct device *dev)
> > +{
> > +       kfree(dev);
> > +}
> > +
> > +static int fme_global_err_init(struct platform_device *pdev,
> > +                              struct dfl_feature *feature)
> > +{
> > +       struct device *dev;
> > +       int ret = 0;
> > +
> > +       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> > +       if (!dev)
> > +               return -ENOMEM;
> > +
> > +       dev->parent = &pdev->dev;
> > +       dev->release = err_dev_release;
> > +       dev_set_name(dev, "errors");
> > +
> > +       fme_error_enable(feature);
> > +
> > +       ret = device_register(dev);
> > +       if (ret) {
> > +               put_device(dev);
> > +               return ret;
> > +       }
> > +
> > +       ret = sysfs_create_groups(&dev->kobj, error_groups);
> > +       if (ret) {
> > +               device_unregister(dev);
> > +               return ret;
> > +       }
> > +
> > +       feature->priv = dev;
> > +
> > +       return ret;
> > +}
> > +
> > +static void fme_global_err_uinit(struct platform_device *pdev,
> > +                                struct dfl_feature *feature)
> > +{
> > +       struct device *dev = feature->priv;
> > +
> > +       sysfs_remove_groups(&dev->kobj, error_groups);
> > +       device_unregister(dev);
> > +}
> > +
> > +const struct dfl_feature_id fme_global_err_id_table[] = {
> > +       {.id = FME_FEATURE_ID_GLOBAL_ERR,},
> > +       {0,}
> > +};
> > +
> > +const struct dfl_feature_ops fme_global_err_ops = {
> > +       .init = fme_global_err_init,
> > +       .uinit = fme_global_err_uinit,
> > +};
> > diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> > index dafa6580..76cb112 100644
> > --- a/drivers/fpga/dfl-fme-main.c
> > +++ b/drivers/fpga/dfl-fme-main.c
> > @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
> >                 .ops = &fme_power_mgmt_ops,
> >         },
> >         {
> > +               .id_table = fme_global_err_id_table,
> > +               .ops = &fme_global_err_ops,
> > +       },
> > +       {
> >                 .ops = NULL,
> >         },
> >  };
> > diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> > index 7a021c4..5fbe3f5 100644
> > --- a/drivers/fpga/dfl-fme.h
> > +++ b/drivers/fpga/dfl-fme.h
> > @@ -37,5 +37,7 @@ struct dfl_fme {
> >
> >  extern const struct dfl_feature_ops fme_pr_mgmt_ops;
> >  extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> > +extern const struct dfl_feature_ops fme_global_err_ops;
> > +extern const struct dfl_feature_id fme_global_err_id_table[];
> >
> >  #endif /* __DFL_FME_H */
> > diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> > index fbc57f0..6c32080 100644
> > --- a/drivers/fpga/dfl.h
> > +++ b/drivers/fpga/dfl.h
> > @@ -197,12 +197,14 @@ struct dfl_feature_driver {
> >   *                 feature dev (platform device)'s reources.
> >   * @ioaddr: mapped mmio resource address.
> >   * @ops: ops of this sub feature.
> > + * @priv: priv data of this feature.
> >   */
> >  struct dfl_feature {
> >         u64 id;
> >         int resource_index;
> >         void __iomem *ioaddr;
> >         const struct dfl_feature_ops *ops;
> > +       void *priv;
> >  };
> >
> >  #define DEV_STATUS_IN_USE      0
> > --
> > 2.7.4
> >

Thanks for the review, will fix these things in the next version.

Hao

> 
> Thanks,
> Alan
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
index 4b6448f..38f9cdd 100644
--- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
+++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
@@ -156,3 +156,61 @@  KernelVersion:  5.2
 Contact:	Wu Hao <hao.wu@intel.com>
 Description:	Read-only. Read this file to get power limit for fpga, it
 		is only valid for integrated solution.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get errors detected by hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get the first error detected by
+		hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. Read this file to get the second error detected by
+		hardware.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Write-only. Write error code to this file to clear errors. If
+		the input error code doesn't match, it returns -EBUSY.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns errors detected on pcie0 link.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns errors detected on pcie1 link.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns non-fatal errors detected.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-only. It returns catastrophic and fatal errors detected.
+
+What:		/sys/bus/platform/devices/dfl-fme.0/errors/inject_error
+Date:		March 2019
+KernelVersion:  5.2
+Contact:	Wu Hao <hao.wu@intel.com>
+Description:	Read-Write. Write this file to inject errors for testing
+		purpose. Read this file to check errors injected.
diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
index f1f0af7..1a9fa3d 100644
--- a/drivers/fpga/Makefile
+++ b/drivers/fpga/Makefile
@@ -38,7 +38,7 @@  obj-$(CONFIG_FPGA_DFL_FME_BRIDGE)	+= dfl-fme-br.o
 obj-$(CONFIG_FPGA_DFL_FME_REGION)	+= dfl-fme-region.o
 obj-$(CONFIG_FPGA_DFL_AFU)		+= dfl-afu.o
 
-dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
+dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
 dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
 dfl-afu-objs += dfl-afu-error.o
 
diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
new file mode 100644
index 0000000..f2bd5f8
--- /dev/null
+++ b/drivers/fpga/dfl-fme-error.c
@@ -0,0 +1,390 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for FPGA Management Engine Error Management
+ *
+ * Copyright 2019 Intel Corporation, Inc.
+ *
+ * Authors:
+ *   Kang Luwei <luwei.kang@intel.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *   Wu Hao <hao.wu@intel.com>
+ *   Joseph Grecco <joe.grecco@intel.com>
+ *   Enno Luebbers <enno.luebbers@intel.com>
+ *   Tim Whisonant <tim.whisonant@intel.com>
+ *   Ananda Ravuri <ananda.ravuri@intel.com>
+ *   Mitchel, Henry <henry.mitchel@intel.com>
+ */
+
+#include <linux/uaccess.h>
+
+#include "dfl.h"
+#include "dfl-fme.h"
+
+#define FME_ERROR_MASK		0x8
+#define FME_ERROR		0x10
+#define MBP_ERROR		BIT_ULL(6)
+#define PCIE0_ERROR_MASK	0x18
+#define PCIE0_ERROR		0x20
+#define PCIE1_ERROR_MASK	0x28
+#define PCIE1_ERROR		0x30
+#define FME_FIRST_ERROR		0x38
+#define FME_NEXT_ERROR		0x40
+#define RAS_NONFAT_ERROR_MASK	0x48
+#define RAS_NONFAT_ERROR	0x50
+#define RAS_CATFAT_ERROR_MASK	0x58
+#define RAS_CATFAT_ERROR	0x60
+#define RAS_ERROR_INJECT	0x68
+#define INJECT_ERROR_MASK	GENMASK_ULL(2, 0)
+
+static ssize_t errors_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + FME_ERROR));
+}
+static DEVICE_ATTR_RO(errors);
+
+static ssize_t first_error_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + FME_FIRST_ERROR));
+}
+static DEVICE_ATTR_RO(first_error);
+
+static ssize_t next_error_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + FME_NEXT_ERROR));
+}
+static DEVICE_ATTR_RO(next_error);
+
+static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	struct dfl_feature *feature;
+	void __iomem *base;
+	u64 v, val;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+	base = feature->ioaddr;
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
+
+	v = readq(base + FME_ERROR);
+	if (val != v) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	writeq(v, base + FME_ERROR);
+	v = readq(base + FME_FIRST_ERROR);
+	writeq(v, base + FME_FIRST_ERROR);
+	v = readq(base + FME_NEXT_ERROR);
+	writeq(v, base + FME_NEXT_ERROR);
+
+done:
+	/* Workaround: disable MBP_ERROR if feature revision is 0 */
+	writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
+	       base + FME_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_WO(clear);
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
+}
+static DEVICE_ATTR_RO(revision);
+
+static ssize_t pcie0_errors_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + PCIE0_ERROR));
+}
+
+static ssize_t pcie0_errors_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u64 v, val;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
+
+	v = readq(base + PCIE0_ERROR);
+	if (val != v)
+		ret = -EBUSY;
+	else
+		writeq(v, base + PCIE0_ERROR);
+
+	writeq(0ULL, base + PCIE0_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie0_errors);
+
+static ssize_t pcie1_errors_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + PCIE1_ERROR));
+}
+
+static ssize_t pcie1_errors_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u64 v, val;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
+
+	v = readq(base + PCIE1_ERROR);
+	if (val != v)
+		ret = -EBUSY;
+	else
+		writeq(v, base + PCIE1_ERROR);
+
+	writeq(0ULL, base + PCIE1_ERROR_MASK);
+	mutex_unlock(&pdata->lock);
+	return ret ? ret : count;
+}
+static DEVICE_ATTR_RW(pcie1_errors);
+
+static ssize_t nonfatal_errors_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + RAS_NONFAT_ERROR));
+}
+static DEVICE_ATTR_RO(nonfatal_errors);
+
+static ssize_t catfatal_errors_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)readq(base + RAS_CATFAT_ERROR));
+}
+static DEVICE_ATTR_RO(catfatal_errors);
+
+static ssize_t inject_error_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u64 v;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	v = readq(base + RAS_ERROR_INJECT);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			 (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
+}
+
+static ssize_t inject_error_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
+	struct device *err_dev = dev->parent;
+	void __iomem *base;
+	u8 inject_error;
+	int ret;
+	u64 v;
+
+	ret = kstrtou8(buf, 0, &inject_error);
+	if (ret)
+		return ret;
+
+	if (inject_error & ~INJECT_ERROR_MASK)
+		return -EINVAL;
+
+	base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
+
+	mutex_lock(&pdata->lock);
+	v = readq(base + RAS_ERROR_INJECT);
+	v &= ~INJECT_ERROR_MASK;
+	v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
+	writeq(v, base + RAS_ERROR_INJECT);
+	mutex_unlock(&pdata->lock);
+
+	return count;
+}
+static DEVICE_ATTR_RW(inject_error);
+
+static struct attribute *fme_errors_attrs[] = {
+	&dev_attr_errors.attr,
+	&dev_attr_first_error.attr,
+	&dev_attr_next_error.attr,
+	&dev_attr_clear.attr,
+	NULL,
+};
+
+static struct attribute_group fme_errors_attr_group = {
+	.attrs	= fme_errors_attrs,
+	.name	= "fme-errors",
+};
+
+static struct attribute *errors_attrs[] = {
+	&dev_attr_revision.attr,
+	&dev_attr_pcie0_errors.attr,
+	&dev_attr_pcie1_errors.attr,
+	&dev_attr_nonfatal_errors.attr,
+	&dev_attr_catfatal_errors.attr,
+	&dev_attr_inject_error.attr,
+	NULL,
+};
+
+static struct attribute_group errors_attr_group = {
+	.attrs	= errors_attrs,
+};
+
+static const struct attribute_group *error_groups[] = {
+	&fme_errors_attr_group,
+	&errors_attr_group,
+	NULL
+};
+
+static void fme_error_enable(struct dfl_feature *feature)
+{
+	void __iomem *base = feature->ioaddr;
+
+	/* Workaround: disable MBP_ERROR if revision is 0 */
+	writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
+	       base + FME_ERROR_MASK);
+	writeq(0ULL, base + PCIE0_ERROR_MASK);
+	writeq(0ULL, base + PCIE1_ERROR_MASK);
+	writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
+	writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
+}
+
+static void err_dev_release(struct device *dev)
+{
+	kfree(dev);
+}
+
+static int fme_global_err_init(struct platform_device *pdev,
+			       struct dfl_feature *feature)
+{
+	struct device *dev;
+	int ret = 0;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->parent = &pdev->dev;
+	dev->release = err_dev_release;
+	dev_set_name(dev, "errors");
+
+	fme_error_enable(feature);
+
+	ret = device_register(dev);
+	if (ret) {
+		put_device(dev);
+		return ret;
+	}
+
+	ret = sysfs_create_groups(&dev->kobj, error_groups);
+	if (ret) {
+		device_unregister(dev);
+		return ret;
+	}
+
+	feature->priv = dev;
+
+	return ret;
+}
+
+static void fme_global_err_uinit(struct platform_device *pdev,
+				 struct dfl_feature *feature)
+{
+	struct device *dev = feature->priv;
+
+	sysfs_remove_groups(&dev->kobj, error_groups);
+	device_unregister(dev);
+}
+
+const struct dfl_feature_id fme_global_err_id_table[] = {
+	{.id = FME_FEATURE_ID_GLOBAL_ERR,},
+	{0,}
+};
+
+const struct dfl_feature_ops fme_global_err_ops = {
+	.init = fme_global_err_init,
+	.uinit = fme_global_err_uinit,
+};
diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
index dafa6580..76cb112 100644
--- a/drivers/fpga/dfl-fme-main.c
+++ b/drivers/fpga/dfl-fme-main.c
@@ -686,6 +686,10 @@  static struct dfl_feature_driver fme_feature_drvs[] = {
 		.ops = &fme_power_mgmt_ops,
 	},
 	{
+		.id_table = fme_global_err_id_table,
+		.ops = &fme_global_err_ops,
+	},
+	{
 		.ops = NULL,
 	},
 };
diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
index 7a021c4..5fbe3f5 100644
--- a/drivers/fpga/dfl-fme.h
+++ b/drivers/fpga/dfl-fme.h
@@ -37,5 +37,7 @@  struct dfl_fme {
 
 extern const struct dfl_feature_ops fme_pr_mgmt_ops;
 extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
+extern const struct dfl_feature_ops fme_global_err_ops;
+extern const struct dfl_feature_id fme_global_err_id_table[];
 
 #endif /* __DFL_FME_H */
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index fbc57f0..6c32080 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -197,12 +197,14 @@  struct dfl_feature_driver {
  *		    feature dev (platform device)'s reources.
  * @ioaddr: mapped mmio resource address.
  * @ops: ops of this sub feature.
+ * @priv: priv data of this feature.
  */
 struct dfl_feature {
 	u64 id;
 	int resource_index;
 	void __iomem *ioaddr;
 	const struct dfl_feature_ops *ops;
+	void *priv;
 };
 
 #define DEV_STATUS_IN_USE	0