Message ID | 20240806043231.624645-1-raag.jadav@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v1] drm/xe/uapi: Bring back reset uevent | expand |
On 06/08/24 10:02, Raag Jadav wrote: This change was originally sent by Himal, so maybe you should keep his authorship. > From: Lucas De Marchi <lucas.demarchi@intel.com> > > Bring back uevent for gt reset failure with better uapi naming. > With this in place we can receive failure event using udev. > > $ udevadm monitor --property --kernel > monitor will print the received events for: > KERNEL - the kernel uevent > > KERNEL[871.188570] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci) > ACTION=change > DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 > SUBSYSTEM=pci > DEVICE_STATUS=NEEDS_RESET > REASON=GT_RESET_FAILED > TILE_ID=0 > GT_ID=0 > DRIVER=xe > PCI_CLASS=30000 > PCI_ID=8086:56B1 > PCI_SUBSYS_ID=8086:1210 > PCI_SLOT_NAME=0000:03:00.0 > MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00 > SEQNUM=6104 > > Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com> > Signed-off-by: Raag Jadav <raag.jadav@intel.com> > --- > drivers/gpu/drm/xe/xe_gt.c | 27 +++++++++++++++++++++++++-- > include/uapi/drm/xe_drm.h | 17 +++++++++++++++++ > 2 files changed, 42 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c > index b04e47186f5b..5ceef0059861 100644 > --- a/drivers/gpu/drm/xe/xe_gt.c > +++ b/drivers/gpu/drm/xe/xe_gt.c > @@ -740,6 +740,30 @@ static int do_gt_restart(struct xe_gt *gt) > return 0; > } > > +static void xe_uevent_gt_reset_failure(struct pci_dev *pdev, u8 tile_id, u8 gt_id) > +{ > + char *reset_event[5]; > + > + reset_event[0] = DRM_XE_RESET_REQUIRED_UEVENT; > + reset_event[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT; > + reset_event[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", tile_id); > + reset_event[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt_id); > + reset_event[4] = NULL; > + kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, reset_event); > + > + kfree(reset_event[2]); > + kfree(reset_event[3]); > +} > + > +static void 
gt_reset_failed(struct xe_gt *gt, int err) > +{ > + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > + > + /* Notify userspace about gt reset failure */ > + xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev), > + gt_to_tile(gt)->id, gt->info.id); > +} > + > static int gt_reset(struct xe_gt *gt) > { > int err; > @@ -795,8 +819,7 @@ static int gt_reset(struct xe_gt *gt) > XE_WARN_ON(xe_uc_start(&gt->uc)); > xe_pm_runtime_put(gt_to_xe(gt)); > err_fail: > - xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); > - > + gt_reset_failed(gt, err); > xe_device_declare_wedged(gt_to_xe(gt)); Also, we might want to have a RESET_REQUIRED event whenever device is wedged. Thanks, Aravind. > > return err; > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h > index 19619d4952a8..9ea3be97535e 100644 > --- a/include/uapi/drm/xe_drm.h > +++ b/include/uapi/drm/xe_drm.h > @@ -20,6 +20,7 @@ extern "C" { > * 2. Extension definition and helper structs > * 3. IOCTL's Query structs in the order of the Query's entries. > * 4. The rest of IOCTL structs in the order of IOCTL declaration. > + * 5. uEvents > */ > > /** > @@ -1686,6 +1687,22 @@ struct drm_xe_oa_stream_info { > __u64 reserved[3]; > }; > > +/** > + * DOC: uevent generated by xe on it's pci node. > + * > + * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset. > + * The REASON is provided along with the event for which reset is required. > + * On the basis of REASONS, additional information might be supplied. > + */ > +#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET" > + > +/** > + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to DRM_XE_RESET_REQUIRED_UEVENT > + * incase of gt reset failure. The additional information supplied is tile id and > + * gt id of the gt unit for which reset has failed. > + */ > +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED" > + > #if defined(__cplusplus) > } > #endif
On Tue, Aug 06, 2024 at 10:02:31AM GMT, Raag Jadav wrote: >From: Lucas De Marchi <lucas.demarchi@intel.com> > >Bring back uevent for gt reset failure with better uapi naming. >With this in place we can receive failure event using udev. > >$ udevadm monitor --property --kernel >monitor will print the received events for: >KERNEL - the kernel uevent > >KERNEL[871.188570] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci) >ACTION=change >DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 >SUBSYSTEM=pci >DEVICE_STATUS=NEEDS_RESET >REASON=GT_RESET_FAILED >TILE_ID=0 >GT_ID=0 >DRIVER=xe >PCI_CLASS=30000 >PCI_ID=8086:56B1 >PCI_SUBSYS_ID=8086:1210 >PCI_SLOT_NAME=0000:03:00.0 >MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00 >SEQNUM=6104 > >Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com> please drop my s-o-b here and don't add me as the author of this patch, which I certainly am not. You need to point to the commit where it was reverted and *why* it's ok to have this uapi now. 
Lucas De Marchi >Signed-off-by: Raag Jadav <raag.jadav@intel.com> >--- > drivers/gpu/drm/xe/xe_gt.c | 27 +++++++++++++++++++++++++-- > include/uapi/drm/xe_drm.h | 17 +++++++++++++++++ > 2 files changed, 42 insertions(+), 2 deletions(-) > >diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c >index b04e47186f5b..5ceef0059861 100644 >--- a/drivers/gpu/drm/xe/xe_gt.c >+++ b/drivers/gpu/drm/xe/xe_gt.c >@@ -740,6 +740,30 @@ static int do_gt_restart(struct xe_gt *gt) > return 0; > } > >+static void xe_uevent_gt_reset_failure(struct pci_dev *pdev, u8 tile_id, u8 gt_id) >+{ >+ char *reset_event[5]; >+ >+ reset_event[0] = DRM_XE_RESET_REQUIRED_UEVENT; >+ reset_event[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT; >+ reset_event[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", tile_id); >+ reset_event[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt_id); >+ reset_event[4] = NULL; >+ kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, reset_event); >+ >+ kfree(reset_event[2]); >+ kfree(reset_event[3]); >+} >+ >+static void gt_reset_failed(struct xe_gt *gt, int err) >+{ >+ xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); >+ >+ /* Notify userspace about gt reset failure */ >+ xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev), >+ gt_to_tile(gt)->id, gt->info.id); >+} >+ > static int gt_reset(struct xe_gt *gt) > { > int err; >@@ -795,8 +819,7 @@ static int gt_reset(struct xe_gt *gt) > XE_WARN_ON(xe_uc_start(&gt->uc)); > xe_pm_runtime_put(gt_to_xe(gt)); > err_fail: >- xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); >- >+ gt_reset_failed(gt, err); > xe_device_declare_wedged(gt_to_xe(gt)); > > return err; >diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h >index 19619d4952a8..9ea3be97535e 100644 >--- a/include/uapi/drm/xe_drm.h >+++ b/include/uapi/drm/xe_drm.h >@@ -20,6 +20,7 @@ extern "C" { > * 2. Extension definition and helper structs > * 3. IOCTL's Query structs in the order of the Query's entries. > * 4. 
The rest of IOCTL structs in the order of IOCTL declaration. >+ * 5. uEvents > */ > > /** >@@ -1686,6 +1687,22 @@ struct drm_xe_oa_stream_info { > __u64 reserved[3]; > }; > >+/** >+ * DOC: uevent generated by xe on it's pci node. >+ * >+ * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset. >+ * The REASON is provided along with the event for which reset is required. >+ * On the basis of REASONS, additional information might be supplied. >+ */ >+#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET" >+ >+/** >+ * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to DRM_XE_RESET_REQUIRED_UEVENT >+ * incase of gt reset failure. The additional information supplied is tile id and >+ * gt id of the gt unit for which reset has failed. >+ */ >+#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED" >+ > #if defined(__cplusplus) > } > #endif >-- >2.34.1 >
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index b04e47186f5b..5ceef0059861 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -740,6 +740,30 @@ static int do_gt_restart(struct xe_gt *gt) return 0; } +static void xe_uevent_gt_reset_failure(struct pci_dev *pdev, u8 tile_id, u8 gt_id) +{ + char *reset_event[5]; + + reset_event[0] = DRM_XE_RESET_REQUIRED_UEVENT; + reset_event[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT; + reset_event[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", tile_id); + reset_event[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt_id); + reset_event[4] = NULL; + kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, reset_event); + + kfree(reset_event[2]); + kfree(reset_event[3]); +} + +static void gt_reset_failed(struct xe_gt *gt, int err) +{ + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); + + /* Notify userspace about gt reset failure */ + xe_uevent_gt_reset_failure(to_pci_dev(gt_to_xe(gt)->drm.dev), + gt_to_tile(gt)->id, gt->info.id); +} + static int gt_reset(struct xe_gt *gt) { int err; @@ -795,8 +819,7 @@ static int gt_reset(struct xe_gt *gt) XE_WARN_ON(xe_uc_start(&gt->uc)); xe_pm_runtime_put(gt_to_xe(gt)); err_fail: - xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); - + gt_reset_failed(gt, err); xe_device_declare_wedged(gt_to_xe(gt)); return err; diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 19619d4952a8..9ea3be97535e 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -20,6 +20,7 @@ extern "C" { * 2. Extension definition and helper structs * 3. IOCTL's Query structs in the order of the Query's entries. * 4. The rest of IOCTL structs in the order of IOCTL declaration. + * 5. uEvents */ /** @@ -1686,6 +1687,22 @@ struct drm_xe_oa_stream_info { __u64 reserved[3]; }; +/** + * DOC: uevent generated by xe on it's pci node. + * + * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset. 
+ * The REASON is provided along with the event for which reset is required. + * On the basis of REASONS, additional information might be supplied. + */ +#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET" + +/** + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to DRM_XE_RESET_REQUIRED_UEVENT + * incase of gt reset failure. The additional information supplied is tile id and + * gt id of the gt unit for which reset has failed. + */ +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED" + #if defined(__cplusplus) } #endif