Message ID | 20240419164720.1765-11-shiju.jose@huawei.com (mailing list archive) |
---|---|
State | RFC, archived |
Headers | show |
Series | ras: scrub: introduce subsystem + CXL/ACPI-RAS2 drivers | expand |
> +/* Context - lock must be held */ > +static int ras2_get_patrol_scrub_running(struct ras2_scrub_ctx *ras2_ctx, > + bool *running) > +{ > + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) > + ras2_ctx->pcc_subspace->pcc_comm_addr; > + int ret; > + > + if (ras2_ctx->bg) > + *running = true; > + > + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; > + ps_sm->params.patrol_scrub_command = RAS2_GET_PATROL_PARAMETERS; Need to reset the address range (base and size). A user may have previously called "Enable Background" where the code zeros out these parameters. ps_sm->params.requested_address_range[0] = ras2_ctx->base; ps_sm->params.requested_address_range[1] = ras2_ctx->size; > + > + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); > + if (ret) { > + dev_err(ras2_ctx->dev, "failed to read parameters\n"); > + return ret; > + } > + > + *running = ps_sm->params.flags & RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING; > + > + return 0; > +} > + > +static int ras2_hw_scrub_write_rate(struct device *dev, u64 rate) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + bool running; > + int ret; > + > + guard(mutex)(&ras2_ctx->lock); > + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); > + if (ret) > + return ret; > + > + if (running) > + return -EBUSY; I suggest we do not check if the patrol scrub is running when we are merely updating cached values. More importantly, if we had previously wrote an invalid value (that is only invalidated by firmware after executing a command), then when we try to write a correct value, this "ras2_get_patrol_scrub_running" check will always fail, therefore preventing us from correcting our error. > + > + if (rate < ras2_ctx->rate_min || rate > ras2_ctx->rate_max) > + return -EINVAL; > + > + ras2_ctx->rate = rate; > + > + return 0; > +} > + > +static int ras2_hw_scrub_read_rate(struct device *dev, u64 *rate) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + > + *rate = ras2_ctx->rate; > + > + return 0; > +} > + > +static int ras2_hw_scrub_read_rate_avail(struct device *dev, u64 *min, u64 *max) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + > + *min = ras2_ctx->rate_min; > + *max = ras2_ctx->rate_max; > + > + return 0; > +} > + > +static int ras2_hw_scrub_read_range(struct device *dev, u64 *base, u64 *size) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + > + *base = ras2_ctx->base; > + *size = ras2_ctx->size; > + > + return 0; > +} > + > +static int ras2_hw_scrub_write_range(struct device *dev, u64 base, u64 size) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + bool running; > + int ret; > + > + guard(mutex)(&ras2_ctx->lock); > + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); > + if (ret) > + return ret; > + > + if (running) > + return -EBUSY; I suggest we do not check if the patrol scrub is running. See previous comment above. > + > + ras2_ctx->base = base; > + ras2_ctx->size = size; > + > + return 0; > +} > + > +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, bool enable) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) > + ras2_ctx->pcc_subspace->pcc_comm_addr; > + int ret; > + > + guard(mutex)(&ras2_ctx->lock); > + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; > + if (enable) { > + ps_sm->params.requested_address_range[0] = 0; > + ps_sm->params.requested_address_range[1] = 0; > + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_RATE_IN_MASK; > + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, > + ras2_ctx->rate); > + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER; > + } else { > + ps_sm->params.patrol_scrub_command = RAS2_STOP_PATROL_SCRUBBER; > + } > + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND; > + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, > + enable); > + > + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); > + if (ret) { > + dev_err(ras2_ctx->dev, "%s: failed to enable(%d) background scrubbing\n", > + __func__, enable); > + return ret; > + } > + ras2_ctx->bg = true; > + > + /* Update the cache to account for rounding of supplied parameters and similar */ > + return ras2_update_patrol_scrub_params_cache(ras2_ctx); > +} > + > +static int ras2_hw_scrub_get_enabled_bg(struct device *dev, bool *enabled) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + > + *enabled = ras2_ctx->bg; > + > + return 0; > +} > + > +static int ras2_hw_scrub_set_enabled_od(struct device *dev, bool enable) > +{ > + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); > + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) > + ras2_ctx->pcc_subspace->pcc_comm_addr; > + bool enabled; > + int ret; > + > + guard(mutex)(&ras2_ctx->lock); > + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; > + if (enable) { > + if (!ras2_ctx->size) { > + dev_warn(ras2_ctx->dev, > + "%s: Invalid requested address range, requested_address_range[0]=0x%llx " > + "requested_address_range[1]=0x%llx\n", __func__, > + ps_sm->params.requested_address_range[0], > + ps_sm->params.requested_address_range[1]); > + return -ERANGE; > + } > + ret = ras2_get_patrol_scrub_running(ras2_ctx, &enabled); > + if (ret) > + return ret; > + > + if (enabled) > + return 0; > + > + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_RATE_IN_MASK; > + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, > + ras2_ctx->rate); > + ps_sm->params.requested_address_range[0] = ras2_ctx->base; > + ps_sm->params.requested_address_range[1] = ras2_ctx->size; We need to clear the RAS2_PATROL_SCRUB_EN_BACKGROUND bit in the input parameters. This is in case "Enable Background" was previously called, and this bit was set. ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND; > + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER; > + } else { > + ps_sm->params.patrol_scrub_command = RAS2_STOP_PATROL_SCRUBBER; > + } > + > + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); > + if (ret) { > + dev_err(ras2_ctx->dev, "failed to enable(%d) the demand scrubbing\n", enable); > + return ret; > + } > + ras2_ctx->bg = false; > + > + return ras2_update_patrol_scrub_params_cache(ras2_ctx); > +}
Hi Daniel, Thanks for the feedback. >-----Original Message----- >From: Daniel Ferguson <danielf@os.amperecomputing.com> >Sent: 05 June 2024 22:33 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; >david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >tony.luck@intel.com; Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >rafael@kernel.org; lenb@kernel.org; naoya.horiguchi@nec.com; >james.morse@arm.com; jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com>; ira.weiny@intel.com; >vishal.l.verma@intel.com; alison.schofield@intel.com; dave.jiang@intel.com; >Jonathan Cameron <jonathan.cameron@huawei.com>; dave@stgolabs.net; >dan.j.williams@intel.com; linux-mm@kvack.org; linux-acpi@vger.kernel.org; >linux-cxl@vger.kernel.org >Subject: Re: [RFC PATCH v8 10/10] ras: scrub: ACPI RAS2: Add memory ACPI >RAS2 driver > >> +/* Context - lock must be held */ >> +static int ras2_get_patrol_scrub_running(struct ras2_scrub_ctx *ras2_ctx, >> + bool *running) >> +{ >> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >> + ras2_ctx->pcc_subspace- >>pcc_comm_addr; >> + int ret; >> + >> + if (ras2_ctx->bg) >> + *running = true; >> + >> + ps_sm->common.set_capabilities[0] = >RAS2_SUPPORT_HW_PARTOL_SCRUB; >> + ps_sm->params.patrol_scrub_command = >RAS2_GET_PATROL_PARAMETERS; > >Need to reset the address range (base and size). A user may have previously >called "Enable Background" where the code zeros out these parameters. > ps_sm->params.requested_address_range[0] = ras2_ctx->base; > ps_sm->params.requested_address_range[1] = ras2_ctx->size; The address range is being set to the above in the ras2_hw_scrub_set_enabled_od(), because they are valid for on-demand scrubbing only. However the ras2_ctx->base and ras2_ctx->size are set to the ras2_ctx->base = ps_sm->params.actual_address_range[0]; ras2_ctx->size = ps_sm->params.actual_address_range[1]; in the ras2_update_patrol_scrub_params_cache(), which is called after enabling bg scrub and on-demand scrub. Thus ras2_ctx->base and ras2_ctx->size may have a 0 or garbage value for bg scrub because address range is not valid for bg scrubbing as perc ACPI specification. I will add checks to retain the cached address range if bg scrub is enabled. > > >> + >> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >> + if (ret) { >> + dev_err(ras2_ctx->dev, "failed to read parameters\n"); >> + return ret; >> + } >> + >> + *running = ps_sm->params.flags & >> +RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_write_rate(struct device *dev, u64 rate) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + bool running; >> + int ret; >> + >> + guard(mutex)(&ras2_ctx->lock); >> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >> + if (ret) >> + return ret; >> + >> + if (running) >> + return -EBUSY; > > >I suggest we do not check if the patrol scrub is running when we are merely >updating cached values. More importantly, if we had previously wrote an invalid >value (that is only invalidated by firmware after executing a command), then >when we try to write a correct value, this "ras2_get_patrol_scrub_running" >check will always fail, therefore preventing us from correcting our error. In our opinion, write the rate and range etc, though updating the cached values, should be allowed only when the scrub is NOT running to avoid confusion thinking they are actually set in the running scrubber, when read them back in the userspace. > >> + >> + if (rate < ras2_ctx->rate_min || rate > ras2_ctx->rate_max) >> + return -EINVAL; >> + >> + ras2_ctx->rate = rate; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_read_rate(struct device *dev, u64 *rate) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + >> + *rate = ras2_ctx->rate; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_read_rate_avail(struct device *dev, u64 >> +*min, u64 *max) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + >> + *min = ras2_ctx->rate_min; >> + *max = ras2_ctx->rate_max; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_read_range(struct device *dev, u64 *base, >> +u64 *size) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + >> + *base = ras2_ctx->base; >> + *size = ras2_ctx->size; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_write_range(struct device *dev, u64 base, >> +u64 size) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + bool running; >> + int ret; >> + >> + guard(mutex)(&ras2_ctx->lock); >> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >> + if (ret) >> + return ret; >> + >> + if (running) >> + return -EBUSY; > >I suggest we do not check if the patrol scrub is running. See previous comment >above. Same as above. > >> + >> + ras2_ctx->base = base; >> + ras2_ctx->size = size; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, bool >> +enable) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >> + ras2_ctx->pcc_subspace- >>pcc_comm_addr; >> + int ret; >> + >> + guard(mutex)(&ras2_ctx->lock); >> + ps_sm->common.set_capabilities[0] = >RAS2_SUPPORT_HW_PARTOL_SCRUB; >> + if (enable) { >> + ps_sm->params.requested_address_range[0] = 0; >> + ps_sm->params.requested_address_range[1] = 0; >> + ps_sm->params.scrub_params_in &= >~RAS2_PATROL_SCRUB_RATE_IN_MASK; >> + ps_sm->params.scrub_params_in |= >FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >> + ras2_ctx->rate); >> + ps_sm->params.patrol_scrub_command = >RAS2_START_PATROL_SCRUBBER; >> + } else { >> + ps_sm->params.patrol_scrub_command = >RAS2_STOP_PATROL_SCRUBBER; >> + } >> + ps_sm->params.scrub_params_in &= >~RAS2_PATROL_SCRUB_EN_BACKGROUND; >> + ps_sm->params.scrub_params_in |= >FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, >> + enable); >> + >> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >> + if (ret) { >> + dev_err(ras2_ctx->dev, "%s: failed to enable(%d) background >scrubbing\n", >> + __func__, enable); >> + return ret; >> + } >> + ras2_ctx->bg = true; >> + >> + /* Update the cache to account for rounding of supplied parameters and >similar */ >> + return ras2_update_patrol_scrub_params_cache(ras2_ctx); >> +} >> + >> +static int ras2_hw_scrub_get_enabled_bg(struct device *dev, bool >> +*enabled) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + >> + *enabled = ras2_ctx->bg; >> + >> + return 0; >> +} >> + >> +static int ras2_hw_scrub_set_enabled_od(struct device *dev, bool >> +enable) { >> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >> + ras2_ctx->pcc_subspace- >>pcc_comm_addr; >> + bool enabled; >> + int ret; >> + >> + guard(mutex)(&ras2_ctx->lock); >> + ps_sm->common.set_capabilities[0] = >RAS2_SUPPORT_HW_PARTOL_SCRUB; >> + if (enable) { >> + if (!ras2_ctx->size) { >> + dev_warn(ras2_ctx->dev, >> + "%s: Invalid requested address range, >requested_address_range[0]=0x%llx " >> + "requested_address_range[1]=0x%llx\n", >__func__, >> + ps_sm->params.requested_address_range[0], >> + ps_sm->params.requested_address_range[1]); >> + return -ERANGE; >> + } >> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &enabled); >> + if (ret) >> + return ret; >> + >> + if (enabled) >> + return 0; >> + >> + ps_sm->params.scrub_params_in &= >~RAS2_PATROL_SCRUB_RATE_IN_MASK; >> + ps_sm->params.scrub_params_in |= >FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >> + ras2_ctx->rate); >> + ps_sm->params.requested_address_range[0] = ras2_ctx->base; >> + ps_sm->params.requested_address_range[1] = ras2_ctx->size; > > >We need to clear the RAS2_PATROL_SCRUB_EN_BACKGROUND bit in the input >parameters. >This is in case "Enable Background" was previously called, and this bit was set. > > ps_sm->params.scrub_params_in &= >~RAS2_PATROL_SCRUB_EN_BACKGROUND; We need to stop background scrub if it is already running before start an on-demand scrubbing. The RAS2_PATROL_SCRUB_EN_BACKGROUND bit would be cleared with disable bg scrub with the following code in ras2_hw_scrub_set_enabled_bg() when disable background scrub('enable' is 0 in this case). ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND; ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, enable); Hope it make sense? > > >> + ps_sm->params.patrol_scrub_command = >RAS2_START_PATROL_SCRUBBER; >> + } else { >> + ps_sm->params.patrol_scrub_command = >RAS2_STOP_PATROL_SCRUBBER; >> + } >> + >> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >> + if (ret) { >> + dev_err(ras2_ctx->dev, "failed to enable(%d) the demand >scrubbing\n", enable); >> + return ret; >> + } >> + ras2_ctx->bg = false; >> + >> + return ras2_update_patrol_scrub_params_cache(ras2_ctx); >> +} > > Thanks, Shiju
On 6/7/2024 8:46 AM, Shiju Jose wrote: > Hi Daniel, > > Thanks for the feedback. > >> -----Original Message----- >> From: Daniel Ferguson <danielf@os.amperecomputing.com> >> Sent: 05 June 2024 22:33 >> To: Shiju Jose <shiju.jose@huawei.com> >> Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; >> david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >> Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >> tony.luck@intel.com; Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >> rafael@kernel.org; lenb@kernel.org; naoya.horiguchi@nec.com; >> james.morse@arm.com; jthoughton@google.com; somasundaram.a@hpe.com; >> erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >> mike.malvestuto@intel.com; gthelen@google.com; >> wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >> wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei >> <tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >> kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >> Linuxarm <linuxarm@huawei.com>; ira.weiny@intel.com; >> vishal.l.verma@intel.com; alison.schofield@intel.com; dave.jiang@intel.com; >> Jonathan Cameron <jonathan.cameron@huawei.com>; dave@stgolabs.net; >> dan.j.williams@intel.com; linux-mm@kvack.org; linux-acpi@vger.kernel.org; >> linux-cxl@vger.kernel.org >> Subject: Re: [RFC PATCH v8 10/10] ras: scrub: ACPI RAS2: Add memory ACPI >> RAS2 driver >> >>> +/* Context - lock must be held */ >>> +static int ras2_get_patrol_scrub_running(struct ras2_scrub_ctx *ras2_ctx, >>> + bool *running) >>> +{ >>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>> + ras2_ctx->pcc_subspace- >>> pcc_comm_addr; >>> + int ret; >>> + >>> + if (ras2_ctx->bg) >>> + *running = true; >>> + >>> + ps_sm->common.set_capabilities[0] = >> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>> + ps_sm->params.patrol_scrub_command = >> RAS2_GET_PATROL_PARAMETERS; >> >> Need to reset the address range (base and size). A user may have previously >> called "Enable Background" where the code zeros out these parameters. >> ps_sm->params.requested_address_range[0] = ras2_ctx->base; >> ps_sm->params.requested_address_range[1] = ras2_ctx->size; > The address range is being set to the above in the ras2_hw_scrub_set_enabled_od(), because they are > valid for on-demand scrubbing only. > > However the ras2_ctx->base and ras2_ctx->size are set to the > ras2_ctx->base = ps_sm->params.actual_address_range[0]; > ras2_ctx->size = ps_sm->params.actual_address_range[1]; > in the ras2_update_patrol_scrub_params_cache(), which is called after enabling bg scrub and on-demand scrub. > Thus ras2_ctx->base and ras2_ctx->size may have a 0 or garbage value for bg scrub because address range is not valid for bg scrubbing as perc ACPI specification. I will add checks to retain the cached address range if bg scrub is enabled. >> >> >>> + >>> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >>> + if (ret) { >>> + dev_err(ras2_ctx->dev, "failed to read parameters\n"); >>> + return ret; >>> + } >>> + >>> + *running = ps_sm->params.flags & >>> +RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_write_rate(struct device *dev, u64 rate) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + bool running; >>> + int ret; >>> + >>> + guard(mutex)(&ras2_ctx->lock); >>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >>> + if (ret) >>> + return ret; >>> + >>> + if (running) >>> + return -EBUSY; >> >> >> I suggest we do not check if the patrol scrub is running when we are merely >> updating cached values. More importantly, if we had previously wrote an invalid >> value (that is only invalidated by firmware after executing a command), then >> when we try to write a correct value, this "ras2_get_patrol_scrub_running" >> check will always fail, therefore preventing us from correcting our error. > > In our opinion, write the rate and range etc, though updating the cached values, should be allowed only when the scrub is NOT running to avoid confusion thinking they are actually set in the running scrubber, when read them back in the userspace. It may be that I didn't explain myself properly last time. Let me try again. 1) This driver code does not currently check to see if an 'addr_range_base' is valid or not. Validation occurs in the platform firmware, when either GET_PATROL_PARAMETERS or START_PATROL_SCRUBBER is executed. If our platform firmware detects an invalid address, it raises an error. 2) Therefore, a user can specify an invalid address, and the user will not know that the address is invalid until after the cached parameters (used to check if the patrol scrubber is running) are written to. 3) Now, every time the user attempts to write a value to either base, size, or rate; the preceding call to ras2_get_patrol_scrub_running will result in an error, and the attempt to write a different value fails. To Conclude: If a user specifies an invalid address, the only way to correct the invalid address is to reboot or module reload. To me, that seems like a show-stopper. >> >>> + >>> + if (rate < ras2_ctx->rate_min || rate > ras2_ctx->rate_max) >>> + return -EINVAL; >>> + >>> + ras2_ctx->rate = rate; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_read_rate(struct device *dev, u64 *rate) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + >>> + *rate = ras2_ctx->rate; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_read_rate_avail(struct device *dev, u64 >>> +*min, u64 *max) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + >>> + *min = ras2_ctx->rate_min; >>> + *max = ras2_ctx->rate_max; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_read_range(struct device *dev, u64 *base, >>> +u64 *size) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + >>> + *base = ras2_ctx->base; >>> + *size = ras2_ctx->size; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_write_range(struct device *dev, u64 base, >>> +u64 size) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + bool running; >>> + int ret; >>> + >>> + guard(mutex)(&ras2_ctx->lock); >>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >>> + if (ret) >>> + return ret; >>> + >>> + if (running) >>> + return -EBUSY; >> >> I suggest we do not check if the patrol scrub is running. See previous comment >> above. > Same as above. > >> >>> + >>> + ras2_ctx->base = base; >>> + ras2_ctx->size = size; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, bool >>> +enable) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>> + ras2_ctx->pcc_subspace- >>> pcc_comm_addr; >>> + int ret; >>> + >>> + guard(mutex)(&ras2_ctx->lock); >>> + ps_sm->common.set_capabilities[0] = >> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>> + if (enable) { >>> + ps_sm->params.requested_address_range[0] = 0; >>> + ps_sm->params.requested_address_range[1] = 0; >>> + ps_sm->params.scrub_params_in &= >> ~RAS2_PATROL_SCRUB_RATE_IN_MASK; >>> + ps_sm->params.scrub_params_in |= >> FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >>> + ras2_ctx->rate); >>> + ps_sm->params.patrol_scrub_command = >> RAS2_START_PATROL_SCRUBBER; >>> + } else { >>> + ps_sm->params.patrol_scrub_command = >> RAS2_STOP_PATROL_SCRUBBER; >>> + } >>> + ps_sm->params.scrub_params_in &= >> ~RAS2_PATROL_SCRUB_EN_BACKGROUND; >>> + ps_sm->params.scrub_params_in |= >> FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, >>> + enable); >>> + >>> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >>> + if (ret) { >>> + dev_err(ras2_ctx->dev, "%s: failed to enable(%d) background >> scrubbing\n", >>> + __func__, enable); >>> + return ret; >>> + } >>> + ras2_ctx->bg = true; >>> + >>> + /* Update the cache to account for rounding of supplied parameters and >> similar */ >>> + return ras2_update_patrol_scrub_params_cache(ras2_ctx); >>> +} >>> + >>> +static int ras2_hw_scrub_get_enabled_bg(struct device *dev, bool >>> +*enabled) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + >>> + *enabled = ras2_ctx->bg; >>> + >>> + return 0; >>> +} >>> + >>> +static int ras2_hw_scrub_set_enabled_od(struct device *dev, bool >>> +enable) { >>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>> + ras2_ctx->pcc_subspace- >>> pcc_comm_addr; >>> + bool enabled; >>> + int ret; >>> + >>> + guard(mutex)(&ras2_ctx->lock); >>> + ps_sm->common.set_capabilities[0] = >> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>> + if (enable) { >>> + if (!ras2_ctx->size) { >>> + dev_warn(ras2_ctx->dev, >>> + "%s: Invalid requested address range, >> requested_address_range[0]=0x%llx " >>> + "requested_address_range[1]=0x%llx\n", >> __func__, >>> + ps_sm->params.requested_address_range[0], >>> + ps_sm->params.requested_address_range[1]); >>> + return -ERANGE; >>> + } >>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &enabled); >>> + if (ret) >>> + return ret; >>> + >>> + if (enabled) >>> + return 0; >>> + >>> + ps_sm->params.scrub_params_in &= >> ~RAS2_PATROL_SCRUB_RATE_IN_MASK; >>> + ps_sm->params.scrub_params_in |= >> FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >>> + ras2_ctx->rate); >>> + ps_sm->params.requested_address_range[0] = ras2_ctx->base; >>> + ps_sm->params.requested_address_range[1] = ras2_ctx->size; >> >> >> We need to clear the RAS2_PATROL_SCRUB_EN_BACKGROUND bit in the input >> parameters. >> This is in case "Enable Background" was previously called, and this bit was set. >> >> ps_sm->params.scrub_params_in &= >> ~RAS2_PATROL_SCRUB_EN_BACKGROUND; > We need to stop background scrub if it is already running before start an on-demand scrubbing. > The RAS2_PATROL_SCRUB_EN_BACKGROUND bit would be cleared with disable bg scrub > with the following code > in ras2_hw_scrub_set_enabled_bg() when disable background scrub('enable' is 0 in this case). > ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND; > ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, > enable); > Hope it make sense? Yes, this makes sense. But, on our platform, we automatically enable background when on-demand finishes(or is stopped). Similarly, if we enable on-demand then we automatically disable background. So, some sort of patrol is always on-going. The user is unable to turn them both off at the same time. Due to our implementation choices, this causes some weirdness with how the driver represents enable_background and enable_on_demand independently, since our scrubbers are not independent. I'm going to leave this conversation here for now, because on different platforms, maybe having independent control for background and on-demand is desired. >> >> >>> + ps_sm->params.patrol_scrub_command = >> RAS2_START_PATROL_SCRUBBER; >>> + } else { >>> + ps_sm->params.patrol_scrub_command = >> RAS2_STOP_PATROL_SCRUBBER; >>> + } >>> + >>> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >>> + if (ret) { >>> + dev_err(ras2_ctx->dev, "failed to enable(%d) the demand >> scrubbing\n", enable); >>> + return ret; >>> + } >>> + ras2_ctx->bg = false; >>> + >>> + return ras2_update_patrol_scrub_params_cache(ras2_ctx); >>> +} >> >> > Thanks, > Shiju
>-----Original Message----- >From: Daniel Ferguson <danielf@os.amperecomputing.com> >Sent: 21 June 2024 19:07 >To: Shiju Jose <shiju.jose@huawei.com> >Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; >david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >tony.luck@intel.com; Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >rafael@kernel.org; lenb@kernel.org; naoya.horiguchi@nec.com; >james.morse@arm.com; jthoughton@google.com; somasundaram.a@hpe.com; >erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >mike.malvestuto@intel.com; gthelen@google.com; >wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei ><tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >kangkang.shen@futurewei.com; wanghuiqiang <wanghuiqiang@huawei.com>; >Linuxarm <linuxarm@huawei.com>; ira.weiny@intel.com; >vishal.l.verma@intel.com; alison.schofield@intel.com; dave.jiang@intel.com; >Jonathan Cameron <jonathan.cameron@huawei.com>; dave@stgolabs.net; >dan.j.williams@intel.com; linux-mm@kvack.org; linux-acpi@vger.kernel.org; >linux-cxl@vger.kernel.org >Subject: Re: [RFC PATCH v8 10/10] ras: scrub: ACPI RAS2: Add memory ACPI >RAS2 driver > > > >On 6/7/2024 8:46 AM, Shiju Jose wrote: >> Hi Daniel, >> >> Thanks for the feedback. >> >>> -----Original Message----- >>> From: Daniel Ferguson <danielf@os.amperecomputing.com> >>> Sent: 05 June 2024 22:33 >>> To: Shiju Jose <shiju.jose@huawei.com> >>> Cc: linux-edac@vger.kernel.org; linux-kernel@vger.kernel.org; >>> david@redhat.com; Vilas.Sridharan@amd.com; leo.duran@amd.com; >>> Yazen.Ghannam@amd.com; rientjes@google.com; jiaqiyan@google.com; >>> tony.luck@intel.com; Jon.Grimm@amd.com; dave.hansen@linux.intel.com; >>> rafael@kernel.org; lenb@kernel.org; naoya.horiguchi@nec.com; >>> james.morse@arm.com; jthoughton@google.com; >somasundaram.a@hpe.com; >>> erdemaktas@google.com; pgonda@google.com; duenwen@google.com; >>> mike.malvestuto@intel.com; gthelen@google.com; >>> wschwartz@amperecomputing.com; dferguson@amperecomputing.com; >>> wbs@os.amperecomputing.com; nifan.cxl@gmail.com; tanxiaofei >>> <tanxiaofei@huawei.com>; Zengtao (B) <prime.zeng@hisilicon.com>; >>> kangkang.shen@futurewei.com; wanghuiqiang ><wanghuiqiang@huawei.com>; >>> Linuxarm <linuxarm@huawei.com>; ira.weiny@intel.com; >>> vishal.l.verma@intel.com; alison.schofield@intel.com; >>> dave.jiang@intel.com; Jonathan Cameron ><jonathan.cameron@huawei.com>; >>> dave@stgolabs.net; dan.j.williams@intel.com; linux-mm@kvack.org; >>> linux-acpi@vger.kernel.org; linux-cxl@vger.kernel.org >>> Subject: Re: [RFC PATCH v8 10/10] ras: scrub: ACPI RAS2: Add memory >>> ACPI >>> RAS2 driver >>> >>>> +/* Context - lock must be held */ >>>> +static int ras2_get_patrol_scrub_running(struct ras2_scrub_ctx *ras2_ctx, >>>> + bool *running) >>>> +{ >>>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>>> + ras2_ctx->pcc_subspace- >>>> pcc_comm_addr; >>>> + int ret; >>>> + >>>> + if (ras2_ctx->bg) >>>> + *running = true; >>>> + >>>> + ps_sm->common.set_capabilities[0] = >>> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>>> + ps_sm->params.patrol_scrub_command = >>> RAS2_GET_PATROL_PARAMETERS; >>> >>> Need to reset the address range (base and size). A user may have >>> previously called "Enable Background" where the code zeros out these >parameters. >>> ps_sm->params.requested_address_range[0] = ras2_ctx->base; >>> ps_sm->params.requested_address_range[1] = ras2_ctx->size; >> The address range is being set to the above in the >> ras2_hw_scrub_set_enabled_od(), because they are valid for on-demand >scrubbing only. >> >> However the ras2_ctx->base and ras2_ctx->size are set to the >> ras2_ctx->base = ps_sm->params.actual_address_range[0]; >> ras2_ctx->size = ps_sm->params.actual_address_range[1]; >> in the ras2_update_patrol_scrub_params_cache(), which is called after >enabling bg scrub and on-demand scrub. >> Thus ras2_ctx->base and ras2_ctx->size may have a 0 or garbage value for bg >scrub because address range is not valid for bg scrubbing as perc ACPI >specification. I will add checks to retain the cached address range if bg scrub is >enabled. >>> >>> >>>> + >>>> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >>>> + if (ret) { >>>> + dev_err(ras2_ctx->dev, "failed to read parameters\n"); >>>> + return ret; >>>> + } >>>> + >>>> + *running = ps_sm->params.flags & >>>> +RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_write_rate(struct device *dev, u64 rate) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + bool running; >>>> + int ret; >>>> + >>>> + guard(mutex)(&ras2_ctx->lock); >>>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + if (running) >>>> + return -EBUSY; >>> >>> >>> I suggest we do not check if the patrol scrub is running when we are >>> merely updating cached values. More importantly, if we had previously >>> wrote an invalid value (that is only invalidated by firmware after >>> executing a command), then when we try to write a correct value, this >"ras2_get_patrol_scrub_running" >>> check will always fail, therefore preventing us from correcting our error. >> >> In our opinion, write the rate and range etc, though updating the cached >values, should be allowed only when the scrub is NOT running to avoid confusion >thinking they are actually set in the running scrubber, when read them back in >the userspace. > > >It may be that I didn't explain myself properly last time. Let me try again. > >1) This driver code does not currently check to see if an 'addr_range_base' is >valid or not. Validation occurs in the platform firmware, when either >GET_PATROL_PARAMETERS or START_PATROL_SCRUBBER is executed. If our >platform firmware detects an invalid address, it raises an error. Thanks for giving more details. > >2) Therefore, a user can specify an invalid address, and the user will not know >that the address is invalid until after the cached parameters (used to check if the >patrol scrubber is running) are written to. > >3) Now, every time the user attempts to write a value to either base, size, or >rate; the preceding call to ras2_get_patrol_scrub_running will result in an error, >and the attempt to write a different value fails. Thanks for giving more details. 1. In write_range function, I added basic checks to make sure the address range base or size to set are non-zero and return error to the user if so. But unable to check those address parameters to set are in the supported address range of a scrub device because ACPI RAS2 Table 5.87: Parameter Block Structure for PATROL_SCRUB does have fields to publish supported address range for a patrol scrubber. The field "Actual Address Range (OUTPUT)" seems not suitable for this purpose because " The platform calculates the nearest patrol scrub boundary address from where it can start. This range should be a superset of the Requested Address Range." as described in the ACPI spec? In ras2_hw_scrub_write_rate() , we already have check for scrub rate to set is within supported min and max scrub rates, set by the firmware in the above table in response to GET_PATROL_PARAMETERS. 2. You mentioned that "If our platform firmware detects an invalid address, it raises an error." In this case my understanding is that the scrubbing should not have started and thus the kernel will get the scrub NOT running status and the user will be able to correct the address? > >To Conclude: >If a user specifies an invalid address, the only way to correct the invalid address >is to reboot or module reload. To me, that seems like a show-stopper. > >>> >>>> + >>>> + if (rate < ras2_ctx->rate_min || rate > ras2_ctx->rate_max) >>>> + return -EINVAL; >>>> + >>>> + ras2_ctx->rate = rate; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_read_rate(struct device *dev, u64 *rate) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + >>>> + *rate = ras2_ctx->rate; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_read_rate_avail(struct device *dev, u64 >>>> +*min, u64 *max) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + >>>> + *min = ras2_ctx->rate_min; >>>> + *max = ras2_ctx->rate_max; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_read_range(struct device *dev, u64 *base, >>>> +u64 *size) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + >>>> + *base = ras2_ctx->base; >>>> + *size = ras2_ctx->size; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_write_range(struct device *dev, u64 base, >>>> +u64 size) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + bool running; >>>> + int ret; >>>> + >>>> + guard(mutex)(&ras2_ctx->lock); >>>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + if (running) >>>> + return -EBUSY; >>> >>> I suggest we do not check if the patrol scrub is running. See >>> previous comment above. >> Same as above. >> >>> >>>> + >>>> + ras2_ctx->base = base; >>>> + ras2_ctx->size = size; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, bool >>>> +enable) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>>> + ras2_ctx->pcc_subspace- >>>> pcc_comm_addr; >>>> + int ret; >>>> + >>>> + guard(mutex)(&ras2_ctx->lock); >>>> + ps_sm->common.set_capabilities[0] = >>> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>>> + if (enable) { >>>> + ps_sm->params.requested_address_range[0] = 0; >>>> + ps_sm->params.requested_address_range[1] = 0; >>>> + ps_sm->params.scrub_params_in &= >>> ~RAS2_PATROL_SCRUB_RATE_IN_MASK; >>>> + ps_sm->params.scrub_params_in |= >>> FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >>>> + ras2_ctx->rate); >>>> + ps_sm->params.patrol_scrub_command = >>> RAS2_START_PATROL_SCRUBBER; >>>> + } else { >>>> + ps_sm->params.patrol_scrub_command = >>> RAS2_STOP_PATROL_SCRUBBER; >>>> + } >>>> + ps_sm->params.scrub_params_in &= >>> ~RAS2_PATROL_SCRUB_EN_BACKGROUND; >>>> + ps_sm->params.scrub_params_in |= >>> FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, >>>> + enable); >>>> + >>>> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); >>>> + if (ret) { >>>> + dev_err(ras2_ctx->dev, "%s: failed to enable(%d) background >>> scrubbing\n", >>>> + __func__, enable); >>>> + return ret; >>>> + } >>>> + ras2_ctx->bg = true; >>>> + >>>> + /* Update the cache to account for rounding of supplied parameters >>>> +and >>> similar */ >>>> + return ras2_update_patrol_scrub_params_cache(ras2_ctx); >>>> +} >>>> + >>>> +static int ras2_hw_scrub_get_enabled_bg(struct device *dev, bool >>>> +*enabled) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + >>>> + *enabled = ras2_ctx->bg; >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static int ras2_hw_scrub_set_enabled_od(struct device *dev, bool >>>> +enable) { >>>> + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); >>>> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) >>>> + ras2_ctx->pcc_subspace- >>>> pcc_comm_addr; >>>> + bool enabled; >>>> + int ret; >>>> + >>>> + guard(mutex)(&ras2_ctx->lock); >>>> + ps_sm->common.set_capabilities[0] = >>> RAS2_SUPPORT_HW_PARTOL_SCRUB; >>>> + if (enable) { >>>> + if (!ras2_ctx->size) { >>>> + dev_warn(ras2_ctx->dev, >>>> + "%s: Invalid requested address range, >>> requested_address_range[0]=0x%llx " >>>> + "requested_address_range[1]=0x%llx\n", >>> __func__, >>>> + ps_sm->params.requested_address_range[0], >>>> + ps_sm->params.requested_address_range[1]); >>>> + return -ERANGE; >>>> + } >>>> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &enabled); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + if (enabled) >>>> + return 0; >>>> + >>>> + ps_sm->params.scrub_params_in &= >>> ~RAS2_PATROL_SCRUB_RATE_IN_MASK; >>>> + ps_sm->params.scrub_params_in |= >>> FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, >>>> + ras2_ctx->rate); >>>> + ps_sm->params.requested_address_range[0] = ras2_ctx->base; >>>> + ps_sm->params.requested_address_range[1] = ras2_ctx->size; >>> >>> >>> We need to clear the RAS2_PATROL_SCRUB_EN_BACKGROUND bit in the >input >>> parameters. >>> This is in case "Enable Background" was previously called, and this bit was >set. >>> >>> ps_sm->params.scrub_params_in &= >>> ~RAS2_PATROL_SCRUB_EN_BACKGROUND; >> We need to stop background scrub if it is already running before start an on- >demand scrubbing. >> The RAS2_PATROL_SCRUB_EN_BACKGROUND bit would be cleared with >disable >> bg scrub with the following code in ras2_hw_scrub_set_enabled_bg() >> when disable background scrub('enable' is 0 in this case). >> ps_sm->params.scrub_params_in &= >~RAS2_PATROL_SCRUB_EN_BACKGROUND; >> ps_sm->params.scrub_params_in |= >FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, >> enable); >> Hope it make sense? > > >Yes, this makes sense. But, on our platform, we automatically enable >background when on-demand finishes(or is stopped). Similarly, if we enable on- >demand then we automatically disable background. So, some sort of patrol is >always on-going. The user is unable to turn them both off at the same time. > >Due to our implementation choices, this causes some weirdness with how the >driver represents enable_background and enable_on_demand independently, >since our scrubbers are not independent. > >I'm going to leave this conversation here for now, because on different >platforms, maybe having independent control for background and on-demand is >desired. Ok. I added clearing the RAS2_PATROL_SCRUB_EN_BACKGROUND bit in the input parameters in ras2_hw_scrub_set_enabled_od () as you requested so that it will work in your platforms. > >>> >>> >>>> + ps_sm->params.patrol_scrub_command = >>> RAS2_START_PATROL_SCRUBBER; Thanks, Shiju
diff --git a/Documentation/scrub/scrub-configure.rst b/Documentation/scrub/scrub-configure.rst index 2275366b60d3..7a1bf87bc6d7 100644 --- a/Documentation/scrub/scrub-configure.rst +++ b/Documentation/scrub/scrub-configure.rst @@ -50,3 +50,36 @@ The usage takes the form shown in this example:: # echo 0 > /sys/class/ras/ras0/scrub/enable_background # cat /sys/class/ras/ras0/scrub/enable_background # 0 + +2. RAS2 +2.1 On demand scrubbing for a specific memory region. + # echo 0x120000 > /sys/class/ras/ras1/scrub/addr_range_base + # echo 0x150000 > /sys/class/ras/ras1/scrub/addr_range_size + # cat /sys/class/ras/ras1/scrub/rate_available + # 0x1-0x18 + # echo 20 > /sys/class/ras/ras1/scrub/rate + # echo 1 > /sys/class/ras/ras1/scrub/enable_on_demand + # cat /sys/class/ras/ras1/scrub/enable_on_demand + # 1 + # cat /sys/class/ras/ras1/scrub/rate + # 0x14 + # cat /sys/class/ras/ras1/scrub/addr_range_base + # 0x120000 + # cat /sys/class/ras/ras1/scrub/addr_range_size + # 0x150000 + # echo 0 > /sys/class/ras/ras1/scrub/enable_on_demand + # cat /sys/class/ras/ras1/scrub/enable_on_demand + # 0 + +2.2 Background scrubbing the entire memory + # cat /sys/class/ras/ras1/scrub/rate_available + # 0x1-0x18 + # echo 3 > /sys/class/ras/ras1/scrub/rate + # echo 1 > /sys/class/ras/ras1/scrub/enable_background + # cat /sys/class/ras/ras1/scrub/enable_background + # 1 + # cat /sys/class/ras/ras1/scrub/rate + # 0x3 + # echo 0 > /sys/class/ras/ras1/scrub/enable_background + # cat /sys/class/ras/ras1/scrub/enable_background + # 0 diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig index 181701479564..57c346dfc01f 100644 --- a/drivers/ras/Kconfig +++ b/drivers/ras/Kconfig @@ -53,4 +53,14 @@ config SCRUB configuring the parameters of underlying scrubbers in the system for the DRAM memories. +config MEM_ACPI_RAS2 + tristate "Memory ACPI RAS2 driver" + depends on ACPI_RAS2 + depends on SCRUB + help + The driver binds to the platform device added by the ACPI RAS2 + table parser. Use a PCC channel subspace for communicating with + the ACPI compliant platform to provide control of memory scrub + parameters via the scrub subsystem. + endif diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index 89bcf0d84355..48339fee1cb3 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_RAS) += ras.o obj-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_RAS_CEC) += cec.o obj-$(CONFIG_SCRUB) += memory_scrub.o +obj-$(CONFIG_MEM_ACPI_RAS2) += acpi_ras2.o obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o obj-y += amd/atl/ diff --git a/drivers/ras/acpi_ras2.c b/drivers/ras/acpi_ras2.c new file mode 100644 index 000000000000..b3e9b61367bb --- /dev/null +++ b/drivers/ras/acpi_ras2.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ACPI RAS2 memory driver + * + * Copyright (c) 2024 HiSilicon Limited. + * + */ + +#define pr_fmt(fmt) "MEMORY ACPI RAS2: " fmt + +#include <linux/memory_scrub.h> +#include <linux/platform_device.h> +#include <acpi/ras2_acpi.h> + +#define RAS2_SUPPORT_HW_PARTOL_SCRUB BIT(0) +#define RAS2_TYPE_PATROL_SCRUB 0x0000 + +#define RAS2_GET_PATROL_PARAMETERS 0x01 +#define RAS2_START_PATROL_SCRUBBER 0x02 +#define RAS2_STOP_PATROL_SCRUBBER 0x03 + +#define RAS2_PATROL_SCRUB_RATE_IN_MASK GENMASK(15, 8) +#define RAS2_PATROL_SCRUB_EN_BACKGROUND BIT(0) +#define RAS2_PATROL_SCRUB_RATE_OUT_MASK GENMASK(7, 0) +#define RAS2_PATROL_SCRUB_MIN_RATE_OUT_MASK GENMASK(15, 8) +#define RAS2_PATROL_SCRUB_MAX_RATE_OUT_MASK GENMASK(23, 16) +#define RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING BIT(0) + +struct acpi_ras2_ps_shared_mem { + struct acpi_ras2_shared_memory common; + struct acpi_ras2_patrol_scrub_parameter params; +}; + +static int ras2_is_patrol_scrub_support(struct ras2_scrub_ctx *ras2_ctx) +{ + struct acpi_ras2_shared_memory __iomem *common = (void *) + ras2_ctx->pcc_subspace->pcc_comm_addr; + + guard(mutex)(&ras2_ctx->lock); + common->set_capabilities[0] = 0; + + return common->features[0] & RAS2_SUPPORT_HW_PARTOL_SCRUB; +} + +static int ras2_update_patrol_scrub_params_cache(struct ras2_scrub_ctx *ras2_ctx) +{ + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) + ras2_ctx->pcc_subspace->pcc_comm_addr; + int ret; + + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; + ps_sm->params.patrol_scrub_command = RAS2_GET_PATROL_PARAMETERS; + + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); + if (ret) { + dev_err(ras2_ctx->dev, "failed to read parameters\n"); + return ret; + } + + ras2_ctx->rate_min = FIELD_GET(RAS2_PATROL_SCRUB_MIN_RATE_OUT_MASK, + ps_sm->params.scrub_params_out); + ras2_ctx->rate_max = FIELD_GET(RAS2_PATROL_SCRUB_MAX_RATE_OUT_MASK, + ps_sm->params.scrub_params_out); + ras2_ctx->base = ps_sm->params.actual_address_range[0]; + ras2_ctx->size = ps_sm->params.actual_address_range[1]; + ras2_ctx->rate = FIELD_GET(RAS2_PATROL_SCRUB_RATE_OUT_MASK, + ps_sm->params.scrub_params_out); + return 0; +} + +/* Context - lock must be held */ +static int ras2_get_patrol_scrub_running(struct ras2_scrub_ctx *ras2_ctx, + bool *running) +{ + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) + ras2_ctx->pcc_subspace->pcc_comm_addr; + int ret; + + if (ras2_ctx->bg) + *running = true; + + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; + ps_sm->params.patrol_scrub_command = RAS2_GET_PATROL_PARAMETERS; + + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); + if (ret) { + dev_err(ras2_ctx->dev, "failed to read parameters\n"); + return ret; + } + + *running = ps_sm->params.flags & RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING; + + return 0; +} + +static int ras2_hw_scrub_write_rate(struct device *dev, u64 rate) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + bool running; + int ret; + + guard(mutex)(&ras2_ctx->lock); + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); + if (ret) + return ret; + + if (running) + return -EBUSY; + + if (rate < ras2_ctx->rate_min || rate > ras2_ctx->rate_max) + return -EINVAL; + + ras2_ctx->rate = rate; + + return 0; +} + +static int ras2_hw_scrub_read_rate(struct device *dev, u64 *rate) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + *rate = ras2_ctx->rate; + + return 0; +} + +static int ras2_hw_scrub_read_rate_avail(struct device *dev, u64 *min, u64 *max) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + *min = ras2_ctx->rate_min; + *max = ras2_ctx->rate_max; + + return 0; +} + +static int ras2_hw_scrub_read_range(struct device *dev, u64 *base, u64 *size) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + *base = ras2_ctx->base; + *size = ras2_ctx->size; + + return 0; +} + +static int ras2_hw_scrub_write_range(struct device *dev, u64 base, u64 size) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + bool running; + int ret; + + guard(mutex)(&ras2_ctx->lock); + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running); + if (ret) + return ret; + + if (running) + return -EBUSY; + + ras2_ctx->base = base; + ras2_ctx->size = size; + + return 0; +} + +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, bool enable) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) + ras2_ctx->pcc_subspace->pcc_comm_addr; + int ret; + + guard(mutex)(&ras2_ctx->lock); + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; + if (enable) { + ps_sm->params.requested_address_range[0] = 0; + ps_sm->params.requested_address_range[1] = 0; + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_RATE_IN_MASK; + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, + ras2_ctx->rate); + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER; + } else { + ps_sm->params.patrol_scrub_command = RAS2_STOP_PATROL_SCRUBBER; + } + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND; + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND, + enable); + + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); + if (ret) { + dev_err(ras2_ctx->dev, "%s: failed to enable(%d) background scrubbing\n", + __func__, enable); + return ret; + } + ras2_ctx->bg = true; + + /* Update the cache to account for rounding of supplied parameters and similar */ + return ras2_update_patrol_scrub_params_cache(ras2_ctx); +} + +static int ras2_hw_scrub_get_enabled_bg(struct device *dev, bool *enabled) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + *enabled = ras2_ctx->bg; + + return 0; +} + +static int ras2_hw_scrub_set_enabled_od(struct device *dev, bool enable) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *) + ras2_ctx->pcc_subspace->pcc_comm_addr; + bool enabled; + int ret; + + guard(mutex)(&ras2_ctx->lock); + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB; + if (enable) { + if (!ras2_ctx->size) { + dev_warn(ras2_ctx->dev, + "%s: Invalid requested address range, requested_address_range[0]=0x%llx " + "requested_address_range[1]=0x%llx\n", __func__, + ps_sm->params.requested_address_range[0], + ps_sm->params.requested_address_range[1]); + return -ERANGE; + } + ret = ras2_get_patrol_scrub_running(ras2_ctx, &enabled); + if (ret) + return ret; + + if (enabled) + return 0; + + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_RATE_IN_MASK; + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_RATE_IN_MASK, + ras2_ctx->rate); + ps_sm->params.requested_address_range[0] = ras2_ctx->base; + ps_sm->params.requested_address_range[1] = ras2_ctx->size; + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER; + } else { + ps_sm->params.patrol_scrub_command = RAS2_STOP_PATROL_SCRUBBER; + } + + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC); + if (ret) { + dev_err(ras2_ctx->dev, "failed to enable(%d) the demand scrubbing\n", enable); + return ret; + } + ras2_ctx->bg = false; + + return ras2_update_patrol_scrub_params_cache(ras2_ctx); +} + +static int ras2_hw_scrub_get_enabled_od(struct device *dev, bool *enabled) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + guard(mutex)(&ras2_ctx->lock); + if (ras2_ctx->bg) { + *enabled = false; + return 0; + } + + return ras2_get_patrol_scrub_running(ras2_ctx, enabled); +} + +static int ras2_hw_scrub_get_name(struct device *dev, char *name) +{ + struct ras2_scrub_ctx *ras2_ctx = dev_get_drvdata(dev); + + return sysfs_emit(name, "ras2_scrub%d\n", ras2_ctx->id); +} + +static const struct scrub_ops ras2_scrub_ops = { + .read_range = ras2_hw_scrub_read_range, + .write_range = ras2_hw_scrub_write_range, + .get_enabled_bg = ras2_hw_scrub_get_enabled_bg, + .set_enabled_bg = ras2_hw_scrub_set_enabled_bg, + .get_enabled_od = ras2_hw_scrub_get_enabled_od, + .set_enabled_od = ras2_hw_scrub_set_enabled_od, + .get_name = ras2_hw_scrub_get_name, + .rate_avail_range = ras2_hw_scrub_read_rate_avail, + .rate_read = ras2_hw_scrub_read_rate, + .rate_write = ras2_hw_scrub_write_rate, +}; + +static DEFINE_IDA(ras2_ida); + +static void ida_release(void *ctx) +{ + struct ras2_scrub_ctx *ras2_ctx = ctx; + + ida_free(&ras2_ida, ras2_ctx->id); +} + +static int ras2_probe(struct platform_device *pdev) +{ + struct ras2_scrub_ctx *ras2_ctx; + struct device *hw_scrub_dev; + int ret, id; + + /* RAS2 PCC Channel and Scrub specific context */ + ras2_ctx = devm_kzalloc(&pdev->dev, sizeof(*ras2_ctx), GFP_KERNEL); + if (!ras2_ctx) + return -ENOMEM; + + ras2_ctx->dev = &pdev->dev; + mutex_init(&ras2_ctx->lock); + + ret = devm_ras2_register_pcc_channel(&pdev->dev, ras2_ctx, + *((int *)dev_get_platdata(&pdev->dev))); + if (ret < 0) { + dev_dbg(ras2_ctx->dev, + "failed to register pcc channel ret=%d\n", ret); + return ret; + } + if (!ras2_is_patrol_scrub_support(ras2_ctx)) + return -EOPNOTSUPP; + + ret = ras2_update_patrol_scrub_params_cache(ras2_ctx); + if (ret) + return ret; + + id = ida_alloc(&ras2_ida, GFP_KERNEL); + if (id < 0) + return id; + + ras2_ctx->id = id; + + ret = devm_add_action_or_reset(&pdev->dev, ida_release, ras2_ctx); + if (ret < 0) + return ret; + + hw_scrub_dev = devm_scrub_device_register(&pdev->dev, ras2_ctx, &ras2_scrub_ops); + if (IS_ERR(hw_scrub_dev)) + return PTR_ERR(hw_scrub_dev); + + ras2_ctx->scrub_dev = hw_scrub_dev; + + return 0; +} + +static const struct platform_device_id ras2_id_table[] = { + { .name = "acpi_ras2", }, + { } +}; +MODULE_DEVICE_TABLE(platform, ras2_id_table); + +static struct platform_driver ras2_driver = { + .probe = ras2_probe, + .driver = { + .name = "acpi_ras2", + }, + .id_table = ras2_id_table, +}; +module_driver(ras2_driver, platform_driver_register, platform_driver_unregister); + +MODULE_IMPORT_NS(ACPI_RAS2); +MODULE_DESCRIPTION("ACPI RAS2 memory driver"); +MODULE_LICENSE("GPL");