
[3/3] mce: acpi/apei: trace: Enable ghes memory error trace event

Message ID 1375986471-27113-4-git-send-email-naveen.n.rao@linux.vnet.ibm.com (mailing list archive)
State Not Applicable, archived

Commit Message

Naveen N. Rao Aug. 8, 2013, 6:27 p.m. UTC
Enable memory error trace event in cper.c

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 drivers/acpi/apei/cper.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

Comments

Mauro Carvalho Chehab Aug. 8, 2013, 7:38 p.m. UTC | #1
Em Thu, 08 Aug 2013 23:57:51 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> Enable memory error trace event in cper.c

Why do we need to do that? Memory error events are already handled
via edac_ghes module, in a standard way that allows the same
tracing to be used whether the error is reported by the BIOS or by a
direct hardware error reporting mechanism.

Adding an extra trace event for the same thing here will just make
userspace more complex, and will create duplicate error events
for the very same error.

Ok, if the interface there is poor, let's improve it, but adding
yet another interface to report the same error doesn't sound
reasonable to me.

Regards,
Mauro
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  drivers/acpi/apei/cper.c | 21 ++++++++++++++++-----
>  1 file changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
> index 33dc6a0..19a9c0b 100644
> --- a/drivers/acpi/apei/cper.c
> +++ b/drivers/acpi/apei/cper.c
> @@ -32,6 +32,10 @@
>  #include <linux/pci.h>
>  #include <linux/aer.h>
>  
> +#define CREATE_TRACE_POINTS
> +#define TRACE_EVENT_GHES
> +#include <trace/events/ras.h>
> +
>  /*
>   * CPER record ID need to be unique even after reboot, because record
>   * ID is used as index for ERST storage, while CPER records from
> @@ -193,8 +197,13 @@ static const char *cper_mem_err_type_strs[] = {
>  	"scrub uncorrected error",
>  };
>  
> -static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
> +static void cper_print_mem(const char *pfx,
> +		const struct acpi_hest_generic_status *estatus,
> +		const struct acpi_hest_generic_data *gdata,
> +		const struct cper_sec_mem_err *mem)
>  {
> +	trace_ghes_platform_memory_event(estatus, gdata, mem);
> +
>  	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
>  		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
>  	if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)
> @@ -292,8 +301,10 @@ static const char *apei_estatus_section_flag_strs[] = {
>  	"latent error",
>  };
>  
> -static void apei_estatus_print_section(
> -	const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
> +static void apei_estatus_print_section(const char *pfx,
> +		const struct acpi_hest_generic_status *estatus,
> +		const struct acpi_hest_generic_data *gdata,
> +		int sec_no)
>  {
>  	uuid_le *sec_type = (uuid_le *)gdata->section_type;
>  	__u16 severity;
> @@ -320,7 +331,7 @@ static void apei_estatus_print_section(
>  		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
>  		printk("%s""section_type: memory error\n", pfx);
>  		if (gdata->error_data_length >= sizeof(*mem_err))
> -			cper_print_mem(pfx, mem_err);
> +			cper_print_mem(pfx, estatus, gdata, mem_err);
>  		else
>  			goto err_section_too_small;
>  	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
> @@ -355,7 +366,7 @@ void apei_estatus_print(const char *pfx,
>  	gdata = (struct acpi_hest_generic_data *)(estatus + 1);
>  	while (data_len > sizeof(*gdata)) {
>  		gedata_len = gdata->error_data_length;
> -		apei_estatus_print_section(pfx, gdata, sec_no);
> +		apei_estatus_print_section(pfx, estatus, gdata, sec_no);
>  		data_len -= gedata_len + sizeof(*gdata);
>  		gdata = (void *)(gdata + 1) + gedata_len;
>  		sec_no++;
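
Patch 2/3, which declares the tracepoint called above, is not included in
this message. A minimal sketch of what such a declaration might look like,
assuming the event simply mirrors a few of the struct cper_sec_mem_err
fields (the actual patch may carry more, or different, fields):

/* Sketch only - the field selection here is an assumption, not the posted patch. */
TRACE_EVENT(ghes_platform_memory_event,

        TP_PROTO(const struct acpi_hest_generic_status *estatus,
                 const struct acpi_hest_generic_data *gdata,
                 const struct cper_sec_mem_err *mem),

        TP_ARGS(estatus, gdata, mem),

        TP_STRUCT__entry(
                __field(u32, error_severity)
                __field(u64, physical_addr)
                __field(u16, node)
                __field(u16, card)
                __field(u16, module)
                __field(u16, bank)
                __field(u16, device)
                __field(u8,  error_type)
        ),

        TP_fast_assign(
                __entry->error_severity = gdata->error_severity;
                __entry->physical_addr  = mem->physical_addr;
                __entry->node           = mem->node;
                __entry->card           = mem->card;
                __entry->module         = mem->module;
                __entry->bank           = mem->bank;
                __entry->device         = mem->device;
                __entry->error_type     = mem->error_type;
        ),

        TP_printk("severity %u addr 0x%016llx node %u card %u module %u bank %u device %u type %u",
                  __entry->error_severity,
                  (unsigned long long)__entry->physical_addr,
                  __entry->node, __entry->card, __entry->module,
                  __entry->bank, __entry->device, __entry->error_type)
);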
Borislav Petkov Aug. 10, 2013, 6:03 p.m. UTC | #2
On Thu, Aug 08, 2013 at 04:38:22PM -0300, Mauro Carvalho Chehab wrote:
> Em Thu, 08 Aug 2013 23:57:51 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
> 
> > Enable memory error trace event in cper.c
> 
> Why do we need to do that? Memory error events are already handled
> via edac_ghes module,

If APEI gives me all the required information in order to deal with the
hardware error - and it looks like it does - then the additional layer
of ghes_edac is not needed.
Mauro Carvalho Chehab Aug. 12, 2013, 11:33 a.m. UTC | #3
Em Sat, 10 Aug 2013 20:03:22 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Thu, Aug 08, 2013 at 04:38:22PM -0300, Mauro Carvalho Chehab wrote:
> > Em Thu, 08 Aug 2013 23:57:51 +0530
> > "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
> > 
> > > Enable memory error trace event in cper.c
> > 
> > Why do we need to do that? Memory error events are already handled
> > via edac_ghes module,
> 
> If APEI gives me all the required information in order to deal with the
> hardware error - and it looks like it does - then the additional layer
> of ghes_edac is not needed.

APEI is just the mechanism that collects the data, not the mechanism
that reports to userspace.

In the current implementation, APEI already reports those errors
via the ghes_edac driver. It also reports the very same error via MCE
(although the APEI interface to MCE is currently broken for everything
that is not Nehalem-EX, as it basically emulates the MCE log for
that specific architecture).

I really don't see any sense in adding yet another way to report the
very same error.

Regards,
Mauro
Borislav Petkov Aug. 12, 2013, 12:38 p.m. UTC | #4
On Mon, Aug 12, 2013 at 08:33:55AM -0300, Mauro Carvalho Chehab wrote:
> APEI is just the mechanism that collects the data, not the mechanism
> that reports to userspace.

Both methods add a tracepoint - no difference.

> I really don't see any sense on adding yet-another-way to report the
> very same error.

Well, your suggested way through the layers is this:

Hardware->APEI->EDAC.

His is

Hardware->APEI.

If I can lose the EDAC layer, then this is a clear win.
Naveen N. Rao Aug. 12, 2013, 12:41 p.m. UTC | #5
On 08/12/2013 05:03 PM, Mauro Carvalho Chehab wrote:
> Em Sat, 10 Aug 2013 20:03:22 +0200
> Borislav Petkov <bp@alien8.de> escreveu:
>
>> On Thu, Aug 08, 2013 at 04:38:22PM -0300, Mauro Carvalho Chehab wrote:
>>> Em Thu, 08 Aug 2013 23:57:51 +0530
>>> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
>>>
>>>> Enable memory error trace event in cper.c
>>>
>>> Why do we need to do that? Memory error events are already handled
>>> via edac_ghes module,
>>
>> If APEI gives me all the required information in order to deal with the
>> hardware error - and it looks like it does - then the additional layer
>> of ghes_edac is not needed.
>
> APEI is just the mechanism that collects the data, not the mechanism
> that reports to userspace.

I think what Boris is saying is that ghes_edac isn't adding anything
more here given what we get from the APEI structures. So, there doesn't seem
to be a need to add a dependency on EDAC for this purpose.

Further, ghes_edac seems to require EDAC_MM_EDAC to be compiled into the 
kernel (not a module). So, more dependencies.

>
> The current implementation is that APEI already reports those errors
> via ghes_edac driver. It also reports the very same error via MCE
> (although the APEI interface to MCE is currently broken for everything
> that it is not Nehalem-EX - as it basically emulates the MCE log for
> that specific architecture).

So, I looked at ghes_edac and it basically seems to boil down to
trace_mc_event. But this only seems to expose the APEI data as a string
and doesn't really make all the fields available to user-space
in a raw manner. I'm not sure how well this can be utilised by a user-space
tool. Do you have suggestions on how we can do this?

Thanks,
Naveen
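
A rough sketch (not a verbatim excerpt from drivers/edac/ghes_edac.c) of the
squashing being described: the CPER location fields get folded into one text
buffer, and that string - rather than the raw integer fields - is what
reaches userspace as the mc_event "driver detail":

/* Sketch only - buffer assumed large enough for the fields printed here. */
static void ghes_build_location_string(const struct cper_sec_mem_err *mem,
                                       char *buf)
{
        char *p = buf;

        *p = '\0';
        if (mem->validation_bits & CPER_MEM_VALID_NODE)
                p += sprintf(p, "node:%d ", mem->node);
        if (mem->validation_bits & CPER_MEM_VALID_CARD)
                p += sprintf(p, "card:%d ", mem->card);
        if (mem->validation_bits & CPER_MEM_VALID_MODULE)
                p += sprintf(p, "module:%d ", mem->module);
        if (mem->validation_bits & CPER_MEM_VALID_BANK)
                p += sprintf(p, "bank:%d ", mem->bank);
        if (mem->validation_bits & CPER_MEM_VALID_DEVICE)
                p += sprintf(p, "device:%d ", mem->device);
        /* ... row, column, bit position, requestor/responder IDs, etc. */
}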

Borislav Petkov Aug. 12, 2013, 12:53 p.m. UTC | #6
On Mon, Aug 12, 2013 at 06:11:49PM +0530, Naveen N. Rao wrote:
> So, I looked at ghes_edac and it basically seems to boil down to
> trace_mc_event. But, this only seems to expose the APEI data as a
> string and doesn't look to really make all the fields available to
> user-space in a raw manner. Not sure how well this can be utilised
> by a user-space tool.

Well, your tracepoint dumps the decoded memory error too. So all the
information we need is already there, without edac. Or am I missing some
bits?

That's why I'm saying we don't need the additional EDAC layer.
Mauro Carvalho Chehab Aug. 12, 2013, 2:44 p.m. UTC | #7
Em Mon, 12 Aug 2013 18:11:49 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/12/2013 05:03 PM, Mauro Carvalho Chehab wrote:
> > Em Sat, 10 Aug 2013 20:03:22 +0200
> > Borislav Petkov <bp@alien8.de> escreveu:
> >
> >> On Thu, Aug 08, 2013 at 04:38:22PM -0300, Mauro Carvalho Chehab wrote:
> >>> Em Thu, 08 Aug 2013 23:57:51 +0530
> >>> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
> >>>
> >>>> Enable memory error trace event in cper.c
> >>>
> >>> Why do we need to do that? Memory error events are already handled
> >>> via edac_ghes module,
> >>
> >> If APEI gives me all the required information in order to deal with the
> >> hardware error - and it looks like it does - then the additional layer
> >> of ghes_edac is not needed.
> >
> > APEI is just the mechanism that collects the data, not the mechanism
> > that reports to userspace.
> 
> I think what Boris is saying is that ghes_edac isn't adding anything 
> more here given what we get from APEI structures. So, there doesn't seem 
> to be a need to add dependency on edac for this purpose.
> 
> Further, ghes_edac seems to require EDAC_MM_EDAC to be compiled into the 
> kernel (not a module). So, more dependencies.
> 
> >
> > The current implementation is that APEI already reports those errors
> > via ghes_edac driver. It also reports the very same error via MCE
> > (although the APEI interface to MCE is currently broken for everything
> > that it is not Nehalem-EX - as it basically emulates the MCE log for
> > that specific architecture).
> 
> So, I looked at ghes_edac and it basically seems to boil down to 
> trace_mc_event. 

Yes. It also provides the sysfs nodes that describe the memory.

> But, this only seems to expose the APEI data as a string 
> and doesn't look to really make all the fields available to user-space 
> in a raw manner. Not sure how well this can be utilised by a user-space 
> tool. Do you have suggestions on how we can do this?

There's already a userspace tool that handles it:
	https://git.fedorahosted.org/cgit/rasdaemon.git/

What is missing there in the current version are the bits that would allow
translating the APEI way of reporting an error (memory node, card, module,
bank, device) into a DIMM label[1].

In the end, what really matters is being able to point to the DIMM(s)
in a way that lets the user replace them (e.g. using the silkscreen
labels on the system motherboard).

[1] It already does such a translation for the other EDAC drivers, via a
configuration file that provides the per-system mapping. Extending it to
also handle APEI errors shouldn't be hard.

> 
> Thanks,
> Naveen
>
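
For context, these are the location fields a CPER memory error section can
carry (struct cper_sec_mem_err from include/linux/cper.h around this time;
worth re-checking against the exact kernel tree in question):

struct cper_sec_mem_err {
        __u64   validation_bits;        /* which of the fields below are valid */
        __u64   error_status;
        __u64   physical_addr;
        __u64   physical_addr_mask;
        __u16   node;
        __u16   card;
        __u16   module;
        __u16   bank;
        __u16   device;
        __u16   row;
        __u16   column;
        __u16   bit_pos;
        __u64   requestor_id;
        __u64   responder_id;
        __u64   target_id;
        __u8    error_type;
};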
Mauro Carvalho Chehab Aug. 12, 2013, 2:49 p.m. UTC | #8
Em Mon, 12 Aug 2013 14:38:13 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Mon, Aug 12, 2013 at 08:33:55AM -0300, Mauro Carvalho Chehab wrote:
> > APEI is just the mechanism that collects the data, not the mechanism
> > that reports to userspace.
> 
> Both methods add a tracepoint - no difference.
> 
> > I really don't see any sense on adding yet-another-way to report the
> > very same error.
> 
> Well, your suggested way through the layers is this:
> 
> Hardware->APEI->EDAC.
> 
> His is
> 
> Hardware->APEI.
> 
> If I can lose the EDAC layer, then this is a clear win.

A clear win from what PoV? Userspace will need to decode a different type
of tracing, and implement different logic for APEI. So, it will be
reinventing the wheel, with a different trace format, and will require
userspace to implement handling for yet another trace event for the same
thing that EDAC already provides.

Also, if both ghes_edac and this new tracing are enabled, userspace will
receive the same event twice, as two traces will be emitted for the
same thing.

Worse than that, how is userspace supposed to merge those two events
into one?
>
Borislav Petkov Aug. 12, 2013, 3:04 p.m. UTC | #9
On Mon, Aug 12, 2013 at 11:49:32AM -0300, Mauro Carvalho Chehab wrote:
> Clear win from what PoV? Userspace will need to decode a different type
> of tracing, and implement a different logic for APEI.

There's no different type of tracing - it is the same info as in both
cases it comes from APEI. And if it can be done in the APEI layer, then
we don't need the next layer.

> Also, if both ghes_edac and this new tracing is enabled, userspace
> will receive twice the same event, as two traces will be received for
> the same thing.

We are, of course, going to have only one tracepoint which reports
memory errors, not two.
Mauro Carvalho Chehab Aug. 12, 2013, 5:25 p.m. UTC | #10
Em Mon, 12 Aug 2013 17:04:24 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Mon, Aug 12, 2013 at 11:49:32AM -0300, Mauro Carvalho Chehab wrote:
> > Clear win from what PoV? Userspace will need to decode a different type
> > of tracing, and implement a different logic for APEI.
> 
> There's no different type of tracing - it is the same info as in both
> cases it comes from APEI.

Well, patch 2/3 is defining a different type of tracing for memory errors,
instead of re-using the existing one.

> And if it can be done in the APEI layer, then
> we don't need the next layer.

Userspace still needs the EDAC sysfs, in order to identify how the memory
is organized, and do the proper memory labels association.

What edac_ghes does is to fill those sysfs nodes, and to call the
existing tracing to report errors.

> > Also, if both ghes_edac and this new tracing is enabled, userspace
> > will receive twice the same event, as two traces will be received for
> > the same thing.
> 
> We are, of course, going to have only one tracepoint which reports
> memory errors, not two.

Yes, that's my point.

Regards,
Mauro
Tony Luck Aug. 12, 2013, 5:54 p.m. UTC | #11
>> We are, of course, going to have only one tracepoint which reports
>> memory errors, not two.
>
> Yes, that's my point.

Is life that simple?

We have systems that have no EDAC driver (in some cases because the
architecture precludes one from ever being written, in others because
we either don't have the documentation, or because no driver has been
written yet).

Systems also have varying degrees of APEI support (from none at all, to
full support ... possibly we may have some with apparent support, but BIOS
provides bogus information ... assuming BIOS folks live up/down to our
usual expectations).

-Tony


Borislav Petkov Aug. 12, 2013, 5:56 p.m. UTC | #12
On Mon, Aug 12, 2013 at 02:25:57PM -0300, Mauro Carvalho Chehab wrote:
> Userspace still needs the EDAC sysfs, in order to identify how the
> memory is organized, and do the proper memory labels association.
>
> What edac_ghes does is to fill those sysfs nodes, and to call the
> existing tracing to report errors.

This is the only reason which justifies EDAC's existence. Naveen, can
your BIOS directly report the silkscreen label of the DIMM in error?
Generally, can any BIOS do that?

More specifically, what are those gdata_fru_id and gdata_fru_text
things?

Because if it can, then having the memory error tracepoint come directly
from APEI should be enough. The ghes_edac functionality could then be a
fallback for BIOSes which cannot report the silkscreen label, and in such a
case I can imagine keeping both tracepoints, but disabling one of the
two...
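
For reference, fru_id and fru_text are fields of the Generic Error Data
Entry that wraps each error section (struct acpi_hest_generic_data in
include/acpi/actbl1.h); per the UEFI CPER definition, fru_id is a GUID and
fru_text a free-form ASCII string, both only meaningful when flagged in
validation_bits:

struct acpi_hest_generic_data {
        u8      section_type[16];       /* UUID: memory error, PCIe, processor, ... */
        u32     error_severity;
        u16     revision;
        u8      validation_bits;        /* says whether fru_id/fru_text below are valid */
        u8      flags;
        u32     error_data_length;      /* length of the section that follows (gdata + 1) */
        u8      fru_id[16];             /* FRU identifier (a GUID per the UEFI spec) */
        u8      fru_text[20];           /* free-form FRU description from the firmware */
};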
Naveen N. Rao Aug. 13, 2013, 11:21 a.m. UTC | #13
On 08/12/2013 06:23 PM, Borislav Petkov wrote:
> On Mon, Aug 12, 2013 at 06:11:49PM +0530, Naveen N. Rao wrote:
>> So, I looked at ghes_edac and it basically seems to boil down to
>> trace_mc_event. But, this only seems to expose the APEI data as a
>> string and doesn't look to really make all the fields available to
>> user-space in a raw manner. Not sure how well this can be utilised
>> by a user-space tool.
>
> Well, your tracepoint dumps the decoded memory error too. So all the
> information we need is already there, without edac. Or am I missing some
> bits?
>
> Thus why I'm saying is that we don't need the additional edac layer.
>

You're right - my tracepoint makes all the data provided by APEI available
as-is to userspace. However, ghes_edac seems to squash some of this data
into a string when reporting through mc_event.

Regards,
Naveen

Naveen N. Rao Aug. 13, 2013, 11:36 a.m. UTC | #14
On 08/12/2013 11:26 PM, Borislav Petkov wrote:
> On Mon, Aug 12, 2013 at 02:25:57PM -0300, Mauro Carvalho Chehab wrote:
>> Userspace still needs the EDAC sysfs, in order to identify how the
>> memory is organized, and do the proper memory labels association.
>>
>> What edac_ghes does is to fill those sysfs nodes, and to call the
>> existing tracing to report errors.

I suppose you're referring to the entries under /sys/devices/system/edac/mc?

I'm not sure I understand how this helps. ghes_edac seems to just be
populating this based on DMI, which, if I'm not mistaken, can be obtained
in userspace (mcelog being an example).

Also, on my system, all DIMMs are being reported under mc0. I doubt if 
the labels there are accurate.

>
> This is the only reason which justifies EDAC's existence. Naveen, can
> your BIOS directly report the silkscreen label of the DIMM in error?
> Generally, can any BIOS do that?
>
> More specifically, what are those gdata_fru_id and gdata_fru_text
> things?

My understanding was that this provides the DIMM serial number, but I'm 
double checking just to be sure.

Thanks,
Naveen

>
> Because if it can, then having the memory error tracepoint come direct
> from APEI should be enough. The ghes_edac functionality could be then
> fallback for BIOSes which cannot report the silkscreen label and in such
> case I can imagine keeping both tracepoints, but disabling one of the
> two...
>

Naveen N. Rao Aug. 13, 2013, 11:41 a.m. UTC | #15
On 08/12/2013 08:14 PM, Mauro Carvalho Chehab wrote:
>> But, this only seems to expose the APEI data as a string
>> and doesn't look to really make all the fields available to user-space
>> in a raw manner. Not sure how well this can be utilised by a user-space
>> tool. Do you have suggestions on how we can do this?
>
> There's already an userspace tool that handes it:
> 	https://git.fedorahosted.org/cgit/rasdaemon.git/
>
> What is missing there on the current version is the bits that would allow
> to translate from APEI way to report an error (memory node, card, module,
> bank, device) into a DIMM label[1].

If I'm reading this right, all APEI data seems to be squashed into a 
string in mc_event. Also, the fru id/text don't seem to be passed to 
user-space.

- Naveen

Mauro Carvalho Chehab Aug. 13, 2013, 12:21 p.m. UTC | #16
Em Tue, 13 Aug 2013 17:06:14 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/12/2013 11:26 PM, Borislav Petkov wrote:
> > On Mon, Aug 12, 2013 at 02:25:57PM -0300, Mauro Carvalho Chehab wrote:
> >> Userspace still needs the EDAC sysfs, in order to identify how the
> >> memory is organized, and do the proper memory labels association.
> >>
> >> What edac_ghes does is to fill those sysfs nodes, and to call the
> >> existing tracing to report errors.
> 
> I suppose you're referring to the entries under /sys/devices/system/edac/mc?

Yes.

> 
> I'm not sure I understand how this helps. ghes_edac seems to just be 
> populating this based on dmi, which if I'm not mistaken, can be obtained 
> in userspace (mcelog as an example).
> 
> Also, on my system, all DIMMs are being reported under mc0. I doubt if 
> the labels there are accurate.

Yes, this is the current status of ghes_edac, where BIOS doesn't provide any
reliable way to associate a given APEI report to a physical DIMM slot label.

The plan is to add more logic there as BIOSes start to provide some reliable
way to do such association. I discussed this subject with a few vendors
while I was working at Red Hat.

> >
> > This is the only reason which justifies EDAC's existence. Naveen, can
> > your BIOS directly report the silkscreen label of the DIMM in error?
> > Generally, can any BIOS do that?
> >
> > More specifically, what are those gdata_fru_id and gdata_fru_text
> > things?
> 
> My understanding was that this provides the DIMM serial number, but I'm 
> double checking just to be sure.

If it provides the DIMM serial number, then it is possible to improve the
ghes_edac driver to associate them. One option could be to write an I2C
driver and dig that information directly from the memory modules, although
doing that could be risky, as the BIOS could also try to access the same I2C
buses.

> 
> Thanks,
> Naveen
> 
> >
> > Because if it can, then having the memory error tracepoint come direct
> > from APEI should be enough. The ghes_edac functionality could be then
> > fallback for BIOSes which cannot report the silkscreen label and in such
> > case I can imagine keeping both tracepoints, but disabling one of the
> > two...
> >
>
Borislav Petkov Aug. 13, 2013, 12:33 p.m. UTC | #17
On Tue, Aug 13, 2013 at 09:21:54AM -0300, Mauro Carvalho Chehab wrote:
> > > More specifically, what are those gdata_fru_id and gdata_fru_text
> > > things?
> > 
> > My understanding was that this provides the DIMM serial number, but I'm 
> > double checking just to be sure.

Hm, ok, then.

If this info is exported to userspace over i2c or DMI (I've seen
it sometimes in dmidecode output too) then the information in the
tracepoint can be used in conjunction with dmidecode output to map back
to the DIMM or so...
Mauro Carvalho Chehab Aug. 13, 2013, 12:41 p.m. UTC | #18
Em Tue, 13 Aug 2013 17:11:18 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/12/2013 08:14 PM, Mauro Carvalho Chehab wrote:
> >> But, this only seems to expose the APEI data as a string
> >> and doesn't look to really make all the fields available to user-space
> >> in a raw manner. Not sure how well this can be utilised by a user-space
> >> tool. Do you have suggestions on how we can do this?
> >
> > There's already an userspace tool that handes it:
> > 	https://git.fedorahosted.org/cgit/rasdaemon.git/
> >
> > What is missing there on the current version is the bits that would allow
> > to translate from APEI way to report an error (memory node, card, module,
> > bank, device) into a DIMM label[1].
> 
> If I'm reading this right, all APEI data seems to be squashed into a 
> string in mc_event.

Yes. We had lots of discussion about how to map memory errors over the
last couple of years. Basically, it was decided that the information that
could be decoded into a DIMM would be mapped as integers, and all other
driver-specific data would be added as strings.

In the tests I did, different machines/vendors fill the APEI data in
different ways, which makes it harder to associate the reports with a DIMM.

> Also, the fru id/text don't seem to be passed to user-space.

That's likely because, on the systems I tested, those fields were not
filled (or maybe they appeared in a later ACPI version). We should also add
them to the same string as the other fields there in ghes_edac.

Regards,
Mauro
Borislav Petkov Aug. 13, 2013, 12:42 p.m. UTC | #19
On Tue, Aug 13, 2013 at 04:51:33PM +0530, Naveen N. Rao wrote:
> You're right - my trace point makes all the data provided by apei
> as-is to userspace. However, ghes_edac seems to squash some of this
> data into a string when reporting through mc_event.

Right, for systems which don't need EDAC to decode to the DIMM or for
which there are no EDAC drivers written, they could use a tracepoint
which carries APEI info as-is. Others, which need EDAC, should probably
use trace_mc_event and disable the APEI tracepoint.

I think this should address Tony's concerns...

Btw, you could call your TP something simpler like
trace_ghes_memory_event or so.

Btw 2, if GHES can report other types of errors (I'm pretty sure it can)
maybe we can use a single tracepoint called trace_ghes_event for any
types of errors coming out of it...

Oh, and while at it, we probably need to start thinking of a mechanism
to disable all the error printing, i.e. cper_print_mem() and such,
if a userspace agent is listening in on the tracepoint and the error
information is carried through it to userspace.

Thanks.
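
One possible shape for such a mechanism, assuming a trace_<event>_enabled()
style helper is usable here (otherwise a module parameter could gate the
printk path instead); a sketch, not part of the posted series:

static void cper_print_mem(const char *pfx,
                const struct acpi_hest_generic_status *estatus,
                const struct acpi_hest_generic_data *gdata,
                const struct cper_sec_mem_err *mem)
{
        trace_ghes_platform_memory_event(estatus, gdata, mem);

        /* If a userspace consumer is attached, skip the console output. */
        if (trace_ghes_platform_memory_event_enabled())
                return;

        if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
                printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
        /* ... remaining printk()s as in the current cper_print_mem() ... */
}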
Naveen N. Rao Aug. 13, 2013, 4:55 p.m. UTC | #20
On 08/13/2013 05:51 PM, Mauro Carvalho Chehab wrote:
> Em Tue, 13 Aug 2013 17:06:14 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
>
>> On 08/12/2013 11:26 PM, Borislav Petkov wrote:
>>> On Mon, Aug 12, 2013 at 02:25:57PM -0300, Mauro Carvalho Chehab wrote:
>>>> Userspace still needs the EDAC sysfs, in order to identify how the
>>>> memory is organized, and do the proper memory labels association.
>>>>
>>>> What edac_ghes does is to fill those sysfs nodes, and to call the
>>>> existing tracing to report errors.
>>
>> I suppose you're referring to the entries under /sys/devices/system/edac/mc?
>
> Yes.
>
>>
>> I'm not sure I understand how this helps. ghes_edac seems to just be
>> populating this based on dmi, which if I'm not mistaken, can be obtained
>> in userspace (mcelog as an example).
>>
>> Also, on my system, all DIMMs are being reported under mc0. I doubt if
>> the labels there are accurate.
>
> Yes, this is the current status of ghes_edac, where BIOS doesn't provide any
> reliable way to associate a given APEI report to a physical DIMM slot label.
>
> The plan is to add more logic there as BIOSes start to provide some reliable
> way to do such association. I discussed this subject with a few vendors
> while I was working at Red Hat.

Hmm... is there anything specific in the APEI report that could help? 
More importantly, is there a need to do this in-kernel rather than in 
user-space?

Thanks,
Naveen

Naveen N. Rao Aug. 13, 2013, 5:17 p.m. UTC | #21
On 08/13/2013 06:11 PM, Mauro Carvalho Chehab wrote:
> Em Tue, 13 Aug 2013 17:11:18 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
>
>> On 08/12/2013 08:14 PM, Mauro Carvalho Chehab wrote:
>>>> But, this only seems to expose the APEI data as a string
>>>> and doesn't look to really make all the fields available to user-space
>>>> in a raw manner. Not sure how well this can be utilised by a user-space
>>>> tool. Do you have suggestions on how we can do this?
>>>
>>> There's already an userspace tool that handes it:
>>> 	https://git.fedorahosted.org/cgit/rasdaemon.git/
>>>
>>> What is missing there on the current version is the bits that would allow
>>> to translate from APEI way to report an error (memory node, card, module,
>>> bank, device) into a DIMM label[1].
>>
>> If I'm reading this right, all APEI data seems to be squashed into a
>> string in mc_event.
>
> Yes. We had lots of discussion about how to map memory errors over the
> last couple years. Basically, it was decided that the information that
> could be decoded into a DIMM to be mapped as integers, and all other
> driver-specific data to be added as strings.
>
> On the tests I did, different machines/vendors fill the APEI data on
> a different way, with makes harder to associate them to a DIMM.

Ok, so it looks like ghes_edac isn't quite useful yet.

In the meantime, like Boris suggests, I think we can have a different 
trace event for raw APEI reports - userspace can use it as it pleases.

Once ghes_edac gets better, users can decide whether they want raw APEI 
reports or the EDAC-processed version and choose one or the other trace 
event.

Regards,
Naveen

Naveen N. Rao Aug. 13, 2013, 5:32 p.m. UTC | #22
On 08/13/2013 06:12 PM, Borislav Petkov wrote:
> On Tue, Aug 13, 2013 at 04:51:33PM +0530, Naveen N. Rao wrote:
>> You're right - my trace point makes all the data provided by apei
>> as-is to userspace. However, ghes_edac seems to squash some of this
>> data into a string when reporting through mc_event.
>
> Right, for systems which don't need EDAC to decode to the DIMM or for
> which there are no EDAC drivers written, they could use a tracepoint
> which carries APEI info as-is. Others, which need EDAC, should probably
> use trace_mc_event and disable the APEI tracepoint.

If I'm not mistaken, even for systems that have EDAC drivers, it looks 
to me like EDAC can't really decode to the DIMM given what is provided 
by the bios in the APEI report currently. If and when ghes_edac gains 
this capability, users will have a choice between raw APEI reports vs. 
edac processed ones.

>
> I think this should address Tony's concerns...
>
> Btw, you could call your TP something simpler like
> trace_ghes_memory_event or so.

I started out with a simpler name, but eventually decided to use the 
name from the CPER record so it is clear what this event carries. I 
think this will be better when adding further ghes events for say, 
processor generic, PCIe and others.

>
> Btw 2, if GHES can report other types of errors (I'm pretty sure it can)
> maybe we can use a single tracepoint called trace_ghes_event for any
> types of errors coming out of it...

Two problems with this:
- One, the record size will be really big since the CPER records for
each type of error are large.
- Two, it may be better to filter events based on the type of error
(memory error, processor, PCIe, ...) rather than subscribing to all
ghes error reports.

>
> Oh, and while at it, we probably need to start thinking of a mechanism
> to disable all the error printing, i.e. cper_print_mem() and such,
> if a userspace agent is listening in on the tracepoint and the error
> information is carried through it to userspace.

Do you mean conditionally print the cper records based on whether the 
tracepoint is enabled or not? Wouldn't that be confusing if someone is 
monitoring dmesg as well?


Thanks,
Naveen

Tony Luck Aug. 13, 2013, 5:39 p.m. UTC | #23
> In the meantime, like Boris suggests, I think we can have a different 
> trace event for raw APEI reports - userspace can use it as it pleases.
>
> Once ghes_edac gets better, users can decide whether they want raw APEI 
> reports or the EDAC-processed version and choose one or the other trace 
> event.

It's cheap to add as many tracepoints as we like - but may be costly to maintain.
Especially if we have to tinker with them later to adjust which things are logged,
that puts a burden on user-space tools to be updated to adapt to the changing
API.

Mauro has written his user-space tool to process the ghes-edac events:
  git://git.fedorahosted.org/rasdaemon.git

Who is writing the user space tools to process the new apei tracepoints
you want to add?

I'm not opposed to these patches - just wondering who is taking the next step
to make them useful.

-Tony
Borislav Petkov Aug. 13, 2013, 5:58 p.m. UTC | #24
On Tue, Aug 13, 2013 at 11:02:08PM +0530, Naveen N. Rao wrote:
> If I'm not mistaken, even for systems that have EDAC drivers, it looks
> to me like EDAC can't really decode to the DIMM given what is provided
> by the bios in the APEI report currently. If and when ghes_edac gains
> this capability, users will have a choice between raw APEI reports vs.
> edac processed ones.

Which kinda makes that APEI tracepoint not really useful and we can call
the one we have already - trace_mc_event - from APEI...

> I started out with a simpler name, but eventually decided to use the
> name from the CPER record so it is clear what this event carries. I
> think this will be better when adding further ghes events for say,
> processor generic, PCIe and others.

This is exactly my fear: having to add a tracepoint per error type
instead of having a single trace_hw_error or so...

> >Btw 2, if GHES can report other types of errors (I'm pretty sure it can)
> >maybe we can use a single tracepoint called trace_ghes_event for any
> >types of errors coming out of it...
> 
> Two problems with this:
> - One, the record size will be really big since the cper records for
> each type of error is large.

I better go look at that CPER crap....

> - Two, it may be better to filter events based on the type of error
> (memory error, processor, pcie, ...) rather than subscribing for all
> ghes error reports.

You can filter that in userspace too.

> Do you mean conditionally print the cper records based on whether the
> tracepoint is enabled or not? Wouldn't that be confusing if someone is
> monitoring dmesg as well?

Why would you need dmesg if you get your hw errors over the tracepoint?

Thanks.
Tony Luck Aug. 13, 2013, 6:05 p.m. UTC | #25
> Why would you need dmesg if you get your hw errors over the tracepoint?


Redundancy is a good thing when talking about mission critical systems.  dmesg
may be feeding to a serial console to be logged and analysed on another system.

The tracepoint data goes to a process on the system experiencing the errors. If the
errors are serious (or a precursor to something serious) that process may never get
the chance to save the log.

-Tony
Borislav Petkov Aug. 13, 2013, 6:10 p.m. UTC | #26
On Tue, Aug 13, 2013 at 06:05:02PM +0000, Luck, Tony wrote:
> If the errors are serious (or a precursor to something serious) that
> process may never get the chance to save the log.

What about sending tracepoint data over serial and/or network? I agree
that dmesg over serial would be helpful but we need a similar sure-fire
way for carrying error info out.

Which reminds me, pstore could also be a good thing to use, in addition.
Only put error info there as it is limited anyway.
Tony Luck Aug. 13, 2013, 8:13 p.m. UTC | #27
> What about sending tracepoint data over serial and/or network? I agree
> that dmesg over serial would be helpful but we need a similar sure-fire
> way for carrying error info out.

Generic tracepoints are architected to be able to fire at very high rates and
log huge amounts of information.  So we'd need something special to say
just log these special tracepoints to network/serial.

> Which reminds me, pstore could also be a good thing to use, in addition.
> Only put error info there as it is limited anyway.

Yes - space is very limited.  I don't know how to assign priority for logging
the dmesg data vs. some error logs.

If we just "printk()" the most important parts - then that data will automatically
flow to the serial console and to pstore.  Then we have multiple paths for the
critical bits of the error log - and the tracepoints give us more details for the
cases where the machine doesn't spontaneously explode.

-Tony
Borislav Petkov Aug. 14, 2013, 5:43 a.m. UTC | #28
On Tue, Aug 13, 2013 at 08:13:56PM +0000, Luck, Tony wrote:
> Generic tracepoints are architected to be able to fire at very high
> rates and log huge amounts of information. So we'd need something
> special to say just log these special tracepoints to network/serial.
>
> > Which reminds me, pstore could also be a good thing to use, in addition.
> > Only put error info there as it is limited anyway.
> 
> Yes - space is very limited.  I don't know how to assign priority for logging
> the dmesg data vs. some error logs.

Didn't we say at some point, "log only the panic message which kills
the machine"?

However, we could probably use more of the messages before that
catastrophic event because they could give us hints about what led to
the panic, but in that case maybe a limited pstore is the wrong logging
medium.

Actually, I can imagine the full serial/network logs of "special"
tracepoints + dmesg to be the optimal thing.

> If we just "printk()" the most important parts - then that data will
> automatically flow to the serial console and to pstore.

Actually, does the pstore act like a circular buffer? Because if it
contains the last N relevant messages (for an arbitrary definition of
relevant) before the system dies, then that could be more helpful than only
the error messages.

And with the advent of UEFI, pretty much every system has a pstore. Too
bad that we have to limit it to 50% of size so that the boxes don't
brick. :-P

> Then we have multiple paths for the critical bits of the error log
> - and the tracepoints give us more details for the cases where the
> machine doesn't spontaneously explode.

Ok, let's sort:

* First we have the not-so-critical hw error messages. We want to carry
those out-of-band, i.e. not in dmesg so that people don't have to parse
and collect dmesg but have a specialized solution which gives them
structured logs and tools can analyze, collect and ... those errors.

* When a critical error happens, the above usage is not necessarily
advantageous anymore in the sense that, in order to debug what caused
the machine to crash, we don't necessarily want only the crash
message but also the whole system activity that led to it.

In which case, we probably actually want to turn off/ignore the error
logging tracepoints and write *only* to dmesg which goes out over serial
and to pstore. Right?

Because in such cases I want to have *all* *relevant* messages that led
to the explosion + the explosion message itself.

Makes sense? Yes, no? Aspects I've missed?

Thanks.
Naveen N. Rao Aug. 14, 2013, 10:47 a.m. UTC | #29
On 08/13/2013 11:09 PM, Luck, Tony wrote:
>> In the meantime, like Boris suggests, I think we can have a different
>> trace event for raw APEI reports - userspace can use it as it pleases.
>>
>> Once ghes_edac gets better, users can decide whether they want raw APEI
>> reports or the EDAC-processed version and choose one or the other trace
>> event.
>
> It's cheap to add as many tracepoints as we like - but may be costly to maintain.
> Especially if we have to tinker with them later to adjust which things are logged,
> that puts a burden on user-space tools to be updated to adapt to the changing
> API.

Agree. And this is the reason I have been considering mc_event. But, the 
below issues with ghes_edac made me unsure:
- One, the logging format for APEI data is a bit verbose and hard to 
parse. But, I suppose we could work with this if we make a few changes. 
Is it ok to change how the APEI data is made available through 
mc_event->driver_detail?
- Two, if ghes_edac is enabled, it prevents other edac drivers from 
being loaded. It looks like the assumption here is that if ghes/firmware 
first is enabled, then *all* memory errors are reported through ghes 
which is not true. We could have (a subset of) corrected errors reported 
through ghes, some through CMCI and uncorrected errors through MCE. So, 
if I'm not mistaken, if ghes_edac is enabled, we will only receive ghes 
error events through mc_event and not the others. Mauro, is this accurate?

>
> Mauro has written his user-space tool to process the ghes-edac events:
>    git://git.fedorahosted.org/rasdaemon.git
>
> Who is writing the user space tools to process the new apei tracepoints
> you want to add?

Enabling rasdaemon itself for the new tracepoint is an option, as long 
as Mauro doesn't object to it ;)

>
> I'm not opposed to these patches - just wondering who is taking the next step
> to make them useful.

Sure.


Regards,
Naveen

Naveen N. Rao Aug. 14, 2013, 10:57 a.m. UTC | #30
On 08/13/2013 11:28 PM, Borislav Petkov wrote:
> On Tue, Aug 13, 2013 at 11:02:08PM +0530, Naveen N. Rao wrote:
>> If I'm not mistaken, even for systems that have EDAC drivers, it looks
>> to me like EDAC can't really decode to the DIMM given what is provided
>> by the bios in the APEI report currently. If and when ghes_edac gains
>> this capability, users will have a choice between raw APEI reports vs.
>> edac processed ones.
>
> Which kinda makes that APEI tracepoint not really useful and we can call
> the one we have already - trace_mc_event - from APEI...

This looks like a nice option. Mauro, what do you think?


Thanks,
Naveen

Borislav Petkov Aug. 14, 2013, 12:18 p.m. UTC | #31
On Wed, Aug 14, 2013 at 04:17:26PM +0530, Naveen N. Rao wrote:
> - One, the logging format for APEI data is a bit verbose and hard
> to parse. But, I suppose we could work with this if we make a few
> changes. Is it ok to change how the APEI data is made available
> through mc_event->driver_detail?

How?

> - Two, if ghes_edac is enabled, it prevents other edac drivers
> from being loaded. It looks like the assumption here is that if
> ghes/firmware first is enabled, then *all* memory errors are reported
> through ghes which is not true. We could have (a subset of) corrected
> errors reported through ghes, some through CMCI and uncorrected errors
> through MCE. So, if I'm not mistaken, if ghes_edac is enabled, we will
> only receive ghes error events through mc_event and not the others.

The idea is to have a single tracepoint reporting memory errors. Where
you call it from shouldn't matter. Now, we have to make sure that we
don't report the same event more than once over different paths.
Tony Luck Aug. 14, 2013, 6:38 p.m. UTC | #32
> Didn't we say at some point, "log only the panic messsage which kills
> the machine"?

We've wandered around different strategies here. We definitely want
the panic log.  Some people want all other "kernel exit" logs (shutdown,
reboot, kexec).  When there is enough space in the pstore backend we
might also want the "oops" that preceded the panic.  (Of course when
the oops happens we don't know the future, so have to save it just in
case ... then if more "oops" happen we have to decide whether to keep
the old oops log, or save the newer one).

> However, we probably could use more the messages before that
> catastrophic event because they could give us hints about what lead to
> the panic but in that case maybe a limited pstore is the wrong logging
> medium.

Yes - longer logs are better.  Sad that the pstore backend devices are
measured in kilobytes :-)

> Actually, I can imagine the full serial/network logs of "special"
> tracepoints + dmesg to be the optimal thing.

If you guess the right "special" tracepoints to log - then yes.

> Actually, does the pstore act like a circular buffer? Because if it
> contains the last N relevant messages (for an arbitrary definition of
> relevant) before the system dies, then that could more helpful than only
> the error messages.

No - write speed for the persistent storage backing pstore (flash) means we
don't log as we go. We wait for a panic and then our registered function
gets called so we can snapshot what is in the console log at that point.
We also don't want to wear out the flash which may be soldered to the
motherboard.


> Ok, let's sort:
>
> * First we have the not-so-critical hw error messages. We want to carry
> those out-of-band, i.e. not in dmesg so that people don't have to parse
> and collect dmesg but have a specialized solution which gives them
> structured logs and tools can analyze, collect and ... those errors.

Agreed - we shouldn't clutter logs with details of corrected errors.
At most we should have a rate-limited log showing the count of corrected errors
so that someone who just watches dmesg knows they should go dig deeper
if they see some big number of corrected errors.

> * When a critical error happens, the above usage is not necessarily
> advantageous anymore in the sense that, in order to debug what caused
> the machine to crash, we don't simply necessarily want only the crash
> message but also the whole system activity that lead to it.

Yes.  There are people looking at various "flight recorder" modes for tracing
that keep logs of normal events in a circular buffer in RAM ... if these exist
they should be saved at crash time (and they are in the kexec/kdump path,
but I don’t know if anyone does anything in the non-kdump case).

> In which case, we probably actually want to turn off/ignore the error
> logging tracepoints and write *only* to dmesg which goes out over serial
> and to pstore. Right?

Tracepoints for errors that are going to lead to system crash would only be
useful together with a flight recorder to make sure they get saved. I think
tracepoints for corrected errors are better than dmesg logs.

> Because in such cases I want to have *all* *relevant* messages that lead
> to the explosion + the explosion message itself.

In a perfect world yes - I don't know that we can achieve perfection - but we
can iterate through good, better, even better.  The really hard part of this is
figuring out what is *relevant* to save before a particular crash happens.

-Tony
Mauro Carvalho Chehab Aug. 14, 2013, 11:54 p.m. UTC | #33
Em Tue, 13 Aug 2013 22:25:58 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

(sorry for the late answer, I had to make a short trip yesterday)

> On 08/13/2013 05:51 PM, Mauro Carvalho Chehab wrote:
> > Em Tue, 13 Aug 2013 17:06:14 +0530
> > "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
> >
> >> On 08/12/2013 11:26 PM, Borislav Petkov wrote:
> >>> On Mon, Aug 12, 2013 at 02:25:57PM -0300, Mauro Carvalho Chehab wrote:
> >>>> Userspace still needs the EDAC sysfs, in order to identify how the
> >>>> memory is organized, and do the proper memory labels association.
> >>>>
> >>>> What edac_ghes does is to fill those sysfs nodes, and to call the
> >>>> existing tracing to report errors.
> >>
> >> I suppose you're referring to the entries under /sys/devices/system/edac/mc?
> >
> > Yes.
> >
> >>
> >> I'm not sure I understand how this helps. ghes_edac seems to just be
> >> populating this based on dmi, which if I'm not mistaken, can be obtained
> >> in userspace (mcelog as an example).
> >>
> >> Also, on my system, all DIMMs are being reported under mc0. I doubt if
> >> the labels there are accurate.
> >
> > Yes, this is the current status of ghes_edac, where BIOS doesn't provide any
> > reliable way to associate a given APEI report to a physical DIMM slot label.
> >
> > The plan is to add more logic there as BIOSes start to provide some reliable
> > way to do such association. I discussed this subject with a few vendors
> > while I was working at Red Hat.
> 
> Hmm... is there anything specific in the APEI report that could help? 

I didn't see anything in the APEI spec that would allow describing how the
memory is organized. So, it is hard for the ghes_edac driver to discover
how many memory controllers, channels and slots are available. This data
is needed in order to allow userspace to pass the labels for each DIMM,
or for the kernel to auto-discover them.

> More importantly, is there a need to do this in-kernel rather than in 
> user-space?

Yes, due to 2 aspects:

On a critical error, the machine will die. The EDAC core will print the
error to dmesg, but no other record that can be parsed later will be available.

With hot-pluggable memory, dynamic channel rerouting, memory poisoning
and other funny things, it may not be possible to point to a DIMM
if the parsing is done at a later time.

Regards,
Mauro
Mauro Carvalho Chehab Aug. 14, 2013, 11:56 p.m. UTC | #34
Em Tue, 13 Aug 2013 22:47:36 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/13/2013 06:11 PM, Mauro Carvalho Chehab wrote:
> > Em Tue, 13 Aug 2013 17:11:18 +0530
> > "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:
> >
> >> On 08/12/2013 08:14 PM, Mauro Carvalho Chehab wrote:
> >>>> But, this only seems to expose the APEI data as a string
> >>>> and doesn't look to really make all the fields available to user-space
> >>>> in a raw manner. Not sure how well this can be utilised by a user-space
> >>>> tool. Do you have suggestions on how we can do this?
> >>>
> >>> There's already an userspace tool that handes it:
> >>> 	https://git.fedorahosted.org/cgit/rasdaemon.git/
> >>>
> >>> What is missing there on the current version is the bits that would allow
> >>> to translate from APEI way to report an error (memory node, card, module,
> >>> bank, device) into a DIMM label[1].
> >>
> >> If I'm reading this right, all APEI data seems to be squashed into a
> >> string in mc_event.
> >
> > Yes. We had lots of discussion about how to map memory errors over the
> > last couple years. Basically, it was decided that the information that
> > could be decoded into a DIMM to be mapped as integers, and all other
> > driver-specific data to be added as strings.
> >
> > On the tests I did, different machines/vendors fill the APEI data on
> > a different way, with makes harder to associate them to a DIMM.
> 
> Ok, so it looks like ghes_edac isn't quite useful yet.
> 
> In the meantime, like Boris suggests, I think we can have a different 
> trace event for raw APEI reports - userspace can use it as it pleases.

"In the meantime" is something that worries me the most. Kernel APIs should
be stable. We cannot randomly change it on each new kernel version.

Better to spend a little more time discussing than implementing a new trace
that will be removed on a near future.
> 
> Once ghes_edac gets better, users can decide whether they want raw APEI 
> reports or the EDAC-processed version and choose one or the other trace 
> event.
> 
> Regards,
> Naveen
>
Mauro Carvalho Chehab Aug. 15, 2013, midnight UTC | #35
Em Tue, 13 Aug 2013 23:02:08 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/13/2013 06:12 PM, Borislav Petkov wrote:
> > On Tue, Aug 13, 2013 at 04:51:33PM +0530, Naveen N. Rao wrote:
> >> You're right - my trace point makes all the data provided by apei
> >> as-is to userspace. However, ghes_edac seems to squash some of this
> >> data into a string when reporting through mc_event.
> >
> > Right, for systems which don't need EDAC to decode to the DIMM or for
> > which there are no EDAC drivers written, they could use a tracepoint
> > which carries APEI info as-is. Others, which need EDAC, should probably
> > use trace_mc_event and disable the APEI tracepoint.
> 
> If I'm not mistaken, even for systems that have EDAC drivers, it looks 
> to me like EDAC can't really decode to the DIMM given what is provided 
> by the bios in the APEI report currently.

Yes, the APEI events reported via EDAC can't currently be decoded.

> If and when ghes_edac gains 
> this capability, users will have a choice between raw APEI reports vs. 
> edac processed ones.

An APEI-specific tracepoint won't fix it, as, AFAICT, we don't have any way
to map it, even in userspace.

> 
> >
> > I think this should address Tony's concerns...
> >
> > Btw, you could call your TP something simpler like
> > trace_ghes_memory_event or so.
> 
> I started out with a simpler name, but eventually decided to use the 
> name from the CPER record so it is clear what this event carries. I 
> think this will be better when adding further ghes events for say, 
> processor generic, PCIe and others.
> 
> >
> > Btw 2, if GHES can report other types of errors (I'm pretty sure it can)
> > maybe we can use a single tracepoint called trace_ghes_event for any
> > types of errors coming out of it...
> 
> Two problems with this:
> - One, the record size will be really big since the cper records for 
> each type of error is large.
> - Two, it may be better to filter events based on the type of error 
> (memory error, processor, pcie, ...) rather than subscribing for all 
> ghes error reports.

I agree: per-error-type events are better than a big generic one.
> 
> >
> > Oh, and while at it, we probably need to start thinking of a mechanism
> > to disable all the error printing, i.e. cper_print_mem() and such,
> > if a userspace agent is listening in on the tracepoint and the error
> > information is carried through it to userspace.
> 
> Do you mean conditionally print the cper records based on whether the 
> tracepoint is enabled or not? Wouldn't that be confusing if someone is 
> monitoring dmesg as well?
> 
> 
> Thanks,
> Naveen
>
Mauro Carvalho Chehab Aug. 15, 2013, 12:05 a.m. UTC | #36
Em Wed, 14 Aug 2013 07:43:22 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Tue, Aug 13, 2013 at 08:13:56PM +0000, Luck, Tony wrote:
> > Generic tracepoints are architected to be able to fire at very high
> > rates and log huge amounts of information. So we'd need something
> > special to say just log these special tracepoints to network/serial.
> >
> > > Which reminds me, pstore could also be a good thing to use, in addition.
> > > Only put error info there as it is limited anyway.
> > 
> > Yes - space is very limited.  I don't know how to assign priority for logging
> > the dmesg data vs. some error logs.
> 
> Didn't we say at some point, "log only the panic messsage which kills
> the machine"?

The EDAC core allows those kinds of things, and can even panic when errors arrive:

$ modinfo edac_core
filename:       /lib/modules/3.10.5-201.fc19.x86_64/kernel/drivers/edac/edac_core.ko
...
parm:           edac_pci_panic_on_pe:Panic on PCI Bus Parity error: 0=off 1=on (int)
parm:           edac_mc_panic_on_ue:Panic on uncorrected error: 0=off 1=on (int)
parm:           edac_mc_log_ue:Log uncorrectable error to console: 0=off 1=on (int)
parm:           edac_mc_log_ce:Log correctable error to console: 0=off 1=on (int)

Those have 644 permission, so they can be changed at runtime.

Of course, there is room for improvement.

> However, we probably could make more use of the messages before the
> catastrophic event because they could give us hints about what led to
> the panic, but in that case maybe a limited pstore is the wrong logging
> medium.
> 
> Actually, I can imagine the full serial/network logs of "special"
> tracepoints + dmesg to be the optimal thing.
> 
> > If we just "printk()" the most important parts - then that data will
> > automatically flow to the serial console and to pstore.
> 
> Actually, does the pstore act like a circular buffer? Because if it
> contains the last N relevant messages (for an arbitrary definition of
> relevant) before the system dies, then that could be more helpful than
> only the error messages.
> 
> And with the advent of UEFI, pretty much every system has a pstore. Too
> bad that we have to limit it to 50% of size so that the boxes don't
> brick. :-P
> 
> > Then we have multiple paths for the critical bits of the error log
> > - and the tracepoints give us more details for the cases where the
> > machine doesn't spontaneously explode.
> 
> Ok, let's sort:
> 
> * First we have the not-so-critical hw error messages. We want to carry
> those out-of-band, i.e. not in dmesg, so that people don't have to parse
> and collect dmesg but have a specialized solution which gives them
> structured logs that tools can analyze, collect and ... those errors.
> 
> * When a critical error happens, the above usage is not necessarily
> advantageous anymore in the sense that, in order to debug what caused
> the machine to crash, we don't necessarily want only the crash
> message but also the whole system activity that led to it.
> 
> In which case, we probably actually want to turn off/ignore the error
> logging tracepoints and write *only* to dmesg which goes out over serial
> and to pstore. Right?
> 
> Because in such cases I want to have *all* *relevant* messages that led
> to the explosion + the explosion message itself.
> 
> Makes sense? Yes, no? Aspects I've missed?

Makes sense to me.

> 
> Thanks.
>
Mauro Carvalho Chehab Aug. 15, 2013, 12:15 a.m. UTC | #37
Em Wed, 14 Aug 2013 16:17:26 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/13/2013 11:09 PM, Luck, Tony wrote:
> >> In the meantime, like Boris suggests, I think we can have a different
> >> trace event for raw APEI reports - userspace can use it as it pleases.
> >>
> >> Once ghes_edac gets better, users can decide whether they want raw APEI
> >> reports or the EDAC-processed version and choose one or the other trace
> >> event.
> >
> > It's cheap to add as many tracepoints as we like - but may be costly to maintain.
> > Especially if we have to tinker with them later to adjust which things are logged,
> > that puts a burden on user-space tools to be updated to adapt to the changing
> > API.
> 
> Agree. And this is the reason I have been considering mc_event. But, the 
> below issues with ghes_edac made me unsure:
> - One, the logging format for APEI data is a bit verbose and hard to 
> parse. But, I suppose we could work with this if we make a few changes. 
> Is it ok to change how the APEI data is made available through 
> mc_event->driver_detail?

Well, as userspace currently only stores it, making a few changes to
driver_detail is likely safe, but we need to know what you intend to do.

> - Two, if ghes_edac is enabled, it prevents other edac drivers from 
> being loaded. It looks like the assumption here is that if ghes/firmware 
> first is enabled, then *all* memory errors are reported through ghes 
> which is not true. We could have (a subset of) corrected errors reported 
> through ghes, some through CMCI and uncorrected errors through MCE. So, 
> if I'm not mistaken, if ghes_edac is enabled, we will only receive ghes 
> error events through mc_event and not the others. Mauro, is this accurate?

Yes, that's the current assumption. It prevents the BIOS and a
direct-hardware-access EDAC driver from racing, as that is known to have
serious issues.

Btw, that's basically the reason why the EDAC core should be built in,
as we need to reserve resources for APEI/GHES before having a chance to
register another EDAC driver.

The current logic doesn't affect error reports via MCE, although I think
we should also try to mask those in the kernel, as it is easier to avoid
event duplication in kernel space than in userspace (at least in some cases).

We may try to implement a finer-grained type of resource locking. Feel free
to propose patches for it.

> 
> >
> > Mauro has written his user-space tool to process the ghes-edac events:
> >    git://git.fedorahosted.org/rasdaemon.git
> >
> > Who is writing the user space tools to process the new apei tracepoints
> > you want to add?
> 
> Enabling rasdaemon itself for the new tracepoint is an option, as long 
> as Mauro doesn't object to it ;)

I don't object to adding new tracepoint events there, but I want to prevent
duplicate reports for the very same error. It is one thing to have a single
corrected memory error; it is another to have a burst of errors at the
same DIMM. If the very same error starts to appear 2, 3, 4 times, then
userspace may wrongly decide to replace a good DIMM just because of a
single random error there.

> 
> >
> > I'm not opposed to these patches - just wondering who is taking the next step
> > to make them useful.
> 
> Sure.
> 
> 
> Regards,
> Naveen
>
Mauro Carvalho Chehab Aug. 15, 2013, 12:22 a.m. UTC | #38
Em Wed, 14 Aug 2013 16:27:06 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> escreveu:

> On 08/13/2013 11:28 PM, Borislav Petkov wrote:
> > On Tue, Aug 13, 2013 at 11:02:08PM +0530, Naveen N. Rao wrote:
> >> If I'm not mistaken, even for systems that have EDAC drivers, it looks
> >> to me like EDAC can't really decode to the DIMM given what is provided
> >> by the bios in the APEI report currently. If and when ghes_edac gains
> >> this capability, users will have a choice between raw APEI reports vs.
> >> edac processed ones.
> >
> > Which kinda makes that APEI tracepoint not really useful and we can call
> > the one we have already - trace_mc_event - from APEI...
> 
> This looks like a nice option. Mauro, what do you think?

I considered calling trace_mc_event directly in APEI, before writing ghes_edac.

I decided to implement it as a separate driver for a few reasons:

	1) EDAC core needs to know that it should reject "hardware first"
drivers. So, any solution would need to talk to EDAC core anyway (or we 
would need to write some other kind of resource allocation somewhere);

	2) EDAC userspace would need to detect the type of memory. Crappy
as it is, the memory is currently reported as a single contiguous block
at sysfs, so EDAC userspace is aware that it can't decode to the DIMM;

	3) If BIOS vendors later add some way to enumerate the DIMMs
per memory controller, channel and socket via APEI, the addition to the
existing driver would be trivial.

So, while it would work to just call the tracepoint from APEI as things
stand, I believe that having this part in separate code is better.

Cheers,
Mauro
Borislav Petkov Aug. 15, 2013, 9:38 a.m. UTC | #39
On Wed, Aug 14, 2013 at 09:22:11PM -0300, Mauro Carvalho Chehab wrote:
> 	1) EDAC core needs to know that it should reject "hardware first"
> drivers.

-ENOPARSE. What do you mean?

> >  3) If BIOS vendors later add some way to enumerate the DIMMs
> > per memory controller, channel and socket via APEI, the addition to the
> > existing driver would be trivial.

Actually, with BIOS vendors wanting to do firmware-first strategy with
DRAM errors, they must have a pretty good and intimate picture of the
memory topology at hand. So it is only a logical consequence for them,
when reporting a memory error to the OS to tell us the silkscreen label
too, while at it.

And if they do that, we don't need the additional layer - just a
tracepoint from APEI and a userspace script.

It's a whole another question if they don't do that.
Borislav Petkov Aug. 15, 2013, 9:43 a.m. UTC | #40
On Wed, Aug 14, 2013 at 09:00:35PM -0300, Mauro Carvalho Chehab wrote:
> I agree: per-type of error events is better than a big generic one.

There are many types of hardware errors and having a single TP for each
is not a good design. Especially if the error formats are comparable
to a high degree. We probably would end up with a nice sensible
classification of having only a handful of TPs covering *all* hardware
events.
Borislav Petkov Aug. 15, 2013, 10:01 a.m. UTC | #41
On Wed, Aug 14, 2013 at 09:15:04PM -0300, Mauro Carvalho Chehab wrote:
> > - Two, if ghes_edac is enabled, it prevents other edac drivers
> > from being loaded. It looks like the assumption here is that if
> > ghes/firmware first is enabled, then *all* memory errors are
> > reported through ghes which is not true. We could have (a subset
> > of) corrected errors reported through ghes, some through CMCI
> > and uncorrected errors through MCE. So, if I'm not mistaken, if
> > ghes_edac is enabled, we will only receive ghes error events through
> > mc_event and not the others. Mauro, is this accurate?
> 
> > Yes, that's the current assumption. It prevents the BIOS and a
> > direct-hardware-access EDAC driver from racing, as that is known to
> > have serious issues.

Ok, this is getting confusing so let's shed some more light.

* First of all, Naveen is asking whether other *edac* drivers can
be loaded. And no, they cannot once the ghes thing is loaded which
potentially is a problem.

For example, if the chipset-specific driver has additional functionality
from ghes_edac, then that functionality is gone when ghes_edac loads
first. This might be a problem in some cases, we probably need to think
about this more in depth.

* Then, there's the trace_mce_record() TP which comes straight
from mce.c. This one is always enabled unless the mce_disable_bank
functionality kicks in, which you, Naveen, added :-).

* The CMCI stuff should be synchronized with the MCE TP so that should
be ok.

I think those are our current error reporting paths...
Borislav Petkov Aug. 15, 2013, 10:02 a.m. UTC | #42
On Wed, Aug 14, 2013 at 08:56:38PM -0300, Mauro Carvalho Chehab wrote:
> Better to spend a little more time discussing than implementing a new trace
> that will be removed in the near future.

Right, "in the meantime" we established that this new TP doesn't bring
us anything new so we might just as well use trace_mc_event and call it
either straight from APEI or in ghes_edac.
Borislav Petkov Aug. 15, 2013, 10:14 a.m. UTC | #43
On Wed, Aug 14, 2013 at 06:38:09PM +0000, Luck, Tony wrote:
> We've wandered around different strategies here. We definitely
> want the panic log. Some people want all other "kernel exit" logs
> (shutdown, reboot, kexec). When there is enough space in the pstore
> backend we might also want the "oops" that preceded the panic. (Of
> course when the oops happens we don't know the future, so have to
> save it just in case ... then if more "oops" happen we have to decide
> whether to keep the old oops log, or save the newer one).

Ok, dmesg over serial and *only* oops+panic in pstore. Right.

> Yes - longer logs are better. Sad that the pstore backend devices are
> measured in kilobytes :-)

Right, so good ole serial again to the rescue! There's no room for full
dmesg in nvram because it needs space for the UEFI GUI and some other
crap :-)

> No - write speed for the persistent storage backing pstore (flash)
> means we don't log as we go. We wait for a panic and then our
> registered function gets called so we can snapshot what is in the
> console log at that point. We also don't want to wear out the flash
> which may be soldered to the motherboard.

I suspected as much. So we can forget about using *only* pstore for hw
error logging. It would be cool to do so but the technology simply
doesn't allow it.

> Agreed - we shouldn't clutter logs with details of corrected errors.
> At most we should have a rate-limited log showing the count of
> corrected errors so that someone who just watches dmesg knows they
> should go dig deeper if they see some big number of corrected errors.

/me nods.

> Yes. There are people looking at various "flight recorder" modes for
> tracing that keep logs of normal events in a circular buffer in RAM
> ... if these exist they should be saved at crash time (and they are in
> the kexec/kdump path, but I don’t know if anyone does anything in
> the non-kdump case).

Right, the cheapest solution is serial. Simply log everything to serial
because we can. But this is the key thing I wanted to emphasize:

For severe hardware errors we don't want to use any tracepoint -
actually it is even a bad thing to do so because they would get lost in
some side channels which, during a critical situation, might not get
written to anything/survive the crash, etc.

So what I'm saying is, we basically want severe hardware errors to land
to good old dmesg and to all consoles. No fancy TP stuff for them.

> Tracepoints for errors that are going to lead to system crash would
> only be useful together with a flight recorder to make sure they get
> saved. I think tracepoints for corrected errors are better than dmesg
> logs.

Yes, exactly.

> In a perfect world yes - I don't know that we can achieve perfection
> - but we can iterate through good, better, even better. The really
> hard part of this is figuring out what is *relevant* to save before a
> particular crash happens.

Well, if I have serial connected to the box, it will contain basically
everything the machine said, no?
Mauro Carvalho Chehab Aug. 15, 2013, 1:26 p.m. UTC | #44
Em Thu, 15 Aug 2013 11:38:31 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Wed, Aug 14, 2013 at 09:22:11PM -0300, Mauro Carvalho Chehab wrote:
> > 	1) EDAC core needs to know that it should reject "hardware first"
> > drivers.
> 
> -ENOPARSE. What do you mean?

I mean that the edac core needs to know that, on a given system, the
BIOS is accessing the hardware registers and sending the data via ghes_edac.

In such a case, it should reject the driver that reads such data directly
from the hardware, as having both active causes inconsistent error reports
(I got a few BZ reports about that).

> >  3) If BIOS vendors later add some way to enumerate the DIMMs
> > per memory controller, channel and socket via APEI, the addition to the
> > existing driver would be trivial.
> 
> Actually, with BIOS vendors wanting to do firmware-first strategy with
> DRAM errors, they must have a pretty good and intimate picture of the
> memory topology at hand. So it is only a logical consequence for them,
> when reporting a memory error to the OS to tell us the silkscreen label
> too, while at it.
> 
> And if they do that, we don't need the additional layer - just a
> tracepoint from APEI and a userspace script.

No. As we want fatal errors to also be properly reported, the
kernel will still need to know the memory layout.

Ok, such information can come via userspace, just like we do with the
other EDAC drivers, but we'll need to allow the memory layout to be
created dynamically via sysfs (or to use some other interface for
loading that data).

> It's a whole another question if they don't do that.
Mauro Carvalho Chehab Aug. 15, 2013, 1:34 p.m. UTC | #45
Em Thu, 15 Aug 2013 12:01:32 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Wed, Aug 14, 2013 at 09:15:04PM -0300, Mauro Carvalho Chehab wrote:
> > > - Two, if ghes_edac is enabled, it prevents other edac drivers
> > > from being loaded. It looks like the assumption here is that if
> > > ghes/firmware first is enabled, then *all* memory errors are
> > > reported through ghes which is not true. We could have (a subset
> > > of) corrected errors reported through ghes, some through CMCI
> > > and uncorrected errors through MCE. So, if I'm not mistaken, if
> > > ghes_edac is enabled, we will only receive ghes error events through
> > > mc_event and not the others. Mauro, is this accurate?
> > 
> > Yes, that's the current assumption. It prevents the BIOS and a
> > direct-hardware-access EDAC driver from racing, as that is known to
> > have serious issues.
> 
> Ok, this is getting confusing so let's shed some more light.
> 
> * First of all, Naveen is asking whether other *edac* drivers can
> be loaded. And no, they cannot once the ghes thing is loaded which
> potentially is a problem.
> 
> For example, if the chipset-specific driver has additional functionality
> from ghes_edac, then that functionality is gone when ghes_edac loads
> first. This might be a problem in some cases, we probably need to think
> about this more in depth.

Yes, but the thing is that it is not safe to use the hardware driver if
the BIOS is also reading the hardware error registers directly, as, on
several platforms, a read causes the error data to be cleared in the
register.

So, either APEI should be extended with some fine-grained mechanism that
would provide ways to check/control what resources are reserved for the
BIOS only, or the user needs to tell the BIOS/kernel whether they want the
BIOS or the OS to access the hardware.

Regards,
Mauro
Borislav Petkov Aug. 15, 2013, 1:44 p.m. UTC | #46
On Thu, Aug 15, 2013 at 10:26:07AM -0300, Mauro Carvalho Chehab wrote:
> I mean that the edac core needs to know that, on a given system, the
> BIOS is accessing the hardware registers and sending the data via
> ghes_edac.

Right, that's the firmware-first thing which Naveen did - see
mce_disable_bank.

> No. As we want fatal errors to also be properly reported, the
> kernel will still need to know the memory layout.

Read what I said: if you have the silkscreen label you don't need the
memory layout - you *already* *know* which DIMM is affected.

Also, fatal errors are a whole different beast where we run in NMI
context or we even don't get to run the #MC handler on some systems.
Borislav Petkov Aug. 15, 2013, 1:51 p.m. UTC | #47
On Thu, Aug 15, 2013 at 10:34:21AM -0300, Mauro Carvalho Chehab wrote:
> Yes, but the thing is that it is not safe to use the hardware driver
> if the BIOS is also reading the hardware error registers directly, as,
> on several platforms, a read causes the error data to be cleared in the
> register.

Here's the deal:

* We parse some APEI table and disable those MCA banks which the BIOS
wants to handle first.

* When the BIOS decides to report an error from that handling, it does
so over another BIOS table.

* Now you have two possibilities:

** On systems without an edac driver or where it doesn't make sense to
have the ghes_edac driver, we call trace_mc_event() straight from APEI
code (this is what we're currently discussing).

** On other systems, where we need ghes_edac, we *don't* use the
trace_mc_event() tracepoint in the APEI code but let it come from
ghes_edac with additional information collected by edac.
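
A minimal sketch of that split, at the point where GHES handles a memory
error section, could look like the code below. ghes_edac_report_mem_error()
is the existing ghes_edac entry point; the helper itself and the fallback
branch (a printk standing in for whichever raw report - trace_mc_event or
a ghes-specific event - gets picked) are assumptions for illustration.

#include <linux/cper.h>
#include <linux/printk.h>
#include <acpi/ghes.h>

static void ghes_handle_mem_error(struct ghes *ghes, int sev,
                                  struct cper_sec_mem_err *mem_err)
{
        if (IS_ENABLED(CONFIG_EDAC_GHES)) {
                /* Second possibility: let ghes_edac enrich the record
                 * (DIMM label, layers) and report it via the EDAC core. */
                ghes_edac_report_mem_error(ghes, sev, mem_err);
                return;
        }

        /* First possibility: no EDAC value-add on this platform, so report
         * the raw APEI data directly; a tracepoint would go in this spot. */
        if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)
                pr_warn("GHES: memory error at 0x%016llx\n",
                        mem_err->physical_addr);
}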
Mauro Carvalho Chehab Aug. 15, 2013, 2:14 p.m. UTC | #48
Em Thu, 15 Aug 2013 15:44:54 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Thu, Aug 15, 2013 at 10:26:07AM -0300, Mauro Carvalho Chehab wrote:
> > I mean that the edac core needs to know that, on a given system, the
> > BIOS is accessing the hardware registers and sending the data via
> > ghes_edac.
> 
> Right, that's the firmware-first thing which Naveen did - see
> mce_disable_bank.
> 
> > No. As we want that fatal errors to also be properly reported, the
> > kernel will still need to know the memory layout.
> 
> Read what I said: if you have the silkscreen label you don't need the
> memory layout - you *already* *know* which DIMM is affected.

AFAICT, APEI doesn't provide the silkscreen label. Some code (or some
datasheet) is needed to translate what APEI provides into the
silkscreen label.

Naveen, please correct me if I'm wrong.

> Also, fatal errors are a whole different beast where we run in NMI
> context or we even don't get to run the #MC handler on some systems.

I see.

Yes, APEI currently prints only a raw event on high severity errors
at ghes_notify_nmi(), and doesn't call ghes_edac. Changing it would
require parsing the error at __ghes_print_estatus(). Not sure how
easy it would be to change that.

Em Thu, 15 Aug 2013 15:51:06 +0200
Borislav Petkov <bp@alien8.de> escreveu:

> On Thu, Aug 15, 2013 at 10:34:21AM -0300, Mauro Carvalho Chehab wrote:
> > Yes, but the thing is that it is not safe to use the hardware driver
> > if the BIOS is also reading the hardware error registers directly, as,
> > on several platforms, a read causes the error data to be cleared in the
> > register.
> 
> Here's the deal:
> 
> * We parse some APEI table and disable those MCA banks which the BIOS
> wants to handle first.
> 
> * When the BIOS decides to report an error from that handling, it does
> so over another BIOS table.

OK.

> 
> * Now you have two possibilities:
> 
> ** On systems without an edac driver or where it doesn't make sense to
> have the ghes_edac driver, we call trace_mc_event() straight from APEI
> code (this is what we're currently discussing).
> 
> ** On other systems, where we need ghes_edac, we *don't* use the
> trace_mc_event() tracepoint in the APEI code but let it come from
> ghes_edac with additional information collected by edac.

I don't see why we should have those two alternatives, as, in the worst
case (e.g. if ghes_edac can't enrich the APEI data with labels),
they'll basically provide the very same data to userspace, and the
extra EDAC overhead in its error reporting logic is small.

The risk of doing the very same thing in two different places is that
the logic to encapsulate APEI data into trace_mc_event() would live in
two separate places. It risks that someone would change one of the
drivers and forget to apply the very same change to the other, causing
parse errors in userspace, depending on the source.
Borislav Petkov Aug. 15, 2013, 4:11 p.m. UTC | #49
On Thu, Aug 15, 2013 at 11:14:07AM -0300, Mauro Carvalho Chehab wrote:
> I don't see why we should have those two alternatives, as, in the worst
> case (e.g. if ghes_edac can't enrich the APEI data with labels),
> they'll basically provide the very same data to userspace, and the
> extra EDAC overhead in its error reporting logic is small.

Well, a couple of reasons.

The first and foremost one is having another layer which needs
registration, etc. because ghes_edac pulls the whole edac core stuff
with it. The thinner we are, the less overhead we cause. And this is
a must for RAS.

Actually, this is a must for all kernel code - the faster we can get
done and the thinner we are, the better. We absolutely definitely don't
want to have a useless layer in the error reporting path just because it
is easier.

This short path will pay off later in error storms and other
resource-constrained situations.

Furthermore, dealing with another edac driver is not trivial for
distros, like going around and telling people that all of a sudden
they need to enable ghes_edac. This is tangential to the issue which
Naveen raised that on some machines, you want not only ghes_edac but the
platform-specific one. Which doesn't work currently and we don't have a
clear solution on how to get it working yet.

Finally, userspace doesn't care where it gets its TP data from as long
as it is there.

> The risk of doing the very same thing in two different places is that
> the logic to encapsulate APEI data into trace_mc_event() would live in
> two separate places. It risks that someone would change one of the
> drivers and forget to apply the very same change to the other, causing
> parse errors in userspace, depending on the source.

We'll make sure that doesn't happen.
Tony Luck Aug. 15, 2013, 6:16 p.m. UTC | #50
> * We parse some APEI table and disable those MCA banks which the BIOS
> wants to handle first.

We have no idea which errors the BIOS has chosen for itself.  We just know which
bank numbers ... and Intel processors change mappings of which errors are logged
in which banks in every new processor tock (and sometimes tick).  Some banks
are documented in the processor datasheet; most are not. The most common case
might well be memory ... but it could be cache, or I/O, or ...

So this doesn't help Mauro figure out whether to allow loading of an
EDAC driver that will peek and poke at chipset specific registers in
possibly racy ways with BIOS code doing the same thing.

-Tony
Borislav Petkov Aug. 15, 2013, 6:41 p.m. UTC | #51
On Thu, Aug 15, 2013 at 06:16:48PM +0000, Luck, Tony wrote:
> > * We parse some APEI table and disable those MCA banks which the BIOS
> > wants to handle first.
>
> We have no idea which errors the BIOS has chosen for itself. We just
> know which bank numbers ...

Well, those which the BIOS hasn't chosen for itself simply get handled up
through (HEST it is, I think). So it all goes out in APEI anyway...

> and Intel processors change mappings of which errors are logged in
> which banks in every new processor tock (and sometimes tick). Some
> banks are documented in the processor datasheet; most are not. The most
> common case might well be memory ... but it could be cache, or I/O, or ...
>
> So this doesn't help Mauro figure out whether to allow loading of an
> EDAC driver that will peek and poke at chipset specific registers in
> possibly racy ways with BIOS code doing the same thing.

That doesn't matter - the only thing that matters is if an EDAC driver
has anything additional to bring to the table. If it does, then it gets
to see the errors before they're dumped to userspace. If not, then APEI
should report them directly.

Mind you, if we've disabled an MCA bank for the kernel then no EDAC
driver gets to see errors from it either because APEI has taken
responsibility. Unless said driver is poking around MCA registers -
which it shouldn't.

So I'd guess the decision to load an EDAC driver should be a platform
one. A platform which gives *sufficient* information in APEI tables for
an error doesn't need an EDAC driver. Older platforms or platforms which
cannot supply sufficient information for, say, properly pinpointing the
DIMM, should use the additional help of an EDAC driver for that, if
possible.

Which begs the most important question: do we even have a platform that
can give us sufficient information without the need for an EDAC driver?

Because if not, we should stop wasting energy pointlessly and simply
drop this discussion: we basically load an EDAC driver and do not do the
APEI tracepoint because it simply doesn't make any sense and there's no
actual platform giving us that info.

So, which is it?
Tony Luck Aug. 15, 2013, 7:14 p.m. UTC | #52
> Well, if I have serial connected to the box, it will contain basically
> everything the machine said, no?

Yes - but the serial port is too slow to log everything that you might
conceivably need to debug your problem.  Imagine trying to log every
interrupt and every pagefault on every processor down a single 115200
baud connection.  Thus my earlier comments about guessing what to
log.

-Tony
Tony Luck Aug. 15, 2013, 7:20 p.m. UTC | #53
> AFAICT, APEI doesn't provide the silkscreen label. Some code (or some
> datasheet) is needed to translate what APEI provides into the
> silkscreen label.

In theory it could. The ACPI generic error structure used to report includes
a 20-byte free format field which a BIOS could use to describe the location
of the error.  Haven't seen anyone do this yet - and our internal BIOS people
looked at me like I was crazy to suggest such a thing.  But I still entertain
hopes that some OEM will do the right thing and raise the bar of usefulness.

-Tony
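
In kernel terms that 20-byte field is the fru_text member of struct
acpi_hest_generic_data, with CPER_SEC_VALID_FRU_TEXT marking it valid.
If a BIOS ever filled it in, surfacing it would be roughly a one-liner;
the helper below is a sketch only, not something from this patch set.

#include <linux/acpi.h>
#include <linux/cper.h>         /* CPER_SEC_VALID_FRU_TEXT */
#include <linux/printk.h>

/* Sketch: print the BIOS-supplied FRU text (e.g. a silkscreen label)
 * when the generic error data entry marks it as valid. */
static void cper_print_fru_text(const char *pfx,
                                const struct acpi_hest_generic_data *gdata)
{
        if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
                printk("%s""DIMM location: %.20s\n", pfx,
                       (const char *)gdata->fru_text);
}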
Borislav Petkov Aug. 15, 2013, 7:41 p.m. UTC | #54
On Thu, Aug 15, 2013 at 07:20:29PM +0000, Luck, Tony wrote:
> In theory it could. The ACPI generic error structure used to report
> includes a 20-byte free format field which a BIOS could use to
> describe the location of the error. Haven't seen anyone do this yet -
> and our internal BIOS people looked at me like I was crazy to suggest
> such a thing. But I still entertain hopes that some OEM will do the
> right thing and raise the bar of usefulness.

Sadly, I can report similar experiences.

I did try to persuade the RAS hw people that spitting out the DIMM
mapping is best done by the BIOS because it already has that info -
it is a matter of carrying it out. Alas, more important events took
place... :)
Borislav Petkov Aug. 15, 2013, 7:43 p.m. UTC | #55
On Thu, Aug 15, 2013 at 07:14:34PM +0000, Luck, Tony wrote:
> Yes - but the serial port is too slow to log everything that you might
> conceivably need to debug your problem. Imagine trying to log every
> interrupt and every pagefault on every processor down a single 115200
> baud connection. Thus my earlier comments about guessing what to log.

Right, but a normally sized dmesg usually gets through ok. Selecting
what to log could be a very tedious and error-prone process...
diff mbox

Patch

diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index 33dc6a0..19a9c0b 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -32,6 +32,10 @@ 
 #include <linux/pci.h>
 #include <linux/aer.h>
 
+#define CREATE_TRACE_POINTS
+#define TRACE_EVENT_GHES
+#include <trace/events/ras.h>
+
 /*
  * CPER record ID need to be unique even after reboot, because record
  * ID is used as index for ERST storage, while CPER records from
@@ -193,8 +197,13 @@  static const char *cper_mem_err_type_strs[] = {
 	"scrub uncorrected error",
 };
 
-static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
+static void cper_print_mem(const char *pfx,
+		const struct acpi_hest_generic_status *estatus,
+		const struct acpi_hest_generic_data *gdata,
+		const struct cper_sec_mem_err *mem)
 {
+	trace_ghes_platform_memory_event(estatus, gdata, mem);
+
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
 	if (mem->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)
@@ -292,8 +301,10 @@  static const char *apei_estatus_section_flag_strs[] = {
 	"latent error",
 };
 
-static void apei_estatus_print_section(
-	const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
+static void apei_estatus_print_section(const char *pfx,
+		const struct acpi_hest_generic_status *estatus,
+		const struct acpi_hest_generic_data *gdata,
+		int sec_no)
 {
 	uuid_le *sec_type = (uuid_le *)gdata->section_type;
 	__u16 severity;
@@ -320,7 +331,7 @@  static void apei_estatus_print_section(
 		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
 		printk("%s""section_type: memory error\n", pfx);
 		if (gdata->error_data_length >= sizeof(*mem_err))
-			cper_print_mem(pfx, mem_err);
+			cper_print_mem(pfx, estatus, gdata, mem_err);
 		else
 			goto err_section_too_small;
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
@@ -355,7 +366,7 @@  void apei_estatus_print(const char *pfx,
 	gdata = (struct acpi_hest_generic_data *)(estatus + 1);
 	while (data_len > sizeof(*gdata)) {
 		gedata_len = gdata->error_data_length;
-		apei_estatus_print_section(pfx, gdata, sec_no);
+		apei_estatus_print_section(pfx, estatus, gdata, sec_no);
 		data_len -= gedata_len + sizeof(*gdata);
 		gdata = (void *)(gdata + 1) + gedata_len;
 		sec_no++;