diff mbox

[v7,2/8] ath10k: provide firmware crash info via debugfs

Message ID 20140819082253.16842.32433.stgit@potku.adurom.net (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Kalle Valo Aug. 19, 2014, 8:22 a.m. UTC
From: Ben Greear <greearb@candelatech.com>

Store the firmware crash registers and last 128 or so
firmware debug-log ids and present them to user-space
via debugfs.

Should help with figuring out why the firmware crashed.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
---
 drivers/net/wireless/ath/ath10k/core.h  |   27 +++
 drivers/net/wireless/ath/ath10k/debug.c |  280 +++++++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/debug.h |   22 ++
 drivers/net/wireless/ath/ath10k/hw.h    |   29 +++
 drivers/net/wireless/ath/ath10k/pci.c   |  133 ++++++++++++++-
 drivers/net/wireless/ath/ath10k/pci.h   |    3 
 6 files changed, 481 insertions(+), 13 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Michal Kazior Aug. 19, 2014, 9:33 a.m. UTC | #1
On 19 August 2014 10:22, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> From: Ben Greear <greearb@candelatech.com>
>
> Store the firmware crash registers and last 128 or so
> firmware debug-log ids and present them to user-space
> via debugfs.
>
> Should help with figuring out why the firmware crashed.
>
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
> ---
[...]
> +struct ath10k_dump_file_data {
> +       /* dump file information */
> +
> +       /* "ATH10K-FW-DUMP" */
> +       char df_magic[16];
> +
> +       __le32 len;
> +
> +       /* file dump version */
> +       __le32 version;
> +
> +       /* some info we can get from ath10k struct that might help */
> +
> +       u8 uuid[16];
> +
> +       __le32 chip_id;
> +
> +       /* 0 for now, in place for later hardware */
> +       __le32 bus_type;
> +
> +       __le32 target_version;
> +       __le32 fw_version_major;
> +       __le32 fw_version_minor;
> +       __le32 fw_version_release;
> +       __le32 fw_version_build;
> +       __le32 phy_capability;
> +       __le32 hw_min_tx_power;
> +       __le32 hw_max_tx_power;
> +       __le32 ht_cap_info;
> +       __le32 vht_cap_info;
> +       __le32 num_rf_chains;

Hmm.. some of these values don't really change once driver is loaded
so we could probably just export them as separate debugfs entries but
perhaps that's an overkill just for dumping.


> +
> +       dump_data->kernel_ver_code = cpu_to_le32(LINUX_VERSION_CODE);
> +       strlcpy(dump_data->kernel_ver, VERMAGIC_STRING,
> +               sizeof(dump_data->kernel_ver));
> +
> +       dump_data->tv_sec = cpu_to_le64(crash_data->timestamp.tv_sec);
> +       dump_data->tv_nsec = cpu_to_le64(crash_data->timestamp.tv_nsec);
> +
> +       /* Gather dbg-log */
> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));

Hmm should this really be sizeof()? Not next_idx (which effectively
defines number of bytes of the dbglog)?


> +       memcpy(dump_tlv->tlv_data, &crash_data->dbglog_entry_data,
> +              sizeof(crash_data->dbglog_entry_data));
> +       sofar += sizeof(*dump_tlv) + sizeof(crash_data->dbglog_entry_data);
> +
> +       /* Gather crash-dump */
> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_REGDUMP);
> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->reg_dump_values));
> +       memcpy(dump_tlv->tlv_data, &crash_data->reg_dump_values,
> +              sizeof(crash_data->reg_dump_values));
> +       sofar += sizeof(*dump_tlv) + sizeof(crash_data->reg_dump_values);

I haven't seen you mention that your latest patchset is based on my
patches that remove implicit byte swap from some diag functions so I
infer you're might've missed that reg_dump_values will be byte swapped
on big-endian hosts and userspace will remain unaware of that since
there is no endianess indication in the crash dump structure anymore.

reg_dump_values must be swapped cpu_to_le32 here (or should be never
implicit byte swapped in the first place).

It's probably worth stating in the comments that
ATH10K_FW_CRASH_DUMP_REGDUMP returns a list of __le32.


> +               ret = ath10k_pci_diag_read_mem(ar, dbuf.buffer, buffer,
> +                                              dbuf.length);
> +               if (ret != 0) {
> +                       ath10k_warn("failed to read debug log buffer from address 0x%x: %d\n",
> +                                   dbuf.buffer, ret);
> +                       kfree(buffer);
> +                       return;
> +               }
> +
> +               ath10k_debug_dbglog_add(ar, buffer, dbuf.length);
> +               kfree(buffer);

Is the `buffer` really a string of bytes (u8) or just u32 in disguise?
If it's the former then on big-endian host the implicit byte swap will
mess it up and userspace will have no way of knowing that now.


Micha?
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Greear Aug. 19, 2014, 3:25 p.m. UTC | #2
On 08/19/2014 02:33 AM, Michal Kazior wrote:
> On 19 August 2014 10:22, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
>> From: Ben Greear <greearb@candelatech.com>
>>
>> Store the firmware crash registers and last 128 or so
>> firmware debug-log ids and present them to user-space
>> via debugfs.
>>
>> Should help with figuring out why the firmware crashed.
>>
>> Signed-off-by: Ben Greear <greearb@candelatech.com>
>> Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
>> ---
> [...]
>> +struct ath10k_dump_file_data {
>> +       /* dump file information */
>> +
>> +       /* "ATH10K-FW-DUMP" */
>> +       char df_magic[16];
>> +
>> +       __le32 len;
>> +
>> +       /* file dump version */
>> +       __le32 version;
>> +
>> +       /* some info we can get from ath10k struct that might help */
>> +
>> +       u8 uuid[16];
>> +
>> +       __le32 chip_id;
>> +
>> +       /* 0 for now, in place for later hardware */
>> +       __le32 bus_type;
>> +
>> +       __le32 target_version;
>> +       __le32 fw_version_major;
>> +       __le32 fw_version_minor;
>> +       __le32 fw_version_release;
>> +       __le32 fw_version_build;
>> +       __le32 phy_capability;
>> +       __le32 hw_min_tx_power;
>> +       __le32 hw_max_tx_power;
>> +       __le32 ht_cap_info;
>> +       __le32 vht_cap_info;
>> +       __le32 num_rf_chains;
>
> Hmm.. some of these values don't really change once driver is loaded
> so we could probably just export them as separate debugfs entries but
> perhaps that's an overkill just for dumping.

I think it will be easier for all involved if there is a single dump image that
has all needed info.  Likely the end user of the dump file will have little or no
interaction with the host that dumped the file to begin with.

>> +       dump_data->kernel_ver_code = cpu_to_le32(LINUX_VERSION_CODE);
>> +       strlcpy(dump_data->kernel_ver, VERMAGIC_STRING,
>> +               sizeof(dump_data->kernel_ver));
>> +
>> +       dump_data->tv_sec = cpu_to_le64(crash_data->timestamp.tv_sec);
>> +       dump_data->tv_nsec = cpu_to_le64(crash_data->timestamp.tv_nsec);
>> +
>> +       /* Gather dbg-log */
>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
>> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>
> Hmm should this really be sizeof()? Not next_idx (which effectively
> defines number of bytes of the dbglog)?

I haven't tried decoding this yet, but we may need to know the next_idx
to decode this properly.

>> +       memcpy(dump_tlv->tlv_data, &crash_data->dbglog_entry_data,
>> +              sizeof(crash_data->dbglog_entry_data));
>> +       sofar += sizeof(*dump_tlv) + sizeof(crash_data->dbglog_entry_data);
>> +
>> +       /* Gather crash-dump */
>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_REGDUMP);
>> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->reg_dump_values));
>> +       memcpy(dump_tlv->tlv_data, &crash_data->reg_dump_values,
>> +              sizeof(crash_data->reg_dump_values));
>> +       sofar += sizeof(*dump_tlv) + sizeof(crash_data->reg_dump_values);
>
> I haven't seen you mention that your latest patchset is based on my
> patches that remove implicit byte swap from some diag functions so I
> infer you're might've missed that reg_dump_values will be byte swapped
> on big-endian hosts and userspace will remain unaware of that since
> there is no endianess indication in the crash dump structure anymore.
>
> reg_dump_values must be swapped cpu_to_le32 here (or should be never
> implicit byte swapped in the first place).
>
> It's probably worth stating in the comments that
> ATH10K_FW_CRASH_DUMP_REGDUMP returns a list of __le32.
>
>
>> +               ret = ath10k_pci_diag_read_mem(ar, dbuf.buffer, buffer,
>> +                                              dbuf.length);
>> +               if (ret != 0) {
>> +                       ath10k_warn("failed to read debug log buffer from address 0x%x: %d\n",
>> +                                   dbuf.buffer, ret);
>> +                       kfree(buffer);
>> +                       return;
>> +               }
>> +
>> +               ath10k_debug_dbglog_add(ar, buffer, dbuf.length);
>> +               kfree(buffer);
>
> Is the `buffer` really a string of bytes (u8) or just u32 in disguise?
> If it's the former then on big-endian host the implicit byte swap will
> mess it up and userspace will have no way of knowing that now.

dbglog is array of 32 bit ints.

Thanks,
Ben
Ben Greear Aug. 19, 2014, 4:05 p.m. UTC | #3
On 08/19/2014 08:25 AM, Ben Greear wrote:

>>> +       dump_data->kernel_ver_code = cpu_to_le32(LINUX_VERSION_CODE);
>>> +       strlcpy(dump_data->kernel_ver, VERMAGIC_STRING,
>>> +               sizeof(dump_data->kernel_ver));
>>> +
>>> +       dump_data->tv_sec = cpu_to_le64(crash_data->timestamp.tv_sec);
>>> +       dump_data->tv_nsec = cpu_to_le64(crash_data->timestamp.tv_nsec);
>>> +
>>> +       /* Gather dbg-log */
>>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
>>> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>
>> Hmm should this really be sizeof()? Not next_idx (which effectively
>> defines number of bytes of the dbglog)?
> 
> I haven't tried decoding this yet, but we may need to know the next_idx
> to decode this properly.

I thought some more on this on the way to work..and I think it will not
be easy to decode this if we do not know the starting point for the
messages.  Each report from firmware will start at the beginning of
a message, so we either need to do a lot of memory copying, or play
tricks with a circular buffer storage.

A long time ago, Kalle mentioned he didn't want any ability to decode
the messages in the kernel, so the 'start' boundary will need to be
the start of the message(s) received from firmware.

This means that the ath10k_debug_dbglog_add will need to be improved
to handle wraps more intelligently, somehow.

This series has taken a long time already, so it would be fine with me
to deal with the dbglog cleanup in subsequent patches, and just assume
for now that the dbglog messages start at index 0 in the dbglog-entry-data.

Thanks,
Ben
Michal Kazior Aug. 20, 2014, 6:28 a.m. UTC | #4
On 19 August 2014 17:25, Ben Greear <greearb@candelatech.com> wrote:
> On 08/19/2014 02:33 AM, Michal Kazior wrote:
>> On 19 August 2014 10:22, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
[...]
>>> +       __le32 target_version;
>>> +       __le32 fw_version_major;
>>> +       __le32 fw_version_minor;
>>> +       __le32 fw_version_release;
>>> +       __le32 fw_version_build;
>>> +       __le32 phy_capability;
>>> +       __le32 hw_min_tx_power;
>>> +       __le32 hw_max_tx_power;
>>> +       __le32 ht_cap_info;
>>> +       __le32 vht_cap_info;
>>> +       __le32 num_rf_chains;
>>
>>
>> Hmm.. some of these values don't really change once driver is loaded
>> so we could probably just export them as separate debugfs entries but
>> perhaps that's an overkill just for dumping.
>
>
> I think it will be easier for all involved if there is a single dump image
> that
> has all needed info.  Likely the end user of the dump file will have little
> or no
> interaction with the host that dumped the file to begin with.

I think there's been this idea about moving dumping to udev somewhat,
no? Since this is just debugfs then I imagine you could have a
userspace program that would create the single blob/crash report from
things it thinks is important, e.g.. `uname -a`, debugfs entries (fw
stack traces, dbglog, etc), recent kernel log buffer, etc. It could
even all be stored in plaintext (with binary data encoded as a
hexdump). But that, I guess, could be cumbersome, at least initially,
until all major distros adapt.


>
>>> +       dump_data->kernel_ver_code = cpu_to_le32(LINUX_VERSION_CODE);
>>> +       strlcpy(dump_data->kernel_ver, VERMAGIC_STRING,
>>> +               sizeof(dump_data->kernel_ver));
>>> +
>>> +       dump_data->tv_sec = cpu_to_le64(crash_data->timestamp.tv_sec);
>>> +       dump_data->tv_nsec = cpu_to_le64(crash_data->timestamp.tv_nsec);
>>> +
>>> +       /* Gather dbg-log */
>>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
>>> +       dump_tlv->tlv_len =
>>> cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>
>>
>> Hmm should this really be sizeof()? Not next_idx (which effectively
>> defines number of bytes of the dbglog)?
>
>
> I haven't tried decoding this yet, but we may need to know the next_idx
> to decode this properly.

So basically a (length, payload) encoding would be necessary here
instead of a single stream, right?


>>> +               ret = ath10k_pci_diag_read_mem(ar, dbuf.buffer, buffer,
>>> +                                              dbuf.length);
>>> +               if (ret != 0) {
>>> +                       ath10k_warn("failed to read debug log buffer from
>>> address 0x%x: %d\n",
>>> +                                   dbuf.buffer, ret);
>>> +                       kfree(buffer);
>>> +                       return;
>>> +               }
>>> +
>>> +               ath10k_debug_dbglog_add(ar, buffer, dbuf.length);
>>> +               kfree(buffer);
>>
>>
>> Is the `buffer` really a string of bytes (u8) or just u32 in disguise?
>> If it's the former then on big-endian host the implicit byte swap will
>> mess it up and userspace will have no way of knowing that now.
>
>
> dbglog is array of 32 bit ints.

Thanks, this clears things. Then it is necessary to byte-swap
cpu_to_le32 before calling debug_dbglog_add.


Micha?
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kalle Valo Aug. 20, 2014, 6:48 a.m. UTC | #5
Michal Kazior <michal.kazior@tieto.com> writes:

> On 19 August 2014 17:25, Ben Greear <greearb@candelatech.com> wrote:
>> On 08/19/2014 02:33 AM, Michal Kazior wrote:
>>> On 19 August 2014 10:22, Kalle Valo <kvalo@qca.qualcomm.com> wrote:
> [...]
>>>> +       __le32 target_version;
>>>> +       __le32 fw_version_major;
>>>> +       __le32 fw_version_minor;
>>>> +       __le32 fw_version_release;
>>>> +       __le32 fw_version_build;
>>>> +       __le32 phy_capability;
>>>> +       __le32 hw_min_tx_power;
>>>> +       __le32 hw_max_tx_power;
>>>> +       __le32 ht_cap_info;
>>>> +       __le32 vht_cap_info;
>>>> +       __le32 num_rf_chains;
>>>
>>>
>>> Hmm.. some of these values don't really change once driver is loaded
>>> so we could probably just export them as separate debugfs entries but
>>> perhaps that's an overkill just for dumping.
>>
>>
>> I think it will be easier for all involved if there is a single dump image
>> that
>> has all needed info.  Likely the end user of the dump file will have little
>> or no
>> interaction with the host that dumped the file to begin with.
>
> I think there's been this idea about moving dumping to udev somewhat,
> no? 

Yeah. But I'm thinking that should just signal user space that there's a
new crash dump available, nothing else.

> Since this is just debugfs then I imagine you could have a userspace
> program that would create the single blob/crash report from things it
> thinks is important, e.g.. `uname -a`, debugfs entries (fw stack
> traces, dbglog, etc), recent kernel log buffer, etc. It could even all
> be stored in plaintext (with binary data encoded as a hexdump). But
> that, I guess, could be cumbersome, at least initially, until all
> major distros adapt.

Sure, we can push lots of this to user space. But what's the point?

With the current approach we get a self-contained crash dump which the
user (be it a real person or automated) can store with a simple file
copy operatation and can be easily post-processed afterwards. What you
propose is a lot of work for everyone, most likely even more bugs and
without any clear benefits. At least the memory consumption should be
the same (if not even increase due to increased complexity).
Kalle Valo Aug. 20, 2014, 6:56 a.m. UTC | #6
Kalle Valo <kvalo@qca.qualcomm.com> writes:

>> Since this is just debugfs then I imagine you could have a userspace
>> program that would create the single blob/crash report from things it
>> thinks is important, e.g.. `uname -a`, debugfs entries (fw stack
>> traces, dbglog, etc), recent kernel log buffer, etc. It could even all
>> be stored in plaintext (with binary data encoded as a hexdump). But
>> that, I guess, could be cumbersome, at least initially, until all
>> major distros adapt.
>
> Sure, we can push lots of this to user space. But what's the point?
>
> With the current approach we get a self-contained crash dump which the
> user (be it a real person or automated) can store with a simple file
> copy operatation and can be easily post-processed afterwards. What you
> propose is a lot of work for everyone, most likely even more bugs and
> without any clear benefits. At least the memory consumption should be
> the same (if not even increase due to increased complexity).

I forgot to mention that also I would not want to rely on debugfs except
at the moment to deliver the dump to user space. I'm hoping that we
would get a generic firmware crash dump support to cfg80211 (hopefully
we can talk about that in Dusseldorf) and we might switch to ethtool or
some another user space interface. Then it might be possible to enable
this even without debugfs, if we are happy to take the hit on memory
use.
Kalle Valo Aug. 20, 2014, 7:29 a.m. UTC | #7
Ben Greear <greearb@candelatech.com> writes:

>>> +       /* Gather dbg-log */
>>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
>>> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>
>> Hmm should this really be sizeof()? Not next_idx (which effectively
>> defines number of bytes of the dbglog)?
>
> I haven't tried decoding this yet, but we may need to know the next_idx
> to decode this properly.

I really don't like the idea of having untested code in ath10k. Buggy
code is okay (preferably document the known bugs when submitting code),
but untested code is not.
Kalle Valo Aug. 20, 2014, 7:36 a.m. UTC | #8
Ben Greear <greearb@candelatech.com> writes:

> This series has taken a long time already, so it would be fine with me
> to deal with the dbglog cleanup in subsequent patches, and just assume
> for now that the dbglog messages start at index 0 in the dbglog-entry-data.

I also would like to get this support finally in. As the basic
infrastructure seems to be mostly ok, I'm planning to split the patchset
into two and the first containing just the basic infrastructure and
register dump:

	ATH10K_FW_CRASH_DUMP_REGDUMP = 1,

Which I'll actually rename to ATH10K_FW_CRASH_DUMP_REGISTERS. And I'll
also fix the endian problem Michal pointed out.

And then we can later add the rest, once they are properly tested and
the endian issues are solved:

	ATH10K_FW_CRASH_DUMP_DBGLOG = 0,
	ATH10K_FW_CRASH_DUMP_STACK = 2,
	ATH10K_FW_CRASH_DUMP_EXC_STACK = 3,
	ATH10K_FW_CRASH_DUMP_RAM_BSS = 4,
	ATH10K_FW_CRASH_DUMP_ROM_BSS = 5,

And as we use TLV format this split doesn't break anything in user
space. How does that sound?
Ben Greear Aug. 20, 2014, 1:08 p.m. UTC | #9
On 08/20/2014 12:29 AM, Kalle Valo wrote:
> Ben Greear <greearb@candelatech.com> writes:
>
>>>> +       /* Gather dbg-log */
>>>> +       dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
>>>> +       dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
>>>> +       dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>>
>>> Hmm should this really be sizeof()? Not next_idx (which effectively
>>> defines number of bytes of the dbglog)?
>>
>> I haven't tried decoding this yet, but we may need to know the next_idx
>> to decode this properly.
>
> I really don't like the idea of having untested code in ath10k. Buggy
> code is okay (preferably document the known bugs when submitting code),
> but untested code is not.

It would be quite a waste of time to keep re-writing a user-space dump tool
while the patch set is in this much churn, I think.

It's not like ath10k is going to blow up, but maybe the decode is less useful
in that we cannot actually get a useful debuglog decode until the problem is
resolved.  We get zero firmware dbglog messages currently, so nothing is lost.

When at least the first patch or two is in, and endian issue is resolved,
I'll write a decode tool (or modify the one I have that parses /var/log/messages
to deal with the new binary dump file).

Thanks,
Ben
Ben Greear Aug. 20, 2014, 1:13 p.m. UTC | #10
>>> Hmm should this really be sizeof()? Not next_idx (which effectively
>>> defines number of bytes of the dbglog)?
>>
>>
>> I haven't tried decoding this yet, but we may need to know the next_idx
>> to decode this properly.
>
> So basically a (length, payload) encoding would be necessary here
> instead of a single stream, right?

As long as you know the starting point for a message, you can decode the
rest of the message(s) as a stream of 32 bit ints.  The current code that
I wrote is a circular buffer, so after first wrap, we lose an idea of where
to start reading.

Thanks,
Ben
Kalle Valo Aug. 20, 2014, 2:19 p.m. UTC | #11
Ben Greear <greearb@candelatech.com> writes:

> On 08/20/2014 12:29 AM, Kalle Valo wrote:
>> Ben Greear <greearb@candelatech.com> writes:
>>
>>>>> + /* Gather dbg-log */ + dump_tlv = (struct ath10k_tlv_dump_data
>>>>> *)(buf + sofar); + dump_tlv->type =
>>>>> cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG); + dump_tlv->tlv_len =
>>>>> cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>>> Hmm should this really be sizeof()? Not next_idx (which
>>>> effectively defines number of bytes of the dbglog)?
>>> I haven't tried decoding this yet, but we may need to know the
>>> next_idx to decode this properly.
>> I really don't like the idea of having untested code in ath10k.
>> Buggy code is okay (preferably document the known bugs when
>> submitting code), but untested code is not.
>
> It would be quite a waste of time to keep re-writing a user-space dump
> tool while the patch set is in this much churn, I think.

> It's not like ath10k is going to blow up, but maybe the decode is less
> useful in that we cannot actually get a useful debuglog decode until
> the problem is resolved. We get zero firmware dbglog messages
> currently, so nothing is lost.

Of course ath10k won't blow up, but every patch and every line of code
increases the maintenance cost. For example, if I had known that the
dbglog dump in your patchset doesn't work at all, I would have dropped
it immediately and saved a lot of time of everyone's time, especially
mine.

So next time you send me untested code PLEASE clearly mention that. I
assume that the developer has tested the submitted code well enough that
it really works. Hoping it to work is not enough! Unless you are
Johannes, Linus or Davem :)
Ben Greear Aug. 20, 2014, 2:52 p.m. UTC | #12
On 08/20/2014 07:19 AM, Kalle Valo wrote:
> Ben Greear <greearb@candelatech.com> writes:
>
>> On 08/20/2014 12:29 AM, Kalle Valo wrote:
>>> Ben Greear <greearb@candelatech.com> writes:
>>>
>>>>>> + /* Gather dbg-log */ + dump_tlv = (struct ath10k_tlv_dump_data
>>>>>> *)(buf + sofar); + dump_tlv->type =
>>>>>> cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG); + dump_tlv->tlv_len =
>>>>>> cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
>>>>> Hmm should this really be sizeof()? Not next_idx (which
>>>>> effectively defines number of bytes of the dbglog)?
>>>> I haven't tried decoding this yet, but we may need to know the
>>>> next_idx to decode this properly.
>>> I really don't like the idea of having untested code in ath10k.
>>> Buggy code is okay (preferably document the known bugs when
>>> submitting code), but untested code is not.
>>
>> It would be quite a waste of time to keep re-writing a user-space dump
>> tool while the patch set is in this much churn, I think.
>
>> It's not like ath10k is going to blow up, but maybe the decode is less
>> useful in that we cannot actually get a useful debuglog decode until
>> the problem is resolved. We get zero firmware dbglog messages
>> currently, so nothing is lost.
>
> Of course ath10k won't blow up, but every patch and every line of code
> increases the maintenance cost. For example, if I had known that the
> dbglog dump in your patchset doesn't work at all, I would have dropped
> it immediately and saved a lot of time of everyone's time, especially
> mine.
>
> So next time you send me untested code PLEASE clearly mention that. I
> assume that the developer has tested the submitted code well enough that
> it really works. Hoping it to work is not enough! Unless you are
> Johannes, Linus or Davem :)

I know that I mentioned I have not actually written the decode tool.
I did test my original patches to make sure they properly dumped
a data file and did not crash the kernel or otherwise muck up the
driver.

And, actually I think I could make a decode tool able to decode the
dbglog as written, especially if the next-idx is included...the messages
are normalized enough that I can just read ints until I find something
that is a legit start of message.  Not super clean, but kernel would
remain simpler.

I think we would make more progress if we were slightly less picky on
style issues and details for patches, and just make more
improvements after the initial patches go in.  It would certainly
make it easier to do things like write and test and improve the decode
tool.

Thanks,
Ben
diff mbox

Patch

diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index cca6060dbf30..09c0a109368a 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -22,6 +22,8 @@ 
 #include <linux/if_ether.h>
 #include <linux/types.h>
 #include <linux/pci.h>
+#include <linux/uuid.h>
+#include <linux/time.h>
 
 #include "htt.h"
 #include "htc.h"
@@ -278,6 +280,29 @@  struct ath10k_vif_iter {
 	struct ath10k_vif *arvif;
 };
 
+/* This will store at least the last 128 entries.  Each dbglog message
+ * is a max of 7 32-bit integers in length, but the length can be less
+ * than that as well.
+ */
+#define ATH10K_DBGLOG_DATA_LEN (128 * 7 * sizeof(u32))
+
+struct ath10k_dbglog_entry_storage {
+	/* where to write next chunk of data */
+	u32 next_idx;
+
+	u8 data[ATH10K_DBGLOG_DATA_LEN];
+};
+
+/* used for crash-dump storage, protected by data-lock */
+struct ath10k_fw_crash_data {
+	bool crashed_since_read;
+
+	uuid_le uuid;
+	struct timespec timestamp;
+	struct ath10k_dbglog_entry_storage dbglog_entry_data;
+	u32 reg_dump_values[REG_DUMP_COUNT_QCA988X];
+};
+
 struct ath10k_debug {
 	struct dentry *debugfs_phy;
 
@@ -295,6 +320,8 @@  struct ath10k_debug {
 
 	u8 htt_max_amsdu;
 	u8 htt_max_ampdu;
+
+	struct ath10k_fw_crash_data *fw_crash_data;
 };
 
 enum ath10k_state {
diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index df1abe7f1fef..8c3f9acd6492 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -17,6 +17,9 @@ 
 
 #include <linux/module.h>
 #include <linux/debugfs.h>
+#include <linux/version.h>
+#include <linux/vermagic.h>
+#include <linux/vmalloc.h>
 
 #include "core.h"
 #include "debug.h"
@@ -24,6 +27,87 @@ 
 /* ms */
 #define ATH10K_DEBUG_HTT_STATS_INTERVAL 1000
 
+#define ATH10K_FW_CRASH_DUMP_VERSION 1
+
+/**
+ * enum ath10k_fw_crash_dump_type - types of data in the dump file
+ * @ATH10K_FW_CRASH_DUMP_DBGLOG:  Recent firmware debug log entries
+ * @ATH10K_FW_CRASH_DUMP_REGDUMP: Register crash dump in binary format
+ */
+enum ath10k_fw_crash_dump_type {
+	ATH10K_FW_CRASH_DUMP_DBGLOG = 0,
+	ATH10K_FW_CRASH_DUMP_REGDUMP = 1,
+
+	ATH10K_FW_CRASH_DUMP_MAX,
+};
+
+struct ath10k_tlv_dump_data {
+	/* see ath10k_fw_crash_dump_type above */
+	__le32 type;
+
+	/* in bytes */
+	__le32 tlv_len;
+
+	/* pad to 32-bit boundaries as needed */
+	u8 tlv_data[];
+} __packed;
+
+struct ath10k_dump_file_data {
+	/* dump file information */
+
+	/* "ATH10K-FW-DUMP" */
+	char df_magic[16];
+
+	__le32 len;
+
+	/* file dump version */
+	__le32 version;
+
+	/* some info we can get from ath10k struct that might help */
+
+	u8 uuid[16];
+
+	__le32 chip_id;
+
+	/* 0 for now, in place for later hardware */
+	__le32 bus_type;
+
+	__le32 target_version;
+	__le32 fw_version_major;
+	__le32 fw_version_minor;
+	__le32 fw_version_release;
+	__le32 fw_version_build;
+	__le32 phy_capability;
+	__le32 hw_min_tx_power;
+	__le32 hw_max_tx_power;
+	__le32 ht_cap_info;
+	__le32 vht_cap_info;
+	__le32 num_rf_chains;
+
+	/* firmware version string */
+	char fw_ver[ETHTOOL_FWVERS_LEN];
+
+	/* Kernel related information */
+
+	/* time-of-day stamp */
+	__le64 tv_sec;
+
+	/* time-of-day stamp, nano-seconds */
+	__le64 tv_nsec;
+
+	/* LINUX_VERSION_CODE */
+	__le32 kernel_ver_code;
+
+	/* VERMAGIC_STRING */
+	char kernel_ver[64];
+
+	/* room for growth w/out changing binary format */
+	u8 unused[128];
+
+	/* struct ath10k_tlv_dump_data + more */
+	u8 data[0];
+} __packed;
+
 static int ath10k_printk(const char *level, const char *fmt, ...)
 {
 	struct va_format vaf;
@@ -588,6 +672,177 @@  static const struct file_operations fops_chip_id = {
 	.llseek = default_llseek,
 };
 
+struct ath10k_fw_crash_data *
+ath10k_debug_get_new_fw_crash_data(struct ath10k *ar)
+{
+	struct ath10k_fw_crash_data *crash_data = ar->debug.fw_crash_data;
+
+	lockdep_assert_held(&ar->data_lock);
+
+	crash_data->crashed_since_read = true;
+	uuid_le_gen(&crash_data->uuid);
+	getnstimeofday(&crash_data->timestamp);
+
+	return crash_data;
+}
+EXPORT_SYMBOL(ath10k_debug_get_new_fw_crash_data);
+
+void ath10k_debug_dbglog_add(struct ath10k *ar, u8 *buffer, int len)
+{
+	struct ath10k_fw_crash_data *crash_data = ar->debug.fw_crash_data;
+	int i, z;
+
+	lockdep_assert_held(&ar->data_lock);
+
+	z = crash_data->dbglog_entry_data.next_idx;
+
+	for (i = 0; i < len; i++) {
+		crash_data->dbglog_entry_data.data[z] = buffer[i];
+		z++;
+		if (z >= ATH10K_DBGLOG_DATA_LEN)
+			z = 0;
+	}
+
+	crash_data->dbglog_entry_data.next_idx = z;
+}
+EXPORT_SYMBOL(ath10k_debug_dbglog_add);
+
+static struct ath10k_dump_file_data *ath10k_build_dump_file(struct ath10k *ar)
+{
+	struct ath10k_fw_crash_data *crash_data = ar->debug.fw_crash_data;
+	struct ath10k_dump_file_data *dump_data;
+	struct ath10k_tlv_dump_data *dump_tlv;
+	int hdr_len = sizeof(*dump_data);
+	unsigned int len, sofar = 0;
+	unsigned char *buf;
+
+	lockdep_assert_held(&ar->conf_mutex);
+
+	len = hdr_len;
+	len += sizeof(*dump_tlv) + sizeof(crash_data->reg_dump_values);
+	len += sizeof(*dump_tlv) + sizeof(crash_data->dbglog_entry_data);
+
+	sofar += hdr_len;
+
+	/* This is going to get big when we start dumping FW RAM and such,
+	 * so go ahead and use vmalloc.
+	 */
+	buf = vzalloc(len);
+	if (!buf)
+		return NULL;
+
+	spin_lock_bh(&ar->data_lock);
+
+	if (!crash_data->crashed_since_read) {
+		spin_unlock_bh(&ar->data_lock);
+		vfree(buf);
+		return NULL;
+	}
+
+	dump_data = (struct ath10k_dump_file_data *)(buf);
+	strlcpy(dump_data->df_magic, "ATH10K-FW-DUMP",
+		sizeof(dump_data->df_magic));
+	dump_data->len = cpu_to_le32(len);
+
+	dump_data->version = cpu_to_le32(ATH10K_FW_CRASH_DUMP_VERSION);
+
+	memcpy(dump_data->uuid, &crash_data->uuid, sizeof(dump_data->uuid));
+	dump_data->chip_id = cpu_to_le32(ar->chip_id);
+	dump_data->bus_type = cpu_to_le32(0);
+	dump_data->target_version = cpu_to_le32(ar->target_version);
+	dump_data->fw_version_major = cpu_to_le32(ar->fw_version_major);
+	dump_data->fw_version_minor = cpu_to_le32(ar->fw_version_minor);
+	dump_data->fw_version_release = cpu_to_le32(ar->fw_version_release);
+	dump_data->fw_version_build = cpu_to_le32(ar->fw_version_build);
+	dump_data->phy_capability = cpu_to_le32(ar->phy_capability);
+	dump_data->hw_min_tx_power = cpu_to_le32(ar->hw_min_tx_power);
+	dump_data->hw_max_tx_power = cpu_to_le32(ar->hw_max_tx_power);
+	dump_data->ht_cap_info = cpu_to_le32(ar->ht_cap_info);
+	dump_data->vht_cap_info = cpu_to_le32(ar->vht_cap_info);
+	dump_data->num_rf_chains = cpu_to_le32(ar->num_rf_chains);
+
+	strlcpy(dump_data->fw_ver, ar->hw->wiphy->fw_version,
+		sizeof(dump_data->fw_ver));
+
+	dump_data->kernel_ver_code = cpu_to_le32(LINUX_VERSION_CODE);
+	strlcpy(dump_data->kernel_ver, VERMAGIC_STRING,
+		sizeof(dump_data->kernel_ver));
+
+	dump_data->tv_sec = cpu_to_le64(crash_data->timestamp.tv_sec);
+	dump_data->tv_nsec = cpu_to_le64(crash_data->timestamp.tv_nsec);
+
+	/* Gather dbg-log */
+	dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
+	dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_DBGLOG);
+	dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->dbglog_entry_data));
+	memcpy(dump_tlv->tlv_data, &crash_data->dbglog_entry_data,
+	       sizeof(crash_data->dbglog_entry_data));
+	sofar += sizeof(*dump_tlv) + sizeof(crash_data->dbglog_entry_data);
+
+	/* Gather crash-dump */
+	dump_tlv = (struct ath10k_tlv_dump_data *)(buf + sofar);
+	dump_tlv->type = cpu_to_le32(ATH10K_FW_CRASH_DUMP_REGDUMP);
+	dump_tlv->tlv_len = cpu_to_le32(sizeof(crash_data->reg_dump_values));
+	memcpy(dump_tlv->tlv_data, &crash_data->reg_dump_values,
+	       sizeof(crash_data->reg_dump_values));
+	sofar += sizeof(*dump_tlv) + sizeof(crash_data->reg_dump_values);
+
+	ar->debug.fw_crash_data->crashed_since_read = false;
+
+	spin_unlock_bh(&ar->data_lock);
+
+	return dump_data;
+}
+
+static int ath10k_fw_crash_dump_open(struct inode *inode, struct file *file)
+{
+	struct ath10k *ar = inode->i_private;
+	struct ath10k_dump_file_data *dump;
+	int ret;
+
+	mutex_lock(&ar->conf_mutex);
+
+	dump = ath10k_build_dump_file(ar);
+	if (!dump) {
+		ret = -ENODATA;
+		goto out;
+	}
+
+	file->private_data = dump;
+	ret = 0;
+
+out:
+	mutex_unlock(&ar->conf_mutex);
+	return ret;
+}
+
+static ssize_t ath10k_fw_crash_dump_read(struct file *file,
+					 char __user *user_buf,
+					 size_t count, loff_t *ppos)
+{
+	struct ath10k_dump_file_data *dump_file = file->private_data;
+
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       dump_file,
+				       le32_to_cpu(dump_file->len));
+}
+
+static int ath10k_fw_crash_dump_release(struct inode *inode,
+					struct file *file)
+{
+	vfree(file->private_data);
+
+	return 0;
+}
+
+static const struct file_operations fops_fw_crash_dump = {
+	.open = ath10k_fw_crash_dump_open,
+	.read = ath10k_fw_crash_dump_read,
+	.release = ath10k_fw_crash_dump_release,
+	.owner = THIS_MODULE,
+	.llseek = default_llseek,
+};
+
 static int ath10k_debug_htt_stats_req(struct ath10k *ar)
 {
 	u64 cookie;
@@ -921,11 +1176,20 @@  static const struct file_operations fops_dfs_stats = {
 
 int ath10k_debug_create(struct ath10k *ar)
 {
+	int ret;
+
+	ar->debug.fw_crash_data = vzalloc(sizeof(*ar->debug.fw_crash_data));
+	if (!ar->debug.fw_crash_data) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ar->debug.debugfs_phy = debugfs_create_dir("ath10k",
 						   ar->hw->wiphy->debugfsdir);
-
-	if (!ar->debug.debugfs_phy)
-		return -ENOMEM;
+	if (!ar->debug.debugfs_phy) {
+		ret = -ENOMEM;
+		goto err_free_fw_crash_data;
+	}
 
 	INIT_DELAYED_WORK(&ar->debug.htt_stats_dwork,
 			  ath10k_debug_htt_stats_dwork);
@@ -941,6 +1205,9 @@  int ath10k_debug_create(struct ath10k *ar)
 	debugfs_create_file("simulate_fw_crash", S_IRUSR, ar->debug.debugfs_phy,
 			    ar, &fops_simulate_fw_crash);
 
+	debugfs_create_file("fw_crash_dump", S_IRUSR, ar->debug.debugfs_phy,
+			    ar, &fops_fw_crash_dump);
+
 	debugfs_create_file("chip_id", S_IRUSR, ar->debug.debugfs_phy,
 			    ar, &fops_chip_id);
 
@@ -969,10 +1236,17 @@  int ath10k_debug_create(struct ath10k *ar)
 	}
 
 	return 0;
+
+err_free_fw_crash_data:
+	vfree(ar->debug.fw_crash_data);
+
+err:
+	return ret;
 }
 
 void ath10k_debug_destroy(struct ath10k *ar)
 {
+	vfree(ar->debug.fw_crash_data);
 	cancel_delayed_work_sync(&ar->debug.htt_stats_dwork);
 }
 
diff --git a/drivers/net/wireless/ath/ath10k/debug.h b/drivers/net/wireless/ath/ath10k/debug.h
index a5824990bd2a..80ff14e4db9b 100644
--- a/drivers/net/wireless/ath/ath10k/debug.h
+++ b/drivers/net/wireless/ath/ath10k/debug.h
@@ -53,6 +53,10 @@  void ath10k_debug_read_service_map(struct ath10k *ar,
 				   size_t map_size);
 void ath10k_debug_read_target_stats(struct ath10k *ar,
 				    struct wmi_stats_event *ev);
+struct ath10k_fw_crash_data *
+ath10k_debug_get_new_fw_crash_data(struct ath10k *ar);
+
+void ath10k_debug_dbglog_add(struct ath10k *ar, u8 *buffer, int len);
 
 #define ATH10K_DFS_STAT_INC(ar, c) (ar->debug.dfs_stats.c++)
 
@@ -86,6 +90,17 @@  static inline void ath10k_debug_read_target_stats(struct ath10k *ar,
 {
 }
 
+static inline void ath10k_debug_dbglog_add(struct ath10k *ar, u8 *buffer,
+					   int len)
+{
+}
+
+static inline struct ath10k_fw_crash_data *
+ath10k_debug_get_new_fw_crash_data(struct ath10k *ar)
+{
+	return NULL;
+}
+
 #define ATH10K_DFS_STAT_INC(ar, c) do { } while (0)
 
 #endif /* CONFIG_ATH10K_DEBUGFS */
@@ -96,6 +111,7 @@  __printf(2, 3) void ath10k_dbg(enum ath10k_debug_mask mask,
 void ath10k_dbg_dump(enum ath10k_debug_mask mask,
 		     const char *msg, const char *prefix,
 		     const void *buf, size_t len);
+
 #else /* CONFIG_ATH10K_DEBUG */
 
 static inline int ath10k_dbg(enum ath10k_debug_mask dbg_mask,
@@ -109,5 +125,11 @@  static inline void ath10k_dbg_dump(enum ath10k_debug_mask mask,
 				   const void *buf, size_t len)
 {
 }
+
+static inline void ath10k_dbg_save_fw_dbg_buffer(struct ath10k *ar,
+						 u8 *buffer, int len)
+{
+}
 #endif /* CONFIG_ATH10K_DEBUG */
+
 #endif /* _DEBUG_H_ */
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index ffd04890407e..c931cce17063 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -39,6 +39,8 @@ 
 /* includes also the null byte */
 #define ATH10K_FIRMWARE_MAGIC               "QCA-ATH10K"
 
+#define REG_DUMP_COUNT_QCA988X 60
+
 struct ath10k_fw_ie {
 	__le32 id;
 	__le32 len;
@@ -362,4 +364,31 @@  enum ath10k_mcast2ucast_mode {
 
 #define RTC_STATE_V_GET(x) (((x) & RTC_STATE_V_MASK) >> RTC_STATE_V_LSB)
 
+
+/* Target debug log related defines and structs */
+
+/* Target is 32-bit CPU, so we just use u32 for
+ * the pointers.  The memory space is relative to the
+ * target, not the host.
+ */
+struct ath10k_fw_dbglog_buf {
+	/* pointer to dblog_buf_s */
+	u32 next;
+
+	/* pointer to u8 buffer */
+	u32 buffer;
+
+	u32 bufsize;
+	u32 length;
+	u32 count;
+	u32 free;
+} __packed;
+
+struct ath10k_fw_dbglog_hdr {
+	/* pointer to dbglog_buf_s */
+	u32 dbuf;
+
+	u32 dropped;
+} __packed;
+
 #endif /* _HW_H_ */
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index b4ae4bb3fc09..41ff620e1af2 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -832,16 +832,103 @@  static u16 ath10k_pci_hif_get_free_queue_number(struct ath10k *ar, u8 pipe)
 	return ath10k_ce_num_free_src_entries(ar_pci->pipe_info[pipe].ce_hdl);
 }
 
-static void ath10k_pci_hif_dump_area(struct ath10k *ar)
+static void ath10k_pci_dump_dbglog(struct ath10k *ar,
+				   struct ath10k_fw_crash_data *crash_data)
+{
+	struct ath10k_fw_dbglog_hdr dbg_hdr;
+	struct ath10k_fw_dbglog_buf dbuf;
+	u8 *buffer;
+	int ret, i;
+	u32 dbufp;
+
+	lockdep_assert_held(&ar->data_lock);
+
+	/* dump the debug logs on the target */
+	ret = ath10k_pci_diag_read_hi(ar, &dbg_hdr,
+				      hi_dbglog_hdr, sizeof(dbg_hdr));
+	if (ret != 0) {
+		ath10k_warn("failed to dump debug log area from hi_dbglog_hdr: %d\n",
+			    ret);
+		return;
+	}
+
+	ath10k_dbg(ATH10K_DBG_PCI,
+		   "pci dbglog header dbuf 0x%x dropped %i\n",
+		   dbg_hdr.dbuf, dbg_hdr.dropped);
+
+	/* pointer in target memory space */
+	dbufp = dbg_hdr.dbuf;
+
+	/* i is for logging purposes and sanity check in case firmware buffers
+	 * are corrupted and will not properly terminate the list.
+	 * In standard firmware, it appears there are no more than 2
+	 * buffers, so 10 should be safe upper limit even if firmware
+	 * changes quite a bit.
+	 */
+	i = 0;
+	while (dbufp && i < 10) {
+		ret = ath10k_pci_diag_read_mem(ar, dbufp,
+					       &dbuf, sizeof(dbuf));
+		if (ret != 0) {
+			ath10k_warn("failed to read debug log area from address 0x%x: %d\n",
+				    dbufp, ret);
+			return;
+		}
+
+		/* we have a buffer of data */
+		ath10k_dbg(ATH10K_DBG_PCI,
+			   "pci dbglog [%i] next 0x%x buf 0x%x size %i len %i count %i free %i\n",
+			   i, dbuf.next, dbuf.buffer, dbuf.bufsize, dbuf.length,
+			   dbuf.count, dbuf.free);
+		if (dbuf.buffer == 0 || dbuf.length == 0)
+			goto next;
+
+		/* Pick arbitrary upper bound in case firmware is corrupted for
+		 * whatever reason.
+		 */
+		if (dbuf.length > 16000) {
+			ath10k_warn("firmware debug log buffer length is out of bounds: %d\n",
+				    dbuf.length);
+			/* do not trust the next pointer either... */
+			return;
+		}
+
+		buffer = kmalloc(dbuf.length, GFP_ATOMIC);
+
+		if (!buffer)
+			goto next;
+
+		ret = ath10k_pci_diag_read_mem(ar, dbuf.buffer, buffer,
+					       dbuf.length);
+		if (ret != 0) {
+			ath10k_warn("failed to read debug log buffer from address 0x%x: %d\n",
+				    dbuf.buffer, ret);
+			kfree(buffer);
+			return;
+		}
+
+		ath10k_debug_dbglog_add(ar, buffer, dbuf.length);
+		kfree(buffer);
+
+next:
+		dbufp = dbuf.next;
+		if (dbufp == dbg_hdr.dbuf) {
+			/* It is a circular buffer it seems, bail if next
+			 * is head.
+			 */
+			break;
+		}
+		i++;
+	} /* while we have a debug buffer to read */
+}
+
+static void ath10k_pci_dump_registers(struct ath10k *ar,
+				      struct ath10k_fw_crash_data *crash_data)
 {
-	u32 reg_dump_values[REG_DUMP_COUNT_QCA988X] = {};
+	u32 i, reg_dump_values[REG_DUMP_COUNT_QCA988X] = {};
 	int ret;
-	u32 i;
 
-	ath10k_err("firmware crashed!\n");
-	ath10k_err("hardware name %s version 0x%x\n",
-		   ar->hw_params.name, ar->target_version);
-	ath10k_err("firmware version: %s\n", ar->hw->wiphy->fw_version);
+	lockdep_assert_held(&ar->data_lock);
 
 	ret = ath10k_pci_diag_read_hi(ar, &reg_dump_values[0],
 				      hi_failure_state,
@@ -862,6 +949,38 @@  static void ath10k_pci_hif_dump_area(struct ath10k *ar)
 			   reg_dump_values[i + 2],
 			   reg_dump_values[i + 3]);
 
+	memcpy(crash_data->reg_dump_values, reg_dump_values,
+	       sizeof(crash_data->reg_dump_values));
+}
+
+static void ath10k_pci_hif_dump_area(struct ath10k *ar)
+{
+	struct ath10k_fw_crash_data *crash_data;
+	char uuid[50];
+
+	spin_lock_bh(&ar->data_lock);
+
+	crash_data = ath10k_debug_get_new_fw_crash_data(ar);
+
+	if (crash_data)
+		scnprintf(uuid, sizeof(uuid), "%pUl", &crash_data->uuid);
+	else
+		scnprintf(uuid, sizeof(uuid), "n/a");
+
+	ath10k_err("firmware crashed! (uuid %s)\n", uuid);
+	ath10k_err("hardware name %s version 0x%x\n",
+		   ar->hw_params.name, ar->target_version);
+	ath10k_err("firmware version: %s\n", ar->hw->wiphy->fw_version);
+
+	if (!crash_data)
+		goto exit;
+
+	ath10k_pci_dump_registers(ar, crash_data);
+	ath10k_pci_dump_dbglog(ar, crash_data);
+
+exit:
+	spin_unlock_bh(&ar->data_lock);
+
 	queue_work(ar->workqueue, &ar->restart_work);
 }
 
diff --git a/drivers/net/wireless/ath/ath10k/pci.h b/drivers/net/wireless/ath/ath10k/pci.h
index 662bca3922ef..294a72e01909 100644
--- a/drivers/net/wireless/ath/ath10k/pci.h
+++ b/drivers/net/wireless/ath/ath10k/pci.h
@@ -23,9 +23,6 @@ 
 #include "hw.h"
 #include "ce.h"
 
-/* FW dump area */
-#define REG_DUMP_COUNT_QCA988X 60
-
 /*
  * maximum number of bytes that can be handled atomically by DiagRead/DiagWrite
  */