diff mbox series

[v1] Bluetooth: hci_qca: Bug fixes while collecting controller memory dump

Message ID 1580832929-2067-1-git-send-email-gubbaven@codeaurora.org (mailing list archive)
State Changes Requested
Delegated to: Marcel Holtmann
Headers show
Series [v1] Bluetooth: hci_qca: Bug fixes while collecting controller memory dump | expand

Commit Message

Venkata Lakshmi Narayana Gubba Feb. 4, 2020, 4:15 p.m. UTC
This patch will fix the below issues
   1.Fixed race conditions while accessing memory dump state flags.
   2.Updated with actual context of timer in hci_memdump_timeout()
   3.Updated injecting hardware error event if the dumps failed to receive.
   4.Once timeout is triggered, stopping the memory dump collections.

Fixes: d841502c79e3 ("Bluetooth: hci_qca: Collect controller memory dump during SSR")
Reported-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Venkata Lakshmi Narayana Gubba <gubbaven@codeaurora.org>
---
 drivers/bluetooth/hci_qca.c | 104 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 14 deletions(-)

Comments

Abhishek Pandit-Subedi Feb. 4, 2020, 11:10 p.m. UTC | #1
Hi Venkata,

Per our earlier review on chromium gerrit:
https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1992966

I'm not too keen on the change from mutex to spinlock because it's
made the code more complex.

Also, it has been a couple weeks since my last review and I've lost
the context of what order of events are supposed to happen (making
reviewing the sequencing hard).

Good case:

Memdump event from firmware
Some number of memdump events with seq #
Hw error event
Reset

Timeout case:

Memdump event from firmware
Some number of memdump events with seq #
Timeout schedules hw_error_event
hw_error_event clears the memdump activity
reset

Software memdump:

hw_error_event sends memdump command to firmware and waits for completion
memdump event with seq#
hw error event
reset

Does this look right? Could you add this to either the commit message
or as a comment in one of the functions so that it's easier to
understand what is the expected order of events.

On Tue, Feb 4, 2020 at 8:16 AM Venkata Lakshmi Narayana Gubba
<gubbaven@codeaurora.org> wrote:
>
> This patch will fix the below issues
>    1.Fixed race conditions while accessing memory dump state flags.
>    2.Updated with actual context of timer in hci_memdump_timeout()
>    3.Updated injecting hardware error event if the dumps failed to receive.
>    4.Once timeout is triggered, stopping the memory dump collections.
>
> Fixes: d841502c79e3 ("Bluetooth: hci_qca: Collect controller memory dump during SSR")
> Reported-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
> Signed-off-by: Venkata Lakshmi Narayana Gubba <gubbaven@codeaurora.org>
> ---
>  drivers/bluetooth/hci_qca.c | 104 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 90 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
> index eacc65b..ea956c3 100644
> --- a/drivers/bluetooth/hci_qca.c
> +++ b/drivers/bluetooth/hci_qca.c
> @@ -69,7 +69,8 @@ enum qca_flags {
>         QCA_IBS_ENABLED,
>         QCA_DROP_VENDOR_EVENT,
>         QCA_SUSPENDING,
> -       QCA_MEMDUMP_COLLECTION
> +       QCA_MEMDUMP_COLLECTION,
> +       QCA_HW_ERROR_EVENT
>  };
>
>
> @@ -150,6 +151,7 @@ struct qca_data {
>         struct completion drop_ev_comp;
>         wait_queue_head_t suspend_wait_q;
>         enum qca_memdump_states memdump_state;
> +       spinlock_t hci_memdump_lock;
In an earlier revision of this patch, you had this lock as a mutex.
Why change it from mutex to spinlock_t? I think this has made your
change more complex since you have to unlock during the middle of an
operation more often (i.e. since it can block)
>
>         /* For debugging purpose */
>         u64 ibs_sent_wacks;
> @@ -524,19 +526,19 @@ static void hci_ibs_wake_retrans_timeout(struct timer_list *t)
>
>  static void hci_memdump_timeout(struct timer_list *t)
>  {
> -       struct qca_data *qca = from_timer(qca, t, tx_idle_timer);
> +       struct qca_data *qca = from_timer(qca, t, memdump_timer);
>         struct hci_uart *hu = qca->hu;
> -       struct qca_memdump_data *qca_memdump = qca->qca_memdump;
> -       char *memdump_buf = qca_memdump->memdump_buf_tail;
> +       unsigned long flags;
>
> -       bt_dev_err(hu->hdev, "clearing allocated memory due to memdump timeout");
> -       /* Inject hw error event to reset the device and driver. */
> -       hci_reset_dev(hu->hdev);
> -       vfree(memdump_buf);
> -       kfree(qca_memdump);
> +       spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>         qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
> +       if (!test_bit(QCA_HW_ERROR_EVENT, &qca->flags)) {
> +               /* Inject hw error event to reset the device and driver. */
> +               hci_reset_dev(hu->hdev);
> +       }
> +
>         del_timer(&qca->memdump_timer);
> -       cancel_work_sync(&qca->ctrl_memdump_evt);
> +       spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);

Missing cancel_work_sync (not sure if intentional but it's included in
qca_wait_for_dump_collection so it should probably be here too)

>  }
>
>  /* Initialize protocol */
> @@ -558,6 +560,7 @@ static int qca_open(struct hci_uart *hu)
>         skb_queue_head_init(&qca->tx_wait_q);
>         skb_queue_head_init(&qca->rx_memdump_q);
>         spin_lock_init(&qca->hci_ibs_lock);
> +       spin_lock_init(&qca->hci_memdump_lock);
>         qca->workqueue = alloc_ordered_workqueue("qca_wq", 0);
>         if (!qca->workqueue) {
>                 BT_ERR("QCA Workqueue not initialized properly");
> @@ -960,14 +963,25 @@ static void qca_controller_memdump(struct work_struct *work)
>         char nullBuff[QCA_DUMP_PACKET_SIZE] = { 0 };
>         u16 seq_no;
>         u32 dump_size;
> +       unsigned long flags;
>
>         while ((skb = skb_dequeue(&qca->rx_memdump_q))) {
>
> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
> +               /* Skip processing the received packets if timeout detected. */
> +               if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
> +                       return;
> +               }
> +
>                 if (!qca_memdump) {
>                         qca_memdump = kzalloc(sizeof(struct qca_memdump_data),
>                                               GFP_ATOMIC);
> -                       if (!qca_memdump)
> +                       if (!qca_memdump) {
> +                               spin_unlock_irqrestore(&qca->hci_memdump_lock,
> +                                                               flags);
>                                 return;
> +                       }
>
>                         qca->qca_memdump = qca_memdump;
>                 }
> @@ -992,6 +1006,8 @@ static void qca_controller_memdump(struct work_struct *work)
>                         if (!(dump_size)) {
>                                 bt_dev_err(hu->hdev, "Rx invalid memdump size");
>                                 kfree_skb(skb);
> +                               spin_unlock_irqrestore(&qca->hci_memdump_lock,
> +                                                       flags);
>                                 return;
>                         }
>
> @@ -1001,7 +1017,24 @@ static void qca_controller_memdump(struct work_struct *work)
>                                   msecs_to_jiffies(MEMDUMP_TIMEOUT_MS)));
>
>                         skb_pull(skb, sizeof(dump_size));
> +
> +                       /* vmalloc() might go to sleep while trying to allocate
> +                        * memory.As calling sleep function under spin lock is
> +                        * not allowed so unlocking spin lock and will be locked
> +                        * again after vmalloc().
> +                        */
> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>                         memdump_buf = vmalloc(dump_size);
> +                       spin_lock_irqsave(&qca->hci_memdump_lock, flags);
> +                       /* Skip processing the received packets if timeout
> +                        * detected.
> +                        */
> +                       if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
> +                               spin_unlock_irqrestore(&qca->hci_memdump_lock,
> +                                                       flags);
> +                               return;
> +                       }
> +
>                         qca_memdump->memdump_buf_head = memdump_buf;
>                         qca_memdump->memdump_buf_tail = memdump_buf;
>                 }
> @@ -1016,6 +1049,7 @@ static void qca_controller_memdump(struct work_struct *work)
>                         kfree(qca_memdump);
>                         kfree_skb(skb);
>                         qca->qca_memdump = NULL;
> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>                         return;
>                 }
>
> @@ -1044,18 +1078,37 @@ static void qca_controller_memdump(struct work_struct *work)
>                         bt_dev_info(hu->hdev, "QCA writing crash dump of size %d bytes",
>                                    qca_memdump->received_dump);
>                         memdump_buf = qca_memdump->memdump_buf_head;
> +
> +                       /* dev_coredumpv() might go to sleep.As calling sleep
> +                        * function under spin lock is not allowed so unlocking
> +                        * spin lock and will be locked again after
> +                        * dev_coredumpv().
> +                        */
> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock,
> +                                               flags);
>                         dev_coredumpv(&hu->serdev->dev, memdump_buf,
>                                       qca_memdump->received_dump, GFP_KERNEL);
> +                       spin_lock_irqsave(&qca->hci_memdump_lock, flags);
> +                       if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
> +                               spin_unlock_irqrestore(&qca->hci_memdump_lock,
> +                                                       flags);
> +                               return;
> +                       }
> +
>                         del_timer(&qca->memdump_timer);
>                         kfree(qca->qca_memdump);
>                         qca->qca_memdump = NULL;
>                         qca->memdump_state = QCA_MEMDUMP_COLLECTED;
> +                       clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags);
>                 }
> +
> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>         }
>
>  }
>
> -int qca_controller_memdump_event(struct hci_dev *hdev, struct sk_buff *skb)
> +static int qca_controller_memdump_event(struct hci_dev *hdev,
> +                                       struct sk_buff *skb)
>  {
>         struct hci_uart *hu = hci_get_drvdata(hdev);
>         struct qca_data *qca = hu->priv;
> @@ -1408,19 +1461,25 @@ static void qca_wait_for_dump_collection(struct hci_dev *hdev)
>         struct qca_data *qca = hu->priv;
>         struct qca_memdump_data *qca_memdump = qca->qca_memdump;
>         char *memdump_buf = NULL;
> +       unsigned long flags;
>
>         wait_on_bit_timeout(&qca->flags, QCA_MEMDUMP_COLLECTION,
>                             TASK_UNINTERRUPTIBLE, MEMDUMP_TIMEOUT_MS);
>
>         clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags);
> -       if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
> +       if (qca->memdump_state == QCA_MEMDUMP_IDLE ||
> +           qca->memdump_state == QCA_MEMDUMP_COLLECTING) {
>                 bt_dev_err(hu->hdev, "Clearing the buffers due to timeout");
> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>                 if (qca_memdump)
> -                       memdump_buf = qca_memdump->memdump_buf_tail;
> +                       memdump_buf = qca_memdump->memdump_buf_head;
>                 vfree(memdump_buf);
>                 kfree(qca_memdump);
> +               qca->qca_memdump = NULL;
>                 qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
>                 del_timer(&qca->memdump_timer);
> +               skb_queue_purge(&qca->rx_memdump_q);
> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>                 cancel_work_sync(&qca->ctrl_memdump_evt);
>         }
>  }
> @@ -1429,7 +1488,11 @@ static void qca_hw_error(struct hci_dev *hdev, u8 code)
>  {
>         struct hci_uart *hu = hci_get_drvdata(hdev);
>         struct qca_data *qca = hu->priv;
> +       struct qca_memdump_data *qca_memdump = qca->qca_memdump;
> +       char *memdump_buf = NULL;
> +       unsigned long flags;
>
> +       set_bit(QCA_HW_ERROR_EVENT, &qca->flags);
>         bt_dev_info(hdev, "mem_dump_status: %d", qca->memdump_state);
>
>         if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
> @@ -1448,7 +1511,20 @@ static void qca_hw_error(struct hci_dev *hdev, u8 code)
>                  */
>                 bt_dev_info(hdev, "waiting for dump to complete");
>                 qca_wait_for_dump_collection(hdev);
> +       } else if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
> +               bt_dev_err(hu->hdev, "clearing allocated memory due to memdump timeout");
> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
> +               if (qca_memdump)
> +                       memdump_buf = qca_memdump->memdump_buf_head;
> +               vfree(memdump_buf);
> +               kfree(qca_memdump);
> +               qca->qca_memdump = NULL;
> +               skb_queue_purge(&qca->rx_memdump_q);
> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
> +               cancel_work_sync(&qca->ctrl_memdump_evt);
>         }
> +
> +       clear_bit(QCA_HW_ERROR_EVENT, &qca->flags);
>  }
>
>  static void qca_cmd_timeout(struct hci_dev *hdev)
> --
> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
> of Code Aurora Forum, hosted by The Linux Foundation
>
Marcel Holtmann Feb. 5, 2020, 8:03 a.m. UTC | #2
Hi Abhishek,

> Per our earlier review on chromium gerrit:
> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1992966
> 
> I'm not too keen on the change from mutex to spinlock because it's
> made the code more complex.
> 
> Also, it has been a couple weeks since my last review and I've lost
> the context of what order of events are supposed to happen (making
> reviewing the sequencing hard).
> 
> Good case:
> 
> Memdump event from firmware
> Some number of memdump events with seq #
> Hw error event
> Reset
> 
> Timeout case:
> 
> Memdump event from firmware
> Some number of memdump events with seq #
> Timeout schedules hw_error_event
> hw_error_event clears the memdump activity
> reset
> 
> Software memdump:
> 
> hw_error_event sends memdump command to firmware and waits for completion
> memdump event with seq#
> hw error event
> reset
> 
> Does this look right? Could you add this to either the commit message
> or as a comment in one of the functions so that it's easier to
> understand what is the expected order of events.
> 
> On Tue, Feb 4, 2020 at 8:16 AM Venkata Lakshmi Narayana Gubba
> <gubbaven@codeaurora.org> wrote:
>> 
>> This patch will fix the below issues
>>   1.Fixed race conditions while accessing memory dump state flags.
>>   2.Updated with actual context of timer in hci_memdump_timeout()
>>   3.Updated injecting hardware error event if the dumps failed to receive.
>>   4.Once timeout is triggered, stopping the memory dump collections.
>> 
>> Fixes: d841502c79e3 ("Bluetooth: hci_qca: Collect controller memory dump during SSR")
>> Reported-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
>> Signed-off-by: Venkata Lakshmi Narayana Gubba <gubbaven@codeaurora.org>
>> ---
>> drivers/bluetooth/hci_qca.c | 104 ++++++++++++++++++++++++++++++++++++++------
>> 1 file changed, 90 insertions(+), 14 deletions(-)
>> 
>> diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
>> index eacc65b..ea956c3 100644
>> --- a/drivers/bluetooth/hci_qca.c
>> +++ b/drivers/bluetooth/hci_qca.c
>> @@ -69,7 +69,8 @@ enum qca_flags {
>>        QCA_IBS_ENABLED,
>>        QCA_DROP_VENDOR_EVENT,
>>        QCA_SUSPENDING,
>> -       QCA_MEMDUMP_COLLECTION
>> +       QCA_MEMDUMP_COLLECTION,
>> +       QCA_HW_ERROR_EVENT
>> };
>> 
>> 
>> @@ -150,6 +151,7 @@ struct qca_data {
>>        struct completion drop_ev_comp;
>>        wait_queue_head_t suspend_wait_q;
>>        enum qca_memdump_states memdump_state;
>> +       spinlock_t hci_memdump_lock;
> In an earlier revision of this patch, you had this lock as a mutex.
> Why change it from mutex to spinlock_t? I think this has made your
> change more complex since you have to unlock during the middle of an
> operation more often (i.e. since it can block)

I agree that we should try to keep a mutex since all event processing in Bluetooth core happens in a workqueue anyway.

Regards

Marcel
Venkata Lakshmi Narayana Gubba Feb. 5, 2020, 12:48 p.m. UTC | #3
Hi Abhishek,

On 2020-02-05 04:40, Abhishek Pandit-Subedi wrote:
> Hi Venkata,
> 
> Per our earlier review on chromium gerrit:
> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1992966
> 
> I'm not too keen on the change from mutex to spinlock because it's
> made the code more complex.
[Venkata] :

We have moved from mutex to spinlock as timer callback function is 
getting executed under interrupt context and not under process context.

> 
> Also, it has been a couple weeks since my last review and I've lost
> the context of what order of events are supposed to happen (making
> reviewing the sequencing hard).
> 
> Good case:
> 
> Memdump event from firmware
> Some number of memdump events with seq #
> Hw error event
> Reset
> 
> Timeout case:
> 
> Memdump event from firmware
> Some number of memdump events with seq #
> Timeout schedules hw_error_event
> hw_error_event clears the memdump activity
> reset
> 
> Software memdump:
> 
> hw_error_event sends memdump command to firmware and waits for 
> completion
> memdump event with seq#
> hw error event
> reset
> 
> Does this look right? Could you add this to either the commit message
> or as a comment in one of the functions so that it's easier to
> understand what is the expected order of events.

[Venkata] :

The scenarios you have mentioned look right.In case of time out scenario
hw error event is injected if it is not received already.
Sure we will update the scenarios information in next patch set.

> 
> On Tue, Feb 4, 2020 at 8:16 AM Venkata Lakshmi Narayana Gubba
> <gubbaven@codeaurora.org> wrote:
>> 
>> This patch will fix the below issues
>>    1.Fixed race conditions while accessing memory dump state flags.
>>    2.Updated with actual context of timer in hci_memdump_timeout()
>>    3.Updated injecting hardware error event if the dumps failed to 
>> receive.
>>    4.Once timeout is triggered, stopping the memory dump collections.
>> 
>> Fixes: d841502c79e3 ("Bluetooth: hci_qca: Collect controller memory 
>> dump during SSR")
>> Reported-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
>> Signed-off-by: Venkata Lakshmi Narayana Gubba 
>> <gubbaven@codeaurora.org>
>> ---
>>  drivers/bluetooth/hci_qca.c | 104 
>> ++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 90 insertions(+), 14 deletions(-)
>> 
>> diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
>> index eacc65b..ea956c3 100644
>> --- a/drivers/bluetooth/hci_qca.c
>> +++ b/drivers/bluetooth/hci_qca.c
>> @@ -69,7 +69,8 @@ enum qca_flags {
>>         QCA_IBS_ENABLED,
>>         QCA_DROP_VENDOR_EVENT,
>>         QCA_SUSPENDING,
>> -       QCA_MEMDUMP_COLLECTION
>> +       QCA_MEMDUMP_COLLECTION,
>> +       QCA_HW_ERROR_EVENT
>>  };
>> 
>> 
>> @@ -150,6 +151,7 @@ struct qca_data {
>>         struct completion drop_ev_comp;
>>         wait_queue_head_t suspend_wait_q;
>>         enum qca_memdump_states memdump_state;
>> +       spinlock_t hci_memdump_lock;
> In an earlier revision of this patch, you had this lock as a mutex.
> Why change it from mutex to spinlock_t? I think this has made your
> change more complex since you have to unlock during the middle of an
> operation more often (i.e. since it can block)

[Venkata] :

We have created a scenario to trigger timer callback function 
hci_memdump_timeout()
and we have observed that kernel is going to panic state with mutex 
implementation.
We got the following error.It seems timer callback function is getting 
executed
in interuupt context and not under process context.After moving to 
spinlock we dont
see these issues.

[  182.203510] BUG: sleeping function called from invalid context at 
/mnt/host/source/src/third_party/kernel/v5.4/kernel/locking/mutex.c:281

[  182.216198] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 0, 
name: swapper/3

[  182.224460] ------------[ cut here ]------------

[  182.229254] kernel BUG at 
/mnt/host/source/src/third_party/kernel/v5.4/kernel/sched/core.c:6787!

[  182.238331] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP

[  182.411931] Call trace:

[  182.414489]  ___might_sleep+0x11c/0x120

[  182.418442]  __might_sleep+0x50/0x84

[  182.422127]  mutex_lock+0x28/0x60

[  182.425554]  hci_memdump_timeout+0x50/0x90 [hci_uart]

[  182.430755]  call_timer_fn+0xc4/0x1a8

[  182.434532]  __run_timers+0x214/0x2c0

[  182.438298]  run_timer_softirq+0x24/0x44

[  182.442343]  __do_softirq+0x164/0x30c

[  182.446123]  irq_exit+0xa0/0xa4

[  182.449361]  __handle_domain_irq+0x8c/0xc4

[  182.453572]  gic_handle_irq+0xd0/0x178

[  182.457434]  el1_irq+0xb8/0x180

[  182.460710]  cpuidle_enter_state+0xf8/0x27c

[  182.465059]  cpuidle_enter+0x38/0x4c

[  182.468778]  do_idle+0x1a4/0x290

[  182.472140]  cpu_startup_entry+0x24/0x28

[  182.476223]  secondary_start_kernel+0x148/0x154

[  182.480934] Code: 913ba400 9400a9f4 d53b4228 12190109 (d4210000)

[  182.487244] ---[ end trace 8d7b29f94068a7cf ]---

[  182.523162] Kernel panic - not syncing: Fatal exception in interrupt


>> 
>>         /* For debugging purpose */
>>         u64 ibs_sent_wacks;
>> @@ -524,19 +526,19 @@ static void hci_ibs_wake_retrans_timeout(struct 
>> timer_list *t)
>> 
>>  static void hci_memdump_timeout(struct timer_list *t)
>>  {
>> -       struct qca_data *qca = from_timer(qca, t, tx_idle_timer);
>> +       struct qca_data *qca = from_timer(qca, t, memdump_timer);
>>         struct hci_uart *hu = qca->hu;
>> -       struct qca_memdump_data *qca_memdump = qca->qca_memdump;
>> -       char *memdump_buf = qca_memdump->memdump_buf_tail;
>> +       unsigned long flags;
>> 
>> -       bt_dev_err(hu->hdev, "clearing allocated memory due to memdump 
>> timeout");
>> -       /* Inject hw error event to reset the device and driver. */
>> -       hci_reset_dev(hu->hdev);
>> -       vfree(memdump_buf);
>> -       kfree(qca_memdump);
>> +       spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>>         qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
>> +       if (!test_bit(QCA_HW_ERROR_EVENT, &qca->flags)) {
>> +               /* Inject hw error event to reset the device and 
>> driver. */
>> +               hci_reset_dev(hu->hdev);
>> +       }
>> +
>>         del_timer(&qca->memdump_timer);
>> -       cancel_work_sync(&qca->ctrl_memdump_evt);
>> +       spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
> 
> Missing cancel_work_sync (not sure if intentional but it's included in
> qca_wait_for_dump_collection so it should probably be here too)
[Venkata] :

As timer callback function is running under interrupt context
and cancel_work_sync() might go to sleep, we got kernel panic
for calling this function in timer callback function.
So we have moved cancel_work_sync() to qca_hw_error() and handled
under QCA_MEMDUMP_TIMEOUT scenario.

> 
>>  }
>> 
>>  /* Initialize protocol */
>> @@ -558,6 +560,7 @@ static int qca_open(struct hci_uart *hu)
>>         skb_queue_head_init(&qca->tx_wait_q);
>>         skb_queue_head_init(&qca->rx_memdump_q);
>>         spin_lock_init(&qca->hci_ibs_lock);
>> +       spin_lock_init(&qca->hci_memdump_lock);
>>         qca->workqueue = alloc_ordered_workqueue("qca_wq", 0);
>>         if (!qca->workqueue) {
>>                 BT_ERR("QCA Workqueue not initialized properly");
>> @@ -960,14 +963,25 @@ static void qca_controller_memdump(struct 
>> work_struct *work)
>>         char nullBuff[QCA_DUMP_PACKET_SIZE] = { 0 };
>>         u16 seq_no;
>>         u32 dump_size;
>> +       unsigned long flags;
>> 
>>         while ((skb = skb_dequeue(&qca->rx_memdump_q))) {
>> 
>> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>> +               /* Skip processing the received packets if timeout 
>> detected. */
>> +               if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
>> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, 
>> flags);
>> +                       return;
>> +               }
>> +
>>                 if (!qca_memdump) {
>>                         qca_memdump = kzalloc(sizeof(struct 
>> qca_memdump_data),
>>                                               GFP_ATOMIC);
>> -                       if (!qca_memdump)
>> +                       if (!qca_memdump) {
>> +                               
>> spin_unlock_irqrestore(&qca->hci_memdump_lock,
>> +                                                               
>> flags);
>>                                 return;
>> +                       }
>> 
>>                         qca->qca_memdump = qca_memdump;
>>                 }
>> @@ -992,6 +1006,8 @@ static void qca_controller_memdump(struct 
>> work_struct *work)
>>                         if (!(dump_size)) {
>>                                 bt_dev_err(hu->hdev, "Rx invalid 
>> memdump size");
>>                                 kfree_skb(skb);
>> +                               
>> spin_unlock_irqrestore(&qca->hci_memdump_lock,
>> +                                                       flags);
>>                                 return;
>>                         }
>> 
>> @@ -1001,7 +1017,24 @@ static void qca_controller_memdump(struct 
>> work_struct *work)
>>                                   
>> msecs_to_jiffies(MEMDUMP_TIMEOUT_MS)));
>> 
>>                         skb_pull(skb, sizeof(dump_size));
>> +
>> +                       /* vmalloc() might go to sleep while trying to 
>> allocate
>> +                        * memory.As calling sleep function under spin 
>> lock is
>> +                        * not allowed so unlocking spin lock and will 
>> be locked
>> +                        * again after vmalloc().
>> +                        */
>> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, 
>> flags);
>>                         memdump_buf = vmalloc(dump_size);
>> +                       spin_lock_irqsave(&qca->hci_memdump_lock, 
>> flags);
>> +                       /* Skip processing the received packets if 
>> timeout
>> +                        * detected.
>> +                        */
>> +                       if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) 
>> {
>> +                               
>> spin_unlock_irqrestore(&qca->hci_memdump_lock,
>> +                                                       flags);
>> +                               return;
>> +                       }
>> +
>>                         qca_memdump->memdump_buf_head = memdump_buf;
>>                         qca_memdump->memdump_buf_tail = memdump_buf;
>>                 }
>> @@ -1016,6 +1049,7 @@ static void qca_controller_memdump(struct 
>> work_struct *work)
>>                         kfree(qca_memdump);
>>                         kfree_skb(skb);
>>                         qca->qca_memdump = NULL;
>> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock, 
>> flags);
>>                         return;
>>                 }
>> 
>> @@ -1044,18 +1078,37 @@ static void qca_controller_memdump(struct 
>> work_struct *work)
>>                         bt_dev_info(hu->hdev, "QCA writing crash dump 
>> of size %d bytes",
>>                                    qca_memdump->received_dump);
>>                         memdump_buf = qca_memdump->memdump_buf_head;
>> +
>> +                       /* dev_coredumpv() might go to sleep.As 
>> calling sleep
>> +                        * function under spin lock is not allowed so 
>> unlocking
>> +                        * spin lock and will be locked again after
>> +                        * dev_coredumpv().
>> +                        */
>> +                       spin_unlock_irqrestore(&qca->hci_memdump_lock,
>> +                                               flags);
>>                         dev_coredumpv(&hu->serdev->dev, memdump_buf,
>>                                       qca_memdump->received_dump, 
>> GFP_KERNEL);
>> +                       spin_lock_irqsave(&qca->hci_memdump_lock, 
>> flags);
>> +                       if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) 
>> {
>> +                               
>> spin_unlock_irqrestore(&qca->hci_memdump_lock,
>> +                                                       flags);
>> +                               return;
>> +                       }
>> +
>>                         del_timer(&qca->memdump_timer);
>>                         kfree(qca->qca_memdump);
>>                         qca->qca_memdump = NULL;
>>                         qca->memdump_state = QCA_MEMDUMP_COLLECTED;
>> +                       clear_bit(QCA_MEMDUMP_COLLECTION, 
>> &qca->flags);
>>                 }
>> +
>> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>>         }
>> 
>>  }
>> 
>> -int qca_controller_memdump_event(struct hci_dev *hdev, struct sk_buff 
>> *skb)
>> +static int qca_controller_memdump_event(struct hci_dev *hdev,
>> +                                       struct sk_buff *skb)
>>  {
>>         struct hci_uart *hu = hci_get_drvdata(hdev);
>>         struct qca_data *qca = hu->priv;
>> @@ -1408,19 +1461,25 @@ static void 
>> qca_wait_for_dump_collection(struct hci_dev *hdev)
>>         struct qca_data *qca = hu->priv;
>>         struct qca_memdump_data *qca_memdump = qca->qca_memdump;
>>         char *memdump_buf = NULL;
>> +       unsigned long flags;
>> 
>>         wait_on_bit_timeout(&qca->flags, QCA_MEMDUMP_COLLECTION,
>>                             TASK_UNINTERRUPTIBLE, MEMDUMP_TIMEOUT_MS);
>> 
>>         clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags);
>> -       if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
>> +       if (qca->memdump_state == QCA_MEMDUMP_IDLE ||
>> +           qca->memdump_state == QCA_MEMDUMP_COLLECTING) {
>>                 bt_dev_err(hu->hdev, "Clearing the buffers due to 
>> timeout");
>> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>>                 if (qca_memdump)
>> -                       memdump_buf = qca_memdump->memdump_buf_tail;
>> +                       memdump_buf = qca_memdump->memdump_buf_head;
>>                 vfree(memdump_buf);
>>                 kfree(qca_memdump);
>> +               qca->qca_memdump = NULL;
>>                 qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
>>                 del_timer(&qca->memdump_timer);
>> +               skb_queue_purge(&qca->rx_memdump_q);
>> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>>                 cancel_work_sync(&qca->ctrl_memdump_evt);
>>         }
>>  }
>> @@ -1429,7 +1488,11 @@ static void qca_hw_error(struct hci_dev *hdev, 
>> u8 code)
>>  {
>>         struct hci_uart *hu = hci_get_drvdata(hdev);
>>         struct qca_data *qca = hu->priv;
>> +       struct qca_memdump_data *qca_memdump = qca->qca_memdump;
>> +       char *memdump_buf = NULL;
>> +       unsigned long flags;
>> 
>> +       set_bit(QCA_HW_ERROR_EVENT, &qca->flags);
>>         bt_dev_info(hdev, "mem_dump_status: %d", qca->memdump_state);
>> 
>>         if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
>> @@ -1448,7 +1511,20 @@ static void qca_hw_error(struct hci_dev *hdev, 
>> u8 code)
>>                  */
>>                 bt_dev_info(hdev, "waiting for dump to complete");
>>                 qca_wait_for_dump_collection(hdev);
>> +       } else if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
>> +               bt_dev_err(hu->hdev, "clearing allocated memory due to 
>> memdump timeout");
>> +               spin_lock_irqsave(&qca->hci_memdump_lock, flags);
>> +               if (qca_memdump)
>> +                       memdump_buf = qca_memdump->memdump_buf_head;
>> +               vfree(memdump_buf);
>> +               kfree(qca_memdump);
>> +               qca->qca_memdump = NULL;
>> +               skb_queue_purge(&qca->rx_memdump_q);
>> +               spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
>> +               cancel_work_sync(&qca->ctrl_memdump_evt);
>>         }
>> +
>> +       clear_bit(QCA_HW_ERROR_EVENT, &qca->flags);
>>  }
>> 
>>  static void qca_cmd_timeout(struct hci_dev *hdev)
>> --
>> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
>> member
>> of Code Aurora Forum, hosted by The Linux Foundation
>> 

Regards,
Venkata.
Marcel Holtmann Feb. 5, 2020, 12:52 p.m. UTC | #4
Hi Venkata,

>> Hi Venkata,
>> Per our earlier review on chromium gerrit:
>> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1992966
>> I'm not too keen on the change from mutex to spinlock because it's
>> made the code more complex.
> [Venkata] :
> 
> We have moved from mutex to spinlock as timer callback function is getting executed under interrupt context and not under process context.

can’t you use a delayed workqueue for this?

Regards

Marcel
diff mbox series

Patch

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index eacc65b..ea956c3 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -69,7 +69,8 @@  enum qca_flags {
 	QCA_IBS_ENABLED,
 	QCA_DROP_VENDOR_EVENT,
 	QCA_SUSPENDING,
-	QCA_MEMDUMP_COLLECTION
+	QCA_MEMDUMP_COLLECTION,
+	QCA_HW_ERROR_EVENT
 };
 
 
@@ -150,6 +151,7 @@  struct qca_data {
 	struct completion drop_ev_comp;
 	wait_queue_head_t suspend_wait_q;
 	enum qca_memdump_states memdump_state;
+	spinlock_t hci_memdump_lock;
 
 	/* For debugging purpose */
 	u64 ibs_sent_wacks;
@@ -524,19 +526,19 @@  static void hci_ibs_wake_retrans_timeout(struct timer_list *t)
 
 static void hci_memdump_timeout(struct timer_list *t)
 {
-	struct qca_data *qca = from_timer(qca, t, tx_idle_timer);
+	struct qca_data *qca = from_timer(qca, t, memdump_timer);
 	struct hci_uart *hu = qca->hu;
-	struct qca_memdump_data *qca_memdump = qca->qca_memdump;
-	char *memdump_buf = qca_memdump->memdump_buf_tail;
+	unsigned long flags;
 
-	bt_dev_err(hu->hdev, "clearing allocated memory due to memdump timeout");
-	/* Inject hw error event to reset the device and driver. */
-	hci_reset_dev(hu->hdev);
-	vfree(memdump_buf);
-	kfree(qca_memdump);
+	spin_lock_irqsave(&qca->hci_memdump_lock, flags);
 	qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
+	if (!test_bit(QCA_HW_ERROR_EVENT, &qca->flags)) {
+		/* Inject hw error event to reset the device and driver. */
+		hci_reset_dev(hu->hdev);
+	}
+
 	del_timer(&qca->memdump_timer);
-	cancel_work_sync(&qca->ctrl_memdump_evt);
+	spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
 }
 
 /* Initialize protocol */
@@ -558,6 +560,7 @@  static int qca_open(struct hci_uart *hu)
 	skb_queue_head_init(&qca->tx_wait_q);
 	skb_queue_head_init(&qca->rx_memdump_q);
 	spin_lock_init(&qca->hci_ibs_lock);
+	spin_lock_init(&qca->hci_memdump_lock);
 	qca->workqueue = alloc_ordered_workqueue("qca_wq", 0);
 	if (!qca->workqueue) {
 		BT_ERR("QCA Workqueue not initialized properly");
@@ -960,14 +963,25 @@  static void qca_controller_memdump(struct work_struct *work)
 	char nullBuff[QCA_DUMP_PACKET_SIZE] = { 0 };
 	u16 seq_no;
 	u32 dump_size;
+	unsigned long flags;
 
 	while ((skb = skb_dequeue(&qca->rx_memdump_q))) {
 
+		spin_lock_irqsave(&qca->hci_memdump_lock, flags);
+		/* Skip processing the received packets if timeout detected. */
+		if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
+			spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
+			return;
+		}
+
 		if (!qca_memdump) {
 			qca_memdump = kzalloc(sizeof(struct qca_memdump_data),
 					      GFP_ATOMIC);
-			if (!qca_memdump)
+			if (!qca_memdump) {
+				spin_unlock_irqrestore(&qca->hci_memdump_lock,
+								flags);
 				return;
+			}
 
 			qca->qca_memdump = qca_memdump;
 		}
@@ -992,6 +1006,8 @@  static void qca_controller_memdump(struct work_struct *work)
 			if (!(dump_size)) {
 				bt_dev_err(hu->hdev, "Rx invalid memdump size");
 				kfree_skb(skb);
+				spin_unlock_irqrestore(&qca->hci_memdump_lock,
+							flags);
 				return;
 			}
 
@@ -1001,7 +1017,24 @@  static void qca_controller_memdump(struct work_struct *work)
 				  msecs_to_jiffies(MEMDUMP_TIMEOUT_MS)));
 
 			skb_pull(skb, sizeof(dump_size));
+
+			/* vmalloc() might go to sleep while trying to allocate
+			 * memory.As calling sleep function under spin lock is
+			 * not allowed so unlocking spin lock and will be locked
+			 * again after vmalloc().
+			 */
+			spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
 			memdump_buf = vmalloc(dump_size);
+			spin_lock_irqsave(&qca->hci_memdump_lock, flags);
+			/* Skip processing the received packets if timeout
+			 * detected.
+			 */
+			if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
+				spin_unlock_irqrestore(&qca->hci_memdump_lock,
+							flags);
+				return;
+			}
+
 			qca_memdump->memdump_buf_head = memdump_buf;
 			qca_memdump->memdump_buf_tail = memdump_buf;
 		}
@@ -1016,6 +1049,7 @@  static void qca_controller_memdump(struct work_struct *work)
 			kfree(qca_memdump);
 			kfree_skb(skb);
 			qca->qca_memdump = NULL;
+			spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
 			return;
 		}
 
@@ -1044,18 +1078,37 @@  static void qca_controller_memdump(struct work_struct *work)
 			bt_dev_info(hu->hdev, "QCA writing crash dump of size %d bytes",
 				   qca_memdump->received_dump);
 			memdump_buf = qca_memdump->memdump_buf_head;
+
+			/* dev_coredumpv() might go to sleep.As calling sleep
+			 * function under spin lock is not allowed so unlocking
+			 * spin lock and will be locked again after
+			 * dev_coredumpv().
+			 */
+			spin_unlock_irqrestore(&qca->hci_memdump_lock,
+						flags);
 			dev_coredumpv(&hu->serdev->dev, memdump_buf,
 				      qca_memdump->received_dump, GFP_KERNEL);
+			spin_lock_irqsave(&qca->hci_memdump_lock, flags);
+			if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
+				spin_unlock_irqrestore(&qca->hci_memdump_lock,
+							flags);
+				return;
+			}
+
 			del_timer(&qca->memdump_timer);
 			kfree(qca->qca_memdump);
 			qca->qca_memdump = NULL;
 			qca->memdump_state = QCA_MEMDUMP_COLLECTED;
+			clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags);
 		}
+
+		spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
 	}
 
 }
 
-int qca_controller_memdump_event(struct hci_dev *hdev, struct sk_buff *skb)
+static int qca_controller_memdump_event(struct hci_dev *hdev,
+					struct sk_buff *skb)
 {
 	struct hci_uart *hu = hci_get_drvdata(hdev);
 	struct qca_data *qca = hu->priv;
@@ -1408,19 +1461,25 @@  static void qca_wait_for_dump_collection(struct hci_dev *hdev)
 	struct qca_data *qca = hu->priv;
 	struct qca_memdump_data *qca_memdump = qca->qca_memdump;
 	char *memdump_buf = NULL;
+	unsigned long flags;
 
 	wait_on_bit_timeout(&qca->flags, QCA_MEMDUMP_COLLECTION,
 			    TASK_UNINTERRUPTIBLE, MEMDUMP_TIMEOUT_MS);
 
 	clear_bit(QCA_MEMDUMP_COLLECTION, &qca->flags);
-	if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
+	if (qca->memdump_state == QCA_MEMDUMP_IDLE ||
+	    qca->memdump_state == QCA_MEMDUMP_COLLECTING) {
 		bt_dev_err(hu->hdev, "Clearing the buffers due to timeout");
+		spin_lock_irqsave(&qca->hci_memdump_lock, flags);
 		if (qca_memdump)
-			memdump_buf = qca_memdump->memdump_buf_tail;
+			memdump_buf = qca_memdump->memdump_buf_head;
 		vfree(memdump_buf);
 		kfree(qca_memdump);
+		qca->qca_memdump = NULL;
 		qca->memdump_state = QCA_MEMDUMP_TIMEOUT;
 		del_timer(&qca->memdump_timer);
+		skb_queue_purge(&qca->rx_memdump_q);
+		spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
 		cancel_work_sync(&qca->ctrl_memdump_evt);
 	}
 }
@@ -1429,7 +1488,11 @@  static void qca_hw_error(struct hci_dev *hdev, u8 code)
 {
 	struct hci_uart *hu = hci_get_drvdata(hdev);
 	struct qca_data *qca = hu->priv;
+	struct qca_memdump_data *qca_memdump = qca->qca_memdump;
+	char *memdump_buf = NULL;
+	unsigned long flags;
 
+	set_bit(QCA_HW_ERROR_EVENT, &qca->flags);
 	bt_dev_info(hdev, "mem_dump_status: %d", qca->memdump_state);
 
 	if (qca->memdump_state == QCA_MEMDUMP_IDLE) {
@@ -1448,7 +1511,20 @@  static void qca_hw_error(struct hci_dev *hdev, u8 code)
 		 */
 		bt_dev_info(hdev, "waiting for dump to complete");
 		qca_wait_for_dump_collection(hdev);
+	} else if (qca->memdump_state == QCA_MEMDUMP_TIMEOUT) {
+		bt_dev_err(hu->hdev, "clearing allocated memory due to memdump timeout");
+		spin_lock_irqsave(&qca->hci_memdump_lock, flags);
+		if (qca_memdump)
+			memdump_buf = qca_memdump->memdump_buf_head;
+		vfree(memdump_buf);
+		kfree(qca_memdump);
+		qca->qca_memdump = NULL;
+		skb_queue_purge(&qca->rx_memdump_q);
+		spin_unlock_irqrestore(&qca->hci_memdump_lock, flags);
+		cancel_work_sync(&qca->ctrl_memdump_evt);
 	}
+
+	clear_bit(QCA_HW_ERROR_EVENT, &qca->flags);
 }
 
 static void qca_cmd_timeout(struct hci_dev *hdev)