
[13/18] scsi: target: Fix multiple LUN_RESET handling

Message ID 20230309223312.94595-14-michael.christie@oracle.com (mailing list archive)
State Superseded
Series target: TMF and recovery fixes

Commit Message

Mike Christie March 9, 2023, 10:33 p.m. UTC
This fixes a bug where an initiator thinks a LUN_RESET has cleaned
up running commands when it hasn't. The bug was added in:

commit 51ec502a3266 ("target: Delete tmr from list before processing")

The problem occurs when:

1. We have N IO cmds running in the target layer spread over 2 sessions.
2. The initiator sends a LUN_RESET for each session.
3. session1's LUN_RESET loops over all the running commands from both
sessions and moves them to its local drain_task_list.
4. session2's LUN_RESET does not see the LUN_RESET from session1 because
the commit above has it remove itself. session2 also does not see any
commands since the other reset moved them off the state lists.
5. session2's LUN_RESET will then complete with a successful response.
6. session2's initiator believes the running commands on its session are
now cleaned up due to the successful response and cleans up the running
commands from its side. It then restarts them.
7. The commands do eventually complete on the backend and the target
starts to return aborted task statuses for them. The initiator will
either throw an invalid ITT error or might accidentally look up a new task
if the ITT has been reallocated already.

This fixes the bug by reverting the patch, and also serializes the
execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
because it turns out the commit accidentally fixed a bug where if there
are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
put the other one on their local drain list, then end up waiting on each
other resulting in a deadlock.
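
To make the deadlock concrete, here is a minimal sketch (not the actual
kernel code; wait_for_tmr() is a hypothetical stand-in for the completion
wait) of what two concurrent resets were doing to each other:

static void drain_tmr_list_sketch(struct se_device *dev,
				  struct se_tmr_req *self)
{
	struct se_tmr_req *tmr_p, *tmr_pp;
	unsigned long flags;
	LIST_HEAD(drain_list);

	spin_lock_irqsave(&dev->se_tmr_lock, flags);
	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
		if (tmr_p == self)
			continue;
		/* Reset A moves reset B here while B does the same to A. */
		list_move_tail(&tmr_p->tmr_list, &drain_list);
	}
	spin_unlock_irqrestore(&dev->se_tmr_lock, flags);

	list_for_each_entry_safe(tmr_p, tmr_pp, &drain_list, tmr_list)
		wait_for_tmr(tmr_p);	/* A waits on B and B waits on A */
}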

Fixes: 51ec502a3266 ("target: Delete tmr from list before processing")
Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 drivers/target/target_core_device.c    | 15 ++++++--
 drivers/target/target_core_tmr.c       | 15 ++++----
 drivers/target/target_core_transport.c | 50 ++++++++++++++++++++++++--
 include/target/target_core_base.h      |  5 ++-
 4 files changed, 74 insertions(+), 11 deletions(-)

Comments

Dmitry Bogdanov March 15, 2023, 4:13 p.m. UTC | #1
On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
> 
> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
> up running commands when it hasn't. The bug was added in:
> 
> commit 51ec502a3266 ("target: Delete tmr from list before processing")
> 
> The problem occurs when:
> 
> 1. We have N IO cmds running in the target layer spread over 2 sessions.
> 2. The initiator sends a LUN_RESET for each session.
> 3. session1's LUN_RESET loops over all the running commands from both
> sessions and moves them to its local drain_task_list.
> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
> the commit above has it remove itself. session2 also does not see any
> commands since the other reset moved them off the state lists.
> 5. session2's LUN_RESET will then complete with a successful response.
> 6. session2's initiator believes the running commands on its session are
> now cleaned up due to the successful response and cleans up the running
> commands from its side. It then restarts them.
> 7. The commands do eventually complete on the backend and the target
> starts to return aborted task statuses for them. The initiator will
> either throw an invalid ITT error or might accidentally look up a new task
> if the ITT has been reallocated already.
> 
> This fixes the bug by reverting the patch, and also serializes the
> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
> because it turns out the commit accidentally fixed a bug where if there
> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
> put the other one on their local drain list, then end up waiting on each
> other resulting in a deadlock.

If a LUN_RESET is not in the TMR list anymore, there is no need to
serialize core_tmr_drain_tmr_list.

> 
> Fixes: 51ec502a3266 ("target: Delete tmr from list before processing")
> Signed-off-by: Mike Christie <michael.christie@oracle.com>
> ---
>  drivers/target/target_core_device.c    | 15 ++++++--
>  drivers/target/target_core_tmr.c       | 15 ++++----
>  drivers/target/target_core_transport.c | 50 ++++++++++++++++++++++++--
>  include/target/target_core_base.h      |  5 ++-
>  4 files changed, 74 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
> index f6e58410ec3f..c9f75ed1566b 100644
> --- a/drivers/target/target_core_device.c
> +++ b/drivers/target/target_core_device.c
> @@ -179,7 +179,16 @@ int transport_lookup_tmr_lun(struct se_cmd *se_cmd)
>         se_tmr->tmr_dev = rcu_dereference_raw(se_lun->lun_se_dev);
> 
>         spin_lock_irqsave(&se_tmr->tmr_dev->se_tmr_lock, flags);
> -       list_add_tail(&se_tmr->tmr_list, &se_tmr->tmr_dev->dev_tmr_list);
> +       switch (se_tmr->function) {
> +       case TMR_ABORT_TASK:
> +               list_add_tail(&se_tmr->tmr_list,
> +                             &se_tmr->tmr_dev->generic_tmr_list);
> +               break;
> +       case TMR_LUN_RESET:
> +               list_add_tail(&se_tmr->tmr_list,
> +                             &se_tmr->tmr_dev->lun_reset_tmr_list);
> +               break;
> +       }
>         spin_unlock_irqrestore(&se_tmr->tmr_dev->se_tmr_lock, flags);
> 
>         return 0;
> @@ -761,7 +770,8 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
>         dev->hba_index = hba->hba_index;
> 
>         INIT_LIST_HEAD(&dev->dev_sep_list);
> -       INIT_LIST_HEAD(&dev->dev_tmr_list);
> +       INIT_LIST_HEAD(&dev->generic_tmr_list);
> +       INIT_LIST_HEAD(&dev->lun_reset_tmr_list);
>         INIT_LIST_HEAD(&dev->delayed_cmd_list);
>         INIT_LIST_HEAD(&dev->qf_cmd_list);
>         spin_lock_init(&dev->delayed_cmd_lock);
> @@ -782,6 +792,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
>         spin_lock_init(&dev->t10_alua.lba_map_lock);
> 
>         INIT_WORK(&dev->delayed_cmd_work, target_do_delayed_work);
> +       mutex_init(&dev->lun_reset_mutex);
> 
>         dev->t10_wwn.t10_dev = dev;
>         /*
> diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
> index 2b95b4550a63..88d2a7839876 100644
> --- a/drivers/target/target_core_tmr.c
> +++ b/drivers/target/target_core_tmr.c
> @@ -184,13 +184,11 @@ static void core_tmr_drain_tmr_list(
>         unsigned long flags;
>         bool rc;
>         /*
> -        * Release all pending and outgoing TMRs aside from the received
> -        * LUN_RESET tmr..
> +        * Release all pending and outgoing TMRs except for LUN_RESETS.
>          */
>         spin_lock_irqsave(&dev->se_tmr_lock, flags);
> -       if (tmr)
> -               list_del_init(&tmr->tmr_list);
> -       list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
> +       list_for_each_entry_safe(tmr_p, tmr_pp, &dev->generic_tmr_list,
> +                                tmr_list) {
>                 cmd = tmr_p->task_cmd;
>                 if (!cmd) {
>                         pr_err("Unable to locate struct se_cmd for TMR\n");
> @@ -379,14 +377,19 @@ int core_tmr_lun_reset(
>                                 tmr_nacl->initiatorname);
>                 }
>         }
> +
> +       /* Serialize LUN RESET TMRs and preempt and aborts */
> +       mutex_lock(&dev->lun_reset_mutex);
> +
>         pr_debug("LUN_RESET: %s starting for [%s], tas: %d\n",
>                 (preempt_and_abort_list) ? "Preempt" : "TMR",
>                 dev->transport->name, tas);
> -
>         core_tmr_drain_tmr_list(dev, tmr, preempt_and_abort_list);
>         core_tmr_drain_state_list(dev, prout_cmd, tmr_sess, tas,
>                                 preempt_and_abort_list);
> 
> +       mutex_unlock(&dev->lun_reset_mutex);
> +
>         /*
>          * Clear any legacy SPC-2 reservation when called during
>          * LOGICAL UNIT RESET
> diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
> index 1c23079a5d7f..3c732b1b5389 100644
> --- a/drivers/target/target_core_transport.c
> +++ b/drivers/target/target_core_transport.c
> @@ -3574,6 +3574,7 @@ static void target_tmr_work(struct work_struct *work)
>         struct se_cmd *cmd = container_of(work, struct se_cmd, work);
>         struct se_device *dev = cmd->se_dev;
>         struct se_tmr_req *tmr = cmd->se_tmr_req;
> +       bool sched_reset = false;
>         int ret;
> 
>         if (cmd->transport_state & CMD_T_ABORTED)
> @@ -3596,6 +3597,22 @@ static void target_tmr_work(struct work_struct *work)
>                         target_dev_ua_allocate(dev, 0x29,
>                                                ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
>                 }
> +
> +               /*
> +                * If this is the last reset the device can be freed after we
> +                * run transport_cmd_check_stop_to_fabric. Figure out if there
> +                * are other resets that need to be scheduled while we know we
> +                * have a refcount on the device.
> +                */
> +               spin_lock_irq(&dev->se_tmr_lock);

tmr->tmr_list is removed from the list at the very end of the se_cmd
lifecycle, so any number of LUN_RESETs can be on lun_reset_tmr_list. All
of them can be finished but not yet removed from the list.
 
You could delete the lun_reset here while also nulling tmr->tmr_dev:
+			list_del_init(&cmd->se_tmr_req->tmr_list);
+			cmd->se_tmr_req->tmr_dev = NULL;

Then the check below will be just 
+			if (!list_empty(&dev->lun_reset_tmr_list))

> +               if (list_first_entry(&dev->lun_reset_tmr_list,
> +                                    struct se_tmr_req, tmr_list) !=
> +                   list_last_entry(&dev->lun_reset_tmr_list,
> +                                   struct se_tmr_req, tmr_list))
> +                       sched_reset = true;
> +               else
> +                       dev->dev_flags &= ~DF_RESETTING_LUN;
> +               spin_unlock_irq(&dev->se_tmr_lock);
>                 break;
>         case TMR_TARGET_WARM_RESET:
>                 tmr->response = TMR_FUNCTION_REJECTED;
> @@ -3617,15 +3634,26 @@ static void target_tmr_work(struct work_struct *work)
> 
>         transport_lun_remove_cmd(cmd);
>         transport_cmd_check_stop_to_fabric(cmd);
> +
> +       if (!sched_reset)
> +               return;
> +
> +       spin_lock_irq(&dev->se_tmr_lock);
> +       tmr = list_first_entry(&dev->lun_reset_tmr_list, struct se_tmr_req,
> +                              tmr_list);

And this list_first_entry will return the next LUN_RESET as you
expected.

> +       spin_unlock_irq(&dev->se_tmr_lock);
> +
> +       INIT_WORK(&tmr->task_cmd->work, target_tmr_work);
> +       schedule_work(&tmr->task_cmd->work);
>         return;
> 
>  aborted:
>         target_handle_abort(cmd);
>  }
> 
> -int transport_generic_handle_tmr(
> -       struct se_cmd *cmd)
> +int transport_generic_handle_tmr(struct se_cmd *cmd)
>  {
> +       struct se_device *dev = cmd->se_dev;
>         unsigned long flags;
>         bool aborted = false;
> 
> @@ -3646,8 +3674,26 @@ int transport_generic_handle_tmr(
>                 return 0;
>         }
> 
> +       spin_lock_irqsave(&dev->se_tmr_lock, flags);
> +       if (cmd->se_tmr_req->function == TMR_LUN_RESET) {
> +               /*
> +                * We only allow one reset to execute at a time to prevent
> +                * one reset waiting on another, and to make sure one reset
> +                * does not claim all the cmds causing the other reset to
> +                * return early.
> +                */
> +               if (dev->dev_flags & DF_RESETTING_LUN) {
> +                       spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
> +                       goto done;
> +               }
> +
> +               dev->dev_flags |= DF_RESETTING_LUN;

Not a good choice of flag variable. It is used at configuration time and
not under a lock. The configfs file dev/alias can be changed at any time
and could race with LUN_RESET.
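
Concretely (an illustrative interleaving only, not the actual code paths),
the unlocked read-modify-write on dev_flags can lose a bit:

  CPU0 (configfs, no lock)      CPU1 (under se_tmr_lock)
  tmp = dev->dev_flags;
                                dev->dev_flags |= DF_RESETTING_LUN;
  tmp |= DF_USING_ALIAS;
  dev->dev_flags = tmp;         /* DF_RESETTING_LUN is lost */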

> +       }
> +       spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
> +
>         INIT_WORK(&cmd->work, target_tmr_work);
>         schedule_work(&cmd->work);
> +done:
>         return 0;
>  }
>  EXPORT_SYMBOL(transport_generic_handle_tmr);
> diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
> index bd299790e99c..0a5b51f8e5e8 100644
> --- a/include/target/target_core_base.h
> +++ b/include/target/target_core_base.h
> @@ -804,6 +804,7 @@ struct se_device {
>  #define DF_USING_UDEV_PATH                     0x00000008
>  #define DF_USING_ALIAS                         0x00000010
>  #define DF_READ_ONLY                           0x00000020
> +#define DF_RESETTING_LUN                       0x00000040
>         u8                      transport_flags;
>         /* Physical device queue depth */
>         u32                     queue_depth;
> @@ -840,7 +841,8 @@ struct se_device {
>         /* Used for SPC-3 Persistent Reservations */
>         struct t10_pr_registration *dev_pr_res_holder;
>         struct list_head        dev_sep_list;
> -       struct list_head        dev_tmr_list;
> +       struct list_head        generic_tmr_list;
> +       struct list_head        lun_reset_tmr_list;
>         struct work_struct      qf_work_queue;
>         struct work_struct      delayed_cmd_work;
>         struct list_head        delayed_cmd_list;
> @@ -872,6 +874,7 @@ struct se_device {
>         struct rcu_head         rcu_head;
>         int                     queue_cnt;
>         struct se_device_queue  *queues;
> +       struct mutex            lun_reset_mutex;
>  };
> 
>  struct target_opcode_descriptor {
> --
> 2.31.1
> 
>
Mike Christie March 15, 2023, 4:44 p.m. UTC | #2
On 3/15/23 11:13 AM, Dmitry Bogdanov wrote:
> On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
>>
>> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
>> up running commands when it hasn't. The bug was added in:
>>
>> commit 51ec502a3266 ("target: Delete tmr from list before processing")
>>
>> The problem occurs when:
>>
>> 1. We have N IO cmds running in the target layer spread over 2 sessions.
>> 2. The initiator sends a LUN_RESET for each session.
>> 3. session1's LUN_RESET loops over all the running commands from both
>> sessions and moves them to its local drain_task_list.
>> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
>> the commit above has it remove itself. session2 also does not see any
>> commands since the other reset moved them off the state lists.
>> 5. session2's LUN_RESET will then complete with a successful response.
>> 6. session2's initiator believes the running commands on its session are
>> now cleaned up due to the successful response and cleans up the running
>> commands from its side. It then restarts them.
>> 7. The commands do eventually complete on the backend and the target
>> starts to return aborted task statuses for them. The initiator will
>> either throw an invalid ITT error or might accidentally look up a new task
>> if the ITT has been reallocated already.
>>
>> This fixes the bug by reverting the patch, and also serializes the
>> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
>> because it turns out the commit accidentally fixed a bug where if there
>> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
>> put the other one on their local drain list, then end up waiting on each
>> other resulting in a deadlock.
> 
> If a LUN_RESET is not in the TMR list anymore, there is no need to
> serialize core_tmr_drain_tmr_list.

Ah shoot yeah I miswrote that. I meant I needed the serialization for my
bug not yours.


>>
>>         if (cmd->transport_state & CMD_T_ABORTED)
>> @@ -3596,6 +3597,22 @@ static void target_tmr_work(struct work_struct *work)
>>                         target_dev_ua_allocate(dev, 0x29,
>>                                                ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
>>                 }
>> +
>> +               /*
>> +                * If this is the last reset the device can be freed after we
>> +                * run transport_cmd_check_stop_to_fabric. Figure out if there
>> +                * are other resets that need to be scheduled while we know we
>> +                * have a refcount on the device.
>> +                */
>> +               spin_lock_irq(&dev->se_tmr_lock);
> 
> tmr->tmr_list is removed from the list at the very end of the se_cmd
> lifecycle, so any number of LUN_RESETs can be on lun_reset_tmr_list. All
> of them can be finished but not yet removed from the list.

Don't we remove it from the list a little later in this function when
we call transport_lun_remove_cmd?

>  
> You could delete the lun_reset here while also nulling tmr->tmr_dev:
> +			list_del_init(&cmd->se_tmr_req->tmr_list);
> +			cmd->se_tmr_req->tmr_dev = NULL;
> 
> Then the check below will be just 
> +			if (!list_empty(&dev->lun_reset_tmr_list))

I could go either way on this. Normally it's best to just have the one
place where we handle something like the deletion and clearing. If I'm
correct then it's already done a little later in this function so we
are ok.

On the other hand, yeah my test is kind of gross.


>>
>> +       spin_lock_irqsave(&dev->se_tmr_lock, flags);
>> +       if (cmd->se_tmr_req->function == TMR_LUN_RESET) {
>> +               /*
>> +                * We only allow one reset to execute at a time to prevent
>> +                * one reset waiting on another, and to make sure one reset
>> +                * does not claim all the cmds causing the other reset to
>> +                * return early.
>> +                */
>> +               if (dev->dev_flags & DF_RESETTING_LUN) {
>> +                       spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
>> +                       goto done;
>> +               }
>> +
>> +               dev->dev_flags |= DF_RESETTING_LUN;
> 
> Not a good choice of flag variable. It is used at configuration time and
> not under a lock. The configfs file dev/alias can be changed at any time
> and could race with LUN_RESET.

I didn't see any places where one place can overwrite other flags. Are
you just saying that in general it could happen? If so, would you also not
want dev->transport_flags to be used then?
Dmitry Bogdanov March 15, 2023, 7:11 p.m. UTC | #3
On Wed, Mar 15, 2023 at 11:44:48AM -0500, Mike Christie wrote:
> 
> On 3/15/23 11:13 AM, Dmitry Bogdanov wrote:
> > On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
> >>
> >> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
> >> up running commands when it hasn't. The bug was added in:
> >>
> >> commit 51ec502a3266 ("target: Delete tmr from list before processing")
> >>
> >> The problem occurs when:
> >>
> >> 1. We have N IO cmds running in the target layer spread over 2 sessions.
> >> 2. The initiator sends a LUN_RESET for each session.
> >> 3. session1's LUN_RESET loops over all the running commands from both
> >> sessions and moves them to its local drain_task_list.
> >> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
> >> the commit above has it remove itself. session2 also does not see any
> >> commands since the other reset moved them off the state lists.
> >> 5. session2's LUN_RESET will then complete with a successful response.
> >> 6. session2's initiator believes the running commands on its session are
> >> now cleaned up due to the successful response and cleans up the running
> >> commands from its side. It then restarts them.
> >> 7. The commands do eventually complete on the backend and the target
> >> starts to return aborted task statuses for them. The initiator will
> >> either throw an invalid ITT error or might accidentally look up a new task
> >> if the ITT has been reallocated already.
> >>
> >> This fixes the bug by reverting the patch, and also serializes the
> >> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
> >> because it turns out the commit accidentally fixed a bug where if there
> >> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
> >> put the other one on their local drain list, then end up waiting on each
> >> other resulting in a deadlock.
> >
> > If a LUN_RESET is not in the TMR list anymore, there is no need to
> > serialize core_tmr_drain_tmr_list.
> 
> Ah shoot yeah I miswrote that. I meant I needed the serialization for my
> bug not yours.

I still did not get why you are wrapping core_tmr_drain_*_list in a mutex.
generic_tmr_list has only aborts now, and they do not wait for other aborts.

> 
> >>
> >>         if (cmd->transport_state & CMD_T_ABORTED)
> >> @@ -3596,6 +3597,22 @@ static void target_tmr_work(struct work_struct *work)
> >>                         target_dev_ua_allocate(dev, 0x29,
> >>                                                ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
> >>                 }
> >> +
> >> +               /*
> >> +                * If this is the last reset the device can be freed after we
> >> +                * run transport_cmd_check_stop_to_fabric. Figure out if there
> >> +                * are other resets that need to be scheduled while we know we
> >> +                * have a refcount on the device.
> >> +                */
> >> +               spin_lock_irq(&dev->se_tmr_lock);
> >
> > tmr->tmr_list is removed from the list at the very end of the se_cmd
> > lifecycle, so any number of LUN_RESETs can be on lun_reset_tmr_list. All
> > of them can be finished but not yet removed from the list.
> 
> Don't we remove it from the list a little later in this function when
> we call transport_lun_remove_cmd?

OMG, yes, of course, you are right. I messed up something.

But I still have concerns:
transport_lookup_tmr_lun (where the LUN_RESET is added to the list) and
transport_generic_handle_tmr (where the LUN_RESET is scheduled for
handling) are not serialized. And below, you can start the second
LUN_RESET while transport_generic_handle_tmr has not yet been called for
it. The _handle_tmr could be delayed long enough to handle that second
LUN_RESET and to clear the flag. _handle_tmr will then start the work
again.

Is it worth doing that list management? Is it not enough to just wrap
the call to core_tmr_lun_reset() in target_tmr_work with a mutex?
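
Something like this sketch, say (untested; it reuses the lun_reset_mutex
this patch already adds, and only the LUN_RESET arm of target_tmr_work is
shown):

	case TMR_LUN_RESET:
		/* Only one LUN_RESET per device runs at a time. */
		mutex_lock(&dev->lun_reset_mutex);
		ret = core_tmr_lun_reset(dev, tmr, NULL, NULL);
		mutex_unlock(&dev->lun_reset_mutex);
		tmr->response = (!ret) ? TMR_FUNCTION_COMPLETE :
					 TMR_FUNCTION_REJECTED;
		break;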


> >
> > You could delete the lun_reset here while also nulling tmr->tmr_dev:
> > +                     list_del_init(&cmd->se_tmr_req->tmr_list);
> > +                     cmd->se_tmr_req->tmr_dev = NULL;
> >
> > Then the check below will be just
> > +                     if (!list_empty(&dev->lun_reset_tmr_list))
> 
> I could go either way on this. Normally it's best to just have the one
> place where we handle something like the deletion and clearing. If I'm
> correct then it's already done a little later in this function so we
> are ok.
> 
> On the other hand, yeah my test is kind of gross.
> 
> 
> >>
> >> +       spin_lock_irqsave(&dev->se_tmr_lock, flags);
> >> +       if (cmd->se_tmr_req->function == TMR_LUN_RESET) {
> >> +               /*
> >> +                * We only allow one reset to execute at a time to prevent
> >> +                * one reset waiting on another, and to make sure one reset
> >> +                * does not claim all the cmds causing the other reset to
> >> +                * return early.
> >> +                */
> >> +               if (dev->dev_flags & DF_RESETTING_LUN) {
> >> +                       spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
> >> +                       goto done;
> >> +               }
> >> +
> >> +               dev->dev_flags |= DF_RESETTING_LUN;
> >
> > Not a good choice of flag variable. It is used at configuration time and
> > not under a lock. The configfs file dev/alias can be changed at any time
> > and could race with LUN_RESET.
> 
> I didn't see any places where one place can overwrite other flags. Are
> you just saying that in general it could happen? If so, would you also not
> want dev->transport_flags to be used then?

Yes, in general, bit setting is not atomic; a write of one bit can
clear another bit being written in parallel.
Better to have a separate variable used only under the lock.
Mike Christie March 15, 2023, 9:42 p.m. UTC | #4
On 3/15/23 2:11 PM, Dmitry Bogdanov wrote:
> On Wed, Mar 15, 2023 at 11:44:48AM -0500, Mike Christie wrote:
>>
>> On 3/15/23 11:13 AM, Dmitry Bogdanov wrote:
>>> On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
>>>>
>>>> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
>>>> up running commands when it hasn't. The bug was added in:
>>>>
>>>> commit 51ec502a3266 ("target: Delete tmr from list before processing")
>>>>
>>>> The problem occurs when:
>>>>
>>>> 1. We have N IO cmds running in the target layer spread over 2 sessions.
>>>> 2. The initiator sends a LUN_RESET for each session.
>>>> 3. session1's LUN_RESET loops over all the running commands from both
>>>> sessions and moves them to its local drain_task_list.
>>>> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
>>>> the commit above has it remove itself. session2 also does not see any
>>>> commands since the other reset moved them off the state lists.
>>>> 5. session2's LUN_RESET will then complete with a successful response.
>>>> 6. session2's initiator believes the running commands on its session are
>>>> now cleaned up due to the successful response and cleans up the running
>>>> commands from its side. It then restarts them.
>>>> 7. The commands do eventually complete on the backend and the target
>>>> starts to return aborted task statuses for them. The initiator will
>>>> either throw an invalid ITT error or might accidentally look up a new task
>>>> if the ITT has been reallocated already.
>>>>
>>>> This fixes the bug by reverting the patch, and also serializes the
>>>> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
>>>> because it turns out the commit accidentally fixed a bug where if there
>>>> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
>>>> put the other one on their local drain list, then end up waiting on each
>>>> other resulting in a deadlock.
>>>
>>> If a LUN_RESET is not in the TMR list anymore, there is no need to
>>> serialize core_tmr_drain_tmr_list.
>>
>> Ah shoot yeah I miswrote that. I meant I needed the serialization for my
>> bug not yours.
> 
> I still did not get why you are wrapping core_tmr_drain_*_list in a mutex.
> generic_tmr_list has only aborts now, and they do not wait for other aborts.

Do you mean I don't need the mutex for the bug I originally hit that's described
at the beginning? If you're saying I don't need it for 2 resets running at the same
time, I agree. I thought I needed it if we have a RESET and Preempt and Abort:

1. You have 2 sessions. There are no TMRs initially.
2. session1 gets Preempt and Abort. It calls core_tmr_drain_state_list
and takes all the cmds from both sessions and puts them on the local
drain_task_list list.
3. session1 or 2 gets a LUN_RESET, it sees no cmds on the device's
state_lists, and returns success.
4. The initiator thinks the commands were cleaned up by the LUN_RESET.

- It could end up re-using the ITT while the original task being cleaned up is
still running. Then depending on which session got what and if TAS was set, if
the original command completes first then the initiator would think the second
command failed with SAM_STAT_TASK_ABORTED.

- If there was no TAS or the RESET and Preempt and Abort were on the same session
then we could still hit a bug. We get the RESET response, the initiator might
retry the cmds or fail and the app might retry. The retry might go down a completely
different path on the target (like if hw queue1 was blocked and had the original
command, but this retry goes down hw queue2 due to being received on a different
CPU, so it completes right away). We do some new IO. Then hw queue1 unblocks and
overwrites the new IO.

With the mutex, the LUN_RESET will wait for the Preempt and Abort
which is waiting on the running commands. I could have had Preempt
and Abort create a tmr, and queue a work and go through that path
but I thought it looked uglier faking it.
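
To make the interleaving concrete, the bad case without the mutex looks
roughly like this (simplified timeline of the steps above):

  session1:   Preempt and Abort -> core_tmr_drain_state_list() moves the
              cmds from both sessions to its local drain_task_list and
              waits on them
  session2:   LUN_RESET -> sees empty state lists -> returns success
  initiator2: treats its cmds as cleaned up, reuses ITTs and/or retries
  backend:    the original cmds finally complete and their aborted
              statuses surface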


> 
>>
>>>>
>>>>         if (cmd->transport_state & CMD_T_ABORTED)
>>>> @@ -3596,6 +3597,22 @@ static void target_tmr_work(struct work_struct *work)
>>>>                         target_dev_ua_allocate(dev, 0x29,
>>>>                                                ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
>>>>                 }
>>>> +
>>>> +               /*
>>>> +                * If this is the last reset the device can be freed after we
>>>> +                * run transport_cmd_check_stop_to_fabric. Figure out if there
>>>> +                * are other resets that need to be scheduled while we know we
>>>> +                * have a refcount on the device.
>>>> +                */
>>>> +               spin_lock_irq(&dev->se_tmr_lock);
>>>
>>> tmr->tmr_list is removed from the list at the very end of the se_cmd
>>> lifecycle, so any number of LUN_RESETs can be on lun_reset_tmr_list. All
>>> of them can be finished but not yet removed from the list.
>>
>> Don't we remove it from the list a little later in this function when
>> we call transport_lun_remove_cmd?
> 
> OMG, yes, of course, you are right. I messed up something.
> 
> But I still have concerns:
> transport_lookup_tmr_lun (where the LUN_RESET is added to the list) and
> transport_generic_handle_tmr (where the LUN_RESET is scheduled for
> handling) are not serialized. And below, you can start the second
> LUN_RESET while transport_generic_handle_tmr has not yet been called for
> it. The _handle_tmr could be delayed long enough to handle that second
> LUN_RESET and to clear the flag. _handle_tmr will then start the work
> again.

Ah yeah, nice catch.

> 
> Is it worth doing that list management? Is it not enough to just wrap
> the call to core_tmr_lun_reset() in target_tmr_work with a mutex?

I can just do the mutex.

Was trying to reduce how many threads we use, but the big win is for aborts.
Will work on that type of thing in a separate patchset.


> Better to have a separate variable used only under the lock.
>
Will fix.
Dmitry Bogdanov March 16, 2023, 10:39 a.m. UTC | #5
On Wed, Mar 15, 2023 at 04:42:19PM -0500, Mike Christie wrote:
> 
> On 3/15/23 2:11 PM, Dmitry Bogdanov wrote:
> > On Wed, Mar 15, 2023 at 11:44:48AM -0500, Mike Christie wrote:
> >>
> >> On 3/15/23 11:13 AM, Dmitry Bogdanov wrote:
> >>> On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
> >>>>
> >>>> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
> >>>> up running commands when it hasn't. The bug was added in:
> >>>>
> >>>> commit 51ec502a3266 ("target: Delete tmr from list before processing")
> >>>>
> >>>> The problem occurs when:
> >>>>
> >>>> 1. We have N IO cmds running in the target layer spread over 2 sessions.
> >>>> 2. The initiator sends a LUN_RESET for each session.
> >>>> 3. session1's LUN_RESET loops over all the running commands from both
> >>>> sessions and moves them to its local drain_task_list.
> >>>> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
> >>>> the commit above has it remove itself. session2 also does not see any
> >>>> commands since the other reset moved them off the state lists.
> >>>> 5. session2's LUN_RESET will then complete with a successful response.
> >>>> 6. session2's initiator believes the running commands on its session are
> >>>> now cleaned up due to the successful response and cleans up the running
> >>>> commands from its side. It then restarts them.
> >>>> 7. The commands do eventually complete on the backend and the target
> >>>> starts to return aborted task statuses for them. The initiator will
> >>>> either throw an invalid ITT error or might accidentally look up a new task
> >>>> if the ITT has been reallocated already.
> >>>>
> >>>> This fixes the bug by reverting the patch, and also serializes the
> >>>> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
> >>>> because it turns out the commit accidentally fixed a bug where if there
> >>>> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
> >>>> put the other one on their local drain list, then end up waiting on each
> >>>> other resulting in a deadlock.
> >>>
> >>> If a LUN_RESET is not in the TMR list anymore, there is no need to
> >>> serialize core_tmr_drain_tmr_list.
> >>
> >> Ah shoot yeah I miswrote that. I meant I needed the serialization for my
> >> bug not yours.
> >
> > I still did not get why you are wrapping core_tmr_drain_*_list in a mutex.
> > generic_tmr_list has only aborts now, and they do not wait for other aborts.
> 
> Do you mean I don't need the mutex for the bug I originally hit that's described
> at the beginning? If you're saying I don't need it for 2 resets running at the same
> time, I agree. I thought I needed it if we have a RESET and Preempt and Abort:
> 
> 1. You have 2 sessions. There are no TMRs initially.
> 2. session1 gets Preempt and Abort. It calls core_tmr_drain_state_list
> and takes all the cmds from both sessions and puts them on the local
> drain_task_list list.
> 3. session1 or 2 gets a LUN_RESET, it sees no cmds on the device's
> state_lists, and returns success.
> 4. The initiator thinks the commands were cleaned up by the LUN_RESET.
> 
> - It could end up re-using the ITT while the original task being cleaned up is
> still running. Then depending on which session got what and if TAS was set, if
> the original command completes first then the initiator would think the second
> command failed with SAM_STAT_TASK_ABORTED.
> 
> - If there was no TAS or the RESET and Preempt and Abort were on the same session
> then we could still hit a bug. We get the RESET response, the initiator might
> retry the cmds or fail and the app might retry. The retry might go down a completely
> different path on the target (like if hw queue1 was blocked and had the original
> command, but this retry goes down hw queue2 due to being received on a different
> CPU, so it completes right away). We do some new IO. Then hw queue1 unblocks and
> overwrites the new IO.
> 
> With the mutex, the LUN_RESET will wait for the Preempt and Abort
> which is waiting on the running commands. I could have had Preempt
> and Abort create a tmr, and queue a work and go through that path
> but I thought it looked uglier faking it.

Thank you for the explanation. But I think you are not right here.
Preempt And Abort is used to change the reservation holder and abort
the preempted session's commands. A preempted session is not allowed to
send any new messages; they will be failed anyway.
So we are safe here. Or did I miss something?

> >>
> >>>>
> >>>>         if (cmd->transport_state & CMD_T_ABORTED)
> >>>> @@ -3596,6 +3597,22 @@ static void target_tmr_work(struct work_struct *work)
> >>>>                         target_dev_ua_allocate(dev, 0x29,
> >>>>                                                ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
> >>>>                 }
> >>>> +
> >>>> +               /*
> >>>> +                * If this is the last reset the device can be freed after we
> >>>> +                * run transport_cmd_check_stop_to_fabric. Figure out if there
> >>>> +                * are other resets that need to be scheduled while we know we
> >>>> +                * have a refcount on the device.
> >>>> +                */
> >>>> +               spin_lock_irq(&dev->se_tmr_lock);
> >>>
> >>> tmr->tmr_list is removed from the list at the very end of the se_cmd
> >>> lifecycle, so any number of LUN_RESETs can be on lun_reset_tmr_list. All
> >>> of them can be finished but not yet removed from the list.
> >>
> >> Don't we remove it from the list a little later in this function when
> >> we call transport_lun_remove_cmd?
> >
> > OMG, yes, of course, you are right. I messed up something.
> >
> > But I still have concerns:
> > transport_lookup_tmr_lun (where the LUN_RESET is added to the list) and
> > transport_generic_handle_tmr (where the LUN_RESET is scheduled for
> > handling) are not serialized. And below, you can start the second
> > LUN_RESET while transport_generic_handle_tmr has not yet been called for
> > it. The _handle_tmr could be delayed long enough to handle that second
> > LUN_RESET and to clear the flag. _handle_tmr will then start the work
> > again.
> 
> Ah yeah, nice catch.
> 
> >
> > Is it worth doing that list management? Is it not enough to just wrap
> > the call to core_tmr_lun_reset() in target_tmr_work with a mutex?
> 
> I can just do the mutex.
> 
> Was trying to reduce how many threads we use, but the big win is for aborts.
> Will work on that type of thing in a separate patchset.

Considering that (if) I am right about PreemptAndAbort,
to address the multiple LUN_RESET issue it's enough to wrap
core_tmr_lun_reset and skip all LUN_RESETs in core_tmr_drain_tmr_list,
without any new lists. That would be as simple a patch as possible.

> 
> > Better to have a separate variable used only under the lock.
> >
> Will fix.
> 
>
Mike Christie March 16, 2023, 4:03 p.m. UTC | #6
On 3/16/23 5:39 AM, Dmitry Bogdanov wrote:
> On Wed, Mar 15, 2023 at 04:42:19PM -0500, Mike Christie wrote:
>> On 3/15/23 2:11 PM, Dmitry Bogdanov wrote:
>>> On Wed, Mar 15, 2023 at 11:44:48AM -0500, Mike Christie wrote:
>>>> On 3/15/23 11:13 AM, Dmitry Bogdanov wrote:
>>>>> On Thu, Mar 09, 2023 at 04:33:07PM -0600, Mike Christie wrote:
>>>>>> This fixes a bug where an initiator thinks a LUN_RESET has cleaned
>>>>>> up running commands when it hasn't. The bug was added in:
>>>>>>
>>>>>> commit 51ec502a3266 ("target: Delete tmr from list before processing")
>>>>>>
>>>>>> The problem occurs when:
>>>>>>
>>>>>> 1. We have N IO cmds running in the target layer spread over 2 sessions.
>>>>>> 2. The initiator sends a LUN_RESET for each session.
>>>>>> 3. session1's LUN_RESET loops over all the running commands from both
>>>>>> sessions and moves them to its local drain_task_list.
>>>>>> 4. session2's LUN_RESET does not see the LUN_RESET from session1 because
>>>>>> the commit above has it remove itself. session2 also does not see any
>>>>>> commands since the other reset moved them off the state lists.
>>>>>> 5. session2's LUN_RESET will then complete with a successful response.
>>>>>> 6. session2's initiator believes the running commands on its session are
>>>>>> now cleaned up due to the successful response and cleans up the running
>>>>>> commands from its side. It then restarts them.
>>>>>> 7. The commands do eventually complete on the backend and the target
>>>>>> starts to return aborted task statuses for them. The initiator will
>>>>>> either throw an invalid ITT error or might accidentally look up a new task
>>>>>> if the ITT has been reallocated already.
>>>>>>
>>>>>> This fixes the bug by reverting the patch, and also serializes the
>>>>>> execution of LUN_RESETs and Preempt and Aborts. The latter is necessary
>>>>>> because it turns out the commit accidentally fixed a bug where if there
>>>>>> are 2 LUN RESETs executing they can see each other on the dev_tmr_list,
>>>>>> put the other one on their local drain list, then end up waiting on each
>>>>>> other resulting in a deadlock.
>>>>> If a LUN_RESET is not in the TMR list anymore, there is no need to
>>>>> serialize core_tmr_drain_tmr_list.
>>>> Ah shoot yeah I miswrote that. I meant I needed the serialization for my
>>>> bug not yours.
>>> I still did not get why you are wrapping core_tmr_drain_*_list in a mutex.
>>> generic_tmr_list has only aborts now, and they do not wait for other aborts.
>> Do you mean I don't need the mutex for the bug I originally hit that's described
>> at the beginning? If you're saying I don't need it for 2 resets running at the same
>> time, I agree. I thought I needed it if we have a RESET and Preempt and Abort:
>>
>> 1. You have 2 sessions. There are no TMRs initially.
>> 2. session1 gets Preempt and Abort. It calls core_tmr_drain_state_list
>> and takes all the cmds from both sessions and puts them on the local
>> drain_task_list list.
>> 3. session1 or 2 gets a LUN_RESET, it sees no cmds on the device's
>> state_lists, and returns success.
>> 4. The initiator thinks the commands were cleaned up by the LUN_RESET.
>>
>> - It could end up re-using the ITT while the original task being cleaned up is
>> still running. Then depending on which session got what and if TAS was set, if
>> the original command completes first then the initiator would think the second
>> command failed with SAM_STAT_TASK_ABORTED.
>>
>> - If there was no TAS or the RESET and Preempt and Abort were on the same session
>> then we could still hit a bug. We get the RESET response, the initiator might
>> retry the cmds or fail and the app might retry. The retry might go down a completely
>> different path on the target (like if hw queue1 was blocked and had the original
>> command, but this retry goes down hw queue2 due to being received on a different
>> CPU, so it completes right away). We do some new IO. Then hw queue1 unblocks and
>> overwrites the new IO.
>>
>> With the mutex, the LUN_RESET will wait for the Preempt and Abort
>> which is waiting on the running commands. I could have had Preempt
>> and Abort create a tmr, and queue a work and go through that path
>> but I thought it looked uglier faking it.
> Thank you for the explanation. But I think you are not right here.
> Preempt And Abort is used to change the reservation holder and abort
> the preempted session's commands. A preempted session is not allowed to
> send any new messages; they will be failed anyway.

For the ITT bug, a preempted session can still send commands like INQUIRY,
TURS, RTPG, PR-in, etc. If those commands have the same ITT as the command
the Preempt and Abort is waiting on, we can hit the bug.

Also in general for the ITT bug, even if the new cmd was going to be failed
due to a conflict, it's not right. Eventually the command the Preempt and Abort
is waiting on completes. The initiator is going to end up logging a message
the user almost never sees about getting a command response but no running
command, and drop the connection, and bug people like us :)

For the second issue, if the LUN_RESET came after the Preempt and Abort on
the same session, the RESET doesn't clear the registrations and reservation.
So the initiator is going to be sending IO down that specific path, so those
commands will be executing.

Agree with you for the case of no TAS and the RESET and Preempt and Abort
running on different sessions. I was thinking the path that got preempted
could later get registered and start sending IO, but I don't think that
makes sense.
Mike Christie March 16, 2023, 4:07 p.m. UTC | #7
On 3/16/23 5:39 AM, Dmitry Bogdanov wrote:
> Considering that (if) I am right about PreemptAndAbort,
> to address the multiple LUN_RESET issue it's enough to wrap
> core_tmr_lun_reset and skip all LUN_RESETs in core_tmr_drain_tmr_list,
> without any new lists. That would be as simple a patch as possible.

I think even if you are not correct about PreemptAndAbort, we can
just do the mutex and skip LUN_RESETs in core_tmr_drain_tmr_list :)
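
Roughly (just a sketch, not tested; it keeps the single dev_tmr_list and
assumes core_tmr_lun_reset() itself runs under dev->lun_reset_mutex):

	spin_lock_irqsave(&dev->se_tmr_lock, flags);
	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
		/* Skip ourself and never drain another LUN_RESET. */
		if (tmr_p == tmr || tmr_p->function == TMR_LUN_RESET)
			continue;

		cmd = tmr_p->task_cmd;
		if (!cmd) {
			pr_err("Unable to locate struct se_cmd for TMR\n");
			continue;
		}
		/* Move tmr_p to the local drain list as before. */
		list_move_tail(&tmr_p->tmr_list, &drain_tmr_list);
	}
	spin_unlock_irqrestore(&dev->se_tmr_lock, flags);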

Patch

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index f6e58410ec3f..c9f75ed1566b 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -179,7 +179,16 @@  int transport_lookup_tmr_lun(struct se_cmd *se_cmd)
 	se_tmr->tmr_dev = rcu_dereference_raw(se_lun->lun_se_dev);
 
 	spin_lock_irqsave(&se_tmr->tmr_dev->se_tmr_lock, flags);
-	list_add_tail(&se_tmr->tmr_list, &se_tmr->tmr_dev->dev_tmr_list);
+	switch (se_tmr->function) {
+	case TMR_ABORT_TASK:
+		list_add_tail(&se_tmr->tmr_list,
+			      &se_tmr->tmr_dev->generic_tmr_list);
+		break;
+	case TMR_LUN_RESET:
+		list_add_tail(&se_tmr->tmr_list,
+			      &se_tmr->tmr_dev->lun_reset_tmr_list);
+		break;
+	}
 	spin_unlock_irqrestore(&se_tmr->tmr_dev->se_tmr_lock, flags);
 
 	return 0;
@@ -761,7 +770,8 @@  struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 	dev->hba_index = hba->hba_index;
 
 	INIT_LIST_HEAD(&dev->dev_sep_list);
-	INIT_LIST_HEAD(&dev->dev_tmr_list);
+	INIT_LIST_HEAD(&dev->generic_tmr_list);
+	INIT_LIST_HEAD(&dev->lun_reset_tmr_list);
 	INIT_LIST_HEAD(&dev->delayed_cmd_list);
 	INIT_LIST_HEAD(&dev->qf_cmd_list);
 	spin_lock_init(&dev->delayed_cmd_lock);
@@ -782,6 +792,7 @@  struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 	spin_lock_init(&dev->t10_alua.lba_map_lock);
 
 	INIT_WORK(&dev->delayed_cmd_work, target_do_delayed_work);
+	mutex_init(&dev->lun_reset_mutex);
 
 	dev->t10_wwn.t10_dev = dev;
 	/*
diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c
index 2b95b4550a63..88d2a7839876 100644
--- a/drivers/target/target_core_tmr.c
+++ b/drivers/target/target_core_tmr.c
@@ -184,13 +184,11 @@  static void core_tmr_drain_tmr_list(
 	unsigned long flags;
 	bool rc;
 	/*
-	 * Release all pending and outgoing TMRs aside from the received
-	 * LUN_RESET tmr..
+	 * Release all pending and outgoing TMRs except for LUN_RESETS.
 	 */
 	spin_lock_irqsave(&dev->se_tmr_lock, flags);
-	if (tmr)
-		list_del_init(&tmr->tmr_list);
-	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->dev_tmr_list, tmr_list) {
+	list_for_each_entry_safe(tmr_p, tmr_pp, &dev->generic_tmr_list,
+				 tmr_list) {
 		cmd = tmr_p->task_cmd;
 		if (!cmd) {
 			pr_err("Unable to locate struct se_cmd for TMR\n");
@@ -379,14 +377,19 @@  int core_tmr_lun_reset(
 				tmr_nacl->initiatorname);
 		}
 	}
+
+	/* Serialize LUN RESET TMRs and preempt and aborts */
+	mutex_lock(&dev->lun_reset_mutex);
+
 	pr_debug("LUN_RESET: %s starting for [%s], tas: %d\n",
 		(preempt_and_abort_list) ? "Preempt" : "TMR",
 		dev->transport->name, tas);
-
 	core_tmr_drain_tmr_list(dev, tmr, preempt_and_abort_list);
 	core_tmr_drain_state_list(dev, prout_cmd, tmr_sess, tas,
 				preempt_and_abort_list);
 
+	mutex_unlock(&dev->lun_reset_mutex);
+
 	/*
 	 * Clear any legacy SPC-2 reservation when called during
 	 * LOGICAL UNIT RESET
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 1c23079a5d7f..3c732b1b5389 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -3574,6 +3574,7 @@  static void target_tmr_work(struct work_struct *work)
 	struct se_cmd *cmd = container_of(work, struct se_cmd, work);
 	struct se_device *dev = cmd->se_dev;
 	struct se_tmr_req *tmr = cmd->se_tmr_req;
+	bool sched_reset = false;
 	int ret;
 
 	if (cmd->transport_state & CMD_T_ABORTED)
@@ -3596,6 +3597,22 @@  static void target_tmr_work(struct work_struct *work)
 			target_dev_ua_allocate(dev, 0x29,
 					       ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED);
 		}
+
+		/*
+		 * If this is the last reset the device can be freed after we
+		 * run transport_cmd_check_stop_to_fabric. Figure out if there
+		 * are other resets that need to be scheduled while we know we
+		 * have a refcount on the device.
+		 */
+		spin_lock_irq(&dev->se_tmr_lock);
+		if (list_first_entry(&dev->lun_reset_tmr_list,
+				     struct se_tmr_req, tmr_list) !=
+		    list_last_entry(&dev->lun_reset_tmr_list,
+				    struct se_tmr_req, tmr_list))
+			sched_reset = true;
+		else
+			dev->dev_flags &= ~DF_RESETTING_LUN;
+		spin_unlock_irq(&dev->se_tmr_lock);
 		break;
 	case TMR_TARGET_WARM_RESET:
 		tmr->response = TMR_FUNCTION_REJECTED;
@@ -3617,15 +3634,26 @@  static void target_tmr_work(struct work_struct *work)
 
 	transport_lun_remove_cmd(cmd);
 	transport_cmd_check_stop_to_fabric(cmd);
+
+	if (!sched_reset)
+		return;
+
+	spin_lock_irq(&dev->se_tmr_lock);
+	tmr = list_first_entry(&dev->lun_reset_tmr_list, struct se_tmr_req,
+			       tmr_list);
+	spin_unlock_irq(&dev->se_tmr_lock);
+
+	INIT_WORK(&tmr->task_cmd->work, target_tmr_work);
+	schedule_work(&tmr->task_cmd->work);
 	return;
 
 aborted:
 	target_handle_abort(cmd);
 }
 
-int transport_generic_handle_tmr(
-	struct se_cmd *cmd)
+int transport_generic_handle_tmr(struct se_cmd *cmd)
 {
+	struct se_device *dev = cmd->se_dev;
 	unsigned long flags;
 	bool aborted = false;
 
@@ -3646,8 +3674,26 @@  int transport_generic_handle_tmr(
 		return 0;
 	}
 
+	spin_lock_irqsave(&dev->se_tmr_lock, flags);
+	if (cmd->se_tmr_req->function == TMR_LUN_RESET) {
+		/*
+		 * We only allow one reset to execute at a time to prevent
+		 * one reset waiting on another, and to make sure one reset
+		 * does not claim all the cmds causing the other reset to
+		 * return early.
+		 */
+		if (dev->dev_flags & DF_RESETTING_LUN) {
+			spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
+			goto done;
+		}
+
+		dev->dev_flags |= DF_RESETTING_LUN;
+	}
+	spin_unlock_irqrestore(&dev->se_tmr_lock, flags);
+
 	INIT_WORK(&cmd->work, target_tmr_work);
 	schedule_work(&cmd->work);
+done:
 	return 0;
 }
 EXPORT_SYMBOL(transport_generic_handle_tmr);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index bd299790e99c..0a5b51f8e5e8 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -804,6 +804,7 @@  struct se_device {
 #define DF_USING_UDEV_PATH			0x00000008
 #define DF_USING_ALIAS				0x00000010
 #define DF_READ_ONLY				0x00000020
+#define DF_RESETTING_LUN			0x00000040
 	u8			transport_flags;
 	/* Physical device queue depth */
 	u32			queue_depth;
@@ -840,7 +841,8 @@  struct se_device {
 	/* Used for SPC-3 Persistent Reservations */
 	struct t10_pr_registration *dev_pr_res_holder;
 	struct list_head	dev_sep_list;
-	struct list_head	dev_tmr_list;
+	struct list_head	generic_tmr_list;
+	struct list_head	lun_reset_tmr_list;
 	struct work_struct	qf_work_queue;
 	struct work_struct	delayed_cmd_work;
 	struct list_head	delayed_cmd_list;
@@ -872,6 +874,7 @@  struct se_device {
 	struct rcu_head		rcu_head;
 	int			queue_cnt;
 	struct se_device_queue	*queues;
+	struct mutex		lun_reset_mutex;
 };
 
 struct target_opcode_descriptor {