
[blktests,v3] nvme/046: test queue count changes on reconnect

Message ID 20220913065758.134668-1-dwagner@suse.de (mailing list archive)
State New, archived
Series [blktests,v3] nvme/046: test queue count changes on reconnect

Commit Message

Daniel Wagner Sept. 13, 2022, 6:57 a.m. UTC
The target is allowed to change the number of I/O queues. Test if the
host is able to reconnect in this scenario.

Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
I've sent a fix for the bug reported in nvmet_subsys_attr_qid_max_show().

Though I have found another problem with this test:

 nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:.
 nvme nvme0: creating 8 I/O queues.
 nvme nvme0: mapped 8/0/0 default/read/poll queues.
 nvme nvme0: new ctrl: NQN "blktests-subsystem-1", addr 10.161.61.228:4420
 nvme nvme0: starting error recovery

 ======================================================
 WARNING: possible circular locking dependency detected
 6.0.0-rc2+ #26 Not tainted
 ------------------------------------------------------
 kworker/6:6/17471 is trying to acquire lock:
 ffff888103d76440 (&id_priv->handler_mutex){+.+.}-{3:3}, at: rdma_destroy_id+0x17/0x20 [rdma_cm]
 
 but task is already holding lock:
 ffffc90002497de0 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
 
 which lock already depends on the new lock.

 
 the existing dependency chain (in reverse order) is:
 
 -> #3 ((work_completion)(&queue->release_work)){+.+.}-{0:0}:
        process_one_work+0x565/0xa90
        worker_thread+0x2c0/0x710
        kthread+0x16c/0x1a0
        ret_from_fork+0x1f/0x30
 
 -> #2 ((wq_completion)nvmet-wq){+.+.}-{0:0}:
        __flush_workqueue+0x105/0x850
        nvmet_rdma_cm_handler+0xf9f/0x174e [nvmet_rdma]
        cma_cm_event_handler+0x77/0x2d0 [rdma_cm]
        cma_ib_req_handler+0xbd4/0x23c0 [rdma_cm]
        cm_process_work+0x2f/0x210 [ib_cm]
        cm_req_handler+0xf73/0x2020 [ib_cm]
        cm_work_handler+0x6d6/0x37c0 [ib_cm]
        process_one_work+0x5b6/0xa90
        worker_thread+0x2c0/0x710
        kthread+0x16c/0x1a0
        ret_from_fork+0x1f/0x30
 
 -> #1 (&id_priv->handler_mutex/1){+.+.}-{3:3}:
        __mutex_lock+0x12d/0xe40
        cma_ib_req_handler+0x956/0x23c0 [rdma_cm]
        cm_process_work+0x2f/0x210 [ib_cm]
        cm_req_handler+0xf73/0x2020 [ib_cm]
        cm_work_handler+0x6d6/0x37c0 [ib_cm]
        process_one_work+0x5b6/0xa90
        worker_thread+0x2c0/0x710
        kthread+0x16c/0x1a0
        ret_from_fork+0x1f/0x30
 
 -> #0 (&id_priv->handler_mutex){+.+.}-{3:3}:
        __lock_acquire+0x1a9d/0x2d90
        lock_acquire+0x15d/0x3f0
        __mutex_lock+0x12d/0xe40
        rdma_destroy_id+0x17/0x20 [rdma_cm]
        nvmet_rdma_free_queue+0x54/0x180 [nvmet_rdma]
        nvmet_rdma_release_queue_work+0x2c/0x70 [nvmet_rdma]
        process_one_work+0x5b6/0xa90
        worker_thread+0x2c0/0x710
        kthread+0x16c/0x1a0
        ret_from_fork+0x1f/0x30
 
 other info that might help us debug this:

 Chain exists of:
   &id_priv->handler_mutex --> (wq_completion)nvmet-wq --> (work_completion)(&queue->release_work)

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock((work_completion)(&queue->release_work));
                                lock((wq_completion)nvmet-wq);
                                lock((work_completion)(&queue->release_work));
   lock(&id_priv->handler_mutex);
 
  *** DEADLOCK ***

 2 locks held by kworker/6:6/17471:
  #0: ffff888150207948 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
  #1: ffffc90002497de0 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
 
 stack backtrace:
 CPU: 6 PID: 17471 Comm: kworker/6:6 Not tainted 6.0.0-rc2+ #26 a3543a403f60337b00194a751e59d1695bae8a5b
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
 Workqueue: nvmet-wq nvmet_rdma_release_queue_work [nvmet_rdma]
 Call Trace:
  <TASK>
  dump_stack_lvl+0x56/0x73
  check_noncircular+0x1e8/0x220
  ? print_circular_bug+0x110/0x110
  ? kvm_sched_clock_read+0x14/0x40
  ? sched_clock_cpu+0x63/0x1b0
  ? lockdep_lock+0xa7/0x160
  ? call_rcu_zapped+0x90/0x90
  ? add_chain_block+0x1e0/0x290
  __lock_acquire+0x1a9d/0x2d90
  ? lockdep_hardirqs_on_prepare+0x200/0x200
  ? rxe_post_recv+0x139/0x150 [rdma_rxe 81c8d5ac4ddc7eb0ce33e9cb903fbd3fbfb05634]
  lock_acquire+0x15d/0x3f0
  ? rdma_destroy_id+0x17/0x20 [rdma_cm 5f1d19776772fb2bf4a8bb0e22480d1c5839d7fd]
  ? lock_release+0x710/0x710
  ? kvm_sched_clock_read+0x14/0x40
  ? __rdma_block_iter_next+0x110/0x110 [ib_core 0fb68d9b3dd88438bf20acf0ab9fc3d4d71a3097]
  ? lock_is_held_type+0xac/0x130
  __mutex_lock+0x12d/0xe40
  ? rdma_destroy_id+0x17/0x20 [rdma_cm 5f1d19776772fb2bf4a8bb0e22480d1c5839d7fd]
  ? rdma_destroy_id+0x17/0x20 [rdma_cm 5f1d19776772fb2bf4a8bb0e22480d1c5839d7fd]
  ? mutex_lock_io_nested+0xcf0/0xcf0
  ? kfree+0x203/0x4c0
  ? lock_is_held_type+0xac/0x130
  ? lock_is_held_type+0xac/0x130
  ? rdma_destroy_id+0x17/0x20 [rdma_cm 5f1d19776772fb2bf4a8bb0e22480d1c5839d7fd]
  ? trace_cq_drain_complete+0xd4/0x130 [ib_core 0fb68d9b3dd88438bf20acf0ab9fc3d4d71a3097]
  rdma_destroy_id+0x17/0x20 [rdma_cm 5f1d19776772fb2bf4a8bb0e22480d1c5839d7fd]
  nvmet_rdma_free_queue+0x54/0x180 [nvmet_rdma 870a8126b1a0d73aa27d7cf6abb08aa5abd53fe3]
  nvmet_rdma_release_queue_work+0x2c/0x70 [nvmet_rdma 870a8126b1a0d73aa27d7cf6abb08aa5abd53fe3]
  process_one_work+0x5b6/0xa90
  ? pwq_dec_nr_in_flight+0x100/0x100
  ? lock_acquired+0x33d/0x5b0
  ? _raw_spin_unlock_irq+0x24/0x50
  worker_thread+0x2c0/0x710
  ? process_one_work+0xa90/0xa90
  kthread+0x16c/0x1a0
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>
 nvme nvme0: Reconnecting in 10 seconds...
 nvmet_rdma: enabling port 0 (10.161.61.228:4420)


And there is the question of how to handle the different kernel log output:

c740:~/blktests # nvme_trtype=tcp ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [passed]
    runtime  32.750s  ...  32.340s
c740:~/blktests # nvme_trtype=rdma ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [failed]
    runtime  32.340s  ...  24.716s
    something found in dmesg:
    [307171.053929] run blktests nvme/046 at 2022-09-13 08:32:07
    [307171.884448] rdma_rxe: loaded
    [307171.965507] eth0 speed is unknown, defaulting to 1000
    [307171.967049] eth0 speed is unknown, defaulting to 1000
    [307171.968671] eth0 speed is unknown, defaulting to 1000
    [307172.057714] infiniband eth0_rxe: set active
    [307172.058544] eth0 speed is unknown, defaulting to 1000
    [307172.058630] infiniband eth0_rxe: added eth0
    [307172.107627] eth0 speed is unknown, defaulting to 1000
    [307172.230771] nvmet: adding nsid 1 to subsystem blktests-feature-detect
    ...
    (See '/root/blktests/results/nodev/nvme/046.dmesg' for the entire message)
c740:~/blktests # nvme_trtype=fc ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [failed]
    runtime  24.716s  ...  87.454s
    --- tests/nvme/046.out      2022-09-09 16:23:22.926123227 +0200
    +++ /root/blktests/results/nodev/nvme/046.out.bad   2022-09-13 08:35:03.716097654 +0200
    @@ -1,3 +1,89 @@
     Running nvme/046
    -NQN:blktests-subsystem-1 disconnected 1 controller(s)
    +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
    +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
    +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
    +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
    +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
    ...
    (Run 'diff -u tests/nvme/046.out /root/blktests/results/nodev/nvme/046.out.bad' to see the entire diff)

The grep error is something I can fix in the test, but I don't know how
to handle the 'eth0 speed is unknown' kernel messages, which will make
the test always fail. Is it possible to ignore them when parsing the
kernel log for errors?


v3:
 - Added comment why at least 2 CPUs are needed for the test
 - Fixed shell quoting in _set_nvmet_attr_qid_max
v2:
 - detect if attr_qid_max is available
 - https://lore.kernel.org/linux-block/20220831153506.28234-1-dwagner@suse.de/
v1:
 - https://lore.kernel.org/linux-block/20220831120900.13129-1-dwagner@suse.de/

 tests/nvme/046     | 137 +++++++++++++++++++++++++++++++++++++++++++++
 tests/nvme/046.out |   3 +
 tests/nvme/rc      |  10 ++++
 3 files changed, 150 insertions(+)
 create mode 100755 tests/nvme/046
 create mode 100644 tests/nvme/046.out

Comments

Shinichiro Kawasaki Sept. 13, 2022, 10:57 a.m. UTC | #1
On Sep 13, 2022 / 08:57, Daniel Wagner wrote:
> The target is allowed to change the number of I/O queues. Test if the
> host is able to reconnect in this scenario.
> 
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
> I've send a fix for the bug reported in nvmet_subsys_attr_qid_max_show()

Thanks!

> 
> Though I have found another problem with this test:
> 
>  nvmet: creating nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:.
>  nvme nvme0: creating 8 I/O queues.
>  nvme nvme0: mapped 8/0/0 default/read/poll queues.
>  nvme nvme0: new ctrl: NQN "blktests-subsystem-1", addr 10.161.61.228:4420
>  nvme nvme0: starting error recovery
> 
>  ======================================================
>  WARNING: possible circular locking dependency detected
>  6.0.0-rc2+ #26 Not tainted
>  ------------------------------------------------------
>  kworker/6:6/17471 is trying to acquire lock:
>  ffff888103d76440 (&id_priv->handler_mutex){+.+.}-{3:3}, at: rdma_destroy_id+0x17/0x20 [rdma_cm]
>  
>  but task is already holding lock:
>  ffffc90002497de0 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
>  
>  which lock already depends on the new lock.
> 
>  
>  the existing dependency chain (in reverse order) is:
>  
>  -> #3 ((work_completion)(&queue->release_work)){+.+.}-{0:0}:
>         process_one_work+0x565/0xa90
>         worker_thread+0x2c0/0x710
>         kthread+0x16c/0x1a0
>         ret_from_fork+0x1f/0x30
>  
>  -> #2 ((wq_completion)nvmet-wq){+.+.}-{0:0}:
>         __flush_workqueue+0x105/0x850
>         nvmet_rdma_cm_handler+0xf9f/0x174e [nvmet_rdma]
>         cma_cm_event_handler+0x77/0x2d0 [rdma_cm]
>         cma_ib_req_handler+0xbd4/0x23c0 [rdma_cm]
>         cm_process_work+0x2f/0x210 [ib_cm]
>         cm_req_handler+0xf73/0x2020 [ib_cm]
>         cm_work_handler+0x6d6/0x37c0 [ib_cm]
>         process_one_work+0x5b6/0xa90
>         worker_thread+0x2c0/0x710
>         kthread+0x16c/0x1a0
>         ret_from_fork+0x1f/0x30
>  
>  -> #1 (&id_priv->handler_mutex/1){+.+.}-{3:3}:
>         __mutex_lock+0x12d/0xe40
>         cma_ib_req_handler+0x956/0x23c0 [rdma_cm]
>         cm_process_work+0x2f/0x210 [ib_cm]
>         cm_req_handler+0xf73/0x2020 [ib_cm]
>         cm_work_handler+0x6d6/0x37c0 [ib_cm]
>         process_one_work+0x5b6/0xa90
>         worker_thread+0x2c0/0x710
>         kthread+0x16c/0x1a0
>         ret_from_fork+0x1f/0x30
>  
>  -> #0 (&id_priv->handler_mutex){+.+.}-{3:3}:
>         __lock_acquire+0x1a9d/0x2d90
>         lock_acquire+0x15d/0x3f0
>         __mutex_lock+0x12d/0xe40
>         rdma_destroy_id+0x17/0x20 [rdma_cm]
>         nvmet_rdma_free_queue+0x54/0x180 [nvmet_rdma]
>         nvmet_rdma_release_queue_work+0x2c/0x70 [nvmet_rdma]
>         process_one_work+0x5b6/0xa90
>         worker_thread+0x2c0/0x710
>         kthread+0x16c/0x1a0
>         ret_from_fork+0x1f/0x30
>  
>  other info that might help us debug this:
> 
>  Chain exists of:
>    &id_priv->handler_mutex --> (wq_completion)nvmet-wq --> (work_completion)(&queue->release_work)
> 
>   Possible unsafe locking scenario:
> 
>         CPU0                    CPU1
>         ----                    ----
>    lock((work_completion)(&queue->release_work));
>                                 lock((wq_completion)nvmet-wq);
>                                 lock((work_completion)(&queue->release_work));
>    lock(&id_priv->handler_mutex);

I also observe this WARNING on my test machine with nvme_trtype=rdma. It looks
like a hidden rdma issue to me...

> 
> And there is the question how to handle the different kernel logs output:
> 
> c740:~/blktests # nvme_trtype=tcp ./check nvme/046
> nvme/046 (Test queue count changes on reconnect)             [passed]
>     runtime  32.750s  ...  32.340s
> c740:~/blktests # nvme_trtype=rdma ./check nvme/046
> nvme/046 (Test queue count changes on reconnect)             [failed]
>     runtime  32.340s  ...  24.716s
>     something found in dmesg:
>     [307171.053929] run blktests nvme/046 at 2022-09-13 08:32:07
>     [307171.884448] rdma_rxe: loaded
>     [307171.965507] eth0 speed is unknown, defaulting to 1000
>     [307171.967049] eth0 speed is unknown, defaulting to 1000
>     [307171.968671] eth0 speed is unknown, defaulting to 1000
>     [307172.057714] infiniband eth0_rxe: set active
>     [307172.058544] eth0 speed is unknown, defaulting to 1000
>     [307172.058630] infiniband eth0_rxe: added eth0
>     [307172.107627] eth0 speed is unknown, defaulting to 1000
>     [307172.230771] nvmet: adding nsid 1 to subsystem blktests-feature-detect
>     ...
>     (See '/root/blktests/results/nodev/nvme/046.dmesg' for the entire message)
> c740:~/blktests # nvme_trtype=fc ./check nvme/046
> nvme/046 (Test queue count changes on reconnect)             [failed]
>     runtime  24.716s  ...  87.454s
>     --- tests/nvme/046.out      2022-09-09 16:23:22.926123227 +0200
>     +++ /root/blktests/results/nodev/nvme/046.out.bad   2022-09-13 08:35:03.716097654 +0200
>     @@ -1,3 +1,89 @@
>      Running nvme/046
>     -NQN:blktests-subsystem-1 disconnected 1 controller(s)
>     +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
>     +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
>     +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
>     +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
>     +grep: /sys/class/nvme-fabrics/ctl/nvme0/state: No such file or directory
>     ...
>     (Run 'diff -u tests/nvme/046.out /root/blktests/results/nodev/nvme/046.out.bad' to see the entire diff)
> 
> The grep error is something I can fix in the test but I don't know how
> to handle the 'eth0 speed is unknown' kernel message which will make
> the test always fail. Is it possible to ignore them when parsing the
> kernel log for errors?

FYI, each blktests test case can define DMESG_FILTER so that it does not fail on
specific keywords in dmesg. Test cases meta/011 and block/028 are reference use cases.

Having said that, I don't think 'eth0 speed is unknown' in dmesg makes the test
case fail. See _check_dmesg() in the "check" script, which lists the keywords that
blktests treats as errors. I guess the WARNING above is probably the actual cause
of the failure.
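
For illustration, a rough sketch of the idea (not verified against the check
script; this assumes DMESG_FILTER holds a filter command that the dmesg output
is piped through before the error scan -- see meta/011 and block/028 for the
actual convention):

# Drop the harmless rxe messages before blktests scans dmesg for errors.
DMESG_FILTER="grep -v 'eth0 speed is unknown'"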
Daniel Wagner Sept. 13, 2022, 11:42 a.m. UTC | #2
On Tue, Sep 13, 2022 at 10:57:44AM +0000, Shinichiro Kawasaki wrote:
> >   Possible unsafe locking scenario:
> > 
> >         CPU0                    CPU1
> >         ----                    ----
> >    lock((work_completion)(&queue->release_work));
> >                                 lock((wq_completion)nvmet-wq);
> >                                 lock((work_completion)(&queue->release_work));
> >    lock(&id_priv->handler_mutex);
> 
> I also observe this WARNING on my test machine with nvme_trtype=rdma. It looks a
> hidden rdma issue for me...

Yes, that problem is not new, just uncovered by this test.

> > The grep error is something I can fix in the test but I don't know how
> > to handle the 'eth0 speed is unknown' kernel message which will make
> > the test always fail. Is it possible to ignore them when parsing the
> > kernel log for errors?
> 
> FYI, each blktests test case can define DMESG_FILTER not to fail with specific
> keywords in dmesg. Test cases meta/011 and block/028 are reference use
> cases.

Ah okay, let me look into it.

> Having said that, I don't think 'eth0 speed is unknown' in dmesg makes the test
> case fail. See _check_dmesg() in "check" script, which lists keywords that
> blktests handles as errors. I guess the WARNING above is the failure cause,
> probably.

Makes sense. On second thought, the fc transport seems to behave
differently from tcp and rdma if you look at the grep error. I will look
into it; it might be another existing bug...

Daniel
Daniel Wagner Sept. 13, 2022, 5:10 p.m. UTC | #3
On Tue, Sep 13, 2022 at 01:42:11PM +0200, Daniel Wagner wrote:
> On Tue, Sep 13, 2022 at 10:57:44AM +0000, Shinichiro Kawasaki wrote:
> > FYI, each blktests test case can define DMESG_FILTER not to fail with specific
> > keywords in dmesg. Test cases meta/011 and block/028 are reference use
> > cases.
> 
> Ah okay, let me look into it.

So I made the state read function a bit more robust (test if the state file
exists) and it turns out this made rdma happy(??), but tcp is still
breaking.

nvmf_wait_for_state() {
        local subsys_name="$1"
        local state="$2"
        local timeout="${3:-$def_state_timeout}"

        local nvmedev=$(_find_nvme_dev "${subsys_name}")
        local state_file="/sys/class/nvme-fabrics/ctl/${nvmedev}/state"

        local start_time=$(date +%s)
        local end_time

        while [ -f "${state_file}" ]; do
                if grep -q "${state}" "${state_file}"; then
                        break;
                fi

                end_time=$(date +%s)
                if (( end_time - start_time > timeout )); then
                        echo "expected state \"${state}\" not " \
                             "reached within ${timeout} seconds"
                        break
                fi
                sleep 1
        done

        [ -f "${state_file}" ] || echo "failed to read ${state_file}"
}


c740:~/blktests # nvme_trtype=tcp ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [passed]
    runtime  32.154s  ...  32.189s
c740:~/blktests # nvme_trtype=rdma ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [passed]
    runtime  32.189s  ...  23.488s
c740:~/blktests # nvme_trtype=fc ./check nvme/046
nvme/046 (Test queue count changes on reconnect)             [failed]
    runtime  23.488s  ...  2.918s
    --- tests/nvme/046.out      2022-09-09 16:23:22.926123227 +0200
    +++ /root/blktests/results/nodev/nvme/046.out.bad   2022-09-13 19:07:43.661118528 +0200
    @@ -1,3 +1,7 @@
     Running nvme/046
    -NQN:blktests-subsystem-1 disconnected 1 controller(s)
    +failed to read /sys/class/nvme-fabrics/ctl/nvme0/state
    +failed to read /sys/class/nvme-fabrics/ctl//state
    +failed to read /sys/class/nvme-fabrics/ctl//state
    +failed to read /sys/class/nvme-fabrics/ctl//state
    +NQN:blktests-subsystem-1 disconnected 0 controller(s)
    ...
Daniel Wagner Sept. 14, 2022, 9 a.m. UTC | #4
On Tue, Sep 13, 2022 at 07:10:49PM +0200, Daniel Wagner wrote:
> On Tue, Sep 13, 2022 at 01:42:11PM +0200, Daniel Wagner wrote:
> > On Tue, Sep 13, 2022 at 10:57:44AM +0000, Shinichiro Kawasaki wrote:
> > > FYI, each blktests test case can define DMESG_FILTER not to fail with specific
> > > keywords in dmesg. Test cases meta/011 and block/028 are reference use
> > > cases.
> > 
> > Ah okay, let me look into it.
> 
> So I made the state read function a bit more robust (test if state file
> exists) and the it turns out this made rdma happy(??) but tcp is still
> breaking.

s/tcp/fc/

On closer inspection I see the following sequence for fc:

[399664.863585] nvmet: connect request for invalid subsystem blktests-subsystem-1!
[399664.863704] nvme nvme0: Connect Invalid Data Parameter, subsysnqn "blktests-subsystem-1"
[399664.863758] nvme nvme0: NVME-FC{0}: reset: Reconnect attempt failed (16770)
[399664.863784] nvme nvme0: NVME-FC{0}: reconnect failure
[399664.863837] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"

When the host tries to reconnect to a non-existing controller (the test
called _remove_nvmet_subsystem_from_port()), the target returns 0x4182
(NVME_SC_DNR|NVME_SC_READ_ONLY(?)). So arguably fc behaves correctly by
stopping the reconnects. tcp and rdma just ignore the DNR.

If we agree that the fc behavior is the right one, then the nvmet code
needs to be changed so that when the qid_max attribute changes it forces
a reconnect. The trick with calling _remove_nvmet_subsystem_from_port()
to force a reconnect is not working. And tcp/rdma need to honor the
DNR.
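
For reference, the configfs "trick" boils down to roughly this sequence (a
sketch assuming the default nvmet configfs mount under /sys/kernel/config and
the subsystem/port names used by the test; the helpers in tests/nvme/rc do the
equivalent symlink and attribute writes):

cfs=/sys/kernel/config/nvmet
subsys=blktests-subsystem-1
port=1

# Removing the port link drops the host into "connecting" ...
rm "${cfs}/ports/${port}/subsystems/${subsys}"
# ... the queue limit is changed while the subsystem is unreachable ...
echo 1 > "${cfs}/subsystems/${subsys}/attr_qid_max"
# ... and re-adding the link lets the host reconnect with the new limit.
ln -s "${cfs}/subsystems/${subsys}" "${cfs}/ports/${port}/subsystems/${subsys}"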

Daniel
Sagi Grimberg Sept. 14, 2022, 10:37 a.m. UTC | #5
>>>> FYI, each blktests test case can define DMESG_FILTER not to fail with specific
>>>> keywords in dmesg. Test cases meta/011 and block/028 are reference use
>>>> cases.
>>>
>>> Ah okay, let me look into it.
>>
>> So I made the state read function a bit more robust (test if state file
>> exists) and the it turns out this made rdma happy(??) but tcp is still
>> breaking.
> 
> s/tcp/fc/
> 
> On closer inspection I see following sequence for fc:
> 
> [399664.863585] nvmet: connect request for invalid subsystem blktests-subsystem-1!
> [399664.863704] nvme nvme0: Connect Invalid Data Parameter, subsysnqn "blktests-subsystem-1"
> [399664.863758] nvme nvme0: NVME-FC{0}: reset: Reconnect attempt failed (16770)
> [399664.863784] nvme nvme0: NVME-FC{0}: reconnect failure
> [399664.863837] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
> 
> When the host tries to reconnect to a non existing controller (the test
> called _remove_nvmet_subsystem_from_port()) the target returns 0x4182
> (NVME_SC_DNR|NVME_SC_READ_ONLY(?)).

That is not something that the target is supposed to be doing; I have no
idea why this is sent. Perhaps this is something specific to the fc
implementation?

  So arguably fc behaves correct by
> stopping the reconnects. tcp and rdma just ignore the DNR.

DNR means do not retry the command, it says nothing about do not attempt
a future reconnect...

> If we agree that the fc behavior is the right one, then the nvmet code
> needs to be changed so that when the qid_max attribute changes it forces
> a reconnect. The trick with calling _remove_nvmet_subsystem_from_port()
> to force a reconnect is not working. And tcp/rdma needs to honor the
> DNR.

tcp/rdma honor DNR afaik.
Daniel Wagner Sept. 14, 2022, 11:07 a.m. UTC | #6
On Wed, Sep 14, 2022 at 01:37:38PM +0300, Sagi Grimberg wrote:
> 
> > > > > FYI, each blktests test case can define DMESG_FILTER not to fail with specific
> > > > > keywords in dmesg. Test cases meta/011 and block/028 are reference use
> > > > > cases.
> > > > 
> > > > Ah okay, let me look into it.
> > > 
> > > So I made the state read function a bit more robust (test if state file
> > > exists) and the it turns out this made rdma happy(??) but tcp is still
> > > breaking.
> > 
> > s/tcp/fc/
> > 
> > On closer inspection I see following sequence for fc:
> > 
> > [399664.863585] nvmet: connect request for invalid subsystem blktests-subsystem-1!
> > [399664.863704] nvme nvme0: Connect Invalid Data Parameter, subsysnqn "blktests-subsystem-1"
> > [399664.863758] nvme nvme0: NVME-FC{0}: reset: Reconnect attempt failed (16770)
> > [399664.863784] nvme nvme0: NVME-FC{0}: reconnect failure
> > [399664.863837] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
> > 
> > When the host tries to reconnect to a non existing controller (the test
> > called _remove_nvmet_subsystem_from_port()) the target returns 0x4182
> > (NVME_SC_DNR|NVME_SC_READ_ONLY(?)).
> 
> That is not something that the target is supposed to be doing, I have no
> idea why this is sent. Perhaps this is something specific to the fc
> implementation?

Okay, I'll look into this.

>  So arguably fc behaves correct by
> > stopping the reconnects. tcp and rdma just ignore the DNR.
> 
> DNR means do not retry the command, it says nothing about do not attempt
> a future reconnect...

That makes sense.

> > If we agree that the fc behavior is the right one, then the nvmet code
> > needs to be changed so that when the qid_max attribute changes it forces
> > a reconnect. The trick with calling _remove_nvmet_subsystem_from_port()
> > to force a reconnect is not working. And tcp/rdma needs to honor the
> > DNR.
> 
> tcp/rdma honor DNR afaik.

I interpreted DNR wrongly. As you pointed out, it's just about the
command, not about the reconnect attempt.

So do we agree the fc host should not stop reconnecting? James?
Daniel Wagner Sept. 15, 2022, 11:33 a.m. UTC | #7
On Wed, Sep 14, 2022 at 01:07:18PM +0200, Daniel Wagner wrote:
> > > [399664.863585] nvmet: connect request for invalid subsystem blktests-subsystem-1!
> > > [399664.863704] nvme nvme0: Connect Invalid Data Parameter, subsysnqn "blktests-subsystem-1"
> > > [399664.863758] nvme nvme0: NVME-FC{0}: reset: Reconnect attempt failed (16770)
> > > [399664.863784] nvme nvme0: NVME-FC{0}: reconnect failure
> > > [399664.863837] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
> > > 
> > > When the host tries to reconnect to a non existing controller (the test
> > > called _remove_nvmet_subsystem_from_port()) the target returns 0x4182
> > > (NVME_SC_DNR|NVME_SC_READ_ONLY(?)).
> > 
> > That is not something that the target is supposed to be doing, I have no
> > idea why this is sent. Perhaps this is something specific to the fc
> > implementation?
> 
> Okay, I'll look into this.

I didn't realize that 0x182 can be overloaded. In the case of fabrics
commands it stands for NVME_SC_CONNECT_INVALID_PARAM. So all good here.
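
For the record, decoding the status from the fc log with the kernel's encoding
(a sketch; NVME_SC_DNR is 0x4000 in include/linux/nvme.h and the low bits carry
the SCT/SC value):

status=0x4182
printf 'DNR=%d SC=0x%x\n' $(( (status & 0x4000) != 0 )) $(( status & 0x7ff ))
# prints "DNR=1 SC=0x182", and 0x182 is NVME_SC_CONNECT_INVALID_PARAM for a
# fabrics Connect command.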
Daniel Wagner Sept. 15, 2022, 12:50 p.m. UTC | #8
> So do we agree the fc host should not stop reconnecting? James?

With the change below the test succeeds once:

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 42767fb75455..b167cf9fee6d 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3325,8 +3325,6 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
                dev_info(ctrl->ctrl.device,
                        "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
                        ctrl->cnum, status);
-               if (status > 0 && (status & NVME_SC_DNR))
-                       recon = false;
        } else if (time_after_eq(jiffies, rport->dev_loss_end))
                recon = false;

This part was introduced with f25f8ef70ce2 ("nvme-fc: short-circuit
reconnect retries"):

    nvme-fc: short-circuit reconnect retries

    Returning an nvme status from nvme_fc_create_association() indicates
    that the association is established, and we should honour the DNR bit.
    If it's set a reconnect attempt will just return the same error, so
    we can short-circuit the reconnect attempts and fail the connection
    directly.

It looks like the reasoning here didn't take the scenario we have here
into consideration. I'd say we should not do it and instead handle it
similarly to how tcp/rdma handle it.

The second time the target explodes in various ways. I suspect the
problem starts with tgtport->dev being NULL. Though the NULL pointer has
nothing to do with my change above, as I saw it in the previous runs
as well. The WARNING/oops just happens less often without the patch
above. With it, it is reproducible.


[  205.677228] (NULL device *): {0:2} Association deleted
[  205.677307] (NULL device *): {0:2} Association freed
[  205.677336] (NULL device *): Disconnect LS failed: No Association
[  205.721480] nvme nvme0: NVME-FC{0}: controller connectivity lost. Awaiting Reconnect
[  205.735687] (NULL device *): {0:0} Association deleted
[  205.735770] (NULL device *): {0:0} Association freed

[  205.737922] ============================================
[  205.737942] WARNING: possible recursive locking detected
[  205.737961] 6.0.0-rc2+ #26 Not tainted
[  205.737978] --------------------------------------------
[  205.737996] kworker/5:0/47 is trying to acquire lock:
[  205.738016] ffff8881304311e8 ((work_completion)(&tport->ls_work)){+.+.}-{0:0}, at: __flush_work+0x4c2/0x760
[  205.738073] 
               but task is already holding lock:
[  205.738094] ffffc90000397de0 ((work_completion)(&tport->ls_work)){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
[  205.738139] 
               other info that might help us debug this:
[  205.738162]  Possible unsafe locking scenario:

[  205.738183]        CPU0
[  205.738196]        ----
[  205.738208]   lock((work_completion)(&tport->ls_work));
[  205.738232]   lock((work_completion)(&tport->ls_work));
[  205.738254] 
                *** DEADLOCK ***

[  205.738278]  May be due to missing lock nesting notation

[  205.738301] 2 locks held by kworker/5:0/47:
[  205.738318]  #0: ffff888142212948 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
[  205.738365]  #1: ffffc90000397de0 ((work_completion)(&tport->ls_work)){+.+.}-{0:0}, at: process_one_work+0x4ec/0xa90
[  205.738412] 
               stack backtrace:
[  205.738431] CPU: 5 PID: 47 Comm: kworker/5:0 Not tainted 6.0.0-rc2+ #26 a3543a403f60337b00194a751e59d1695bae8a5b
[  205.738465] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[  205.738490] Workqueue: nvmet-wq fcloop_tport_lsrqst_work [nvme_fcloop]
[  205.738534] Call Trace:
[  205.738548]  <TASK>
[  205.738562]  dump_stack_lvl+0x56/0x73
[  205.738586]  __lock_acquire.cold+0x15d/0x2e2
[  205.738619]  ? lockdep_hardirqs_on_prepare+0x200/0x200
[  205.738647]  ? lockdep_lock+0xa7/0x160
[  205.738669]  lock_acquire+0x15d/0x3f0
[  205.738689]  ? __flush_work+0x4c2/0x760
[  205.738712]  ? lock_release+0x710/0x710
[  205.738733]  ? pvclock_clocksource_read+0xdc/0x1b0
[  205.738759]  ? mark_lock.part.0+0x108/0xd90
[  205.738784]  __flush_work+0x4e5/0x760
[  205.738804]  ? __flush_work+0x4c2/0x760
[  205.738828]  ? queue_delayed_work_on+0x90/0x90
[  205.738851]  ? lock_release+0x1fb/0x710
[  205.738876]  ? mark_held_locks+0x6b/0x90
[  205.738896]  ? _raw_spin_unlock_irqrestore+0x2d/0x50
[  205.738925]  ? lockdep_hardirqs_on_prepare+0x136/0x200
[  205.738948]  ? trace_hardirqs_on+0x2d/0xf0
[  205.738972]  ? _raw_spin_unlock_irqrestore+0x2d/0x50
[  205.738997]  ? __free_pages_ok+0x397/0x7e0
[  205.739023]  fcloop_targetport_delete+0x24/0xf0 [nvme_fcloop 1d5c548e62c1c6933b09815fc35f2c26e025239d]
[  205.739064]  nvmet_fc_free_tgtport+0x1d5/0x220 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  205.739110]  nvmet_fc_disconnect_assoc_done+0x133/0x150 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  205.739152]  ? nvmet_fc_xmt_ls_rsp+0xa0/0xa0 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  205.739189]  fcloop_rport_lsrqst_work+0xbe/0x110 [nvme_fcloop 1d5c548e62c1c6933b09815fc35f2c26e025239d]
[  205.739230]  process_one_work+0x5b6/0xa90
[  205.739256]  ? pwq_dec_nr_in_flight+0x100/0x100
[  205.739278]  ? lock_acquired+0x33d/0x5b0
[  205.739302]  ? _raw_spin_unlock_irq+0x24/0x50
[  205.739330]  worker_thread+0x2c0/0x710
[  205.739356]  ? process_one_work+0xa90/0xa90
[  205.739390]  kthread+0x16c/0x1a0
[  205.739412]  ? kthread_complete_and_exit+0x20/0x20
[  205.739436]  ret_from_fork+0x1f/0x30
[  205.739469]  </TASK>
[  265.728627] nvme nvme0: NVME-FC{0}: dev_loss_tmo (60) expired while waiting for remoteport connectivity.
[  265.728760] nvme nvme0: Removing ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery"


or


[  106.172123] ==================================================================
[  106.172134] BUG: KASAN: null-ptr-deref in nvmet_execute_disc_get_log_page+0x1d3/0x5a0 [nvmet]
[  106.172212] Read of size 8 at addr 0000000000000520 by task kworker/4:3/7895
[  106.172226] CPU: 4 PID: 7895 Comm: kworker/4:3 Not tainted 6.0.0-rc2+ #26 a3543a403f60337b00194a751e59d1695bae8a5b
[  106.172238] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[  106.172245] Workqueue: nvmet-wq fcloop_fcp_recv_work [nvme_fcloop]
[  106.172270] Call Trace:
[  106.172276]  <TASK>
[  106.172282]  dump_stack_lvl+0x56/0x73
[  106.172298]  kasan_report+0xb1/0xf0
[  106.172317]  ? nvmet_execute_disc_get_log_page+0x1d3/0x5a0 [nvmet 3bf416f83ec95a5fd1f5e7e8430a2ea0fb7e3bb4]
[  106.172374]  nvmet_execute_disc_get_log_page+0x1d3/0x5a0 [nvmet 3bf416f83ec95a5fd1f5e7e8430a2ea0fb7e3bb4]
[  106.172435]  ? nvmet_execute_disc_set_features+0xa0/0xa0 [nvmet 3bf416f83ec95a5fd1f5e7e8430a2ea0fb7e3bb4]
[  106.172490]  ? rcu_read_lock_sched_held+0x43/0x70
[  106.172506]  ? __alloc_pages+0x34d/0x410
[  106.172518]  ? __alloc_pages_slowpath.constprop.0+0x1500/0x1500
[  106.172530]  ? lock_is_held_type+0xac/0x130
[  106.172547]  ? _find_first_bit+0x28/0x50
[  106.172557]  ? policy_node+0x95/0xb0
[  106.172570]  ? sgl_alloc_order+0x151/0x200
[  106.172586]  nvmet_fc_handle_fcp_rqst+0x2cb/0x5c0 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.172610]  ? nvmet_fc_transfer_fcp_data+0x380/0x380 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.172628]  ? _raw_spin_unlock_irqrestore+0x2d/0x50
[  106.172644]  ? lockdep_hardirqs_on_prepare+0x136/0x200
[  106.172659]  nvmet_fc_rcv_fcp_req+0x3bd/0x76b [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.172680]  ? nvmet_fc_handle_ls_rqst_work+0x1350/0x1350 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.172698]  ? mark_held_locks+0x6b/0x90
[  106.172707]  ? _raw_spin_unlock_irq+0x24/0x50
[  106.172718]  ? lockdep_hardirqs_on_prepare+0x136/0x200
[  106.172731]  fcloop_fcp_recv_work+0xe4/0x120 [nvme_fcloop 1d5c548e62c1c6933b09815fc35f2c26e025239d]
[  106.172753]  process_one_work+0x5b6/0xa90
[  106.172770]  ? pwq_dec_nr_in_flight+0x100/0x100
[  106.172779]  ? lock_acquired+0x33d/0x5b0
[  106.172791]  ? _raw_spin_unlock_irq+0x24/0x50
[  106.172806]  worker_thread+0x2c0/0x710
[  106.172822]  ? process_one_work+0xa90/0xa90
[  106.172830]  kthread+0x16c/0x1a0
[  106.172838]  ? kthread_complete_and_exit+0x20/0x20
[  106.172848]  ret_from_fork+0x1f/0x30
[  106.172869]  </TASK>
[  106.172874] ==================================================================
[  106.172879] Disabling lock debugging due to kernel taint
[  106.172891] BUG: kernel NULL pointer dereference, address: 0000000000000520
[  106.172920] #PF: supervisor read access in kernel mode
[  106.172940] #PF: error_code(0x0000) - not-present page
[  106.172960] PGD 0 P4D 0 
[  106.172979] Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
[  106.173000] CPU: 4 PID: 7895 Comm: kworker/4:3 Tainted: G    B              6.0.0-rc2+ #26 a3543a403f60337b00194a751e59d1695bae8a5b
[  106.173036] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[  106.173065] Workqueue: nvmet-wq fcloop_fcp_recv_work [nvme_fcloop]
[  106.173102] RIP: 0010:nvmet_execute_disc_get_log_page+0x1d3/0x5a0 [nvmet]
[  106.173172] Code: 10 48 89 df e8 be 24 b3 cc 4c 89 ef 4c 8b 3b e8 b3 24 b3 cc 4c 8b b5 b8 01 00 00 4d 8d a6 20 05 00 00 4c 89 e7 e8 9d 24 b3 cc <49> 8b 9e 20 05 00 00 4c 39 e3 0f 84 7e 03 00 00 49 81 c7 b4 03 00
[  106.173220] RSP: 0018:ffffc9000834f998 EFLAGS: 00010286
[  106.173243] RAX: 0000000000000001 RBX: ffff88811c820070 RCX: ffffffff8d4fbd36
[  106.173268] RDX: fffffbfff223e8f9 RSI: 0000000000000008 RDI: ffffffff911f47c0
[  106.173291] RBP: ffff88811c822508 R08: 0000000000000001 R09: ffffffff911f47c7
[  106.173314] R10: fffffbfff223e8f8 R11: 0000000000000001 R12: 0000000000000520
[  106.173337] R13: ffff88811c8226c0 R14: 0000000000000000 R15: ffff88810e9d0000
[  106.173361] FS:  0000000000000000(0000) GS:ffff88821f800000(0000) knlGS:0000000000000000
[  106.173388] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  106.173409] CR2: 0000000000000520 CR3: 000000010c1b2004 CR4: 0000000000170ee0
[  106.173438] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  106.173461] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  106.173484] Call Trace:
[  106.173498]  <TASK>
[  106.173515]  ? nvmet_execute_disc_set_features+0xa0/0xa0 [nvmet 3bf416f83ec95a5fd1f5e7e8430a2ea0fb7e3bb4]
[  106.173593]  ? rcu_read_lock_sched_held+0x43/0x70
[  106.173616]  ? __alloc_pages+0x34d/0x410
[  106.173639]  ? __alloc_pages_slowpath.constprop.0+0x1500/0x1500
[  106.173666]  ? lock_is_held_type+0xac/0x130
[  106.173692]  ? _find_first_bit+0x28/0x50
[  106.173712]  ? policy_node+0x95/0xb0
[  106.173735]  ? sgl_alloc_order+0x151/0x200
[  106.173761]  nvmet_fc_handle_fcp_rqst+0x2cb/0x5c0 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.173804]  ? nvmet_fc_transfer_fcp_data+0x380/0x380 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.173843]  ? _raw_spin_unlock_irqrestore+0x2d/0x50
[  106.173868]  ? lockdep_hardirqs_on_prepare+0x136/0x200
[  106.173895]  nvmet_fc_rcv_fcp_req+0x3bd/0x76b [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.173936]  ? nvmet_fc_handle_ls_rqst_work+0x1350/0x1350 [nvmet_fc 7a9748591fe196fb85ac8210ffbd9cf866e138c8]
[  106.173976]  ? mark_held_locks+0x6b/0x90
[  106.173996]  ? _raw_spin_unlock_irq+0x24/0x50
[  106.174019]  ? lockdep_hardirqs_on_prepare+0x136/0x200
[  106.174045]  fcloop_fcp_recv_work+0xe4/0x120 [nvme_fcloop 1d5c548e62c1c6933b09815fc35f2c26e025239d]
[  106.174087]  process_one_work+0x5b6/0xa90
[  106.174114]  ? pwq_dec_nr_in_flight+0x100/0x100
[  106.174135]  ? lock_acquired+0x33d/0x5b0
[  106.174159]  ? _raw_spin_unlock_irq+0x24/0x50
[  106.174186]  worker_thread+0x2c0/0x710
[  106.174212]  ? process_one_work+0xa90/0xa90
[  106.174233]  kthread+0x16c/0x1a0
[  106.174250]  ? kthread_complete_and_exit+0x20/0x20
[  106.174274]  ret_from_fork+0x1f/0x30
[  106.174303]  </TASK>
[  106.174315] Modules linked in: nvme_fcloop nvme_fc nvme_fabrics nvmet_fc nvmet nvme_core nvme_common af_packet rfkill nls_iso8859_1 nls_cp437 intel_rapl_msr vfat intel_rapl_common fat snd_hda_codec_generic kvm_intel snd_hda_intel snd_intel_dspcfg iTCO_wdt snd_hda_codec intel_pmc_bxt snd_hwdep kvm iTCO_vendor_support snd_hda_core joydev irqbypass snd_pcm i2c_i801 pcspkr i2c_smbus snd_timer efi_pstore snd virtio_net soundcore virtio_balloon lpc_ich net_failover failover tiny_power_button button fuse configfs ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel hid_generic usbhid aesni_intel crypto_simd cryptd serio_raw sr_mod xhci_pci xhci_pci_renesas cdrom xhci_hcd virtio_blk usbcore virtio_gpu virtio_dma_buf qemu_fw_cfg btrfs blake2b_generic libcrc32c crc32c_intel xor raid6_pq sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs virtio_rng
[  106.174738] CR2: 0000000000000520
[  106.174756] ---[ end trace 0000000000000000 ]---
[  106.174775] RIP: 0010:nvmet_execute_disc_get_log_page+0x1d3/0x5a0 [nvmet]
[  106.174844] Code: 10 48 89 df e8 be 24 b3 cc 4c 89 ef 4c 8b 3b e8 b3 24 b3 cc 4c 8b b5 b8 01 00 00 4d 8d a6 20 05 00 00 4c 89 e7 e8 9d 24 b3 cc <49> 8b 9e 20 05 00 00 4c 39 e3 0f 84 7e 03 00 00 49 81 c7 b4 03 00
[  106.174893] RSP: 0018:ffffc9000834f998 EFLAGS: 00010286
[  106.174915] RAX: 0000000000000001 RBX: ffff88811c820070 RCX: ffffffff8d4fbd36
[  106.174939] RDX: fffffbfff223e8f9 RSI: 0000000000000008 RDI: ffffffff911f47c0
[  106.174963] RBP: ffff88811c822508 R08: 0000000000000001 R09: ffffffff911f47c7
[  106.174986] R10: fffffbfff223e8f8 R11: 0000000000000001 R12: 0000000000000520
[  106.175009] R13: ffff88811c8226c0 R14: 0000000000000000 R15: ffff88810e9d0000
[  106.175032] FS:  0000000000000000(0000) GS:ffff88821f800000(0000) knlGS:0000000000000000
[  106.175058] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  106.175079] CR2: 0000000000000520 CR3: 000000010c1b2004 CR4: 0000000000170ee0
[  106.175121] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  106.175144] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  107.593127] nvme nvme0: NVME-FC{0}: create association : host wwpn 0x20001100aa000002  rport wwpn
Hannes Reinecke Sept. 15, 2022, 3:41 p.m. UTC | #9
On 9/15/22 14:50, Daniel Wagner wrote:
>> So do we agree the fc host should not stop reconnecting? James?
> 
> With the change below the test succeeds once:
> 
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 42767fb75455..b167cf9fee6d 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -3325,8 +3325,6 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
>                  dev_info(ctrl->ctrl.device,
>                          "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
>                          ctrl->cnum, status);
> -               if (status > 0 && (status & NVME_SC_DNR))
> -                       recon = false;
>          } else if (time_after_eq(jiffies, rport->dev_loss_end))
>                  recon = false;
> 
> This part was introduced with f25f8ef70ce2 ("nvme-fc: short-circuit
> reconnect retries"):
> 
>      nvme-fc: short-circuit reconnect retries
> 
>      Returning an nvme status from nvme_fc_create_association() indicates
>      that the association is established, and we should honour the DNR bit.
>      If it's set a reconnect attempt will just return the same error, so
>      we can short-circuit the reconnect attempts and fail the connection
>      directly.
> 
> It looks like the reasoning here didn't take into consideration the
> scenario we have here. I'd say we should not do it and handle it similar
> as with have with tcp/rdma.
> 
But that is wrong.
When we are evaluating the NVMe status we _need_ to check the DNR bit; 
that's precisely what it's for.

The real reason here is a queue inversion with fcloop; we've had those
for ages, and I'm not surprised that it pops up now.
In fact, I was quite surprised that I didn't hit it when updating 
blktests; in previous versions I had to insert an explicit 'sleep 2' 
before disconnect to make it work.

I'd rather fix that than revert FC to the (wrong) behaviour.

Cheers,

Hannes
Daniel Wagner Sept. 16, 2022, 6:30 a.m. UTC | #10
On Thu, Sep 15, 2022 at 05:41:46PM +0200, Hannes Reinecke wrote:
> > It looks like the reasoning here didn't take into consideration the
> > scenario we have here. I'd say we should not do it and handle it similar
> > as with have with tcp/rdma.
> > 
> But that is wrong.
> When we are evaluating the NVMe status we _need_ to check the DNR bit;
> that's precisely what it's for.

I asked what the desired behavior is and Sagi pointed out:

> DNR means do not retry the command, it says nothing about do not attempt
> a future reconnect...

I really don't care too much; it's just that fc and tcp/rdma do not behave
the same in this regard, which makes this test trip over it. As long as you
two can agree on the 'correct' behavior, I can do the work.

> The real reason here is a queue inversion with fcloop; we've had them for
> ages, and I'm not surprised that it pops off now.

Could you elaborate?

> In fact, I was quite surprised that I didn't hit it when updating blktests;
> in previous versions I had to insert an explicit 'sleep 2' before disconnect
> to make it work.
> 
> I'd rather fix that than reverting FC to the (wrong) behaviour.

Sure, no problem with this approach.

Patch

diff --git a/tests/nvme/046 b/tests/nvme/046
new file mode 100755
index 000000000000..4ffd4dcff832
--- /dev/null
+++ b/tests/nvme/046
@@ -0,0 +1,137 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2022 Daniel Wagner, SUSE Labs
+#
+# Test queue count changes on reconnect
+
+. tests/nvme/rc
+
+DESCRIPTION="Test queue count changes on reconnect"
+QUICK=1
+
+# This test requires at least two CPUs on the host side to be available. The
+# host will create one queue per CPU. As this test starts with the number
+# of queues requested by the host and then limits the queue count to one on the
+# target side, we need more than one queue initially.
+requires() {
+	_nvme_requires
+	_have_loop
+	_require_nvme_trtype_is_fabrics
+	_require_min_cpus 2
+}
+
+_detect_subsys_attr() {
+	local attr="$1"
+	local file_path="${TMPDIR}/img"
+	local subsys_name="blktests-feature-detect"
+	local cfs_path="${NVMET_CFS}/subsystems/${subsys_name}"
+	local port
+
+	truncate -s 1M "${file_path}"
+
+	_create_nvmet_subsystem "${subsys_name}" "${file_path}" \
+		"b92842df-a394-44b1-84a4-92ae7d112332"
+	port="$(_create_nvmet_port "${nvme_trtype}")"
+
+	local val=1
+	[[ -f "${cfs_path}/${attr}" ]] && val=0
+
+	_remove_nvmet_subsystem "${subsys_name}"
+
+	_remove_nvmet_port "${port}"
+
+	rm "${file_path}"
+
+	return "${val}"
+}
+
+def_state_timeout=20
+
+nvmf_wait_for_state() {
+	local subsys_name="$1"
+	local state="$2"
+	local timeout="${3:-$def_state_timeout}"
+
+	local nvmedev=$(_find_nvme_dev "${subsys_name}")
+	local state_file="/sys/class/nvme-fabrics/ctl/${nvmedev}/state"
+
+	local start_time=$(date +%s)
+	local end_time
+
+	while ! grep -q "${state}" "${state_file}"; do
+		sleep 1
+		end_time=$(date +%s)
+		if (( end_time - start_time > timeout )); then
+			echo "expected state \"${state}\" not " \
+			     "reached within ${timeout} seconds"
+			break
+		fi
+	done
+}
+
+nvmet_set_max_qid() {
+	local port="$1"
+	local subsys_name="$2"
+	local max_qid="$3"
+
+	_remove_nvmet_subsystem_from_port "${port}" "${subsys_name}"
+	nvmf_wait_for_state "${subsys_name}" "connecting"
+
+	_set_nvmet_attr_qid_max "${subsys_name}" "${max_qid}"
+
+	_add_nvmet_subsys_to_port "${port}" "${subsys_name}"
+	nvmf_wait_for_state "${subsys_name}" "live"
+}
+
+test() {
+	local port
+	local subsys_name="blktests-subsystem-1"
+	local hostid
+	local hostnqn
+	local file_path="${TMPDIR}/img"
+
+	echo "Running ${TEST_NAME}"
+
+	hostid="$(uuidgen)"
+	if [ -z "$hostid" ] ; then
+		echo "uuidgen failed"
+		return 1
+	fi
+	hostnqn="nqn.2014-08.org.nvmexpress:uuid:${hostid}"
+	_setup_nvmet
+
+	if ! _detect_subsys_attr "attr_qid_max"; then
+		SKIP_REASONS+=("missing attr_qid_max feature")
+		return 1
+	fi
+
+	truncate -s 512M "${file_path}"
+
+	_create_nvmet_subsystem "${subsys_name}" "${file_path}" \
+		"b92842df-a394-44b1-84a4-92ae7d112861"
+	port="$(_create_nvmet_port "${nvme_trtype}")"
+	_add_nvmet_subsys_to_port "${port}" "${subsys_name}"
+	_create_nvmet_host "${subsys_name}" "${hostnqn}"
+
+	_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}" \
+			     "" "" \
+			     "${hostnqn}" "${hostid}"
+
+	nvmf_wait_for_state "${subsys_name}" "live"
+
+	nvmet_set_max_qid "${port}" "${subsys_name}" 1
+	nvmet_set_max_qid "${port}" "${subsys_name}" 128
+
+	_nvme_disconnect_subsys "${subsys_name}"
+
+	_remove_nvmet_subsystem_from_port "${port}" "${subsys_name}"
+	_remove_nvmet_subsystem "${subsys_name}"
+
+	_remove_nvmet_port "${port}"
+
+	_remove_nvmet_host "${hostnqn}"
+
+	rm "${file_path}"
+
+	echo "Test complete"
+}
diff --git a/tests/nvme/046.out b/tests/nvme/046.out
new file mode 100644
index 000000000000..f1a967d540b7
--- /dev/null
+++ b/tests/nvme/046.out
@@ -0,0 +1,3 @@ 
+Running nvme/046
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
diff --git a/tests/nvme/rc b/tests/nvme/rc
index 6d4397a7f043..eab7c6773c60 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -544,6 +544,16 @@  _set_nvmet_dhgroup() {
 	     "${cfs_path}/dhchap_dhgroup"
 }
 
+_set_nvmet_attr_qid_max() {
+	local nvmet_subsystem="$1"
+	local qid_max="$2"
+	local cfs_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
+
+	if [[ -f "${cfs_path}/attr_qid_max" ]]; then
+		echo "${qid_max}" > "${cfs_path}/attr_qid_max"
+	fi
+}
+
 _find_nvme_dev() {
 	local subsys=$1
 	local subsysnqn