diff mbox series

pNFS: fix IO thread starvation problem during LAYOUTUNAVAILABLE error

Message ID 20220531134854.63115-1-olga.kornievskaia@gmail.com (mailing list archive)
State New, archived
Headers show
Series pNFS: fix IO thread starvation problem during LAYOUTUNAVAILABLE error | expand

Commit Message

Olga Kornievskaia May 31, 2022, 1:48 p.m. UTC
From: Olga Kornievskaia <kolga@netapp.com>

In recent pnfs testing we've incountered IO thread starvation problem
during the time when the server returns LAYOUTUNAVAILABLE error to the
client. When that happens each IO request tries to get a new layout
and the pnfs_update_layout() code ensures that only 1 LAYOUTGET
RPC is outstanding, the rest would be waiting. As the thread that gets
the layout wakes up the waiters only one gets to run and it tends to be
the latest added to the waiting queue. After receiving LAYOUTUNAVAILABLE
error the client would fall back to the MDS writes and as those writes
complete and the new write is issued, those requests are added as
waiters and they get to run before the earliest of the waiters that
was put on the queue originally never gets to run until the
LAYOUTUNAVAILABLE condition resolves itself on the server.

With the current code, if N IOs arrive asking for a layout, then
there will be N serial LAYOUTGETs that will follow, each would be
getting its own LAYOUTUNAVAILABLE error. Instead, the patch proposes
to apply the error condition to ALL the waiters for the outstanding
LAYOUTGET. Once the error is received, the code would allow all
exiting N IOs fall back to the MDS, but any new arriving IOs would be
then queued up and one them the new IO would trigger a new LAYOUTGET.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
---
 fs/nfs/pnfs.c | 14 +++++++++++++-
 fs/nfs/pnfs.h |  2 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

Comments

Trond Myklebust May 31, 2022, 3 p.m. UTC | #1
On Tue, 2022-05-31 at 09:48 -0400, Olga Kornievskaia wrote:
> From: Olga Kornievskaia <kolga@netapp.com>
> 
> In recent pnfs testing we've incountered IO thread starvation problem
> during the time when the server returns LAYOUTUNAVAILABLE error to
> the
> client. When that happens each IO request tries to get a new layout
> and the pnfs_update_layout() code ensures that only 1 LAYOUTGET
> RPC is outstanding, the rest would be waiting. As the thread that
> gets
> the layout wakes up the waiters only one gets to run and it tends to
> be
> the latest added to the waiting queue. After receiving
> LAYOUTUNAVAILABLE
> error the client would fall back to the MDS writes and as those
> writes
> complete and the new write is issued, those requests are added as
> waiters and they get to run before the earliest of the waiters that
> was put on the queue originally never gets to run until the
> LAYOUTUNAVAILABLE condition resolves itself on the server.
> 
> With the current code, if N IOs arrive asking for a layout, then
> there will be N serial LAYOUTGETs that will follow, each would be
> getting its own LAYOUTUNAVAILABLE error. Instead, the patch proposes
> to apply the error condition to ALL the waiters for the outstanding
> LAYOUTGET. Once the error is received, the code would allow all
> exiting N IOs fall back to the MDS, but any new arriving IOs would be
> then queued up and one them the new IO would trigger a new LAYOUTGET.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> ---
>  fs/nfs/pnfs.c | 14 +++++++++++++-
>  fs/nfs/pnfs.h |  2 ++
>  2 files changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 68a87be3e6f9..5b7a679e32c8 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -2028,10 +2028,20 @@ pnfs_update_layout(struct inode *ino,
>         if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo))
> &&
>             atomic_read(&lo->plh_outstanding) != 0) {
>                 spin_unlock(&ino->i_lock);
> +               atomic_inc(&lo->plh_waiting);
>                 lseg = ERR_PTR(wait_var_event_killable(&lo-
> >plh_outstanding,
>                                         !atomic_read(&lo-
> >plh_outstanding)));
> -               if (IS_ERR(lseg))
> +               if (IS_ERR(lseg)) {
> +                       atomic_dec(&lo->plh_waiting);
>                         goto out_put_layout_hdr;
> +               }
> +               if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
> +                       pnfs_layout_clear_fail_bit(lo,
> pnfs_iomode_to_fail_bit(iomode));
> +                       lseg = NULL;
> +                       if (atomic_dec_and_test(&lo->plh_waiting))
> +                               clear_bit(NFS_LAYOUT_DRAIN, &lo-
> >plh_flags);
> +                       goto out_put_layout_hdr;
> +               }
>                 pnfs_put_layout_hdr(lo);
>                 goto lookup_again;
>         }
> @@ -2152,6 +2162,8 @@ pnfs_update_layout(struct inode *ino,
>                 case -ERECALLCONFLICT:
>                 case -EAGAIN:
>                         break;
> +               case -ENODATA:
> +                       set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
>                 default:
>                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
>                                 pnfs_layout_clear_fail_bit(lo,
> pnfs_iomode_to_fail_bit(iomode));
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 07f11489e4e9..5c07da32320b 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -105,6 +105,7 @@ enum {
>         NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget
> */
>         NFS_LAYOUT_INODE_FREEING,       /* The inode is being freed
> */
>         NFS_LAYOUT_HASHED,              /* The layout visible */
> +       NFS_LAYOUT_DRAIN,
>  };
>  
>  enum layoutdriver_policy_flags {
> @@ -196,6 +197,7 @@ struct pnfs_commit_ops {
>  struct pnfs_layout_hdr {
>         refcount_t              plh_refcount;
>         atomic_t                plh_outstanding; /* number of RPCs
> out */
> +       atomic_t                plh_waiting;
>         struct list_head        plh_layouts;   /* other client
> layouts */
>         struct list_head        plh_bulk_destroy;
>         struct list_head        plh_segs;      /* layout segments
> list */

According to the spec, the correct behaviour for handling
NFS4ERR_LAYOUTUNAVAILABLE is to stop trying to do pNFS to the inode,
and to fall back to doing I/O through the MDS. The error describes a
more or less permanent state of the server being unable to hand out a
layout for this file.
If the server wanted the clients to retry after a delay, it should be
returning NFS4ERR_LAYOUTTRYLATER. We already handle that correctly.

Currently, what our client does to handle NFS4ERR_LAYOUTUNAVAILABLE is
just plain wrong: we just return no layout, and then let the next
caller to pnfs_update_layout() immediately try again.

My problem with this patch, is that it just falls back to doing I/O
through the MDS for the writes that are already queued in
pnfs_update_layout(). It perpetuates the current bad behaviour of
unnecessary pounding of the server with LAYOUTGET requests that are
going to fail with the exact same error.

I'd therefore prefer to see us fix the real bug (i.e. the handling of
NFS4ERR_LAYOUTUNAVAILABLE) first, and then look at mitigating issues
with the queuing. I already have 2 patches to deal with this.
Olga Kornievskaia May 31, 2022, 4:03 p.m. UTC | #2
On Tue, May 31, 2022 at 11:00 AM Trond Myklebust
<trondmy@hammerspace.com> wrote:
>
> On Tue, 2022-05-31 at 09:48 -0400, Olga Kornievskaia wrote:
> > From: Olga Kornievskaia <kolga@netapp.com>
> >
> > In recent pnfs testing we've incountered IO thread starvation problem
> > during the time when the server returns LAYOUTUNAVAILABLE error to
> > the
> > client. When that happens each IO request tries to get a new layout
> > and the pnfs_update_layout() code ensures that only 1 LAYOUTGET
> > RPC is outstanding, the rest would be waiting. As the thread that
> > gets
> > the layout wakes up the waiters only one gets to run and it tends to
> > be
> > the latest added to the waiting queue. After receiving
> > LAYOUTUNAVAILABLE
> > error the client would fall back to the MDS writes and as those
> > writes
> > complete and the new write is issued, those requests are added as
> > waiters and they get to run before the earliest of the waiters that
> > was put on the queue originally never gets to run until the
> > LAYOUTUNAVAILABLE condition resolves itself on the server.
> >
> > With the current code, if N IOs arrive asking for a layout, then
> > there will be N serial LAYOUTGETs that will follow, each would be
> > getting its own LAYOUTUNAVAILABLE error. Instead, the patch proposes
> > to apply the error condition to ALL the waiters for the outstanding
> > LAYOUTGET. Once the error is received, the code would allow all
> > exiting N IOs fall back to the MDS, but any new arriving IOs would be
> > then queued up and one them the new IO would trigger a new LAYOUTGET.
> >
> > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> > ---
> >  fs/nfs/pnfs.c | 14 +++++++++++++-
> >  fs/nfs/pnfs.h |  2 ++
> >  2 files changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> > index 68a87be3e6f9..5b7a679e32c8 100644
> > --- a/fs/nfs/pnfs.c
> > +++ b/fs/nfs/pnfs.c
> > @@ -2028,10 +2028,20 @@ pnfs_update_layout(struct inode *ino,
> >         if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo))
> > &&
> >             atomic_read(&lo->plh_outstanding) != 0) {
> >                 spin_unlock(&ino->i_lock);
> > +               atomic_inc(&lo->plh_waiting);
> >                 lseg = ERR_PTR(wait_var_event_killable(&lo-
> > >plh_outstanding,
> >                                         !atomic_read(&lo-
> > >plh_outstanding)));
> > -               if (IS_ERR(lseg))
> > +               if (IS_ERR(lseg)) {
> > +                       atomic_dec(&lo->plh_waiting);
> >                         goto out_put_layout_hdr;
> > +               }
> > +               if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
> > +                       pnfs_layout_clear_fail_bit(lo,
> > pnfs_iomode_to_fail_bit(iomode));
> > +                       lseg = NULL;
> > +                       if (atomic_dec_and_test(&lo->plh_waiting))
> > +                               clear_bit(NFS_LAYOUT_DRAIN, &lo-
> > >plh_flags);
> > +                       goto out_put_layout_hdr;
> > +               }
> >                 pnfs_put_layout_hdr(lo);
> >                 goto lookup_again;
> >         }
> > @@ -2152,6 +2162,8 @@ pnfs_update_layout(struct inode *ino,
> >                 case -ERECALLCONFLICT:
> >                 case -EAGAIN:
> >                         break;
> > +               case -ENODATA:
> > +                       set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
> >                 default:
> >                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
> >                                 pnfs_layout_clear_fail_bit(lo,
> > pnfs_iomode_to_fail_bit(iomode));
> > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> > index 07f11489e4e9..5c07da32320b 100644
> > --- a/fs/nfs/pnfs.h
> > +++ b/fs/nfs/pnfs.h
> > @@ -105,6 +105,7 @@ enum {
> >         NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget
> > */
> >         NFS_LAYOUT_INODE_FREEING,       /* The inode is being freed
> > */
> >         NFS_LAYOUT_HASHED,              /* The layout visible */
> > +       NFS_LAYOUT_DRAIN,
> >  };
> >
> >  enum layoutdriver_policy_flags {
> > @@ -196,6 +197,7 @@ struct pnfs_commit_ops {
> >  struct pnfs_layout_hdr {
> >         refcount_t              plh_refcount;
> >         atomic_t                plh_outstanding; /* number of RPCs
> > out */
> > +       atomic_t                plh_waiting;
> >         struct list_head        plh_layouts;   /* other client
> > layouts */
> >         struct list_head        plh_bulk_destroy;
> >         struct list_head        plh_segs;      /* layout segments
> > list */
>
> According to the spec, the correct behaviour for handling
> NFS4ERR_LAYOUTUNAVAILABLE is to stop trying to do pNFS to the inode,
> and to fall back to doing I/O through the MDS. The error describes a
> more or less permanent state of the server being unable to hand out a
> layout for this file.
> If the server wanted the clients to retry after a delay, it should be
> returning NFS4ERR_LAYOUTTRYLATER. We already handle that correctly.

To clarify, can you confirm that LAYOUTUNAVAILABLE would only turn off
the inode permanently but for a period of time?

It looks to me that for the LAYOUTTRYLATER, the client would face the
same starvation problem in the same situation. I don't see anything
marking the segment failed for such error? I believe the client
returns nolayout for that error, falls back to MDS but allows asking
for the layout for a period of time, having again the queue of waiters
that gets manipulated as such that favors last added.


> Currently, what our client does to handle NFS4ERR_LAYOUTUNAVAILABLE is
> just plain wrong: we just return no layout, and then let the next
> caller to pnfs_update_layout() immediately try again.
>
> My problem with this patch, is that it just falls back to doing I/O
> through the MDS for the writes that are already queued in
> pnfs_update_layout(). It perpetuates the current bad behaviour of
> unnecessary pounding of the server with LAYOUTGET requests that are
> going to fail with the exact same error.
>
> I'd therefore prefer to see us fix the real bug (i.e. the handling of
> NFS4ERR_LAYOUTUNAVAILABLE) first, and then look at mitigating issues
> with the queuing. I already have 2 patches to deal with this.
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>
Trond Myklebust May 31, 2022, 4:14 p.m. UTC | #3
On Tue, 2022-05-31 at 12:03 -0400, Olga Kornievskaia wrote:
> On Tue, May 31, 2022 at 11:00 AM Trond Myklebust
> <trondmy@hammerspace.com> wrote:
> > 
> > On Tue, 2022-05-31 at 09:48 -0400, Olga Kornievskaia wrote:
> > > From: Olga Kornievskaia <kolga@netapp.com>
> > > 
> > > In recent pnfs testing we've incountered IO thread starvation
> > > problem
> > > during the time when the server returns LAYOUTUNAVAILABLE error
> > > to
> > > the
> > > client. When that happens each IO request tries to get a new
> > > layout
> > > and the pnfs_update_layout() code ensures that only 1 LAYOUTGET
> > > RPC is outstanding, the rest would be waiting. As the thread that
> > > gets
> > > the layout wakes up the waiters only one gets to run and it tends
> > > to
> > > be
> > > the latest added to the waiting queue. After receiving
> > > LAYOUTUNAVAILABLE
> > > error the client would fall back to the MDS writes and as those
> > > writes
> > > complete and the new write is issued, those requests are added as
> > > waiters and they get to run before the earliest of the waiters
> > > that
> > > was put on the queue originally never gets to run until the
> > > LAYOUTUNAVAILABLE condition resolves itself on the server.
> > > 
> > > With the current code, if N IOs arrive asking for a layout, then
> > > there will be N serial LAYOUTGETs that will follow, each would be
> > > getting its own LAYOUTUNAVAILABLE error. Instead, the patch
> > > proposes
> > > to apply the error condition to ALL the waiters for the
> > > outstanding
> > > LAYOUTGET. Once the error is received, the code would allow all
> > > exiting N IOs fall back to the MDS, but any new arriving IOs
> > > would be
> > > then queued up and one them the new IO would trigger a new
> > > LAYOUTGET.
> > > 
> > > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
> > > ---
> > >  fs/nfs/pnfs.c | 14 +++++++++++++-
> > >  fs/nfs/pnfs.h |  2 ++
> > >  2 files changed, 15 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> > > index 68a87be3e6f9..5b7a679e32c8 100644
> > > --- a/fs/nfs/pnfs.c
> > > +++ b/fs/nfs/pnfs.c
> > > @@ -2028,10 +2028,20 @@ pnfs_update_layout(struct inode *ino,
> > >         if ((list_empty(&lo->plh_segs) ||
> > > !pnfs_layout_is_valid(lo))
> > > &&
> > >             atomic_read(&lo->plh_outstanding) != 0) {
> > >                 spin_unlock(&ino->i_lock);
> > > +               atomic_inc(&lo->plh_waiting);
> > >                 lseg = ERR_PTR(wait_var_event_killable(&lo-
> > > > plh_outstanding,
> > >                                         !atomic_read(&lo-
> > > > plh_outstanding)));
> > > -               if (IS_ERR(lseg))
> > > +               if (IS_ERR(lseg)) {
> > > +                       atomic_dec(&lo->plh_waiting);
> > >                         goto out_put_layout_hdr;
> > > +               }
> > > +               if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
> > > +                       pnfs_layout_clear_fail_bit(lo,
> > > pnfs_iomode_to_fail_bit(iomode));

By the way: this call to pnfs_layout_clear_fail_bit() would break any
future fix to NFS4ERR_LAYOUTUNAVAILABLE. It's not the right thing to do
here.

> > > +                       lseg = NULL;
> > > +                       if (atomic_dec_and_test(&lo-
> > > >plh_waiting))
> > > +                               clear_bit(NFS_LAYOUT_DRAIN, &lo-
> > > > plh_flags);
> > > +                       goto out_put_layout_hdr;
> > > +               }
> > >                 pnfs_put_layout_hdr(lo);
> > >                 goto lookup_again;
> > >         }
> > > @@ -2152,6 +2162,8 @@ pnfs_update_layout(struct inode *ino,
> > >                 case -ERECALLCONFLICT:
> > >                 case -EAGAIN:
> > >                         break;
> > > +               case -ENODATA:
> > > +                       set_bit(NFS_LAYOUT_DRAIN, &lo-
> > > >plh_flags);
> > >                 default:
> > >                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
> > >                                 pnfs_layout_clear_fail_bit(lo,
> > > pnfs_iomode_to_fail_bit(iomode));
> > > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> > > index 07f11489e4e9..5c07da32320b 100644
> > > --- a/fs/nfs/pnfs.h
> > > +++ b/fs/nfs/pnfs.h
> > > @@ -105,6 +105,7 @@ enum {
> > >         NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first
> > > layoutget
> > > */
> > >         NFS_LAYOUT_INODE_FREEING,       /* The inode is being
> > > freed
> > > */
> > >         NFS_LAYOUT_HASHED,              /* The layout visible */
> > > +       NFS_LAYOUT_DRAIN,
> > >  };
> > > 
> > >  enum layoutdriver_policy_flags {
> > > @@ -196,6 +197,7 @@ struct pnfs_commit_ops {
> > >  struct pnfs_layout_hdr {
> > >         refcount_t              plh_refcount;
> > >         atomic_t                plh_outstanding; /* number of
> > > RPCs
> > > out */
> > > +       atomic_t                plh_waiting;
> > >         struct list_head        plh_layouts;   /* other client
> > > layouts */
> > >         struct list_head        plh_bulk_destroy;
> > >         struct list_head        plh_segs;      /* layout segments
> > > list */
> > 
> > According to the spec, the correct behaviour for handling
> > NFS4ERR_LAYOUTUNAVAILABLE is to stop trying to do pNFS to the
> > inode,
> > and to fall back to doing I/O through the MDS. The error describes
> > a
> > more or less permanent state of the server being unable to hand out
> > a
> > layout for this file.
> > If the server wanted the clients to retry after a delay, it should
> > be
> > returning NFS4ERR_LAYOUTTRYLATER. We already handle that correctly.
> 
> To clarify, can you confirm that LAYOUTUNAVAILABLE would only turn
> off
> the inode permanently but for a period of time?

See pnfs_layout_io_test_failed(). It automatically clears the fail bit
after a period of PNFS_LAYOUTGET_RETRY_TIMEOUT (i.e. 120 seconds)

> 
> It looks to me that for the LAYOUTTRYLATER, the client would face the
> same starvation problem in the same situation. I don't see anything
> marking the segment failed for such error? I believe the client
> returns nolayout for that error, falls back to MDS but allows asking
> for the layout for a period of time, having again the queue of
> waiters
> that gets manipulated as such that favors last added.
> 
> 
> > Currently, what our client does to handle NFS4ERR_LAYOUTUNAVAILABLE
> > is
> > just plain wrong: we just return no layout, and then let the next
> > caller to pnfs_update_layout() immediately try again.
> > 
> > My problem with this patch, is that it just falls back to doing I/O
> > through the MDS for the writes that are already queued in
> > pnfs_update_layout(). It perpetuates the current bad behaviour of
> > unnecessary pounding of the server with LAYOUTGET requests that are
> > going to fail with the exact same error.
> > 
> > I'd therefore prefer to see us fix the real bug (i.e. the handling
> > of
> > NFS4ERR_LAYOUTUNAVAILABLE) first, and then look at mitigating
> > issues
> > with the queuing. I already have 2 patches to deal with this.
> > 
> > --
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> > 
> >
kernel test robot May 31, 2022, 5:17 p.m. UTC | #4
Hi Olga,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on trondmy-nfs/linux-next]
[also build test WARNING on v5.18 next-20220531]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
base:   git://git.linux-nfs.org/projects/trondmy/linux-nfs.git linux-next
config: arc-allyesconfig (https://download.01.org/0day-ci/archive/20220601/202206010131.m809TcwM-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/7f80a6c53d6cdb806706a8748cb79348f9906229
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
        git checkout 7f80a6c53d6cdb806706a8748cb79348f9906229
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arc SHELL=/bin/bash fs/nfs/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   fs/nfs/pnfs.c: In function 'pnfs_update_layout':
>> fs/nfs/pnfs.c:2164:25: warning: this statement may fall through [-Wimplicit-fallthrough=]
    2164 |                         set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/nfs/pnfs.c:2165:17: note: here
    2165 |                 default:
         |                 ^~~~~~~


vim +2164 fs/nfs/pnfs.c

  1952	
  1953	/*
  1954	 * Layout segment is retreived from the server if not cached.
  1955	 * The appropriate layout segment is referenced and returned to the caller.
  1956	 */
  1957	struct pnfs_layout_segment *
  1958	pnfs_update_layout(struct inode *ino,
  1959			   struct nfs_open_context *ctx,
  1960			   loff_t pos,
  1961			   u64 count,
  1962			   enum pnfs_iomode iomode,
  1963			   bool strict_iomode,
  1964			   gfp_t gfp_flags)
  1965	{
  1966		struct pnfs_layout_range arg = {
  1967			.iomode = iomode,
  1968			.offset = pos,
  1969			.length = count,
  1970		};
  1971		unsigned pg_offset;
  1972		struct nfs_server *server = NFS_SERVER(ino);
  1973		struct nfs_client *clp = server->nfs_client;
  1974		struct pnfs_layout_hdr *lo = NULL;
  1975		struct pnfs_layout_segment *lseg = NULL;
  1976		struct nfs4_layoutget *lgp;
  1977		nfs4_stateid stateid;
  1978		long timeout = 0;
  1979		unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
  1980		bool first;
  1981	
  1982		if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
  1983			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  1984					 PNFS_UPDATE_LAYOUT_NO_PNFS);
  1985			goto out;
  1986		}
  1987	
  1988		if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
  1989			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  1990					 PNFS_UPDATE_LAYOUT_MDSTHRESH);
  1991			goto out;
  1992		}
  1993	
  1994	lookup_again:
  1995		lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
  1996		if (IS_ERR(lseg))
  1997			goto out;
  1998		first = false;
  1999		spin_lock(&ino->i_lock);
  2000		lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
  2001		if (lo == NULL) {
  2002			spin_unlock(&ino->i_lock);
  2003			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2004					 PNFS_UPDATE_LAYOUT_NOMEM);
  2005			goto out;
  2006		}
  2007	
  2008		/* Do we even need to bother with this? */
  2009		if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
  2010			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2011					 PNFS_UPDATE_LAYOUT_BULK_RECALL);
  2012			dprintk("%s matches recall, use MDS\n", __func__);
  2013			goto out_unlock;
  2014		}
  2015	
  2016		/* if LAYOUTGET already failed once we don't try again */
  2017		if (pnfs_layout_io_test_failed(lo, iomode)) {
  2018			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2019					 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
  2020			goto out_unlock;
  2021		}
  2022	
  2023		/*
  2024		 * If the layout segment list is empty, but there are outstanding
  2025		 * layoutget calls, then they might be subject to a layoutrecall.
  2026		 */
  2027		if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
  2028		    atomic_read(&lo->plh_outstanding) != 0) {
  2029			spin_unlock(&ino->i_lock);
  2030			atomic_inc(&lo->plh_waiting);
  2031			lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
  2032						!atomic_read(&lo->plh_outstanding)));
  2033			if (IS_ERR(lseg)) {
  2034				atomic_dec(&lo->plh_waiting);
  2035				goto out_put_layout_hdr;
  2036			}
  2037			if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
  2038				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
  2039				lseg = NULL;
  2040				if (atomic_dec_and_test(&lo->plh_waiting))
  2041					clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
  2042				goto out_put_layout_hdr;
  2043			}
  2044			pnfs_put_layout_hdr(lo);
  2045			goto lookup_again;
  2046		}
  2047	
  2048		/*
  2049		 * Because we free lsegs when sending LAYOUTRETURN, we need to wait
  2050		 * for LAYOUTRETURN.
  2051		 */
  2052		if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
  2053			spin_unlock(&ino->i_lock);
  2054			dprintk("%s wait for layoutreturn\n", __func__);
  2055			lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
  2056			if (!IS_ERR(lseg)) {
  2057				pnfs_put_layout_hdr(lo);
  2058				dprintk("%s retrying\n", __func__);
  2059				trace_pnfs_update_layout(ino, pos, count, iomode, lo,
  2060							 lseg,
  2061							 PNFS_UPDATE_LAYOUT_RETRY);
  2062				goto lookup_again;
  2063			}
  2064			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2065						 PNFS_UPDATE_LAYOUT_RETURN);
  2066			goto out_put_layout_hdr;
  2067		}
  2068	
  2069		lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
  2070		if (lseg) {
  2071			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2072					PNFS_UPDATE_LAYOUT_FOUND_CACHED);
  2073			goto out_unlock;
  2074		}
  2075	
  2076		/*
  2077		 * Choose a stateid for the LAYOUTGET. If we don't have a layout
  2078		 * stateid, or it has been invalidated, then we must use the open
  2079		 * stateid.
  2080		 */
  2081		if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
  2082			int status;
  2083	
  2084			/*
  2085			 * The first layoutget for the file. Need to serialize per
  2086			 * RFC 5661 Errata 3208.
  2087			 */
  2088			if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
  2089					     &lo->plh_flags)) {
  2090				spin_unlock(&ino->i_lock);
  2091				lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
  2092							NFS_LAYOUT_FIRST_LAYOUTGET,
  2093							TASK_KILLABLE));
  2094				if (IS_ERR(lseg))
  2095					goto out_put_layout_hdr;
  2096				pnfs_put_layout_hdr(lo);
  2097				dprintk("%s retrying\n", __func__);
  2098				goto lookup_again;
  2099			}
  2100	
  2101			spin_unlock(&ino->i_lock);
  2102			first = true;
  2103			status = nfs4_select_rw_stateid(ctx->state,
  2104						iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
  2105						NULL, &stateid, NULL);
  2106			if (status != 0) {
  2107				lseg = ERR_PTR(status);
  2108				trace_pnfs_update_layout(ino, pos, count,
  2109						iomode, lo, lseg,
  2110						PNFS_UPDATE_LAYOUT_INVALID_OPEN);
  2111				nfs4_schedule_stateid_recovery(server, ctx->state);
  2112				pnfs_clear_first_layoutget(lo);
  2113				pnfs_put_layout_hdr(lo);
  2114				goto lookup_again;
  2115			}
  2116			spin_lock(&ino->i_lock);
  2117		} else {
  2118			nfs4_stateid_copy(&stateid, &lo->plh_stateid);
  2119		}
  2120	
  2121		if (pnfs_layoutgets_blocked(lo)) {
  2122			trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2123					PNFS_UPDATE_LAYOUT_BLOCKED);
  2124			goto out_unlock;
  2125		}
  2126		nfs_layoutget_begin(lo);
  2127		spin_unlock(&ino->i_lock);
  2128	
  2129		_add_to_server_list(lo, server);
  2130	
  2131		pg_offset = arg.offset & ~PAGE_MASK;
  2132		if (pg_offset) {
  2133			arg.offset -= pg_offset;
  2134			arg.length += pg_offset;
  2135		}
  2136		if (arg.length != NFS4_MAX_UINT64)
  2137			arg.length = PAGE_ALIGN(arg.length);
  2138	
  2139		lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
  2140		if (!lgp) {
  2141			trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
  2142						 PNFS_UPDATE_LAYOUT_NOMEM);
  2143			nfs_layoutget_end(lo);
  2144			goto out_put_layout_hdr;
  2145		}
  2146	
  2147		lgp->lo = lo;
  2148		pnfs_get_layout_hdr(lo);
  2149	
  2150		lseg = nfs4_proc_layoutget(lgp, &timeout);
  2151		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2152					 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
  2153		nfs_layoutget_end(lo);
  2154		if (IS_ERR(lseg)) {
  2155			switch(PTR_ERR(lseg)) {
  2156			case -EBUSY:
  2157				if (time_after(jiffies, giveup))
  2158					lseg = NULL;
  2159				break;
  2160			case -ERECALLCONFLICT:
  2161			case -EAGAIN:
  2162				break;
  2163			case -ENODATA:
> 2164				set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
  2165			default:
  2166				if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
  2167					pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
  2168					lseg = NULL;
  2169				}
  2170				goto out_put_layout_hdr;
  2171			}
  2172			if (lseg) {
  2173				if (first)
  2174					pnfs_clear_first_layoutget(lo);
  2175				trace_pnfs_update_layout(ino, pos, count,
  2176					iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
  2177				pnfs_put_layout_hdr(lo);
  2178				goto lookup_again;
  2179			}
  2180		} else {
  2181			pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
  2182		}
  2183	
  2184	out_put_layout_hdr:
  2185		if (first)
  2186			pnfs_clear_first_layoutget(lo);
  2187		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
  2188					 PNFS_UPDATE_LAYOUT_EXIT);
  2189		pnfs_put_layout_hdr(lo);
  2190	out:
  2191		dprintk("%s: inode %s/%llu pNFS layout segment %s for "
  2192				"(%s, offset: %llu, length: %llu)\n",
  2193				__func__, ino->i_sb->s_id,
  2194				(unsigned long long)NFS_FILEID(ino),
  2195				IS_ERR_OR_NULL(lseg) ? "not found" : "found",
  2196				iomode==IOMODE_RW ?  "read/write" : "read-only",
  2197				(unsigned long long)pos,
  2198				(unsigned long long)count);
  2199		return lseg;
  2200	out_unlock:
  2201		spin_unlock(&ino->i_lock);
  2202		goto out_put_layout_hdr;
  2203	}
  2204	EXPORT_SYMBOL_GPL(pnfs_update_layout);
  2205
kernel test robot May 31, 2022, 6:19 p.m. UTC | #5
Hi Olga,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on trondmy-nfs/linux-next]
[also build test WARNING on v5.18 next-20220531]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
base:   git://git.linux-nfs.org/projects/trondmy/linux-nfs.git linux-next
config: i386-randconfig-a015 (https://download.01.org/0day-ci/archive/20220601/202206010223.Zov1XR1x-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project c825abd6b0198fb088d9752f556a70705bc99dfd)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/7f80a6c53d6cdb806706a8748cb79348f9906229
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
        git checkout 7f80a6c53d6cdb806706a8748cb79348f9906229
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash fs/nfs/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> fs/nfs/pnfs.c:2165:3: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough]
                   default:
                   ^
   fs/nfs/pnfs.c:2165:3: note: insert '__attribute__((fallthrough));' to silence this warning
                   default:
                   ^
                   __attribute__((fallthrough)); 
   fs/nfs/pnfs.c:2165:3: note: insert 'break;' to avoid fall-through
                   default:
                   ^
                   break; 
   1 warning generated.


vim +2165 fs/nfs/pnfs.c

78746a384c88c64 Fred Isaman           2016-09-22  1952  
e5e940170b2136a Benny Halevy          2010-10-20  1953  /*
e5e940170b2136a Benny Halevy          2010-10-20  1954   * Layout segment is retreived from the server if not cached.
e5e940170b2136a Benny Halevy          2010-10-20  1955   * The appropriate layout segment is referenced and returned to the caller.
e5e940170b2136a Benny Halevy          2010-10-20  1956   */
7c24d9489fe57d6 Andy Adamson          2011-06-13  1957  struct pnfs_layout_segment *
e5e940170b2136a Benny Halevy          2010-10-20  1958  pnfs_update_layout(struct inode *ino,
e5e940170b2136a Benny Halevy          2010-10-20  1959  		   struct nfs_open_context *ctx,
fb3296eb4636763 Benny Halevy          2011-05-22  1960  		   loff_t pos,
fb3296eb4636763 Benny Halevy          2011-05-22  1961  		   u64 count,
a75b9df9d3bfc3c Trond Myklebust       2011-05-11  1962  		   enum pnfs_iomode iomode,
c7d73af2d249f03 Tom Haynes            2016-05-25  1963  		   bool strict_iomode,
a75b9df9d3bfc3c Trond Myklebust       2011-05-11  1964  		   gfp_t gfp_flags)
e5e940170b2136a Benny Halevy          2010-10-20  1965  {
fb3296eb4636763 Benny Halevy          2011-05-22  1966  	struct pnfs_layout_range arg = {
fb3296eb4636763 Benny Halevy          2011-05-22  1967  		.iomode = iomode,
fb3296eb4636763 Benny Halevy          2011-05-22  1968  		.offset = pos,
fb3296eb4636763 Benny Halevy          2011-05-22  1969  		.length = count,
fb3296eb4636763 Benny Halevy          2011-05-22  1970  	};
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  1971  	unsigned pg_offset;
6382a44138e7aa4 Weston Andros Adamson 2011-06-01  1972  	struct nfs_server *server = NFS_SERVER(ino);
6382a44138e7aa4 Weston Andros Adamson 2011-06-01  1973  	struct nfs_client *clp = server->nfs_client;
183d9e7b112aaed Jeff Layton           2016-05-17  1974  	struct pnfs_layout_hdr *lo = NULL;
e5e940170b2136a Benny Halevy          2010-10-20  1975  	struct pnfs_layout_segment *lseg = NULL;
587f03deb69bdbf Fred Isaman           2016-09-21  1976  	struct nfs4_layoutget *lgp;
183d9e7b112aaed Jeff Layton           2016-05-17  1977  	nfs4_stateid stateid;
183d9e7b112aaed Jeff Layton           2016-05-17  1978  	long timeout = 0;
66b53f325876703 Trond Myklebust       2016-07-14  1979  	unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
3000512137602b8 Weston Andros Adamson 2013-02-28  1980  	bool first;
e5e940170b2136a Benny Halevy          2010-10-20  1981  
9a4bf31d05a801e Jeff Layton           2015-12-10  1982  	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
183d9e7b112aaed Jeff Layton           2016-05-17  1983  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  1984  				 PNFS_UPDATE_LAYOUT_NO_PNFS);
f86bbcf85db3259 Trond Myklebust       2012-09-26  1985  		goto out;
9a4bf31d05a801e Jeff Layton           2015-12-10  1986  	}
d23d61c8d351f5c Andy Adamson          2012-05-23  1987  
9a4bf31d05a801e Jeff Layton           2015-12-10  1988  	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
183d9e7b112aaed Jeff Layton           2016-05-17  1989  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  1990  				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
f86bbcf85db3259 Trond Myklebust       2012-09-26  1991  		goto out;
9a4bf31d05a801e Jeff Layton           2015-12-10  1992  	}
d23d61c8d351f5c Andy Adamson          2012-05-23  1993  
9bf87482ddc6f8d Peng Tao              2014-08-22  1994  lookup_again:
d03360aaf5ccac4 Trond Myklebust       2018-09-05  1995  	lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
d03360aaf5ccac4 Trond Myklebust       2018-09-05  1996  	if (IS_ERR(lseg))
d03360aaf5ccac4 Trond Myklebust       2018-09-05  1997  		goto out;
9bf87482ddc6f8d Peng Tao              2014-08-22  1998  	first = false;
e5e940170b2136a Benny Halevy          2010-10-20  1999  	spin_lock(&ino->i_lock);
9fa4075878a5faa Peng Tao              2011-07-30  2000  	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
830ffb565760234 Trond Myklebust       2012-09-20  2001  	if (lo == NULL) {
830ffb565760234 Trond Myklebust       2012-09-20  2002  		spin_unlock(&ino->i_lock);
183d9e7b112aaed Jeff Layton           2016-05-17  2003  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  2004  				 PNFS_UPDATE_LAYOUT_NOMEM);
830ffb565760234 Trond Myklebust       2012-09-20  2005  		goto out;
830ffb565760234 Trond Myklebust       2012-09-20  2006  	}
e5e940170b2136a Benny Halevy          2010-10-20  2007  
43f1b3da8b35d70 Fred Isaman           2011-01-06  2008  	/* Do we even need to bother with this? */
a59c30acfbe701d Trond Myklebust       2012-03-01  2009  	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
183d9e7b112aaed Jeff Layton           2016-05-17  2010  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  2011  				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
43f1b3da8b35d70 Fred Isaman           2011-01-06  2012  		dprintk("%s matches recall, use MDS\n", __func__);
e5e940170b2136a Benny Halevy          2010-10-20  2013  		goto out_unlock;
e5e940170b2136a Benny Halevy          2010-10-20  2014  	}
e5e940170b2136a Benny Halevy          2010-10-20  2015  
e5e940170b2136a Benny Halevy          2010-10-20  2016  	/* if LAYOUTGET already failed once we don't try again */
2e5b29f0448be9e Trond Myklebust       2015-12-14  2017  	if (pnfs_layout_io_test_failed(lo, iomode)) {
183d9e7b112aaed Jeff Layton           2016-05-17  2018  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  2019  				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
e5e940170b2136a Benny Halevy          2010-10-20  2020  		goto out_unlock;
9a4bf31d05a801e Jeff Layton           2015-12-10  2021  	}
e5e940170b2136a Benny Halevy          2010-10-20  2022  
411ae722d10a6d4 Trond Myklebust       2018-06-23  2023  	/*
411ae722d10a6d4 Trond Myklebust       2018-06-23  2024  	 * If the layout segment list is empty, but there are outstanding
411ae722d10a6d4 Trond Myklebust       2018-06-23  2025  	 * layoutget calls, then they might be subject to a layoutrecall.
411ae722d10a6d4 Trond Myklebust       2018-06-23  2026  	 */
0b77f97a7e42adc Trond Myklebust       2021-07-02  2027  	if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
411ae722d10a6d4 Trond Myklebust       2018-06-23  2028  	    atomic_read(&lo->plh_outstanding) != 0) {
411ae722d10a6d4 Trond Myklebust       2018-06-23  2029  		spin_unlock(&ino->i_lock);
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2030  		atomic_inc(&lo->plh_waiting);
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2031  		lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
400417b05f3ec05 Trond Myklebust       2019-03-12  2032  					!atomic_read(&lo->plh_outstanding)));
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2033  		if (IS_ERR(lseg)) {
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2034  			atomic_dec(&lo->plh_waiting);
411ae722d10a6d4 Trond Myklebust       2018-06-23  2035  			goto out_put_layout_hdr;
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2036  		}
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2037  		if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2038  			pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2039  			lseg = NULL;
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2040  			if (atomic_dec_and_test(&lo->plh_waiting))
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2041  				clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2042  			goto out_put_layout_hdr;
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2043  		}
411ae722d10a6d4 Trond Myklebust       2018-06-23  2044  		pnfs_put_layout_hdr(lo);
411ae722d10a6d4 Trond Myklebust       2018-06-23  2045  		goto lookup_again;
411ae722d10a6d4 Trond Myklebust       2018-06-23  2046  	}
411ae722d10a6d4 Trond Myklebust       2018-06-23  2047  
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2048  	/*
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2049  	 * Because we free lsegs when sending LAYOUTRETURN, we need to wait
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2050  	 * for LAYOUTRETURN.
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2051  	 */
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2052  	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2053  		spin_unlock(&ino->i_lock);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2054  		dprintk("%s wait for layoutreturn\n", __func__);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2055  		lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2056  		if (!IS_ERR(lseg)) {
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2057  			pnfs_put_layout_hdr(lo);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2058  			dprintk("%s retrying\n", __func__);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2059  			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2060  						 lseg,
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2061  						 PNFS_UPDATE_LAYOUT_RETRY);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2062  			goto lookup_again;
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2063  		}
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2064  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2065  					 PNFS_UPDATE_LAYOUT_RETURN);
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2066  		goto out_put_layout_hdr;
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2067  	}
2c8d5fc37fe2384 Trond Myklebust       2021-01-05  2068  
c7d73af2d249f03 Tom Haynes            2016-05-25  2069  	lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
183d9e7b112aaed Jeff Layton           2016-05-17  2070  	if (lseg) {
183d9e7b112aaed Jeff Layton           2016-05-17  2071  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
183d9e7b112aaed Jeff Layton           2016-05-17  2072  				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
183d9e7b112aaed Jeff Layton           2016-05-17  2073  		goto out_unlock;
183d9e7b112aaed Jeff Layton           2016-05-17  2074  	}
183d9e7b112aaed Jeff Layton           2016-05-17  2075  
183d9e7b112aaed Jeff Layton           2016-05-17  2076  	/*
183d9e7b112aaed Jeff Layton           2016-05-17  2077  	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
183d9e7b112aaed Jeff Layton           2016-05-17  2078  	 * stateid, or it has been invalidated, then we must use the open
183d9e7b112aaed Jeff Layton           2016-05-17  2079  	 * stateid.
183d9e7b112aaed Jeff Layton           2016-05-17  2080  	 */
67a3b721462c9b3 Trond Myklebust       2016-06-17  2081  	if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2082  		int status;
183d9e7b112aaed Jeff Layton           2016-05-17  2083  
183d9e7b112aaed Jeff Layton           2016-05-17  2084  		/*
183d9e7b112aaed Jeff Layton           2016-05-17  2085  		 * The first layoutget for the file. Need to serialize per
9bf87482ddc6f8d Peng Tao              2014-08-22  2086  		 * RFC 5661 Errata 3208.
9bf87482ddc6f8d Peng Tao              2014-08-22  2087  		 */
9bf87482ddc6f8d Peng Tao              2014-08-22  2088  		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
9bf87482ddc6f8d Peng Tao              2014-08-22  2089  				     &lo->plh_flags)) {
9bf87482ddc6f8d Peng Tao              2014-08-22  2090  			spin_unlock(&ino->i_lock);
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2091  			lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2092  						NFS_LAYOUT_FIRST_LAYOUTGET,
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2093  						TASK_KILLABLE));
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2094  			if (IS_ERR(lseg))
d03360aaf5ccac4 Trond Myklebust       2018-09-05  2095  				goto out_put_layout_hdr;
9bf87482ddc6f8d Peng Tao              2014-08-22  2096  			pnfs_put_layout_hdr(lo);
183d9e7b112aaed Jeff Layton           2016-05-17  2097  			dprintk("%s retrying\n", __func__);
9bf87482ddc6f8d Peng Tao              2014-08-22  2098  			goto lookup_again;
9bf87482ddc6f8d Peng Tao              2014-08-22  2099  		}
183d9e7b112aaed Jeff Layton           2016-05-17  2100  
fbf4bcc9a837312 Trond Myklebust       2020-04-13  2101  		spin_unlock(&ino->i_lock);
183d9e7b112aaed Jeff Layton           2016-05-17  2102  		first = true;
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2103  		status = nfs4_select_rw_stateid(ctx->state,
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  2104  					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2105  					NULL, &stateid, NULL);
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2106  		if (status != 0) {
731c74dd987e4f1 Trond Myklebust       2019-07-22  2107  			lseg = ERR_PTR(status);
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  2108  			trace_pnfs_update_layout(ino, pos, count,
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  2109  					iomode, lo, lseg,
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  2110  					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2111  			nfs4_schedule_stateid_recovery(server, ctx->state);
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2112  			pnfs_clear_first_layoutget(lo);
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2113  			pnfs_put_layout_hdr(lo);
d9aba2b40de6fdd Trond Myklebust       2019-07-16  2114  			goto lookup_again;
70d2f7b1ea19b7e Trond Myklebust       2017-09-11  2115  		}
fbf4bcc9a837312 Trond Myklebust       2020-04-13  2116  		spin_lock(&ino->i_lock);
9bf87482ddc6f8d Peng Tao              2014-08-22  2117  	} else {
183d9e7b112aaed Jeff Layton           2016-05-17  2118  		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
9a4bf31d05a801e Jeff Layton           2015-12-10  2119  	}
568e8c494ded95a Andy Adamson          2011-03-01  2120  
9a4bf31d05a801e Jeff Layton           2015-12-10  2121  	if (pnfs_layoutgets_blocked(lo)) {
183d9e7b112aaed Jeff Layton           2016-05-17  2122  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
9a4bf31d05a801e Jeff Layton           2015-12-10  2123  				PNFS_UPDATE_LAYOUT_BLOCKED);
cf7d63f1f989571 Fred Isaman           2011-01-06  2124  		goto out_unlock;
9a4bf31d05a801e Jeff Layton           2015-12-10  2125  	}
411ae722d10a6d4 Trond Myklebust       2018-06-23  2126  	nfs_layoutget_begin(lo);
f49f9baac8f63de Fred Isaman           2011-02-03  2127  	spin_unlock(&ino->i_lock);
3000512137602b8 Weston Andros Adamson 2013-02-28  2128  
78746a384c88c64 Fred Isaman           2016-09-22  2129  	_add_to_server_list(lo, server);
e5e940170b2136a Benny Halevy          2010-10-20  2130  
09cbfeaf1a5a67b Kirill A. Shutemov    2016-04-01  2131  	pg_offset = arg.offset & ~PAGE_MASK;
707ed5fdb587c71 Benny Halevy          2011-05-22  2132  	if (pg_offset) {
707ed5fdb587c71 Benny Halevy          2011-05-22  2133  		arg.offset -= pg_offset;
707ed5fdb587c71 Benny Halevy          2011-05-22  2134  		arg.length += pg_offset;
707ed5fdb587c71 Benny Halevy          2011-05-22  2135  	}
7c24d9489fe57d6 Andy Adamson          2011-06-13  2136  	if (arg.length != NFS4_MAX_UINT64)
09cbfeaf1a5a67b Kirill A. Shutemov    2016-04-01  2137  		arg.length = PAGE_ALIGN(arg.length);
707ed5fdb587c71 Benny Halevy          2011-05-22  2138  
5e36e2a9411210b Fred Isaman           2016-10-06  2139  	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
587f03deb69bdbf Fred Isaman           2016-09-21  2140  	if (!lgp) {
587f03deb69bdbf Fred Isaman           2016-09-21  2141  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
587f03deb69bdbf Fred Isaman           2016-09-21  2142  					 PNFS_UPDATE_LAYOUT_NOMEM);
411ae722d10a6d4 Trond Myklebust       2018-06-23  2143  		nfs_layoutget_end(lo);
587f03deb69bdbf Fred Isaman           2016-09-21  2144  		goto out_put_layout_hdr;
587f03deb69bdbf Fred Isaman           2016-09-21  2145  	}
587f03deb69bdbf Fred Isaman           2016-09-21  2146  
b4e89bcba2b3a96 Trond Myklebust       2021-07-02  2147  	lgp->lo = lo;
b4e89bcba2b3a96 Trond Myklebust       2021-07-02  2148  	pnfs_get_layout_hdr(lo);
b4e89bcba2b3a96 Trond Myklebust       2021-07-02  2149  
dacb452db873347 Fred Isaman           2016-09-19  2150  	lseg = nfs4_proc_layoutget(lgp, &timeout);
183d9e7b112aaed Jeff Layton           2016-05-17  2151  	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
183d9e7b112aaed Jeff Layton           2016-05-17  2152  				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
411ae722d10a6d4 Trond Myklebust       2018-06-23  2153  	nfs_layoutget_end(lo);
83026d80a16ea6a Jeff Layton           2016-05-17  2154  	if (IS_ERR(lseg)) {
183d9e7b112aaed Jeff Layton           2016-05-17  2155  		switch(PTR_ERR(lseg)) {
e85d7ee42003314 Trond Myklebust       2016-07-14  2156  		case -EBUSY:
183d9e7b112aaed Jeff Layton           2016-05-17  2157  			if (time_after(jiffies, giveup))
183d9e7b112aaed Jeff Layton           2016-05-17  2158  				lseg = NULL;
66b53f325876703 Trond Myklebust       2016-07-14  2159  			break;
66b53f325876703 Trond Myklebust       2016-07-14  2160  		case -ERECALLCONFLICT:
183d9e7b112aaed Jeff Layton           2016-05-17  2161  		case -EAGAIN:
56b38a1f7c78151 Trond Myklebust       2016-07-14  2162  			break;
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2163  		case -ENODATA:
7f80a6c53d6cdb8 Olga Kornievskaia     2022-05-31  2164  			set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
183d9e7b112aaed Jeff Layton           2016-05-17 @2165  		default:
83026d80a16ea6a Jeff Layton           2016-05-17  2166  			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
83026d80a16ea6a Jeff Layton           2016-05-17  2167  				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
83026d80a16ea6a Jeff Layton           2016-05-17  2168  				lseg = NULL;
83026d80a16ea6a Jeff Layton           2016-05-17  2169  			}
56b38a1f7c78151 Trond Myklebust       2016-07-14  2170  			goto out_put_layout_hdr;
56b38a1f7c78151 Trond Myklebust       2016-07-14  2171  		}
56b38a1f7c78151 Trond Myklebust       2016-07-14  2172  		if (lseg) {
56b38a1f7c78151 Trond Myklebust       2016-07-14  2173  			if (first)
56b38a1f7c78151 Trond Myklebust       2016-07-14  2174  				pnfs_clear_first_layoutget(lo);
56b38a1f7c78151 Trond Myklebust       2016-07-14  2175  			trace_pnfs_update_layout(ino, pos, count,
56b38a1f7c78151 Trond Myklebust       2016-07-14  2176  				iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
56b38a1f7c78151 Trond Myklebust       2016-07-14  2177  			pnfs_put_layout_hdr(lo);
56b38a1f7c78151 Trond Myklebust       2016-07-14  2178  			goto lookup_again;
183d9e7b112aaed Jeff Layton           2016-05-17  2179  		}
83026d80a16ea6a Jeff Layton           2016-05-17  2180  	} else {
83026d80a16ea6a Jeff Layton           2016-05-17  2181  		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
83026d80a16ea6a Jeff Layton           2016-05-17  2182  	}
83026d80a16ea6a Jeff Layton           2016-05-17  2183  
830ffb565760234 Trond Myklebust       2012-09-20  2184  out_put_layout_hdr:
d67ae825a59d639 Tom Haynes            2014-12-11  2185  	if (first)
d67ae825a59d639 Tom Haynes            2014-12-11  2186  		pnfs_clear_first_layoutget(lo);
d5b9216fd5114be Trond Myklebust       2019-07-18  2187  	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
d5b9216fd5114be Trond Myklebust       2019-07-18  2188  				 PNFS_UPDATE_LAYOUT_EXIT);
70c3bd2bdf9a3c7 Trond Myklebust       2012-09-18  2189  	pnfs_put_layout_hdr(lo);
e5e940170b2136a Benny Halevy          2010-10-20  2190  out:
f86bbcf85db3259 Trond Myklebust       2012-09-26  2191  	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
f86bbcf85db3259 Trond Myklebust       2012-09-26  2192  			"(%s, offset: %llu, length: %llu)\n",
f86bbcf85db3259 Trond Myklebust       2012-09-26  2193  			__func__, ino->i_sb->s_id,
f86bbcf85db3259 Trond Myklebust       2012-09-26  2194  			(unsigned long long)NFS_FILEID(ino),
d600ad1f2bdbf97 Peng Tao              2015-12-04  2195  			IS_ERR_OR_NULL(lseg) ? "not found" : "found",
f86bbcf85db3259 Trond Myklebust       2012-09-26  2196  			iomode==IOMODE_RW ?  "read/write" : "read-only",
f86bbcf85db3259 Trond Myklebust       2012-09-26  2197  			(unsigned long long)pos,
f86bbcf85db3259 Trond Myklebust       2012-09-26  2198  			(unsigned long long)count);
e5e940170b2136a Benny Halevy          2010-10-20  2199  	return lseg;
e5e940170b2136a Benny Halevy          2010-10-20  2200  out_unlock:
e5e940170b2136a Benny Halevy          2010-10-20  2201  	spin_unlock(&ino->i_lock);
830ffb565760234 Trond Myklebust       2012-09-20  2202  	goto out_put_layout_hdr;
e5e940170b2136a Benny Halevy          2010-10-20  2203  }
7c24d9489fe57d6 Andy Adamson          2011-06-13  2204  EXPORT_SYMBOL_GPL(pnfs_update_layout);
b1f69b754ee312e Andy Adamson          2010-10-20  2205
kernel test robot May 31, 2022, 6:19 p.m. UTC | #6
Hi Olga,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on trondmy-nfs/linux-next]
[also build test WARNING on v5.18 next-20220531]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
base:   git://git.linux-nfs.org/projects/trondmy/linux-nfs.git linux-next
config: arm-defconfig (https://download.01.org/0day-ci/archive/20220601/202206010206.kDKlof2o-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/7f80a6c53d6cdb806706a8748cb79348f9906229
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Olga-Kornievskaia/pNFS-fix-IO-thread-starvation-problem-during-LAYOUTUNAVAILABLE-error/20220531-215040
        git checkout 7f80a6c53d6cdb806706a8748cb79348f9906229
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash fs/nfs/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/bitops.h:33,
                    from include/linux/kernel.h:22,
                    from include/linux/uio.h:8,
                    from include/linux/socket.h:8,
                    from include/uapi/linux/in.h:24,
                    from include/linux/in.h:19,
                    from include/linux/nfs_fs.h:22,
                    from fs/nfs/pnfs.c:30:
   fs/nfs/pnfs.c: In function 'pnfs_update_layout':
>> arch/arm/include/asm/bitops.h:183:41: warning: this statement may fall through [-Wimplicit-fallthrough=]
     183 | #define ATOMIC_BITOP(name,nr,p)         _##name(nr,p)
         |                                         ^~~~~~~~~~~~~
   arch/arm/include/asm/bitops.h:189:41: note: in expansion of macro 'ATOMIC_BITOP'
     189 | #define set_bit(nr,p)                   ATOMIC_BITOP(set_bit,nr,p)
         |                                         ^~~~~~~~~~~~
   fs/nfs/pnfs.c:2164:25: note: in expansion of macro 'set_bit'
    2164 |                         set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
         |                         ^~~~~~~
   fs/nfs/pnfs.c:2165:17: note: here
    2165 |                 default:
         |                 ^~~~~~~


vim +183 arch/arm/include/asm/bitops.h

^1da177e4c3f41 include/asm-arm/bitops.h      Linus Torvalds 2005-04-16  175  
e7ec02938dbe8c include/asm-arm/bitops.h      Russell King   2005-07-28  176  #ifndef CONFIG_SMP
^1da177e4c3f41 include/asm-arm/bitops.h      Linus Torvalds 2005-04-16  177  /*
^1da177e4c3f41 include/asm-arm/bitops.h      Linus Torvalds 2005-04-16  178   * The __* form of bitops are non-atomic and may be reordered.
^1da177e4c3f41 include/asm-arm/bitops.h      Linus Torvalds 2005-04-16  179   */
6323f0ccedf756 arch/arm/include/asm/bitops.h Russell King   2011-01-16  180  #define ATOMIC_BITOP(name,nr,p)			\
6323f0ccedf756 arch/arm/include/asm/bitops.h Russell King   2011-01-16  181  	(__builtin_constant_p(nr) ? ____atomic_##name(nr, p) : _##name(nr,p))
e7ec02938dbe8c include/asm-arm/bitops.h      Russell King   2005-07-28  182  #else
6323f0ccedf756 arch/arm/include/asm/bitops.h Russell King   2011-01-16 @183  #define ATOMIC_BITOP(name,nr,p)		_##name(nr,p)
e7ec02938dbe8c include/asm-arm/bitops.h      Russell King   2005-07-28  184  #endif
^1da177e4c3f41 include/asm-arm/bitops.h      Linus Torvalds 2005-04-16  185
Mkrtchyan, Tigran June 1, 2022, 7:27 a.m. UTC | #7
Hi Olga,

----- Original Message -----
> From: "Olga Kornievskaia" <olga.kornievskaia@gmail.com>
> To: "trondmy" <trondmy@hammerspace.com>
> Cc: "Anna Schumaker" <anna.schumaker@netapp.com>, "linux-nfs" <linux-nfs@vger.kernel.org>
> Sent: Tuesday, 31 May, 2022 18:03:34
> Subject: Re: [PATCH] pNFS: fix IO thread starvation problem during LAYOUTUNAVAILABLE error

> On Tue, May 31, 2022 at 11:00 AM Trond Myklebust
> <trondmy@hammerspace.com> wrote:
>>
>> On Tue, 2022-05-31 at 09:48 -0400, Olga Kornievskaia wrote:
>> > From: Olga Kornievskaia <kolga@netapp.com>
>> >
>> > In recent pnfs testing we've incountered IO thread starvation problem
>> > during the time when the server returns LAYOUTUNAVAILABLE error to
>> > the
>> > client. When that happens each IO request tries to get a new layout
>> > and the pnfs_update_layout() code ensures that only 1 LAYOUTGET
>> > RPC is outstanding, the rest would be waiting. As the thread that
>> > gets
>> > the layout wakes up the waiters only one gets to run and it tends to
>> > be
>> > the latest added to the waiting queue. After receiving
>> > LAYOUTUNAVAILABLE
>> > error the client would fall back to the MDS writes and as those
>> > writes
>> > complete and the new write is issued, those requests are added as
>> > waiters and they get to run before the earliest of the waiters that
>> > was put on the queue originally never gets to run until the
>> > LAYOUTUNAVAILABLE condition resolves itself on the server.
>> >
>> > With the current code, if N IOs arrive asking for a layout, then
>> > there will be N serial LAYOUTGETs that will follow, each would be
>> > getting its own LAYOUTUNAVAILABLE error. Instead, the patch proposes
>> > to apply the error condition to ALL the waiters for the outstanding
>> > LAYOUTGET. Once the error is received, the code would allow all
>> > exiting N IOs fall back to the MDS, but any new arriving IOs would be
>> > then queued up and one them the new IO would trigger a new LAYOUTGET.
>> >
>> > Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
>> > ---
>> >  fs/nfs/pnfs.c | 14 +++++++++++++-
>> >  fs/nfs/pnfs.h |  2 ++
>> >  2 files changed, 15 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> > index 68a87be3e6f9..5b7a679e32c8 100644
>> > --- a/fs/nfs/pnfs.c
>> > +++ b/fs/nfs/pnfs.c
>> > @@ -2028,10 +2028,20 @@ pnfs_update_layout(struct inode *ino,
>> >         if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo))
>> > &&
>> >             atomic_read(&lo->plh_outstanding) != 0) {
>> >                 spin_unlock(&ino->i_lock);
>> > +               atomic_inc(&lo->plh_waiting);
>> >                 lseg = ERR_PTR(wait_var_event_killable(&lo-
>> > >plh_outstanding,
>> >                                         !atomic_read(&lo-
>> > >plh_outstanding)));
>> > -               if (IS_ERR(lseg))
>> > +               if (IS_ERR(lseg)) {
>> > +                       atomic_dec(&lo->plh_waiting);
>> >                         goto out_put_layout_hdr;
>> > +               }
>> > +               if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
>> > +                       pnfs_layout_clear_fail_bit(lo,
>> > pnfs_iomode_to_fail_bit(iomode));
>> > +                       lseg = NULL;
>> > +                       if (atomic_dec_and_test(&lo->plh_waiting))
>> > +                               clear_bit(NFS_LAYOUT_DRAIN, &lo-
>> > >plh_flags);
>> > +                       goto out_put_layout_hdr;
>> > +               }
>> >                 pnfs_put_layout_hdr(lo);
>> >                 goto lookup_again;
>> >         }
>> > @@ -2152,6 +2162,8 @@ pnfs_update_layout(struct inode *ino,
>> >                 case -ERECALLCONFLICT:
>> >                 case -EAGAIN:
>> >                         break;
>> > +               case -ENODATA:
>> > +                       set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
>> >                 default:
>> >                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
>> >                                 pnfs_layout_clear_fail_bit(lo,
>> > pnfs_iomode_to_fail_bit(iomode));
>> > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> > index 07f11489e4e9..5c07da32320b 100644
>> > --- a/fs/nfs/pnfs.h
>> > +++ b/fs/nfs/pnfs.h
>> > @@ -105,6 +105,7 @@ enum {
>> >         NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget
>> > */
>> >         NFS_LAYOUT_INODE_FREEING,       /* The inode is being freed
>> > */
>> >         NFS_LAYOUT_HASHED,              /* The layout visible */
>> > +       NFS_LAYOUT_DRAIN,
>> >  };
>> >
>> >  enum layoutdriver_policy_flags {
>> > @@ -196,6 +197,7 @@ struct pnfs_commit_ops {
>> >  struct pnfs_layout_hdr {
>> >         refcount_t              plh_refcount;
>> >         atomic_t                plh_outstanding; /* number of RPCs
>> > out */
>> > +       atomic_t                plh_waiting;
>> >         struct list_head        plh_layouts;   /* other client
>> > layouts */
>> >         struct list_head        plh_bulk_destroy;
>> >         struct list_head        plh_segs;      /* layout segments
>> > list */
>>
>> According to the spec, the correct behaviour for handling
>> NFS4ERR_LAYOUTUNAVAILABLE is to stop trying to do pNFS to the inode,
>> and to fall back to doing I/O through the MDS. The error describes a
>> more or less permanent state of the server being unable to hand out a
>> layout for this file.
>> If the server wanted the clients to retry after a delay, it should be
>> returning NFS4ERR_LAYOUTTRYLATER. We already handle that correctly.
> 
> To clarify, can you confirm that LAYOUTUNAVAILABLE would only turn off
> the inode permanently but for a period of time?
> 
> It looks to me that for the LAYOUTTRYLATER, the client would face the
> same starvation problem in the same situation. I don't see anything
> marking the segment failed for such error? I believe the client
> returns nolayout for that error, falls back to MDS but allows asking
> for the layout for a period of time, having again the queue of waiters

Your assumption doesn't match to our observation. For files that off-line
(DS down or file is on tape) we return LAYOUTTRYLATER. Usually, client keep
re-trying LAYOUTGET until file is available again. We use flexfile layout
as nfs4_file has less predictable behavior. For files that should be served
by MDS we return LAYOUTUNAVAILABLE. Typically, those files are quite small
and served with a single READ request, so I haven't observe repetitive LAYOUTGET
calls.

Best regards,
   Tigran. 

> that gets manipulated as such that favors last added.
> 
> 
>> Currently, what our client does to handle NFS4ERR_LAYOUTUNAVAILABLE is
>> just plain wrong: we just return no layout, and then let the next
>> caller to pnfs_update_layout() immediately try again.
>>
>> My problem with this patch, is that it just falls back to doing I/O
>> through the MDS for the writes that are already queued in
>> pnfs_update_layout(). It perpetuates the current bad behaviour of
>> unnecessary pounding of the server with LAYOUTGET requests that are
>> going to fail with the exact same error.
>>
>> I'd therefore prefer to see us fix the real bug (i.e. the handling of
>> NFS4ERR_LAYOUTUNAVAILABLE) first, and then look at mitigating issues
>> with the queuing. I already have 2 patches to deal with this.
>>
>> --
>> Trond Myklebust
>> Linux NFS client maintainer, Hammerspace
>> trond.myklebust@hammerspace.com
>>
Trond Myklebust June 1, 2022, 3:50 p.m. UTC | #8
On Wed, 2022-06-01 at 09:27 +0200, Mkrtchyan, Tigran wrote:
> Hi Olga,
> 
> ----- Original Message -----
> > From: "Olga Kornievskaia" <olga.kornievskaia@gmail.com>
> > To: "trondmy" <trondmy@hammerspace.com>
> > Cc: "Anna Schumaker" <anna.schumaker@netapp.com>, "linux-nfs"
> > <linux-nfs@vger.kernel.org>
> > Sent: Tuesday, 31 May, 2022 18:03:34
> > Subject: Re: [PATCH] pNFS: fix IO thread starvation problem during
> > LAYOUTUNAVAILABLE error
> 
> 

<snip>

> > To clarify, can you confirm that LAYOUTUNAVAILABLE would only turn
> > off
> > the inode permanently but for a period of time?
> > 
> > It looks to me that for the LAYOUTTRYLATER, the client would face
> > the
> > same starvation problem in the same situation. I don't see anything
> > marking the segment failed for such error? I believe the client
> > returns nolayout for that error, falls back to MDS but allows
> > asking
> > for the layout for a period of time, having again the queue of
> > waiters
> 
> Your assumption doesn't match to our observation. For files that off-
> line
> (DS down or file is on tape) we return LAYOUTTRYLATER. Usually,
> client keep
> re-trying LAYOUTGET until file is available again. We use flexfile
> layout
> as nfs4_file has less predictable behavior. For files that should be
> served
> by MDS we return LAYOUTUNAVAILABLE. Typically, those files are quite
> small
> and served with a single READ request, so I haven't observe
> repetitive LAYOUTGET
> calls.

Right. If you're only doing READ, and this is a small file, then you
are unlikely to see any repetition of the LAYOUTGET calls like Olga
describes, because the client page cache will take care of serialising
the initial I/O (by means of the folio lock/page lock) and will cache
the results so that no further I/O is needed.

The main problems with NFS4ERR_LAYOUTUNAVAILABLE in current pNFS
implementation will occur when reading large files and/or when writing
to the file. In those cases, we will send a LAYOUTGET each time we need
to schedule more I/O because we don't cache the negative result of the
previous attempt.
Mkrtchyan, Tigran June 2, 2022, 10:09 a.m. UTC | #9
I had re-run the tests. So, indeed, if I wait log enough, then client falls back to IO
through MDS and issues a new LAYOUTGET after the first successful READ operation.

the capture available at https://sas.desy.de/index.php/s/StGeijXTGysdHa2

Best regards,
  Tigran.


----- Original Message -----
> From: "trondmy" <trondmy@hammerspace.com>
> To: "Tigran Mkrtchyan" <tigran.mkrtchyan@desy.de>, "Olga Kornievskaia" <olga.kornievskaia@gmail.com>
> Cc: "linux-nfs" <linux-nfs@vger.kernel.org>, "Anna Schumaker" <anna.schumaker@netapp.com>
> Sent: Wednesday, 1 June, 2022 17:50:53
> Subject: Re: [PATCH] pNFS: fix IO thread starvation problem during LAYOUTUNAVAILABLE error

> On Wed, 2022-06-01 at 09:27 +0200, Mkrtchyan, Tigran wrote:
>> Hi Olga,
>> 
>> ----- Original Message -----
>> > From: "Olga Kornievskaia" <olga.kornievskaia@gmail.com>
>> > To: "trondmy" <trondmy@hammerspace.com>
>> > Cc: "Anna Schumaker" <anna.schumaker@netapp.com>, "linux-nfs"
>> > <linux-nfs@vger.kernel.org>
>> > Sent: Tuesday, 31 May, 2022 18:03:34
>> > Subject: Re: [PATCH] pNFS: fix IO thread starvation problem during
>> > LAYOUTUNAVAILABLE error
>> 
>> 
> 
> <snip>
> 
>> > To clarify, can you confirm that LAYOUTUNAVAILABLE would only turn
>> > off
>> > the inode permanently but for a period of time?
>> > 
>> > It looks to me that for the LAYOUTTRYLATER, the client would face
>> > the
>> > same starvation problem in the same situation. I don't see anything
>> > marking the segment failed for such error? I believe the client
>> > returns nolayout for that error, falls back to MDS but allows
>> > asking
>> > for the layout for a period of time, having again the queue of
>> > waiters
>> 
>> Your assumption doesn't match to our observation. For files that off-
>> line
>> (DS down or file is on tape) we return LAYOUTTRYLATER. Usually,
>> client keep
>> re-trying LAYOUTGET until file is available again. We use flexfile
>> layout
>> as nfs4_file has less predictable behavior. For files that should be
>> served
>> by MDS we return LAYOUTUNAVAILABLE. Typically, those files are quite
>> small
>> and served with a single READ request, so I haven't observe
>> repetitive LAYOUTGET
>> calls.
> 
> Right. If you're only doing READ, and this is a small file, then you
> are unlikely to see any repetition of the LAYOUTGET calls like Olga
> describes, because the client page cache will take care of serialising
> the initial I/O (by means of the folio lock/page lock) and will cache
> the results so that no further I/O is needed.
> 
> The main problems with NFS4ERR_LAYOUTUNAVAILABLE in current pNFS
> implementation will occur when reading large files and/or when writing
> to the file. In those cases, we will send a LAYOUTGET each time we need
> to schedule more I/O because we don't cache the negative result of the
> previous attempt.
> 
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
diff mbox series

Patch

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 68a87be3e6f9..5b7a679e32c8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2028,10 +2028,20 @@  pnfs_update_layout(struct inode *ino,
 	if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
 	    atomic_read(&lo->plh_outstanding) != 0) {
 		spin_unlock(&ino->i_lock);
+		atomic_inc(&lo->plh_waiting);
 		lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
 					!atomic_read(&lo->plh_outstanding)));
-		if (IS_ERR(lseg))
+		if (IS_ERR(lseg)) {
+			atomic_dec(&lo->plh_waiting);
 			goto out_put_layout_hdr;
+		}
+		if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
+			pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+			lseg = NULL;
+			if (atomic_dec_and_test(&lo->plh_waiting))
+				clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
+			goto out_put_layout_hdr;
+		}
 		pnfs_put_layout_hdr(lo);
 		goto lookup_again;
 	}
@@ -2152,6 +2162,8 @@  pnfs_update_layout(struct inode *ino,
 		case -ERECALLCONFLICT:
 		case -EAGAIN:
 			break;
+		case -ENODATA:
+			set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
 		default:
 			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
 				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 07f11489e4e9..5c07da32320b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -105,6 +105,7 @@  enum {
 	NFS_LAYOUT_FIRST_LAYOUTGET,	/* Serialize first layoutget */
 	NFS_LAYOUT_INODE_FREEING,	/* The inode is being freed */
 	NFS_LAYOUT_HASHED,		/* The layout visible */
+	NFS_LAYOUT_DRAIN,
 };
 
 enum layoutdriver_policy_flags {
@@ -196,6 +197,7 @@  struct pnfs_commit_ops {
 struct pnfs_layout_hdr {
 	refcount_t		plh_refcount;
 	atomic_t		plh_outstanding; /* number of RPCs out */
+	atomic_t		plh_waiting;
 	struct list_head	plh_layouts;   /* other client layouts */
 	struct list_head	plh_bulk_destroy;
 	struct list_head	plh_segs;      /* layout segments list */