diff mbox series

Place a memory barrier around cp_state changes

Message ID 20190606223110.15725-1-stancheff@cray.com (mailing list archive)
State New, archived
Headers show
Series Place a memory barrier around cp_state changes | expand

Commit Message

Shaun Tancheff June 6, 2019, 10:31 p.m. UTC
Ensure all uses of cl_page->cp_state are smp-safe.

LBUG Output:
In __cl_page_state_set()
..
  old = page->cp_state;
  PASSERT(env, page, allowed_transitions[old][state]);
..
Asserted with the following:
  page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]

However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition
leading to the conclusion that cp_state became 0 during the
assertion.

Signed-off-by: Shaun Tancheff <stancheff@cray.com>
---
 fs/lustre/include/cl_object.h   | 11 +++++++++++
 fs/lustre/llite/rw26.c          |  2 +-
 fs/lustre/llite/vvp_page.c      |  6 +++---
 fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------
 fs/lustre/obdecho/echo_client.c |  2 +-
 fs/lustre/osc/osc_cache.c       | 18 +++++++++--------
 6 files changed, 46 insertions(+), 27 deletions(-)

Comments

NeilBrown June 7, 2019, 12:56 a.m. UTC | #1
On Thu, Jun 06 2019, Shaun Tancheff wrote:

> Ensure all uses of cl_page->cp_state are smp-safe.
>
> LBUG Output:
> In __cl_page_state_set()
> ..
>   old = page->cp_state;
>   PASSERT(env, page, allowed_transitions[old][state]);
> ..
> Asserted with the following:
>   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]

What do all these fields mean?  Or where is this printed?
The closest I can find is in cl_page_header_print() which uses the
format string:

		   "page@%p[%d %p %d %d %p]\n",

But that only has 5 fields in the [], while the output you provided has
6.

>
> However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition
> leading to the conclusion that cp_state became 0 during the
> assertion.

Where is the evidence that this was the transition that was happening?
If it was in some part of the LBUG output that wasn't quoted - then
please fix that by quoting the entire LBUG output.

We need to understand which change happened at this time to cause the
race.
Then we need to explain why the added barriers actually close the race
window.

If these barrier do actually fix a race, then there is something *very*
wrong.
The comments say that changes to cp_state are protected by the page lock
on the corresponding VM page.
Locking and unlocking a VM page entails sufficient barriers that changes
made while one thread holds the lock will be visible to another
thread once it also gets the lock.

So the only possible explanation for a race as you suspect, is if the
vmpage *isn't* locked when something changes cp_state, and that would be
bad.

Thanks,
NeilBrown


>
> Signed-off-by: Shaun Tancheff <stancheff@cray.com>
> ---
>  fs/lustre/include/cl_object.h   | 11 +++++++++++
>  fs/lustre/llite/rw26.c          |  2 +-
>  fs/lustre/llite/vvp_page.c      |  6 +++---
>  fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------
>  fs/lustre/obdecho/echo_client.c |  2 +-
>  fs/lustre/osc/osc_cache.c       | 18 +++++++++--------
>  6 files changed, 46 insertions(+), 27 deletions(-)
>
> diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h
> index 691c2f5da53a..d6e1f6f05f50 100644
> --- a/fs/lustre/include/cl_object.h
> +++ b/fs/lustre/include/cl_object.h
> @@ -752,6 +752,17 @@ struct cl_page {
>  	struct cl_sync_io		*cp_sync_io;
>  };
>  
> +static inline enum cl_page_state cl_page_state_get(const struct cl_page *pg)
> +{
> +	/*
> +	 * Paired with smp_store_release in cl_page_state_set_trust
> +	 * and ensures that we see the most recent value of cp_state
> +	 * even when the last modification was not performed on the
> +	 * current processor
> +	 */
> +	return smp_load_acquire(&pg->cp_state);
> +}
> +
>  /**
>   * Per-layer part of cl_page.
>   *
> diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
> index e4ce3b6f5772..364dec208ccd 100644
> --- a/fs/lustre/llite/rw26.c
> +++ b/fs/lustre/llite/rw26.c
> @@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io,
>  
>  		rc = cl_page_own(env, io, clp);
>  		if (rc) {
> -			LASSERT(clp->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
>  			cl_page_put(env, clp);
>  			break;
>  		}
> diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
> index 590e5f5e43c9..38b8c488d765 100644
> --- a/fs/lustre/llite/vvp_page.c
> +++ b/fs/lustre/llite/vvp_page.c
> @@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env *env,
>  
>  	lock_page(vmpage);
>  	if (clear_page_dirty_for_io(vmpage)) {
> -		LASSERT(pg->cp_state == CPS_CACHED);
> +		LASSERT(cl_page_state_get(pg) == CPS_CACHED);
>  		/* This actually clears the dirty bit in the radix tree. */
>  		set_page_writeback(vmpage);
>  		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
> -	} else if (pg->cp_state == CPS_PAGEOUT) {
> +	} else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
>  		/* is it possible for osc_flush_async_page() to already
>  		 * make it ready?
>  		 */
>  		result = -EALREADY;
>  	} else {
>  		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
> -			      pg->cp_state);
> +			      cl_page_state_get(pg));
>  		LBUG();
>  	}
>  	unlock_page(vmpage);
> diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
> index 349f19e014e0..da4429b82932 100644
> --- a/fs/lustre/obdclass/cl_page.c
> +++ b/fs/lustre/obdclass/cl_page.c
> @@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
>  
>  	PASSERT(env, page, list_empty(&page->cp_batch));
>  	PASSERT(env, page, !page->cp_owner);
> -	PASSERT(env, page, page->cp_state == CPS_FREEING);
> +	PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
>  
>  	while ((slice = list_first_entry_or_null(&page->cp_layers,
>  						 struct cl_page_slice,
> @@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
>  static inline void cl_page_state_set_trust(struct cl_page *page,
>  					   enum cl_page_state state)
>  {
> -	/* bypass const. */
> -	*(enum cl_page_state *)&page->cp_state = state;
> +	/*
> +	 * Paired with smp_load_acquire in cl_page_state_get
> +	 * and ensures that we see the most recent value of cp_state
> +	 * is available even when the next access is not performed on the
> +	 * current processor.
> +	 * Note we also cast away const as the only modifier of cp_state.
> +	 */
> +	smp_store_release((enum cl_page_state *)&page->cp_state, state);
>  }
>  
>  struct cl_page *cl_page_alloc(const struct lu_env *env,
> @@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct lu_env *env,
>  		}
>  	};
>  
> -	old = page->cp_state;
> +	old = cl_page_state_get(page);
>  	PASSERT(env, page, allowed_transitions[old][state]);
>  	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
> -	PASSERT(env, page, page->cp_state == old);
> +	PASSERT(env, page, cl_page_state_get(page) == old);
>  	PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
>  	cl_page_state_set_trust(page, state);
>  }
> @@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)
>  		       refcount_read(&page->cp_ref));
>  
>  	if (refcount_dec_and_test(&page->cp_ref)) {
> -		LASSERT(page->cp_state == CPS_FREEING);
> +		LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  
>  		LASSERT(refcount_read(&page->cp_ref) == 0);
>  		PASSERT(env, page, !page->cp_owner);
> @@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,
>  	const struct cl_page_slice *slice;
>  	enum cl_page_state state;
>  
> -	state = pg->cp_state;
> +	state = cl_page_state_get(pg);
>  	cl_page_owner_clear(pg);
>  
>  	if (state == CPS_OWNED)
> @@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
>  	struct cl_io *top = cl_io_top((struct cl_io *)io);
>  
>  	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
> -	return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
> +	return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
>  }
>  EXPORT_SYMBOL(cl_page_is_owned);
>  
> @@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
>  
>  	io = cl_io_top(io);
>  
> -	if (pg->cp_state == CPS_FREEING) {
> +	if (cl_page_state_get(pg) == CPS_FREEING) {
>  		result = -ENOENT;
>  		goto out;
>  	}
> @@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
>  		PASSERT(env, pg, !pg->cp_owner);
>  		pg->cp_owner = cl_io_top(io);
>  		cl_page_owner_set(pg);
> -		if (pg->cp_state != CPS_FREEING) {
> +		if (cl_page_state_get(pg) != CPS_FREEING) {
>  			cl_page_state_set(env, pg, CPS_OWNED);
>  		} else {
>  			__cl_page_disown(env, io, pg);
> @@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg)
>  {
>  	const struct cl_page_slice *slice;
>  
> -	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
> +	PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
>  
>  	/*
>  	 * Sever all ways to obtain new pointers to @pg.
> @@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,
>  	const struct cl_page_slice *slice;
>  
>  	PASSERT(env, pg, crt < CRT_NR);
> -	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
> +	PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
>  
>  	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
>  
> @@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
>  	}
>  
>  	if (result >= 0) {
> -		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
> +		PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
>  		cl_page_io_start(env, pg, crt);
>  		result = 0;
>  	}
> @@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env, void *cookie,
>  	(*printer)(env, cookie,
>  		   "page@%p[%d %p %d %d %p]\n",
>  		   pg, refcount_read(&pg->cp_ref), pg->cp_obj,
> -		   pg->cp_state, pg->cp_type,
> +		   cl_page_state_get(pg), pg->cp_type,
>  		   pg->cp_owner);
>  }
>  EXPORT_SYMBOL(cl_page_header_print);
> diff --git a/fs/lustre/obdecho/echo_client.c b/fs/lustre/obdecho/echo_client.c
> index 317123fd27cb..d879f109e641 100644
> --- a/fs/lustre/obdecho/echo_client.c
> +++ b/fs/lustre/obdecho/echo_client.c
> @@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
>  
>  		rc = cl_page_own(env, io, clp);
>  		if (rc) {
> -			LASSERT(clp->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
>  			cl_page_put(env, clp);
>  			break;
>  		}
> diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
> index f8fddbfe6a7e..75984b98b229 100644
> --- a/fs/lustre/osc/osc_cache.c
> +++ b/fs/lustre/osc/osc_cache.c
> @@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
>  			cl_page_discard(env, io, page);
>  			cl_page_disown(env, io, page);
>  		} else {
> -			LASSERT(page->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  			LASSERT(0);
>  		}
>  
> @@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
>  	int srvlock;
>  
>  	cmd &= ~OBD_BRW_NOQUOTA;
> -	LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
> -		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> -	LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
> -		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> +	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
> +		      cmd == OBD_BRW_READ),
> +		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> +	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
> +		      cmd == OBD_BRW_WRITE),
> +		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
>  	LASSERT(opg->ops_transfer_pinned);
>  
>  	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
> @@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
>  
>  			page = ops->ops_cl.cpl_page;
>  			LASSERT(page->cp_type == CPT_CACHEABLE);
> -			if (page->cp_state == CPS_FREEING)
> +			if (cl_page_state_get(page) == CPS_FREEING)
>  				continue;
>  
>  			cl_page_get(page);
> @@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
>  			cl_page_discard(env, io, page);
>  			cl_page_disown(env, io, page);
>  		} else {
> -			LASSERT(page->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  		}
>  	}
>  
> @@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env, struct cl_io *io,
>  		cl_page_discard(env, io, page);
>  		cl_page_disown(env, io, page);
>  	} else {
> -		LASSERT(page->cp_state == CPS_FREEING);
> +		LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  	}
>  
>  	return true;
> -- 
> 2.17.1
Shaun Tancheff June 7, 2019, 3:54 a.m. UTC | #2
On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
>
> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > Ensure all uses of cl_page->cp_state are smp-safe.
> >
> > LBUG Output:
> > In __cl_page_state_set()
> > ..
> >   old = page->cp_state;
> >   PASSERT(env, page, allowed_transitions[old][state]);
> > ..
> > Asserted with the following:
> >   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> What do all these fields mean?  Or where is this printed?
> The closest I can find is in cl_page_header_print() which uses the
> format string:
>
>                    "page@%p[%d %p %d %d %p]\n",
>
> But that only has 5 fields in the [], while the output you provided has
> 6.
>

Sorry for the confusion, here is my (long) explanation with source snippets:

The LBUG in full

LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
vvp-page@ffff80be1fcd6650(0:0) vm@ffff7e02fa0e60c0 5000000000001029 4:0
ffff80be1fcd6600 1049548 lru
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
lov-page@ffff80be1fcd6690, comp index: 0, gen: 0
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
osc-page@ffff80be1fcd66c8 1049548: 1< 0x845fed 2 0 - - > 2< 4298948608 0
4096 0x0 0x420 |           (null) ffff809e473708e8 ffff80be199c1e40 > 3< 0
0 0 > 4< 5 13 64 0 + | - - - - > 5< - - - - | 0 - | 0 - ->
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) end
page@ffff80be1fcd6600
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
allowed_transitions[old][state]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) ASSERTION( 0 )
failed:
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) LBUG

The source snippet around L342
...
   339
   340          ENTRY;
   341          old = page->cp_state;
   342          PASSERT(env, page, allowed_transitions[old][state]);
   343          CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old,
state);
   344          PASSERT(env, page, page->cp_state == old);
   345          PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner
!= NULL));
   346
   347          cs_pagestate_dec(page->cp_obj, page->cp_state);
   348          cs_pagestate_inc(page->cp_obj, state);

The PASSERT macro ...
...
# define PASSERT(env, page, expr)                                       \
  do {                                                                    \
          if (unlikely(!(expr))) {                                      \
                  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
                  LASSERT(0);                                           \
          }                                                             \
  } while (0)
#else /* !LIBCFS_DEBUG */
...

/**
 * Helper macro, dumping detailed information about \a page into a log.
 */
#define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \
do {                                                                    \
        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);        \
                cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
                CDEBUG(mask, format , ## __VA_ARGS__);                  \
        }                                                               \
} while (0)

Almost there
...
  1062  /**
  1063   * Prints human readable representation of \a pg to the \a f.
  1064   */
  1065  void cl_page_print(const struct lu_env *env, void *cookie,
  1066                     lu_printer_t printer, const struct cl_page *pg)
  1067  {
  1068          const struct cl_page_slice *slice;
  1069          int result = 0;
  1070
  1071          cl_page_header_print(env, cookie, printer, pg);
  1072          list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
  1073                  if (slice->cpl_ops->cpo_print != NULL)
  1074                          result = (*slice->cpl_ops->cpo_print)(env,
slice,
  1075
cookie, printer);

Finally the 'good stuff'
...
  1051  void cl_page_header_print(const struct lu_env *env, void *cookie,
  1052                            lu_printer_t printer, const struct
cl_page *pg)
  1053  {
  1054          (*printer)(env, cookie,
  1055                     "page@%p[%d %p %d %d %p]\n",
  1056                     pg, atomic_read(&pg->cp_ref), pg->cp_obj,
  1057                     pg->cp_state, pg->cp_type,
  1058                     pg->cp_owner);
  1059  }
...

Given the above along with the first line from the LBUG ....

LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]

Implies the following:

pg == ffff80be1fcd6600
pg->cp_ref == 3
pg->cp_obj == ffff80be1fca5cc0
pg->cp_state == 0 (state == CPS_CACHED)
pg->cp_type == 1
pg->cp_owner == ffff809e60e28cc0

Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
backtrace)
So we know the on-stack / register value of state == CPS_OWNED

The backtrace:
...
 dump_backtrace+0x0/0x248
 show_stack+0x24/0x30
 dump_stack+0xbc/0xf4
 libcfs_call_trace+0xec/0x120 [libcfs]
 lbug_with_loc+0x4c/0xb8 [libcfs]
 cl_page_state_set0+0x2b4/0x6e0 [obdclass]
 cl_page_assume+0xdc/0x3e0 [obdclass]
 ll_io_read_page+0x144c/0x1de0 [lustre]
 ll_write_begin+0x3d4/0xee8 [lustre]

and the contents of cl_page_assume

   596  void cl_page_assume(const struct lu_env *env,
   597                      struct cl_io *io, struct cl_page *pg)
   598  {
   599          const struct cl_page_slice *slice;
   600
   601          PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
   602
   603          ENTRY;
   604          io = cl_io_top(io);
   605
   606          list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
   607                  if (slice->cpl_ops->cpo_assume != NULL)
   608                          (*slice->cpl_ops->cpo_assume)(env, slice,
io);
   609          }
   610
   611          PASSERT(env, pg, pg->cp_owner == NULL);
   612          pg->cp_owner = cl_io_top(io);
   613          cl_page_owner_set(pg);
   614          cl_page_state_set(env, pg, CPS_OWNED);
   615          EXIT;
   616  }

While I believe my conclusion to be correct I am certainly open to being
dead wrong.



On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:

> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > Ensure all uses of cl_page->cp_state are smp-safe.
> >
> > LBUG Output:
> > In __cl_page_state_set()
> > ..
> >   old = page->cp_state;
> >   PASSERT(env, page, allowed_transitions[old][state]);
> > ..
> > Asserted with the following:
> >   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> What do all these fields mean?  Or where is this printed?
> The closest I can find is in cl_page_header_print() which uses the
> format string:
>
>                    "page@%p[%d %p %d %d %p]\n",
>
> But that only has 5 fields in the [], while the output you provided has
> 6.
>
> >
> > However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition
> > leading to the conclusion that cp_state became 0 during the
> > assertion.
>
> Where is the evidence that this was the transition that was happening?
> If it was in some part of the LBUG output that wasn't quoted - then
> please fix that by quoting the entire LBUG output.
>
> We need to understand which change happened at this time to cause the
> race.
> Then we need to explain why the added barriers actually close the race
> window.
>
> If these barrier do actually fix a race, then there is something *very*
> wrong.
> The comments say that changes to cp_state are protected by the page lock
> on the corresponding VM page.
> Locking and unlocking a VM page entails sufficient barriers that changes
> made while one thread holds the lock will be visible to another
> thread once it also gets the lock.
>
>
True, but a page lock is just a bit on the page flags. And we absolutely
wait on I/O while the
page is 'locked' so a ping-pong (our task *can* sleep) is entirely
reasonable, as far as
I understand anyway. Since there is multiple state-changes during a single
page lock/unlock
there is a race, however rare and unlikely, and it's arch specific based on
what I understand
of the memory barrier rules.

So the only possible explanation for a race as you suspect, is if the
> vmpage *isn't* locked when something changes cp_state, and that would be
> bad.
>
> Thanks,
> NeilBrown
>
>
> >
> > Signed-off-by: Shaun Tancheff <stancheff@cray.com>
> > ---
> >  fs/lustre/include/cl_object.h   | 11 +++++++++++
> >  fs/lustre/llite/rw26.c          |  2 +-
> >  fs/lustre/llite/vvp_page.c      |  6 +++---
> >  fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------
> >  fs/lustre/obdecho/echo_client.c |  2 +-
> >  fs/lustre/osc/osc_cache.c       | 18 +++++++++--------
> >  6 files changed, 46 insertions(+), 27 deletions(-)
> >
> > diff --git a/fs/lustre/include/cl_object.h
> b/fs/lustre/include/cl_object.h
> > index 691c2f5da53a..d6e1f6f05f50 100644
> > --- a/fs/lustre/include/cl_object.h
> > +++ b/fs/lustre/include/cl_object.h
> > @@ -752,6 +752,17 @@ struct cl_page {
> >       struct cl_sync_io               *cp_sync_io;
> >  };
> >
> > +static inline enum cl_page_state cl_page_state_get(const struct cl_page
> *pg)
> > +{
> > +     /*
> > +      * Paired with smp_store_release in cl_page_state_set_trust
> > +      * and ensures that we see the most recent value of cp_state
> > +      * even when the last modification was not performed on the
> > +      * current processor
> > +      */
> > +     return smp_load_acquire(&pg->cp_state);
> > +}
> > +
> >  /**
> >   * Per-layer part of cl_page.
> >   *
> > diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
> > index e4ce3b6f5772..364dec208ccd 100644
> > --- a/fs/lustre/llite/rw26.c
> > +++ b/fs/lustre/llite/rw26.c
> > @@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env
> *env, struct cl_io *io,
> >
> >               rc = cl_page_own(env, io, clp);
> >               if (rc) {
> > -                     LASSERT(clp->cp_state == CPS_FREEING);
> > +                     LASSERT(cl_page_state_get(clp) == CPS_FREEING);
> >                       cl_page_put(env, clp);
> >                       break;
> >               }
> > diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
> > index 590e5f5e43c9..38b8c488d765 100644
> > --- a/fs/lustre/llite/vvp_page.c
> > +++ b/fs/lustre/llite/vvp_page.c
> > @@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env
> *env,
> >
> >       lock_page(vmpage);
> >       if (clear_page_dirty_for_io(vmpage)) {
> > -             LASSERT(pg->cp_state == CPS_CACHED);
> > +             LASSERT(cl_page_state_get(pg) == CPS_CACHED);
> >               /* This actually clears the dirty bit in the radix tree. */
> >               set_page_writeback(vmpage);
> >               CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
> > -     } else if (pg->cp_state == CPS_PAGEOUT) {
> > +     } else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
> >               /* is it possible for osc_flush_async_page() to already
> >                * make it ready?
> >                */
> >               result = -EALREADY;
> >       } else {
> >               CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state
> %d.\n",
> > -                           pg->cp_state);
> > +                           cl_page_state_get(pg));
> >               LBUG();
> >       }
> >       unlock_page(vmpage);
> > diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
> > index 349f19e014e0..da4429b82932 100644
> > --- a/fs/lustre/obdclass/cl_page.c
> > +++ b/fs/lustre/obdclass/cl_page.c
> > @@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env,
> struct cl_page *page)
> >
> >       PASSERT(env, page, list_empty(&page->cp_batch));
> >       PASSERT(env, page, !page->cp_owner);
> > -     PASSERT(env, page, page->cp_state == CPS_FREEING);
> > +     PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
> >
> >       while ((slice = list_first_entry_or_null(&page->cp_layers,
> >                                                struct cl_page_slice,
> > @@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env,
> struct cl_page *page)
> >  static inline void cl_page_state_set_trust(struct cl_page *page,
> >                                          enum cl_page_state state)
> >  {
> > -     /* bypass const. */
> > -     *(enum cl_page_state *)&page->cp_state = state;
> > +     /*
> > +      * Paired with smp_load_acquire in cl_page_state_get
> > +      * and ensures that we see the most recent value of cp_state
> > +      * is available even when the next access is not performed on the
> > +      * current processor.
> > +      * Note we also cast away const as the only modifier of cp_state.
> > +      */
> > +     smp_store_release((enum cl_page_state *)&page->cp_state, state);
> >  }
> >
> >  struct cl_page *cl_page_alloc(const struct lu_env *env,
> > @@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct
> lu_env *env,
> >               }
> >       };
> >
> > -     old = page->cp_state;
> > +     old = cl_page_state_get(page);
> >       PASSERT(env, page, allowed_transitions[old][state]);
> >       CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
> > -     PASSERT(env, page, page->cp_state == old);
> > +     PASSERT(env, page, cl_page_state_get(page) == old);
> >       PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
> >       cl_page_state_set_trust(page, state);
> >  }
> > @@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct
> cl_page *page)
> >                      refcount_read(&page->cp_ref));
> >
> >       if (refcount_dec_and_test(&page->cp_ref)) {
> > -             LASSERT(page->cp_state == CPS_FREEING);
> > +             LASSERT(cl_page_state_get(page) == CPS_FREEING);
> >
> >               LASSERT(refcount_read(&page->cp_ref) == 0);
> >               PASSERT(env, page, !page->cp_owner);
> > @@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,
> >       const struct cl_page_slice *slice;
> >       enum cl_page_state state;
> >
> > -     state = pg->cp_state;
> > +     state = cl_page_state_get(pg);
> >       cl_page_owner_clear(pg);
> >
> >       if (state == CPS_OWNED)
> > @@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const
> struct cl_io *io)
> >       struct cl_io *top = cl_io_top((struct cl_io *)io);
> >
> >       LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
> > -     return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
> > +     return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
> >  }
> >  EXPORT_SYMBOL(cl_page_is_owned);
> >
> > @@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env,
> struct cl_io *io,
> >
> >       io = cl_io_top(io);
> >
> > -     if (pg->cp_state == CPS_FREEING) {
> > +     if (cl_page_state_get(pg) == CPS_FREEING) {
> >               result = -ENOENT;
> >               goto out;
> >       }
> > @@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env,
> struct cl_io *io,
> >               PASSERT(env, pg, !pg->cp_owner);
> >               pg->cp_owner = cl_io_top(io);
> >               cl_page_owner_set(pg);
> > -             if (pg->cp_state != CPS_FREEING) {
> > +             if (cl_page_state_get(pg) != CPS_FREEING) {
> >                       cl_page_state_set(env, pg, CPS_OWNED);
> >               } else {
> >                       __cl_page_disown(env, io, pg);
> > @@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env
> *env, struct cl_page *pg)
> >  {
> >       const struct cl_page_slice *slice;
> >
> > -     PASSERT(env, pg, pg->cp_state != CPS_FREEING);
> > +     PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
> >
> >       /*
> >        * Sever all ways to obtain new pointers to @pg.
> > @@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,
> >       const struct cl_page_slice *slice;
> >
> >       PASSERT(env, pg, crt < CRT_NR);
> > -     PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
> > +     PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
> >
> >       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
> >
> > @@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env,
> struct cl_page *pg,
> >       }
> >
> >       if (result >= 0) {
> > -             PASSERT(env, pg, pg->cp_state == CPS_CACHED);
> > +             PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
> >               cl_page_io_start(env, pg, crt);
> >               result = 0;
> >       }
> > @@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env,
> void *cookie,
> >       (*printer)(env, cookie,
> >                  "page@%p[%d %p %d %d %p]\n",
> >                  pg, refcount_read(&pg->cp_ref), pg->cp_obj,
> > -                pg->cp_state, pg->cp_type,
> > +                cl_page_state_get(pg), pg->cp_type,
> >                  pg->cp_owner);
> >  }
> >  EXPORT_SYMBOL(cl_page_header_print);
> > diff --git a/fs/lustre/obdecho/echo_client.c
> b/fs/lustre/obdecho/echo_client.c
> > index 317123fd27cb..d879f109e641 100644
> > --- a/fs/lustre/obdecho/echo_client.c
> > +++ b/fs/lustre/obdecho/echo_client.c
> > @@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object
> *eco, int rw, u64 offset,
> >
> >               rc = cl_page_own(env, io, clp);
> >               if (rc) {
> > -                     LASSERT(clp->cp_state == CPS_FREEING);
> > +                     LASSERT(cl_page_state_get(clp) == CPS_FREEING);
> >                       cl_page_put(env, clp);
> >                       break;
> >               }
> > diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
> > index f8fddbfe6a7e..75984b98b229 100644
> > --- a/fs/lustre/osc/osc_cache.c
> > +++ b/fs/lustre/osc/osc_cache.c
> > @@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent
> *ext, pgoff_t trunc_index,
> >                       cl_page_discard(env, io, page);
> >                       cl_page_disown(env, io, page);
> >               } else {
> > -                     LASSERT(page->cp_state == CPS_FREEING);
> > +                     LASSERT(cl_page_state_get(page) == CPS_FREEING);
> >                       LASSERT(0);
> >               }
> >
> > @@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env
> *env, struct osc_async_page *oap,
> >       int srvlock;
> >
> >       cmd &= ~OBD_BRW_NOQUOTA;
> > -     LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
> > -              "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> > -     LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
> > -              "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> > +     LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
> > +                   cmd == OBD_BRW_READ),
> > +              "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> > +     LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
> > +                   cmd == OBD_BRW_WRITE),
> > +              "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> >       LASSERT(opg->ops_transfer_pinned);
> >
> >       crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
> > @@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env
> *env, struct cl_io *io,
> >
> >                       page = ops->ops_cl.cpl_page;
> >                       LASSERT(page->cp_type == CPT_CACHEABLE);
> > -                     if (page->cp_state == CPS_FREEING)
> > +                     if (cl_page_state_get(page) == CPS_FREEING)
> >                               continue;
> >
> >                       cl_page_get(page);
> > @@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct
> lu_env *env, struct cl_io *io,
> >                       cl_page_discard(env, io, page);
> >                       cl_page_disown(env, io, page);
> >               } else {
> > -                     LASSERT(page->cp_state == CPS_FREEING);
> > +                     LASSERT(cl_page_state_get(page) == CPS_FREEING);
> >               }
> >       }
> >
> > @@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env,
> struct cl_io *io,
> >               cl_page_discard(env, io, page);
> >               cl_page_disown(env, io, page);
> >       } else {
> > -             LASSERT(page->cp_state == CPS_FREEING);
> > +             LASSERT(cl_page_state_get(page) == CPS_FREEING);
> >       }
> >
> >       return true;
> > --
> > 2.17.1
>
<div dir="ltr"><div dir="ltr">On Thu, Jun 6, 2019 at 7:57 PM NeilBrown &lt;<a href="mailto:neilb@suse.com">neilb@suse.com</a>&gt; wrote:<br>&gt;<br>&gt; On Thu, Jun 06 2019, Shaun Tancheff wrote:<br>&gt;<br>&gt; &gt; Ensure all uses of cl_page-&gt;cp_state are smp-safe.<br>&gt; &gt;<br>&gt; &gt; LBUG Output:<br>&gt; &gt; In __cl_page_state_set()<br>&gt; &gt; ..<br>&gt; &gt;   old = page-&gt;cp_state;<br>&gt; &gt;   PASSERT(env, page, allowed_transitions[old][state]);<br>&gt; &gt; ..<br>&gt; &gt; Asserted with the following:<br>&gt; &gt;   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]<br>&gt;<br>&gt; What do all these fields mean?  Or where is this printed?<br>&gt; The closest I can find is in cl_page_header_print() which uses the<br>&gt; format string:<br>&gt;<br>&gt;                    &quot;page@%p[%d %p %d %d %p]\n&quot;,<br>&gt;<br>&gt; But that only has 5 fields in the [], while the output you provided has<br>&gt; 6.<br>&gt;<br><br>Sorry for the confusion, here is my (long) explanation with source snippets:<br><br>The LBUG in full<br><br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) vvp-page@ffff80be1fcd6650(0:0) vm@ffff7e02fa0e60c0 5000000000001029 4:0 ffff80be1fcd6600 1049548 lru<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) lov-page@ffff80be1fcd6690, comp index: 0, gen: 0<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) osc-page@ffff80be1fcd66c8 1049548: 1&lt; 0x845fed 2 0 - - &gt; 2&lt; 4298948608 0 4096 0x0 0x420 |           (null) ffff809e473708e8 ffff80be199c1e40 &gt; 3&lt; 0 0 0 &gt; 4&lt; 5 13 64 0 + | - - - - &gt; 5&lt; - - - - | 0 - | 0 - -&gt;<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) end page@ffff80be1fcd6600<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) allowed_transitions[old][state]<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) ASSERTION( 0 ) failed:<br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) LBUG<br><br>The source snippet around L342<br>...<br>   339 <br>   340          ENTRY;<br>   341          old = page-&gt;cp_state;<br>   342          PASSERT(env, page, allowed_transitions[old][state]);<br>   343          CL_PAGE_HEADER(D_TRACE, env, page, &quot;%d -&gt; %d\n&quot;, old, state);<br>   344          PASSERT(env, page, page-&gt;cp_state == old);<br>   345          PASSERT(env, page, equi(state == CPS_OWNED, page-&gt;cp_owner != NULL));<br>   346 <br>   347          cs_pagestate_dec(page-&gt;cp_obj, page-&gt;cp_state);<br>   348          cs_pagestate_inc(page-&gt;cp_obj, state);<br><br>The PASSERT macro ...<br>...<br># define PASSERT(env, page, expr)                                       \<br>  do {                                                                    \<br>          if (unlikely(!(expr))) {                                      \<br>                  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr &quot;\n&quot;);    \<br>                  LASSERT(0);                                           \<br>          }                                                             \<br>  } while (0)<br>#else /* !LIBCFS_DEBUG */<br>...<br><br>/**<br> * Helper macro, dumping detailed information about \a page into a log.<br> */<br>#define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \<br>do {                                                                    \<br>        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \<br>                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);        \<br>                cl_page_print(env, &amp;msgdata, lu_cdebug_printer, page);  \<br>                CDEBUG(mask, format , ## __VA_ARGS__);                  \<br>        }                                                               \<br>} while (0)<br><br>Almost there<br>...<br>  1062  /**<br>  1063   * Prints human readable representation of \a pg to the \a f.<br>  1064   */<br>  1065  void cl_page_print(const struct lu_env *env, void *cookie,<br>  1066                     lu_printer_t printer, const struct cl_page *pg)<br>  1067  {<br>  1068          const struct cl_page_slice *slice;<br>  1069          int result = 0;<br>  1070 <br>  1071          cl_page_header_print(env, cookie, printer, pg);<br>  1072          list_for_each_entry(slice, &amp;pg-&gt;cp_layers, cpl_linkage) {<br>  1073                  if (slice-&gt;cpl_ops-&gt;cpo_print != NULL)<br>  1074                          result = (*slice-&gt;cpl_ops-&gt;cpo_print)(env, slice,<br>  1075                                                               cookie, printer);<br><br>Finally the &#39;good stuff&#39;<br>...<br>  1051  void cl_page_header_print(const struct lu_env *env, void *cookie,<br>  1052                            lu_printer_t printer, const struct cl_page *pg)<br>  1053  {<br>  1054          (*printer)(env, cookie,<br>  1055                     &quot;page@%p[%d %p %d %d %p]\n&quot;,<br>  1056                     pg, atomic_read(&amp;pg-&gt;cp_ref), pg-&gt;cp_obj,<br>  1057                     pg-&gt;cp_state, pg-&gt;cp_type,<br>  1058                     pg-&gt;cp_owner);<br>  1059  }<br>...<br><br>Given the above along with the first line from the LBUG ....<div><br>LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]<br><br>Implies the following:</div><div><br>pg == ffff80be1fcd6600<br>pg-&gt;cp_ref == 3<br>pg-&gt;cp_obj == ffff80be1fca5cc0<br>pg-&gt;cp_state == 0 (state == CPS_CACHED)<br>pg-&gt;cp_type == 1<br>pg-&gt;cp_owner == ffff809e60e28cc0<br><br>Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the backtrace)<br>So we know the on-stack / register value of state == CPS_OWNED</div><div><br></div><div>The backtrace:<br>...<br> dump_backtrace+0x0/0x248<br> show_stack+0x24/0x30<br> dump_stack+0xbc/0xf4<br> libcfs_call_trace+0xec/0x120 [libcfs]<br> lbug_with_loc+0x4c/0xb8 [libcfs]<br> cl_page_state_set0+0x2b4/0x6e0 [obdclass]<br> cl_page_assume+0xdc/0x3e0 [obdclass]<br> ll_io_read_page+0x144c/0x1de0 [lustre]<br> ll_write_begin+0x3d4/0xee8 [lustre]<br><br></div><div>and the contents of cl_page_assume</div><div><br>   596  void cl_page_assume(const struct lu_env *env,<br>   597                      struct cl_io *io, struct cl_page *pg)<br>   598  {<br>   599          const struct cl_page_slice *slice;<br>   600 <br>   601          PINVRNT(env, pg, cl_object_same(pg-&gt;cp_obj, io-&gt;ci_obj));<br>   602 <br>   603          ENTRY;<br>   604          io = cl_io_top(io);<br>   605 <br>   606          list_for_each_entry(slice, &amp;pg-&gt;cp_layers, cpl_linkage) {<br>   607                  if (slice-&gt;cpl_ops-&gt;cpo_assume != NULL)<br>   608                          (*slice-&gt;cpl_ops-&gt;cpo_assume)(env, slice, io);<br>   609          }<br>   610 <br>   611          PASSERT(env, pg, pg-&gt;cp_owner == NULL);<br>   612          pg-&gt;cp_owner = cl_io_top(io);<br>   613          cl_page_owner_set(pg);<br>   614          cl_page_state_set(env, pg, CPS_OWNED);<br>   615          EXIT;<br>   616  }<br><br></div><div>While I believe my conclusion to be correct I am certainly open to being dead wrong.</div><div><br></div></div><br><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Jun 6, 2019 at 7:57 PM NeilBrown &lt;<a href="mailto:neilb@suse.com">neilb@suse.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">On Thu, Jun 06 2019, Shaun Tancheff wrote:<br>
<br>
&gt; Ensure all uses of cl_page-&gt;cp_state are smp-safe.<br>
&gt;<br>
&gt; LBUG Output:<br>
&gt; In __cl_page_state_set()<br>
&gt; ..<br>
&gt;   old = page-&gt;cp_state;<br>
&gt;   PASSERT(env, page, allowed_transitions[old][state]);<br>
&gt; ..<br>
&gt; Asserted with the following:<br>
&gt;   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]<br>
<br>
What do all these fields mean?  Or where is this printed?<br>
The closest I can find is in cl_page_header_print() which uses the<br>
format string:<br>
<br>
                   &quot;page@%p[%d %p %d %d %p]\n&quot;,<br>
<br>
But that only has 5 fields in the [], while the output you provided has<br>
6.<br>
<br>
&gt;<br>
&gt; However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition<br>
&gt; leading to the conclusion that cp_state became 0 during the<br>
&gt; assertion.<br>
<br>
Where is the evidence that this was the transition that was happening?<br>
If it was in some part of the LBUG output that wasn&#39;t quoted - then<br>
please fix that by quoting the entire LBUG output.<br>
<br>
We need to understand which change happened at this time to cause the<br>
race.<br>
Then we need to explain why the added barriers actually close the race<br>
window.<br>
<br>
If these barrier do actually fix a race, then there is something *very*<br>
wrong.<br>
The comments say that changes to cp_state are protected by the page lock<br>
on the corresponding VM page.<br>
Locking and unlocking a VM page entails sufficient barriers that changes<br>
made while one thread holds the lock will be visible to another<br>
thread once it also gets the lock.<br>
<br></blockquote><div><br></div><div>True, but a page lock is just a bit on the page flags. And we absolutely wait on I/O while the </div><div>page is &#39;locked&#39; so a ping-pong (our task *can* sleep) is entirely reasonable, as far as</div><div>I understand anyway. Since there is multiple state-changes during a single page lock/unlock</div><div>there is a race, however rare and unlikely, and it&#39;s arch specific based on what I understand</div><div>of the memory barrier rules.</div><div><br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
So the only possible explanation for a race as you suspect, is if the<br>
vmpage *isn&#39;t* locked when something changes cp_state, and that would be<br>
bad.<br>
<br>
Thanks,<br>
NeilBrown<br>
<br>
<br>
&gt;<br>
&gt; Signed-off-by: Shaun Tancheff &lt;<a href="mailto:stancheff@cray.com" target="_blank">stancheff@cray.com</a>&gt;<br>
&gt; ---<br>
&gt;  fs/lustre/include/cl_object.h   | 11 +++++++++++<br>
&gt;  fs/lustre/llite/rw26.c          |  2 +-<br>
&gt;  fs/lustre/llite/vvp_page.c      |  6 +++---<br>
&gt;  fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------<br>
&gt;  fs/lustre/obdecho/echo_client.c |  2 +-<br>
&gt;  fs/lustre/osc/osc_cache.c       | 18 +++++++++--------<br>
&gt;  6 files changed, 46 insertions(+), 27 deletions(-)<br>
&gt;<br>
&gt; diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h<br>
&gt; index 691c2f5da53a..d6e1f6f05f50 100644<br>
&gt; --- a/fs/lustre/include/cl_object.h<br>
&gt; +++ b/fs/lustre/include/cl_object.h<br>
&gt; @@ -752,6 +752,17 @@ struct cl_page {<br>
&gt;       struct cl_sync_io               *cp_sync_io;<br>
&gt;  };<br>
&gt;  <br>
&gt; +static inline enum cl_page_state cl_page_state_get(const struct cl_page *pg)<br>
&gt; +{<br>
&gt; +     /*<br>
&gt; +      * Paired with smp_store_release in cl_page_state_set_trust<br>
&gt; +      * and ensures that we see the most recent value of cp_state<br>
&gt; +      * even when the last modification was not performed on the<br>
&gt; +      * current processor<br>
&gt; +      */<br>
&gt; +     return smp_load_acquire(&amp;pg-&gt;cp_state);<br>
&gt; +}<br>
&gt; +<br>
&gt;  /**<br>
&gt;   * Per-layer part of cl_page.<br>
&gt;   *<br>
&gt; diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c<br>
&gt; index e4ce3b6f5772..364dec208ccd 100644<br>
&gt; --- a/fs/lustre/llite/rw26.c<br>
&gt; +++ b/fs/lustre/llite/rw26.c<br>
&gt; @@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io,<br>
&gt;  <br>
&gt;               rc = cl_page_own(env, io, clp);<br>
&gt;               if (rc) {<br>
&gt; -                     LASSERT(clp-&gt;cp_state == CPS_FREEING);<br>
&gt; +                     LASSERT(cl_page_state_get(clp) == CPS_FREEING);<br>
&gt;                       cl_page_put(env, clp);<br>
&gt;                       break;<br>
&gt;               }<br>
&gt; diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c<br>
&gt; index 590e5f5e43c9..38b8c488d765 100644<br>
&gt; --- a/fs/lustre/llite/vvp_page.c<br>
&gt; +++ b/fs/lustre/llite/vvp_page.c<br>
&gt; @@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env *env,<br>
&gt;  <br>
&gt;       lock_page(vmpage);<br>
&gt;       if (clear_page_dirty_for_io(vmpage)) {<br>
&gt; -             LASSERT(pg-&gt;cp_state == CPS_CACHED);<br>
&gt; +             LASSERT(cl_page_state_get(pg) == CPS_CACHED);<br>
&gt;               /* This actually clears the dirty bit in the radix tree. */<br>
&gt;               set_page_writeback(vmpage);<br>
&gt;               CL_PAGE_HEADER(D_PAGE, env, pg, &quot;readied\n&quot;);<br>
&gt; -     } else if (pg-&gt;cp_state == CPS_PAGEOUT) {<br>
&gt; +     } else if (cl_page_state_get(pg) == CPS_PAGEOUT) {<br>
&gt;               /* is it possible for osc_flush_async_page() to already<br>
&gt;                * make it ready?<br>
&gt;                */<br>
&gt;               result = -EALREADY;<br>
&gt;       } else {<br>
&gt;               CL_PAGE_DEBUG(D_ERROR, env, pg, &quot;Unexpecting page state %d.\n&quot;,<br>
&gt; -                           pg-&gt;cp_state);<br>
&gt; +                           cl_page_state_get(pg));<br>
&gt;               LBUG();<br>
&gt;       }<br>
&gt;       unlock_page(vmpage);<br>
&gt; diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c<br>
&gt; index 349f19e014e0..da4429b82932 100644<br>
&gt; --- a/fs/lustre/obdclass/cl_page.c<br>
&gt; +++ b/fs/lustre/obdclass/cl_page.c<br>
&gt; @@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)<br>
&gt;  <br>
&gt;       PASSERT(env, page, list_empty(&amp;page-&gt;cp_batch));<br>
&gt;       PASSERT(env, page, !page-&gt;cp_owner);<br>
&gt; -     PASSERT(env, page, page-&gt;cp_state == CPS_FREEING);<br>
&gt; +     PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);<br>
&gt;  <br>
&gt;       while ((slice = list_first_entry_or_null(&amp;page-&gt;cp_layers,<br>
&gt;                                                struct cl_page_slice,<br>
&gt; @@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)<br>
&gt;  static inline void cl_page_state_set_trust(struct cl_page *page,<br>
&gt;                                          enum cl_page_state state)<br>
&gt;  {<br>
&gt; -     /* bypass const. */<br>
&gt; -     *(enum cl_page_state *)&amp;page-&gt;cp_state = state;<br>
&gt; +     /*<br>
&gt; +      * Paired with smp_load_acquire in cl_page_state_get<br>
&gt; +      * and ensures that we see the most recent value of cp_state<br>
&gt; +      * is available even when the next access is not performed on the<br>
&gt; +      * current processor.<br>
&gt; +      * Note we also cast away const as the only modifier of cp_state.<br>
&gt; +      */<br>
&gt; +     smp_store_release((enum cl_page_state *)&amp;page-&gt;cp_state, state);<br>
&gt;  }<br>
&gt;  <br>
&gt;  struct cl_page *cl_page_alloc(const struct lu_env *env,<br>
&gt; @@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct lu_env *env,<br>
&gt;               }<br>
&gt;       };<br>
&gt;  <br>
&gt; -     old = page-&gt;cp_state;<br>
&gt; +     old = cl_page_state_get(page);<br>
&gt;       PASSERT(env, page, allowed_transitions[old][state]);<br>
&gt;       CL_PAGE_HEADER(D_TRACE, env, page, &quot;%d -&gt; %d\n&quot;, old, state);<br>
&gt; -     PASSERT(env, page, page-&gt;cp_state == old);<br>
&gt; +     PASSERT(env, page, cl_page_state_get(page) == old);<br>
&gt;       PASSERT(env, page, equi(state == CPS_OWNED, page-&gt;cp_owner));<br>
&gt;       cl_page_state_set_trust(page, state);<br>
&gt;  }<br>
&gt; @@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)<br>
&gt;                      refcount_read(&amp;page-&gt;cp_ref));<br>
&gt;  <br>
&gt;       if (refcount_dec_and_test(&amp;page-&gt;cp_ref)) {<br>
&gt; -             LASSERT(page-&gt;cp_state == CPS_FREEING);<br>
&gt; +             LASSERT(cl_page_state_get(page) == CPS_FREEING);<br>
&gt;  <br>
&gt;               LASSERT(refcount_read(&amp;page-&gt;cp_ref) == 0);<br>
&gt;               PASSERT(env, page, !page-&gt;cp_owner);<br>
&gt; @@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,<br>
&gt;       const struct cl_page_slice *slice;<br>
&gt;       enum cl_page_state state;<br>
&gt;  <br>
&gt; -     state = pg-&gt;cp_state;<br>
&gt; +     state = cl_page_state_get(pg);<br>
&gt;       cl_page_owner_clear(pg);<br>
&gt;  <br>
&gt;       if (state == CPS_OWNED)<br>
&gt; @@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)<br>
&gt;       struct cl_io *top = cl_io_top((struct cl_io *)io);<br>
&gt;  <br>
&gt;       LINVRNT(cl_object_same(pg-&gt;cp_obj, io-&gt;ci_obj));<br>
&gt; -     return pg-&gt;cp_state == CPS_OWNED &amp;&amp; pg-&gt;cp_owner == top;<br>
&gt; +     return cl_page_state_get(pg) == CPS_OWNED &amp;&amp; pg-&gt;cp_owner == top;<br>
&gt;  }<br>
&gt;  EXPORT_SYMBOL(cl_page_is_owned);<br>
&gt;  <br>
&gt; @@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,<br>
&gt;  <br>
&gt;       io = cl_io_top(io);<br>
&gt;  <br>
&gt; -     if (pg-&gt;cp_state == CPS_FREEING) {<br>
&gt; +     if (cl_page_state_get(pg) == CPS_FREEING) {<br>
&gt;               result = -ENOENT;<br>
&gt;               goto out;<br>
&gt;       }<br>
&gt; @@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,<br>
&gt;               PASSERT(env, pg, !pg-&gt;cp_owner);<br>
&gt;               pg-&gt;cp_owner = cl_io_top(io);<br>
&gt;               cl_page_owner_set(pg);<br>
&gt; -             if (pg-&gt;cp_state != CPS_FREEING) {<br>
&gt; +             if (cl_page_state_get(pg) != CPS_FREEING) {<br>
&gt;                       cl_page_state_set(env, pg, CPS_OWNED);<br>
&gt;               } else {<br>
&gt;                       __cl_page_disown(env, io, pg);<br>
&gt; @@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg)<br>
&gt;  {<br>
&gt;       const struct cl_page_slice *slice;<br>
&gt;  <br>
&gt; -     PASSERT(env, pg, pg-&gt;cp_state != CPS_FREEING);<br>
&gt; +     PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);<br>
&gt;  <br>
&gt;       /*<br>
&gt;        * Sever all ways to obtain new pointers to @pg.<br>
&gt; @@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,<br>
&gt;       const struct cl_page_slice *slice;<br>
&gt;  <br>
&gt;       PASSERT(env, pg, crt &lt; CRT_NR);<br>
&gt; -     PASSERT(env, pg, pg-&gt;cp_state == cl_req_type_state(crt));<br>
&gt; +     PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));<br>
&gt;  <br>
&gt;       CL_PAGE_HEADER(D_TRACE, env, pg, &quot;%d %d\n&quot;, crt, ioret);<br>
&gt;  <br>
&gt; @@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,<br>
&gt;       }<br>
&gt;  <br>
&gt;       if (result &gt;= 0) {<br>
&gt; -             PASSERT(env, pg, pg-&gt;cp_state == CPS_CACHED);<br>
&gt; +             PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);<br>
&gt;               cl_page_io_start(env, pg, crt);<br>
&gt;               result = 0;<br>
&gt;       }<br>
&gt; @@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env, void *cookie,<br>
&gt;       (*printer)(env, cookie,<br>
&gt;                  &quot;page@%p[%d %p %d %d %p]\n&quot;,<br>
&gt;                  pg, refcount_read(&amp;pg-&gt;cp_ref), pg-&gt;cp_obj,<br>
&gt; -                pg-&gt;cp_state, pg-&gt;cp_type,<br>
&gt; +                cl_page_state_get(pg), pg-&gt;cp_type,<br>
&gt;                  pg-&gt;cp_owner);<br>
&gt;  }<br>
&gt;  EXPORT_SYMBOL(cl_page_header_print);<br>
&gt; diff --git a/fs/lustre/obdecho/echo_client.c b/fs/lustre/obdecho/echo_client.c<br>
&gt; index 317123fd27cb..d879f109e641 100644<br>
&gt; --- a/fs/lustre/obdecho/echo_client.c<br>
&gt; +++ b/fs/lustre/obdecho/echo_client.c<br>
&gt; @@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,<br>
&gt;  <br>
&gt;               rc = cl_page_own(env, io, clp);<br>
&gt;               if (rc) {<br>
&gt; -                     LASSERT(clp-&gt;cp_state == CPS_FREEING);<br>
&gt; +                     LASSERT(cl_page_state_get(clp) == CPS_FREEING);<br>
&gt;                       cl_page_put(env, clp);<br>
&gt;                       break;<br>
&gt;               }<br>
&gt; diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c<br>
&gt; index f8fddbfe6a7e..75984b98b229 100644<br>
&gt; --- a/fs/lustre/osc/osc_cache.c<br>
&gt; +++ b/fs/lustre/osc/osc_cache.c<br>
&gt; @@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,<br>
&gt;                       cl_page_discard(env, io, page);<br>
&gt;                       cl_page_disown(env, io, page);<br>
&gt;               } else {<br>
&gt; -                     LASSERT(page-&gt;cp_state == CPS_FREEING);<br>
&gt; +                     LASSERT(cl_page_state_get(page) == CPS_FREEING);<br>
&gt;                       LASSERT(0);<br>
&gt;               }<br>
&gt;  <br>
&gt; @@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,<br>
&gt;       int srvlock;<br>
&gt;  <br>
&gt;       cmd &amp;= ~OBD_BRW_NOQUOTA;<br>
&gt; -     LASSERTF(equi(page-&gt;cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),<br>
&gt; -              &quot;cp_state:%u, cmd:%d\n&quot;, page-&gt;cp_state, cmd);<br>
&gt; -     LASSERTF(equi(page-&gt;cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),<br>
&gt; -              &quot;cp_state:%u, cmd:%d\n&quot;, page-&gt;cp_state, cmd);<br>
&gt; +     LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,<br>
&gt; +                   cmd == OBD_BRW_READ),<br>
&gt; +              &quot;cp_state:%u, cmd:%d\n&quot;, cl_page_state_get(page), cmd);<br>
&gt; +     LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,<br>
&gt; +                   cmd == OBD_BRW_WRITE),<br>
&gt; +              &quot;cp_state:%u, cmd:%d\n&quot;, cl_page_state_get(page), cmd);<br>
&gt;       LASSERT(opg-&gt;ops_transfer_pinned);<br>
&gt;  <br>
&gt;       crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;<br>
&gt; @@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,<br>
&gt;  <br>
&gt;                       page = ops-&gt;ops_cl.cpl_page;<br>
&gt;                       LASSERT(page-&gt;cp_type == CPT_CACHEABLE);<br>
&gt; -                     if (page-&gt;cp_state == CPS_FREEING)<br>
&gt; +                     if (cl_page_state_get(page) == CPS_FREEING)<br>
&gt;                               continue;<br>
&gt;  <br>
&gt;                       cl_page_get(page);<br>
&gt; @@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io,<br>
&gt;                       cl_page_discard(env, io, page);<br>
&gt;                       cl_page_disown(env, io, page);<br>
&gt;               } else {<br>
&gt; -                     LASSERT(page-&gt;cp_state == CPS_FREEING);<br>
&gt; +                     LASSERT(cl_page_state_get(page) == CPS_FREEING);<br>
&gt;               }<br>
&gt;       }<br>
&gt;  <br>
&gt; @@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env, struct cl_io *io,<br>
&gt;               cl_page_discard(env, io, page);<br>
&gt;               cl_page_disown(env, io, page);<br>
&gt;       } else {<br>
&gt; -             LASSERT(page-&gt;cp_state == CPS_FREEING);<br>
&gt; +             LASSERT(cl_page_state_get(page) == CPS_FREEING);<br>
&gt;       }<br>
&gt;  <br>
&gt;       return true;<br>
&gt; -- <br>
&gt; 2.17.1<br>
</blockquote></div></div>
NeilBrown June 11, 2019, 12:49 a.m. UTC | #3
On Thu, Jun 06 2019, Shaun Tancheff wrote:

> On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
>>
>> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>>
>> > Ensure all uses of cl_page->cp_state are smp-safe.
>> >
>> > LBUG Output:
>> > In __cl_page_state_set()
>> > ..
>> >   old = page->cp_state;
>> >   PASSERT(env, page, allowed_transitions[old][state]);
>> > ..
>> > Asserted with the following:
>> >   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]

I now see that this space.....                                 ^^^
shouldn't be there..
.....

> Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
> backtrace)
> So we know the on-stack / register value of state == CPS_OWNED
>
> The backtrace:
> ...
>  dump_backtrace+0x0/0x248
>  show_stack+0x24/0x30
>  dump_stack+0xbc/0xf4
>  libcfs_call_trace+0xec/0x120 [libcfs]
>  lbug_with_loc+0x4c/0xb8 [libcfs]
>  cl_page_state_set0+0x2b4/0x6e0 [obdclass]
>  cl_page_assume+0xdc/0x3e0 [obdclass]
>  ll_io_read_page+0x144c/0x1de0 [lustre]
>  ll_write_begin+0x3d4/0xee8 [lustre]

Thanks for this extra detail.
I agree that the evidence suggests that page->cp_state changed (to
CPS_CACHED) during the processing of the PASSERT().

It isn't clear to me though that this problem can be fixed just by
adding barriers  - particularly if this was on x86_64 as I suspect it
is.  Intel architecture processors have strongly ordered memory
semantics.  I'm not entirely sure what that means, but as
smp_load_acquire() is very nearly a no-op on x86, I doubt that the
memory barriers you added actually fix anything there.

My understanding (based on a fairly shallow reading of the code) is that
a partial-page write is being attempted, and the target page was not in
memory, so it was read in synchronously.
The fragment of ll_io_read_page happening is:

	if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
		rc = cl_sync_io_wait(env, anchor, 0);

		cl_page_assume(env, io, page);

So cl_sync_io_wait() has just run.  The page was (presumably) previously
owned by something else.  cl_sync_io_wait() waited for csi_sync_nr
to drop to zero, which should say it isn't owned any more.
cl_page_assume() then takes ownership.
But it seems that the old ownership wasn't quite gone yet.

The VM page remains locked the whole time, so the barriers implicit in
locking and unlocking a page do not some into play here.
There is an implicit barrier on the 'store' side when cl_sync_io_note()
calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see
any implicit barrier on the 'load' side.  If l_wait_event() in
cl_sync_io_wait() took a spinlock, that would be enough.  But it is
possible for the wait to complete without any locking.

So I can just about see the possibility of the race that you think you
have hit.  But it couldn't happen on x86, as it has strong memory
ordering.

What CPU architecture did this ASSERT trigger on?

Thanks,
NeilBrown
Shaun Tancheff June 11, 2019, 4:05 a.m. UTC | #4
On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:

> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
> >>
> >> On Thu, Jun 06 2019, Shaun Tancheff wrote:
> >>
> >> > Ensure all uses of cl_page->cp_state are smp-safe.
> >> >
> >> > LBUG Output:
> >> > In __cl_page_state_set()
> >> > ..
> >> >   old = page->cp_state;
> >> >   PASSERT(env, page, allowed_transitions[old][state]);
> >> > ..
> >> > Asserted with the following:
> >> >   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> I now see that this space.....                                 ^^^
> shouldn't be there..
> .....
>
> > Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
> > backtrace)
> > So we know the on-stack / register value of state == CPS_OWNED
> >
> > The backtrace:
> > ...
> >  dump_backtrace+0x0/0x248
> >  show_stack+0x24/0x30
> >  dump_stack+0xbc/0xf4
> >  libcfs_call_trace+0xec/0x120 [libcfs]
> >  lbug_with_loc+0x4c/0xb8 [libcfs]
> >  cl_page_state_set0+0x2b4/0x6e0 [obdclass]
> >  cl_page_assume+0xdc/0x3e0 [obdclass]
> >  ll_io_read_page+0x144c/0x1de0 [lustre]
> >  ll_write_begin+0x3d4/0xee8 [lustre]
>
> Thanks for this extra detail.
> I agree that the evidence suggests that page->cp_state changed (to
> CPS_CACHED) during the processing of the PASSERT().
>
> It isn't clear to me though that this problem can be fixed just by
> adding barriers  - particularly if this was on x86_64 as I suspect it
> is.  Intel architecture processors have strongly ordered memory
> semantics.  I'm not entirely sure what that means, but as
> smp_load_acquire() is very nearly a no-op on x86, I doubt that the
> memory barriers you added actually fix anything there.
>

Agree. This occurred on aarch64. I suspect one of the reasons that this has
not
been seen is that the majority of users are x86_64. It appears to be very
rare on
aarch64 but it could also, theoretically, happen on the other relaxed
architectures.


> My understanding (based on a fairly shallow reading of the code) is that
> a partial-page write is being attempted, and the target page was not in
> memory, so it was read in synchronously.
> The fragment of ll_io_read_page happening is:
>
>         if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent
> */
>                 rc = cl_sync_io_wait(env, anchor, 0);
>
>                 cl_page_assume(env, io, page);
>
> So cl_sync_io_wait() has just run.  The page was (presumably) previously
> owned by something else.  cl_sync_io_wait() waited for csi_sync_nr
> to drop to zero, which should say it isn't owned any more.
> cl_page_assume() then takes ownership.
> But it seems that the old ownership wasn't quite gone yet.
>
> The VM page remains locked the whole time, so the barriers implicit in
> locking and unlocking a page do not some into play here.
> There is an implicit barrier on the 'store' side when cl_sync_io_note()
> calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see
> any implicit barrier on the 'load' side.  If l_wait_event() in
> cl_sync_io_wait() took a spinlock, that would be enough.  But it is
> possible for the wait to complete without any locking.
>
> So I can just about see the possibility of the race that you think you
> have hit.  But it couldn't happen on x86, as it has strong memory
> ordering.
>
> What CPU architecture did this ASSERT trigger on?
>

Agree with everything above, unfortunately was aarch64.


> Thanks,
> NeilBrown
>
<div dir="ltr"><div dir="ltr">On Mon, Jun 10, 2019 at 7:49 PM NeilBrown &lt;<a href="mailto:neilb@suse.com">neilb@suse.com</a>&gt; wrote:<br></div><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">On Thu, Jun 06 2019, Shaun Tancheff wrote:<br>
<br>
&gt; On Thu, Jun 6, 2019 at 7:57 PM NeilBrown &lt;<a href="mailto:neilb@suse.com" target="_blank">neilb@suse.com</a>&gt; wrote:<br>
&gt;&gt;<br>
&gt;&gt; On Thu, Jun 06 2019, Shaun Tancheff wrote:<br>
&gt;&gt;<br>
&gt;&gt; &gt; Ensure all uses of cl_page-&gt;cp_state are smp-safe.<br>
&gt;&gt; &gt;<br>
&gt;&gt; &gt; LBUG Output:<br>
&gt;&gt; &gt; In __cl_page_state_set()<br>
&gt;&gt; &gt; ..<br>
&gt;&gt; &gt;   old = page-&gt;cp_state;<br>
&gt;&gt; &gt;   PASSERT(env, page, allowed_transitions[old][state]);<br>
&gt;&gt; &gt; ..<br>
&gt;&gt; &gt; Asserted with the following:<br>
&gt;&gt; &gt;   page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]<br>
<br>
I now see that this space.....                                 ^^^<br>
shouldn&#39;t be there..<br>
.....<br>
<br>
&gt; Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the<br>
&gt; backtrace)<br>
&gt; So we know the on-stack / register value of state == CPS_OWNED<br>
&gt;<br>
&gt; The backtrace:<br>
&gt; ...<br>
&gt;  dump_backtrace+0x0/0x248<br>
&gt;  show_stack+0x24/0x30<br>
&gt;  dump_stack+0xbc/0xf4<br>
&gt;  libcfs_call_trace+0xec/0x120 [libcfs]<br>
&gt;  lbug_with_loc+0x4c/0xb8 [libcfs]<br>
&gt;  cl_page_state_set0+0x2b4/0x6e0 [obdclass]<br>
&gt;  cl_page_assume+0xdc/0x3e0 [obdclass]<br>
&gt;  ll_io_read_page+0x144c/0x1de0 [lustre]<br>
&gt;  ll_write_begin+0x3d4/0xee8 [lustre]<br>
<br>
Thanks for this extra detail.<br>
I agree that the evidence suggests that page-&gt;cp_state changed (to<br>
CPS_CACHED) during the processing of the PASSERT().<br>
<br>
It isn&#39;t clear to me though that this problem can be fixed just by<br>
adding barriers  - particularly if this was on x86_64 as I suspect it<br>
is.  Intel architecture processors have strongly ordered memory<br>
semantics.  I&#39;m not entirely sure what that means, but as<br>
smp_load_acquire() is very nearly a no-op on x86, I doubt that the<br>
memory barriers you added actually fix anything there.<br></blockquote><div><br></div><div>Agree. This occurred on aarch64. I suspect one of the reasons that this has not</div><div>been seen is that the majority of users are x86_64. It appears to be very rare on</div><div>aarch64 but it could also, theoretically, happen on the other relaxed architectures.</div><div><br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><br>
My understanding (based on a fairly shallow reading of the code) is that<br>
a partial-page write is being attempted, and the target page was not in<br>
memory, so it was read in synchronously.<br>
The fragment of ll_io_read_page happening is:<br>
<br>
        if (anchor != NULL &amp;&amp; !cl_page_is_owned(page, io)) { /* have sent */<br>
                rc = cl_sync_io_wait(env, anchor, 0);<br>
<br>
                cl_page_assume(env, io, page);<br>
<br>
So cl_sync_io_wait() has just run.  The page was (presumably) previously<br>
owned by something else.  cl_sync_io_wait() waited for csi_sync_nr<br>
to drop to zero, which should say it isn&#39;t owned any more.<br>
cl_page_assume() then takes ownership.<br>
But it seems that the old ownership wasn&#39;t quite gone yet.<br>
<br>
The VM page remains locked the whole time, so the barriers implicit in<br>
locking and unlocking a page do not some into play here.<br>
There is an implicit barrier on the &#39;store&#39; side when cl_sync_io_note()<br>
calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see<br>
any implicit barrier on the &#39;load&#39; side.  If l_wait_event() in<br>
cl_sync_io_wait() took a spinlock, that would be enough.  But it is<br>
possible for the wait to complete without any locking.<br>
<br>
So I can just about see the possibility of the race that you think you<br>
have hit.  But it couldn&#39;t happen on x86, as it has strong memory<br>
ordering.<br>
<br>
What CPU architecture did this ASSERT trigger on?<br></blockquote><div><br></div><div>Agree with everything above, unfortunately was aarch64.</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
Thanks,<br>
NeilBrown<br></blockquote><div> </div></div></div>
NeilBrown June 12, 2019, 4:08 a.m. UTC | #5
On Mon, Jun 10 2019, Shaun Tancheff wrote:

> On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
>
>>>
>> So I can just about see the possibility of the race that you think you
>> have hit.  But it couldn't happen on x86, as it has strong memory
>> ordering.
>>
>> What CPU architecture did this ASSERT trigger on?
>>
>
> Agree with everything above, unfortunately was aarch64.

Ahh.
OK, I'm now reasonably convinced that the patch probably fixes the bug.
It is a shame that there is no standard barrier between the wake_up and
the wait_event(), but there isn't and we need to work with that.

But I still don't like the patch.... :-(

I have a few problems with it.

Firstly, I generally don't like accessor functions.  They tend to hide
useful information.
Rather than cl_page_state_get() I would rather have
	 smp_load_acquire(&pg->cp_state)

where ever it is needed, with a comment explaining why it is needed in
that particular context.
Some people seem to like accessor functions - there certainly are a lot
of them in the kernel - but I don't think they help.
Having an accessor function that just adds a barrier is, I think,
particularly ugly as the comment can only be general in nature, and so
doesn't really help the reader to understand why the barrier is needed.
But even that isn't completely without precedent.
"key_read_state()" in include/linux/key.h simply wraps
smp_load_acquire(), and the comment there doesn't really tell me
anything useful.
"inet_sk_state_load" is slightly better it says it is for use in places
where the socket lock isn't held - and names some of those.

My second problem is that has "get" in the accessor function name is
potentially misleading and "get" normally has a matching "put", and
usually increments a refcount.  Having "read" or "load" in place of
"get" in the accessor function name would remove my second objection.

My third objection is that the bug doesn't exist in the upstream client
code.  I'm not sure whether this is luck or good management, but either
way it is gone.

cl_sync_io_wait() has been changed to always take csi_waitq.lock.  This
avoids the need for anchor->csi_barrier, and provides a more robust way
to ensure that cl_sync_io_note() is completely finished before
cl_sync_io_wait() completes.

That isn't quite the outcome I was expecting...

I'm sorry if I seem to have been rather harsh on your patch - my
intention was only to make sure it was the best it could be.

It really needed to have the stack trace and the arch were the LBUG
happened to be at all complete.
I would have been nice if it identified the particular code where the
barrier was needed: between cl_sync_io_note() and cl_sync_io_wait()
because that mediates and ownership transition that happens while the VM
page is locked.

The rest are probably just issues of taste.

Thanks,
NeilBrown
Shaun Tancheff June 12, 2019, 4:36 a.m. UTC | #6
On Tue, Jun 11, 2019 at 11:09 PM NeilBrown <neilb@suse.com> wrote:

> On Mon, Jun 10 2019, Shaun Tancheff wrote:
>
> > On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
> >
> >>>
> >> So I can just about see the possibility of the race that you think you
> >> have hit.  But it couldn't happen on x86, as it has strong memory
> >> ordering.
> >>
> >> What CPU architecture did this ASSERT trigger on?
> >>
> >
> > Agree with everything above, unfortunately was aarch64.
>
> Ahh.
> OK, I'm now reasonably convinced that the patch probably fixes the bug.
> It is a shame that there is no standard barrier between the wake_up and
> the wait_event(), but there isn't and we need to work with that.
>
> But I still don't like the patch.... :-(
>
> I have a few problems with it.
>
> Firstly, I generally don't like accessor functions.  They tend to hide
> useful information.
> Rather than cl_page_state_get() I would rather have
>          smp_load_acquire(&pg->cp_state)
>
> where ever it is needed, with a comment explaining why it is needed in
> that particular context.
> Some people seem to like accessor functions - there certainly are a lot
> of them in the kernel - but I don't think they help.
> Having an accessor function that just adds a barrier is, I think,
> particularly ugly as the comment can only be general in nature, and so
> doesn't really help the reader to understand why the barrier is needed.
> But even that isn't completely without precedent.
> "key_read_state()" in include/linux/key.h simply wraps
> smp_load_acquire(), and the comment there doesn't really tell me
> anything useful.
> "inet_sk_state_load" is slightly better it says it is for use in places
> where the socket lock isn't held - and names some of those.
>

Understood and a fair assessment.


> My second problem is that has "get" in the accessor function name is
> potentially misleading and "get" normally has a matching "put", and
> usually increments a refcount.  Having "read" or "load" in place of
> "get" in the accessor function name would remove my second objection.
>

Yeah, I was trying to balance with the set .. but really I'm not enamored
with that either :P


> My third objection is that the bug doesn't exist in the upstream client
> code.  I'm not sure whether this is luck or good management, but either
> way it is gone.
>

This is really good to know. Personally I was itching to refactor the whole
lot but this gives a me a much better path.


> cl_sync_io_wait() has been changed to always take csi_waitq.lock.  This
> avoids the need for anchor->csi_barrier, and provides a more robust way
> to ensure that cl_sync_io_note() is completely finished before
> cl_sync_io_wait() completes.
>
> That isn't quite the outcome I was expecting...
>
> I'm sorry if I seem to have been rather harsh on your patch - my
> intention was only to make sure it was the best it could be.
>
> It really needed to have the stack trace and the arch were the LBUG
> happened to be at all complete.
> I would have been nice if it identified the particular code where the
> barrier was needed: between cl_sync_io_note() and cl_sync_io_wait()
> because that mediates and ownership transition that happens while the VM
> page is locked.
>
> The rest are probably just issues of taste.
>
> Thanks,
> NeilBrown
>

Thanks I really appreciate the time need to consider the issue.
And I agree any patch needs to be as good and tasteful as we can make them.

Regards,
Shaun
<div dir="ltr"><div dir="ltr">On Tue, Jun 11, 2019 at 11:09 PM NeilBrown &lt;<a href="mailto:neilb@suse.com">neilb@suse.com</a>&gt; wrote:<br></div><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">On Mon, Jun 10 2019, Shaun Tancheff wrote:<br>
<br>
&gt; On Mon, Jun 10, 2019 at 7:49 PM NeilBrown &lt;<a href="mailto:neilb@suse.com" target="_blank">neilb@suse.com</a>&gt; wrote:<br>
&gt;<br>
&gt;&gt;&gt;<br>
&gt;&gt; So I can just about see the possibility of the race that you think you<br>
&gt;&gt; have hit.  But it couldn&#39;t happen on x86, as it has strong memory<br>
&gt;&gt; ordering.<br>
&gt;&gt;<br>
&gt;&gt; What CPU architecture did this ASSERT trigger on?<br>
&gt;&gt;<br>
&gt;<br>
&gt; Agree with everything above, unfortunately was aarch64.<br>
<br>
Ahh.<br>
OK, I&#39;m now reasonably convinced that the patch probably fixes the bug.<br>
It is a shame that there is no standard barrier between the wake_up and<br>
the wait_event(), but there isn&#39;t and we need to work with that.<br>
<br>
But I still don&#39;t like the patch.... :-(<br>
<br>
I have a few problems with it.<br>
<br>
Firstly, I generally don&#39;t like accessor functions.  They tend to hide<br>
useful information.<br>
Rather than cl_page_state_get() I would rather have<br>
         smp_load_acquire(&amp;pg-&gt;cp_state)<br>
<br>
where ever it is needed, with a comment explaining why it is needed in<br>
that particular context.<br>
Some people seem to like accessor functions - there certainly are a lot<br>
of them in the kernel - but I don&#39;t think they help.<br>
Having an accessor function that just adds a barrier is, I think,<br>
particularly ugly as the comment can only be general in nature, and so<br>
doesn&#39;t really help the reader to understand why the barrier is needed.<br>
But even that isn&#39;t completely without precedent.<br>
&quot;key_read_state()&quot; in include/linux/key.h simply wraps<br>
smp_load_acquire(), and the comment there doesn&#39;t really tell me<br>
anything useful.<br>
&quot;inet_sk_state_load&quot; is slightly better it says it is for use in places<br>
where the socket lock isn&#39;t held - and names some of those.<br></blockquote><div><br></div><div>Understood and a fair assessment.</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
My second problem is that has &quot;get&quot; in the accessor function name is<br>
potentially misleading and &quot;get&quot; normally has a matching &quot;put&quot;, and<br>
usually increments a refcount.  Having &quot;read&quot; or &quot;load&quot; in place of<br>
&quot;get&quot; in the accessor function name would remove my second objection.<br></blockquote><div><br></div><div>Yeah, I was trying to balance with the set .. but really I&#39;m not enamored</div><div>with that either :P</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
My third objection is that the bug doesn&#39;t exist in the upstream client<br>
code.  I&#39;m not sure whether this is luck or good management, but either<br>
way it is gone.<br></blockquote><div><br></div><div>This is really good to know. Personally I was itching to refactor the whole</div><div>lot but this gives a me a much better path.</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">cl_sync_io_wait() has been changed to always take csi_waitq.lock.  This<br>
avoids the need for anchor-&gt;csi_barrier, and provides a more robust way<br>
to ensure that cl_sync_io_note() is completely finished before<br>
cl_sync_io_wait() completes.<br>
<br>
That isn&#39;t quite the outcome I was expecting...<br>
<br>
I&#39;m sorry if I seem to have been rather harsh on your patch - my<br>
intention was only to make sure it was the best it could be.<br>
<br>
It really needed to have the stack trace and the arch were the LBUG<br>
happened to be at all complete.<br>
I would have been nice if it identified the particular code where the<br>
barrier was needed: between cl_sync_io_note() and cl_sync_io_wait()<br>
because that mediates and ownership transition that happens while the VM<br>
page is locked.<br>
<br>
The rest are probably just issues of taste.<br>
<br>
Thanks,<br>
NeilBrown<br></blockquote><div><br></div><div>Thanks I really appreciate the time need to consider the issue.</div><div>And I agree any patch needs to be as good and tasteful as we can make them.</div><div><br></div><div>Regards,</div><div>Shaun</div><div><br></div></div></div>
Patrick Farrell June 12, 2019, 1:04 p.m. UTC | #7
As an interested observer (thanks to both of you for an interesting exchange), Shaun, what’s your plan going forward for the OpenSFS/WC branch?  It sounds like you’re thinking you’ll try to emulate what Neil did upstream? (which sounds good to me, I always prefer avoiding explicit memory barriers if reasonable)  Will you be opening an LU for this one?

Thanks,
- Patrick
Shaun Tancheff June 12, 2019, 4:12 p.m. UTC | #8
On Wed, Jun 12, 2019 at 8:04 AM Patrick Farrell <pfarrell@whamcloud.com>
wrote:

> As an interested observer (thanks to both of you for an interesting
> exchange), Shaun, what’s your plan going forward for the OpenSFS/WC
> branch?  It sounds like you’re thinking you’ll try to emulate what Neil did
> upstream? (which sounds good to me, I always prefer avoiding explicit
> memory barriers if reasonable)  Will you be opening an LU for this one?
>

Yes, I will be opening an LU for this. My plan is to emulate upstream.


> Thanks,
> - Patrick
> ------------------------------
> *From:* lustre-devel <lustre-devel-bounces@lists.lustre.org> on behalf of
> Shaun Tancheff <shaun@tancheff.com>
> *Sent:* Tuesday, June 11, 2019 11:36:13 PM
> *To:* NeilBrown
> *Cc:* Bobi Jam; lustre-devel@lists.lustre.org
> *Subject:* Re: [lustre-devel] [PATCH] Place a memory barrier around
> cp_state changes
>
> On Tue, Jun 11, 2019 at 11:09 PM NeilBrown <neilb@suse.com> wrote:
>
> On Mon, Jun 10 2019, Shaun Tancheff wrote:
>
> > On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
> >
> >>>
> >> So I can just about see the possibility of the race that you think you
> >> have hit.  But it couldn't happen on x86, as it has strong memory
> >> ordering.
> >>
> >> What CPU architecture did this ASSERT trigger on?
> >>
> >
> > Agree with everything above, unfortunately was aarch64.
>
> Ahh.
> OK, I'm now reasonably convinced that the patch probably fixes the bug.
> It is a shame that there is no standard barrier between the wake_up and
> the wait_event(), but there isn't and we need to work with that.
>
> But I still don't like the patch.... :-(
>
> I have a few problems with it.
>
> Firstly, I generally don't like accessor functions.  They tend to hide
> useful information.
> Rather than cl_page_state_get() I would rather have
>          smp_load_acquire(&pg->cp_state)
>
> where ever it is needed, with a comment explaining why it is needed in
> that particular context.
> Some people seem to like accessor functions - there certainly are a lot
> of them in the kernel - but I don't think they help.
> Having an accessor function that just adds a barrier is, I think,
> particularly ugly as the comment can only be general in nature, and so
> doesn't really help the reader to understand why the barrier is needed.
> But even that isn't completely without precedent.
> "key_read_state()" in include/linux/key.h simply wraps
> smp_load_acquire(), and the comment there doesn't really tell me
> anything useful.
> "inet_sk_state_load" is slightly better it says it is for use in places
> where the socket lock isn't held - and names some of those.
>
>
> Understood and a fair assessment.
>
>
> My second problem is that has "get" in the accessor function name is
> potentially misleading and "get" normally has a matching "put", and
> usually increments a refcount.  Having "read" or "load" in place of
> "get" in the accessor function name would remove my second objection.
>
>
> Yeah, I was trying to balance with the set .. but really I'm not enamored
> with that either :P
>
>
> My third objection is that the bug doesn't exist in the upstream client
> code.  I'm not sure whether this is luck or good management, but either
> way it is gone.
>
>
> This is really good to know. Personally I was itching to refactor the whole
> lot but this gives a me a much better path.
>
>
> cl_sync_io_wait() has been changed to always take csi_waitq.lock.  This
> avoids the need for anchor->csi_barrier, and provides a more robust way
> to ensure that cl_sync_io_note() is completely finished before
> cl_sync_io_wait() completes.
>
> That isn't quite the outcome I was expecting...
>
> I'm sorry if I seem to have been rather harsh on your patch - my
> intention was only to make sure it was the best it could be.
>
> It really needed to have the stack trace and the arch were the LBUG
> happened to be at all complete.
> I would have been nice if it identified the particular code where the
> barrier was needed: between cl_sync_io_note() and cl_sync_io_wait()
> because that mediates and ownership transition that happens while the VM
> page is locked.
>
> The rest are probably just issues of taste.
>
> Thanks,
> NeilBrown
>
>
> Thanks I really appreciate the time need to consider the issue.
> And I agree any patch needs to be as good and tasteful as we can make them.
>
> Regards,
> Shaun
>
>
<div dir="ltr"><div dir="ltr">On Wed, Jun 12, 2019 at 8:04 AM Patrick Farrell &lt;<a href="mailto:pfarrell@whamcloud.com">pfarrell@whamcloud.com</a>&gt; wrote:<br></div><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">



<div>
As an interested observer (thanks to both of you for an interesting exchange), Shaun, what’s your plan going forward for the OpenSFS/WC branch?  It sounds like you’re thinking you’ll try to emulate what Neil did upstream? (which sounds good to me, I always
 prefer avoiding explicit memory barriers if reasonable)  Will you be opening an LU for this one?<br></div></blockquote><div><br></div><div>Yes, I will be opening an LU for this. My plan is to emulate upstream.</div><div> <br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div>
Thanks,<br>
- Patrick
<hr style="display:inline-block;width:98%">
<div id="gmail-m_2263145130371291002divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> lustre-devel &lt;<a href="mailto:lustre-devel-bounces@lists.lustre.org" target="_blank">lustre-devel-bounces@lists.lustre.org</a>&gt; on behalf of Shaun Tancheff &lt;<a href="mailto:shaun@tancheff.com" target="_blank">shaun@tancheff.com</a>&gt;<br>
<b>Sent:</b> Tuesday, June 11, 2019 11:36:13 PM<br>
<b>To:</b> NeilBrown<br>
<b>Cc:</b> Bobi Jam; <a href="mailto:lustre-devel@lists.lustre.org" target="_blank">lustre-devel@lists.lustre.org</a><br>
<b>Subject:</b> Re: [lustre-devel] [PATCH] Place a memory barrier around cp_state changes</font>
<div> </div>
</div>
<div>
<div dir="ltr">
<div dir="ltr">On Tue, Jun 11, 2019 at 11:09 PM NeilBrown &lt;<a href="mailto:neilb@suse.com" target="_blank">neilb@suse.com</a>&gt; wrote:<br>
</div>
<div class="gmail-m_2263145130371291002x_gmail_quote">
<blockquote class="gmail-m_2263145130371291002x_gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
On Mon, Jun 10 2019, Shaun Tancheff wrote:<br>
<br>
&gt; On Mon, Jun 10, 2019 at 7:49 PM NeilBrown &lt;<a href="mailto:neilb@suse.com" target="_blank">neilb@suse.com</a>&gt; wrote:<br>
&gt;<br>
&gt;&gt;&gt;<br>
&gt;&gt; So I can just about see the possibility of the race that you think you<br>
&gt;&gt; have hit.  But it couldn&#39;t happen on x86, as it has strong memory<br>
&gt;&gt; ordering.<br>
&gt;&gt;<br>
&gt;&gt; What CPU architecture did this ASSERT trigger on?<br>
&gt;&gt;<br>
&gt;<br>
&gt; Agree with everything above, unfortunately was aarch64.<br>
<br>
Ahh.<br>
OK, I&#39;m now reasonably convinced that the patch probably fixes the bug.<br>
It is a shame that there is no standard barrier between the wake_up and<br>
the wait_event(), but there isn&#39;t and we need to work with that.<br>
<br>
But I still don&#39;t like the patch.... :-(<br>
<br>
I have a few problems with it.<br>
<br>
Firstly, I generally don&#39;t like accessor functions.  They tend to hide<br>
useful information.<br>
Rather than cl_page_state_get() I would rather have<br>
         smp_load_acquire(&amp;pg-&gt;cp_state)<br>
<br>
where ever it is needed, with a comment explaining why it is needed in<br>
that particular context.<br>
Some people seem to like accessor functions - there certainly are a lot<br>
of them in the kernel - but I don&#39;t think they help.<br>
Having an accessor function that just adds a barrier is, I think,<br>
particularly ugly as the comment can only be general in nature, and so<br>
doesn&#39;t really help the reader to understand why the barrier is needed.<br>
But even that isn&#39;t completely without precedent.<br>
&quot;key_read_state()&quot; in include/linux/key.h simply wraps<br>
smp_load_acquire(), and the comment there doesn&#39;t really tell me<br>
anything useful.<br>
&quot;inet_sk_state_load&quot; is slightly better it says it is for use in places<br>
where the socket lock isn&#39;t held - and names some of those.<br>
</blockquote>
<div><br>
</div>
<div>Understood and a fair assessment.</div>
<div> </div>
<blockquote class="gmail-m_2263145130371291002x_gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
My second problem is that has &quot;get&quot; in the accessor function name is<br>
potentially misleading and &quot;get&quot; normally has a matching &quot;put&quot;, and<br>
usually increments a refcount.  Having &quot;read&quot; or &quot;load&quot; in place of<br>
&quot;get&quot; in the accessor function name would remove my second objection.<br>
</blockquote>
<div><br>
</div>
<div>Yeah, I was trying to balance with the set .. but really I&#39;m not enamored</div>
<div>with that either :P</div>
<div> </div>
<blockquote class="gmail-m_2263145130371291002x_gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
My third objection is that the bug doesn&#39;t exist in the upstream client<br>
code.  I&#39;m not sure whether this is luck or good management, but either<br>
way it is gone.<br>
</blockquote>
<div><br>
</div>
<div>This is really good to know. Personally I was itching to refactor the whole</div>
<div>lot but this gives a me a much better path.</div>
<div> </div>
<blockquote class="gmail-m_2263145130371291002x_gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
cl_sync_io_wait() has been changed to always take csi_waitq.lock.  This<br>
avoids the need for anchor-&gt;csi_barrier, and provides a more robust way<br>
to ensure that cl_sync_io_note() is completely finished before<br>
cl_sync_io_wait() completes.<br>
<br>
That isn&#39;t quite the outcome I was expecting...<br>
<br>
I&#39;m sorry if I seem to have been rather harsh on your patch - my<br>
intention was only to make sure it was the best it could be.<br>
<br>
It really needed to have the stack trace and the arch were the LBUG<br>
happened to be at all complete.<br>
I would have been nice if it identified the particular code where the<br>
barrier was needed: between cl_sync_io_note() and cl_sync_io_wait()<br>
because that mediates and ownership transition that happens while the VM<br>
page is locked.<br>
<br>
The rest are probably just issues of taste.<br>
<br>
Thanks,<br>
NeilBrown<br>
</blockquote>
<div><br>
</div>
<div>Thanks I really appreciate the time need to consider the issue.</div>
<div>And I agree any patch needs to be as good and tasteful as we can make them.</div>
<div><br>
</div>
<div>Regards,</div>
<div>Shaun</div>
<div><br>
</div>
</div>
</div>
</div>
</div>

</blockquote></div></div>
diff mbox series

Patch

diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h
index 691c2f5da53a..d6e1f6f05f50 100644
--- a/fs/lustre/include/cl_object.h
+++ b/fs/lustre/include/cl_object.h
@@ -752,6 +752,17 @@  struct cl_page {
 	struct cl_sync_io		*cp_sync_io;
 };
 
+static inline enum cl_page_state cl_page_state_get(const struct cl_page *pg)
+{
+	/*
+	 * Paired with smp_store_release in cl_page_state_set_trust
+	 * and ensures that we see the most recent value of cp_state
+	 * even when the last modification was not performed on the
+	 * current processor
+	 */
+	return smp_load_acquire(&pg->cp_state);
+}
+
 /**
  * Per-layer part of cl_page.
  *
diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
index e4ce3b6f5772..364dec208ccd 100644
--- a/fs/lustre/llite/rw26.c
+++ b/fs/lustre/llite/rw26.c
@@ -200,7 +200,7 @@  static ssize_t ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io,
 
 		rc = cl_page_own(env, io, clp);
 		if (rc) {
-			LASSERT(clp->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
 			cl_page_put(env, clp);
 			break;
 		}
diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
index 590e5f5e43c9..38b8c488d765 100644
--- a/fs/lustre/llite/vvp_page.c
+++ b/fs/lustre/llite/vvp_page.c
@@ -323,18 +323,18 @@  static int vvp_page_make_ready(const struct lu_env *env,
 
 	lock_page(vmpage);
 	if (clear_page_dirty_for_io(vmpage)) {
-		LASSERT(pg->cp_state == CPS_CACHED);
+		LASSERT(cl_page_state_get(pg) == CPS_CACHED);
 		/* This actually clears the dirty bit in the radix tree. */
 		set_page_writeback(vmpage);
 		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
-	} else if (pg->cp_state == CPS_PAGEOUT) {
+	} else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
 		/* is it possible for osc_flush_async_page() to already
 		 * make it ready?
 		 */
 		result = -EALREADY;
 	} else {
 		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
-			      pg->cp_state);
+			      cl_page_state_get(pg));
 		LBUG();
 	}
 	unlock_page(vmpage);
diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
index 349f19e014e0..da4429b82932 100644
--- a/fs/lustre/obdclass/cl_page.c
+++ b/fs/lustre/obdclass/cl_page.c
@@ -97,7 +97,7 @@  static void cl_page_free(const struct lu_env *env, struct cl_page *page)
 
 	PASSERT(env, page, list_empty(&page->cp_batch));
 	PASSERT(env, page, !page->cp_owner);
-	PASSERT(env, page, page->cp_state == CPS_FREEING);
+	PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
 
 	while ((slice = list_first_entry_or_null(&page->cp_layers,
 						 struct cl_page_slice,
@@ -119,8 +119,14 @@  static void cl_page_free(const struct lu_env *env, struct cl_page *page)
 static inline void cl_page_state_set_trust(struct cl_page *page,
 					   enum cl_page_state state)
 {
-	/* bypass const. */
-	*(enum cl_page_state *)&page->cp_state = state;
+	/*
+	 * Paired with smp_load_acquire in cl_page_state_get
+	 * and ensures that we see the most recent value of cp_state
+	 * is available even when the next access is not performed on the
+	 * current processor.
+	 * Note we also cast away const as the only modifier of cp_state.
+	 */
+	smp_store_release((enum cl_page_state *)&page->cp_state, state);
 }
 
 struct cl_page *cl_page_alloc(const struct lu_env *env,
@@ -270,10 +276,10 @@  static void __cl_page_state_set(const struct lu_env *env,
 		}
 	};
 
-	old = page->cp_state;
+	old = cl_page_state_get(page);
 	PASSERT(env, page, allowed_transitions[old][state]);
 	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
-	PASSERT(env, page, page->cp_state == old);
+	PASSERT(env, page, cl_page_state_get(page) == old);
 	PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
 	cl_page_state_set_trust(page, state);
 }
@@ -313,7 +319,7 @@  void cl_page_put(const struct lu_env *env, struct cl_page *page)
 		       refcount_read(&page->cp_ref));
 
 	if (refcount_dec_and_test(&page->cp_ref)) {
-		LASSERT(page->cp_state == CPS_FREEING);
+		LASSERT(cl_page_state_get(page) == CPS_FREEING);
 
 		LASSERT(refcount_read(&page->cp_ref) == 0);
 		PASSERT(env, page, !page->cp_owner);
@@ -378,7 +384,7 @@  void __cl_page_disown(const struct lu_env *env,
 	const struct cl_page_slice *slice;
 	enum cl_page_state state;
 
-	state = pg->cp_state;
+	state = cl_page_state_get(pg);
 	cl_page_owner_clear(pg);
 
 	if (state == CPS_OWNED)
@@ -402,7 +408,7 @@  int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
 	struct cl_io *top = cl_io_top((struct cl_io *)io);
 
 	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
-	return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
+	return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
 }
 EXPORT_SYMBOL(cl_page_is_owned);
 
@@ -434,7 +440,7 @@  static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
 
 	io = cl_io_top(io);
 
-	if (pg->cp_state == CPS_FREEING) {
+	if (cl_page_state_get(pg) == CPS_FREEING) {
 		result = -ENOENT;
 		goto out;
 	}
@@ -453,7 +459,7 @@  static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
 		PASSERT(env, pg, !pg->cp_owner);
 		pg->cp_owner = cl_io_top(io);
 		cl_page_owner_set(pg);
-		if (pg->cp_state != CPS_FREEING) {
+		if (cl_page_state_get(pg) != CPS_FREEING) {
 			cl_page_state_set(env, pg, CPS_OWNED);
 		} else {
 			__cl_page_disown(env, io, pg);
@@ -593,7 +599,7 @@  static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg)
 {
 	const struct cl_page_slice *slice;
 
-	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+	PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
 
 	/*
 	 * Sever all ways to obtain new pointers to @pg.
@@ -756,7 +762,7 @@  void cl_page_completion(const struct lu_env *env,
 	const struct cl_page_slice *slice;
 
 	PASSERT(env, pg, crt < CRT_NR);
-	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+	PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
 
 	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
 
@@ -805,7 +811,7 @@  int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
 	}
 
 	if (result >= 0) {
-		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+		PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
 		cl_page_io_start(env, pg, crt);
 		result = 0;
 	}
@@ -870,7 +876,7 @@  void cl_page_header_print(const struct lu_env *env, void *cookie,
 	(*printer)(env, cookie,
 		   "page@%p[%d %p %d %d %p]\n",
 		   pg, refcount_read(&pg->cp_ref), pg->cp_obj,
-		   pg->cp_state, pg->cp_type,
+		   cl_page_state_get(pg), pg->cp_type,
 		   pg->cp_owner);
 }
 EXPORT_SYMBOL(cl_page_header_print);
diff --git a/fs/lustre/obdecho/echo_client.c b/fs/lustre/obdecho/echo_client.c
index 317123fd27cb..d879f109e641 100644
--- a/fs/lustre/obdecho/echo_client.c
+++ b/fs/lustre/obdecho/echo_client.c
@@ -1046,7 +1046,7 @@  static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
 
 		rc = cl_page_own(env, io, clp);
 		if (rc) {
-			LASSERT(clp->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
 			cl_page_put(env, clp);
 			break;
 		}
diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
index f8fddbfe6a7e..75984b98b229 100644
--- a/fs/lustre/osc/osc_cache.c
+++ b/fs/lustre/osc/osc_cache.c
@@ -1045,7 +1045,7 @@  static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 			cl_page_discard(env, io, page);
 			cl_page_disown(env, io, page);
 		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(page) == CPS_FREEING);
 			LASSERT(0);
 		}
 
@@ -1356,10 +1356,12 @@  static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
 	int srvlock;
 
 	cmd &= ~OBD_BRW_NOQUOTA;
-	LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
-		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
-	LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
-		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
+	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
+		      cmd == OBD_BRW_READ),
+		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
+	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
+		      cmd == OBD_BRW_WRITE),
+		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
 	LASSERT(opg->ops_transfer_pinned);
 
 	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
@@ -3061,7 +3063,7 @@  bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
 
 			page = ops->ops_cl.cpl_page;
 			LASSERT(page->cp_type == CPT_CACHEABLE);
-			if (page->cp_state == CPS_FREEING)
+			if (cl_page_state_get(page) == CPS_FREEING)
 				continue;
 
 			cl_page_get(page);
@@ -3142,7 +3144,7 @@  static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
 			cl_page_discard(env, io, page);
 			cl_page_disown(env, io, page);
 		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(page) == CPS_FREEING);
 		}
 	}
 
@@ -3169,7 +3171,7 @@  static bool discard_cb(const struct lu_env *env, struct cl_io *io,
 		cl_page_discard(env, io, page);
 		cl_page_disown(env, io, page);
 	} else {
-		LASSERT(page->cp_state == CPS_FREEING);
+		LASSERT(cl_page_state_get(page) == CPS_FREEING);
 	}
 
 	return true;