| Message ID | 20190606223110.15725-1-stancheff@cray.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Place a memory barrier around cp_state changes |
On Thu, Jun 06 2019, Shaun Tancheff wrote:

> Ensure all uses of cl_page->cp_state are smp-safe.
>
> LBUG Output:
> In __cl_page_state_set()
> ..
>   old = page->cp_state;
>   PASSERT(env, page, allowed_transitions[old][state]);
> ..
> Asserted with the following:
> page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]

What do all these fields mean? Or where is this printed?
The closest I can find is in cl_page_header_print() which uses the
format string:

    "page@%p[%d %p %d %d %p]\n",

But that only has 5 fields in the [], while the output you provided has
6.

>
> However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition
> leading to the conclusion that cp_state became 0 during the
> assertion.

Where is the evidence that this was the transition that was happening?
If it was in some part of the LBUG output that wasn't quoted - then
please fix that by quoting the entire LBUG output.

We need to understand which change happened at this time to cause the
race.
Then we need to explain why the added barriers actually close the race
window.

If these barriers do actually fix a race, then there is something *very*
wrong.
The comments say that changes to cp_state are protected by the page lock
on the corresponding VM page.
Locking and unlocking a VM page entails sufficient barriers that changes
made while one thread holds the lock will be visible to another
thread once it also gets the lock.

So the only possible explanation for a race as you suspect, is if the
vmpage *isn't* locked when something changes cp_state, and that would be
bad.

Thanks,
NeilBrown

>
> Signed-off-by: Shaun Tancheff <stancheff@cray.com>
> ---
>  fs/lustre/include/cl_object.h   | 11 +++++++++++
>  fs/lustre/llite/rw26.c          |  2 +-
>  fs/lustre/llite/vvp_page.c      |  6 +++---
>  fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------
>  fs/lustre/obdecho/echo_client.c |  2 +-
>  fs/lustre/osc/osc_cache.c       | 18 +++++++++--------
>  6 files changed, 46 insertions(+), 27 deletions(-)
>
> diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h
> index 691c2f5da53a..d6e1f6f05f50 100644
> --- a/fs/lustre/include/cl_object.h
> +++ b/fs/lustre/include/cl_object.h
> @@ -752,6 +752,17 @@ struct cl_page {
>  	struct cl_sync_io	*cp_sync_io;
>  };
>
> +static inline enum cl_page_state cl_page_state_get(const struct cl_page *pg)
> +{
> +	/*
> +	 * Paired with smp_store_release in cl_page_state_set_trust
> +	 * and ensures that we see the most recent value of cp_state
> +	 * even when the last modification was not performed on the
> +	 * current processor
> +	 */
> +	return smp_load_acquire(&pg->cp_state);
> +}
> +
>  /**
>   * Per-layer part of cl_page.
>   *
> diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
> index e4ce3b6f5772..364dec208ccd 100644
> --- a/fs/lustre/llite/rw26.c
> +++ b/fs/lustre/llite/rw26.c
> @@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io,
>
>  		rc = cl_page_own(env, io, clp);
>  		if (rc) {
> -			LASSERT(clp->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
>  			cl_page_put(env, clp);
>  			break;
>  		}
> diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
> index 590e5f5e43c9..38b8c488d765 100644
> --- a/fs/lustre/llite/vvp_page.c
> +++ b/fs/lustre/llite/vvp_page.c
> @@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env *env,
>
>  	lock_page(vmpage);
>  	if (clear_page_dirty_for_io(vmpage)) {
> -		LASSERT(pg->cp_state == CPS_CACHED);
> +		LASSERT(cl_page_state_get(pg) == CPS_CACHED);
>  		/* This actually clears the dirty bit in the radix tree. */
>  		set_page_writeback(vmpage);
>  		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
> -	} else if (pg->cp_state == CPS_PAGEOUT) {
> +	} else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
>  		/* is it possible for osc_flush_async_page() to already
>  		 * make it ready?
>  		 */
>  		result = -EALREADY;
>  	} else {
>  		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
> -			      pg->cp_state);
> +			      cl_page_state_get(pg));
>  		LBUG();
>  	}
>  	unlock_page(vmpage);
> diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
> index 349f19e014e0..da4429b82932 100644
> --- a/fs/lustre/obdclass/cl_page.c
> +++ b/fs/lustre/obdclass/cl_page.c
> @@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
>
>  	PASSERT(env, page, list_empty(&page->cp_batch));
>  	PASSERT(env, page, !page->cp_owner);
> -	PASSERT(env, page, page->cp_state == CPS_FREEING);
> +	PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
>
>  	while ((slice = list_first_entry_or_null(&page->cp_layers,
>  						 struct cl_page_slice,
> @@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
>  static inline void cl_page_state_set_trust(struct cl_page *page,
>  					   enum cl_page_state state)
>  {
> -	/* bypass const. */
> -	*(enum cl_page_state *)&page->cp_state = state;
> +	/*
> +	 * Paired with smp_load_acquire in cl_page_state_get
> +	 * and ensures that we see the most recent value of cp_state
> +	 * is available even when the next access is not performed on the
> +	 * current processor.
> +	 * Note we also cast away const as the only modifier of cp_state.
> +	 */
> +	smp_store_release((enum cl_page_state *)&page->cp_state, state);
>  }
>
>  struct cl_page *cl_page_alloc(const struct lu_env *env,
> @@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct lu_env *env,
>  		}
>  	};
>
> -	old = page->cp_state;
> +	old = cl_page_state_get(page);
>  	PASSERT(env, page, allowed_transitions[old][state]);
>  	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
> -	PASSERT(env, page, page->cp_state == old);
> +	PASSERT(env, page, cl_page_state_get(page) == old);
>  	PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
>  	cl_page_state_set_trust(page, state);
>  }
> @@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)
>  		       refcount_read(&page->cp_ref));
>
>  	if (refcount_dec_and_test(&page->cp_ref)) {
> -		LASSERT(page->cp_state == CPS_FREEING);
> +		LASSERT(cl_page_state_get(page) == CPS_FREEING);
>
>  		LASSERT(refcount_read(&page->cp_ref) == 0);
>  		PASSERT(env, page, !page->cp_owner);
> @@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,
>  	const struct cl_page_slice *slice;
>  	enum cl_page_state state;
>
> -	state = pg->cp_state;
> +	state = cl_page_state_get(pg);
>  	cl_page_owner_clear(pg);
>
>  	if (state == CPS_OWNED)
> @@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
>  	struct cl_io *top = cl_io_top((struct cl_io *)io);
>
>  	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
> -	return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
> +	return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
>  }
>  EXPORT_SYMBOL(cl_page_is_owned);
>
> @@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
>
>  	io = cl_io_top(io);
>
> -	if (pg->cp_state == CPS_FREEING) {
> +	if (cl_page_state_get(pg) == CPS_FREEING) {
>  		result = -ENOENT;
>  		goto out;
>  	}
> @@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
>  		PASSERT(env, pg, !pg->cp_owner);
>  		pg->cp_owner = cl_io_top(io);
>  		cl_page_owner_set(pg);
> -		if (pg->cp_state != CPS_FREEING) {
> +		if (cl_page_state_get(pg) != CPS_FREEING) {
>  			cl_page_state_set(env, pg, CPS_OWNED);
>  		} else {
>  			__cl_page_disown(env, io, pg);
> @@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg)
>  {
>  	const struct cl_page_slice *slice;
>
> -	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
> +	PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
>
>  	/*
>  	 * Sever all ways to obtain new pointers to @pg.
> @@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,
>  	const struct cl_page_slice *slice;
>
>  	PASSERT(env, pg, crt < CRT_NR);
> -	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
> +	PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
>
>  	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
>
> @@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
>  	}
>
>  	if (result >= 0) {
> -		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
> +		PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
>  		cl_page_io_start(env, pg, crt);
>  		result = 0;
>  	}
> @@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env, void *cookie,
>  	(*printer)(env, cookie,
>  		   "page@%p[%d %p %d %d %p]\n",
>  		   pg, refcount_read(&pg->cp_ref), pg->cp_obj,
> -		   pg->cp_state, pg->cp_type,
> +		   cl_page_state_get(pg), pg->cp_type,
>  		   pg->cp_owner);
>  }
>  EXPORT_SYMBOL(cl_page_header_print);
> diff --git a/fs/lustre/obdecho/echo_client.c b/fs/lustre/obdecho/echo_client.c
> index 317123fd27cb..d879f109e641 100644
> --- a/fs/lustre/obdecho/echo_client.c
> +++ b/fs/lustre/obdecho/echo_client.c
> @@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
>
>  		rc = cl_page_own(env, io, clp);
>  		if (rc) {
> -			LASSERT(clp->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
>  			cl_page_put(env, clp);
>  			break;
>  		}
> diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
> index f8fddbfe6a7e..75984b98b229 100644
> --- a/fs/lustre/osc/osc_cache.c
> +++ b/fs/lustre/osc/osc_cache.c
> @@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
>  			cl_page_discard(env, io, page);
>  			cl_page_disown(env, io, page);
>  		} else {
> -			LASSERT(page->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  			LASSERT(0);
>  		}
>
> @@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
>  	int srvlock;
>
>  	cmd &= ~OBD_BRW_NOQUOTA;
> -	LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
> -		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> -	LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
> -		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> +	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
> +		      cmd == OBD_BRW_READ),
> +		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> +	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
> +		      cmd == OBD_BRW_WRITE),
> +		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
>  	LASSERT(opg->ops_transfer_pinned);
>
>  	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
> @@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
>
>  		page = ops->ops_cl.cpl_page;
>  		LASSERT(page->cp_type == CPT_CACHEABLE);
> -		if (page->cp_state == CPS_FREEING)
> +		if (cl_page_state_get(page) == CPS_FREEING)
>  			continue;
>
>  		cl_page_get(page);
> @@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
>  			cl_page_discard(env, io, page);
>  			cl_page_disown(env, io, page);
>  		} else {
> -			LASSERT(page->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  		}
>  	}
>
> @@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env, struct cl_io *io,
>  			cl_page_discard(env, io, page);
>  			cl_page_disown(env, io, page);
>  		} else {
> -			LASSERT(page->cp_state == CPS_FREEING);
> +			LASSERT(cl_page_state_get(page) == CPS_FREEING);
>  		}
>
>  	return true;
> --
> 2.17.1
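For readers following the barrier discussion: the accessor pair the patch
introduces is the standard release/acquire publication idiom. Below is a
minimal userspace C11 analogue (illustrative only - the names mirror the
patch, but none of this is Lustre code):

    /*
     * Userspace analogue of the pairing the patch adds:
     * cl_page_state_set_trust() ~ a release store, and
     * cl_page_state_get() ~ an acquire load.  The release store orders
     * every earlier write before the state change, so a thread whose
     * acquire load observes the new state also sees those writes.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    enum cl_page_state { CPS_CACHED, CPS_OWNED, CPS_PAGEIN, CPS_PAGEOUT, CPS_FREEING };

    static _Atomic enum cl_page_state cp_state = CPS_CACHED;
    static int payload;    /* stands in for the rest of the page's fields */

    static void set_trust(enum cl_page_state state)
    {
        /* like smp_store_release((enum cl_page_state *)&page->cp_state, state) */
        atomic_store_explicit(&cp_state, state, memory_order_release);
    }

    static enum cl_page_state state_get(void)
    {
        /* like smp_load_acquire(&pg->cp_state) */
        return atomic_load_explicit(&cp_state, memory_order_acquire);
    }

    static void *owner(void *unused)
    {
        payload = 42;          /* ordinary store, ordered by the release below */
        set_trust(CPS_OWNED);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, owner, NULL);
        /* once the acquire load sees CPS_OWNED, payload == 42 is guaranteed */
        while (state_get() != CPS_OWNED)
            ;
        printf("state=%d payload=%d\n", state_get(), payload);
        pthread_join(t, NULL);
        return 0;
    }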
On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
>
> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > Ensure all uses of cl_page->cp_state are smp-safe.
> >
> > LBUG Output:
> > In __cl_page_state_set()
> > ..
> >   old = page->cp_state;
> >   PASSERT(env, page, allowed_transitions[old][state]);
> > ..
> > Asserted with the following:
> > page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> What do all these fields mean? Or where is this printed?
> The closest I can find is in cl_page_header_print() which uses the
> format string:
>
>     "page@%p[%d %p %d %d %p]\n",
>
> But that only has 5 fields in the [], while the output you provided has
> 6.
>

Sorry for the confusion, here is my (long) explanation with source
snippets:

The LBUG in full:

LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) vvp-page@ffff80be1fcd6650(0:0) vm@ffff7e02fa0e60c0 5000000000001029 4:0 ffff80be1fcd6600 1049548 lru
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) lov-page@ffff80be1fcd6690, comp index: 0, gen: 0
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) osc-page@ffff80be1fcd66c8 1049548: 1< 0x845fed 2 0 - - > 2< 4298948608 0 4096 0x0 0x420 | (null) ffff809e473708e8 ffff80be199c1e40 > 3< 0 0 0 > 4< 5 13 64 0 + | - - - - > 5< - - - - | 0 - | 0 - ->
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) end page@ffff80be1fcd6600
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) allowed_transitions[old][state]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) ASSERTION( 0 ) failed:
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) LBUG

The source snippet around L342:
...
 339
 340         ENTRY;
 341         old = page->cp_state;
 342         PASSERT(env, page, allowed_transitions[old][state]);
 343         CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
 344         PASSERT(env, page, page->cp_state == old);
 345         PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner != NULL));
 346
 347         cs_pagestate_dec(page->cp_obj, page->cp_state);
 348         cs_pagestate_inc(page->cp_obj, state);

The PASSERT macro:
...
# define PASSERT(env, page, expr)                                       \
        do {                                                            \
                if (unlikely(!(expr))) {                                \
                        CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \
                        LASSERT(0);                                     \
                }                                                       \
        } while (0)
#else /* !LIBCFS_DEBUG */
...

/**
 * Helper macro, dumping detailed information about \a page into a log.
 */
#define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \
do {                                                                    \
        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                   \
                LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);        \
                cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
                CDEBUG(mask, format , ## __VA_ARGS__);                  \
        }                                                               \
} while (0)

Almost there:
...
1062 /**
1063  * Prints human readable representation of \a pg to the \a f.
1064  */
1065 void cl_page_print(const struct lu_env *env, void *cookie,
1066                    lu_printer_t printer, const struct cl_page *pg)
1067 {
1068         const struct cl_page_slice *slice;
1069         int result = 0;
1070
1071         cl_page_header_print(env, cookie, printer, pg);
1072         list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
1073                 if (slice->cpl_ops->cpo_print != NULL)
1074                         result = (*slice->cpl_ops->cpo_print)(env, slice,
1075                                                               cookie, printer);

Finally the 'good stuff':
...
1051 void cl_page_header_print(const struct lu_env *env, void *cookie,
1052                           lu_printer_t printer, const struct cl_page *pg)
1053 {
1054         (*printer)(env, cookie,
1055                    "page@%p[%d %p %d %d %p]\n",
1056                    pg, atomic_read(&pg->cp_ref), pg->cp_obj,
1057                    pg->cp_state, pg->cp_type,
1058                    pg->cp_owner);
1059 }
...

Given the above along with the first line from the LBUG:

LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]

this implies the following:

pg           == ffff80be1fcd6600
pg->cp_ref   == 3
pg->cp_obj   == ffff80be1fca5cc0
pg->cp_state == 0 (state == CPS_CACHED)
pg->cp_type  == 1
pg->cp_owner == ffff809e60e28cc0

Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
backtrace)
So we know the on-stack / register value of state == CPS_OWNED

The backtrace:
...
 dump_backtrace+0x0/0x248
 show_stack+0x24/0x30
 dump_stack+0xbc/0xf4
 libcfs_call_trace+0xec/0x120 [libcfs]
 lbug_with_loc+0x4c/0xb8 [libcfs]
 cl_page_state_set0+0x2b4/0x6e0 [obdclass]
 cl_page_assume+0xdc/0x3e0 [obdclass]
 ll_io_read_page+0x144c/0x1de0 [lustre]
 ll_write_begin+0x3d4/0xee8 [lustre]

and the contents of cl_page_assume:

 596 void cl_page_assume(const struct lu_env *env,
 597                     struct cl_io *io, struct cl_page *pg)
 598 {
 599         const struct cl_page_slice *slice;
 600
 601         PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
 602
 603         ENTRY;
 604         io = cl_io_top(io);
 605
 606         list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
 607                 if (slice->cpl_ops->cpo_assume != NULL)
 608                         (*slice->cpl_ops->cpo_assume)(env, slice, io);
 609         }
 610
 611         PASSERT(env, pg, pg->cp_owner == NULL);
 612         pg->cp_owner = cl_io_top(io);
 613         cl_page_owner_set(pg);
 614         cl_page_state_set(env, pg, CPS_OWNED);
 615         EXIT;
 616 }

While I believe my conclusion to be correct, I am certainly open to being
dead wrong.

On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
> [...]
> If these barriers do actually fix a race, then there is something *very*
> wrong.
> The comments say that changes to cp_state are protected by the page lock
> on the corresponding VM page.
> Locking and unlocking a VM page entails sufficient barriers that changes
> made while one thread holds the lock will be visible to another
> thread once it also gets the lock.
>

True, but a page lock is just a bit on the page flags. And we absolutely
wait on I/O while the page is 'locked' so a ping-pong (our task *can*
sleep) is entirely reasonable, as far as I understand anyway. Since there
are multiple state changes during a single page lock/unlock there is a
race, however rare and unlikely, and it's arch specific based on what I
understand of the memory barrier rules.

> So the only possible explanation for a race as you suspect, is if the
> vmpage *isn't* locked when something changes cp_state, and that would be
> bad.
>
> Thanks,
> NeilBrown
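The decode above hinges on a detail of the macro: the state is read once
for the check ('old') and then read again, independently, when
CL_PAGE_DEBUG() dumps the page. Here is a small userspace C11 sketch of
that two-read structure (illustrative only, not Lustre code; the
transition table is a made-up subset), showing how a concurrent writer
can make the diagnostic print a value that contradicts the failed check:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    static _Atomic int cp_state = 3;    /* CPS_PAGEIN, say */

    static void *completer(void *unused)
    {
        atomic_store_explicit(&cp_state, 0, memory_order_release); /* -> CPS_CACHED */
        return NULL;
    }

    int main(void)
    {
        static const int allowed[5][5] = { [0][1] = 1, [3][0] = 1 };  /* tiny subset */
        int state = 1;        /* CPS_OWNED: what the caller wants */
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);

        int old = atomic_load_explicit(&cp_state, memory_order_acquire);
        if (!allowed[old][state]) {
            /*
             * Second, independent read for the report: if the completer
             * ran in between, this prints 0 -> 1, which *is* an allowed
             * transition, so the report contradicts the check that
             * failed - matching the reading of the LBUG above.
             */
            printf("illegal transition, cp_state now %d -> %d\n",
                   atomic_load_explicit(&cp_state, memory_order_relaxed),
                   state);
        }
        pthread_join(t, NULL);
        return 0;
    }

Note this two-read inconsistency only needs the writer to run between the
reads, so it is possible on any SMP machine; the stale-read variant Shaun
suggests additionally needs the weak ordering of an arch like arm64.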
On Thu, Jun 06 2019, Shaun Tancheff wrote:

> On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
>>
>> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>>
>> > Ensure all uses of cl_page->cp_state are smp-safe.
>> >
>> > LBUG Output:
>> > In __cl_page_state_set()
>> > ..
>> >   old = page->cp_state;
>> >   PASSERT(env, page, allowed_transitions[old][state]);
>> > ..
>> > Asserted with the following:
>> > page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]

I now see that this space..... ^^^
shouldn't be there..
.....

> Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
> backtrace)
> So we know the on-stack / register value of state == CPS_OWNED
>
> The backtrace:
> ...
>  dump_backtrace+0x0/0x248
>  show_stack+0x24/0x30
>  dump_stack+0xbc/0xf4
>  libcfs_call_trace+0xec/0x120 [libcfs]
>  lbug_with_loc+0x4c/0xb8 [libcfs]
>  cl_page_state_set0+0x2b4/0x6e0 [obdclass]
>  cl_page_assume+0xdc/0x3e0 [obdclass]
>  ll_io_read_page+0x144c/0x1de0 [lustre]
>  ll_write_begin+0x3d4/0xee8 [lustre]

Thanks for this extra detail.
I agree that the evidence suggests that page->cp_state changed (to
CPS_CACHED) during the processing of the PASSERT().

It isn't clear to me though that this problem can be fixed just by
adding barriers - particularly if this was on x86_64 as I suspect it
is. Intel architecture processors have strongly ordered memory
semantics. I'm not entirely sure what that means, but as
smp_load_acquire() is very nearly a no-op on x86, I doubt that the
memory barriers you added actually fix anything there.

My understanding (based on a fairly shallow reading of the code) is that
a partial-page write is being attempted, and the target page was not in
memory, so it was read in synchronously.
The fragment of ll_io_read_page happening is:

	if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
		rc = cl_sync_io_wait(env, anchor, 0);

		cl_page_assume(env, io, page);

So cl_sync_io_wait() has just run. The page was (presumably) previously
owned by something else. cl_sync_io_wait() waited for csi_sync_nr
to drop to zero, which should say it isn't owned any more.
cl_page_assume() then takes ownership.
But it seems that the old ownership wasn't quite gone yet.

The VM page remains locked the whole time, so the barriers implicit in
locking and unlocking a page do not come into play here.
There is an implicit barrier on the 'store' side when cl_sync_io_note()
calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see
any implicit barrier on the 'load' side. If l_wait_event() in
cl_sync_io_wait() took a spinlock, that would be enough. But it is
possible for the wait to complete without any locking.

So I can just about see the possibility of the race that you think you
have hit. But it couldn't happen on x86, as it has strong memory
ordering.

What CPU architecture did this ASSERT trigger on?

Thanks,
NeilBrown
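The handoff Neil describes compresses into a short userspace C11 sketch
(names borrowed from the Lustre code purely for illustration; the kernel
primitives differ, but the ordering question is the same):

    /*
     * The completer plays cl_sync_io_note(): it stores cp_state and then
     * decrements the counter; in the kernel, atomic_dec_and_test()
     * implies a barrier on that store side.  The waiter plays
     * cl_sync_io_wait() + cl_page_assume(): if its wait can complete
     * without taking a lock, the load below needs acquire semantics, or
     * on a weakly ordered CPU (e.g. aarch64) it may still see the
     * pre-handoff cp_state - which matches the observed LBUG.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    static _Atomic int csi_sync_nr = 1;
    static int cp_state = 1;    /* CPS_OWNED, say */

    static void *completer(void *unused)
    {
        cp_state = 0;           /* CPS_CACHED: plain store */
        /* release-decrement, standing in for atomic_dec_and_test() */
        atomic_fetch_sub_explicit(&csi_sync_nr, 1, memory_order_release);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);
        /*
         * memory_order_acquire here is the load-side barrier the patch
         * adds; with memory_order_relaxed instead, reading cp_state == 1
         * below would be a legal outcome on arm64.
         */
        while (atomic_load_explicit(&csi_sync_nr, memory_order_acquire))
            ;
        printf("cp_state=%d (0 is guaranteed only with the acquire)\n", cp_state);
        pthread_join(t, NULL);
        return 0;
    }

With the acquire load (smp_load_acquire() in the patch) the stale read is
ruled out; with a relaxed load it is permitted by both the C11 and arm64
memory models, even though x86's stronger ordering would never show it.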
On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
>
> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb@suse.com> wrote:
> >>
> >> On Thu, Jun 06 2019, Shaun Tancheff wrote:
> >>
> >> > Ensure all uses of cl_page->cp_state are smp-safe.
> >> >
> >> > LBUG Output:
> >> > In __cl_page_state_set()
> >> > ..
> >> >   old = page->cp_state;
> >> >   PASSERT(env, page, allowed_transitions[old][state]);
> >> > ..
> >> > Asserted with the following:
> >> > page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> I now see that this space..... ^^^
> shouldn't be there..
> .....
>
> > Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
> > backtrace)
> > So we know the on-stack / register value of state == CPS_OWNED
> >
> > The backtrace:
> > ...
> >  dump_backtrace+0x0/0x248
> >  show_stack+0x24/0x30
> >  dump_stack+0xbc/0xf4
> >  libcfs_call_trace+0xec/0x120 [libcfs]
> >  lbug_with_loc+0x4c/0xb8 [libcfs]
> >  cl_page_state_set0+0x2b4/0x6e0 [obdclass]
> >  cl_page_assume+0xdc/0x3e0 [obdclass]
> >  ll_io_read_page+0x144c/0x1de0 [lustre]
> >  ll_write_begin+0x3d4/0xee8 [lustre]
>
> Thanks for this extra detail.
> I agree that the evidence suggests that page->cp_state changed (to
> CPS_CACHED) during the processing of the PASSERT().
>
> It isn't clear to me though that this problem can be fixed just by
> adding barriers - particularly if this was on x86_64 as I suspect it
> is. Intel architecture processors have strongly ordered memory
> semantics. I'm not entirely sure what that means, but as
> smp_load_acquire() is very nearly a no-op on x86, I doubt that the
> memory barriers you added actually fix anything there.

Agree. This occurred on aarch64. I suspect one of the reasons that this
has not been seen is that the majority of users are x86_64. It appears
to be very rare on aarch64 but it could also, theoretically, happen on
the other relaxed architectures.

> My understanding (based on a fairly shallow reading of the code) is that
> a partial-page write is being attempted, and the target page was not in
> memory, so it was read in synchronously.
> The fragment of ll_io_read_page happening is:
>
> 	if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
> 		rc = cl_sync_io_wait(env, anchor, 0);
>
> 		cl_page_assume(env, io, page);
>
> So cl_sync_io_wait() has just run. The page was (presumably) previously
> owned by something else. cl_sync_io_wait() waited for csi_sync_nr
> to drop to zero, which should say it isn't owned any more.
> cl_page_assume() then takes ownership.
> But it seems that the old ownership wasn't quite gone yet.
>
> The VM page remains locked the whole time, so the barriers implicit in
> locking and unlocking a page do not come into play here.
> There is an implicit barrier on the 'store' side when cl_sync_io_note()
> calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see
> any implicit barrier on the 'load' side. If l_wait_event() in
> cl_sync_io_wait() took a spinlock, that would be enough. But it is
> possible for the wait to complete without any locking.
>
> So I can just about see the possibility of the race that you think you
> have hit. But it couldn't happen on x86, as it has strong memory
> ordering.
>
> What CPU architecture did this ASSERT trigger on?

Agree with everything above, unfortunately was aarch64.

> Thanks,
> NeilBrown
cl_sync_io_wait() waited for csi_sync_nr<br> to drop to zero, which should say it isn't owned any more.<br> cl_page_assume() then takes ownership.<br> But it seems that the old ownership wasn't quite gone yet.<br> <br> The VM page remains locked the whole time, so the barriers implicit in<br> locking and unlocking a page do not some into play here.<br> There is an implicit barrier on the 'store' side when cl_sync_io_note()<br> calls atomic_dec_and_test() to decrement csi_sync_nr, but I cannot see<br> any implicit barrier on the 'load' side. If l_wait_event() in<br> cl_sync_io_wait() took a spinlock, that would be enough. But it is<br> possible for the wait to complete without any locking.<br> <br> So I can just about see the possibility of the race that you think you<br> have hit. But it couldn't happen on x86, as it has strong memory<br> ordering.<br> <br> What CPU architecture did this ASSERT trigger on?<br></blockquote><div><br></div><div>Agree with everything above, unfortunately was aarch64.</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"> Thanks,<br> NeilBrown<br></blockquote><div> </div></div></div>
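To make the race concrete, here is a minimal sketch of the message-passing
pattern described above, using simplified, hypothetical names rather than the
real cl_page / cl_sync_io structures. The completion side is fully ordered by
atomic_dec_and_test(), but the waiter's plain loads are not, so on a weakly
ordered CPU such as aarch64 the waiter can observe the counter at zero while
still reading a stale state:

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/processor.h>

/* Hypothetical stand-ins for cl_page->cp_state and anchor->csi_sync_nr. */
static int state = 1;				/* "owned" */
static atomic_t nr_pending = ATOMIC_INIT(1);

/* CPU0: completion side, analogous to cl_sync_io_note(). */
static void completion_side(void)
{
	state = 0;				/* "cached", i.e. disowned */
	/*
	 * atomic_dec_and_test() is fully ordered, so the store to
	 * 'state' is globally visible before the counter is seen at
	 * zero - the store side is fine on its own.
	 */
	atomic_dec_and_test(&nr_pending);
}

/* CPU1: waiter side, analogous to cl_sync_io_wait() + cl_page_assume(). */
static void waiter_side(void)
{
	/* atomic_read() is a plain load with no acquire semantics... */
	while (atomic_read(&nr_pending) > 0)
		cpu_relax();
	/*
	 * ...so the load of 'state' below may be satisfied early, out
	 * of order, and return the pre-completion value even though
	 * nr_pending was seen at zero.  Reading it instead with
	 * smp_load_acquire(&state), paired with a release on the store
	 * side, closes the window - which is what the patch does.
	 */
	WARN_ON(state != 0);	/* the LBUG analogue: can fire on arm64 */
}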
On Mon, Jun 10 2019, Shaun Tancheff wrote:

> On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
>
>>>
>> So I can just about see the possibility of the race that you think you
>> have hit. But it couldn't happen on x86, as it has strong memory
>> ordering.
>>
>> What CPU architecture did this ASSERT trigger on?
>>
>
> Agree with everything above, unfortunately was aarch64.

Ahh.
OK, I'm now reasonably convinced that the patch probably fixes the bug.
It is a shame that there is no standard barrier between the wake_up and
the wait_event(), but there isn't and we need to work with that.

But I still don't like the patch.... :-(

I have a few problems with it.

Firstly, I generally don't like accessor functions. They tend to hide
useful information.
Rather than cl_page_state_get() I would rather have

  smp_load_acquire(&pg->cp_state)

wherever it is needed, with a comment explaining why it is needed in
that particular context.
Some people seem to like accessor functions - there certainly are a lot
of them in the kernel - but I don't think they help.
Having an accessor function that just adds a barrier is, I think,
particularly ugly as the comment can only be general in nature, and so
doesn't really help the reader to understand why the barrier is needed.
But even that isn't completely without precedent.
"key_read_state()" in include/linux/key.h simply wraps
smp_load_acquire(), and the comment there doesn't really tell me
anything useful.
"inet_sk_state_load" is slightly better: it says it is for use in places
where the socket lock isn't held - and names some of those.

My second problem is that having "get" in the accessor function name is
potentially misleading: "get" normally has a matching "put", and
usually increments a refcount. Having "read" or "load" in place of
"get" in the accessor function name would remove my second objection.

My third objection is that the bug doesn't exist in the upstream client
code. I'm not sure whether this is luck or good management, but either
way it is gone.
cl_sync_io_wait() has been changed to always take csi_waitq.lock. This
avoids the need for anchor->csi_barrier, and provides a more robust way
to ensure that cl_sync_io_note() is completely finished before
cl_sync_io_wait() completes.

That isn't quite the outcome I was expecting...

I'm sorry if I seem to have been rather harsh on your patch - my
intention was only to make sure it was the best it could be.

It really needed to have the stack trace and the arch where the LBUG
happened to be at all complete.
It would have been nice if it identified the particular code where the
barrier was needed: between cl_sync_io_note() and cl_sync_io_wait(),
because that mediates an ownership transition that happens while the VM
page is locked.

The rest are probably just issues of taste.

Thanks,
NeilBrown
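The lock-serialized pattern referred to here might look roughly like the
sketch below. The names are hypothetical and this is not the actual upstream
cl_sync_io code; the point is only that spin_lock()/spin_unlock() on the
shared waitqueue lock supply the acquire/release ordering, so no explicit
smp_* barrier is needed on either side:

#include <linux/atomic.h>
#include <linux/processor.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hypothetical anchor, shaped loosely like cl_sync_io. */
struct sync_anchor {
	wait_queue_head_t	wq;
	atomic_t		nr_pending;
};

/*
 * Completion side: the final decrement happens under wq.lock, so the
 * wake-up and every store that preceded it are published by the
 * spin_unlock() (a release operation).
 */
static void sync_note(struct sync_anchor *a)
{
	if (atomic_dec_and_lock(&a->nr_pending, &a->wq.lock)) {
		wake_up_locked(&a->wq);
		spin_unlock(&a->wq.lock);
	}
}

/*
 * Wait side: the counter is only trusted while wq.lock is held.
 * Re-taking the lock (an acquire operation) orders this CPU after the
 * completion side's final unlock, so no stale state can be observed
 * once the counter is seen at zero.  A real implementation sleeps on
 * wq instead of spinning; the busy-wait just keeps the sketch short.
 */
static void sync_wait(struct sync_anchor *a)
{
	spin_lock(&a->wq.lock);
	while (atomic_read(&a->nr_pending) > 0) {
		spin_unlock(&a->wq.lock);
		cpu_relax();
		spin_lock(&a->wq.lock);
	}
	spin_unlock(&a->wq.lock);
}

Because both sides serialize on the same lock, the ordering guarantee is a
property of the locking primitive itself, which is why no smp_load_acquire()
or smp_store_release() is needed in this variant.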
On Tue, Jun 11, 2019 at 11:09 PM NeilBrown <neilb@suse.com> wrote:
> On Mon, Jun 10 2019, Shaun Tancheff wrote:
>
> > On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
> >
> >>>
> >> So I can just about see the possibility of the race that you think you
> >> have hit. But it couldn't happen on x86, as it has strong memory
> >> ordering.
> >>
> >> What CPU architecture did this ASSERT trigger on?
> >>
> >
> > Agree with everything above, unfortunately was aarch64.
>
> Ahh.
> OK, I'm now reasonably convinced that the patch probably fixes the bug.
> It is a shame that there is no standard barrier between the wake_up and
> the wait_event(), but there isn't and we need to work with that.
>
> But I still don't like the patch.... :-(
>
> I have a few problems with it.
>
> Firstly, I generally don't like accessor functions. They tend to hide
> useful information.
> Rather than cl_page_state_get() I would rather have
>
>   smp_load_acquire(&pg->cp_state)
>
> wherever it is needed, with a comment explaining why it is needed in
> that particular context.
> Some people seem to like accessor functions - there certainly are a lot
> of them in the kernel - but I don't think they help.
> Having an accessor function that just adds a barrier is, I think,
> particularly ugly as the comment can only be general in nature, and so
> doesn't really help the reader to understand why the barrier is needed.
> But even that isn't completely without precedent.
> "key_read_state()" in include/linux/key.h simply wraps
> smp_load_acquire(), and the comment there doesn't really tell me
> anything useful.
> "inet_sk_state_load" is slightly better: it says it is for use in places
> where the socket lock isn't held - and names some of those.

Understood and a fair assessment.

> My second problem is that having "get" in the accessor function name is
> potentially misleading: "get" normally has a matching "put", and
> usually increments a refcount. Having "read" or "load" in place of
> "get" in the accessor function name would remove my second objection.

Yeah, I was trying to balance with the set .. but really I'm not enamored
with that either :P

> My third objection is that the bug doesn't exist in the upstream client
> code. I'm not sure whether this is luck or good management, but either
> way it is gone.

This is really good to know. Personally I was itching to refactor the
whole lot but this gives me a much better path.

> cl_sync_io_wait() has been changed to always take csi_waitq.lock. This
> avoids the need for anchor->csi_barrier, and provides a more robust way
> to ensure that cl_sync_io_note() is completely finished before
> cl_sync_io_wait() completes.
>
> That isn't quite the outcome I was expecting...
>
> I'm sorry if I seem to have been rather harsh on your patch - my
> intention was only to make sure it was the best it could be.
>
> It really needed to have the stack trace and the arch where the LBUG
> happened to be at all complete.
> It would have been nice if it identified the particular code where the
> barrier was needed: between cl_sync_io_note() and cl_sync_io_wait(),
> because that mediates an ownership transition that happens while the VM
> page is locked.
>
> The rest are probably just issues of taste.
>
> Thanks,
> NeilBrown

Thanks, I really appreciate the time needed to consider the issue.
And I agree any patch needs to be as good and tasteful as we can make
them.

Regards,
Shaun
As an interested observer (thanks to both of you for an interesting
exchange), Shaun, what’s your plan going forward for the OpenSFS/WC
branch? It sounds like you’re thinking you’ll try to emulate what Neil
did upstream? (which sounds good to me, I always prefer avoiding
explicit memory barriers if reasonable) Will you be opening an LU for
this one?

Thanks,
- Patrick
On Wed, Jun 12, 2019 at 8:04 AM Patrick Farrell <pfarrell@whamcloud.com> wrote:
> As an interested observer (thanks to both of you for an interesting
> exchange), Shaun, what’s your plan going forward for the OpenSFS/WC
> branch? It sounds like you’re thinking you’ll try to emulate what Neil
> did upstream? (which sounds good to me, I always prefer avoiding
> explicit memory barriers if reasonable) Will you be opening an LU for
> this one?

Yes, I will be opening an LU for this. My plan is to emulate upstream.

> Thanks,
> - Patrick
> ------------------------------
> *From:* lustre-devel <lustre-devel-bounces@lists.lustre.org> on behalf of
> Shaun Tancheff <shaun@tancheff.com>
> *Sent:* Tuesday, June 11, 2019 11:36:13 PM
> *To:* NeilBrown
> *Cc:* Bobi Jam; lustre-devel@lists.lustre.org
> *Subject:* Re: [lustre-devel] [PATCH] Place a memory barrier around
> cp_state changes
>
> On Tue, Jun 11, 2019 at 11:09 PM NeilBrown <neilb@suse.com> wrote:
>
> On Mon, Jun 10 2019, Shaun Tancheff wrote:
>
> > On Mon, Jun 10, 2019 at 7:49 PM NeilBrown <neilb@suse.com> wrote:
> >
> >>>
> >> So I can just about see the possibility of the race that you think you
> >> have hit. But it couldn't happen on x86, as it has strong memory
> >> ordering.
> >>
> >> What CPU architecture did this ASSERT trigger on?
> >>
> >
> > Agree with everything above, unfortunately was aarch64.
>
> Ahh.
> OK, I'm now reasonably convinced that the patch probably fixes the bug.
> It is a shame that there is no standard barrier between the wake_up and
> the wait_event(), but there isn't and we need to work with that.
>
> But I still don't like the patch.... :-(
>
> I have a few problems with it.
>
> Firstly, I generally don't like accessor functions. They tend to hide
> useful information.
> Rather than cl_page_state_get() I would rather have
>
>   smp_load_acquire(&pg->cp_state)
>
> wherever it is needed, with a comment explaining why it is needed in
> that particular context.
> Some people seem to like accessor functions - there certainly are a lot
> of them in the kernel - but I don't think they help.
> Having an accessor function that just adds a barrier is, I think,
> particularly ugly as the comment can only be general in nature, and so
> doesn't really help the reader to understand why the barrier is needed.
> But even that isn't completely without precedent.
> "key_read_state()" in include/linux/key.h simply wraps
> smp_load_acquire(), and the comment there doesn't really tell me
> anything useful.
> "inet_sk_state_load" is slightly better: it says it is for use in places
> where the socket lock isn't held - and names some of those.
>
> Understood and a fair assessment.
>
> My second problem is that having "get" in the accessor function name is
> potentially misleading: "get" normally has a matching "put", and
> usually increments a refcount. Having "read" or "load" in place of
> "get" in the accessor function name would remove my second objection.
>
> Yeah, I was trying to balance with the set .. but really I'm not enamored
> with that either :P
>
> My third objection is that the bug doesn't exist in the upstream client
> code. I'm not sure whether this is luck or good management, but either
> way it is gone.
>
> This is really good to know. Personally I was itching to refactor the
> whole lot but this gives me a much better path.
>
> cl_sync_io_wait() has been changed to always take csi_waitq.lock. This
> avoids the need for anchor->csi_barrier, and provides a more robust way
> to ensure that cl_sync_io_note() is completely finished before
> cl_sync_io_wait() completes.
>
> That isn't quite the outcome I was expecting...
>
> I'm sorry if I seem to have been rather harsh on your patch - my
> intention was only to make sure it was the best it could be.
>
> It really needed to have the stack trace and the arch where the LBUG
> happened to be at all complete.
> It would have been nice if it identified the particular code where the
> barrier was needed: between cl_sync_io_note() and cl_sync_io_wait(),
> because that mediates an ownership transition that happens while the VM
> page is locked.
>
> The rest are probably just issues of taste.
>
> Thanks,
> NeilBrown
>
> Thanks, I really appreciate the time needed to consider the issue.
> And I agree any patch needs to be as good and tasteful as we can make
> them.
>
> Regards,
> Shaun
diff --git a/fs/lustre/include/cl_object.h b/fs/lustre/include/cl_object.h
index 691c2f5da53a..d6e1f6f05f50 100644
--- a/fs/lustre/include/cl_object.h
+++ b/fs/lustre/include/cl_object.h
@@ -752,6 +752,17 @@ struct cl_page {
 	struct cl_sync_io *cp_sync_io;
 };
 
+static inline enum cl_page_state cl_page_state_get(const struct cl_page *pg)
+{
+	/*
+	 * Paired with smp_store_release in cl_page_state_set_trust
+	 * and ensures that we see the most recent value of cp_state
+	 * even when the last modification was not performed on the
+	 * current processor
+	 */
+	return smp_load_acquire(&pg->cp_state);
+}
+
 /**
  * Per-layer part of cl_page.
  *
diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
index e4ce3b6f5772..364dec208ccd 100644
--- a/fs/lustre/llite/rw26.c
+++ b/fs/lustre/llite/rw26.c
@@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env *env, struct cl_io *io,
 
 		rc = cl_page_own(env, io, clp);
 		if (rc) {
-			LASSERT(clp->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
 			cl_page_put(env, clp);
 			break;
 		}
diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
index 590e5f5e43c9..38b8c488d765 100644
--- a/fs/lustre/llite/vvp_page.c
+++ b/fs/lustre/llite/vvp_page.c
@@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env *env,
 
 	lock_page(vmpage);
 	if (clear_page_dirty_for_io(vmpage)) {
-		LASSERT(pg->cp_state == CPS_CACHED);
+		LASSERT(cl_page_state_get(pg) == CPS_CACHED);
 		/* This actually clears the dirty bit in the radix tree. */
 		set_page_writeback(vmpage);
 		CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
-	} else if (pg->cp_state == CPS_PAGEOUT) {
+	} else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
 		/* is it possible for osc_flush_async_page() to already
 		 * make it ready?
 		 */
 		result = -EALREADY;
 	} else {
 		CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n",
-			      pg->cp_state);
+			      cl_page_state_get(pg));
 		LBUG();
 	}
 	unlock_page(vmpage);
diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
index 349f19e014e0..da4429b82932 100644
--- a/fs/lustre/obdclass/cl_page.c
+++ b/fs/lustre/obdclass/cl_page.c
@@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
 
 	PASSERT(env, page, list_empty(&page->cp_batch));
 	PASSERT(env, page, !page->cp_owner);
-	PASSERT(env, page, page->cp_state == CPS_FREEING);
+	PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
 
 	while ((slice = list_first_entry_or_null(&page->cp_layers,
 						 struct cl_page_slice,
@@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env, struct cl_page *page)
 static inline void cl_page_state_set_trust(struct cl_page *page,
 					   enum cl_page_state state)
 {
-	/* bypass const. */
-	*(enum cl_page_state *)&page->cp_state = state;
+	/*
+	 * Paired with smp_load_acquire in cl_page_state_get
+	 * and ensures that we see the most recent value of cp_state
+	 * is available even when the next access is not performed on the
+	 * current processor.
+	 * Note we also cast away const as the only modifier of cp_state.
+	 */
+	smp_store_release((enum cl_page_state *)&page->cp_state, state);
 }
 
 struct cl_page *cl_page_alloc(const struct lu_env *env,
@@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct lu_env *env,
 		}
 	};
 
-	old = page->cp_state;
+	old = cl_page_state_get(page);
 	PASSERT(env, page, allowed_transitions[old][state]);
 	CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
-	PASSERT(env, page, page->cp_state == old);
+	PASSERT(env, page, cl_page_state_get(page) == old);
 	PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
 	cl_page_state_set_trust(page, state);
 }
@@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct cl_page *page)
 		       refcount_read(&page->cp_ref));
 
 	if (refcount_dec_and_test(&page->cp_ref)) {
-		LASSERT(page->cp_state == CPS_FREEING);
+		LASSERT(cl_page_state_get(page) == CPS_FREEING);
 
 		LASSERT(refcount_read(&page->cp_ref) == 0);
 		PASSERT(env, page, !page->cp_owner);
@@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,
 	const struct cl_page_slice *slice;
 	enum cl_page_state state;
 
-	state = pg->cp_state;
+	state = cl_page_state_get(pg);
 	cl_page_owner_clear(pg);
 
 	if (state == CPS_OWNED)
@@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
 	struct cl_io *top = cl_io_top((struct cl_io *)io);
 
 	LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
-	return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
+	return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
 }
 EXPORT_SYMBOL(cl_page_is_owned);
 
@@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
 
 	io = cl_io_top(io);
 
-	if (pg->cp_state == CPS_FREEING) {
+	if (cl_page_state_get(pg) == CPS_FREEING) {
 		result = -ENOENT;
 		goto out;
 	}
@@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env, struct cl_io *io,
 		PASSERT(env, pg, !pg->cp_owner);
 		pg->cp_owner = cl_io_top(io);
 		cl_page_owner_set(pg);
-		if (pg->cp_state != CPS_FREEING) {
+		if (cl_page_state_get(pg) != CPS_FREEING) {
 			cl_page_state_set(env, pg, CPS_OWNED);
 		} else {
 			__cl_page_disown(env, io, pg);
@@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env *env, struct cl_page *pg)
 {
 	const struct cl_page_slice *slice;
 
-	PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+	PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
 
 	/*
 	 * Sever all ways to obtain new pointers to @pg.
@@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,
 	const struct cl_page_slice *slice;
 
 	PASSERT(env, pg, crt < CRT_NR);
-	PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+	PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
 
 	CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
 
@@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
 	}
 
 	if (result >= 0) {
-		PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+		PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
 		cl_page_io_start(env, pg, crt);
 		result = 0;
 	}
@@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env, void *cookie,
 	(*printer)(env, cookie,
 		   "page@%p[%d %p %d %d %p]\n",
 		   pg, refcount_read(&pg->cp_ref), pg->cp_obj,
-		   pg->cp_state, pg->cp_type,
+		   cl_page_state_get(pg), pg->cp_type,
 		   pg->cp_owner);
 }
 EXPORT_SYMBOL(cl_page_header_print);
diff --git a/fs/lustre/obdecho/echo_client.c b/fs/lustre/obdecho/echo_client.c
index 317123fd27cb..d879f109e641 100644
--- a/fs/lustre/obdecho/echo_client.c
+++ b/fs/lustre/obdecho/echo_client.c
@@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
 
 		rc = cl_page_own(env, io, clp);
 		if (rc) {
-			LASSERT(clp->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(clp) == CPS_FREEING);
 			cl_page_put(env, clp);
 			break;
 		}
diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
index f8fddbfe6a7e..75984b98b229 100644
--- a/fs/lustre/osc/osc_cache.c
+++ b/fs/lustre/osc/osc_cache.c
@@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 			cl_page_discard(env, io, page);
 			cl_page_disown(env, io, page);
 		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(page) == CPS_FREEING);
 			LASSERT(0);
 		}
 
@@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
 	int srvlock;
 
 	cmd &= ~OBD_BRW_NOQUOTA;
-	LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
-		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
-	LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
-		 "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
+	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
+		      cmd == OBD_BRW_READ),
+		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
+	LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
+		      cmd == OBD_BRW_WRITE),
+		 "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
 	LASSERT(opg->ops_transfer_pinned);
 
 	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
@@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
 
 			page = ops->ops_cl.cpl_page;
 			LASSERT(page->cp_type == CPT_CACHEABLE);
-			if (page->cp_state == CPS_FREEING)
+			if (cl_page_state_get(page) == CPS_FREEING)
 				continue;
 
 			cl_page_get(page);
@@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
 			cl_page_discard(env, io, page);
 			cl_page_disown(env, io, page);
 		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(page) == CPS_FREEING);
 		}
 	}
 
@@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env, struct cl_io *io,
 			cl_page_discard(env, io, page);
 			cl_page_disown(env, io, page);
 		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
+			LASSERT(cl_page_state_get(page) == CPS_FREEING);
 		}
 
 	return true;
Ensure all uses of cl_page->cp_state are smp-safe.

LBUG Output:
In __cl_page_state_set()
..
  old = page->cp_state;
  PASSERT(env, page, allowed_transitions[old][state]);
..
Asserted with the following:
  page@ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]

However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition,
leading to the conclusion that cp_state became 0 during the assertion.

Signed-off-by: Shaun Tancheff <stancheff@cray.com>
---
 fs/lustre/include/cl_object.h   | 11 +++++++++++
 fs/lustre/llite/rw26.c          |  2 +-
 fs/lustre/llite/vvp_page.c      |  6 +++---
 fs/lustre/obdclass/cl_page.c    | 34 +++++++++++++++++++--------------
 fs/lustre/obdecho/echo_client.c |  2 +-
 fs/lustre/osc/osc_cache.c       | 18 +++++++++--------
 6 files changed, 46 insertions(+), 27 deletions(-)
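For context, the accessor pair this patch introduces is an instance of the
standard release/acquire publication pattern. A condensed sketch, with
illustrative field names rather than the real struct cl_page, of the
guarantee that pairing provides:

#include <linux/compiler.h>
#include <asm/barrier.h>

/* Illustrative only - not the real struct cl_page. */
struct page_like {
	int	state;
	void	*owner;
};

/*
 * Writer: update the payload, then release-store the state.  Every
 * store before the release is visible to any reader that observes the
 * new state via an acquire load.
 */
static void publish_state(struct page_like *p, void *owner, int new_state)
{
	p->owner = owner;
	smp_store_release(&p->state, new_state);
}

/*
 * Reader: the acquire load orders all later reads after it, so once
 * the new state is seen, 'owner' is guaranteed to be current too.
 */
static int read_published_state(struct page_like *p, void **owner)
{
	int s = smp_load_acquire(&p->state);

	*owner = p->owner;	/* safe: ordered after the acquire */
	return s;
}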