dax: Fix missed PMD wakeups
diff mbox series

Message ID 156213869409.3910140.7715747316991468148.stgit@dwillia2-desk3.amr.corp.intel.com
State New
Headers show
Series
  • dax: Fix missed PMD wakeups
Related show

Commit Message

Dan Williams July 3, 2019, 7:24 a.m. UTC
Ever since the conversion of DAX to the Xarray a RocksDB benchmark has
been encountering intermittent lockups. In the failing case a thread
that is taking a PMD-fault is awaiting a wakeup while holding the
'mmap_sem' for read. As soon as the next mmap() event occurs that tries
to take the 'mmap_sem' for write it causes ps(1)  and any new 'mmap_sem'
reader to block.

Debug shows that there are no outstanding Xarray entry-lock holders in
the hang state which indicates that a PTE lock-holder thread caused a
PMD thread to wait. When the PTE index-lock is released it may wake the
wrong waitqueue depending on how the index hashes. Brute-force fix this
by arranging for PTE-aligned indices within a PMD-span to hash to the
same waitqueue as the PMD-index.

This fix may increase waitqueue contention, but a fix for that is saved
for a larger rework. In the meantime this fix is suitable for -stable
backports.

Link: https://lore.kernel.org/linux-fsdevel/CAPcyv4hwHpX-MkUEqxwdTj7wCCZCN4RV-L4jsnuwLGyL_UEG4A@mail>
Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray")
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Boaz Harrosh <openosd@gmail.com>
Cc: <stable@vger.kernel.org>
Reported-by: Robert Barror <robert.barror@intel.com>
Reported-by: Seema Pandit <seema.pandit@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c |   34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

Comments

Matthew Wilcox July 3, 2019, 12:17 p.m. UTC | #1
On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> This fix may increase waitqueue contention, but a fix for that is saved
> for a larger rework. In the meantime this fix is suitable for -stable
> backports.

I think this is too big for what it is; just the two-line patch to stop
incorporating the low bits of the PTE would be more appropriate.
Dan Williams July 3, 2019, 5:01 p.m. UTC | #2
On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> > This fix may increase waitqueue contention, but a fix for that is saved
> > for a larger rework. In the meantime this fix is suitable for -stable
> > backports.
>
> I think this is too big for what it is; just the two-line patch to stop
> incorporating the low bits of the PTE would be more appropriate.

Sufficient, yes, "appropriate", not so sure. All those comments about
pmd entry size are stale after this change.
Matthew Wilcox July 3, 2019, 7:53 p.m. UTC | #3
On Wed, Jul 03, 2019 at 10:01:37AM -0700, Dan Williams wrote:
> On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> > > This fix may increase waitqueue contention, but a fix for that is saved
> > > for a larger rework. In the meantime this fix is suitable for -stable
> > > backports.
> >
> > I think this is too big for what it is; just the two-line patch to stop
> > incorporating the low bits of the PTE would be more appropriate.
> 
> Sufficient, yes, "appropriate", not so sure. All those comments about
> pmd entry size are stale after this change.

But then they'll have to be put back in again.  This seems to be working
for me, although I doubt I'm actually hitting the edge case that rocksdb
hits:

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..e77bd6aef10c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -198,6 +198,10 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * if it did.
  *
  * Must be called with the i_pages lock held.
+ *
+ * If the xa_state refers to a larger entry, then it may return a locked
+ * smaller entry (eg a PTE entry) without waiting for the smaller entry
+ * to be unlocked.
  */
 static void *get_unlocked_entry(struct xa_state *xas)
 {
@@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 	for (;;) {
 		entry = xas_find_conflict(xas);
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
-				!dax_is_locked(entry))
+				!dax_is_locked(entry) ||
+				dax_entry_order(entry) < xas_get_order(xas))
 			return entry;
 
 		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
@@ -253,8 +258,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
-	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	/*
+	 * If we were the only waiter woken, wake the next one.
+	 * Do not wake anybody if the entry is locked; that indicates
+	 * we weren't woken.
+	 */
+	if (entry && !dax_is_locked(entry))
 		dax_wake_entry(xas, entry, false);
 }
 
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 052e06ff4c36..b17289d92af4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1529,6 +1529,27 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
 #endif
 }
 
+/**
+ * xas_get_order() - Get the order of the entry being operated on.
+ * @xas: XArray operation state.
+ *
+ * Return: The order of the entry.
+ */
+static inline unsigned int xas_get_order(const struct xa_state *xas)
+{
+	unsigned int order = xas->xa_shift;
+
+#ifdef CONFIG_XARRAY_MULTI
+	unsigned int sibs = xas->xa_sibs;
+
+	while (sibs) {
+		order++;
+		sibs /= 2;
+	}
+#endif
+	return order;
+}
+
 /**
  * xas_set_update() - Set up XArray operation state for a callback.
  * @xas: XArray operation state.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 7df4f7f395bf..af024477ec93 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -95,6 +95,17 @@ static noinline void check_xa_err(struct xarray *xa)
 //	XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
 }
 
+static noinline void check_xas_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+	unsigned int i;
+
+	for (i = 0; i < sizeof(long) * 8; i++) {
+		xas_set_order(&xas, 0, i);
+		XA_BUG_ON(xa, xas_get_order(&xas) != i);
+	}
+}
+
 static noinline void check_xas_retry(struct xarray *xa)
 {
 	XA_STATE(xas, xa, 0);
@@ -1583,6 +1594,7 @@ static DEFINE_XARRAY(array);
 static int xarray_checks(void)
 {
 	check_xa_err(&array);
+	check_xas_order(&array);
 	check_xas_retry(&array);
 	check_xa_load(&array);
 	check_xa_mark(&array);
Dan Williams July 3, 2019, 9:28 p.m. UTC | #4
On Wed, Jul 3, 2019 at 12:53 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jul 03, 2019 at 10:01:37AM -0700, Dan Williams wrote:
> > On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> > > > This fix may increase waitqueue contention, but a fix for that is saved
> > > > for a larger rework. In the meantime this fix is suitable for -stable
> > > > backports.
> > >
> > > I think this is too big for what it is; just the two-line patch to stop
> > > incorporating the low bits of the PTE would be more appropriate.
> >
> > Sufficient, yes, "appropriate", not so sure. All those comments about
> > pmd entry size are stale after this change.
>
> But then they'll have to be put back in again.  This seems to be working
> for me, although I doubt I'm actually hitting the edge case that rocksdb
> hits:

Seems to be holding up under testing here, a couple comments...

>
> diff --git a/fs/dax.c b/fs/dax.c
> index 2e48c7ebb973..e77bd6aef10c 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -198,6 +198,10 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
>   * if it did.
>   *
>   * Must be called with the i_pages lock held.
> + *
> + * If the xa_state refers to a larger entry, then it may return a locked
> + * smaller entry (eg a PTE entry) without waiting for the smaller entry
> + * to be unlocked.
>   */
>  static void *get_unlocked_entry(struct xa_state *xas)
>  {
> @@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
>         for (;;) {
>                 entry = xas_find_conflict(xas);
>                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
> -                               !dax_is_locked(entry))
> +                               !dax_is_locked(entry) ||
> +                               dax_entry_order(entry) < xas_get_order(xas))

Doesn't this potentially allow a locked entry to be returned for a
caller that expects all value entries are unlocked?

>                         return entry;
>
>                 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
> @@ -253,8 +258,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
>
>  static void put_unlocked_entry(struct xa_state *xas, void *entry)
>  {
> -       /* If we were the only waiter woken, wake the next one */
> -       if (entry)
> +       /*
> +        * If we were the only waiter woken, wake the next one.
> +        * Do not wake anybody if the entry is locked; that indicates
> +        * we weren't woken.
> +        */
> +       if (entry && !dax_is_locked(entry))
>                 dax_wake_entry(xas, entry, false);
>  }
>
> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> index 052e06ff4c36..b17289d92af4 100644
> --- a/include/linux/xarray.h
> +++ b/include/linux/xarray.h
> @@ -1529,6 +1529,27 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
>  #endif
>  }
>
> +/**
> + * xas_get_order() - Get the order of the entry being operated on.
> + * @xas: XArray operation state.
> + *
> + * Return: The order of the entry.
> + */
> +static inline unsigned int xas_get_order(const struct xa_state *xas)
> +{
> +       unsigned int order = xas->xa_shift;
> +
> +#ifdef CONFIG_XARRAY_MULTI
> +       unsigned int sibs = xas->xa_sibs;
> +
> +       while (sibs) {
> +               order++;
> +               sibs /= 2;
> +       }

Use ilog2() here?
Matthew Wilcox July 4, 2019, 3:27 a.m. UTC | #5
On Wed, Jul 03, 2019 at 02:28:41PM -0700, Dan Williams wrote:
> On Wed, Jul 3, 2019 at 12:53 PM Matthew Wilcox <willy@infradead.org> wrote:
> > @@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
> >         for (;;) {
> >                 entry = xas_find_conflict(xas);
> >                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
> > -                               !dax_is_locked(entry))
> > +                               !dax_is_locked(entry) ||
> > +                               dax_entry_order(entry) < xas_get_order(xas))
> 
> Doesn't this potentially allow a locked entry to be returned for a
> caller that expects all value entries are unlocked?

It only allows locked entries to be returned for callers which pass in
an xas which refers to a PMD entry.  This is fine for grab_mapping_entry()
because it checks size_flag & is_pte_entry.

dax_layout_busy_page() only uses 0-order.
__dax_invalidate_entry() only uses 0-order.
dax_writeback_one() needs an extra fix:

                /* Did a PMD entry get split? */
                if (dax_is_locked(entry))
                        goto put_unlocked;

dax_insert_pfn_mkwrite() checks for a mismatch of pte vs pmd.

So I think we're good for all current users.

> > +#ifdef CONFIG_XARRAY_MULTI
> > +       unsigned int sibs = xas->xa_sibs;
> > +
> > +       while (sibs) {
> > +               order++;
> > +               sibs /= 2;
> > +       }
> 
> Use ilog2() here?

Thought about it.  sibs is never going to be more than 31, so I don't
know that it's worth eliminating 5 add/shift pairs in favour of whatever
the ilog2 instruction is on a given CPU.  In practice, on x86, sibs is
going to be either 0 (PTEs) or 7 (PMDs).  We could also avoid even having
this function by passing PMD_ORDER or PTE_ORDER into get_unlocked_entry().

It's probably never going to be noticable in this scenario because it's
the very last thing checked before we put ourselves on a waitqueue and
go to sleep.
Boaz Harrosh July 4, 2019, 1 p.m. UTC | #6
On 04/07/2019 06:27, Matthew Wilcox wrote:
> On Wed, Jul 03, 2019 at 02:28:41PM -0700, Dan Williams wrote:
<>
>>> +#ifdef CONFIG_XARRAY_MULTI
>>> +       unsigned int sibs = xas->xa_sibs;
>>> +
>>> +       while (sibs) {
>>> +               order++;
>>> +               sibs /= 2;
>>> +       }
>>
>> Use ilog2() here?
> 
> Thought about it.  sibs is never going to be more than 31, so I don't
> know that it's worth eliminating 5 add/shift pairs in favour of whatever
> the ilog2 instruction is on a given CPU.  In practice, on x86, sibs is
> going to be either 0 (PTEs) or 7 (PMDs).  We could also avoid even having
> this function by passing PMD_ORDER or PTE_ORDER into get_unlocked_entry().
> 
> It's probably never going to be noticable in this scenario because it's
> the very last thing checked before we put ourselves on a waitqueue and
> go to sleep.
> 

Matthew you must be kidding an ilog2 in binary is zero clocks
(Return the highest bit or something like that)

In any way. It took me 5 minutes to understand what you are doing
here. And I only fully got it when Dan gave his comment. So please for
the sake of stupid guys like me could you please make it ilog2() so
to make it easier to understand?
(And please don't do the compiler's job. If in some arch the loop
 is the fastest let the compiler decide?)

Thanks
Boaz
Matthew Wilcox July 4, 2019, 1:58 p.m. UTC | #7
On Thu, Jul 04, 2019 at 04:00:00PM +0300, Boaz Harrosh wrote:
> On 04/07/2019 06:27, Matthew Wilcox wrote:
> > On Wed, Jul 03, 2019 at 02:28:41PM -0700, Dan Williams wrote:
> >>> +#ifdef CONFIG_XARRAY_MULTI
> >>> +       unsigned int sibs = xas->xa_sibs;
> >>> +
> >>> +       while (sibs) {
> >>> +               order++;
> >>> +               sibs /= 2;
> >>> +       }
> >>
> >> Use ilog2() here?
> > 
> > Thought about it.  sibs is never going to be more than 31, so I don't
> > know that it's worth eliminating 5 add/shift pairs in favour of whatever
> > the ilog2 instruction is on a given CPU.  In practice, on x86, sibs is
> > going to be either 0 (PTEs) or 7 (PMDs).  We could also avoid even having
> > this function by passing PMD_ORDER or PTE_ORDER into get_unlocked_entry().
> > 
> > It's probably never going to be noticable in this scenario because it's
> > the very last thing checked before we put ourselves on a waitqueue and
> > go to sleep.
> 
> Matthew you must be kidding an ilog2 in binary is zero clocks
> (Return the highest bit or something like that)

You might want to actually check the documentation instead of just
making shit up.

https://www.agner.org/optimize/instruction_tables.pdf

I think in this instance what we want is BSR (aka ffz) since the input is
going to be one of 0, 1, 3, 7, 15 or 31 (and we want 0, 1, 2, 3, 4, 5 as
results).

> In any way. It took me 5 minutes to understand what you are doing
> here. And I only fully got it when Dan gave his comment. So please for
> the sake of stupid guys like me could you please make it ilog2() so
> to make it easier to understand?
> (And please don't do the compiler's job. If in some arch the loop
>  is the fastest let the compiler decide?)

The compiler doesn't know the range of 'sibs'.  Unless we do the
profile-feedback thing.
Boaz Harrosh July 4, 2019, 2:32 p.m. UTC | #8
On 04/07/2019 16:58, Matthew Wilcox wrote:
> On Thu, Jul 04, 2019 at 04:00:00PM +0300, Boaz Harrosh wrote:
<>
>> Matthew you must be kidding an ilog2 in binary is zero clocks
>> (Return the highest bit or something like that)
> 
> You might want to actually check the documentation instead of just
> making shit up.
> 

Yes you are right I stand corrected. Must be smoking ;-)

> https://www.agner.org/optimize/instruction_tables.pdf
> 
> I think in this instance what we want is BSR (aka ffz) since the input is
> going to be one of 0, 1, 3, 7, 15 or 31 (and we want 0, 1, 2, 3, 4, 5 as
> results).
> 
<>
> The compiler doesn't know the range of 'sibs'.  Unless we do the
> profile-feedback thing.
> 

Would you please consider the use of get_order() macro from #include <getorder.h>
Just for the sake of understanding? (Or at least a comment)

Thanks
Boaz
Jan Kara July 4, 2019, 4:54 p.m. UTC | #9
On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> On Wed, Jul 03, 2019 at 02:28:41PM -0700, Dan Williams wrote:
> > On Wed, Jul 3, 2019 at 12:53 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > @@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
> > >         for (;;) {
> > >                 entry = xas_find_conflict(xas);
> > >                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
> > > -                               !dax_is_locked(entry))
> > > +                               !dax_is_locked(entry) ||
> > > +                               dax_entry_order(entry) < xas_get_order(xas))
> > 
> > Doesn't this potentially allow a locked entry to be returned for a
> > caller that expects all value entries are unlocked?
> 
> It only allows locked entries to be returned for callers which pass in
> an xas which refers to a PMD entry.  This is fine for grab_mapping_entry()
> because it checks size_flag & is_pte_entry.
> 
> dax_layout_busy_page() only uses 0-order.
> __dax_invalidate_entry() only uses 0-order.
> dax_writeback_one() needs an extra fix:
> 
>                 /* Did a PMD entry get split? */
>                 if (dax_is_locked(entry))
>                         goto put_unlocked;
> 
> dax_insert_pfn_mkwrite() checks for a mismatch of pte vs pmd.
> 
> So I think we're good for all current users.

Agreed but it is an ugly trap. As I already said, I'd rather pay the
unnecessary cost of waiting for pte entry and have an easy to understand
interface. If we ever have a real world use case that would care for this
optimization, we will need to refactor functions to make this possible and
still keep the interfaces sane. For example get_unlocked_entry() could
return special "error code" indicating that there's no entry with matching
order in xarray but there's a conflict with it. That would be much less
error-prone interface.

								Honza
Matthew Wilcox July 4, 2019, 7:14 p.m. UTC | #10
On Thu, Jul 04, 2019 at 06:54:50PM +0200, Jan Kara wrote:
> On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> > So I think we're good for all current users.
> 
> Agreed but it is an ugly trap. As I already said, I'd rather pay the
> unnecessary cost of waiting for pte entry and have an easy to understand
> interface. If we ever have a real world use case that would care for this
> optimization, we will need to refactor functions to make this possible and
> still keep the interfaces sane. For example get_unlocked_entry() could
> return special "error code" indicating that there's no entry with matching
> order in xarray but there's a conflict with it. That would be much less
> error-prone interface.

This is an internal interface.  I think it's already a pretty gnarly
interface to use by definition -- it's going to sleep and might return
almost anything.  There's not much scope for returning an error indicator
either; value entries occupy half of the range (all odd numbers between 1
and ULONG_MAX inclusive), plus NULL.  We could use an internal entry, but
I don't think that makes the interface any easier to use than returning
a locked entry.

I think this iteration of the patch makes it a little clearer.  What do you
think?

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..398b601259f9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -198,8 +198,11 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * if it did.
  *
  * Must be called with the i_pages lock held.
+ *
+ * If order is non-zero, then a locked smaller entry (eg a PTE entry)
+ * may be returned.
  */
-static void *get_unlocked_entry(struct xa_state *xas)
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
 	void *entry;
 	struct wait_exceptional_entry_queue ewait;
@@ -211,7 +214,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 	for (;;) {
 		entry = xas_find_conflict(xas);
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
-				!dax_is_locked(entry))
+				!dax_is_locked(entry) ||
+				dax_entry_order(entry) < order)
 			return entry;
 
 		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
@@ -253,8 +257,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
-	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	/*
+	 * If we were the only waiter woken, wake the next one.
+	 * Do not wake anybody if the entry is locked; that indicates
+	 * we weren't woken.
+	 */
+	if (entry && !dax_is_locked(entry))
 		dax_wake_entry(xas, entry, false);
 }
 
@@ -461,7 +469,7 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
  * overlap with xarray value entries.
  */
 static void *grab_mapping_entry(struct xa_state *xas,
-		struct address_space *mapping, unsigned long size_flag)
+		struct address_space *mapping, unsigned int order)
 {
 	unsigned long index = xas->xa_index;
 	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
@@ -469,7 +477,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
 
 retry:
 	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas);
+	entry = get_unlocked_entry(xas, order);
 
 	if (entry) {
 		if (!xa_is_value(entry)) {
@@ -477,7 +485,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
 			goto out_unlock;
 		}
 
-		if (size_flag & DAX_PMD) {
+		if (order == PMD_ORDER) {
 			if (dax_is_pte_entry(entry)) {
 				put_unlocked_entry(xas, entry);
 				goto fallback;
@@ -523,7 +531,10 @@ static void *grab_mapping_entry(struct xa_state *xas,
 	if (entry) {
 		dax_lock_entry(xas, entry);
 	} else {
-		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		unsigned long flags = DAX_EMPTY;
+		if (order > 0)
+			flags |= DAX_PMD;
+		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
 		dax_lock_entry(xas, entry);
 		if (xas_error(xas))
 			goto out_unlock;
@@ -594,7 +605,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
 		if (WARN_ON_ONCE(!xa_is_value(entry)))
 			continue;
 		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas);
+			entry = get_unlocked_entry(&xas, 0);
 		if (entry)
 			page = dax_busy_page(entry);
 		put_unlocked_entry(&xas, entry);
@@ -621,7 +632,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	void *entry;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, 0);
 	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
@@ -849,7 +860,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 	if (unlikely(dax_is_locked(entry))) {
 		void *old_entry = entry;
 
-		entry = get_unlocked_entry(xas);
+		entry = get_unlocked_entry(xas, dax_entry_order(entry));
 
 		/* Entry got punched out / reallocated? */
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -861,6 +872,9 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 		 */
 		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
 			goto put_unlocked;
+		/* Did a PMD entry get split? */
+		if (dax_is_locked(entry))
+			goto put_unlocked;
 		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 					dax_is_zero_entry(entry))) {
 			ret = -EIO;
@@ -1510,7 +1524,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * entry is already in the array, for instance), it will return
 	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
 	if (xa_is_internal(entry)) {
 		result = xa_to_internal(entry);
 		goto fallback;
@@ -1659,7 +1673,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	vm_fault_t ret;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
 	if (!entry ||
 	    (order == 0 && !dax_is_pte_entry(entry)) ||
Dan Williams July 4, 2019, 11:27 p.m. UTC | #11
On Thu, Jul 4, 2019 at 12:14 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Jul 04, 2019 at 06:54:50PM +0200, Jan Kara wrote:
> > On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> > > So I think we're good for all current users.
> >
> > Agreed but it is an ugly trap. As I already said, I'd rather pay the
> > unnecessary cost of waiting for pte entry and have an easy to understand
> > interface. If we ever have a real world use case that would care for this
> > optimization, we will need to refactor functions to make this possible and
> > still keep the interfaces sane. For example get_unlocked_entry() could
> > return special "error code" indicating that there's no entry with matching
> > order in xarray but there's a conflict with it. That would be much less
> > error-prone interface.
>
> This is an internal interface.  I think it's already a pretty gnarly
> interface to use by definition -- it's going to sleep and might return
> almost anything.  There's not much scope for returning an error indicator
> either; value entries occupy half of the range (all odd numbers between 1
> and ULONG_MAX inclusive), plus NULL.  We could use an internal entry, but
> I don't think that makes the interface any easier to use than returning
> a locked entry.
>
> I think this iteration of the patch makes it a little clearer.  What do you
> think?
>

Not much clearer to me. get_unlocked_entry() is now misnamed and this
arrangement allows for mismatches of @order argument vs @xas
configuration. Can you describe, or even better demonstrate with
numbers, why it's better to carry this complication than just
converging the waitqueues between the types?
Matthew Wilcox July 5, 2019, 7:10 p.m. UTC | #12
On Thu, Jul 04, 2019 at 04:27:14PM -0700, Dan Williams wrote:
> On Thu, Jul 4, 2019 at 12:14 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Thu, Jul 04, 2019 at 06:54:50PM +0200, Jan Kara wrote:
> > > On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> > > > So I think we're good for all current users.
> > >
> > > Agreed but it is an ugly trap. As I already said, I'd rather pay the
> > > unnecessary cost of waiting for pte entry and have an easy to understand
> > > interface. If we ever have a real world use case that would care for this
> > > optimization, we will need to refactor functions to make this possible and
> > > still keep the interfaces sane. For example get_unlocked_entry() could
> > > return special "error code" indicating that there's no entry with matching
> > > order in xarray but there's a conflict with it. That would be much less
> > > error-prone interface.
> >
> > This is an internal interface.  I think it's already a pretty gnarly
> > interface to use by definition -- it's going to sleep and might return
> > almost anything.  There's not much scope for returning an error indicator
> > either; value entries occupy half of the range (all odd numbers between 1
> > and ULONG_MAX inclusive), plus NULL.  We could use an internal entry, but
> > I don't think that makes the interface any easier to use than returning
> > a locked entry.
> >
> > I think this iteration of the patch makes it a little clearer.  What do you
> > think?
> >
> 
> Not much clearer to me. get_unlocked_entry() is now misnamed and this

misnamed?  You'd rather it was called "try_get_unlocked_entry()"?

> arrangement allows for mismatches of @order argument vs @xas
> configuration.

> Can you describe, or even better demonstrate with
> numbers, why it's better to carry this complication than just
> converging the waitqueues between the types?

You've got the reproducer ;-)  It seems quite wrong to make a page fault
stall just because another task is working on a different page in the
same 2MB chunk.
Dan Williams July 5, 2019, 8:47 p.m. UTC | #13
On Fri, Jul 5, 2019 at 12:10 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Jul 04, 2019 at 04:27:14PM -0700, Dan Williams wrote:
> > On Thu, Jul 4, 2019 at 12:14 PM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Thu, Jul 04, 2019 at 06:54:50PM +0200, Jan Kara wrote:
> > > > On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> > > > > So I think we're good for all current users.
> > > >
> > > > Agreed but it is an ugly trap. As I already said, I'd rather pay the
> > > > unnecessary cost of waiting for pte entry and have an easy to understand
> > > > interface. If we ever have a real world use case that would care for this
> > > > optimization, we will need to refactor functions to make this possible and
> > > > still keep the interfaces sane. For example get_unlocked_entry() could
> > > > return special "error code" indicating that there's no entry with matching
> > > > order in xarray but there's a conflict with it. That would be much less
> > > > error-prone interface.
> > >
> > > This is an internal interface.  I think it's already a pretty gnarly
> > > interface to use by definition -- it's going to sleep and might return
> > > almost anything.  There's not much scope for returning an error indicator
> > > either; value entries occupy half of the range (all odd numbers between 1
> > > and ULONG_MAX inclusive), plus NULL.  We could use an internal entry, but
> > > I don't think that makes the interface any easier to use than returning
> > > a locked entry.
> > >
> > > I think this iteration of the patch makes it a little clearer.  What do you
> > > think?
> > >
> >
> > Not much clearer to me. get_unlocked_entry() is now misnamed and this
>
> misnamed?  You'd rather it was called "try_get_unlocked_entry()"?

I was thinking more along the lines of
get_unlocked_but_sometimes_locked_entry(), i.e. per Jan's feedback to
keep the interface simple.
Jan Kara July 10, 2019, 7:02 p.m. UTC | #14
On Fri 05-07-19 13:47:02, Dan Williams wrote:
> On Fri, Jul 5, 2019 at 12:10 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Thu, Jul 04, 2019 at 04:27:14PM -0700, Dan Williams wrote:
> > > On Thu, Jul 4, 2019 at 12:14 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Thu, Jul 04, 2019 at 06:54:50PM +0200, Jan Kara wrote:
> > > > > On Wed 03-07-19 20:27:28, Matthew Wilcox wrote:
> > > > > > So I think we're good for all current users.
> > > > >
> > > > > Agreed but it is an ugly trap. As I already said, I'd rather pay the
> > > > > unnecessary cost of waiting for pte entry and have an easy to understand
> > > > > interface. If we ever have a real world use case that would care for this
> > > > > optimization, we will need to refactor functions to make this possible and
> > > > > still keep the interfaces sane. For example get_unlocked_entry() could
> > > > > return special "error code" indicating that there's no entry with matching
> > > > > order in xarray but there's a conflict with it. That would be much less
> > > > > error-prone interface.
> > > >
> > > > This is an internal interface.  I think it's already a pretty gnarly
> > > > interface to use by definition -- it's going to sleep and might return
> > > > almost anything.  There's not much scope for returning an error indicator
> > > > either; value entries occupy half of the range (all odd numbers between 1
> > > > and ULONG_MAX inclusive), plus NULL.  We could use an internal entry, but
> > > > I don't think that makes the interface any easier to use than returning
> > > > a locked entry.
> > > >
> > > > I think this iteration of the patch makes it a little clearer.  What do you
> > > > think?
> > > >
> > >
> > > Not much clearer to me. get_unlocked_entry() is now misnamed and this
> >
> > misnamed?  You'd rather it was called "try_get_unlocked_entry()"?
> 
> I was thinking more along the lines of
> get_unlocked_but_sometimes_locked_entry(), i.e. per Jan's feedback to
> keep the interface simple.

So how about the attached patch? That keeps the interface sane and passes a
smoketest for me (full fstest run running). Obviously it also needs a
proper changelog...

								Honza
Matthew Wilcox July 10, 2019, 8:15 p.m. UTC | #15
On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> +#define DAX_ENTRY_CONFLICT dax_make_entry(pfn_to_pfn_t(1), DAX_EMPTY)

I was hoping to get rid of DAX_EMPTY ... it's almost unused now.  Once
we switch to having a single DAX_LOCK value instead of a single bit,
I think it can go away, freeing up two bits.

If you really want a special DAX_ENTRY_CONFLICT, I think we can make
one in the 2..4094 range.

That aside, this looks pretty similar to the previous patch I sent, so
if you're now happy with this, let's add

#define XA_DAX_CONFLICT_ENTRY xa_mk_internal(258)

to xarray.h and do it that way?
Jan Kara July 10, 2019, 8:26 p.m. UTC | #16
On Wed 10-07-19 13:15:39, Matthew Wilcox wrote:
> On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> > +#define DAX_ENTRY_CONFLICT dax_make_entry(pfn_to_pfn_t(1), DAX_EMPTY)
> 
> I was hoping to get rid of DAX_EMPTY ... it's almost unused now.  Once
> we switch to having a single DAX_LOCK value instead of a single bit,
> I think it can go away, freeing up two bits.
> 
> If you really want a special DAX_ENTRY_CONFLICT, I think we can make
> one in the 2..4094 range.
> 
> That aside, this looks pretty similar to the previous patch I sent, so
> if you're now happy with this, let's add
> 
> #define XA_DAX_CONFLICT_ENTRY xa_mk_internal(258)
> 
> to xarray.h and do it that way?

Yeah, that would work for me as well. The chosen value for DAX_ENTRY_CONFLICT
was pretty arbitrary. Or we could possibly use:

#define DAX_ENTRY_CONFLICT XA_ZERO_ENTRY

so that we don't leak DAX-specific internal definition into xarray.h?

								Honza
Matthew Wilcox July 11, 2019, 3:08 a.m. UTC | #17
On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> @@ -848,7 +853,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
>  	if (unlikely(dax_is_locked(entry))) {
>  		void *old_entry = entry;
>  
> -		entry = get_unlocked_entry(xas);
> +		entry = get_unlocked_entry(xas, 0);
>  
>  		/* Entry got punched out / reallocated? */
>  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))

I'm not sure about this one.  Are we sure there will never be a dirty
PMD entry?  Even if we can't create one today, it feels like a bit of
a landmine to leave for someone who creates one in the future.
Matthew Wilcox July 11, 2019, 3:35 a.m. UTC | #18
On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> So how about the attached patch? That keeps the interface sane and passes a
> smoketest for me (full fstest run running). Obviously it also needs a
> proper changelog...

Changelog and slightly massaged version along the lines of my two comments
attached.
From 57b63fdd38e7bea7eb8d6332f0163fb028570def Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 3 Jul 2019 23:21:25 -0400
Subject: [PATCH] dax: Fix missed wakeup with PMD faults

RocksDB can hang indefinitely when using a DAX file.  This is due to
a bug in the XArray conversion when handling a PMD fault and finding a
PTE entry.  We use the wrong index in the hash and end up waiting on
the wrong waitqueue.

There's actually no need to wait; if we find a PTE entry while looking
for a PMD entry, we can return immediately as we know we should fall
back to a PTE fault (which may not conflict with the lock held).

Cc: stable@vger.kernel.org
Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/dax.c               | 47 ++++++++++++++++++++++++------------------
 include/linux/xarray.h |  4 ++--
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..1ce1059af266 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -195,11 +195,13 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * Look up entry in page cache, wait for it to become unlocked if it
  * is a DAX entry and return it.  The caller must subsequently call
  * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
- * if it did.
+ * if it did.  The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a CONFLICT entry.
  *
  * Must be called with the i_pages lock held.
  */
-static void *get_unlocked_entry(struct xa_state *xas)
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
 	void *entry;
 	struct wait_exceptional_entry_queue ewait;
@@ -210,6 +212,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 
 	for (;;) {
 		entry = xas_find_conflict(xas);
+		if (dax_entry_order(entry) < order)
+			return XA_DAX_CONFLICT_ENTRY;
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
 				!dax_is_locked(entry))
 			return entry;
@@ -254,7 +258,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
 	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	if (entry && entry != XA_DAX_CONFLICT_ENTRY)
 		dax_wake_entry(xas, entry, false);
 }
 
@@ -461,7 +465,7 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
  * overlap with xarray value entries.
  */
 static void *grab_mapping_entry(struct xa_state *xas,
-		struct address_space *mapping, unsigned long size_flag)
+		struct address_space *mapping, unsigned int order)
 {
 	unsigned long index = xas->xa_index;
 	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
@@ -469,20 +473,17 @@ static void *grab_mapping_entry(struct xa_state *xas,
 
 retry:
 	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas);
+	entry = get_unlocked_entry(xas, order);
 
 	if (entry) {
+		if (entry == XA_DAX_CONFLICT_ENTRY)
+			goto fallback;
 		if (!xa_is_value(entry)) {
 			xas_set_err(xas, EIO);
 			goto out_unlock;
 		}
 
-		if (size_flag & DAX_PMD) {
-			if (dax_is_pte_entry(entry)) {
-				put_unlocked_entry(xas, entry);
-				goto fallback;
-			}
-		} else { /* trying to grab a PTE entry */
+		if (order == 0) {
 			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
@@ -523,7 +524,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
 	if (entry) {
 		dax_lock_entry(xas, entry);
 	} else {
-		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		unsigned long flags = DAX_EMPTY;
+
+		if (order > 0)
+			flags |= DAX_PMD;
+		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
 		dax_lock_entry(xas, entry);
 		if (xas_error(xas))
 			goto out_unlock;
@@ -594,7 +599,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
 		if (WARN_ON_ONCE(!xa_is_value(entry)))
 			continue;
 		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas);
+			entry = get_unlocked_entry(&xas, 0);
 		if (entry)
 			page = dax_busy_page(entry);
 		put_unlocked_entry(&xas, entry);
@@ -621,7 +626,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	void *entry;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, 0);
 	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
@@ -849,8 +854,11 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 	if (unlikely(dax_is_locked(entry))) {
 		void *old_entry = entry;
 
-		entry = get_unlocked_entry(xas);
+		entry = get_unlocked_entry(xas, dax_entry_order(entry));
 
+		/* Did a PMD entry get split? */
+		if (entry == XA_DAX_CONFLICT_ENTRY)
+			goto put_unlocked;
 		/* Entry got punched out / reallocated? */
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 			goto put_unlocked;
@@ -1510,7 +1518,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * entry is already in the array, for instance), it will return
 	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
 	if (xa_is_internal(entry)) {
 		result = xa_to_internal(entry);
 		goto fallback;
@@ -1659,11 +1667,10 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	vm_fault_t ret;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
-	if (!entry ||
-	    (order == 0 && !dax_is_pte_entry(entry)) ||
-	    (order == PMD_ORDER && !dax_is_pmd_entry(entry))) {
+	if (!entry || entry == XA_DAX_CONFLICT_ENTRY ||
+	    (order == 0 && !dax_is_pte_entry(entry))) {
 		put_unlocked_entry(&xas, entry);
 		xas_unlock_irq(&xas);
 		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 052e06ff4c36..fb25452bcfa4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -169,7 +169,9 @@ static inline bool xa_is_internal(const void *entry)
 	return ((unsigned long)entry & 3) == 2;
 }
 
+#define XA_RETRY_ENTRY		xa_mk_internal(256)
 #define XA_ZERO_ENTRY		xa_mk_internal(257)
+#define XA_DAX_CONFLICT_ENTRY	xa_mk_internal(258)
 
 /**
  * xa_is_zero() - Is the entry a zero entry?
@@ -1213,8 +1215,6 @@ static inline bool xa_is_sibling(const void *entry)
 		(entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
 }
 
-#define XA_RETRY_ENTRY		xa_mk_internal(256)
-
 /**
  * xa_is_retry() - Is the entry a retry entry?
  * @entry: Entry retrieved from the XArray
Jan Kara July 11, 2019, 7:48 a.m. UTC | #19
On Wed 10-07-19 20:08:55, Matthew Wilcox wrote:
> On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> > @@ -848,7 +853,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
> >  	if (unlikely(dax_is_locked(entry))) {
> >  		void *old_entry = entry;
> >  
> > -		entry = get_unlocked_entry(xas);
> > +		entry = get_unlocked_entry(xas, 0);
> >  
> >  		/* Entry got punched out / reallocated? */
> >  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
> 
> I'm not sure about this one.  Are we sure there will never be a dirty
> PMD entry?  Even if we can't create one today, it feels like a bit of
> a landmine to leave for someone who creates one in the future.

I was thinking about this but dax_writeback_one() doesn't really care what
entry it gets. Yes, in theory it could get a PMD when previously there was
PTE or vice-versa but we check that PFN's match and if they really do
match, there's no harm in doing the flushing whatever entry we got back...
And the checks are simpler this way.

								Honza
Jan Kara July 11, 2019, 8:06 a.m. UTC | #20
On Wed 10-07-19 20:35:55, Matthew Wilcox wrote:
> On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> > So how about the attached patch? That keeps the interface sane and passes a
> > smoketest for me (full fstest run running). Obviously it also needs a
> > proper changelog...
> 
> Changelog and slightly massaged version along the lines of my two comments
> attached.
> 

> From 57b63fdd38e7bea7eb8d6332f0163fb028570def Mon Sep 17 00:00:00 2001
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> Date: Wed, 3 Jul 2019 23:21:25 -0400
> Subject: [PATCH] dax: Fix missed wakeup with PMD faults
> 
> RocksDB can hang indefinitely when using a DAX file.  This is due to
> a bug in the XArray conversion when handling a PMD fault and finding a
> PTE entry.  We use the wrong index in the hash and end up waiting on
> the wrong waitqueue.
> 
> There's actually no need to wait; if we find a PTE entry while looking
> for a PMD entry, we can return immediately as we know we should fall
> back to a PTE fault (which may not conflict with the lock held).
> 
> Cc: stable@vger.kernel.org
> Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray")
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Just one nit below. Otherwise feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

> diff --git a/include/linux/xarray.h b/include/linux/xarray.h
> index 052e06ff4c36..fb25452bcfa4 100644
> --- a/include/linux/xarray.h
> +++ b/include/linux/xarray.h
> @@ -169,7 +169,9 @@ static inline bool xa_is_internal(const void *entry)
>  	return ((unsigned long)entry & 3) == 2;
>  }
>  
> +#define XA_RETRY_ENTRY		xa_mk_internal(256)
>  #define XA_ZERO_ENTRY		xa_mk_internal(257)
> +#define XA_DAX_CONFLICT_ENTRY	xa_mk_internal(258)
>  
>  /**
>   * xa_is_zero() - Is the entry a zero entry?

As I wrote in my previous email, won't it be nicer if we just defined
DAX_CONFLICT_ENTRY (or however we name it) inside dax code say to
XA_ZERO_ENTRY?  Generic xarray code just doesn't care about that value and
I can imagine in future there'll be other xarray user's who'd like to have
some special value(s) for use similarly to DAX and we don't want to clutter
xarray definitions with those as well. If you don't like XA_ZERO_ENTRY, I
could also imagine having XA_USER_RESERVED value that's guaranteed to be
unused by xarray and we'd define DAX_CONFLICT_ENTRY to it. Overall I don't
care too much so I can live even with what you have now but it would seem
cleaner that way to me.

								Honza
Matthew Wilcox July 11, 2019, 1:28 p.m. UTC | #21
On Thu, Jul 11, 2019 at 09:48:59AM +0200, Jan Kara wrote:
> On Wed 10-07-19 20:08:55, Matthew Wilcox wrote:
> > On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> > > @@ -848,7 +853,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
> > >  	if (unlikely(dax_is_locked(entry))) {
> > >  		void *old_entry = entry;
> > >  
> > > -		entry = get_unlocked_entry(xas);
> > > +		entry = get_unlocked_entry(xas, 0);
> > >  
> > >  		/* Entry got punched out / reallocated? */
> > >  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
> > 
> > I'm not sure about this one.  Are we sure there will never be a dirty
> > PMD entry?  Even if we can't create one today, it feels like a bit of
> > a landmine to leave for someone who creates one in the future.
> 
> I was thinking about this but dax_writeback_one() doesn't really care what
> entry it gets. Yes, in theory it could get a PMD when previously there was
> PTE or vice-versa but we check that PFN's match and if they really do
> match, there's no harm in doing the flushing whatever entry we got back...
> And the checks are simpler this way.

That's fair.  I'll revert this part to the way you had it.

I actually want to get rid of the PFN match check too; it doesn't really
buy us much.  We already check whether the TOWRITE bit is set, and that
should accomplish the same thing.
Matthew Wilcox July 11, 2019, 2:13 p.m. UTC | #22
On Wed, Jul 10, 2019 at 10:26:47PM +0200, Jan Kara wrote:
> On Wed 10-07-19 13:15:39, Matthew Wilcox wrote:
> > On Wed, Jul 10, 2019 at 09:02:04PM +0200, Jan Kara wrote:
> > > +#define DAX_ENTRY_CONFLICT dax_make_entry(pfn_to_pfn_t(1), DAX_EMPTY)
> > 
> > I was hoping to get rid of DAX_EMPTY ... it's almost unused now.  Once
> > we switch to having a single DAX_LOCK value instead of a single bit,
> > I think it can go away, freeing up two bits.
> > 
> > If you really want a special DAX_ENTRY_CONFLICT, I think we can make
> > one in the 2..4094 range.
> > 
> > That aside, this looks pretty similar to the previous patch I sent, so
> > if you're now happy with this, let's add
> > 
> > #define XA_DAX_CONFLICT_ENTRY xa_mk_internal(258)
> > 
> > to xarray.h and do it that way?
> 
> Yeah, that would work for me as well. The chosen value for DAX_ENTRY_CONFLICT
> was pretty arbitrary. Or we could possibly use:
> 
> #define DAX_ENTRY_CONFLICT XA_ZERO_ENTRY
> 
> so that we don't leak DAX-specific internal definition into xarray.h?

I don't want to use the ZERO entry as our conflict marker because that
could legitimately appear in an XArray.  Not the i_pages XArray today,
but I hold out hope for using that in place of the DAX_ZERO_PAGE bit too.
That's going to be a bit more tricky since we currently distinguish
between DAX_ZERO_PAGE and DAX_ZERO_PAGE | DAX_PMD.

However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
appear in an XArray (it may appear if you're looking at a deleted node,
but since we're holding the lock, we can't see deleted nodes).
Matthew Wilcox July 11, 2019, 3:25 p.m. UTC | #23
On Thu, Jul 11, 2019 at 07:13:50AM -0700, Matthew Wilcox wrote:
> However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
> appear in an XArray (it may appear if you're looking at a deleted node,
> but since we're holding the lock, we can't see deleted nodes).

Updated patch (also slight updates to changelog and comments)

--- 8< ---

From af8402dacb3998d8ef23677ac35fdb72b236320c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 3 Jul 2019 23:21:25 -0400
Subject: [PATCH] dax: Fix missed wakeup with PMD faults

RocksDB can hang indefinitely when using a DAX file.  This is due to
a bug in the XArray conversion when handling a PMD fault and finding a
PTE entry.  We use the wrong index in the hash and end up waiting on
the wrong waitqueue.

There's actually no need to wait; if we find a PTE entry while looking
for a PMD entry, we can return immediately as we know we should fall
back to a PTE fault (which may not conflict with the lock held).

We reuse the XA_RETRY_ENTRY to signal a conflicting entry was found.
This value can never be found in an XArray while holding its lock, so
it does not create an ambiguity.

Cc: stable@vger.kernel.org
Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 fs/dax.c | 53 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..7a75031da644 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -123,6 +123,15 @@ static int dax_is_empty_entry(void *entry)
 	return xa_to_value(entry) & DAX_EMPTY;
 }
 
+/*
+ * true if the entry that was found is of a smaller order than the entry
+ * we were looking for
+ */
+static bool dax_is_conflict(void *entry)
+{
+	return entry == XA_RETRY_ENTRY;
+}
+
 /*
  * DAX page cache entry locking
  */
@@ -195,11 +204,13 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * Look up entry in page cache, wait for it to become unlocked if it
  * is a DAX entry and return it.  The caller must subsequently call
  * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
- * if it did.
+ * if it did.  The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a dax_is_conflict entry.
  *
  * Must be called with the i_pages lock held.
  */
-static void *get_unlocked_entry(struct xa_state *xas)
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
 	void *entry;
 	struct wait_exceptional_entry_queue ewait;
@@ -210,6 +221,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 
 	for (;;) {
 		entry = xas_find_conflict(xas);
+		if (dax_entry_order(entry) < order)
+			return XA_RETRY_ENTRY;
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
 				!dax_is_locked(entry))
 			return entry;
@@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
 	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	if (entry && dax_is_conflict(entry))
 		dax_wake_entry(xas, entry, false);
 }
 
@@ -461,7 +474,7 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
  * overlap with xarray value entries.
  */
 static void *grab_mapping_entry(struct xa_state *xas,
-		struct address_space *mapping, unsigned long size_flag)
+		struct address_space *mapping, unsigned int order)
 {
 	unsigned long index = xas->xa_index;
 	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
@@ -469,20 +482,17 @@ static void *grab_mapping_entry(struct xa_state *xas,
 
 retry:
 	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas);
+	entry = get_unlocked_entry(xas, order);
 
 	if (entry) {
+		if (dax_is_conflict(entry))
+			goto fallback;
 		if (!xa_is_value(entry)) {
 			xas_set_err(xas, EIO);
 			goto out_unlock;
 		}
 
-		if (size_flag & DAX_PMD) {
-			if (dax_is_pte_entry(entry)) {
-				put_unlocked_entry(xas, entry);
-				goto fallback;
-			}
-		} else { /* trying to grab a PTE entry */
+		if (order == 0) {
 			if (dax_is_pmd_entry(entry) &&
 			    (dax_is_zero_entry(entry) ||
 			     dax_is_empty_entry(entry))) {
@@ -523,7 +533,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
 	if (entry) {
 		dax_lock_entry(xas, entry);
 	} else {
-		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		unsigned long flags = DAX_EMPTY;
+
+		if (order > 0)
+			flags |= DAX_PMD;
+		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
 		dax_lock_entry(xas, entry);
 		if (xas_error(xas))
 			goto out_unlock;
@@ -594,7 +608,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
 		if (WARN_ON_ONCE(!xa_is_value(entry)))
 			continue;
 		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas);
+			entry = get_unlocked_entry(&xas, 0);
 		if (entry)
 			page = dax_busy_page(entry);
 		put_unlocked_entry(&xas, entry);
@@ -621,7 +635,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	void *entry;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, 0);
 	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
@@ -849,7 +863,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 	if (unlikely(dax_is_locked(entry))) {
 		void *old_entry = entry;
 
-		entry = get_unlocked_entry(xas);
+		entry = get_unlocked_entry(xas, 0);
 
 		/* Entry got punched out / reallocated? */
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1510,7 +1524,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * entry is already in the array, for instance), it will return
 	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
 	if (xa_is_internal(entry)) {
 		result = xa_to_internal(entry);
 		goto fallback;
@@ -1659,11 +1673,10 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	vm_fault_t ret;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas);
+	entry = get_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
-	if (!entry ||
-	    (order == 0 && !dax_is_pte_entry(entry)) ||
-	    (order == PMD_ORDER && !dax_is_pmd_entry(entry))) {
+	if (!entry || dax_is_conflict(entry) ||
+	    (order == 0 && !dax_is_pte_entry(entry))) {
 		put_unlocked_entry(&xas, entry);
 		xas_unlock_irq(&xas);
 		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
Jan Kara July 11, 2019, 3:41 p.m. UTC | #24
On Thu 11-07-19 08:25:50, Matthew Wilcox wrote:
> On Thu, Jul 11, 2019 at 07:13:50AM -0700, Matthew Wilcox wrote:
> > However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
> > appear in an XArray (it may appear if you're looking at a deleted node,
> > but since we're holding the lock, we can't see deleted nodes).
> 
...

> @@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
>  static void put_unlocked_entry(struct xa_state *xas, void *entry)
>  {
>  	/* If we were the only waiter woken, wake the next one */
> -	if (entry)
> +	if (entry && dax_is_conflict(entry))

This should be !dax_is_conflict(entry)...

>  		dax_wake_entry(xas, entry, false);
>  }

Otherwise the patch looks good to me so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

once you fix this.

								Honza
Dan Williams July 17, 2019, 3:39 a.m. UTC | #25
On Fri, Jul 12, 2019 at 2:14 AM Jan Kara <jack@suse.cz> wrote:
>
> On Thu 11-07-19 08:25:50, Matthew Wilcox wrote:
> > On Thu, Jul 11, 2019 at 07:13:50AM -0700, Matthew Wilcox wrote:
> > > However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
> > > appear in an XArray (it may appear if you're looking at a deleted node,
> > > but since we're holding the lock, we can't see deleted nodes).
> >
> ...
>
> > @@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
> >  static void put_unlocked_entry(struct xa_state *xas, void *entry)
> >  {
> >       /* If we were the only waiter woken, wake the next one */
> > -     if (entry)
> > +     if (entry && dax_is_conflict(entry))
>
> This should be !dax_is_conflict(entry)...
>
> >               dax_wake_entry(xas, entry, false);
> >  }
>
> Otherwise the patch looks good to me so feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>

Looks good, and passes the test case. Now pushed out to
libnvdimm-for-next for v5.3 inclusion:

https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/commit/?h=libnvdimm-for-next&id=23c84eb7837514e16d79ed6d849b13745e0ce688
Jan Kara July 29, 2019, 12:02 p.m. UTC | #26
On Tue 16-07-19 20:39:46, Dan Williams wrote:
> On Fri, Jul 12, 2019 at 2:14 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Thu 11-07-19 08:25:50, Matthew Wilcox wrote:
> > > On Thu, Jul 11, 2019 at 07:13:50AM -0700, Matthew Wilcox wrote:
> > > > However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
> > > > appear in an XArray (it may appear if you're looking at a deleted node,
> > > > but since we're holding the lock, we can't see deleted nodes).
> > >
> > ...
> >
> > > @@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
> > >  static void put_unlocked_entry(struct xa_state *xas, void *entry)
> > >  {
> > >       /* If we were the only waiter woken, wake the next one */
> > > -     if (entry)
> > > +     if (entry && dax_is_conflict(entry))
> >
> > This should be !dax_is_conflict(entry)...
> >
> > >               dax_wake_entry(xas, entry, false);
> > >  }
> >
> > Otherwise the patch looks good to me so feel free to add:
> >
> > Reviewed-by: Jan Kara <jack@suse.cz>
> 
> Looks good, and passes the test case. Now pushed out to
> libnvdimm-for-next for v5.3 inclusion:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/commit/?h=libnvdimm-for-next&id=23c84eb7837514e16d79ed6d849b13745e0ce688

Thanks for picking up the patch but you didn't apply the fix I've mentioned
above. So put_unlocked_entry() is not waking up anybody anymore... Since
this got already to Linus' tree, I guess a separate fixup patch is needed
(attached).

								Honza
Dan Williams July 29, 2019, 3:18 p.m. UTC | #27
On Mon, Jul 29, 2019 at 5:02 AM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 16-07-19 20:39:46, Dan Williams wrote:
> > On Fri, Jul 12, 2019 at 2:14 AM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Thu 11-07-19 08:25:50, Matthew Wilcox wrote:
> > > > On Thu, Jul 11, 2019 at 07:13:50AM -0700, Matthew Wilcox wrote:
> > > > > However, the XA_RETRY_ENTRY might be a good choice.  It doesn't normally
> > > > > appear in an XArray (it may appear if you're looking at a deleted node,
> > > > > but since we're holding the lock, we can't see deleted nodes).
> > > >
> > > ...
> > >
> > > > @@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
> > > >  static void put_unlocked_entry(struct xa_state *xas, void *entry)
> > > >  {
> > > >       /* If we were the only waiter woken, wake the next one */
> > > > -     if (entry)
> > > > +     if (entry && dax_is_conflict(entry))
> > >
> > > This should be !dax_is_conflict(entry)...
> > >
> > > >               dax_wake_entry(xas, entry, false);
> > > >  }
> > >
> > > Otherwise the patch looks good to me so feel free to add:
> > >
> > > Reviewed-by: Jan Kara <jack@suse.cz>
> >
> > Looks good, and passes the test case. Now pushed out to
> > libnvdimm-for-next for v5.3 inclusion:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/commit/?h=libnvdimm-for-next&id=23c84eb7837514e16d79ed6d849b13745e0ce688
>
> Thanks for picking up the patch but you didn't apply the fix I've mentioned
> above. So put_unlocked_entry() is not waking up anybody anymore... Since
> this got already to Linus' tree, I guess a separate fixup patch is needed
> (attached).

Sigh, indeed. I think what happened is I applied the fixup locally,
tested it, and then later reapplied the patch from the list as I was
integrating the new automatic "Link:" generation script that has been
proposed on the ksummit list.

I'll get this pushed immediately.

Lesson learned: no manual local fixups, ask for resends to always be
able to pull the exact contents from the list.

Patch
diff mbox series

diff --git a/fs/dax.c b/fs/dax.c
index 9fd908f3df32..592944c522b8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -144,19 +144,14 @@  struct wait_exceptional_entry_queue {
 	struct exceptional_entry_key key;
 };
 
-static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
-		void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_index_waitqueue(struct xa_state *xas,
+		struct exceptional_entry_key *key)
 {
 	unsigned long hash;
 	unsigned long index = xas->xa_index;
 
-	/*
-	 * If 'entry' is a PMD, align the 'index' that we use for the wait
-	 * queue to the start of that PMD.  This ensures that all offsets in
-	 * the range covered by the PMD map to the same bit lock.
-	 */
-	if (dax_is_pmd_entry(entry))
-		index &= ~PG_PMD_COLOUR;
+	/* PMD-align the index to ensure PTE events wakeup PMD waiters */
+	index &= ~PG_PMD_COLOUR;
 	key->xa = xas->xa;
 	key->entry_start = index;
 
@@ -177,17 +172,12 @@  static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
 	return autoremove_wake_function(wait, mode, sync, NULL);
 }
 
-/*
- * @entry may no longer be the entry at the index in the mapping.
- * The important information it's conveying is whether the entry at
- * this index used to be a PMD entry.
- */
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
+static void dax_wake_index(struct xa_state *xas, bool wake_all)
 {
 	struct exceptional_entry_key key;
 	wait_queue_head_t *wq;
 
-	wq = dax_entry_waitqueue(xas, entry, &key);
+	wq = dax_index_waitqueue(xas, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -222,7 +212,7 @@  static void *get_unlocked_entry(struct xa_state *xas)
 				!dax_is_locked(entry))
 			return entry;
 
-		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+		wq = dax_index_waitqueue(xas, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		xas_unlock_irq(xas);
@@ -246,7 +236,7 @@  static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 	init_wait(&ewait.wait);
 	ewait.wait.func = wake_exceptional_entry_func;
 
-	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+	wq = dax_index_waitqueue(xas, &ewait.key);
 	/*
 	 * Unlike get_unlocked_entry() there is no guarantee that this
 	 * path ever successfully retrieves an unlocked entry before an
@@ -263,7 +253,7 @@  static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
 	/* If we were the only waiter woken, wake the next one */
 	if (entry)
-		dax_wake_entry(xas, entry, false);
+		dax_wake_index(xas, false);
 }
 
 /*
@@ -281,7 +271,7 @@  static void dax_unlock_entry(struct xa_state *xas, void *entry)
 	old = xas_store(xas, entry);
 	xas_unlock_irq(xas);
 	BUG_ON(!dax_is_locked(old));
-	dax_wake_entry(xas, entry, false);
+	dax_wake_index(xas, false);
 }
 
 /*
@@ -522,7 +512,7 @@  static void *grab_mapping_entry(struct xa_state *xas,
 
 		dax_disassociate_entry(entry, mapping, false);
 		xas_store(xas, NULL);	/* undo the PMD join */
-		dax_wake_entry(xas, entry, true);
+		dax_wake_index(xas, true);
 		mapping->nrexceptional--;
 		entry = NULL;
 		xas_set(xas, index);
@@ -915,7 +905,7 @@  static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 	xas_lock_irq(xas);
 	xas_store(xas, entry);
 	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
-	dax_wake_entry(xas, entry, false);
+	dax_wake_index(xas, false);
 
 	trace_dax_writeback_one(mapping->host, index, count);
 	return ret;