[RFC,v2,5/6] inode: port __I_LRU_ISOLATING to var event

Message ID 20240821-work-i_state-v2-5-67244769f102@kernel.org
State New
Series inode: turn i_state into u32

Commit Message

Christian Brauner Aug. 21, 2024, 3:47 p.m. UTC
Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)
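
For context, the two helpers this patch relies on, inode_wake_up_bit() and
inode_bit_waitqueue(), come from earlier patches in this series. A sketch
of their likely shape, inferred from how they are used below (the earlier
patches are authoritative; inode_state_wait_address() is the assumed
per-bit wait address helper):

	/* Sketch: every i_state bit gets its own wait address. */
	#define inode_state_wait_address(inode, bit) \
		((char *)&(inode)->i_state + (bit))

	static inline struct wait_queue_head *
	inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
			    struct inode *inode, u32 bit)
	{
		void *bit_address = inode_state_wait_address(inode, bit);

		init_wait_var_entry(wqe, bit_address, 0);
		return __var_waitqueue(bit_address);
	}

	static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
	{
		/* Sketch: the waker serializes against waiters via inode->i_lock. */
		wake_up_var(inode_state_wait_address(inode, bit));
	}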

Comments

Jeff Layton Aug. 21, 2024, 7:41 p.m. UTC | #1
On Wed, 2024-08-21 at 17:47 +0200, Christian Brauner wrote:
> Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/inode.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index d18e1567c487..c8a5c63dc980 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -510,8 +510,7 @@ static void inode_unpin_lru_isolating(struct inode *inode)
>  	spin_lock(&inode->i_lock);
>  	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
>  	inode->i_state &= ~I_LRU_ISOLATING;
> -	smp_mb();
> -	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
> +	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
>  	spin_unlock(&inode->i_lock);
>  }
>  
> @@ -519,13 +518,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
>  {
>  	lockdep_assert_held(&inode->i_lock);
>  	if (inode->i_state & I_LRU_ISOLATING) {
> -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> -		wait_queue_head_t *wqh;
> -
> -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> -		spin_unlock(&inode->i_lock);
> -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> -		spin_lock(&inode->i_lock);
> +		struct wait_bit_queue_entry wqe;
> +		struct wait_queue_head *wq_head;
> +
> +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> +		for (;;) {
> +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> +					      TASK_UNINTERRUPTIBLE);
> +			if (inode->i_state & I_LRU_ISOLATING) {
> +				spin_unlock(&inode->i_lock);
> +				schedule();
> +				spin_lock(&inode->i_lock);
> +				continue;
> +			}
> +			break;
> +		}

nit: personally, I'd prefer this, since you wouldn't need the brackets
or the continue:

			if (!(inode->i_state & I_LRU_ISOLATING))
				break;
			spin_unlock();
			schedule();
			...

> +		finish_wait(wq_head, &wqe.wq_entry);
>  		WARN_ON(inode->i_state & I_LRU_ISOLATING);
>  	}
>  }
>
Christian Brauner Aug. 22, 2024, 8:53 a.m. UTC | #2
On Wed, Aug 21, 2024 at 03:41:45PM GMT, Jeff Layton wrote:
> On Wed, 2024-08-21 at 17:47 +0200, Christian Brauner wrote:
> > Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism.
> > 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> >  fs/inode.c | 26 +++++++++++++++++---------
> >  1 file changed, 17 insertions(+), 9 deletions(-)
> > 
> > diff --git a/fs/inode.c b/fs/inode.c
> > index d18e1567c487..c8a5c63dc980 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -510,8 +510,7 @@ static void inode_unpin_lru_isolating(struct inode *inode)
> >  	spin_lock(&inode->i_lock);
> >  	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
> >  	inode->i_state &= ~I_LRU_ISOLATING;
> > -	smp_mb();
> > -	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
> > +	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
> >  	spin_unlock(&inode->i_lock);
> >  }
> >  
> > @@ -519,13 +518,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
> >  {
> >  	lockdep_assert_held(&inode->i_lock);
> >  	if (inode->i_state & I_LRU_ISOLATING) {
> > -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> > -		wait_queue_head_t *wqh;
> > -
> > -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> > -		spin_unlock(&inode->i_lock);
> > -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> > -		spin_lock(&inode->i_lock);
> > +		struct wait_bit_queue_entry wqe;
> > +		struct wait_queue_head *wq_head;
> > +
> > +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> > +		for (;;) {
> > +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> > +					      TASK_UNINTERRUPTIBLE);
> > +			if (inode->i_state & I_LRU_ISOLATING) {
> > +				spin_unlock(&inode->i_lock);
> > +				schedule();
> > +				spin_lock(&inode->i_lock);
> > +				continue;
> > +			}
> > +			break;
> > +		}
> 
> nit: personally, I'd prefer this, since you wouldn't need the brackets
> or the continue:
> 
> 			if (!(inode->i_state & I_LRU_ISOLATING))
> 				break;
> 			spin_unlock();
> 			schedule();

Yeah, that's nicer. I'll use that.
Mateusz Guzik Aug. 22, 2024, 9:48 a.m. UTC | #3
On Thu, Aug 22, 2024 at 10:53:50AM +0200, Christian Brauner wrote:
> On Wed, Aug 21, 2024 at 03:41:45PM GMT, Jeff Layton wrote:
> > >  	if (inode->i_state & I_LRU_ISOLATING) {
> > > -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> > > -		wait_queue_head_t *wqh;
> > > -
> > > -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> > > -		spin_unlock(&inode->i_lock);
> > > -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> > > -		spin_lock(&inode->i_lock);
> > > +		struct wait_bit_queue_entry wqe;
> > > +		struct wait_queue_head *wq_head;
> > > +
> > > +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> > > +		for (;;) {
> > > +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> > > +					      TASK_UNINTERRUPTIBLE);
> > > +			if (inode->i_state & I_LRU_ISOLATING) {
> > > +				spin_unlock(&inode->i_lock);
> > > +				schedule();
> > > +				spin_lock(&inode->i_lock);
> > > +				continue;
> > > +			}
> > > +			break;
> > > +		}
> > 
> > nit: personally, I'd prefer this, since you wouldn't need the brackets
> > or the continue:
> > 
> > 			if (!(inode->i_state & I_LRU_ISOLATING))
> > 				break;
> > 			spin_unlock();
> > 			schedule();
> 
> Yeah, that's nicer. I'll use that.

In that case may I suggest also short-circuiting the entire func?

	if (!(inode->i_state & I_LRU_ISOLATING))
		return;

then the entire thing loses one indentation level

Same thing is applicable to the I_SYNC flag.

A non-cosmetic question is as follows: is it legal for a wakeup to happen
spuriously?

For legal waiters on the flag, I_FREEING is set, which prevents
I_LRU_ISOLATING from showing up afterwards. Thus it should be sufficient
to wait for the flag to clear once. This is moot if spurious wakeups do
happen.

If looping is needed, then this warn:

	WARN_ON(inode->i_state & I_LRU_ISOLATING);

fails to test for anything, since the loop only exits once that very
condition is clear (iow it needs to be whacked).

The writeback code needs to loop because there are callers outside of
evict().
Christian Brauner Aug. 22, 2024, 11:10 a.m. UTC | #4
On Thu, Aug 22, 2024 at 11:48:45AM GMT, Mateusz Guzik wrote:
> On Thu, Aug 22, 2024 at 10:53:50AM +0200, Christian Brauner wrote:
> > On Wed, Aug 21, 2024 at 03:41:45PM GMT, Jeff Layton wrote:
> > > >  	if (inode->i_state & I_LRU_ISOLATING) {
> > > > -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> > > > -		wait_queue_head_t *wqh;
> > > > -
> > > > -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> > > > -		spin_unlock(&inode->i_lock);
> > > > -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> > > > -		spin_lock(&inode->i_lock);
> > > > +		struct wait_bit_queue_entry wqe;
> > > > +		struct wait_queue_head *wq_head;
> > > > +
> > > > +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> > > > +		for (;;) {
> > > > +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> > > > +					      TASK_UNINTERRUPTIBLE);
> > > > +			if (inode->i_state & I_LRU_ISOLATING) {
> > > > +				spin_unlock(&inode->i_lock);
> > > > +				schedule();
> > > > +				spin_lock(&inode->i_lock);
> > > > +				continue;
> > > > +			}
> > > > +			break;
> > > > +		}
> > > 
> > > nit: personally, I'd prefer this, since you wouldn't need the brackets
> > > or the continue:
> > > 
> > > 			if (!(inode->i_state & I_LRU_ISOLATING))
> > > 				break;
> > > 			spin_unlock();
> > > 			schedule();
> > 
> > Yeah, that's nicer. I'll use that.
> 
> In that case may I suggest also short-circuiting the entire func?
> 
> 	if (!(inode->i_state & I_LRU_ISOLATING))
> 		return;

Afaict, if prepare_to_wait_event() has been called, finish_wait() must be
called.

> 
> then the entire thing loses one indentation level
> 
> Same thing is applicable to the I_SYNC flag.
> 
> > A non-cosmetic question is as follows: is it legal for a wakeup to happen
> > spuriously?

So, I simply mimicked ___wait_var_event() and __wait_on_bit() - both
loop. I reasoned that ___wait_var_event() via @state and __wait_on_bit()
via @mode take the task state into account to wait in. And my conclusion
was that they don't loop on TASK_UNINTERRUPTIBLE but would loop on e.g.,
TASK_INTERRUPTIBLE. Iow, I assume that spurious wakeups shouldn't happen
with TASK_UNINTERRUPTIBLE.

But it's not something I'm able to assert with absolute confidence so I
erred on the side of caution.
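
For reference, the loop inside ___wait_var_event() has roughly this shape
(a simplified paraphrase, not the verbatim macro; the signal path in the
real macro differs slightly):

	for (;;) {
		long ret = prepare_to_wait_event(wq_head, &wbq_entry.wq_entry,
						 state);
		if (condition)
			break;
		/* only interruptible sleeps bail out when a signal arrives */
		if (___wait_is_interruptible(state) && ret)
			break;
		schedule();
	}
	finish_wait(wq_head, &wbq_entry.wq_entry);

The re-check of the condition after every wakeup is what makes callers
robust against spurious wakeups, independent of the task state they sleep
in.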
Mateusz Guzik Aug. 22, 2024, 12:46 p.m. UTC | #5
On Thu, Aug 22, 2024 at 01:10:43PM +0200, Christian Brauner wrote:
> On Thu, Aug 22, 2024 at 11:48:45AM GMT, Mateusz Guzik wrote:
> > On Thu, Aug 22, 2024 at 10:53:50AM +0200, Christian Brauner wrote:
> > > On Wed, Aug 21, 2024 at 03:41:45PM GMT, Jeff Layton wrote:
> > > > >  	if (inode->i_state & I_LRU_ISOLATING) {
> > > > > -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> > > > > -		wait_queue_head_t *wqh;
> > > > > -
> > > > > -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> > > > > -		spin_unlock(&inode->i_lock);
> > > > > -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> > > > > -		spin_lock(&inode->i_lock);
> > > > > +		struct wait_bit_queue_entry wqe;
> > > > > +		struct wait_queue_head *wq_head;
> > > > > +
> > > > > +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> > > > > +		for (;;) {
> > > > > +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> > > > > +					      TASK_UNINTERRUPTIBLE);
> > > > > +			if (inode->i_state & I_LRU_ISOLATING) {
> > > > > +				spin_unlock(&inode->i_lock);
> > > > > +				schedule();
> > > > > +				spin_lock(&inode->i_lock);
> > > > > +				continue;
> > > > > +			}
> > > > > +			break;
> > > > > +		}
> > > > 
> > > > nit: personally, I'd prefer this, since you wouldn't need the brackets
> > > > or the continue:
> > > > 
> > > > 			if (!(inode->i_state & I_LRU_ISOLATING))
> > > > 				break;
> > > > 			spin_unlock();
> > > > 			schedule();
> > > 
> > > Yeah, that's nicer. I'll use that.
> > 
> > In that case may I suggest also short-circuiting the entire func?
> > 
> > 	if (!(inode->i_state & I_LRU_ISOLATING))
> > 		return;
> 
> Afaict, if prepare_to_wait_event() has been called finish_wait() must be
> called.
> 

I mean the upfront check, before any other work:

static void inode_wait_for_lru_isolating(struct inode *inode)
{
        lockdep_assert_held(&inode->i_lock);
	if (!(inode->i_state & I_LRU_ISOLATING))
		return;

	/* for loop goes here, losing an indentation level but otherwise
	 * identical. same remark for the writeback thing */
	
}

> > 
> > then the entire thing loses one indentation level
> > 
> > Same thing is applicable to the I_SYNC flag.
> > 
> > A non-cosmetic question is as follows: is it legal for a wakeup to happen
> > spuriously?
> 
> So, I simply mimicked ___wait_var_event() and __wait_on_bit() - both
> loop. I reasoned that ___wait_var_event() via @state and __wait_on_bit()
> via @mode take the task state into account to wait in. And my conclusion
> was that they don't loop on TASK_UNINTERRUPTIBLE but would loop on e.g.,
> TASK_INTERRUPTIBLE. Iow, I assume that spurious wakeups shouldn't happen
> with TASK_UNINTERRUPTIBLE.
> 
> But it's not something I'm able to assert with absolute confidence so I
> erred on the side of caution.

This definitely should be sorted out so that callers don't need to loop
if the condition is a one-off in the worst case, but I concede getting
there is not *needed* at the moment, just a fixme.
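
Taken together, Jeff's inverted test and the early return point at a
waiter of roughly this shape (a sketch of where the discussion is heading,
not the code as merged):

	static void inode_wait_for_lru_isolating(struct inode *inode)
	{
		struct wait_bit_queue_entry wqe;
		struct wait_queue_head *wq_head;

		lockdep_assert_held(&inode->i_lock);
		if (!(inode->i_state & I_LRU_ISOLATING))
			return;

		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
		for (;;) {
			prepare_to_wait_event(wq_head, &wqe.wq_entry,
					      TASK_UNINTERRUPTIBLE);
			if (!(inode->i_state & I_LRU_ISOLATING))
				break;
			spin_unlock(&inode->i_lock);
			schedule();
			spin_lock(&inode->i_lock);
		}
		finish_wait(wq_head, &wqe.wq_entry);
		/* redundant while the loop exits on this very condition */
		WARN_ON(inode->i_state & I_LRU_ISOLATING);
	}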
NeilBrown Aug. 23, 2024, 12:36 a.m. UTC | #6
On Thu, 22 Aug 2024, Christian Brauner wrote:
> Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/inode.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index d18e1567c487..c8a5c63dc980 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -510,8 +510,7 @@ static void inode_unpin_lru_isolating(struct inode *inode)
>  	spin_lock(&inode->i_lock);
>  	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
>  	inode->i_state &= ~I_LRU_ISOLATING;
> -	smp_mb();
> -	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
> +	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
>  	spin_unlock(&inode->i_lock);
>  }
>  
> @@ -519,13 +518,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
>  {
>  	lockdep_assert_held(&inode->i_lock);
>  	if (inode->i_state & I_LRU_ISOLATING) {
> -		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
> -		wait_queue_head_t *wqh;
> -
> -		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
> -		spin_unlock(&inode->i_lock);
> -		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
> -		spin_lock(&inode->i_lock);
> +		struct wait_bit_queue_entry wqe;
> +		struct wait_queue_head *wq_head;
> +
> +		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
> +		for (;;) {
> +			prepare_to_wait_event(wq_head, &wqe.wq_entry,
> +					      TASK_UNINTERRUPTIBLE);
> +			if (inode->i_state & I_LRU_ISOLATING) {
> +				spin_unlock(&inode->i_lock);
> +				schedule();
> +				spin_lock(&inode->i_lock);
> +				continue;
> +			}
> +			break;
> +		}
> +		finish_wait(wq_head, &wqe.wq_entry);

I would really like to add

  wait_var_event_locked(variable, condition, spinlock)

so that the above would be one or two lines.

 #define wait_var_event_locked(var, condition, lock) \
    do { \
	might_sleep(); \
	if (condition) \
		break; \
	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, \
			  0, 0, \
			  spin_unlock(lock); schedule(); spin_lock(lock)); \
    } while (0)

That can happen after your series lands though.

The wake_up here doesn't need a memory barrier either.

NeilBrown


>  		WARN_ON(inode->i_state & I_LRU_ISOLATING);
>  	}
>  }
> 
> -- 
> 2.43.0
> 
>
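
With such a macro, the open-coded wait above would collapse to something
like this (hypothetical, sketched against the proposed
wait_var_event_locked() and the per-bit wait address this series queues
wakers on):

	/* hypothetical one-liner replacing the prepare/schedule loop */
	wait_var_event_locked(inode_state_wait_address(inode, __I_LRU_ISOLATING),
			      !(inode->i_state & I_LRU_ISOLATING),
			      &inode->i_lock);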
Linus Torvalds Aug. 23, 2024, 2:24 a.m. UTC | #7
On Fri, 23 Aug 2024 at 08:37, NeilBrown <neilb@suse.de> wrote:
>
> I would really like to add
>
>   wait_var_event_locked(variable, condition, spinlock)
>
> so that the above would be one or two lines.

We don't even have that for the regular wait_event, but we do have

  wait_event_interruptible_locked
  wait_event_interruptible_locked_irq

but they actually only work on the wait-queue lock, not on some
generic "use this lock".

Honestly, I like your version much better, but having two *very*
different models for what "locked" means looks wrong.

The wait-queue lock (our current "locked" event version) is a rather
important special case, though, and takes advantage of holding the
waitqueue lock by then using __add_wait_queue_entry_tail() without
locking.

"You are in a maze of twisty little functions, all alike".

              Linus
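
For comparison, the existing "locked" variants assume the caller holds the
wait queue head's own lock; a minimal usage sketch:

	/* "locked" today: the waitqueue's internal ->lock is held */
	spin_lock(&wq_head.lock);
	err = wait_event_interruptible_locked(wq_head, done);
	spin_unlock(&wq_head.lock);

whereas the proposed wait_var_event_locked() would take an arbitrary
caller-supplied spinlock, such as inode->i_lock here.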
Christian Brauner Aug. 23, 2024, 8:29 a.m. UTC | #8
On Fri, Aug 23, 2024 at 10:24:57AM GMT, Linus Torvalds wrote:
> On Fri, 23 Aug 2024 at 08:37, NeilBrown <neilb@suse.de> wrote:
> >
> > I would really like to add
> >
> >   wait_var_event_locked(variable, condition, spinlock)
> >
> > so that the above would be one or two lines.
> 
> We don't even have that for the regular wait_event, but we do have
> 
>   wait_event_interruptible_locked
>   wait_event_interruptible_locked_irq
> 
> but they actually only work on the wait-queue lock, not on some
> generic "use this lock".
> 
> Honestly, I like your version much better, but having two *very*
> different models for what "locked" means looks wrong.
> 
> The wait-queue lock (our current "locked" event version) is a rather
> important special case, though, and takes advantage of holding the
> waitqueue lock by then using __add_wait_queue_entry_tail() without
> locking.
> 
> "You are in a maze of twisty little functions, all alike".

"You could've used a little more cowbell^wunderscores."

__fput()
____fput()

__wait_var_event()
___wait_var_event()

https://www.youtube.com/watch?v=cVsQLlk-T0s

Patch

diff --git a/fs/inode.c b/fs/inode.c
index d18e1567c487..c8a5c63dc980 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -510,8 +510,7 @@  static void inode_unpin_lru_isolating(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
 	inode->i_state &= ~I_LRU_ISOLATING;
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
+	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -519,13 +518,22 @@  static void inode_wait_for_lru_isolating(struct inode *inode)
 {
 	lockdep_assert_held(&inode->i_lock);
 	if (inode->i_state & I_LRU_ISOLATING) {
-		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
-		wait_queue_head_t *wqh;
-
-		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
-		spin_unlock(&inode->i_lock);
-		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode->i_lock);
+		struct wait_bit_queue_entry wqe;
+		struct wait_queue_head *wq_head;
+
+		wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
+		for (;;) {
+			prepare_to_wait_event(wq_head, &wqe.wq_entry,
+					      TASK_UNINTERRUPTIBLE);
+			if (inode->i_state & I_LRU_ISOLATING) {
+				spin_unlock(&inode->i_lock);
+				schedule();
+				spin_lock(&inode->i_lock);
+				continue;
+			}
+			break;
+		}
+		finish_wait(wq_head, &wqe.wq_entry);
 		WARN_ON(inode->i_state & I_LRU_ISOLATING);
 	}
 }