diff mbox series

sysv: convert pointers_lock from rw_lock to rw_sem

Message ID 6fcbdc89-6aff-064b-a040-0966152856e0@I-love.SAKURA.ne.jp (mailing list archive)
State Mainlined, archived
Headers show
Series sysv: convert pointers_lock from rw_lock to rw_sem | expand

Commit Message

Tetsuo Handa March 26, 2023, 10:24 p.m. UTC
syzbot is reporting that __getblk_gfp() cannot be called from atomic
context. Fix this problem by converting pointers_lock from rw_lock to
rw_sem.

Reported-by: syzbot <syzbot+69b40dc5fd40f32c199f@syzkaller.appspotmail.com>
Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+69b40dc5fd40f32c199f@syzkaller.appspotmail.com>
---
 fs/sysv/itree.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

Comments

Al Viro March 27, 2023, 12:04 a.m. UTC | #1
On Mon, Mar 27, 2023 at 07:24:25AM +0900, Tetsuo Handa wrote:
> syzbot is reporting that __getblk_gfp() cannot be called from atomic
> context. Fix this problem by converting pointers_lock from rw_lock to
> rw_sem.
> 
> Reported-by: syzbot <syzbot+69b40dc5fd40f32c199f@syzkaller.appspotmail.com>
> Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Tested-by: syzbot <syzbot+69b40dc5fd40f32c199f@syzkaller.appspotmail.com>

Hmm...  The bug is real, all right (introduced back in 2002 during the
conversion away from BKL;
commit 3ba77f860fa7f359660e9d498a5b35940021cfba
Author: Christoph Hellwig <hch@sb.bsdonline.org>
Date:   Thu Apr 4 00:25:37 2002 +0200

    Replace BKL for chain locking with sysvfs-private rwlock.
is where it had happened).

However, I don't think this is the right fix.  Note that the problem is
with get_branch() done under the rwlock; all other places are safe.  But
in get_branch() we only need the lock (and only shared, at that) around
the verify+add pair.  See how it's done in fs/minix/itree_common.c...
Something like this:

diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index b22764fe669c..cfa281fb6578 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -104,15 +104,18 @@ static Indirect *get_branch(struct inode *inode,
 		bh = sb_bread(sb, block);
 		if (!bh)
 			goto failure;
+		read_lock(&pointers_lock);
 		if (!verify_chain(chain, p))
 			goto changed;
 		add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+		read_unlock(&pointers_lock);
 		if (!p->key)
 			goto no_block;
 	}
 	return NULL;
 
 changed:
+	read_unlock(&pointers_lock);
 	brelse(bh);
 	*err = -EAGAIN;
 	goto no_block;
@@ -214,9 +217,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
 		goto out;
 
 reread:
-	read_lock(&pointers_lock);
 	partial = get_branch(inode, depth, offsets, chain, &err);
-	read_unlock(&pointers_lock);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -287,10 +288,11 @@ static Indirect *find_shared(struct inode *inode,
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 
-	write_lock(&pointers_lock);
 	partial = get_branch(inode, k, offsets, chain, &err);
 	if (!partial)
 		partial = chain + k-1;
+
+	write_lock(&pointers_lock);
 	/*
 	 * If the branch acquired continuation since we've looked at it -
 	 * fine, it should all survive and (new) top doesn't belong to us.
Tetsuo Handa March 27, 2023, 10:19 a.m. UTC | #2
On 2023/03/27 9:04, Al Viro wrote:
> However, I don't think this is the right fix.  Note that the problem is
> with get_branch() done under the rwlock; all other places are safe.  But
> in get_branch() we only need the lock (and only shared, at that) around
> the verify+add pair.  See how it's done in fs/minix/itree_common.c...
> Something like this:

I can see fs/minix/itree_common.c is doing similar things. But since I'm not
familiar with filesystems, I can't be convinced that minix's assumption is safe.

>  static Indirect *find_shared(struct inode *inode,
>                                 int depth,
>                                 int offsets[],
>                                 Indirect chain[],
>                                 sysv_zone_t *top)
>  {
>         Indirect *partial, *p;
>         int k, err;
> 
>         *top = 0;
>         for (k = depth; k > 1 && !offsets[k-1]; k--)
>                 ;
> 
> -       write_lock(&pointers_lock);
>         partial = get_branch(inode, k, offsets, chain, &err);

Does the result of verify_chain() from get_branch() remain valid even after
sleep by preemption at this line made get_branch() to execute "*err = -EAGAIN;"
line rather than "return NULL;" line?

>         if (!partial)
>                 partial = chain + k-1;

Can sleep by preemption at this line or

> +
> +       write_lock(&pointers_lock);
>         /*
>          * If the branch acquired continuation since we've looked at it -
>          * fine, it should all survive and (new) top doesn't belong to us.
>          */
>         if (!partial->key && *partial->p) {
>                 write_unlock(&pointers_lock);

at this line change the result of "!partial->key && *partial->p" test above?

>                 goto no_top;
>         }
>         for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
>                 ;
>         /*
>          * OK, we've found the last block that must survive. The rest of our
>          * branch should be detached before unlocking. However, if that rest
>          * of branch is all ours and does not grow immediately from the inode
>          * it's easier to cheat and just decrement partial->p.
>          */
>         if (p == chain + k - 1 && p > chain) {
>                 p->p--;
>         } else {
>                 *top = *p->p;
>                 *p->p = 0;
>         }
>         write_unlock(&pointers_lock);
>  
>         while (partial > p) {
>                 brelse(partial->bh);
>                 partial--;
>         }
>  no_top:
>         return partial;
>  }

I feel worried about

	/*
	 * Indirect block might be removed by truncate while we were
	 * reading it. Handling of that case (forget what we've got and
	 * reread) is taken out of the main path.
	 */
	if (err == -EAGAIN)
		goto changed;

in get_block()...
Al Viro March 27, 2023, 1:02 p.m. UTC | #3
On Mon, Mar 27, 2023 at 07:19:47PM +0900, Tetsuo Handa wrote:

> I feel worried about
> 
> 	/*
> 	 * Indirect block might be removed by truncate while we were
> 	 * reading it. Handling of that case (forget what we've got and
> 	 * reread) is taken out of the main path.
> 	 */
> 	if (err == -EAGAIN)
> 		goto changed;
> 
> in get_block()...

Look at the caller of find_shared(); there won't be other truncate messing
up with the indirect blocks.  sysv_truncate() is called by sysv_write_failed()
(from ->write_begin()), sysv_setattr() (->setattr(), with ATTR_SIZE) and
sysv_evict_inode() (if there's no links left to on-disk inode; it's an
->evict_inode() instance, so we are dropping the last reference to in-core one).

The first two have i_mutex held by callers, serializing them against each
other, and both have the in-core inode pinned, which gives exclusion with
the third one...

IOW, sysv_truncate() (as well as its minixfs counterpart) relies upon having
serialization wrt other callers of sysv_truncate().
Tetsuo Handa March 27, 2023, 1:09 p.m. UTC | #4
On 2023/03/27 22:02, Al Viro wrote:
> IOW, sysv_truncate() (as well as its minixfs counterpart) relies upon having
> serialization wrt other callers of sysv_truncate().

I see. Please submit a patch.
diff mbox series

Patch

diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index b22764fe669c..513b20e30afd 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -62,7 +62,7 @@  typedef struct {
 	struct buffer_head *bh;
 } Indirect;
 
-static DEFINE_RWLOCK(pointers_lock);
+static DECLARE_RWSEM(pointers_lock);
 
 static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v)
 {
@@ -83,7 +83,7 @@  static inline sysv_zone_t *block_end(struct buffer_head *bh)
 }
 
 /*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
+ * Requires down_read(&pointers_lock) or down_write(&pointers_lock)
  */
 static Indirect *get_branch(struct inode *inode,
 			    int depth,
@@ -173,11 +173,11 @@  static inline int splice_branch(struct inode *inode,
 	int i;
 
 	/* Verify that place we are splicing to is still there and vacant */
-	write_lock(&pointers_lock);
+	down_write(&pointers_lock);
 	if (!verify_chain(chain, where-1) || *where->p)
 		goto changed;
 	*where->p = where->key;
-	write_unlock(&pointers_lock);
+	up_write(&pointers_lock);
 
 	inode->i_ctime = current_time(inode);
 
@@ -192,7 +192,7 @@  static inline int splice_branch(struct inode *inode,
 	return 0;
 
 changed:
-	write_unlock(&pointers_lock);
+	up_write(&pointers_lock);
 	for (i = 1; i < num; i++)
 		bforget(where[i].bh);
 	for (i = 0; i < num; i++)
@@ -214,9 +214,9 @@  static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
 		goto out;
 
 reread:
-	read_lock(&pointers_lock);
+	down_read(&pointers_lock);
 	partial = get_branch(inode, depth, offsets, chain, &err);
-	read_unlock(&pointers_lock);
+	up_read(&pointers_lock);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -287,7 +287,7 @@  static Indirect *find_shared(struct inode *inode,
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
 
-	write_lock(&pointers_lock);
+	down_write(&pointers_lock);
 	partial = get_branch(inode, k, offsets, chain, &err);
 	if (!partial)
 		partial = chain + k-1;
@@ -296,7 +296,7 @@  static Indirect *find_shared(struct inode *inode,
 	 * fine, it should all survive and (new) top doesn't belong to us.
 	 */
 	if (!partial->key && *partial->p) {
-		write_unlock(&pointers_lock);
+		up_write(&pointers_lock);
 		goto no_top;
 	}
 	for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
@@ -313,7 +313,7 @@  static Indirect *find_shared(struct inode *inode,
 		*top = *p->p;
 		*p->p = 0;
 	}
-	write_unlock(&pointers_lock);
+	up_write(&pointers_lock);
 
 	while (partial > p) {
 		brelse(partial->bh);