
[6/6] gfs2: Fix mmap + page fault deadlocks (part 2)

Message ID 20210520122536.1596602-7-agruenba@redhat.com (mailing list archive)
State: New, archived
Series: gfs2: Handle page faults during read and write

Commit Message

Andreas Gruenbacher May 20, 2021, 12:25 p.m. UTC
Now that we handle self-recursion on the inode glock in gfs2_fault and
gfs2_page_mkwrite, we need to take care of more complex deadlock
scenarios like the following (example by Jan Kara):

Two independent processes P1, P2. Two files F1, F2, and two mappings M1,
M2 where M1 is a mapping of F1, M2 is a mapping of F2. Now P1 does DIO
to F1 with M2 as a buffer, P2 does DIO to F2 with M1 as a buffer. They
can race like:

P1                                      P2
read()                                  read()
  gfs2_file_read_iter()                   gfs2_file_read_iter()
    gfs2_file_direct_read()                 gfs2_file_direct_read()
      locks glock of F1                       locks glock of F2
      iomap_dio_rw()                          iomap_dio_rw()
        bio_iov_iter_get_pages()                bio_iov_iter_get_pages()
          <fault in M2>                           <fault in M1>
            gfs2_fault()                            gfs2_fault()
              tries to grab glock of F2               tries to grab glock of F1

Those kinds of scenarios are much harder to reproduce than
self-recursion.
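
A minimal userspace sketch of this scenario (paths, sizes, and the read
length are placeholder assumptions; run one instance as P1 and a second
as P2 with the file arguments swapped):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        size_t len = 1 << 20;   /* 1 MiB; both files assumed at least this big */

        if (argc != 3) {
                fprintf(stderr, "usage: %s <dio-file> <mmap-file>\n", argv[0]);
                return 1;
        }
        int dio_fd = open(argv[1], O_RDONLY | O_DIRECT);
        int map_fd = open(argv[2], O_RDWR);
        if (dio_fd < 0 || map_fd < 0) {
                perror("open");
                return 1;
        }
        /* Map the other file; deliberately do not fault the pages in. */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                         map_fd, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /*
         * The direct read takes the glock of <dio-file>; pinning buf in
         * bio_iov_iter_get_pages() then faults into gfs2_fault(), which
         * needs the glock of <mmap-file>.
         */
        if (pread(dio_fd, buf, len, 0) < 0)
                perror("pread");
        return 0;
}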

We deal with such situations by using the LM_FLAG_OUTER flag to mark
"outer" glock taking.  Then, when taking an "inner" glock, we use the
LM_FLAG_TRY flag so that locking attempts that don't immediately succeed
will be aborted.  In case of a failed locking attempt, we "unroll" to
where the "outer" glock was taken, drop the "outer" glock, and fault in
the first offending user page.  This will re-trigger the "inner" locking
attempt but without the LM_FLAG_TRY flag.  Once that has happened, we
re-acquire the "outer" glock and retry the original operation.
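
Condensed from the gfs2_file_direct_read() hunk below, the resulting
pattern looks like this (holder setup and error details trimmed):

        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
retry:
        ret = gfs2_glock_nq(gh);                /* take the "outer" glock */
        if (ret)
                goto out_uninit;
        ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
        gfs2_glock_dq(gh);                      /* drop the "outer" glock */
        if (unlikely(current_needs_retry())) {  /* an "inner" try-lock failed */
                set_current_needs_retry(false);
                /* Fault the page in with no glock held, then start over. */
                if (ret == -EFAULT &&
                    !iov_iter_fault_in_writeable(to, PAGE_SIZE))
                        goto retry;
        }
out_uninit:
        gfs2_holder_uninit(gh);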

Reported-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/bmap.c |  3 ++-
 fs/gfs2/file.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 7 deletions(-)

Comments

Jan Kara May 20, 2021, 1:30 p.m. UTC | #1
On Thu 20-05-21 14:25:36, Andreas Gruenbacher wrote:

...

> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 7d88abb4629b..8b26893f8dc6 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -431,21 +431,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
>  	vm_fault_t ret = VM_FAULT_LOCKED;
>  	struct gfs2_holder gh;
>  	unsigned int length;
> +	u16 flags = 0;
>  	loff_t size;
>  	int err;
>  
>  	sb_start_pagefault(inode->i_sb);
>  
> -	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
> +	if (current_holds_glock())
> +		flags |= LM_FLAG_TRY;
> +
> +	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, flags, &gh);
>  	if (likely(!outer_gh)) {
>  		err = gfs2_glock_nq(&gh);
>  		if (err) {
>  			ret = block_page_mkwrite_return(err);
> +			if (err == GLR_TRYFAILED) {
> +				set_current_needs_retry(true);
> +				ret = VM_FAULT_SIGBUS;
> +			}

I've checked to make sure but do_user_addr_fault() indeed calls do_sigbus()
which raises the SIGBUS signal. So if the application does not ignore
SIGBUS, your retry will be visible to the application and can cause all
sorts of interesting results... So you probably need to add a new VM_FAULT_
return code that will behave like VM_FAULT_SIGBUS except it will not raise
the signal.
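
For illustration, a SIGBUS raised from a failed page fault is readily
observable in userspace; with the patch as posted, a program like the
following generic demo (mmap-beyond-EOF, not gfs2-specific) could
likewise see a spurious SIGBUS from an aborted try-lock:

#include <fcntl.h>
#include <signal.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_sigbus(int sig)
{
        static const char msg[] = "SIGBUS delivered to the application\n";

        (void)sig;
        write(STDERR_FILENO, msg, sizeof(msg) - 1);  /* async-signal-safe */
        _exit(1);
}

int main(void)
{
        signal(SIGBUS, on_sigbus);
        int fd = open("/tmp/sigbus-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, 4096) != 0)
                return 1;
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
                       fd, 0);
        if (p == MAP_FAILED)
                return 1;
        ftruncate(fd, 0);  /* the mapped page is now beyond EOF */
        p[0] = 1;          /* fault fails -> VM_FAULT_SIGBUS -> SIGBUS */
        return 0;
}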

Otherwise it seems to me your approach should work.

								Honza
Andreas Gruenbacher May 20, 2021, 2:07 p.m. UTC | #2
On Thu, May 20, 2021 at 3:30 PM Jan Kara <jack@suse.cz> wrote:
> I've checked to make sure but do_user_addr_fault() indeed calls do_sigbus()
> which raises the SIGBUS signal. So if the application does not ignore
> SIGBUS, your retry will be visible to the application and can cause all
> sorts of interesting results...

I would have noticed that, but no SIGBUS signals were actually
delivered. So we probably end up in kernelmode_fixup_or_oops() when in
kernel mode, which just does nothing in that case.

Andy Lutomirski, you've been involved with this, could you please shed
some light?

> So you probably need to add a new VM_FAULT_
> return code that will behave like VM_FAULT_SIGBUS except it will not raise
> the signal.

A new VM_FAULT_* flag might make the code easier to read, but I don't
know if we can have one.

> Otherwise it seems to me your approach should work.

Thanks a lot,
Andreas
Jan Kara May 21, 2021, 3:23 p.m. UTC | #3
On Thu 20-05-21 16:07:56, Andreas Gruenbacher wrote:
> On Thu, May 20, 2021 at 3:30 PM Jan Kara <jack@suse.cz> wrote:
> > I've checked to make sure but do_user_addr_fault() indeed calls do_sigbus()
> > which raises the SIGBUS signal. So if the application does not ignore
> > SIGBUS, your retry will be visible to the application and can cause all
> > sorts of interesting results...
> 
> I would have noticed that, but no SIGBUS signals were actually
> delivered. So we probably end up in kernelmode_fixup_or_oops() when in
> kernel mode, which just does nothing in that case.

Hum, but how would we get there? I don't think fatal_signal_pending() would
return true yet...

> > So you probably need to add a new VM_FAULT_
> > return code that will behave like VM_FAULT_SIGBUS except it will not raise
> > the signal.
> 
> A new VM_FAULT_* flag might make the code easier to read, but I don't
> know if we can have one.

Well, this is kernel-internal API and there's still plenty of space in
vm_fault_reason.
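
A sketch of what that could look like, against the v5.13-era enum in
include/linux/mm_types.h (the flag name below is made up):

enum vm_fault_reason {
        /* ... existing codes ... */
        VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
        /* like VM_FAULT_SIGBUS, but without raising the signal */
        VM_FAULT_NOSIGBUS       = (__force vm_fault_t)0x004000,
};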
								Honza
Andreas Gruenbacher May 21, 2021, 3:46 p.m. UTC | #4
On Fri, May 21, 2021 at 5:23 PM Jan Kara <jack@suse.cz> wrote:
> On Thu 20-05-21 16:07:56, Andreas Gruenbacher wrote:
> > On Thu, May 20, 2021 at 3:30 PM Jan Kara <jack@suse.cz> wrote:
> > > I've checked to make sure but do_user_addr_fault() indeed calls do_sigbus()
> > > which raises the SIGBUS signal. So if the application does not ignore
> > > SIGBUS, your retry will be visible to the application and can cause all
> > > sorts of interesting results...
> >
> > I would have noticed that, but no SIGBUS signals were actually
> > delivered. So we probably end up in kernelmode_fixup_or_oops() when in
> > kernel mode, which just does nothing in that case.
>
> Hum, but how would we get there? I don't think fatal_signal_pending() would
> return true yet...

Hmm, right ...

> > > So you probably need to add a new VM_FAULT_
> > > return code that will behave like VM_FAULT_SIGBUS except it will not raise
> > > the signal.
> >
> > A new VM_FAULT_* flag might make the code easier to read, but I don't
> > know if we can have one.
>
> Well, this is kernel-internal API and there's still plenty of space in
> vm_fault_reason.

That's in the context of the page fault. The other issue is how to
propagate that out through iov_iter_fault_in_readable ->
fault_in_pages_readable -> __get_user, for example. I don't think
there's much of a chance to get an additional error code out of
__get_user and __put_user.
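
Simplified to a single page, that path boils down to something like this
(the helper name is illustrative, not the actual pagemap.h code):

static inline int fault_in_page_readable(const char __user *uaddr)
{
        char c;

        /*
         * __get_user() only ever reports 0 or -EFAULT, so whatever
         * distinct VM_FAULT_* code gfs2_fault() returned has been
         * collapsed to -EFAULT by the time it reaches the caller.
         */
        return __get_user(c, uaddr);
}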

Thanks,
Andreas
Jan Kara May 24, 2021, 8:39 a.m. UTC | #5
On Fri 21-05-21 17:46:04, Andreas Gruenbacher wrote:
> On Fri, May 21, 2021 at 5:23 PM Jan Kara <jack@suse.cz> wrote:
> > On Thu 20-05-21 16:07:56, Andreas Gruenbacher wrote:
> > > > So you probably need to add a new VM_FAULT_
> > > > return code that will behave like VM_FAULT_SIGBUS except it will not raise
> > > > the signal.
> > >
> > > A new VM_FAULT_* flag might make the code easier to read, but I don't
> > > know if we can have one.
> >
> > Well, this is kernel-internal API and there's still plenty of space in
> > vm_fault_reason.
> 
> That's in the context of the page fault. The other issue is how to
> propagate that out through iov_iter_fault_in_readable ->
> fault_in_pages_readable -> __get_user, for example. I don't think
> there's much of a chance to get an additional error code out of
> __get_user and __put_user.

Yes, at that level we'd get EFAULT as in any other case. Really the only
difference between the new VM_FAULT_ error code and a "standard" error
like VM_FAULT_SIGBUS would be that no signal gets raised.

								Honza

Patch

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 2ff501c413f4..82e4506984e3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -967,7 +967,8 @@  static int gfs2_write_lock(struct inode *inode)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_OUTER,
+			 &ip->i_gh);
 	error = gfs2_glock_nq(&ip->i_gh);
 	if (error)
 		goto out_uninit;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7d88abb4629b..8b26893f8dc6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -431,21 +431,30 @@  static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
 	vm_fault_t ret = VM_FAULT_LOCKED;
 	struct gfs2_holder gh;
 	unsigned int length;
+	u16 flags = 0;
 	loff_t size;
 	int err;
 
 	sb_start_pagefault(inode->i_sb);
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (current_holds_glock())
+		flags |= LM_FLAG_TRY;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, flags, &gh);
 	if (likely(!outer_gh)) {
 		err = gfs2_glock_nq(&gh);
 		if (err) {
 			ret = block_page_mkwrite_return(err);
+			if (err == GLR_TRYFAILED) {
+				set_current_needs_retry(true);
+				ret = VM_FAULT_SIGBUS;
+			}
 			goto out_uninit;
 		}
 	} else {
 		if (!gfs2_holder_is_compatible(outer_gh, LM_ST_EXCLUSIVE)) {
 			/* We could try to upgrade outer_gh here. */
+			set_current_needs_retry(true);
 			ret = VM_FAULT_SIGBUS;
 			goto out_uninit;
 		}
@@ -568,20 +577,28 @@  static vm_fault_t gfs2_fault(struct vm_fault *vmf)
 	struct gfs2_holder *outer_gh = gfs2_glock_is_locked_by_me(ip->i_gl);
 	struct gfs2_holder gh;
 	vm_fault_t ret;
-	u16 state;
+	u16 state, flags = 0;
 	int err;
 
+	if (current_holds_glock())
+		flags |= LM_FLAG_TRY;
+
 	state = (vmf->flags & FAULT_FLAG_WRITE) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	gfs2_holder_init(ip->i_gl, state, 0, &gh);
+	gfs2_holder_init(ip->i_gl, state, flags, &gh);
 	if (likely(!outer_gh)) {
 		err = gfs2_glock_nq(&gh);
 		if (err) {
 			ret = block_page_mkwrite_return(err);
+			if (err == GLR_TRYFAILED) {
+				set_current_needs_retry(true);
+				ret = VM_FAULT_SIGBUS;
+			}
 			goto out_uninit;
 		}
 	} else {
 		if (!gfs2_holder_is_compatible(outer_gh, state)) {
 			/* We could try to upgrade outer_gh here. */
+			set_current_needs_retry(true);
 			ret = VM_FAULT_SIGBUS;
 			goto out_uninit;
 		}
@@ -807,13 +824,21 @@  static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	if (!count)
 		return 0; /* skip atime */
 
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
 
 	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT) {
+			if (!iov_iter_fault_in_writeable(to, PAGE_SIZE))
+				goto retry;
+		}
+	}
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -837,7 +862,8 @@  static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	 * unfortunately, have the option of only flushing a range like the
 	 * VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, LM_FLAG_OUTER, gh);
+retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
@@ -851,6 +877,13 @@  static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 		ret = 0;
 out:
 	gfs2_glock_dq(gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT) {
+			if (!iov_iter_fault_in_readable(from, PAGE_SIZE))
+				goto retry;
+		}
+	}
 out_uninit:
 	gfs2_holder_uninit(gh);
 	return ret;
@@ -883,7 +916,8 @@  static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 	}
 	ip = GFS2_I(iocb->ki_filp->f_mapping->host);
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_OUTER, &gh);
+retry:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
@@ -891,6 +925,13 @@  static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (ret > 0)
 		written += ret;
 	gfs2_glock_dq(&gh);
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT) {
+			if (!iov_iter_fault_in_writeable(to, PAGE_SIZE))
+				goto retry;
+		}
+	}
 out_uninit:
 	gfs2_holder_uninit(&gh);
 	return written ? written : ret;
@@ -902,9 +943,18 @@  static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *fro
 	struct inode *inode = file_inode(file);
 	ssize_t ret;
 
+retry:
 	current->backing_dev_info = inode_to_bdi(inode);
 	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 	current->backing_dev_info = NULL;
+	if (unlikely(current_needs_retry())) {
+		set_current_needs_retry(false);
+		if (ret == -EFAULT) {
+			if (!iov_iter_fault_in_readable(from, PAGE_SIZE))
+				goto retry;
+		}
+	}
+
 	return ret;
 }