diff mbox series

ceph: improve error handling and short/overflow-read logic in __ceph_sync_read()

Message ID 20241209114359.1309965-1-amarkuze@redhat.com (mailing list archive)
State New
Headers show
Series ceph: improve error handling and short/overflow-read logic in __ceph_sync_read() | expand

Commit Message

Alex Markuze Dec. 9, 2024, 11:43 a.m. UTC
This patch refines the read logic in __ceph_sync_read() to ensure more
predictable and efficient behavior in various edge cases.

- Return early if the requested read length is zero or if the file size
  (`i_size`) is zero.
- Initialize the index variable (`idx`) where needed and reorder some
  code to ensure it is always set before use.
- Improve error handling by checking for negative return values earlier.
- Remove redundant encrypted file checks after failures. Only attempt
  filesystem-level decryption if the read succeeded.
- Simplify leftover calculations to correctly handle cases where the read
  extends beyond the end of the file or stops short.
- This resolves multiple issues caused by integer overflow
  - https://tracker.ceph.com/issues/67524
  - https://tracker.ceph.com/issues/68981
  - https://tracker.ceph.com/issues/68980

Signed-off-by: Alex Markuze <amarkuze@redhat.com>
---
 fs/ceph/file.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

Comments

Viacheslav Dubeyko Dec. 10, 2024, 6:13 p.m. UTC | #1
On Mon, 2024-12-09 at 11:43 +0000, Alex Markuze wrote:
> This patch refines the read logic in __ceph_sync_read() to ensure
> more
> predictable and efficient behavior in various edge cases.
> 
> - Return early if the requested read length is zero or if the file
> size
>   (`i_size`) is zero.
> - Initialize the index variable (`idx`) where needed and reorder some
>   code to ensure it is always set before use.
> - Improve error handling by checking for negative return values
> earlier.
> - Remove redundant encrypted file checks after failures. Only attempt
>   filesystem-level decryption if the read succeeded.
> - Simplify leftover calculations to correctly handle cases where the
> read
>   extends beyond the end of the file or stops short.
> - This resolves multiple issues caused by integer overflow
>   -
> https://tracker.ceph.com/issues/67524 
>  
>   -
> https://tracker.ceph.com/issues/68981 
>  
>   -
> https://tracker.ceph.com/issues/68980 
>  
> 
> Signed-off-by: Alex Markuze <amarkuze@redhat.com>
> ---
>  fs/ceph/file.c | 29 ++++++++++++++---------------
>  1 file changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index ce342a5d4b8b..8e0400d461a2 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  	if (ceph_inode_is_shutdown(inode))
>  		return -EIO;
>  
> -	if (!len)
> +	if (!len || !i_size)
>  		return 0;
>  	/*
>  	 * flush any page cache pages in this range.  this
> @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  		int num_pages;
>  		size_t page_off;
>  		bool more;
> -		int idx;
> +		int idx = 0;
>  		size_t left;
>  		struct ceph_osd_req_op *op;
>  		u64 read_off = off;
> @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  		else if (ret == -ENOENT)
>  			ret = 0;
>  
> -		if (ret > 0 && IS_ENCRYPTED(inode)) {

The whole function contains multiple places of checking ret > 0
condition. Frankly speaking, it looks very weird. It is clear that it
is effort to distinguish normal and erroneous execution flow. But, for
my taste, it could be a ground for bugs. I have feelings that
__ceph_sync_read() logic requires refactoring:

if (ret < 0) {
   <report error and stop execution>
} else {
   <continue normal execution flow>
}

What do you think?

> +		if (ret < 0) {
> +			ceph_osdc_put_request(req);
> +			if (ret == -EBLOCKLISTED)
> +				fsc->blocklisted = true;
> +			break;
> +		}
> +
> +		if (IS_ENCRYPTED(inode)) {
>  			int fret;
>  
>  			fret = ceph_fscrypt_decrypt_extents(inode,
> pages,
> @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  		}
>  
>  		/* Short read but not EOF? Zero out the remainder.
> */
> -		if (ret >= 0 && ret < len && (off + ret < i_size)) {
> +		if (ret < len && (off + ret < i_size)) {
>  			int zlen = min(len - ret, i_size - off -
> ret);
>  			int zoff = page_off + ret;
>  
> @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  			ret += zlen;
>  		}
>  
> -		idx = 0;
> -		if (ret <= 0)
> -			left = 0;
> -		else if (off + ret > i_size)
> -			left = i_size - off;
> +		if (off + ret > i_size)
> +			left = (i_size > off) ? i_size - off : 0;
>  		else
>  			left = ret;
> +
>  		while (left > 0) {
>  			size_t plen, copied;
>  
> @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode,
> loff_t *ki_pos,
>  
>  		ceph_osdc_put_request(req);
>  
> -		if (ret < 0) {
> -			if (ret == -EBLOCKLISTED)
> -				fsc->blocklisted = true;
> -			break;
> -		}
> -
>  		if (off >= i_size || !more)
>  			break;
>  	}

Mostly, cleanup looks good. But I have feelings that this function
requires
more refactoring efforts.

Thanks,
Slava.
Alex Markuze Dec. 10, 2024, 7:12 p.m. UTC | #2
The main goal of this patch is to solve erroneous read sizes and overflows.
The convoluted 'if else' chain is a recipe for disaster. Currently,
exec stops immediately on first ret that indicates an error.
If you have additional refactoring thoughts feel free to add more
patches, This is mainly a bug fix, that solves both the immediate
overflow bug and attempts to make this code more manageable to
mitigate future bugs.

On Tue, Dec 10, 2024 at 8:13 PM Viacheslav Dubeyko
<Slava.Dubeyko@ibm.com> wrote:
>
> On Mon, 2024-12-09 at 11:43 +0000, Alex Markuze wrote:
> > This patch refines the read logic in __ceph_sync_read() to ensure
> > more
> > predictable and efficient behavior in various edge cases.
> >
> > - Return early if the requested read length is zero or if the file
> > size
> >   (`i_size`) is zero.
> > - Initialize the index variable (`idx`) where needed and reorder some
> >   code to ensure it is always set before use.
> > - Improve error handling by checking for negative return values
> > earlier.
> > - Remove redundant encrypted file checks after failures. Only attempt
> >   filesystem-level decryption if the read succeeded.
> > - Simplify leftover calculations to correctly handle cases where the
> > read
> >   extends beyond the end of the file or stops short.
> > - This resolves multiple issues caused by integer overflow
> >   -
> > https://tracker.ceph.com/issues/67524
> >
> >   -
> > https://tracker.ceph.com/issues/68981
> >
> >   -
> > https://tracker.ceph.com/issues/68980
> >
> >
> > Signed-off-by: Alex Markuze <amarkuze@redhat.com>
> > ---
> >  fs/ceph/file.c | 29 ++++++++++++++---------------
> >  1 file changed, 14 insertions(+), 15 deletions(-)
> >
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index ce342a5d4b8b..8e0400d461a2 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >       if (ceph_inode_is_shutdown(inode))
> >               return -EIO;
> >
> > -     if (!len)
> > +     if (!len || !i_size)
> >               return 0;
> >       /*
> >        * flush any page cache pages in this range.  this
> > @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >               int num_pages;
> >               size_t page_off;
> >               bool more;
> > -             int idx;
> > +             int idx = 0;
> >               size_t left;
> >               struct ceph_osd_req_op *op;
> >               u64 read_off = off;
> > @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >               else if (ret == -ENOENT)
> >                       ret = 0;
> >
> > -             if (ret > 0 && IS_ENCRYPTED(inode)) {
>
> The whole function contains multiple places of checking ret > 0
> condition. Frankly speaking, it looks very weird. It is clear that it
> is effort to distinguish normal and erroneous execution flow. But, for
> my taste, it could be a ground for bugs. I have feelings that
> __ceph_sync_read() logic requires refactoring:
>
> if (ret < 0) {
>    <report error and stop execution>
> } else {
>    <continue normal execution flow>
> }
>
> What do you think?
>
> > +             if (ret < 0) {
> > +                     ceph_osdc_put_request(req);
> > +                     if (ret == -EBLOCKLISTED)
> > +                             fsc->blocklisted = true;
> > +                     break;
> > +             }
> > +
> > +             if (IS_ENCRYPTED(inode)) {
> >                       int fret;
> >
> >                       fret = ceph_fscrypt_decrypt_extents(inode,
> > pages,
> > @@ -1187,7 +1194,7 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >               }
> >
> >               /* Short read but not EOF? Zero out the remainder.
> > */
> > -             if (ret >= 0 && ret < len && (off + ret < i_size)) {
> > +             if (ret < len && (off + ret < i_size)) {
> >                       int zlen = min(len - ret, i_size - off -
> > ret);
> >                       int zoff = page_off + ret;
> >
> > @@ -1197,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >                       ret += zlen;
> >               }
> >
> > -             idx = 0;
> > -             if (ret <= 0)
> > -                     left = 0;
> > -             else if (off + ret > i_size)
> > -                     left = i_size - off;
> > +             if (off + ret > i_size)
> > +                     left = (i_size > off) ? i_size - off : 0;
> >               else
> >                       left = ret;
> > +
> >               while (left > 0) {
> >                       size_t plen, copied;
> >
> > @@ -1222,12 +1227,6 @@ ssize_t __ceph_sync_read(struct inode *inode,
> > loff_t *ki_pos,
> >
> >               ceph_osdc_put_request(req);
> >
> > -             if (ret < 0) {
> > -                     if (ret == -EBLOCKLISTED)
> > -                             fsc->blocklisted = true;
> > -                     break;
> > -             }
> > -
> >               if (off >= i_size || !more)
> >                       break;
> >       }
>
> Mostly, cleanup looks good. But I have feelings that this function
> requires
> more refactoring efforts.
>
> Thanks,
> Slava.
>
Viacheslav Dubeyko Dec. 10, 2024, 7:28 p.m. UTC | #3
On Tue, 2024-12-10 at 21:12 +0200, Alex Markuze wrote:
> The main goal of this patch is to solve erroneous read sizes and
> overflows.
> The convoluted 'if else' chain is a recipe for disaster. Currently,
> exec stops immediately on first ret that indicates an error.
> If you have additional refactoring thoughts feel free to add more
> patches, This is mainly a bug fix, that solves both the immediate
> overflow bug and attempts to make this code more manageable to
> mitigate future bugs.

I see your point. I simply see several cases of ret > 0 check:

https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1150
https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1158
https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1163 <-
your fix here
https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1192 <-
your fix here too
https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1236

And there are places to check ret for negative values:

https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1160
https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1226

These checks distributes in the function's code, it could be confusing
and, potentially, be the source of new bugs during modifications. I
simply have feelings that this logic somehow requires refactoring to
improve the execution flow. But if you would like not to do it, then I
am OK with it.

Thanks,
Slava.
Alex Markuze Dec. 11, 2024, 10:03 a.m. UTC | #4
I agree this function needs work, there is a major performance issue
in there as well. One step at a time.
Meanwhile I need this patch to be acked so It can move to the main
branch, as it fixes multiple bugs seen in production.

On Tue, Dec 10, 2024 at 9:28 PM Viacheslav Dubeyko
<Slava.Dubeyko@ibm.com> wrote:
>
> On Tue, 2024-12-10 at 21:12 +0200, Alex Markuze wrote:
> > The main goal of this patch is to solve erroneous read sizes and
> > overflows.
> > The convoluted 'if else' chain is a recipe for disaster. Currently,
> > exec stops immediately on first ret that indicates an error.
> > If you have additional refactoring thoughts feel free to add more
> > patches, This is mainly a bug fix, that solves both the immediate
> > overflow bug and attempts to make this code more manageable to
> > mitigate future bugs.
>
> I see your point. I simply see several cases of ret > 0 check:
>
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1150
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1158
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1163 <-
> your fix here
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1192 <-
> your fix here too
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1236
>
> And there are places to check ret for negative values:
>
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1160
> https://elixir.bootlin.com/linux/v6.12.3/source/fs/ceph/file.c#L1226
>
> These checks distributes in the function's code, it could be confusing
> and, potentially, be the source of new bugs during modifications. I
> simply have feelings that this logic somehow requires refactoring to
> improve the execution flow. But if you would like not to do it, then I
> am OK with it.
>
> Thanks,
> Slava.
>
>
Viacheslav Dubeyko Dec. 11, 2024, 10:17 p.m. UTC | #5
On Wed, 2024-12-11 at 12:03 +0200, Alex Markuze wrote:
> I agree this function needs work, there is a major performance issue
> in there as well. One step at a time.
> Meanwhile I need this patch to be acked so It can move to the main
> branch, as it fixes multiple bugs seen in production.
> 

Makes sense to me. The patch looks good.

Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>

Thanks,
Slava.
diff mbox series

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce342a5d4b8b..8e0400d461a2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1066,7 +1066,7 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 	if (ceph_inode_is_shutdown(inode))
 		return -EIO;
 
-	if (!len)
+	if (!len || !i_size)
 		return 0;
 	/*
 	 * flush any page cache pages in this range.  this
@@ -1086,7 +1086,7 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		int num_pages;
 		size_t page_off;
 		bool more;
-		int idx;
+		int idx = 0;
 		size_t left;
 		struct ceph_osd_req_op *op;
 		u64 read_off = off;
@@ -1160,7 +1160,14 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		else if (ret == -ENOENT)
 			ret = 0;
 
-		if (ret > 0 && IS_ENCRYPTED(inode)) {
+		if (ret < 0) {
+			ceph_osdc_put_request(req);
+			if (ret == -EBLOCKLISTED)
+				fsc->blocklisted = true;
+			break;
+		}
+
+		if (IS_ENCRYPTED(inode)) {
 			int fret;
 
 			fret = ceph_fscrypt_decrypt_extents(inode, pages,
@@ -1187,7 +1194,7 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		}
 
 		/* Short read but not EOF? Zero out the remainder. */
-		if (ret >= 0 && ret < len && (off + ret < i_size)) {
+		if (ret < len && (off + ret < i_size)) {
 			int zlen = min(len - ret, i_size - off - ret);
 			int zoff = page_off + ret;
 
@@ -1197,13 +1204,11 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 			ret += zlen;
 		}
 
-		idx = 0;
-		if (ret <= 0)
-			left = 0;
-		else if (off + ret > i_size)
-			left = i_size - off;
+		if (off + ret > i_size)
+			left = (i_size > off) ? i_size - off : 0;
 		else
 			left = ret;
+
 		while (left > 0) {
 			size_t plen, copied;
 
@@ -1222,12 +1227,6 @@  ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 
 		ceph_osdc_put_request(req);
 
-		if (ret < 0) {
-			if (ret == -EBLOCKLISTED)
-				fsc->blocklisted = true;
-			break;
-		}
-
 		if (off >= i_size || !more)
 			break;
 	}