[v4,3/7] iomap: support direct I/O with fscrypt using blk-crypto
diff mbox series

Message ID 20200720233739.824943-4-satyat@google.com
State New
Headers show
Series
  • add support for direct I/O with fscrypt using blk-crypto
Related show

Commit Message

Satya Tangirala July 20, 2020, 11:37 p.m. UTC
From: Eric Biggers <ebiggers@google.com>

Wire up iomap direct I/O with the fscrypt additions for direct I/O.
This allows ext4 to support direct I/O on encrypted files when inline
encryption is enabled.

This change consists of two parts:

- Set a bio_crypt_ctx on bios for encrypted files, so that the file
  contents get encrypted (or decrypted).

- Ensure that encryption data unit numbers (DUNs) are contiguous within
  each bio.  Use the new function fscrypt_limit_io_pages() for this,
  since the iomap code works directly with logical ranges and thus
  doesn't have a chance to call fscrypt_mergeable_bio() on each page.

Note that fscrypt_limit_io_pages() is normally a no-op, as normally the
DUNs simply increment along with the logical blocks.  But it's needed to
handle an edge case in one of the fscrypt IV generation methods.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Co-developed-by: Satya Tangirala <satyat@google.com>
Signed-off-by: Satya Tangirala <satyat@google.com>
---
 fs/iomap/direct-io.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

Comments

Jaegeuk Kim July 22, 2020, 5:06 p.m. UTC | #1
On 07/20, Satya Tangirala wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Wire up iomap direct I/O with the fscrypt additions for direct I/O.
> This allows ext4 to support direct I/O on encrypted files when inline
> encryption is enabled.
> 
> This change consists of two parts:
> 
> - Set a bio_crypt_ctx on bios for encrypted files, so that the file
>   contents get encrypted (or decrypted).
> 
> - Ensure that encryption data unit numbers (DUNs) are contiguous within
>   each bio.  Use the new function fscrypt_limit_io_pages() for this,
>   since the iomap code works directly with logical ranges and thus
>   doesn't have a chance to call fscrypt_mergeable_bio() on each page.
> 
> Note that fscrypt_limit_io_pages() is normally a no-op, as normally the
> DUNs simply increment along with the logical blocks.  But it's needed to
> handle an edge case in one of the fscrypt IV generation methods.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> Co-developed-by: Satya Tangirala <satyat@google.com>
> Signed-off-by: Satya Tangirala <satyat@google.com>

Reviewed-by: Jaegeuk Kim <jaegeuk@kernel.org>

> ---
>  fs/iomap/direct-io.c | 12 +++++++++++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ec7b78e6feca..12064daa3e3d 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -6,6 +6,7 @@
>  #include <linux/module.h>
>  #include <linux/compiler.h>
>  #include <linux/fs.h>
> +#include <linux/fscrypt.h>
>  #include <linux/iomap.h>
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
> @@ -183,11 +184,16 @@ static void
>  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>  		unsigned len)
>  {
> +	struct inode *inode = file_inode(dio->iocb->ki_filp);
>  	struct page *page = ZERO_PAGE(0);
>  	int flags = REQ_SYNC | REQ_IDLE;
>  	struct bio *bio;
>  
>  	bio = bio_alloc(GFP_KERNEL, 1);
> +
> +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
> +
>  	bio_set_dev(bio, iomap->bdev);
>  	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  	bio->bi_private = dio;
> @@ -253,6 +259,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		ret = nr_pages;
>  		goto out;
>  	}
> +	nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);
>  
>  	if (need_zeroout) {
>  		/* zero out from the start of the block to the write offset */
> @@ -270,6 +277,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		}
>  
>  		bio = bio_alloc(GFP_KERNEL, nr_pages);
> +		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> +					  GFP_KERNEL);
>  		bio_set_dev(bio, iomap->bdev);
>  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
>  		bio->bi_write_hint = dio->iocb->ki_hint;
> @@ -306,9 +315,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		dio->size += n;
>  		copied += n;
>  
> -		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
>  		iomap_dio_submit_bio(dio, iomap, bio, pos);
>  		pos += n;
> +		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
> +		nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);
>  	} while (nr_pages);
>  
>  	/*
> -- 
> 2.28.0.rc0.105.gf9edc3c819-goog
Dave Chinner July 22, 2020, 9:16 p.m. UTC | #2
On Mon, Jul 20, 2020 at 11:37:35PM +0000, Satya Tangirala wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> Wire up iomap direct I/O with the fscrypt additions for direct I/O.
> This allows ext4 to support direct I/O on encrypted files when inline
> encryption is enabled.
> 
> This change consists of two parts:
> 
> - Set a bio_crypt_ctx on bios for encrypted files, so that the file
>   contents get encrypted (or decrypted).
> 
> - Ensure that encryption data unit numbers (DUNs) are contiguous within
>   each bio.  Use the new function fscrypt_limit_io_pages() for this,
>   since the iomap code works directly with logical ranges and thus
>   doesn't have a chance to call fscrypt_mergeable_bio() on each page.
> 
> Note that fscrypt_limit_io_pages() is normally a no-op, as normally the
> DUNs simply increment along with the logical blocks.  But it's needed to
> handle an edge case in one of the fscrypt IV generation methods.
> 
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> Co-developed-by: Satya Tangirala <satyat@google.com>
> Signed-off-by: Satya Tangirala <satyat@google.com>
> ---
>  fs/iomap/direct-io.c | 12 +++++++++++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ec7b78e6feca..12064daa3e3d 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -6,6 +6,7 @@
>  #include <linux/module.h>
>  #include <linux/compiler.h>
>  #include <linux/fs.h>
> +#include <linux/fscrypt.h>
>  #include <linux/iomap.h>
>  #include <linux/backing-dev.h>
>  #include <linux/uio.h>
> @@ -183,11 +184,16 @@ static void
>  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
>  		unsigned len)
>  {
> +	struct inode *inode = file_inode(dio->iocb->ki_filp);
>  	struct page *page = ZERO_PAGE(0);
>  	int flags = REQ_SYNC | REQ_IDLE;
>  	struct bio *bio;
>  
>  	bio = bio_alloc(GFP_KERNEL, 1);
> +
> +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));

Which means you are now placing a new constraint on this code in
that we cannot ever, in future, zero entire blocks here.

This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
blocks if necessary - so I think constraining it to only support
partial block zeroing by adding a warning like this is no correct.

> @@ -253,6 +259,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
>  		ret = nr_pages;
>  		goto out;
>  	}
> +	nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);

So if "pos" overlaps a 2^32 offset when a certain mode is used, we
have to break up the IO?

If true, I'm not sure that this belongs here. Limiting the size of
the IOs because of filesytem contraints belongs in the filesystem
extent mapping code. That's the point where the IO is broken up into
maximally sized chunks the filesystem can issue as a contiguous
range. If the fscrypt code is breaking that "contiguous IO range"
because of the mode being used, the fs mapping code should break
the mapping at the boundery where the IO needs to be broken.

Hence the dio mapping code here will never build IOs that cross this
-filesystem- encryption limitation, and we don't need this fscrypt
code in the direct IO path at all.

Cheers,

Dave.
Eric Biggers July 22, 2020, 10:34 p.m. UTC | #3
On Thu, Jul 23, 2020 at 07:16:29AM +1000, Dave Chinner wrote:
> On Mon, Jul 20, 2020 at 11:37:35PM +0000, Satya Tangirala wrote:
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Wire up iomap direct I/O with the fscrypt additions for direct I/O.
> > This allows ext4 to support direct I/O on encrypted files when inline
> > encryption is enabled.
> > 
> > This change consists of two parts:
> > 
> > - Set a bio_crypt_ctx on bios for encrypted files, so that the file
> >   contents get encrypted (or decrypted).
> > 
> > - Ensure that encryption data unit numbers (DUNs) are contiguous within
> >   each bio.  Use the new function fscrypt_limit_io_pages() for this,
> >   since the iomap code works directly with logical ranges and thus
> >   doesn't have a chance to call fscrypt_mergeable_bio() on each page.
> > 
> > Note that fscrypt_limit_io_pages() is normally a no-op, as normally the
> > DUNs simply increment along with the logical blocks.  But it's needed to
> > handle an edge case in one of the fscrypt IV generation methods.
> > 
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > Co-developed-by: Satya Tangirala <satyat@google.com>
> > Signed-off-by: Satya Tangirala <satyat@google.com>
> > ---
> >  fs/iomap/direct-io.c | 12 +++++++++++-
> >  1 file changed, 11 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > index ec7b78e6feca..12064daa3e3d 100644
> > --- a/fs/iomap/direct-io.c
> > +++ b/fs/iomap/direct-io.c
> > @@ -6,6 +6,7 @@
> >  #include <linux/module.h>
> >  #include <linux/compiler.h>
> >  #include <linux/fs.h>
> > +#include <linux/fscrypt.h>
> >  #include <linux/iomap.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/uio.h>
> > @@ -183,11 +184,16 @@ static void
> >  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> >  		unsigned len)
> >  {
> > +	struct inode *inode = file_inode(dio->iocb->ki_filp);
> >  	struct page *page = ZERO_PAGE(0);
> >  	int flags = REQ_SYNC | REQ_IDLE;
> >  	struct bio *bio;
> >  
> >  	bio = bio_alloc(GFP_KERNEL, 1);
> > +
> > +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> > +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
> 
> Which means you are now placing a new constraint on this code in
> that we cannot ever, in future, zero entire blocks here.
> 
> This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> blocks if necessary - so I think constraining it to only support
> partial block zeroing by adding a warning like this is no correct.

In v3 and earlier this instead had the code to set an encryption context:

	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);

Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would
always be a no-op currently (since for now, iomap_dio_zero() will never be
called with an encrypted file) and thus wouldn't be properly tested?

BTW, iomap_dio_zero() is actually limited to one page, so it's not quite
"arbitrary sizes".
	
> 
> > @@ -253,6 +259,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
> >  		ret = nr_pages;
> >  		goto out;
> >  	}
> > +	nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);
> 
> So if "pos" overlaps a 2^32 offset when a certain mode is used, we
> have to break up the IO?

More or less.  It's actually when 'hashed_ino + (pos >> i_blkbits)' overlaps a
2^32 offset.  But yes, we have to break up the I/O when it happens.

> 
> If true, I'm not sure that this belongs here. Limiting the size of
> the IOs because of filesytem contraints belongs in the filesystem
> extent mapping code. That's the point where the IO is broken up into
> maximally sized chunks the filesystem can issue as a contiguous
> range. If the fscrypt code is breaking that "contiguous IO range"
> because of the mode being used, the fs mapping code should break
> the mapping at the boundery where the IO needs to be broken.
> 
> Hence the dio mapping code here will never build IOs that cross this
> -filesystem- encryption limitation, and we don't need this fscrypt
> code in the direct IO path at all.
> 

I think that would work.

iomap is used for other filesystem operations too, so we need to consider when
to actually do the limiting.  I don't think we should break up the extents
returned FS_IOC_FIEMAP, for example.  FIEMAP already has a defined behavior.
Also, it would be weird for the list of extents that FIEMAP returns to change
depending on whether the filesystem is mounted with '-o inlinecrypt' or not.

So, something like this:

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 44bad4bb8831..2816194db46c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3437,6 +3437,15 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
 			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
+	/*
+	 * When inline encryption is enabled, sometimes I/O to an encrypted file
+	 * has to be broken up to guarantee DUN contiguity.  Handle this by
+	 * limiting the length of the mapping returned.
+	 */
+	if (!(flags & IOMAP_REPORT))
+		map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk,
+						    map.m_len);
+
 	if (flags & IOMAP_WRITE)
 		ret = ext4_iomap_alloc(inode, &map, flags);
 	else


That also avoids any confusion between pages and blocks, which is nice.

- Eric
Matthew Wilcox July 22, 2020, 10:44 p.m. UTC | #4
On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> > Which means you are now placing a new constraint on this code in
> > that we cannot ever, in future, zero entire blocks here.
> > 
> > This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> > blocks if necessary - so I think constraining it to only support
> > partial block zeroing by adding a warning like this is no correct.
> 
> In v3 and earlier this instead had the code to set an encryption context:
> 
> 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> 				  GFP_KERNEL);
> 
> Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would
> always be a no-op currently (since for now, iomap_dio_zero() will never be
> called with an encrypted file) and thus wouldn't be properly tested?
> 
> BTW, iomap_dio_zero() is actually limited to one page, so it's not quite
> "arbitrary sizes".

I have a patch for that

http://git.infradead.org/users/willy/pagecache.git/commitdiff/1a4d72a890ca9c2ea3d244a6153511ae674ce1d8

It's not going to cause a problem for crossing a 2^32 boundary because
pages are naturally aligned and don't get that big.
Eric Biggers July 22, 2020, 11:12 p.m. UTC | #5
On Wed, Jul 22, 2020 at 11:44:07PM +0100, Matthew Wilcox wrote:
> On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> > > Which means you are now placing a new constraint on this code in
> > > that we cannot ever, in future, zero entire blocks here.
> > > 
> > > This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> > > blocks if necessary - so I think constraining it to only support
> > > partial block zeroing by adding a warning like this is no correct.
> > 
> > In v3 and earlier this instead had the code to set an encryption context:
> > 
> > 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> > 				  GFP_KERNEL);
> > 
> > Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would
> > always be a no-op currently (since for now, iomap_dio_zero() will never be
> > called with an encrypted file) and thus wouldn't be properly tested?
> > 
> > BTW, iomap_dio_zero() is actually limited to one page, so it's not quite
> > "arbitrary sizes".
> 
> I have a patch for that
> 
> http://git.infradead.org/users/willy/pagecache.git/commitdiff/1a4d72a890ca9c2ea3d244a6153511ae674ce1d8

No you don't :-)  Your patch is for iomap_zero_range() in
fs/iomap/buffered-io.c.  It doesn't touch fs/iomap/direct-io.c which is what
we're talking about here.

> It's not going to cause a problem for crossing a 2^32 boundary because
> pages are naturally aligned and don't get that big.

Well, the boundary can actually occur at any block.  But it's not relevant here
because (a) fs/iomap/buffered-io.c doesn't yet support encryption anyway, since
neither ext4 nor f2fs use it; and (b) iomap_zero_range() just writes to the
pagecache, and the bios aren't actually issued until ->writepages().

- Eric
Eric Biggers July 22, 2020, 11:26 p.m. UTC | #6
On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> So, something like this:
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 44bad4bb8831..2816194db46c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3437,6 +3437,15 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
>  
> +	/*
> +	 * When inline encryption is enabled, sometimes I/O to an encrypted file
> +	 * has to be broken up to guarantee DUN contiguity.  Handle this by
> +	 * limiting the length of the mapping returned.
> +	 */
> +	if (!(flags & IOMAP_REPORT))
> +		map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk,
> +						    map.m_len);
> +
>  	if (flags & IOMAP_WRITE)
>  		ret = ext4_iomap_alloc(inode, &map, flags);
>  	else
> 
> 
> That also avoids any confusion between pages and blocks, which is nice.

Correction: for fiemap, ext4 actually uses ext4_iomap_begin_report() instead of
ext4_iomap_begin().  So we don't need to check for !IOMAP_REPORT.

Also it could make sense to limit map.m_len after ext4_iomap_alloc() rather than
before, so that we don't limit the length of the extent that gets allocated but
rather just the length that gets returned to iomap.

- Eric
Darrick J. Wong July 22, 2020, 11:32 p.m. UTC | #7
On Wed, Jul 22, 2020 at 04:26:25PM -0700, Eric Biggers wrote:
> On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> > So, something like this:
> > 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 44bad4bb8831..2816194db46c 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -3437,6 +3437,15 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> >  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
> >  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> >  
> > +	/*
> > +	 * When inline encryption is enabled, sometimes I/O to an encrypted file
> > +	 * has to be broken up to guarantee DUN contiguity.  Handle this by
> > +	 * limiting the length of the mapping returned.
> > +	 */
> > +	if (!(flags & IOMAP_REPORT))
> > +		map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk,
> > +						    map.m_len);
> > +
> >  	if (flags & IOMAP_WRITE)
> >  		ret = ext4_iomap_alloc(inode, &map, flags);
> >  	else
> > 
> > 
> > That also avoids any confusion between pages and blocks, which is nice.
> 
> Correction: for fiemap, ext4 actually uses ext4_iomap_begin_report() instead of
> ext4_iomap_begin().  So we don't need to check for !IOMAP_REPORT.
> 
> Also it could make sense to limit map.m_len after ext4_iomap_alloc() rather than
> before, so that we don't limit the length of the extent that gets allocated but
> rather just the length that gets returned to iomap.

Naïve question here -- if the decision to truncate the bio depends on
the file block offset, can you achieve the same thing by capping the
length of the iovec prior to iomap_dio_rw?

Granted that probably only makes sense if the LBLK IV thing is only
supposed to be used infrequently, and having to opencode a silly loop
might be more hassle than it's worth...

--D

> - Eric
Eric Biggers July 22, 2020, 11:43 p.m. UTC | #8
On Wed, Jul 22, 2020 at 04:32:47PM -0700, Darrick J. Wong wrote:
> On Wed, Jul 22, 2020 at 04:26:25PM -0700, Eric Biggers wrote:
> > On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> > > So, something like this:
> > > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 44bad4bb8831..2816194db46c 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -3437,6 +3437,15 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> > >  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
> > >  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> > >  
> > > +	/*
> > > +	 * When inline encryption is enabled, sometimes I/O to an encrypted file
> > > +	 * has to be broken up to guarantee DUN contiguity.  Handle this by
> > > +	 * limiting the length of the mapping returned.
> > > +	 */
> > > +	if (!(flags & IOMAP_REPORT))
> > > +		map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk,
> > > +						    map.m_len);
> > > +
> > >  	if (flags & IOMAP_WRITE)
> > >  		ret = ext4_iomap_alloc(inode, &map, flags);
> > >  	else
> > > 
> > > 
> > > That also avoids any confusion between pages and blocks, which is nice.
> > 
> > Correction: for fiemap, ext4 actually uses ext4_iomap_begin_report() instead of
> > ext4_iomap_begin().  So we don't need to check for !IOMAP_REPORT.
> > 
> > Also it could make sense to limit map.m_len after ext4_iomap_alloc() rather than
> > before, so that we don't limit the length of the extent that gets allocated but
> > rather just the length that gets returned to iomap.
> 
> Naïve question here -- if the decision to truncate the bio depends on
> the file block offset, can you achieve the same thing by capping the
> length of the iovec prior to iomap_dio_rw?
> 
> Granted that probably only makes sense if the LBLK IV thing is only
> supposed to be used infrequently, and having to opencode a silly loop
> might be more hassle than it's worth...
> 

We *could* do the truncation there, but that would truncate the actual read() or
write().  So, userspace would see a short read or write.  And I understand that
while applications are *supposed* to handle short reads and writes, many don't.

I think Dave's suggestion makes more sense, since it would make this case be
treated just like normal fragmentation of the file.

- Eric
Dave Chinner July 23, 2020, 10:07 p.m. UTC | #9
On Wed, Jul 22, 2020 at 03:34:04PM -0700, Eric Biggers wrote:
> On Thu, Jul 23, 2020 at 07:16:29AM +1000, Dave Chinner wrote:
> > On Mon, Jul 20, 2020 at 11:37:35PM +0000, Satya Tangirala wrote:
> > > From: Eric Biggers <ebiggers@google.com>
> > > 
> > > Wire up iomap direct I/O with the fscrypt additions for direct I/O.
> > > This allows ext4 to support direct I/O on encrypted files when inline
> > > encryption is enabled.
> > > 
> > > This change consists of two parts:
> > > 
> > > - Set a bio_crypt_ctx on bios for encrypted files, so that the file
> > >   contents get encrypted (or decrypted).
> > > 
> > > - Ensure that encryption data unit numbers (DUNs) are contiguous within
> > >   each bio.  Use the new function fscrypt_limit_io_pages() for this,
> > >   since the iomap code works directly with logical ranges and thus
> > >   doesn't have a chance to call fscrypt_mergeable_bio() on each page.
> > > 
> > > Note that fscrypt_limit_io_pages() is normally a no-op, as normally the
> > > DUNs simply increment along with the logical blocks.  But it's needed to
> > > handle an edge case in one of the fscrypt IV generation methods.
> > > 
> > > Signed-off-by: Eric Biggers <ebiggers@google.com>
> > > Co-developed-by: Satya Tangirala <satyat@google.com>
> > > Signed-off-by: Satya Tangirala <satyat@google.com>
> > > ---
> > >  fs/iomap/direct-io.c | 12 +++++++++++-
> > >  1 file changed, 11 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> > > index ec7b78e6feca..12064daa3e3d 100644
> > > --- a/fs/iomap/direct-io.c
> > > +++ b/fs/iomap/direct-io.c
> > > @@ -6,6 +6,7 @@
> > >  #include <linux/module.h>
> > >  #include <linux/compiler.h>
> > >  #include <linux/fs.h>
> > > +#include <linux/fscrypt.h>
> > >  #include <linux/iomap.h>
> > >  #include <linux/backing-dev.h>
> > >  #include <linux/uio.h>
> > > @@ -183,11 +184,16 @@ static void
> > >  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> > >  		unsigned len)
> > >  {
> > > +	struct inode *inode = file_inode(dio->iocb->ki_filp);
> > >  	struct page *page = ZERO_PAGE(0);
> > >  	int flags = REQ_SYNC | REQ_IDLE;
> > >  	struct bio *bio;
> > >  
> > >  	bio = bio_alloc(GFP_KERNEL, 1);
> > > +
> > > +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> > > +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
> > 
> > Which means you are now placing a new constraint on this code in
> > that we cannot ever, in future, zero entire blocks here.
> > 
> > This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> > blocks if necessary - so I think constraining it to only support
> > partial block zeroing by adding a warning like this is no correct.
> 
> In v3 and earlier this instead had the code to set an encryption context:
> 
> 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> 				  GFP_KERNEL);
> 
> Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would

Actually, I have no idea what that function does. It's not in a
5.8-rc6 kernel, and it's not in this patchset....

> always be a no-op currently (since for now, iomap_dio_zero() will never be
> called with an encrypted file) and thus wouldn't be properly tested?

Same can be said for this WARN_ON_ONCE() code :)

But, in the interests of not leaving landmines, if a fscrypt context
is needed to be attached to the bio for data IO in direct IO, it
should be attached to all bios that are allocated in the dio path
rather than leave a landmine for people in future to trip over.

> BTW, iomap_dio_zero() is actually limited to one page, so it's not quite
> "arbitrary sizes".

Yup, but that's an implentation detail, not a design constraint.
i.e. I typically review/talk about how stuff functions at a
design/architecture level, not how it's been implemented in the
code.

e.g. block size > page size patches in progress make use of the
"arbitrary length" capability of the design:

https://lore.kernel.org/linux-xfs/20181107063127.3902-7-david@fromorbit.com/

> iomap is used for other filesystem operations too, so we need to consider when
> to actually do the limiting.  I don't think we should break up the extents
> returned FS_IOC_FIEMAP, for example.  FIEMAP already has a defined behavior.
> Also, it would be weird for the list of extents that FIEMAP returns to change
> depending on whether the filesystem is mounted with '-o inlinecrypt' or not.

We don't need to care about that in the iomap code. The caller
controls the behaviour of the mapping callbacks themselves via
the iomap_ops structure they pass into high level iomap functions.

> That also avoids any confusion between pages and blocks, which is nice.

FWIW, the latest version of the above patchset (which,
co-incidentally, I was bring up to date yesterday) abstracts away
page and block sizes. It introduces the concept of "chunk size"
which is calculated from the combination of the current page's size
and the current inode's block size.

i.e. in the near future we are going to have both variable page
sizes (on a per-page basis via Willy's current work) and per-inode
blocks sizes smaller, the same and larger than the size of the
current pager. Hence we need to get rid of any assumptions about
page sizes and block sizes in the iomap code, not introduce new
ones.

Hence if there is any limitation of filesystem functionality based
on block size vs page size, it is going to be up to the filesystem
to detect and enforce those restrictions, not the iomap
infrastructure.

Cheers,

Dave.
Eric Biggers July 23, 2020, 11:03 p.m. UTC | #10
Hi Dave,

On Fri, Jul 24, 2020 at 08:07:52AM +1000, Dave Chinner wrote:
> > > > @@ -183,11 +184,16 @@ static void
> > > >  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> > > >  		unsigned len)
> > > >  {
> > > > +	struct inode *inode = file_inode(dio->iocb->ki_filp);
> > > >  	struct page *page = ZERO_PAGE(0);
> > > >  	int flags = REQ_SYNC | REQ_IDLE;
> > > >  	struct bio *bio;
> > > >  
> > > >  	bio = bio_alloc(GFP_KERNEL, 1);
> > > > +
> > > > +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> > > > +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
> > > 
> > > Which means you are now placing a new constraint on this code in
> > > that we cannot ever, in future, zero entire blocks here.
> > > 
> > > This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> > > blocks if necessary - so I think constraining it to only support
> > > partial block zeroing by adding a warning like this is no correct.
> > 
> > In v3 and earlier this instead had the code to set an encryption context:
> > 
> > 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> > 				  GFP_KERNEL);
> > 
> > Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would
> 
> Actually, I have no idea what that function does. It's not in a
> 5.8-rc6 kernel, and it's not in this patchset....

The cover letter mentions that this patchset is based on fscrypt/master.

That is, "master" of https://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git

fscrypt_set_bio_crypt_ctx() was introduced by
"fscrypt: add inline encryption support" on that branch.

> 
> > always be a no-op currently (since for now, iomap_dio_zero() will never be
> > called with an encrypted file) and thus wouldn't be properly tested?
> 
> Same can be said for this WARN_ON_ONCE() code :)
> 
> But, in the interests of not leaving landmines, if a fscrypt context
> is needed to be attached to the bio for data IO in direct IO, it
> should be attached to all bios that are allocated in the dio path
> rather than leave a landmine for people in future to trip over.

My concern is that if we were to pass the wrong 'lblk' to
fscrypt_set_bio_crypt_ctx(), we wouldn't catch it because it's not tested.
Passing the wrong 'lblk' would cause the data to be encrypted/decrypted
incorrectly.

It's not a big deal though, as it's "obviously correct".  So we can just go
with that if you prefer it.

> 
> > BTW, iomap_dio_zero() is actually limited to one page, so it's not quite
> > "arbitrary sizes".
> 
> Yup, but that's an implentation detail, not a design constraint.
> i.e. I typically review/talk about how stuff functions at a
> design/architecture level, not how it's been implemented in the
> code.
> 
> e.g. block size > page size patches in progress make use of the
> "arbitrary length" capability of the design:
> 
> https://lore.kernel.org/linux-xfs/20181107063127.3902-7-david@fromorbit.com/
> 
> > iomap is used for other filesystem operations too, so we need to consider when
> > to actually do the limiting.  I don't think we should break up the extents
> > returned FS_IOC_FIEMAP, for example.  FIEMAP already has a defined behavior.
> > Also, it would be weird for the list of extents that FIEMAP returns to change
> > depending on whether the filesystem is mounted with '-o inlinecrypt' or not.
> 
> We don't need to care about that in the iomap code. The caller
> controls the behaviour of the mapping callbacks themselves via
> the iomap_ops structure they pass into high level iomap functions.

Sure, I wasn't saying we need to.  I was talking about what we need to do in
ext4.

> 
> > That also avoids any confusion between pages and blocks, which is nice.
> 
> FWIW, the latest version of the above patchset (which,
> co-incidentally, I was bring up to date yesterday) abstracts away
> page and block sizes. It introduces the concept of "chunk size"
> which is calculated from the combination of the current page's size
> and the current inode's block size.
> 
> i.e. in the near future we are going to have both variable page
> sizes (on a per-page basis via Willy's current work) and per-inode
> blocks sizes smaller, the same and larger than the size of the
> current pager. Hence we need to get rid of any assumptions about
> page sizes and block sizes in the iomap code, not introduce new
> ones.
> 
> Hence if there is any limitation of filesystem functionality based
> on block size vs page size, it is going to be up to the filesystem
> to detect and enforce those restrictions, not the iomap
> infrastructure.

Sure, again I was talking about what we'll be doing in ext4, since with the
proposed change, it will be ext4 that does fscrypt_limit_io_blocks().  The limit
is based on blocks, not pages, so "fscrypt_limit_io_pages()" was a bit weird.

Note that currently, I don't think iomap_dio_bio_actor() would handle an
encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
in the middle of a filesystem block (even after the filesystem ensures that
direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
see the rest of this patchset), which isn't allowed on encrypted files.

However we currently don't support blocksize > PAGE_SIZE in ext4, f2fs, or
fs/crypto/ at all, so I don't think we should add extra logic to fs/iomap/ to
try to handle that case for encrypted files when we'd have no way to test it.

- Eric
Dave Chinner July 24, 2020, 1:39 a.m. UTC | #11
On Thu, Jul 23, 2020 at 04:03:45PM -0700, Eric Biggers wrote:
> Hi Dave,
> 
> On Fri, Jul 24, 2020 at 08:07:52AM +1000, Dave Chinner wrote:
> > > > > @@ -183,11 +184,16 @@ static void
> > > > >  iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
> > > > >  		unsigned len)
> > > > >  {
> > > > > +	struct inode *inode = file_inode(dio->iocb->ki_filp);
> > > > >  	struct page *page = ZERO_PAGE(0);
> > > > >  	int flags = REQ_SYNC | REQ_IDLE;
> > > > >  	struct bio *bio;
> > > > >  
> > > > >  	bio = bio_alloc(GFP_KERNEL, 1);
> > > > > +
> > > > > +	/* encrypted direct I/O is guaranteed to be fs-block aligned */
> > > > > +	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
> > > > 
> > > > Which means you are now placing a new constraint on this code in
> > > > that we cannot ever, in future, zero entire blocks here.
> > > > 
> > > > This code can issue arbitrary sized zeroing bios - multiple entire fs blocks
> > > > blocks if necessary - so I think constraining it to only support
> > > > partial block zeroing by adding a warning like this is no correct.
> > > 
> > > In v3 and earlier this instead had the code to set an encryption context:
> > > 
> > > 	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
> > > 				  GFP_KERNEL);
> > > 
> > > Would you prefer that, even though the call to fscrypt_set_bio_crypt_ctx() would
> > 
> > Actually, I have no idea what that function does. It's not in a
> > 5.8-rc6 kernel, and it's not in this patchset....
> 
> The cover letter mentions that this patchset is based on fscrypt/master.

Which the reader is left to work out where it exists. If you are
going to say "based on", then a pointer to the actual tree like
this:

> That is, "master" of https://git.kernel.org/pub/scm/fs/fscrypt/fscrypt.git

is somewhat helpful.

> fscrypt_set_bio_crypt_ctx() was introduced by
> "fscrypt: add inline encryption support" on that branch.

Way too much static inline function abstraction.

fscrypt_inode_uses_inline_crypto() ends up being:

	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) &&
	    inode->i_crypt_info->ci_inlinecrypt)

I note there are no checks for inode->i_crypt_info being non-null,
and I note that S_ENCRYPTED is set on the inode when the on-disk
encrypted flag is encountered, not when inode->i_crypt_info is set.

Further, I note that the local variable for ci is fetched before
fscrypt_inode_uses_inline_crypto() is run, so leaving a landmine for
people who try to access ci before checking if inline crypto is
enabled. Or, indeed, if S_ENCRYPTED is set and the crypt_info has
not been set up, fscrypt_set_bio_crypt_ctx() will oops in the DIO
path.

> > > always be a no-op currently (since for now, iomap_dio_zero() will never be
> > > called with an encrypted file) and thus wouldn't be properly tested?
> > 
> > Same can be said for this WARN_ON_ONCE() code :)
> > 
> > But, in the interests of not leaving landmines, if a fscrypt context
> > is needed to be attached to the bio for data IO in direct IO, it
> > should be attached to all bios that are allocated in the dio path
> > rather than leave a landmine for people in future to trip over.
> 
> My concern is that if we were to pass the wrong 'lblk' to
> fscrypt_set_bio_crypt_ctx(), we wouldn't catch it because it's not tested.
> Passing the wrong 'lblk' would cause the data to be encrypted/decrypted
> incorrectly.

When you implement sub-block DIO alignment for fscrypt enabled
filesystems, fsx will tell you pretty quickly if you screwed up....

> Note that currently, I don't think iomap_dio_bio_actor() would handle an
> encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
> in the middle of a filesystem block (even after the filesystem ensures that
> direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
> see the rest of this patchset), which isn't allowed on encrypted files.

That can already happen unless you've specifically restricted DIO
alignments in the filesystem code. i.e. Direct IO already supports
sub-block ranges and alignment, and we can already do user DIO on
sub-block, sector aligned ranges just fine. And the filesystem can
already split the iomap on sub-block alignments and ranges if it
needs to because the iomap uses byte range addressing, not sector or
block based addressing.

So either you already have a situation where the 2^32 offset can
land *inside* a filesystem block, or the offset is guaranteed to be
filesystem block aligned and so you'll never get this "break an IO
on sub-block alignment" problem regardless of the filesystem block
size...

Either way, it's not an iomap problem - it's a filesystem mapping
problem...

Cheers,

Dave.
Eric Biggers July 24, 2020, 3:46 a.m. UTC | #12
On Fri, Jul 24, 2020 at 11:39:10AM +1000, Dave Chinner wrote:
> > fscrypt_set_bio_crypt_ctx() was introduced by
> > "fscrypt: add inline encryption support" on that branch.
> 
> Way too much static inline function abstraction.

Well, this is mostly because we're trying to be a good kernel citizen and
minimize the overhead when our feature isn't enabled or isn't being used.

Eventually we might remove CONFIG_FS_ENCRYPTION_INLINE_CRYPT and make
CONFIG_FS_ENCRYPTION always offer inline crypto support, which would simplify
things.  But for now we'd like to keep the separate option.

Also, previously people have complained about hardcoded S_ISREG() to decide
whether a file needs contents encryption, and said they prefer a helper function
fscrypt_needs_contents_encryption().  (Which is what we have now.)

So we can't make everyone happy...

> 
> fscrypt_inode_uses_inline_crypto() ends up being:
> 
> 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) &&
> 	    inode->i_crypt_info->ci_inlinecrypt)
> 
> I note there are no checks for inode->i_crypt_info being non-null,
> and I note that S_ENCRYPTED is set on the inode when the on-disk
> encrypted flag is encountered, not when inode->i_crypt_info is set.
> 

->i_crypt_info is set when the file is opened, so it's guaranteed to be set for
any I/O.  So the case you're concerned about just doesn't happen.

If we're talking about handling a hypothetical bug where it does nevertheless
happen, there are three options:

(a) Crash (current behavior)
(b) Silently leave the file unencrypted, i.e. silently introduce a critical
    security vulnerability.
(c) Return an error, which would add a lot of code complexity because we'd have
    start returning an error from places like fscrypt_set_bio_crypt_ctx() that
    currently can't fail.  This would be dead code that's not tested.

I very much prefer (a), since it's the simplest option, and it also makes the
bug most likely to be reported and fixed, without leaving the possibility for
data to silently be left unencrypted.  Crashing isn't good (obviously), but the
other options are worse.

> Further, I note that the local variable for ci is fetched before
> fscrypt_inode_uses_inline_crypto() is run, so leaving a landmine for
> people who try to access ci before checking if inline crypto is
> enabled. Or, indeed, if S_ENCRYPTED is set and the crypt_info has
> not been set up, fscrypt_set_bio_crypt_ctx() will oops in the DIO
> path.

Sure, this is harmless currently, but we can move the assignment to 'ci'.

> > > > always be a no-op currently (since for now, iomap_dio_zero() will never be
> > > > called with an encrypted file) and thus wouldn't be properly tested?
> > > 
> > > Same can be said for this WARN_ON_ONCE() code :)
> > > 
> > > But, in the interests of not leaving landmines, if a fscrypt context
> > > is needed to be attached to the bio for data IO in direct IO, it
> > > should be attached to all bios that are allocated in the dio path
> > > rather than leave a landmine for people in future to trip over.
> > 
> > My concern is that if we were to pass the wrong 'lblk' to
> > fscrypt_set_bio_crypt_ctx(), we wouldn't catch it because it's not tested.
> > Passing the wrong 'lblk' would cause the data to be encrypted/decrypted
> > incorrectly.
> 
> When you implement sub-block DIO alignment for fscrypt enabled
> filesystems, fsx will tell you pretty quickly if you screwed up....
> 
> > Note that currently, I don't think iomap_dio_bio_actor() would handle an
> > encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
> > in the middle of a filesystem block (even after the filesystem ensures that
> > direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
> > see the rest of this patchset), which isn't allowed on encrypted files.
> 
> That can already happen unless you've specifically restricted DIO
> alignments in the filesystem code. i.e. Direct IO already supports
> sub-block ranges and alignment, and we can already do user DIO on
> sub-block, sector aligned ranges just fine. And the filesystem can
> already split the iomap on sub-block alignments and ranges if it
> needs to because the iomap uses byte range addressing, not sector or
> block based addressing.
> 
> So either you already have a situation where the 2^32 offset can
> land *inside* a filesystem block, or the offset is guaranteed to be
> filesystem block aligned and so you'll never get this "break an IO
> on sub-block alignment" problem regardless of the filesystem block
> size...
> 
> Either way, it's not an iomap problem - it's a filesystem mapping
> problem...
> 

I think you're missing the point here.  Currently, the granularity of encryption
(a.k.a. "data unit size") is always filesystem blocks, so that's the minimum we
can directly read or write to an encrypted file.  This has nothing to do with
the IV wraparound case also being discussed.

For example, changing a single bit in the plaintext of a filesystem block may
result in the entire block's ciphertext changing.  (The exact behavior depends
on the cryptographic algorithm that is used.)

That's why this patchset makes ext4 only allow direct I/O on encrypted files if
the I/O is fully filesystem-block-aligned.  Note that this might be a more
strict alignment requirement than the bdev_logical_block_size().

As long as the iomap code only issues filesystem-block-aligned bios, *given
fully filesystem-block-aligned inputs*, we're fine.  That appears to be the case
currently.  I was just pointing out that some changes might be needed to
maintain this property in the blocksize > PAGE_SIZE case (which again, for
encrypted files is totally unsupported at the moment anyway).

(It's possible that in the future we'll support other encryption data unit
sizes, perhaps powers of 2 from 512 to filesystem block size.  But for now the
filesystem block size has been good enough for everyone, and there would be a
significant performance hit when using a smaller size, so there hasn't been a
need to add the extra layer of complexity.)

- Eric
Dave Chinner July 24, 2020, 5:31 a.m. UTC | #13
On Thu, Jul 23, 2020 at 08:46:28PM -0700, Eric Biggers wrote:
> On Fri, Jul 24, 2020 at 11:39:10AM +1000, Dave Chinner wrote:
> > fscrypt_inode_uses_inline_crypto() ends up being:
> > 
> > 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) &&
> > 	    inode->i_crypt_info->ci_inlinecrypt)
> > 
> > I note there are no checks for inode->i_crypt_info being non-null,
> > and I note that S_ENCRYPTED is set on the inode when the on-disk
> > encrypted flag is encountered, not when inode->i_crypt_info is set.
> > 
> 
> ->i_crypt_info is set when the file is opened, so it's guaranteed to be set for
> any I/O.  So the case you're concerned about just doesn't happen.

Ok. The connection is not obvious to someone who doesn't know the
fscrypt code inside out.

> > > Note that currently, I don't think iomap_dio_bio_actor() would handle an
> > > encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
> > > in the middle of a filesystem block (even after the filesystem ensures that
> > > direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
> > > see the rest of this patchset), which isn't allowed on encrypted files.
> > 
> > That can already happen unless you've specifically restricted DIO
> > alignments in the filesystem code. i.e. Direct IO already supports
> > sub-block ranges and alignment, and we can already do user DIO on
> > sub-block, sector aligned ranges just fine. And the filesystem can
> > already split the iomap on sub-block alignments and ranges if it
> > needs to because the iomap uses byte range addressing, not sector or
> > block based addressing.
> > 
> > So either you already have a situation where the 2^32 offset can
> > land *inside* a filesystem block, or the offset is guaranteed to be
> > filesystem block aligned and so you'll never get this "break an IO
> > on sub-block alignment" problem regardless of the filesystem block
> > size...
> > 
> > Either way, it's not an iomap problem - it's a filesystem mapping
> > problem...
> > 
> 
> I think you're missing the point here.  Currently, the granularity of encryption
> (a.k.a. "data unit size") is always filesystem blocks, so that's the minimum we
> can directly read or write to an encrypted file.  This has nothing to do with
> the IV wraparound case also being discussed.

So when you change the subject, please make it *really obvious* so
that people don't think you are still talking about the same issue.

> For example, changing a single bit in the plaintext of a filesystem block may
> result in the entire block's ciphertext changing.  (The exact behavior depends
> on the cryptographic algorithm that is used.)
> 
> That's why this patchset makes ext4 only allow direct I/O on encrypted files if
> the I/O is fully filesystem-block-aligned.  Note that this might be a more
> strict alignment requirement than the bdev_logical_block_size().
> 
> As long as the iomap code only issues filesystem-block-aligned bios, *given
> fully filesystem-block-aligned inputs*, we're fine.  That appears to be the case
> currently.

The actual size and shape of the bios issued by direct IO (both old
code and newer iomap code) is determined by the user supplied iov,
the size of the biovec array allocated in the bio, and the IO
constraints of the underlying hardware.  Hence direct IO does not
guarantee alignment to anything larger than the underlying block
device logical sector size because there's no guarantee when or
where a bio will fill up.

To guarantee alignment of what ends up at the hardware, you have to
set the block device parameters (e.g. logical sector size)
appropriately all the way down the stack. You also need to ensure
that the filesystem is correctly aligned on the block device so that
filesystem blocks don't overlap things like RAID stripe boundaires,
linear concat boundaries, etc.

IOWs, to constrain alignment in the IO path, you need to configure
you system correct so that the information provided to iomap for IO
alignment matches your requirements. This is not somethign iomap can
do itself; everything from above needs to be constrained by the
filesystem using iomap, everything sent below by iomap is
constrained by the block device config.

> (It's possible that in the future we'll support other encryption data unit
> sizes, perhaps powers of 2 from 512 to filesystem block size.  But for now the
> filesystem block size has been good enough for everyone,

Not the case. fscrypt use in enterprise environments needs support
for block size < page size so that it can be deployed on 64kB page
size machines without requiring 64kB filesystem block sizes.

Cheers,

Dave.
Eric Biggers July 24, 2020, 5:41 p.m. UTC | #14
On Fri, Jul 24, 2020 at 03:31:30PM +1000, Dave Chinner wrote:
> On Thu, Jul 23, 2020 at 08:46:28PM -0700, Eric Biggers wrote:
> > On Fri, Jul 24, 2020 at 11:39:10AM +1000, Dave Chinner wrote:
> > > fscrypt_inode_uses_inline_crypto() ends up being:
> > > 
> > > 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) &&
> > > 	    inode->i_crypt_info->ci_inlinecrypt)
> > > 
> > > I note there are no checks for inode->i_crypt_info being non-null,
> > > and I note that S_ENCRYPTED is set on the inode when the on-disk
> > > encrypted flag is encountered, not when inode->i_crypt_info is set.
> > > 
> > 
> > ->i_crypt_info is set when the file is opened, so it's guaranteed to be set for
> > any I/O.  So the case you're concerned about just doesn't happen.
> 
> Ok. The connection is not obvious to someone who doesn't know the
> fscrypt code inside out.
> 
> > > > Note that currently, I don't think iomap_dio_bio_actor() would handle an
> > > > encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
> > > > in the middle of a filesystem block (even after the filesystem ensures that
> > > > direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
> > > > see the rest of this patchset), which isn't allowed on encrypted files.
> > > 
> > > That can already happen unless you've specifically restricted DIO
> > > alignments in the filesystem code. i.e. Direct IO already supports
> > > sub-block ranges and alignment, and we can already do user DIO on
> > > sub-block, sector aligned ranges just fine. And the filesystem can
> > > already split the iomap on sub-block alignments and ranges if it
> > > needs to because the iomap uses byte range addressing, not sector or
> > > block based addressing.
> > > 
> > > So either you already have a situation where the 2^32 offset can
> > > land *inside* a filesystem block, or the offset is guaranteed to be
> > > filesystem block aligned and so you'll never get this "break an IO
> > > on sub-block alignment" problem regardless of the filesystem block
> > > size...
> > > 
> > > Either way, it's not an iomap problem - it's a filesystem mapping
> > > problem...
> > > 
> > 
> > I think you're missing the point here.  Currently, the granularity of encryption
> > (a.k.a. "data unit size") is always filesystem blocks, so that's the minimum we
> > can directly read or write to an encrypted file.  This has nothing to do with
> > the IV wraparound case also being discussed.
> 
> So when you change the subject, please make it *really obvious* so
> that people don't think you are still talking about the same issue.
> 
> > For example, changing a single bit in the plaintext of a filesystem block may
> > result in the entire block's ciphertext changing.  (The exact behavior depends
> > on the cryptographic algorithm that is used.)
> > 
> > That's why this patchset makes ext4 only allow direct I/O on encrypted files if
> > the I/O is fully filesystem-block-aligned.  Note that this might be a more
> > strict alignment requirement than the bdev_logical_block_size().
> > 
> > As long as the iomap code only issues filesystem-block-aligned bios, *given
> > fully filesystem-block-aligned inputs*, we're fine.  That appears to be the case
> > currently.
> 
> The actual size and shape of the bios issued by direct IO (both old
> code and newer iomap code) is determined by the user supplied iov,
> the size of the biovec array allocated in the bio, and the IO
> constraints of the underlying hardware.  Hence direct IO does not
> guarantee alignment to anything larger than the underlying block
> device logical sector size because there's no guarantee when or
> where a bio will fill up.
> 
> To guarantee alignment of what ends up at the hardware, you have to
> set the block device parameters (e.g. logical sector size)
> appropriately all the way down the stack. You also need to ensure
> that the filesystem is correctly aligned on the block device so that
> filesystem blocks don't overlap things like RAID stripe boundaires,
> linear concat boundaries, etc.
> 
> IOWs, to constrain alignment in the IO path, you need to configure
> you system correct so that the information provided to iomap for IO
> alignment matches your requirements. This is not somethign iomap can
> do itself; everything from above needs to be constrained by the
> filesystem using iomap, everything sent below by iomap is
> constrained by the block device config.

That way of thinking about things doesn't entirely work for inline encryption.

Hardware can support multiple encryption "data unit sizes", some of which may be
larger than the logical block size.  (The data unit size is the granularity of
encryption.  E.g. if software selects data_unit_size=4096, then each invocation
of the encryption/decryption algorithm is passed 4096 bytes.  You can't then
later encrypt/decrypt just part of that; that's not how the algorithms work.)

For example hardware might *in general* support addressing 512-byte sectors and
thus have logical_block_size=512.  But it could also support encryption data
unit sizes [512, 1024, 2048, 4096].  Encrypted I/O has to be aligned to the data
unit size, not just to the logical block size.  The data unit size to use, and
whether to use encryption or not, is decided on a per-I/O basis.

So in this case technically it's the filesystem (and later the
bio::bi_crypt_context which the filesystem sets) that knows about the alignment
needed -- *not* the request_queue.

Is it your opinion that inline encryption should only be supported when
data_unit_size <= logical_block_size?  The problems with that are

    (a) Using an unnecessarily small data_unit_size degrades performance a
	lot -- for *all* I/O, not just direct I/O.  This is because there are a
	lot more separate encryptions/decryptions to do, and there's a fixed
	overhead to each one (much of which is intrinsic in the crypto
	algorithms themselves, i.e. this isn't simply an implementation quirk).

    (b) fscrypt currently only supports data_unit_size == filesystem_block_size.
	(OFC, filesystem_block_size may be greater than logical_block_size.)

    (c) Filesystem images would be less portable, unless the minimum
	data_unit_size were used everywhere which would degrade performance.

(We could address (b) by allowing users to specify data_unit_size when
encrypting a directory.  That would add complexity, but it's possible.)

But again, as far as I can tell, fs/iomap/direct-io.c currently *does* guarantee
that *if* the input is fully filesystem-block-aligned and if blocksize <=
PAGE_SIZE, then the issued I/O is also filesystem-block-aligned.

So as far as I can tell, there isn't really any problem there, at least not now.
I just want to make sure we're on the same page...

> 
> > (It's possible that in the future we'll support other encryption data unit
> > sizes, perhaps powers of 2 from 512 to filesystem block size.  But for now the
> > filesystem block size has been good enough for everyone,
> 
> Not the case. fscrypt use in enterprise environments needs support
> for block size < page size so that it can be deployed on 64kB page
> size machines without requiring 64kB filesystem block sizes.
> 

It's already supported (on ext4 since Linux v5.5).

What's not supported (yet) is data_unit_size < filesystem_block_size.

- Eric
Dave Chinner July 25, 2020, 11:47 p.m. UTC | #15
On Fri, Jul 24, 2020 at 10:41:32AM -0700, Eric Biggers wrote:
> On Fri, Jul 24, 2020 at 03:31:30PM +1000, Dave Chinner wrote:
> > On Thu, Jul 23, 2020 at 08:46:28PM -0700, Eric Biggers wrote:
> > > On Fri, Jul 24, 2020 at 11:39:10AM +1000, Dave Chinner wrote:
> > > > fscrypt_inode_uses_inline_crypto() ends up being:
> > > > 
> > > > 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) &&
> > > > 	    inode->i_crypt_info->ci_inlinecrypt)
> > > > 
> > > > I note there are no checks for inode->i_crypt_info being non-null,
> > > > and I note that S_ENCRYPTED is set on the inode when the on-disk
> > > > encrypted flag is encountered, not when inode->i_crypt_info is set.
> > > > 
> > > 
> > > ->i_crypt_info is set when the file is opened, so it's guaranteed to be set for
> > > any I/O.  So the case you're concerned about just doesn't happen.
> > 
> > Ok. The connection is not obvious to someone who doesn't know the
> > fscrypt code inside out.
> > 
> > > > > Note that currently, I don't think iomap_dio_bio_actor() would handle an
> > > > > encrypted file with blocksize > PAGE_SIZE correctly, as the I/O could be split
> > > > > in the middle of a filesystem block (even after the filesystem ensures that
> > > > > direct I/O on encrypted files is fully filesystem-block-aligned, which we do ---
> > > > > see the rest of this patchset), which isn't allowed on encrypted files.
> > > > 
> > > > That can already happen unless you've specifically restricted DIO
> > > > alignments in the filesystem code. i.e. Direct IO already supports
> > > > sub-block ranges and alignment, and we can already do user DIO on
> > > > sub-block, sector aligned ranges just fine. And the filesystem can
> > > > already split the iomap on sub-block alignments and ranges if it
> > > > needs to because the iomap uses byte range addressing, not sector or
> > > > block based addressing.
> > > > 
> > > > So either you already have a situation where the 2^32 offset can
> > > > land *inside* a filesystem block, or the offset is guaranteed to be
> > > > filesystem block aligned and so you'll never get this "break an IO
> > > > on sub-block alignment" problem regardless of the filesystem block
> > > > size...
> > > > 
> > > > Either way, it's not an iomap problem - it's a filesystem mapping
> > > > problem...
> > > > 
> > > 
> > > I think you're missing the point here.  Currently, the granularity of encryption
> > > (a.k.a. "data unit size") is always filesystem blocks, so that's the minimum we
> > > can directly read or write to an encrypted file.  This has nothing to do with
> > > the IV wraparound case also being discussed.
> > 
> > So when you change the subject, please make it *really obvious* so
> > that people don't think you are still talking about the same issue.
> > 
> > > For example, changing a single bit in the plaintext of a filesystem block may
> > > result in the entire block's ciphertext changing.  (The exact behavior depends
> > > on the cryptographic algorithm that is used.)
> > > 
> > > That's why this patchset makes ext4 only allow direct I/O on encrypted files if
> > > the I/O is fully filesystem-block-aligned.  Note that this might be a more
> > > strict alignment requirement than the bdev_logical_block_size().
> > > 
> > > As long as the iomap code only issues filesystem-block-aligned bios, *given
> > > fully filesystem-block-aligned inputs*, we're fine.  That appears to be the case
> > > currently.
> > 
> > The actual size and shape of the bios issued by direct IO (both old
> > code and newer iomap code) is determined by the user supplied iov,
> > the size of the biovec array allocated in the bio, and the IO
> > constraints of the underlying hardware.  Hence direct IO does not
> > guarantee alignment to anything larger than the underlying block
> > device logical sector size because there's no guarantee when or
> > where a bio will fill up.
> > 
> > To guarantee alignment of what ends up at the hardware, you have to
> > set the block device parameters (e.g. logical sector size)
> > appropriately all the way down the stack. You also need to ensure
> > that the filesystem is correctly aligned on the block device so that
> > filesystem blocks don't overlap things like RAID stripe boundaires,
> > linear concat boundaries, etc.
> > 
> > IOWs, to constrain alignment in the IO path, you need to configure
> > you system correct so that the information provided to iomap for IO
> > alignment matches your requirements. This is not somethign iomap can
> > do itself; everything from above needs to be constrained by the
> > filesystem using iomap, everything sent below by iomap is
> > constrained by the block device config.
> 
> That way of thinking about things doesn't entirely work for inline encryption.

Then the inline encryption design is flawed. Block devices tell the
layers above what the minimum unit of atomic IO is via the logical
block size of the device is. Everything above the block device
assumes that it can align and size IO to this size, and the IO will
succeed.

> Hardware can support multiple encryption "data unit sizes", some of which may be
> larger than the logical block size.  (The data unit size is the granularity of
> encryption.  E.g. if software selects data_unit_size=4096, then each invocation
> of the encryption/decryption algorithm is passed 4096 bytes.  You can't then
> later encrypt/decrypt just part of that; that's not how the algorithms work.)

I know what a DUN is. The problem here is that it's the unit of
atomic IO the hardware supports when encryption is enabled....

> For example hardware might *in general* support addressing 512-byte sectors and
> thus have logical_block_size=512.  But it could also support encryption data
> unit sizes [512, 1024, 2048, 4096].  Encrypted I/O has to be aligned to the data
> unit size, not just to the logical block size.  The data unit size to use, and
> whether to use encryption or not, is decided on a per-I/O basis.

And that is the fundamental problem here: DUN > logical block size
of the underlying device. i.e. The storage stack does not guarantee
atomicity of such IOs.

If inline encryption increases the size of the atomic unit of IO,
then the logical block size of the device must increase to match it.
If you do that, then the iomap and storage layers will guarantee
that IOs are *always* aligned to DUN boundaries.

> So in this case technically it's the filesystem (and later the
> bio::bi_crypt_context which the filesystem sets) that knows about the alignment
> needed -- *not* the request_queue.

Exactly my point. Requiring infrastructure and storage layers to
obey completely new, undefined, undiscoverable, opaque and variable
definition of the block devices' "atomic unit of IO", then that's
simply a non-starter. That requires a complete re-architecture of
the block layers and how things interface and transmit information
through them. At minimum, high level IO alignment constraints must
be generic and not be hidden in context specific crypto structures.

> Is it your opinion that inline encryption should only be supported when
> data_unit_size <= logical_block_size?  The problems with that are

Pretty much.

>     (a) Using an unnecessarily small data_unit_size degrades performance a
> 	lot -- for *all* I/O, not just direct I/O.  This is because there are a
> 	lot more separate encryptions/decryptions to do, and there's a fixed
> 	overhead to each one (much of which is intrinsic in the crypto
> 	algorithms themselves, i.e. this isn't simply an implementation quirk).

Performance is irrelevant if correctness is not possible.

>     (b) fscrypt currently only supports data_unit_size == filesystem_block_size.
> 	(OFC, filesystem_block_size may be greater than logical_block_size.)

Which is just fine if FSB == logical block size.

The existing constraint on filesystems is that FSB >= block device
logical sector size as the filesystem has to be able to do single
block IOs.  For inline encryption, this turns into a constraint on
fscrypt that DUN <= logical block size because it requires IOs to be
aligned to DUNs, not filesystem blocks..

And because of the -implementation limitation- of fscrypt that DUN
== FSB, that means the only valid configuration right now is DUN =
FSB = logical sector size.

>     (c) Filesystem images would be less portable, unless the minimum
> 	data_unit_size were used everywhere which would degrade performance.

Not my problem. If the hardware and/or kernel cannot support the
requirements of the encryption used within the filesystem image,
then it -should error out-.

> (We could address (b) by allowing users to specify data_unit_size when
> encrypting a directory.  That would add complexity, but it's possible.)
> 
> But again, as far as I can tell, fs/iomap/direct-io.c currently *does* guarantee
> that *if* the input is fully filesystem-block-aligned and if blocksize <=
> PAGE_SIZE, then the issued I/O is also filesystem-block-aligned.

Please listen to what I'm saying, Eric.

The -current iomap implementation- may provide that behaviour. That
doesn't mean we guarantee that behaviour. i.e. the iomap -design-
does not guaranteee that behaviour, and we don't guarantee such
behaviour into the future. And we won't guarantee this behaviour -
even though the current implementation may provide it - because the
rest of the IO stack below iomap does not provide iomap with that
guarantee.

Hence if iomap cannot get a guarantee that IO it issues won't get
split at some arbitrary boundary, it cannot provide filesystems with
that guarantee.

> So as far as I can tell, there isn't really any problem there, at least not now.
> I just want to make sure we're on the same page...

If that page is "fscrypt+inline encryption is making fundamentally
flawed assumptions about IO stack behaviour" then, yes, we're on the
same page.

And that code that has fundamentally flawed assumptions is grounds
for a NACK, yes?

-Dave.
Dave Chinner July 25, 2020, 11:59 p.m. UTC | #16
On Sun, Jul 26, 2020 at 09:47:51AM +1000, Dave Chinner wrote:
> On Fri, Jul 24, 2020 at 10:41:32AM -0700, Eric Biggers wrote:
> > But again, as far as I can tell, fs/iomap/direct-io.c currently *does* guarantee
> > that *if* the input is fully filesystem-block-aligned and if blocksize <=
> > PAGE_SIZE, then the issued I/O is also filesystem-block-aligned.
> 
> Please listen to what I'm saying, Eric.
> 
> The -current iomap implementation- may provide that behaviour. That
> doesn't mean we guarantee that behaviour. i.e. the iomap -design-
> does not guaranteee that behaviour, and we don't guarantee such
> behaviour into the future. And we won't guarantee this behaviour -
> even though the current implementation may provide it - because the
> rest of the IO stack below iomap does not provide iomap with that
> guarantee.
> 
> Hence if iomap cannot get a guarantee that IO it issues won't get
> split at some arbitrary boundary, it cannot provide filesystems with
> that guarantee.

BTW, if you want iomap_dio_rw() to provide an arbitrary bio
alignment guarantee at the iomap layer, then it should be returned
in the iomap along with the extent mapping. That could then be used
instead of the bdev logical block size. That won't guarantee the
behaviour of the rest of the stack, but it would provide a defined
IO submission behaviour that iomap would have to guarantee into the
future...

That would also remove the need to duplicate the alignment checks
in the filesystem for fscrypt DIO...

Cheers,

Dave.
Eric Biggers July 26, 2020, 2:42 a.m. UTC | #17
On Sun, Jul 26, 2020 at 09:47:51AM +1000, Dave Chinner wrote:
> > > > I think you're missing the point here.  Currently, the granularity of encryption
> > > > (a.k.a. "data unit size") is always filesystem blocks, so that's the minimum we
> > > > can directly read or write to an encrypted file.  This has nothing to do with
> > > > the IV wraparound case also being discussed.
> > > 
> > > So when you change the subject, please make it *really obvious* so
> > > that people don't think you are still talking about the same issue.
> > > 
> > > > For example, changing a single bit in the plaintext of a filesystem block may
> > > > result in the entire block's ciphertext changing.  (The exact behavior depends
> > > > on the cryptographic algorithm that is used.)
> > > > 
> > > > That's why this patchset makes ext4 only allow direct I/O on encrypted files if
> > > > the I/O is fully filesystem-block-aligned.  Note that this might be a more
> > > > strict alignment requirement than the bdev_logical_block_size().
> > > > 
> > > > As long as the iomap code only issues filesystem-block-aligned bios, *given
> > > > fully filesystem-block-aligned inputs*, we're fine.  That appears to be the case
> > > > currently.
> > > 
> > > The actual size and shape of the bios issued by direct IO (both old
> > > code and newer iomap code) is determined by the user supplied iov,
> > > the size of the biovec array allocated in the bio, and the IO
> > > constraints of the underlying hardware.  Hence direct IO does not
> > > guarantee alignment to anything larger than the underlying block
> > > device logical sector size because there's no guarantee when or
> > > where a bio will fill up.
> > > 
> > > To guarantee alignment of what ends up at the hardware, you have to
> > > set the block device parameters (e.g. logical sector size)
> > > appropriately all the way down the stack. You also need to ensure
> > > that the filesystem is correctly aligned on the block device so that
> > > filesystem blocks don't overlap things like RAID stripe boundaires,
> > > linear concat boundaries, etc.
> > > 
> > > IOWs, to constrain alignment in the IO path, you need to configure
> > > you system correct so that the information provided to iomap for IO
> > > alignment matches your requirements. This is not somethign iomap can
> > > do itself; everything from above needs to be constrained by the
> > > filesystem using iomap, everything sent below by iomap is
> > > constrained by the block device config.
> > 
> > That way of thinking about things doesn't entirely work for inline encryption.
> 
> Then the inline encryption design is flawed. Block devices tell the
> layers above what the minimum unit of atomic IO is via the logical
> block size of the device is. Everything above the block device
> assumes that it can align and size IO to this size, and the IO will
> succeed.
> 
> > Hardware can support multiple encryption "data unit sizes", some of which may be
> > larger than the logical block size.  (The data unit size is the granularity of
> > encryption.  E.g. if software selects data_unit_size=4096, then each invocation
> > of the encryption/decryption algorithm is passed 4096 bytes.  You can't then
> > later encrypt/decrypt just part of that; that's not how the algorithms work.)
> 
> I know what a DUN is. The problem here is that it's the unit of
> atomic IO the hardware supports when encryption is enabled....
> 
> > For example hardware might *in general* support addressing 512-byte sectors and
> > thus have logical_block_size=512.  But it could also support encryption data
> > unit sizes [512, 1024, 2048, 4096].  Encrypted I/O has to be aligned to the data
> > unit size, not just to the logical block size.  The data unit size to use, and
> > whether to use encryption or not, is decided on a per-I/O basis.
> 
> And that is the fundamental problem here: DUN > logical block size
> of the underlying device. i.e. The storage stack does not guarantee
> atomicity of such IOs.
> 
> If inline encryption increases the size of the atomic unit of IO,
> then the logical block size of the device must increase to match it.
> If you do that, then the iomap and storage layers will guarantee
> that IOs are *always* aligned to DUN boundaries.
> 
> > So in this case technically it's the filesystem (and later the
> > bio::bi_crypt_context which the filesystem sets) that knows about the alignment
> > needed -- *not* the request_queue.
> 
> Exactly my point. Requiring infrastructure and storage layers to
> obey completely new, undefined, undiscoverable, opaque and variable
> definition of the block devices' "atomic unit of IO", then that's
> simply a non-starter. That requires a complete re-architecture of
> the block layers and how things interface and transmit information
> through them. At minimum, high level IO alignment constraints must
> be generic and not be hidden in context specific crypto structures.

Do you have any specific examples in mind of where *encrypted* I/O may broken at
only a logical_block_size boundary?  Remember that encrypted I/O with a
particular data_unit_size is only issued if the request_queue has declared that
it supports encryption with that data_unit_size.  In the case of a layered
device, that means that every layer would have to opt-into supporting encryption
as well as the specific data_unit_size.

Also, the alignment requirement is already passed down the stack as part of the
bio_crypt_ctx.  If there do turn out to be places that need to use it, we could
easily define generic helper functions:

unsigned int bio_required_alignment(struct bio *bio)
{
        unsigned int alignmask = queue_logical_block_size(bio->bi_disk->queue) - 1;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        if (bio->bi_crypt_context)
                alignmask |= bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size - 1;
#endif

        return alignmask + 1;
}

unsigned int rq_required_alignment(struct request *rq)
{
        unsigned int alignmask = queue_logical_block_size(rq->q) - 1;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        if (rq->crypt_ctx)
                alignmask |= rq->crypt_ctx->bc_key->crypto_cfg.data_unit_size - 1;
#endif

        return alignmask + 1;
}

Sure, we could also add a new alignment_required field to struct bio and struct
request, but it would be unnecessary since all the information is already there.

> > Is it your opinion that inline encryption should only be supported when
> > data_unit_size <= logical_block_size?  The problems with that are
> 
> Pretty much.
> 
> >     (a) Using an unnecessarily small data_unit_size degrades performance a
> > 	lot -- for *all* I/O, not just direct I/O.  This is because there are a
> > 	lot more separate encryptions/decryptions to do, and there's a fixed
> > 	overhead to each one (much of which is intrinsic in the crypto
> > 	algorithms themselves, i.e. this isn't simply an implementation quirk).
> 
> Performance is irrelevant if correctness is not possible.
> 

As far as I know, data_unit_size > logical_block_size is working for everyone
who has used it so far.

So again, I'm curious if you have any specific examples in mind.  Is this a
real-world problem, or just a theoretical case where (in the future) someone
could declare support for some data_unit_size in their 'struct request_queue'
(possibly for a layered device) without correctly handling alignment in all
cases?

I do see that logical_block_size is used for discard, write_same, and zeroout.
But that isn't encrypted I/O.

BTW, there might very well be hardware that *only* supports
data_unit_size > logical_block_size.

- Eric
Eric Biggers July 27, 2020, 5:16 p.m. UTC | #18
On Sat, Jul 25, 2020 at 07:42:11PM -0700, Eric Biggers wrote:
> > Exactly my point. Requiring infrastructure and storage layers to
> > obey completely new, undefined, undiscoverable, opaque and variable
> > definition of the block devices' "atomic unit of IO", then that's
> > simply a non-starter. That requires a complete re-architecture of
> > the block layers and how things interface and transmit information
> > through them. At minimum, high level IO alignment constraints must
> > be generic and not be hidden in context specific crypto structures.
> 
> Do you have any specific examples in mind of where *encrypted* I/O may broken at
> only a logical_block_size boundary?  Remember that encrypted I/O with a
> particular data_unit_size is only issued if the request_queue has declared that
> it supports encryption with that data_unit_size.  In the case of a layered
> device, that means that every layer would have to opt-into supporting encryption
> as well as the specific data_unit_size.
> 
> Also, the alignment requirement is already passed down the stack as part of the
> bio_crypt_ctx.  If there do turn out to be places that need to use it, we could
> easily define generic helper functions:
> 
> unsigned int bio_required_alignment(struct bio *bio)
> {
>         unsigned int alignmask = queue_logical_block_size(bio->bi_disk->queue) - 1;
> 
> #ifdef CONFIG_BLK_INLINE_ENCRYPTION
>         if (bio->bi_crypt_context)
>                 alignmask |= bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size - 1;
> #endif
> 
>         return alignmask + 1;
> }
> 
> unsigned int rq_required_alignment(struct request *rq)
> {
>         unsigned int alignmask = queue_logical_block_size(rq->q) - 1;
> 
> #ifdef CONFIG_BLK_INLINE_ENCRYPTION
>         if (rq->crypt_ctx)
>                 alignmask |= rq->crypt_ctx->bc_key->crypto_cfg.data_unit_size - 1;
> #endif
> 
>         return alignmask + 1;
> }
> 
> Sure, we could also add a new alignment_required field to struct bio and struct
> request, but it would be unnecessary since all the information is already there.
> 
> > > Is it your opinion that inline encryption should only be supported when
> > > data_unit_size <= logical_block_size?  The problems with that are
> > 
> > Pretty much.
> > 
> > >     (a) Using an unnecessarily small data_unit_size degrades performance a
> > > 	lot -- for *all* I/O, not just direct I/O.  This is because there are a
> > > 	lot more separate encryptions/decryptions to do, and there's a fixed
> > > 	overhead to each one (much of which is intrinsic in the crypto
> > > 	algorithms themselves, i.e. this isn't simply an implementation quirk).
> > 
> > Performance is irrelevant if correctness is not possible.
> > 
> 
> As far as I know, data_unit_size > logical_block_size is working for everyone
> who has used it so far.
> 
> So again, I'm curious if you have any specific examples in mind.  Is this a
> real-world problem, or just a theoretical case where (in the future) someone
> could declare support for some data_unit_size in their 'struct request_queue'
> (possibly for a layered device) without correctly handling alignment in all
> cases?
> 
> I do see that logical_block_size is used for discard, write_same, and zeroout.
> But that isn't encrypted I/O.
> 
> BTW, there might very well be hardware that *only* supports
> data_unit_size > logical_block_size.

I found get_max_io_size() in block/blk-merge.c.  I'll check if that needs to be
updated.

Let me know if you have any objection to the fscrypt inline encryption patches
*without direct I/O support* going into 5.9.  Note that fscrypt doesn't directly
care about this block layer stuff at all; instead it uses
blk_crypto_config_supported() to check whether inline encryption with the
specified (crypto_mode, data_unit_size, dun_bytes) combination is supported on
the filesystem's device(s).  Only then will fscrypt use inline encryption
instead of the traditional filesystem-layer encryption.

So if blk_crypto_config_supported() is saying that some crypto configuration is
supported when it isn't, then that's a bug in the blk-crypto patches that went
into the block layer in 5.8, which we need to fix there.  (Ideally by fixing any
cases where encrypted I/O may be split in the middle of a data unit.  But in the
worst case, we could easily make blk_crypto_config_supported() return false when
'data_unit_size > logical_block_size' for now.)

- Eric

Patch
diff mbox series

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ec7b78e6feca..12064daa3e3d 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -6,6 +6,7 @@ 
 #include <linux/module.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/fscrypt.h>
 #include <linux/iomap.h>
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
@@ -183,11 +184,16 @@  static void
 iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 		unsigned len)
 {
+	struct inode *inode = file_inode(dio->iocb->ki_filp);
 	struct page *page = ZERO_PAGE(0);
 	int flags = REQ_SYNC | REQ_IDLE;
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_KERNEL, 1);
+
+	/* encrypted direct I/O is guaranteed to be fs-block aligned */
+	WARN_ON_ONCE(fscrypt_needs_contents_encryption(inode));
+
 	bio_set_dev(bio, iomap->bdev);
 	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 	bio->bi_private = dio;
@@ -253,6 +259,7 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		ret = nr_pages;
 		goto out;
 	}
+	nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);
 
 	if (need_zeroout) {
 		/* zero out from the start of the block to the write offset */
@@ -270,6 +277,8 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		}
 
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
+		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+					  GFP_KERNEL);
 		bio_set_dev(bio, iomap->bdev);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_write_hint = dio->iocb->ki_hint;
@@ -306,9 +315,10 @@  iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->size += n;
 		copied += n;
 
-		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
 		iomap_dio_submit_bio(dio, iomap, bio, pos);
 		pos += n;
+		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+		nr_pages = fscrypt_limit_io_pages(inode, pos, nr_pages);
 	} while (nr_pages);
 
 	/*