diff mbox series

[RFC,4/9] dm: add llseek(SEEK_HOLE/SEEK_DATA) support

Message ID 20240328203910.2370087-5-stefanha@redhat.com (mailing list archive)
State Deferred, archived
Delegated to: Mike Snitzer
Headers show
Series block: add llseek(SEEK_HOLE/SEEK_DATA) support | expand

Commit Message

Stefan Hajnoczi March 28, 2024, 8:39 p.m. UTC
Delegate SEEK_HOLE/SEEK_DATA to device-mapper targets. The new
dm_seek_hole_data() callback allows target types to customize behavior.
The default implementation treats the target as all data with no holes.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/linux/device-mapper.h |  5 +++
 drivers/md/dm.c               | 68 +++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

Comments

Eric Blake March 29, 2024, 12:38 a.m. UTC | #1
On Thu, Mar 28, 2024 at 04:39:05PM -0400, Stefan Hajnoczi wrote:
> Delegate SEEK_HOLE/SEEK_DATA to device-mapper targets. The new
> dm_seek_hole_data() callback allows target types to customize behavior.
> The default implementation treats the target as all data with no holes.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  include/linux/device-mapper.h |  5 +++
>  drivers/md/dm.c               | 68 +++++++++++++++++++++++++++++++++++
>  2 files changed, 73 insertions(+)
> 

> +/* Default implementation for targets that do not implement the callback */
> +static loff_t dm_blk_seek_hole_data_default(loff_t offset, int whence,
> +		loff_t size)
> +{
> +	switch (whence) {
> +	case SEEK_DATA:
> +		if ((unsigned long long)offset >= size)
> +			return -ENXIO;
> +		return offset;
> +	case SEEK_HOLE:
> +		if ((unsigned long long)offset >= size)
> +			return -ENXIO;
> +		return size;

These fail with -ENXIO if offset == size (matching what we do on files)...

> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static loff_t dm_blk_do_seek_hole_data(struct dm_table *table, loff_t offset,
> +		int whence)
> +{
> +	struct dm_target *ti;
> +	loff_t end;
> +
> +	/* Loop when the end of a target is reached */
> +	do {
> +		ti = dm_table_find_target(table, offset >> SECTOR_SHIFT);
> +		if (!ti)
> +			return whence == SEEK_DATA ? -ENXIO : offset;

...but this blindly returns offset for SEEK_HOLE, even when offset is
beyond the end of the dm.  I think you want 'return -ENXIO;'
unconditionally here.

> +
> +		end = (ti->begin + ti->len) << SECTOR_SHIFT;
> +
> +		if (ti->type->seek_hole_data)
> +			offset = ti->type->seek_hole_data(ti, offset, whence);

Are we guaranteed that ti->type->seek_hole_data will not return a
value exceeding end?  Or can dm be used to truncate the view of an
underlying device, and the underlying seek_hold_data can now return an
answer beyond where dm_table_find_target should look for the next part
of the dm's view?

In which case, should the blkdev_seek_hole_data callback be passed a
max size parameter everywhere, similar to how fixed_size_llseek does
things?

> +		else
> +			offset = dm_blk_seek_hole_data_default(offset, whence, end);
> +
> +		if (whence == SEEK_DATA && offset == -ENXIO)
> +			offset = end;

You have a bug here.  If I have a dm contructed of two underlying targets:

|A  |B  |

and A is all data, then whence == SEEK_HOLE will have offset = -ENXIO
at this point, and you fail to check whether B is also data.  That is,
you have silently treated the rest of the block device as data, which
is semantically not wrong (as that is always a safe fallback), but not
optimal.

I think the correct logic is s/whence == SEEK_DATA &&//.

> +	} while (offset == end);

I'm trying to make sure that we can never return the equivalent of
lseek(dm, 0, SEEK_END).  If you make my above suggested changes, we
will iterate through the do loop once more at EOF, and
dm_table_find_target() will then fail to match at which point we do
get the desired -ENXIO for both SEEK_HOLE and SEEK_DATA.

> +
> +	return offset;
> +}
> +
Stefan Hajnoczi April 3, 2024, 2:11 p.m. UTC | #2
On Thu, Mar 28, 2024 at 07:38:20PM -0500, Eric Blake wrote:
> On Thu, Mar 28, 2024 at 04:39:05PM -0400, Stefan Hajnoczi wrote:
> > Delegate SEEK_HOLE/SEEK_DATA to device-mapper targets. The new
> > dm_seek_hole_data() callback allows target types to customize behavior.
> > The default implementation treats the target as all data with no holes.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >  include/linux/device-mapper.h |  5 +++
> >  drivers/md/dm.c               | 68 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 73 insertions(+)
> > 
> 
> > +/* Default implementation for targets that do not implement the callback */
> > +static loff_t dm_blk_seek_hole_data_default(loff_t offset, int whence,
> > +		loff_t size)
> > +{
> > +	switch (whence) {
> > +	case SEEK_DATA:
> > +		if ((unsigned long long)offset >= size)
> > +			return -ENXIO;
> > +		return offset;
> > +	case SEEK_HOLE:
> > +		if ((unsigned long long)offset >= size)
> > +			return -ENXIO;
> > +		return size;
> 
> These fail with -ENXIO if offset == size (matching what we do on files)...
> 
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +}
> > +
> > +static loff_t dm_blk_do_seek_hole_data(struct dm_table *table, loff_t offset,
> > +		int whence)
> > +{
> > +	struct dm_target *ti;
> > +	loff_t end;
> > +
> > +	/* Loop when the end of a target is reached */
> > +	do {
> > +		ti = dm_table_find_target(table, offset >> SECTOR_SHIFT);
> > +		if (!ti)
> > +			return whence == SEEK_DATA ? -ENXIO : offset;
> 
> ...but this blindly returns offset for SEEK_HOLE, even when offset is
> beyond the end of the dm.  I think you want 'return -ENXIO;'
> unconditionally here.

If the initial offset is beyond the end of the table, then SEEK_HOLE
should return -ENXIO. I agree that the code doesn't handle this case.

However, returning offset here is correct when there is data at the end
with SEEK_HOLE.

I'll update the code to address the out-of-bounds offset case, perhaps
by checking the initial offset before entering the loop.

> 
> > +
> > +		end = (ti->begin + ti->len) << SECTOR_SHIFT;
> > +
> > +		if (ti->type->seek_hole_data)
> > +			offset = ti->type->seek_hole_data(ti, offset, whence);
> 
> Are we guaranteed that ti->type->seek_hole_data will not return a
> value exceeding end?  Or can dm be used to truncate the view of an
> underlying device, and the underlying seek_hold_data can now return an
> answer beyond where dm_table_find_target should look for the next part
> of the dm's view?

ti->type->seek_hole_data() must not return a value larger than
(ti->begin + ti->len) << SECTOR_SHIFT.

> 
> In which case, should the blkdev_seek_hole_data callback be passed a
> max size parameter everywhere, similar to how fixed_size_llseek does
> things?
> 
> > +		else
> > +			offset = dm_blk_seek_hole_data_default(offset, whence, end);
> > +
> > +		if (whence == SEEK_DATA && offset == -ENXIO)
> > +			offset = end;
> 
> You have a bug here.  If I have a dm contructed of two underlying targets:
> 
> |A  |B  |
> 
> and A is all data, then whence == SEEK_HOLE will have offset = -ENXIO
> at this point, and you fail to check whether B is also data.  That is,
> you have silently treated the rest of the block device as data, which
> is semantically not wrong (as that is always a safe fallback), but not
> optimal.
> 
> I think the correct logic is s/whence == SEEK_DATA &&//.

No, with whence == SEEK_HOLE and an initial offset in A, the new offset
will be (A->begin + A->end) << SECTOR_SHIFT. The loop will iterate and
continue seeking into B.

The if statement you commented on ensures that we also continue looping
with whence == SEEK_DATA, because that would otherwise prematurely end
with the new offset = -ENXIO.

> 
> > +	} while (offset == end);
> 
> I'm trying to make sure that we can never return the equivalent of
> lseek(dm, 0, SEEK_END).  If you make my above suggested changes, we
> will iterate through the do loop once more at EOF, and
> dm_table_find_target() will then fail to match at which point we do
> get the desired -ENXIO for both SEEK_HOLE and SEEK_DATA.

Wait, lseek() is supposed to return the equivalent of lseek(dm, 0,
SEEK_END) when whence == SEEK_HOLE and there is data at the end.

> 
> > +
> > +	return offset;
> > +}
> > +
> 
> -- 
> Eric Blake, Principal Software Engineer
> Red Hat, Inc.
> Virtualization:  qemu.org | libguestfs.org
>
Eric Blake April 3, 2024, 5:02 p.m. UTC | #3
On Wed, Apr 03, 2024 at 10:11:47AM -0400, Stefan Hajnoczi wrote:
...
> > > +static loff_t dm_blk_do_seek_hole_data(struct dm_table *table, loff_t offset,
> > > +		int whence)
> > > +{
> > > +	struct dm_target *ti;
> > > +	loff_t end;
> > > +
> > > +	/* Loop when the end of a target is reached */
> > > +	do {
> > > +		ti = dm_table_find_target(table, offset >> SECTOR_SHIFT);
> > > +		if (!ti)
> > > +			return whence == SEEK_DATA ? -ENXIO : offset;
> > 
> > ...but this blindly returns offset for SEEK_HOLE, even when offset is
> > beyond the end of the dm.  I think you want 'return -ENXIO;'
> > unconditionally here.
> 
> If the initial offset is beyond the end of the table, then SEEK_HOLE
> should return -ENXIO. I agree that the code doesn't handle this case.
> 
> However, returning offset here is correct when there is data at the end
> with SEEK_HOLE.
> 
> I'll update the code to address the out-of-bounds offset case, perhaps
> by checking the initial offset before entering the loop.

You are correct that if we are on the second loop iteration of
SEEK_HOLE (because the first iteration saw all data), then we have
found the offset of the start of a hole and should return that offset,
not -ENXIO.  This may be a case where we just have to be careful on
whether the initial pass might have any corner cases different from
later times through the loop, and that we end the loop with correct
results for both SEEK_HOLE and SEEK_DATA.

> 
> > 
> > > +
> > > +		end = (ti->begin + ti->len) << SECTOR_SHIFT;
> > > +
> > > +		if (ti->type->seek_hole_data)
> > > +			offset = ti->type->seek_hole_data(ti, offset, whence);
> > 
> > Are we guaranteed that ti->type->seek_hole_data will not return a
> > value exceeding end?  Or can dm be used to truncate the view of an
> > underlying device, and the underlying seek_hold_data can now return an
> > answer beyond where dm_table_find_target should look for the next part
> > of the dm's view?
> 
> ti->type->seek_hole_data() must not return a value larger than
> (ti->begin + ti->len) << SECTOR_SHIFT.

Worth adding as documentation then.

> 
> > 
> > In which case, should the blkdev_seek_hole_data callback be passed a
> > max size parameter everywhere, similar to how fixed_size_llseek does
> > things?
> > 
> > > +		else
> > > +			offset = dm_blk_seek_hole_data_default(offset, whence, end);
> > > +
> > > +		if (whence == SEEK_DATA && offset == -ENXIO)
> > > +			offset = end;
> > 
> > You have a bug here.  If I have a dm contructed of two underlying targets:
> > 
> > |A  |B  |
> > 
> > and A is all data, then whence == SEEK_HOLE will have offset = -ENXIO
> > at this point, and you fail to check whether B is also data.  That is,
> > you have silently treated the rest of the block device as data, which
> > is semantically not wrong (as that is always a safe fallback), but not
> > optimal.
> > 
> > I think the correct logic is s/whence == SEEK_DATA &&//.
> 
> No, with whence == SEEK_HOLE and an initial offset in A, the new offset
> will be (A->begin + A->end) << SECTOR_SHIFT. The loop will iterate and
> continue seeking into B.
> 
> The if statement you commented on ensures that we also continue looping
> with whence == SEEK_DATA, because that would otherwise prematurely end
> with the new offset = -ENXIO.
> 
> > 
> > > +	} while (offset == end);
> > 
> > I'm trying to make sure that we can never return the equivalent of
> > lseek(dm, 0, SEEK_END).  If you make my above suggested changes, we
> > will iterate through the do loop once more at EOF, and
> > dm_table_find_target() will then fail to match at which point we do
> > get the desired -ENXIO for both SEEK_HOLE and SEEK_DATA.
> 
> Wait, lseek() is supposed to return the equivalent of lseek(dm, 0,
> SEEK_END) when whence == SEEK_HOLE and there is data at the end.

It was confusing enough for me to write my initial review, I apologize
if I'm making it harder for you.  Yes, we want to ensure that:

off1 = lseek(fd, -1, SEEK_END);
off2 = off1 + 1; // == lseek(fd, 0, SEEK_END)

if off1 belongs to a data extent:
  - lseek(fd, off1, SEEK_DATA) == off1
  - lseek(fd, off1, SEEK_HOLE) == off2
  - lseek(fd, off2, SEEK_DATA) == -ENXIO
  - lseek(fd, off2, SEEK_HOLE) == -ENXIO

if off1 belongs to a hole:
  - lseek(fd, off1, SEEK_DATA) == -ENXIO
  - lseek(fd, off1, SEEK_HOLE) == off1
  - lseek(fd, off2, SEEK_DATA) == -ENXIO
  - lseek(fd, off2, SEEK_HOLE) == -ENXIO

Anything in my wall of text from the earlier message inconsistent with
this table can be ignored; but at the same time, I was not able to
quickly convince myself that your code properly had those properties,
even after writing up the table.

Reiterating what I said elsewhere, it may be smarter to document that
for callbacks, it is wiser to require intermediate behavior that the
input value 'offset' is always between the half-open range
[ti->begin<<SECTOR_SHIFT, (ti->begin+ti->len)<<SECTOR_SHIFT), and on
success, the output must be in the fully-closed range [offset,
(ti->begin+ti->len)<<SECTOR_SHIFT], errors like -EIO are permitted but
-ENXIO should not be returned; and let the caller worry about
synthesizing -ENXIO from that (since the caller knows whether or not
there is a successor ti where adjacency concerns come into play).

That is, we can never pass in off2 (beyond the bounds of the table),
and when passing in off1, I think this interface may be easier to work
with in the intermediate layers, even though it differs from the
lseek() interface above.  For off1 in data:
  - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off1
  - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off2
and for a hole:
  - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off2
  - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off1
Stefan Hajnoczi April 3, 2024, 5:58 p.m. UTC | #4
On Wed, Apr 03, 2024 at 12:02:19PM -0500, Eric Blake wrote:
> On Wed, Apr 03, 2024 at 10:11:47AM -0400, Stefan Hajnoczi wrote:
> ...
> > > > +static loff_t dm_blk_do_seek_hole_data(struct dm_table *table, loff_t offset,
> > > > +		int whence)
> > > > +{
> > > > +	struct dm_target *ti;
> > > > +	loff_t end;
> > > > +
> > > > +	/* Loop when the end of a target is reached */
> > > > +	do {
> > > > +		ti = dm_table_find_target(table, offset >> SECTOR_SHIFT);
> > > > +		if (!ti)
> > > > +			return whence == SEEK_DATA ? -ENXIO : offset;
> > > 
> > > ...but this blindly returns offset for SEEK_HOLE, even when offset is
> > > beyond the end of the dm.  I think you want 'return -ENXIO;'
> > > unconditionally here.
> > 
> > If the initial offset is beyond the end of the table, then SEEK_HOLE
> > should return -ENXIO. I agree that the code doesn't handle this case.
> > 
> > However, returning offset here is correct when there is data at the end
> > with SEEK_HOLE.
> > 
> > I'll update the code to address the out-of-bounds offset case, perhaps
> > by checking the initial offset before entering the loop.
> 
> You are correct that if we are on the second loop iteration of
> SEEK_HOLE (because the first iteration saw all data), then we have
> found the offset of the start of a hole and should return that offset,
> not -ENXIO.  This may be a case where we just have to be careful on
> whether the initial pass might have any corner cases different from
> later times through the loop, and that we end the loop with correct
> results for both SEEK_HOLE and SEEK_DATA.
> 
> > 
> > > 
> > > > +
> > > > +		end = (ti->begin + ti->len) << SECTOR_SHIFT;
> > > > +
> > > > +		if (ti->type->seek_hole_data)
> > > > +			offset = ti->type->seek_hole_data(ti, offset, whence);
> > > 
> > > Are we guaranteed that ti->type->seek_hole_data will not return a
> > > value exceeding end?  Or can dm be used to truncate the view of an
> > > underlying device, and the underlying seek_hold_data can now return an
> > > answer beyond where dm_table_find_target should look for the next part
> > > of the dm's view?
> > 
> > ti->type->seek_hole_data() must not return a value larger than
> > (ti->begin + ti->len) << SECTOR_SHIFT.
> 
> Worth adding as documentation then.
> 
> > 
> > > 
> > > In which case, should the blkdev_seek_hole_data callback be passed a
> > > max size parameter everywhere, similar to how fixed_size_llseek does
> > > things?
> > > 
> > > > +		else
> > > > +			offset = dm_blk_seek_hole_data_default(offset, whence, end);
> > > > +
> > > > +		if (whence == SEEK_DATA && offset == -ENXIO)
> > > > +			offset = end;
> > > 
> > > You have a bug here.  If I have a dm contructed of two underlying targets:
> > > 
> > > |A  |B  |
> > > 
> > > and A is all data, then whence == SEEK_HOLE will have offset = -ENXIO
> > > at this point, and you fail to check whether B is also data.  That is,
> > > you have silently treated the rest of the block device as data, which
> > > is semantically not wrong (as that is always a safe fallback), but not
> > > optimal.
> > > 
> > > I think the correct logic is s/whence == SEEK_DATA &&//.
> > 
> > No, with whence == SEEK_HOLE and an initial offset in A, the new offset
> > will be (A->begin + A->end) << SECTOR_SHIFT. The loop will iterate and
> > continue seeking into B.
> > 
> > The if statement you commented on ensures that we also continue looping
> > with whence == SEEK_DATA, because that would otherwise prematurely end
> > with the new offset = -ENXIO.
> > 
> > > 
> > > > +	} while (offset == end);
> > > 
> > > I'm trying to make sure that we can never return the equivalent of
> > > lseek(dm, 0, SEEK_END).  If you make my above suggested changes, we
> > > will iterate through the do loop once more at EOF, and
> > > dm_table_find_target() will then fail to match at which point we do
> > > get the desired -ENXIO for both SEEK_HOLE and SEEK_DATA.
> > 
> > Wait, lseek() is supposed to return the equivalent of lseek(dm, 0,
> > SEEK_END) when whence == SEEK_HOLE and there is data at the end.
> 
> It was confusing enough for me to write my initial review, I apologize
> if I'm making it harder for you.

No worries, if my code is hard to understand I can learn from your
feedback.

> Yes, we want to ensure that:
> 
> off1 = lseek(fd, -1, SEEK_END);
> off2 = off1 + 1; // == lseek(fd, 0, SEEK_END)
> 
> if off1 belongs to a data extent:
>   - lseek(fd, off1, SEEK_DATA) == off1
>   - lseek(fd, off1, SEEK_HOLE) == off2
>   - lseek(fd, off2, SEEK_DATA) == -ENXIO
>   - lseek(fd, off2, SEEK_HOLE) == -ENXIO

Agreed.

> if off1 belongs to a hole:
>   - lseek(fd, off1, SEEK_DATA) == -ENXIO
>   - lseek(fd, off1, SEEK_HOLE) == off1
>   - lseek(fd, off2, SEEK_DATA) == -ENXIO
>   - lseek(fd, off2, SEEK_HOLE) == -ENXIO

Agreed.

> 
> Anything in my wall of text from the earlier message inconsistent with
> this table can be ignored; but at the same time, I was not able to
> quickly convince myself that your code properly had those properties,
> even after writing up the table.
> 
> Reiterating what I said elsewhere, it may be smarter to document that
> for callbacks, it is wiser to require intermediate behavior that the
> input value 'offset' is always between the half-open range
> [ti->begin<<SECTOR_SHIFT, (ti->begin+ti->len)<<SECTOR_SHIFT), and on
> success, the output must be in the fully-closed range [offset,
> (ti->begin+ti->len)<<SECTOR_SHIFT], errors like -EIO are permitted but
> -ENXIO should not be returned; and let the caller worry about
> synthesizing -ENXIO from that (since the caller knows whether or not
> there is a successor ti where adjacency concerns come into play).
> 
> That is, we can never pass in off2 (beyond the bounds of the table),
> and when passing in off1, I think this interface may be easier to work
> with in the intermediate layers, even though it differs from the
> lseek() interface above.  For off1 in data:
>   - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off1
>   - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off2
> and for a hole:
>   - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off2
>   - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off1

I'll take a look again starting from block/fops.c, through dm.c, and
into dm-linear.c to see how to make things clearest. Although I would
like to have the same semantics for every seek function, maybe in the
end your suggestion will make the code clearer. Let's see.

Stefan
Eric Blake April 3, 2024, 7:28 p.m. UTC | #5
On Wed, Apr 03, 2024 at 01:58:38PM -0400, Stefan Hajnoczi wrote:
> > Yes, we want to ensure that:
> > 
> > off1 = lseek(fd, -1, SEEK_END);
> > off2 = off1 + 1; // == lseek(fd, 0, SEEK_END)
> > 
> > if off1 belongs to a data extent:
> >   - lseek(fd, off1, SEEK_DATA) == off1
> >   - lseek(fd, off1, SEEK_HOLE) == off2
> >   - lseek(fd, off2, SEEK_DATA) == -ENXIO
> >   - lseek(fd, off2, SEEK_HOLE) == -ENXIO
> 
> Agreed.
> 
> > if off1 belongs to a hole:
> >   - lseek(fd, off1, SEEK_DATA) == -ENXIO
> >   - lseek(fd, off1, SEEK_HOLE) == off1
> >   - lseek(fd, off2, SEEK_DATA) == -ENXIO
> >   - lseek(fd, off2, SEEK_HOLE) == -ENXIO
> 
> Agreed.
> 
...
> > 
> > That is, we can never pass in off2 (beyond the bounds of the table),
> > and when passing in off1, I think this interface may be easier to work
> > with in the intermediate layers, even though it differs from the
> > lseek() interface above.  For off1 in data:
> >   - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off1
> >   - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off2
> > and for a hole:
> >   - dm_blk_do_seek_hole_data(dm, off1, SEEK_DATA) == off2
> >   - dm_blk_do_seek_hole_data(dm, off1, SEEK_HOLE) == off1
>

In the caller, we already need to know both off1 and off2 (that is,
the offset we are asking about, AND the offset at which the underlying
component ends its map at, since that is where we are then in charge
of whether to concatenate that with the next component or give the
overall result).

If we already guarantee that we never call into a lower layer with
off2 (because it was beyond bounds), then the only difference between
the two semantics is whether SEEK_DATA in a final hole returns -ENXIO
or off2; and it looks like we can do a conversion from either style
into the other.

So designing the caller logic, it looks like I would want:

if off1 >= EOF return -ENXIO (out of bounds)

while off1 < EOF:

  determine off2 of current lower region
  at this point, we are guaranteed off1 < off2
  we also know that off2 < EOF unless we are on last lower region
  call result=lower_layer(off1, SEEK_X)
  it is a bug if result is non-negative but < off1, or if result > off2

  if result == off1, return result (we are already in the right HOLE
  or DATA)

  if result > off1 and < off2, return result (we had to advance to get
  to the right region, but the right region was within the lower
  layer, and we don't need to inspect further layers)

  Note - the above two paragraphs can be collapsed into one: if result
  < off2, return result (the current subregion gave us an adequate
  answer)

  if result == off2, continue to the next region with off1=result (in
  first semantics, this is only possible for SEEK_HOLE for a lower
  region ending in data; in the second semantics, it is possible for
  either SEEK_HOLE or SEEK_DATA for a lower region ending in the type
  opposite of the request)

  if result == -ENXIO, continue to the next region by using off1=off2
  (only possible in the first semantics for SEEK_DATA on a lower
  region ending in a hole)

  any other result is necessarily a negative errno like -EIO; return it

if the loop completes without an early return, we are out of lower
regions, and we should be left with off1 == EOF.  Because we advanced,
we know now to:
return whence == SEEK_HOLE ? off1 : -ENXIO

> I'll take a look again starting from block/fops.c, through dm.c, and
> into dm-linear.c to see how to make things clearest. Although I would
> like to have the same semantics for every seek function, maybe in the
> end your suggestion will make the code clearer. Let's see.

Keeping lseek semantics may require a couple more lines of code
duplicated at each layer, but less maintainer headaches in the long
run of converting between slightly different semantics depending on
which layer you are at.
diff mbox series

Patch

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca7..e89ebaab6507a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -161,6 +161,10 @@  typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
 typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i);
 
+/* Like llseek(SEEK_HOLE/SEEK_DATA) */
+typedef loff_t (*dm_seek_hole_data)(struct dm_target *ti, loff_t offset,
+		int whence);
+
 void dm_error(const char *message);
 
 struct dm_dev {
@@ -210,6 +214,7 @@  struct target_type {
 	dm_dax_direct_access_fn direct_access;
 	dm_dax_zero_page_range_fn dax_zero_page_range;
 	dm_dax_recovery_write_fn dax_recovery_write;
+	dm_seek_hole_data seek_hole_data;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d715..3c921bdbd17fc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -3167,6 +3167,72 @@  void dm_free_md_mempools(struct dm_md_mempools *pools)
 	kfree(pools);
 }
 
+/* Default implementation for targets that do not implement the callback */
+static loff_t dm_blk_seek_hole_data_default(loff_t offset, int whence,
+		loff_t size)
+{
+	switch (whence) {
+	case SEEK_DATA:
+		if ((unsigned long long)offset >= size)
+			return -ENXIO;
+		return offset;
+	case SEEK_HOLE:
+		if ((unsigned long long)offset >= size)
+			return -ENXIO;
+		return size;
+	default:
+		return -EINVAL;
+	}
+}
+
+static loff_t dm_blk_do_seek_hole_data(struct dm_table *table, loff_t offset,
+		int whence)
+{
+	struct dm_target *ti;
+	loff_t end;
+
+	/* Loop when the end of a target is reached */
+	do {
+		ti = dm_table_find_target(table, offset >> SECTOR_SHIFT);
+		if (!ti)
+			return whence == SEEK_DATA ? -ENXIO : offset;
+
+		end = (ti->begin + ti->len) << SECTOR_SHIFT;
+
+		if (ti->type->seek_hole_data)
+			offset = ti->type->seek_hole_data(ti, offset, whence);
+		else
+			offset = dm_blk_seek_hole_data_default(offset, whence, end);
+
+		if (whence == SEEK_DATA && offset == -ENXIO)
+			offset = end;
+	} while (offset == end);
+
+	return offset;
+}
+
+static loff_t dm_blk_seek_hole_data(struct block_device *bdev, loff_t offset,
+		int whence)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+	struct dm_table *table;
+	int srcu_idx;
+	loff_t ret;
+
+	if (dm_suspended_md(md))
+		return -EAGAIN;
+
+	table = dm_get_live_table(md, &srcu_idx);
+	if (!table)
+		return -EIO;
+
+	ret = dm_blk_do_seek_hole_data(table, offset, whence);
+
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
 struct dm_pr {
 	u64	old_key;
 	u64	new_key;
@@ -3493,6 +3559,7 @@  static const struct block_device_operations dm_blk_dops = {
 	.getgeo = dm_blk_getgeo,
 	.report_zones = dm_blk_report_zones,
 	.pr_ops = &dm_pr_ops,
+	.seek_hole_data = dm_blk_seek_hole_data,
 	.owner = THIS_MODULE
 };
 
@@ -3502,6 +3569,7 @@  static const struct block_device_operations dm_rq_blk_dops = {
 	.ioctl = dm_blk_ioctl,
 	.getgeo = dm_blk_getgeo,
 	.pr_ops = &dm_pr_ops,
+	.seek_hole_data = dm_blk_seek_hole_data,
 	.owner = THIS_MODULE
 };