diff mbox series

[RFC,3/4] nfs/blocklayout: Fix premature PR key unregistration

Message ID 20240619173929.177818-9-cel@kernel.org (mailing list archive)
State New
Headers show
Series Snapshot of fixes for SCSI PR key registration | expand

Commit Message

Chuck Lever June 19, 2024, 5:39 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

During generic/069 runs with pNFS SCSI layouts, the NFS client emits
the following in the system journal:

kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
systemd[1]: fstests-generic-069.scope: Deactivated successfully.
systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
systemd[1]: media-test.mount: Deactivated successfully.
systemd[1]: media-scratch.mount: Deactivated successfully.
kernel: sd 6:0:0:1: reservation conflict
kernel: failed to unregister PR key.

This appears to be due to a race. bl_alloc_lseg() calls this:

561 static struct nfs4_deviceid_node *
562 bl_find_get_deviceid(struct nfs_server *server,
563                 const struct nfs4_deviceid *id, const struct cred *cred,
564                 gfp_t gfp_mask)
565 {
566         struct nfs4_deviceid_node *node;
567         unsigned long start, end;
568
569 retry:
570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
571         if (!node)
572                 return ERR_PTR(-ENODEV);

nfs4_find_get_deviceid() does a lookup without the spin lock first.
If it can't find a matching deviceid, it creates a new device_info
(which calls bl_alloc_deviceid_node, and that registers the device's
PR key).

Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
If it finds it this time, bl_find_get_deviceid() frees the spare
(new) device_info, which unregisters the PR key for the same device.

Any subsequent I/O from this client on that device gets EBADE.

The umount later unregisters the device's PR key again.

To prevent this problem, register the PR key after the deviceid_node
lookup.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/blocklayout/blocklayout.c |  9 ++++++++-
 fs/nfs/blocklayout/blocklayout.h |  1 +
 fs/nfs/blocklayout/dev.c         | 29 +++++++++++++++++++++--------
 3 files changed, 30 insertions(+), 9 deletions(-)

Comments

Christoph Hellwig June 20, 2024, 5:06 a.m. UTC | #1
On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {

It might be worth to invert this and keep the unavailable handling in
the branch as that's the exceptional case.   That code is also woefully
under-documented and could have really used a comment.

> +		struct pnfs_block_dev *d =
> +			container_of(node, struct pnfs_block_dev, node);
> +		if (d->pr_reg)
> +			if (d->pr_reg(d) < 0)
> +				goto out_put;

Empty line after variable declarations.  Also is there anything that
synchronizes the lookups here so that we don't do multiple registrations
in parallel?

> +
> +	if (d->pr_registered)
> +		return 0;
> +
> +	error = ops->pr_register(bdev, 0, d->pr_key, true);
> +	if (error) {
> +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> +		return -error;

->pr_register returns either negative errnos or positive PR_STS_* values,
simply inverting the error here isn't doing the right thing.
Benjamin Coddington June 20, 2024, 1:51 p.m. UTC | #2
On 19 Jun 2024, at 13:39, cel@kernel.org wrote:

> From: Chuck Lever <chuck.lever@oracle.com>
>
> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
> the following in the system journal:
>
> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> kernel: sd 6:0:0:1: reservation conflict
> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
> kernel: sd 6:0:0:1: reservation conflict
> kernel: sd 6:0:0:1: reservation conflict
> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
> systemd[1]: media-test.mount: Deactivated successfully.
> systemd[1]: media-scratch.mount: Deactivated successfully.
> kernel: sd 6:0:0:1: reservation conflict
> kernel: failed to unregister PR key.
>
> This appears to be due to a race. bl_alloc_lseg() calls this:
>
> 561 static struct nfs4_deviceid_node *
> 562 bl_find_get_deviceid(struct nfs_server *server,
> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
> 564                 gfp_t gfp_mask)
> 565 {
> 566         struct nfs4_deviceid_node *node;
> 567         unsigned long start, end;
> 568
> 569 retry:
> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
> 571         if (!node)
> 572                 return ERR_PTR(-ENODEV);
>
> nfs4_find_get_deviceid() does a lookup without the spin lock first.
> If it can't find a matching deviceid, it creates a new device_info
> (which calls bl_alloc_deviceid_node, and that registers the device's
> PR key).
>
> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
> If it finds it this time, bl_find_get_deviceid() frees the spare
> (new) device_info, which unregisters the PR key for the same device.
>
> Any subsequent I/O from this client on that device gets EBADE.
>
> The umount later unregisters the device's PR key again.
>
> To prevent this problem, register the PR key after the deviceid_node
> lookup.

Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
after this patch, instead it just seems like it moves the race.  What
prevents another process waiting to take the nfs4_deviceid_lock from
unregistering the same device?  I think we need another way to signal
bl_free_device that we don't want to unregister for the case where the new
device isn't added to nfs4_deviceid_cache.

No good ideas yet - maybe we can use a flag set within the
nfs4_deviceid_lock?

Ben

>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfs/blocklayout/blocklayout.c |  9 ++++++++-
>  fs/nfs/blocklayout/blocklayout.h |  1 +
>  fs/nfs/blocklayout/dev.c         | 29 +++++++++++++++++++++--------
>  3 files changed, 30 insertions(+), 9 deletions(-)
>
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 6be13e0ec170..75cc5e50bd37 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -571,8 +571,14 @@ bl_find_get_deviceid(struct nfs_server *server,
>  	if (!node)
>  		return ERR_PTR(-ENODEV);
>
> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> +		struct pnfs_block_dev *d =
> +			container_of(node, struct pnfs_block_dev, node);
> +		if (d->pr_reg)
> +			if (d->pr_reg(d) < 0)
> +				goto out_put;
>  		return node;
> +	}
>
>  	end = jiffies;
>  	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
> @@ -581,6 +587,7 @@ bl_find_get_deviceid(struct nfs_server *server,
>  		goto retry;
>  	}
>
> +out_put:
>  	nfs4_put_deviceid_node(node);
>  	return ERR_PTR(-ENODEV);
>  }
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index f1eeb4914199..8aabaf5218b8 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -116,6 +116,7 @@ struct pnfs_block_dev {
>
>  	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
>  			struct pnfs_block_dev_map *map);
> +	int (*pr_reg)(struct pnfs_block_dev *dev);
>  };
>
>  /* sector_t fields are all in 512-byte sectors */
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> index 356bc967fb5d..3d2401820ef4 100644
> --- a/fs/nfs/blocklayout/dev.c
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -230,6 +230,26 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
>  	return true;
>  }
>
> +static int bl_register_scsi(struct pnfs_block_dev *d)
> +{
> +	struct block_device *bdev = file_bdev(d->bdev_file);
> +	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> +	int error;
> +
> +	if (d->pr_registered)
> +		return 0;
> +
> +	error = ops->pr_register(bdev, 0, d->pr_key, true);
> +	if (error) {
> +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> +		return -error;
> +	}
> +
> +	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
> +	d->pr_registered = true;
> +	return 0;
> +}
> +
>  static int
>  bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
> @@ -373,14 +393,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
>  		goto out_blkdev_put;
>  	}
>
> -	error = ops->pr_register(bdev, 0, d->pr_key, true);
> -	if (error) {
> -		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> -		goto out_blkdev_put;
> -	}
> -	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
> -
> -	d->pr_registered = true;
> +	d->pr_reg = bl_register_scsi;
>  	return 0;
>
>  out_blkdev_put:
> -- 
> 2.45.1
Benjamin Coddington June 20, 2024, 1:52 p.m. UTC | #3
On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:

> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
>> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
>> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
>
> It might be worth to invert this and keep the unavailable handling in
> the branch as that's the exceptional case.   That code is also woefully
> under-documented and could have really used a comment.

The transient device handling in general, or just this bit of it?

>> +		struct pnfs_block_dev *d =
>> +			container_of(node, struct pnfs_block_dev, node);
>> +		if (d->pr_reg)
>> +			if (d->pr_reg(d) < 0)
>> +				goto out_put;
>
> Empty line after variable declarations.  Also is there anything that
> synchronizes the lookups here so that we don't do multiple registrations
> in parallel?

I don't think there is.  Do we get an error if we register twice?

Ben

>> +
>> +	if (d->pr_registered)
>> +		return 0;
>> +
>> +	error = ops->pr_register(bdev, 0, d->pr_key, true);
>> +	if (error) {
>> +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
>> +		return -error;
>
> ->pr_register returns either negative errnos or positive PR_STS_* values,
> simply inverting the error here isn't doing the right thing.
Chuck Lever June 20, 2024, 1:58 p.m. UTC | #4
On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
> 
> > On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
> >> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> >> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> >
> > It might be worth to invert this and keep the unavailable handling in
> > the branch as that's the exceptional case.   That code is also woefully
> > under-documented and could have really used a comment.
> 
> The transient device handling in general, or just this bit of it?

I read Christoph's comment as referring specifically to the logic
in bl_find_get_deviceid(). 


> >> +		struct pnfs_block_dev *d =
> >> +			container_of(node, struct pnfs_block_dev, node);
> >> +		if (d->pr_reg)
> >> +			if (d->pr_reg(d) < 0)
> >> +				goto out_put;
> >
> > Empty line after variable declarations.  Also is there anything that
> > synchronizes the lookups here so that we don't do multiple registrations
> > in parallel?
> 
> I don't think there is.  Do we get an error if we register twice?

pr_register() does not throw an error in that case, so I didn't
protect against it. However, I could add atomic bit flags to
pnfs_block_dev to ensure registration is done only once, if we
believe that is necessary.


> Ben
> 
> >> +
> >> +	if (d->pr_registered)
> >> +		return 0;
> >> +
> >> +	error = ops->pr_register(bdev, 0, d->pr_key, true);
> >> +	if (error) {
> >> +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> >> +		return -error;
> >
> > ->pr_register returns either negative errnos or positive PR_STS_* values,
> > simply inverting the error here isn't doing the right thing.
>
Christoph Hellwig June 20, 2024, 2:15 p.m. UTC | #5
On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
> 
> > On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
> >> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> >> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> >
> > It might be worth to invert this and keep the unavailable handling in
> > the branch as that's the exceptional case.   That code is also woefully
> > under-documented and could have really used a comment.
> 
> The transient device handling in general, or just this bit of it?

Basically the code behind this NFS_DEVICEID_UNAVAILABLE check here.

> >> +		if (d->pr_reg)
> >> +			if (d->pr_reg(d) < 0)
> >> +				goto out_put;
> >
> > Empty line after variable declarations.  Also is there anything that
> > synchronizes the lookups here so that we don't do multiple registrations
> > in parallel?
> 
> I don't think there is.  Do we get an error if we register twice?

Yes.  That's the basically the same condition as the one that made
Chuck create this series.
Chuck Lever June 20, 2024, 2:18 p.m. UTC | #6
> On Jun 20, 2024, at 10:15 AM, Christoph Hellwig <hch@lst.de> wrote:
> 
> On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
>> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
>> 
>>> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
>>>> - if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
>>>> + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
>>> 
>>> It might be worth to invert this and keep the unavailable handling in
>>> the branch as that's the exceptional case.   That code is also woefully
>>> under-documented and could have really used a comment.
>> 
>> The transient device handling in general, or just this bit of it?
> 
> Basically the code behind this NFS_DEVICEID_UNAVAILABLE check here.
> 
>>>> + if (d->pr_reg)
>>>> + if (d->pr_reg(d) < 0)
>>>> + goto out_put;
>>> 
>>> Empty line after variable declarations.  Also is there anything that
>>> synchronizes the lookups here so that we don't do multiple registrations
>>> in parallel?
>> 
>> I don't think there is.  Do we get an error if we register twice?
> 
> Yes.  That's the basically the same condition as the one that made
> Chuck create this series.

No, the problem I saw was that the device was unregistered early,
and that resulted in I/O errors and falling back to the MDS.

pr_register() doesn't fail if the device is already registered
(at least when the key is the same).

--
Chuck Lever
Chuck Lever June 20, 2024, 2:34 p.m. UTC | #7
On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
> 
> > From: Chuck Lever <chuck.lever@oracle.com>
> >
> > During generic/069 runs with pNFS SCSI layouts, the NFS client emits
> > the following in the system journal:
> >
> > kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> > kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> > kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> > kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> > kernel: sd 6:0:0:1: reservation conflict
> > kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> > kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
> > kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
> > kernel: sd 6:0:0:1: reservation conflict
> > kernel: sd 6:0:0:1: reservation conflict
> > kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> > kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> > kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
> > kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
> > kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> > kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> > systemd[1]: fstests-generic-069.scope: Deactivated successfully.
> > systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
> > systemd[1]: media-test.mount: Deactivated successfully.
> > systemd[1]: media-scratch.mount: Deactivated successfully.
> > kernel: sd 6:0:0:1: reservation conflict
> > kernel: failed to unregister PR key.
> >
> > This appears to be due to a race. bl_alloc_lseg() calls this:
> >
> > 561 static struct nfs4_deviceid_node *
> > 562 bl_find_get_deviceid(struct nfs_server *server,
> > 563                 const struct nfs4_deviceid *id, const struct cred *cred,
> > 564                 gfp_t gfp_mask)
> > 565 {
> > 566         struct nfs4_deviceid_node *node;
> > 567         unsigned long start, end;
> > 568
> > 569 retry:
> > 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
> > 571         if (!node)
> > 572                 return ERR_PTR(-ENODEV);
> >
> > nfs4_find_get_deviceid() does a lookup without the spin lock first.
> > If it can't find a matching deviceid, it creates a new device_info
> > (which calls bl_alloc_deviceid_node, and that registers the device's
> > PR key).
> >
> > Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
> > If it finds it this time, bl_find_get_deviceid() frees the spare
> > (new) device_info, which unregisters the PR key for the same device.
> >
> > Any subsequent I/O from this client on that device gets EBADE.
> >
> > The umount later unregisters the device's PR key again.
> >
> > To prevent this problem, register the PR key after the deviceid_node
> > lookup.
> 
> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
> after this patch, instead it just seems like it moves the race.  What
> prevents another process waiting to take the nfs4_deviceid_lock from
> unregistering the same device?  I think we need another way to signal
> bl_free_device that we don't want to unregister for the case where the new
> device isn't added to nfs4_deviceid_cache.

That's a (related but) somewhat different issue. I haven't seen
that in practice so far.


> No good ideas yet - maybe we can use a flag set within the
> nfs4_deviceid_lock?

Well this smells like a use for a reference count on the block
device, but fs/nfs doesn't control the definition of that data
structure.


> Ben
> 
> >
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > ---
> >  fs/nfs/blocklayout/blocklayout.c |  9 ++++++++-
> >  fs/nfs/blocklayout/blocklayout.h |  1 +
> >  fs/nfs/blocklayout/dev.c         | 29 +++++++++++++++++++++--------
> >  3 files changed, 30 insertions(+), 9 deletions(-)
> >
> > diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> > index 6be13e0ec170..75cc5e50bd37 100644
> > --- a/fs/nfs/blocklayout/blocklayout.c
> > +++ b/fs/nfs/blocklayout/blocklayout.c
> > @@ -571,8 +571,14 @@ bl_find_get_deviceid(struct nfs_server *server,
> >  	if (!node)
> >  		return ERR_PTR(-ENODEV);
> >
> > -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> > +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> > +		struct pnfs_block_dev *d =
> > +			container_of(node, struct pnfs_block_dev, node);
> > +		if (d->pr_reg)
> > +			if (d->pr_reg(d) < 0)
> > +				goto out_put;
> >  		return node;
> > +	}
> >
> >  	end = jiffies;
> >  	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
> > @@ -581,6 +587,7 @@ bl_find_get_deviceid(struct nfs_server *server,
> >  		goto retry;
> >  	}
> >
> > +out_put:
> >  	nfs4_put_deviceid_node(node);
> >  	return ERR_PTR(-ENODEV);
> >  }
> > diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> > index f1eeb4914199..8aabaf5218b8 100644
> > --- a/fs/nfs/blocklayout/blocklayout.h
> > +++ b/fs/nfs/blocklayout/blocklayout.h
> > @@ -116,6 +116,7 @@ struct pnfs_block_dev {
> >
> >  	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> >  			struct pnfs_block_dev_map *map);
> > +	int (*pr_reg)(struct pnfs_block_dev *dev);
> >  };
> >
> >  /* sector_t fields are all in 512-byte sectors */
> > diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> > index 356bc967fb5d..3d2401820ef4 100644
> > --- a/fs/nfs/blocklayout/dev.c
> > +++ b/fs/nfs/blocklayout/dev.c
> > @@ -230,6 +230,26 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
> >  	return true;
> >  }
> >
> > +static int bl_register_scsi(struct pnfs_block_dev *d)
> > +{
> > +	struct block_device *bdev = file_bdev(d->bdev_file);
> > +	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> > +	int error;
> > +
> > +	if (d->pr_registered)
> > +		return 0;
> > +
> > +	error = ops->pr_register(bdev, 0, d->pr_key, true);
> > +	if (error) {
> > +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> > +		return -error;
> > +	}
> > +
> > +	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
> > +	d->pr_registered = true;
> > +	return 0;
> > +}
> > +
> >  static int
> >  bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> >  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
> > @@ -373,14 +393,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> >  		goto out_blkdev_put;
> >  	}
> >
> > -	error = ops->pr_register(bdev, 0, d->pr_key, true);
> > -	if (error) {
> > -		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> > -		goto out_blkdev_put;
> > -	}
> > -	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
> > -
> > -	d->pr_registered = true;
> > +	d->pr_reg = bl_register_scsi;
> >  	return 0;
> >
> >  out_blkdev_put:
> > -- 
> > 2.45.1
>
Christoph Hellwig June 20, 2024, 2:37 p.m. UTC | #8
On Thu, Jun 20, 2024 at 10:34:03AM -0400, Chuck Lever wrote:
> > No good ideas yet - maybe we can use a flag set within the
> > nfs4_deviceid_lock?
> 
> Well this smells like a use for a reference count on the block
> device, but fs/nfs doesn't control the definition of that data
> structure.

The block device actually has multiple refcounts, up to three
depending on how you count.  I'm not sure how that would solve
anything here, though.
Benjamin Coddington June 20, 2024, 3:30 p.m. UTC | #9
On 20 Jun 2024, at 10:34, Chuck Lever wrote:

> On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
>> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
>>
>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>
>>> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
>>> the following in the system journal:
>>>
>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>> kernel: sd 6:0:0:1: reservation conflict
>>> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
>>> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
>>> kernel: sd 6:0:0:1: reservation conflict
>>> kernel: sd 6:0:0:1: reservation conflict
>>> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
>>> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
>>> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
>>> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
>>> systemd[1]: media-test.mount: Deactivated successfully.
>>> systemd[1]: media-scratch.mount: Deactivated successfully.
>>> kernel: sd 6:0:0:1: reservation conflict
>>> kernel: failed to unregister PR key.
>>>
>>> This appears to be due to a race. bl_alloc_lseg() calls this:
>>>
>>> 561 static struct nfs4_deviceid_node *
>>> 562 bl_find_get_deviceid(struct nfs_server *server,
>>> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
>>> 564                 gfp_t gfp_mask)
>>> 565 {
>>> 566         struct nfs4_deviceid_node *node;
>>> 567         unsigned long start, end;
>>> 568
>>> 569 retry:
>>> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
>>> 571         if (!node)
>>> 572                 return ERR_PTR(-ENODEV);
>>>
>>> nfs4_find_get_deviceid() does a lookup without the spin lock first.
>>> If it can't find a matching deviceid, it creates a new device_info
>>> (which calls bl_alloc_deviceid_node, and that registers the device's
>>> PR key).
>>>
>>> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
>>> If it finds it this time, bl_find_get_deviceid() frees the spare
>>> (new) device_info, which unregisters the PR key for the same device.
>>>
>>> Any subsequent I/O from this client on that device gets EBADE.
>>>
>>> The umount later unregisters the device's PR key again.
>>>
>>> To prevent this problem, register the PR key after the deviceid_node
>>> lookup.
>>
>> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
>> after this patch, instead it just seems like it moves the race.  What
>> prevents another process waiting to take the nfs4_deviceid_lock from
>> unregistering the same device?  I think we need another way to signal
>> bl_free_device that we don't want to unregister for the case where the new
>> device isn't added to nfs4_deviceid_cache.
>
> That's a (related but) somewhat different issue. I haven't seen
> that in practice so far.
>
>
>> No good ideas yet - maybe we can use a flag set within the
>> nfs4_deviceid_lock?
>
> Well this smells like a use for a reference count on the block
> device, but fs/nfs doesn't control the definition of that data
> structure.

I think we need two things to fix this race:
 - a way to determine if the current thread is the one
   that added the device to the to the cache, if so do the register
 - a way to determine if we're freeing because we lost the race to the
   cache, if so don't un-register.

We can get both with a flag that's always set within the nfs4_deviceid_lock,
something like NFS_DEVICEID_INIT.  If set, it signals we need to register in
the case we keep the device, or skip de-registration in the case where we
lost the race and throw it out.  We still need this patch to do the
registration after it lands in the cache.

Ben
Chuck Lever June 20, 2024, 3:39 p.m. UTC | #10
On Thu, Jun 20, 2024 at 07:06:14AM +0200, Christoph Hellwig wrote:
> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
> > -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> > +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> 
> It might be worth to invert this and keep the unavailable handling in
> the branch as that's the exceptional case.   That code is also woefully
> under-documented and could have really used a comment.
> 
> > +		struct pnfs_block_dev *d =
> > +			container_of(node, struct pnfs_block_dev, node);
> > +		if (d->pr_reg)
> > +			if (d->pr_reg(d) < 0)
> > +				goto out_put;
> 
> Empty line after variable declarations.  Also is there anything that
> synchronizes the lookups here so that we don't do multiple registrations
> in parallel?

Is there something (more than d->pr_registered) that prevents that
today?


> > +
> > +	if (d->pr_registered)
> > +		return 0;
> > +
> > +	error = ops->pr_register(bdev, 0, d->pr_key, true);
> > +	if (error) {
> > +		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
> > +		return -error;
> 
> ->pr_register returns either negative errnos or positive PR_STS_* values,
> simply inverting the error here isn't doing the right thing.

I don't see pr_register() returning a negative errno, but I will
remove the "-" and document the expected return values.
Benjamin Coddington June 20, 2024, 3:45 p.m. UTC | #11
On 20 Jun 2024, at 10:15, Christoph Hellwig wrote:

> On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
>> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
>>
>>> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
>>>> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
>>>> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
>>>
>>> It might be worth to invert this and keep the unavailable handling in
>>> the branch as that's the exceptional case.   That code is also woefully
>>> under-documented and could have really used a comment.
>>
>> The transient device handling in general, or just this bit of it?
>
> Basically the code behind this NFS_DEVICEID_UNAVAILABLE check here.

How about..


diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6be13e0ec170..665c054784b4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -564,25 +564,30 @@ bl_find_get_deviceid(struct nfs_server *server,
                gfp_t gfp_mask)
 {
        struct nfs4_deviceid_node *node;
-       unsigned long start, end;

 retry:
        node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
        if (!node)
                return ERR_PTR(-ENODEV);

-       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
-               return node;
+       /* Transient devices are left in the cache with a timeout to minimize
+        * sending GETDEVINFO after every LAYOUTGET */
+       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+               unsigned long start, end;

-       end = jiffies;
-       start = end - PNFS_DEVICE_RETRY_TIMEOUT;
-       if (!time_in_range(node->timestamp_unavailable, start, end)) {
-               nfs4_delete_deviceid(node->ld, node->nfs_client, id);
-               goto retry;
+               end = jiffies;
+               start = end - PNFS_DEVICE_RETRY_TIMEOUT;
+
+               /* Maybe the device is back, let's look for it again */
+               if (!time_in_range(node->timestamp_unavailable, start, end)) {
+                       nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+                       goto retry;
+               }
+               nfs4_put_deviceid_node(node);
+               return ERR_PTR(-ENODEV);
        }

-       nfs4_put_deviceid_node(node);
-       return ERR_PTR(-ENODEV);
+       return node;
 }

 static int
Chuck Lever June 20, 2024, 3:46 p.m. UTC | #12
On Thu, Jun 20, 2024 at 11:30:54AM -0400, Benjamin Coddington wrote:
> On 20 Jun 2024, at 10:34, Chuck Lever wrote:
> 
> > On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
> >> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
> >>
> >>> From: Chuck Lever <chuck.lever@oracle.com>
> >>>
> >>> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
> >>> the following in the system journal:
> >>>
> >>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> >>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> >>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> >>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> >>> kernel: sd 6:0:0:1: reservation conflict
> >>> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
> >>> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
> >>> kernel: sd 6:0:0:1: reservation conflict
> >>> kernel: sd 6:0:0:1: reservation conflict
> >>> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
> >>> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
> >>> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> >>> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> >>> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
> >>> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
> >>> systemd[1]: media-test.mount: Deactivated successfully.
> >>> systemd[1]: media-scratch.mount: Deactivated successfully.
> >>> kernel: sd 6:0:0:1: reservation conflict
> >>> kernel: failed to unregister PR key.
> >>>
> >>> This appears to be due to a race. bl_alloc_lseg() calls this:
> >>>
> >>> 561 static struct nfs4_deviceid_node *
> >>> 562 bl_find_get_deviceid(struct nfs_server *server,
> >>> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
> >>> 564                 gfp_t gfp_mask)
> >>> 565 {
> >>> 566         struct nfs4_deviceid_node *node;
> >>> 567         unsigned long start, end;
> >>> 568
> >>> 569 retry:
> >>> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
> >>> 571         if (!node)
> >>> 572                 return ERR_PTR(-ENODEV);
> >>>
> >>> nfs4_find_get_deviceid() does a lookup without the spin lock first.
> >>> If it can't find a matching deviceid, it creates a new device_info
> >>> (which calls bl_alloc_deviceid_node, and that registers the device's
> >>> PR key).
> >>>
> >>> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
> >>> If it finds it this time, bl_find_get_deviceid() frees the spare
> >>> (new) device_info, which unregisters the PR key for the same device.
> >>>
> >>> Any subsequent I/O from this client on that device gets EBADE.
> >>>
> >>> The umount later unregisters the device's PR key again.
> >>>
> >>> To prevent this problem, register the PR key after the deviceid_node
> >>> lookup.
> >>
> >> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
> >> after this patch, instead it just seems like it moves the race.  What
> >> prevents another process waiting to take the nfs4_deviceid_lock from
> >> unregistering the same device?  I think we need another way to signal
> >> bl_free_device that we don't want to unregister for the case where the new
> >> device isn't added to nfs4_deviceid_cache.
> >
> > That's a (related but) somewhat different issue. I haven't seen
> > that in practice so far.
> >
> >
> >> No good ideas yet - maybe we can use a flag set within the
> >> nfs4_deviceid_lock?
> >
> > Well this smells like a use for a reference count on the block
> > device, but fs/nfs doesn't control the definition of that data
> > structure.
> 
> I think we need two things to fix this race:
>  - a way to determine if the current thread is the one
>    that added the device to the to the cache, if so do the register
>  - a way to determine if we're freeing because we lost the race to the
>    cache, if so don't un-register.

My patch is supposed to deal with all of that already. Can you show
me specifically what is not getting handled by my proposed change?


> We can get both with a flag that's always set within the nfs4_deviceid_lock,
> something like NFS_DEVICEID_INIT.  If set, it signals we need to register in
> the case we keep the device, or skip de-registration in the case where we
> lost the race and throw it out.  We still need this patch to do the
> registration after it lands in the cache.
Chuck Lever June 20, 2024, 3:48 p.m. UTC | #13
On Thu, Jun 20, 2024 at 11:45:02AM -0400, Benjamin Coddington wrote:
> On 20 Jun 2024, at 10:15, Christoph Hellwig wrote:
> 
> > On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
> >> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
> >>
> >>> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
> >>>> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> >>>> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
> >>>
> >>> It might be worth to invert this and keep the unavailable handling in
> >>> the branch as that's the exceptional case.   That code is also woefully
> >>> under-documented and could have really used a comment.
> >>
> >> The transient device handling in general, or just this bit of it?
> >
> > Basically the code behind this NFS_DEVICEID_UNAVAILABLE check here.
> 
> How about..

Let's leave this as a separate patch. IMO this is dealing with an
entirely orthogonal issue.


> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 6be13e0ec170..665c054784b4 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -564,25 +564,30 @@ bl_find_get_deviceid(struct nfs_server *server,
>                 gfp_t gfp_mask)
>  {
>         struct nfs4_deviceid_node *node;
> -       unsigned long start, end;
> 
>  retry:
>         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
>         if (!node)
>                 return ERR_PTR(-ENODEV);
> 
> -       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
> -               return node;
> +       /* Transient devices are left in the cache with a timeout to minimize
> +        * sending GETDEVINFO after every LAYOUTGET */
> +       if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
> +               unsigned long start, end;
> 
> -       end = jiffies;
> -       start = end - PNFS_DEVICE_RETRY_TIMEOUT;
> -       if (!time_in_range(node->timestamp_unavailable, start, end)) {
> -               nfs4_delete_deviceid(node->ld, node->nfs_client, id);
> -               goto retry;
> +               end = jiffies;
> +               start = end - PNFS_DEVICE_RETRY_TIMEOUT;
> +
> +               /* Maybe the device is back, let's look for it again */
> +               if (!time_in_range(node->timestamp_unavailable, start, end)) {
> +                       nfs4_delete_deviceid(node->ld, node->nfs_client, id);
> +                       goto retry;
> +               }
> +               nfs4_put_deviceid_node(node);
> +               return ERR_PTR(-ENODEV);
>         }
> 
> -       nfs4_put_deviceid_node(node);
> -       return ERR_PTR(-ENODEV);
> +       return node;
>  }
> 
>  static int
>
Benjamin Coddington June 20, 2024, 3:56 p.m. UTC | #14
On 20 Jun 2024, at 11:46, Chuck Lever wrote:

> On Thu, Jun 20, 2024 at 11:30:54AM -0400, Benjamin Coddington wrote:
>> On 20 Jun 2024, at 10:34, Chuck Lever wrote:
>>
>>> On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
>>>> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
>>>>
>>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>>
>>>>> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
>>>>> the following in the system journal:
>>>>>
>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
>>>>> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
>>>>> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>>>> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>>>> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
>>>>> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
>>>>> systemd[1]: media-test.mount: Deactivated successfully.
>>>>> systemd[1]: media-scratch.mount: Deactivated successfully.
>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>> kernel: failed to unregister PR key.
>>>>>
>>>>> This appears to be due to a race. bl_alloc_lseg() calls this:
>>>>>
>>>>> 561 static struct nfs4_deviceid_node *
>>>>> 562 bl_find_get_deviceid(struct nfs_server *server,
>>>>> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
>>>>> 564                 gfp_t gfp_mask)
>>>>> 565 {
>>>>> 566         struct nfs4_deviceid_node *node;
>>>>> 567         unsigned long start, end;
>>>>> 568
>>>>> 569 retry:
>>>>> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
>>>>> 571         if (!node)
>>>>> 572                 return ERR_PTR(-ENODEV);
>>>>>
>>>>> nfs4_find_get_deviceid() does a lookup without the spin lock first.
>>>>> If it can't find a matching deviceid, it creates a new device_info
>>>>> (which calls bl_alloc_deviceid_node, and that registers the device's
>>>>> PR key).
>>>>>
>>>>> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
>>>>> If it finds it this time, bl_find_get_deviceid() frees the spare
>>>>> (new) device_info, which unregisters the PR key for the same device.
>>>>>
>>>>> Any subsequent I/O from this client on that device gets EBADE.
>>>>>
>>>>> The umount later unregisters the device's PR key again.
>>>>>
>>>>> To prevent this problem, register the PR key after the deviceid_node
>>>>> lookup.
>>>>
>>>> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
>>>> after this patch, instead it just seems like it moves the race.  What
>>>> prevents another process waiting to take the nfs4_deviceid_lock from
>>>> unregistering the same device?  I think we need another way to signal
>>>> bl_free_device that we don't want to unregister for the case where the new
>>>> device isn't added to nfs4_deviceid_cache.
>>>
>>> That's a (related but) somewhat different issue. I haven't seen
>>> that in practice so far.
>>>
>>>
>>>> No good ideas yet - maybe we can use a flag set within the
>>>> nfs4_deviceid_lock?
>>>
>>> Well this smells like a use for a reference count on the block
>>> device, but fs/nfs doesn't control the definition of that data
>>> structure.
>>
>> I think we need two things to fix this race:
>>  - a way to determine if the current thread is the one
>>    that added the device to the to the cache, if so do the register
>>  - a way to determine if we're freeing because we lost the race to the
>>    cache, if so don't un-register.
>
> My patch is supposed to deal with all of that already. Can you show
> me specifically what is not getting handled by my proposed change?

Well - I may be missing something, but it looks like with this patch you can
still have:

Thread
A                           B

nfs4_find_get_deviceid
new{a} = nfs4_get_device_info
locks nfs4_deviceid_lock
                            nfs4_find_get_deviceid
                            new{b} = nfs4_get_device_info
                            spins on nfs4_deviceid_lock
adds new{a} to the cache
unlocks nfs4_deviceid_lock
pr_register
                            locks nfs4_deviceid_lock
                            finds new{a}
                            pr_UNregisters new{b}

In this case, you end up with an unregistered device.


Also, you can have more than one thread doing the initial pr_register, but I
think as we've already discussed that's no big deal - it should be rare and
I don't think it returns an error.

Ben
Benjamin Coddington June 20, 2024, 3:58 p.m. UTC | #15
On 20 Jun 2024, at 11:48, Chuck Lever wrote:

> On Thu, Jun 20, 2024 at 11:45:02AM -0400, Benjamin Coddington wrote:
>> On 20 Jun 2024, at 10:15, Christoph Hellwig wrote:
>>
>>> On Thu, Jun 20, 2024 at 09:52:59AM -0400, Benjamin Coddington wrote:
>>>> On 20 Jun 2024, at 1:06, Christoph Hellwig wrote:
>>>>
>>>>> On Wed, Jun 19, 2024 at 01:39:33PM -0400, cel@kernel.org wrote:
>>>>>> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
>>>>>> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
>>>>>
>>>>> It might be worth to invert this and keep the unavailable handling in
>>>>> the branch as that's the exceptional case.   That code is also woefully
>>>>> under-documented and could have really used a comment.
>>>>
>>>> The transient device handling in general, or just this bit of it?
>>>
>>> Basically the code behind this NFS_DEVICEID_UNAVAILABLE check here.
>>
>> How about..
>
> Let's leave this as a separate patch. IMO this is dealing with an
> entirely orthogonal issue.

Agree - just wanted feedback on what's appropriate for comments in here.  I
can send something after your work.

Ben
Benjamin Coddington June 20, 2024, 4:45 p.m. UTC | #16
On 20 Jun 2024, at 11:56, Benjamin Coddington wrote:

> On 20 Jun 2024, at 11:46, Chuck Lever wrote:
>
>> On Thu, Jun 20, 2024 at 11:30:54AM -0400, Benjamin Coddington wrote:
>>> On 20 Jun 2024, at 10:34, Chuck Lever wrote:
>>>
>>>> On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
>>>>> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
>>>>>
>>>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>>>
>>>>>> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
>>>>>> the following in the system journal:
>>>>>>
>>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
>>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
>>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
>>>>>> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
>>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
>>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
>>>>>> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>>>>> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
>>>>>> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
>>>>>> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
>>>>>> systemd[1]: media-test.mount: Deactivated successfully.
>>>>>> systemd[1]: media-scratch.mount: Deactivated successfully.
>>>>>> kernel: sd 6:0:0:1: reservation conflict
>>>>>> kernel: failed to unregister PR key.
>>>>>>
>>>>>> This appears to be due to a race. bl_alloc_lseg() calls this:
>>>>>>
>>>>>> 561 static struct nfs4_deviceid_node *
>>>>>> 562 bl_find_get_deviceid(struct nfs_server *server,
>>>>>> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
>>>>>> 564                 gfp_t gfp_mask)
>>>>>> 565 {
>>>>>> 566         struct nfs4_deviceid_node *node;
>>>>>> 567         unsigned long start, end;
>>>>>> 568
>>>>>> 569 retry:
>>>>>> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
>>>>>> 571         if (!node)
>>>>>> 572                 return ERR_PTR(-ENODEV);
>>>>>>
>>>>>> nfs4_find_get_deviceid() does a lookup without the spin lock first.
>>>>>> If it can't find a matching deviceid, it creates a new device_info
>>>>>> (which calls bl_alloc_deviceid_node, and that registers the device's
>>>>>> PR key).
>>>>>>
>>>>>> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
>>>>>> If it finds it this time, bl_find_get_deviceid() frees the spare
>>>>>> (new) device_info, which unregisters the PR key for the same device.
>>>>>>
>>>>>> Any subsequent I/O from this client on that device gets EBADE.
>>>>>>
>>>>>> The umount later unregisters the device's PR key again.
>>>>>>
>>>>>> To prevent this problem, register the PR key after the deviceid_node
>>>>>> lookup.
>>>>>
>>>>> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
>>>>> after this patch, instead it just seems like it moves the race.  What
>>>>> prevents another process waiting to take the nfs4_deviceid_lock from
>>>>> unregistering the same device?  I think we need another way to signal
>>>>> bl_free_device that we don't want to unregister for the case where the new
>>>>> device isn't added to nfs4_deviceid_cache.
>>>>
>>>> That's a (related but) somewhat different issue. I haven't seen
>>>> that in practice so far.
>>>>
>>>>
>>>>> No good ideas yet - maybe we can use a flag set within the
>>>>> nfs4_deviceid_lock?
>>>>
>>>> Well this smells like a use for a reference count on the block
>>>> device, but fs/nfs doesn't control the definition of that data
>>>> structure.
>>>
>>> I think we need two things to fix this race:
>>>  - a way to determine if the current thread is the one
>>>    that added the device to the to the cache, if so do the register
>>>  - a way to determine if we're freeing because we lost the race to the
>>>    cache, if so don't un-register.
>>
>> My patch is supposed to deal with all of that already. Can you show
>> me specifically what is not getting handled by my proposed change?
>
> Well - I may be missing something, but it looks like with this patch you can
> still have:
>
> Thread
> A                           B
>
> nfs4_find_get_deviceid
> new{a} = nfs4_get_device_info
> locks nfs4_deviceid_lock
>                             nfs4_find_get_deviceid
>                             new{b} = nfs4_get_device_info
>                             spins on nfs4_deviceid_lock
> adds new{a} to the cache
> unlocks nfs4_deviceid_lock
> pr_register
>                             locks nfs4_deviceid_lock
>                             finds new{a}
>                             pr_UNregisters new{b}
>
> In this case, you end up with an unregistered device.

Oh jeez Chuck, nevermind - I am missing something, that we check the the
new{b} pnfs_block_dev->pr_registred before unregistering it.

So, actually - I think this patch does solve the problem.  I apologize for
the noise here.

Ben
Chuck Lever June 20, 2024, 5:08 p.m. UTC | #17
On Thu, Jun 20, 2024 at 12:45:56PM -0400, Benjamin Coddington wrote:
> On 20 Jun 2024, at 11:56, Benjamin Coddington wrote:
> 
> > On 20 Jun 2024, at 11:46, Chuck Lever wrote:
> >
> >> On Thu, Jun 20, 2024 at 11:30:54AM -0400, Benjamin Coddington wrote:
> >>> On 20 Jun 2024, at 10:34, Chuck Lever wrote:
> >>>
> >>>> On Thu, Jun 20, 2024 at 09:51:46AM -0400, Benjamin Coddington wrote:
> >>>>> On 19 Jun 2024, at 13:39, cel@kernel.org wrote:
> >>>>>
> >>>>>> From: Chuck Lever <chuck.lever@oracle.com>
> >>>>>>
> >>>>>> During generic/069 runs with pNFS SCSI layouts, the NFS client emits
> >>>>>> the following in the system journal:
> >>>>>>
> >>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> >>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> >>>>>> kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
> >>>>>> kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
> >>>>>> kernel: sd 6:0:0:1: reservation conflict
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
> >>>>>> kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
> >>>>>> kernel: sd 6:0:0:1: reservation conflict
> >>>>>> kernel: sd 6:0:0:1: reservation conflict
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
> >>>>>> kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
> >>>>>> kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> >>>>>> kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
> >>>>>> systemd[1]: fstests-generic-069.scope: Deactivated successfully.
> >>>>>> systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
> >>>>>> systemd[1]: media-test.mount: Deactivated successfully.
> >>>>>> systemd[1]: media-scratch.mount: Deactivated successfully.
> >>>>>> kernel: sd 6:0:0:1: reservation conflict
> >>>>>> kernel: failed to unregister PR key.
> >>>>>>
> >>>>>> This appears to be due to a race. bl_alloc_lseg() calls this:
> >>>>>>
> >>>>>> 561 static struct nfs4_deviceid_node *
> >>>>>> 562 bl_find_get_deviceid(struct nfs_server *server,
> >>>>>> 563                 const struct nfs4_deviceid *id, const struct cred *cred,
> >>>>>> 564                 gfp_t gfp_mask)
> >>>>>> 565 {
> >>>>>> 566         struct nfs4_deviceid_node *node;
> >>>>>> 567         unsigned long start, end;
> >>>>>> 568
> >>>>>> 569 retry:
> >>>>>> 570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
> >>>>>> 571         if (!node)
> >>>>>> 572                 return ERR_PTR(-ENODEV);
> >>>>>>
> >>>>>> nfs4_find_get_deviceid() does a lookup without the spin lock first.
> >>>>>> If it can't find a matching deviceid, it creates a new device_info
> >>>>>> (which calls bl_alloc_deviceid_node, and that registers the device's
> >>>>>> PR key).
> >>>>>>
> >>>>>> Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
> >>>>>> If it finds it this time, bl_find_get_deviceid() frees the spare
> >>>>>> (new) device_info, which unregisters the PR key for the same device.
> >>>>>>
> >>>>>> Any subsequent I/O from this client on that device gets EBADE.
> >>>>>>
> >>>>>> The umount later unregisters the device's PR key again.
> >>>>>>
> >>>>>> To prevent this problem, register the PR key after the deviceid_node
> >>>>>> lookup.
> >>>>>
> >>>>> Hi Chuck - nice catch, but I'm not seeing how we don't have the same problem
> >>>>> after this patch, instead it just seems like it moves the race.  What
> >>>>> prevents another process waiting to take the nfs4_deviceid_lock from
> >>>>> unregistering the same device?  I think we need another way to signal
> >>>>> bl_free_device that we don't want to unregister for the case where the new
> >>>>> device isn't added to nfs4_deviceid_cache.
> >>>>
> >>>> That's a (related but) somewhat different issue. I haven't seen
> >>>> that in practice so far.
> >>>>
> >>>>
> >>>>> No good ideas yet - maybe we can use a flag set within the
> >>>>> nfs4_deviceid_lock?
> >>>>
> >>>> Well this smells like a use for a reference count on the block
> >>>> device, but fs/nfs doesn't control the definition of that data
> >>>> structure.
> >>>
> >>> I think we need two things to fix this race:
> >>>  - a way to determine if the current thread is the one
> >>>    that added the device to the to the cache, if so do the register
> >>>  - a way to determine if we're freeing because we lost the race to the
> >>>    cache, if so don't un-register.
> >>
> >> My patch is supposed to deal with all of that already. Can you show
> >> me specifically what is not getting handled by my proposed change?
> >
> > Well - I may be missing something, but it looks like with this patch you can
> > still have:
> >
> > Thread
> > A                           B
> >
> > nfs4_find_get_deviceid
> > new{a} = nfs4_get_device_info
> > locks nfs4_deviceid_lock
> >                             nfs4_find_get_deviceid
> >                             new{b} = nfs4_get_device_info
> >                             spins on nfs4_deviceid_lock
> > adds new{a} to the cache
> > unlocks nfs4_deviceid_lock
> > pr_register
> >                             locks nfs4_deviceid_lock
> >                             finds new{a}
> >                             pr_UNregisters new{b}
> >
> > In this case, you end up with an unregistered device.
> 
> Oh jeez Chuck, nevermind - I am missing something, that we check the the
> new{b} pnfs_block_dev->pr_registred before unregistering it.
> 
> So, actually - I think this patch does solve the problem.  I apologize for
> the noise here.

Thanks! Was wondering, because I thought that was exactly the race
I was trying to fix!
diff mbox series

Patch

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6be13e0ec170..75cc5e50bd37 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -571,8 +571,14 @@  bl_find_get_deviceid(struct nfs_server *server,
 	if (!node)
 		return ERR_PTR(-ENODEV);
 
-	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) {
+		struct pnfs_block_dev *d =
+			container_of(node, struct pnfs_block_dev, node);
+		if (d->pr_reg)
+			if (d->pr_reg(d) < 0)
+				goto out_put;
 		return node;
+	}
 
 	end = jiffies;
 	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
@@ -581,6 +587,7 @@  bl_find_get_deviceid(struct nfs_server *server,
 		goto retry;
 	}
 
+out_put:
 	nfs4_put_deviceid_node(node);
 	return ERR_PTR(-ENODEV);
 }
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f1eeb4914199..8aabaf5218b8 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -116,6 +116,7 @@  struct pnfs_block_dev {
 
 	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
 			struct pnfs_block_dev_map *map);
+	int (*pr_reg)(struct pnfs_block_dev *dev);
 };
 
 /* sector_t fields are all in 512-byte sectors */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 356bc967fb5d..3d2401820ef4 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -230,6 +230,26 @@  static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
 	return true;
 }
 
+static int bl_register_scsi(struct pnfs_block_dev *d)
+{
+	struct block_device *bdev = file_bdev(d->bdev_file);
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	int error;
+
+	if (d->pr_registered)
+		return 0;
+
+	error = ops->pr_register(bdev, 0, d->pr_key, true);
+	if (error) {
+		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
+		return -error;
+	}
+
+	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
+	d->pr_registered = true;
+	return 0;
+}
+
 static int
 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
@@ -373,14 +393,7 @@  bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		goto out_blkdev_put;
 	}
 
-	error = ops->pr_register(bdev, 0, d->pr_key, true);
-	if (error) {
-		trace_bl_pr_key_reg_err(bdev->bd_disk->disk_name, d->pr_key, error);
-		goto out_blkdev_put;
-	}
-	trace_bl_pr_key_reg(bdev->bd_disk->disk_name, d->pr_key);
-
-	d->pr_registered = true;
+	d->pr_reg = bl_register_scsi;
 	return 0;
 
 out_blkdev_put: