diff mbox series

[RFC] scsi: sd_zbc: prevent report zones racing rev_wp_ofst updates

Message ID 20200804142541.17126-1-johannes.thumshirn@wdc.com (mailing list archive)
State Changes Requested
Headers show
Series [RFC] scsi: sd_zbc: prevent report zones racing rev_wp_ofst updates | expand

Commit Message

Johannes Thumshirn Aug. 4, 2020, 2:25 p.m. UTC
When revalidating a zoned SCSI disk, we need to do a REPORT ZONES, which
also updates the write-pointer offset cache.

As we don't want a normal REPORT ZONES to constantly update the
write-pointer offset cache, we swap the cache contents from revalidate
with the live version.

But if a second REPORT ZONES is triggered while '->rev_wp_offset' is
already allocated, sd_zbc_parse_report() can't distinguish the two
different REPORT ZONES (from revalidation context or from a
file-system/ioctl).

                 CPU0                             CPU1

sd_zbc_revalidate_zones()
`-> mutex_lock(&sdkp->rev_mutex);
`-> sdkp->rev_wp_offset = kvcalloc();
`->blk_revalidate_disk_zones();
   `-> disk->fops->report_zones();
       `-> sd_zbc_report_zones();
           `-> sd_zbc_parse_report();
	       `-> if (sdkp->rev_wp_offset)
                   `-> sdkp->rev_wp_offset[idx] =

                                           blkdev_report_zones()
                                           `-> disk->fops->report_zones();
                                               `-> sd_zbc_report_zones();
                                                   `-> sd_zbc_parse_report();
                                        	       `-> if (sdkp->rev_wp_offset)
                                                           `-> sdkp->rev_wp_offset[idx] =

   `-> update_driver_data();
      `-> sd_zbc_revalidate_zones_cb();
          `-> swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
`-> kvfree(sdkp->rev_wp_offset);
`-> sdkp->rev_wp_offset = NULL;
`-> mutex_unlock(&sdkp->rev_mutex);

As two concurrent revalidates are excluded via the '->rev_mutex', try to
grab the '->rev_mutex' in sd_zbc_report_zones(). If we cannot lock the
'->rev_mutex' because it's already held, we know we're called in a disk
revalidate context, if we can grab the mutex we need to unlock it again
after sd_zbc_parse_report() as we're not called in a revalidate context.

This way we can ensure a partial REPORT ZONES doesn't set invalid
write-pointer offsets in the revalidate write-pointer offset cache when a
partial REPORT ZONES is running concurrently with a full REPORT ZONES from
disk revalidation.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 drivers/scsi/sd_zbc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

Comments

Damien Le Moal Aug. 5, 2020, 1:38 a.m. UTC | #1
On 2020/08/04 23:25, Johannes Thumshirn wrote:
> When revalidating a zoned SCSI disk, we need to do a REPORT ZONES, which
> also updates the write-pointer offset cache.
> 
> As we don't want a normal REPORT ZONES to constantly update the
> write-pointer offset cache, we swap the cache contents from revalidate
> with the live version.
> 
> But if a second REPORT ZONES is triggered while '->rev_wp_offset' is
> already allocated, sd_zbc_parse_report() can't distinguish the two
> different REPORT ZONES (from revalidation context or from a
> file-system/ioctl).
> 
>                  CPU0                             CPU1
> 
> sd_zbc_revalidate_zones()
> `-> mutex_lock(&sdkp->rev_mutex);
> `-> sdkp->rev_wp_offset = kvcalloc();
> `->blk_revalidate_disk_zones();
>    `-> disk->fops->report_zones();
>        `-> sd_zbc_report_zones();
>            `-> sd_zbc_parse_report();
> 	       `-> if (sdkp->rev_wp_offset)
>                    `-> sdkp->rev_wp_offset[idx] =
> 
>                                            blkdev_report_zones()
>                                            `-> disk->fops->report_zones();
>                                                `-> sd_zbc_report_zones();
>                                                    `-> sd_zbc_parse_report();
>                                         	       `-> if (sdkp->rev_wp_offset)
>                                                            `-> sdkp->rev_wp_offset[idx] =
> 
>    `-> update_driver_data();
>       `-> sd_zbc_revalidate_zones_cb();
>           `-> swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
> `-> kvfree(sdkp->rev_wp_offset);
> `-> sdkp->rev_wp_offset = NULL;
> `-> mutex_unlock(&sdkp->rev_mutex);
> 
> As two concurrent revalidates are excluded via the '->rev_mutex', try to
> grab the '->rev_mutex' in sd_zbc_report_zones(). If we cannot lock the
> '->rev_mutex' because it's already held, we know we're called in a disk
> revalidate context, if we can grab the mutex we need to unlock it again
> after sd_zbc_parse_report() as we're not called in a revalidate context.
> 
> This way we can ensure a partial REPORT ZONES doesn't set invalid
> write-pointer offsets in the revalidate write-pointer offset cache when a
> partial REPORT ZONES is running concurrently with a full REPORT ZONES from
> disk revalidation.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  drivers/scsi/sd_zbc.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 6f7eba66687e..d19456220c09 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -198,6 +198,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  	unsigned char *buf;
>  	size_t offset, buflen = 0;
>  	int zone_idx = 0;
> +	bool unlock = false;
>  	int ret;
>  
>  	if (!sd_is_zoned(sdkp))
> @@ -223,6 +224,14 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  		if (!nr)
>  			break;
>  
> +		/*
> +		 * Check if we're called by revalidate or by a normal report
> +		 * zones. Mutually exclude report zones due to revalidation and
> +		 * normal report zones, so we're not accidentally overriding the
> +		 * write-pointer offset cache.
> +		 */
> +		if (mutex_trylock(&sdkp->rev_mutex))
> +			unlock = true;
>  		for (i = 0; i < nr && zone_idx < nr_zones; i++) {
>  			offset += 64;
>  			ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
> @@ -231,6 +240,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  				goto out;

You need to unlock rev_mutex if unlock == true before this goto, otherwise zones
revalidation will deadlock.

>  			zone_idx++;
>  		}
> +		if (unlock)
> +			mutex_unlock(&sdkp->rev_mutex);
>  
>  		sector += sd_zbc_zone_sectors(sdkp) * i;
>  	}
>
Johannes Thumshirn Aug. 5, 2020, 7:09 a.m. UTC | #2
On 05/08/2020 03:38, Damien Le Moal wrote:
> You need to unlock rev_mutex if unlock == true before this goto, otherwise zones
> revalidation will deadlock.

Oops correct, thanks!
Damien Le Moal Aug. 5, 2020, 9:10 a.m. UTC | #3
On 2020/08/04 23:25, Johannes Thumshirn wrote:
> When revalidating a zoned SCSI disk, we need to do a REPORT ZONES, which
> also updates the write-pointer offset cache.
> 
> As we don't want a normal REPORT ZONES to constantly update the
> write-pointer offset cache, we swap the cache contents from revalidate
> with the live version.
> 
> But if a second REPORT ZONES is triggered while '->rev_wp_offset' is
> already allocated, sd_zbc_parse_report() can't distinguish the two
> different REPORT ZONES (from revalidation context or from a
> file-system/ioctl).
> 
>                  CPU0                             CPU1
> 
> sd_zbc_revalidate_zones()
> `-> mutex_lock(&sdkp->rev_mutex);
> `-> sdkp->rev_wp_offset = kvcalloc();
> `->blk_revalidate_disk_zones();
>    `-> disk->fops->report_zones();
>        `-> sd_zbc_report_zones();
>            `-> sd_zbc_parse_report();
> 	       `-> if (sdkp->rev_wp_offset)
>                    `-> sdkp->rev_wp_offset[idx] =
> 
>                                            blkdev_report_zones()
>                                            `-> disk->fops->report_zones();
>                                                `-> sd_zbc_report_zones();
>                                                    `-> sd_zbc_parse_report();
>                                         	       `-> if (sdkp->rev_wp_offset)
>                                                            `-> sdkp->rev_wp_offset[idx] =
> 
>    `-> update_driver_data();
>       `-> sd_zbc_revalidate_zones_cb();
>           `-> swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
> `-> kvfree(sdkp->rev_wp_offset);
> `-> sdkp->rev_wp_offset = NULL;
> `-> mutex_unlock(&sdkp->rev_mutex);
> 
> As two concurrent revalidates are excluded via the '->rev_mutex', try to
> grab the '->rev_mutex' in sd_zbc_report_zones(). If we cannot lock the
> '->rev_mutex' because it's already held, we know we're called in a disk
> revalidate context, if we can grab the mutex we need to unlock it again
> after sd_zbc_parse_report() as we're not called in a revalidate context.

Thinking more about this one, I think it does not work: if rev_mutex cannot be
locked, it simply means that revalidate zones is on-going. It does not
necessarily mean that sd_zbc_report_zones() is being called from revalidate
context. Eg: when revalidate is ongoing with rev_mutex locked, if a user calls
blkdev_report_zones() which then calls sd_zbc_report_zones(), the try lock will
fail for the user context, and the execution will not change from before.
sd_zbc_parse_report() calls in the user context will still update the wp offset
array as it sees rev_wp_offset != NULL...

And with the report_zones method interface as it is, I still have no idea how to
differentiate revalidate context from regular report zones user context. Unlike
user space, we do not have recursive lock on mutexes, so can't use that either
to serialize parse calls.

May be something like this would do (not pretty...):

diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 999f54810926..fba312c8725d 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -84,6 +84,7 @@ struct scsi_disk {
        u32             *zones_wp_offset;
        spinlock_t      zones_wp_offset_lock;
        u32             *rev_wp_offset;
+       struct task_struct *rev_task;
        struct mutex    rev_mutex;
        struct work_struct zone_wp_offset_work;
        char            *zone_wp_update_buf;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 33221d8e9f8f..53f0489c3433 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -69,7 +69,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
        if (ret)
                return ret;

-       if (sdkp->rev_wp_offset)
+       if (sdkp->rev_wp_offset && current == sdkp->rev_task)
                sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone);

        return 0;
@@ -688,10 +688,13 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
                goto unlock;
        }

+       sdkp->rev_task = current;
+
        ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);

        kvfree(sdkp->rev_wp_offset);
        sdkp->rev_wp_offset = NULL;
+       sdkp->rev_task = NULL;

        if (ret) {
                sdkp->zone_blocks = 0;


This is totally untested...

> 
> This way we can ensure a partial REPORT ZONES doesn't set invalid
> write-pointer offsets in the revalidate write-pointer offset cache when a
> partial REPORT ZONES is running concurrently with a full REPORT ZONES from
> disk revalidation.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  drivers/scsi/sd_zbc.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 6f7eba66687e..d19456220c09 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -198,6 +198,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  	unsigned char *buf;
>  	size_t offset, buflen = 0;
>  	int zone_idx = 0;
> +	bool unlock = false;
>  	int ret;
>  
>  	if (!sd_is_zoned(sdkp))
> @@ -223,6 +224,14 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  		if (!nr)
>  			break;
>  
> +		/*
> +		 * Check if we're called by revalidate or by a normal report
> +		 * zones. Mutually exclude report zones due to revalidation and
> +		 * normal report zones, so we're not accidentally overriding the
> +		 * write-pointer offset cache.
> +		 */
> +		if (mutex_trylock(&sdkp->rev_mutex))
> +			unlock = true;
>  		for (i = 0; i < nr && zone_idx < nr_zones; i++) {
>  			offset += 64;
>  			ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
> @@ -231,6 +240,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
>  				goto out;
>  			zone_idx++;
>  		}
> +		if (unlock)
> +			mutex_unlock(&sdkp->rev_mutex);
>  
>  		sector += sd_zbc_zone_sectors(sdkp) * i;
>  	}
>
Johannes Thumshirn Aug. 7, 2020, 11:18 a.m. UTC | #4
On 05/08/2020 11:10, Damien Le Moal wrote:
[...]
> 
> May be something like this would do (not pretty...):
> 
> diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
> index 999f54810926..fba312c8725d 100644
> --- a/drivers/scsi/sd.h
> +++ b/drivers/scsi/sd.h
> @@ -84,6 +84,7 @@ struct scsi_disk {
>         u32             *zones_wp_offset;
>         spinlock_t      zones_wp_offset_lock;
>         u32             *rev_wp_offset;
> +       struct task_struct *rev_task;
>         struct mutex    rev_mutex;
>         struct work_struct zone_wp_offset_work;
>         char            *zone_wp_update_buf;
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 33221d8e9f8f..53f0489c3433 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -69,7 +69,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
>         if (ret)
>                 return ret;
> 
> -       if (sdkp->rev_wp_offset)
> +       if (sdkp->rev_wp_offset && current == sdkp->rev_task)
>                 sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone);
> 
>         return 0;
> @@ -688,10 +688,13 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
>                 goto unlock;
>         }
> 
> +       sdkp->rev_task = current;
> +
>         ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
> 
>         kvfree(sdkp->rev_wp_offset);
>         sdkp->rev_wp_offset = NULL;
> +       sdkp->rev_task = NULL;
> 
>         if (ret) {
>                 sdkp->zone_blocks = 0;
> 
> 
> This is totally untested...
> 

I agree it's not pretty but at least way more readable then what I did. Can't test 
though, as I haven't managed to provoke the race yet.
diff mbox series

Patch

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 6f7eba66687e..d19456220c09 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -198,6 +198,7 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 	unsigned char *buf;
 	size_t offset, buflen = 0;
 	int zone_idx = 0;
+	bool unlock = false;
 	int ret;
 
 	if (!sd_is_zoned(sdkp))
@@ -223,6 +224,14 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 		if (!nr)
 			break;
 
+		/*
+		 * Check if we're called by revalidate or by a normal report
+		 * zones. Mutually exclude report zones due to revalidation and
+		 * normal report zones, so we're not accidentally overriding the
+		 * write-pointer offset cache.
+		 */
+		if (mutex_trylock(&sdkp->rev_mutex))
+			unlock = true;
 		for (i = 0; i < nr && zone_idx < nr_zones; i++) {
 			offset += 64;
 			ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
@@ -231,6 +240,8 @@  int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 				goto out;
 			zone_idx++;
 		}
+		if (unlock)
+			mutex_unlock(&sdkp->rev_mutex);
 
 		sector += sd_zbc_zone_sectors(sdkp) * i;
 	}