
[v4,08/18] tools/testing/nvdimm: add 'bio_delay' mechanism

Message ID: 151407700353.38751.17609507477918854876.stgit@dwillia2-desk3.amr.corp.intel.com
State: Not Applicable

Commit Message

Dan Williams Dec. 24, 2017, 12:56 a.m. UTC
In support of testing truncate colliding with DMA, add a mechanism that
delays the completion of block I/O requests by a programmable number of
seconds. This allows a truncate operation to be issued while page
references are held for direct I/O.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/Kbuild           |    1 +
 tools/testing/nvdimm/test/iomap.c     |   62 +++++++++++++++++++++++++++++++++
 tools/testing/nvdimm/test/nfit.c      |   34 ++++++++++++++++++
 tools/testing/nvdimm/test/nfit_test.h |    1 +
 4 files changed, 98 insertions(+)



Comments

Ross Zwisler Dec. 27, 2017, 6:08 p.m. UTC | #1
On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
> In support of testing truncate colliding with dma add a mechanism that
> delays the completion of block I/O requests by a programmable number of
> seconds. This allows a truncate operation to be issued while page
> references are held for direct-I/O.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

> @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g
>  }
>  EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm);
>  
> +static DEFINE_SPINLOCK(bio_lock);
> +static struct bio *biolist;
> +int bio_do_queue;
> +
> +static void run_bio(struct work_struct *work)
> +{
> +	struct delayed_work *dw = container_of(work, typeof(*dw), work);
> +	struct bio *bio, *next;
> +
> +	pr_info("%s\n", __func__);

Did you mean to leave this print in, or was it part of your debug while
developing?  I don't see any other prints in the rest of the nvdimm testing
code.

> +	spin_lock(&bio_lock);
> +	bio_do_queue = 0;
> +	bio = biolist;
> +	biolist = NULL;
> +	spin_unlock(&bio_lock);
> +
> +	while (bio) {
> +		next = bio->bi_next;
> +		bio->bi_next = NULL;
> +		bio_endio(bio);
> +		bio = next;
> +	}
> +	kfree(dw);
> +}
> +
> +void nfit_test_inject_bio_delay(int sec)
> +{
> +	struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);
> +
> +	spin_lock(&bio_lock);
> +	if (!bio_do_queue) {
> +		pr_info("%s: %d seconds\n", __func__, sec);

Ditto with this print - did you mean to leave it in?

> +		INIT_DELAYED_WORK(dw, run_bio);
> +		bio_do_queue = 1;
> +		schedule_delayed_work(dw, sec * HZ);
> +		dw = NULL;

Why set dw = NULL here?  In the else case we leak dw - was this dw=NULL meant
to allow a kfree(dw) after we get out of the if() (and probably after we drop
the spinlock)?

> +	}
> +	spin_unlock(&bio_lock);
> +}
> +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay);
> +

> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
> index 7217b2b953b5..9362b01e9a8f 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -872,6 +872,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = {
>  	NULL,
>  };
>  
> +static ssize_t bio_delay_show(struct device_driver *drv, char *buf)
> +{
> +	return sprintf(buf, "0\n");
> +}

It doesn't seem like this _show() routine adds much.  We could have it print
out the value of 'bio_do_queue' so we can see if we are currently queueing
bios in a workqueue element, but that suffers pretty badly from a TOCTOU race.

Otherwise we could just omit the _show() altogether and just use
DRIVER_ATTR_WO(bio_delay).
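
For reference, a minimal sketch of that write-only variant, reusing the
_store() body from the patch unchanged (illustrative, untested):

/* Sketch: drop bio_delay_show() and declare the attribute write-only,
 * so only the store path is wired up. */
static ssize_t bio_delay_store(struct device_driver *drv, const char *buf,
		size_t count)
{
	unsigned long delay;
	int rc = kstrtoul(buf, 0, &delay);

	if (rc < 0)
		return rc;

	nfit_test_inject_bio_delay(delay);
	return count;
}
static DRIVER_ATTR_WO(bio_delay);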

> +
> +static ssize_t bio_delay_store(struct device_driver *drv, const char *buf,
> +		size_t count)
> +{
> +	unsigned long delay;
> +	int rc = kstrtoul(buf, 0, &delay);
> +
> +	if (rc < 0)
> +		return rc;
> +
> +	nfit_test_inject_bio_delay(delay);
> +	return count;
> +}
> +DRIVER_ATTR_RW(bio_delay);

   DRIVER_ATTR_WO(bio_delay);  ?
Dan Williams Jan. 2, 2018, 8:35 p.m. UTC | #2
On Wed, Dec 27, 2017 at 10:08 AM, Ross Zwisler
<ross.zwisler@linux.intel.com> wrote:
> On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
>> In support of testing truncate colliding with dma add a mechanism that
>> delays the completion of block I/O requests by a programmable number of
>> seconds. This allows a truncate operation to be issued while page
>> references are held for direct-I/O.
>>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
>> @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g
>>  }
>>  EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm);
>>
>> +static DEFINE_SPINLOCK(bio_lock);
>> +static struct bio *biolist;
>> +int bio_do_queue;
>> +
>> +static void run_bio(struct work_struct *work)
>> +{
>> +     struct delayed_work *dw = container_of(work, typeof(*dw), work);
>> +     struct bio *bio, *next;
>> +
>> +     pr_info("%s\n", __func__);
>
> Did you mean to leave this print in, or was it part of your debug while
> developing?  I don't see any other prints in the rest of the nvdimm testing
> code?
>
>> +     spin_lock(&bio_lock);
>> +     bio_do_queue = 0;
>> +     bio = biolist;
>> +     biolist = NULL;
>> +     spin_unlock(&bio_lock);
>> +
>> +     while (bio) {
>> +             next = bio->bi_next;
>> +             bio->bi_next = NULL;
>> +             bio_endio(bio);
>> +             bio = next;
>> +     }
>> +     kfree(dw);
>> +}
>> +
>> +void nfit_test_inject_bio_delay(int sec)
>> +{
>> +     struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);
>> +
>> +     spin_lock(&bio_lock);
>> +     if (!bio_do_queue) {
>> +             pr_info("%s: %d seconds\n", __func__, sec);
>
> Ditto with this print - did you mean to leave it in?

Yes, this one plus the previous one are in there deliberately so that
I can see the injection / completion of the delay relative to when the
test is performing direct I/O.

>
>> +             INIT_DELAYED_WORK(dw, run_bio);
>> +             bio_do_queue = 1;
>> +             schedule_delayed_work(dw, sec * HZ);
>> +             dw = NULL;
>
> Why set dw = NULL here?  In the else case we leak dw - was this dw=NULL meant
> to allow a kfree(dw) after we get out of the if() (and probably after we drop
> the spinlock)?

Something like that, but now it's just a leftover from an initial
version of the code; I'll delete it.
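
A sketch of how the helper might end up looking after that cleanup
(untested, also adding the missing kzalloc() failure check):

void nfit_test_inject_bio_delay(int sec)
{
	struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);
	bool queued = false;

	if (!dw)
		return;

	spin_lock(&bio_lock);
	if (!bio_do_queue) {
		pr_info("%s: %d seconds\n", __func__, sec);
		INIT_DELAYED_WORK(dw, run_bio);
		bio_do_queue = 1;
		schedule_delayed_work(dw, sec * HZ);
		queued = true;
	}
	spin_unlock(&bio_lock);

	if (!queued)
		kfree(dw);	/* a delay is already armed; don't leak dw */
}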

>> +     }
>> +     spin_unlock(&bio_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay);
>> +
>
>> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
>> index 7217b2b953b5..9362b01e9a8f 100644
>> --- a/tools/testing/nvdimm/test/nfit.c
>> +++ b/tools/testing/nvdimm/test/nfit.c
>> @@ -872,6 +872,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = {
>>       NULL,
>>  };
>>
>> +static ssize_t bio_delay_show(struct device_driver *drv, char *buf)
>> +{
>> +     return sprintf(buf, "0\n");
>> +}
>
> It doesn't seem like this _show() routine adds much?  We could have it print
> out the value of 'bio_do_queue' so we can see if we are currently queueing
> bios in a workqueue element, but that suffers pretty badly from a TOCTOU race.
>
> Otherwise we could just omit the _show() altogether and just use
> DRIVER_ATTR_WO(bio_delay).

Sure.
Dave Chinner Jan. 2, 2018, 9:44 p.m. UTC | #3
On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
> In support of testing truncate colliding with dma add a mechanism that
> delays the completion of block I/O requests by a programmable number of
> seconds. This allows a truncate operation to be issued while page
> references are held for direct-I/O.
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Why not put this in the generic bio layer code and then write a
generic fstest to exercise this truncate vs direct IO completion
race condition on all types of storage and filesystems?

i.e. if it sits in an nvdimm test suite, it's never going to be run
by filesystem developers....

Cheers,

Dave.
Dan Williams Jan. 2, 2018, 9:51 p.m. UTC | #4
On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
>> In support of testing truncate colliding with dma add a mechanism that
>> delays the completion of block I/O requests by a programmable number of
>> seconds. This allows a truncate operation to be issued while page
>> references are held for direct-I/O.
>>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Why not put this in the generic bio layer code and then write a
> generic fstest to exercise this truncate vs direct IO completion
> race condition on all types of storage and filesystems?
>
> i.e. if it sits in a nvdimm test suite, it's never going to be run
> by filesystem developers....

I do want to get it into xfstests eventually. I picked the nvdimm
infrastructure to expedite developing the fix. Also, I consider the
collision in the non-dax case a solved problem, since the core mm will
keep the page out of circulation indefinitely.
Jan Kara Jan. 3, 2018, 3:46 p.m. UTC | #5
On Tue 02-01-18 13:51:49, Dan Williams wrote:
> On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
> >> In support of testing truncate colliding with dma add a mechanism that
> >> delays the completion of block I/O requests by a programmable number of
> >> seconds. This allows a truncate operation to be issued while page
> >> references are held for direct-I/O.
> >>
> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >
> > Why not put this in the generic bio layer code and then write a
> > generic fstest to exercise this truncate vs direct IO completion
> > race condition on all types of storage and filesystems?
> >
> > i.e. if it sits in a nvdimm test suite, it's never going to be run
> > by filesystem developers....
> 
> I do want to get it into xfstests eventually. I picked the nvdimm
> infrastructure for expediency of getting the fix developed. Also, I
> consider the collision in the non-dax case a solved problem since the
> core mm will keep the page out of circulation indefinitely.

Yes, but there are different races that could happen even for regular page
cache pages. So I also think it would be worthwhile to have this inside the
block layer, possibly as part of the generic fault-injection framework that
is already there for fail_make_request. That already supports various
filtering, frequency, and other options that could be useful.

								Honza
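
For reference, a rough sketch of what such a hook could look like when
driven by the generic fault-injection framework; the names below
(fail_bio_endio, delay_ms) are invented for illustration:

#include <linux/fault-inject.h>
#include <linux/debugfs.h>
#include <linux/bio.h>

static DECLARE_FAULT_ATTR(fail_bio_endio);
static u32 bio_endio_delay_ms = 1000;

static int __init setup_fail_bio_endio(void)
{
	struct dentry *dir;

	/* exposes the usual probability/interval/times/space knobs,
	 * just like fail_make_request */
	dir = fault_create_debugfs_attr("fail_bio_endio", NULL,
			&fail_bio_endio);
	if (IS_ERR(dir))
		return PTR_ERR(dir);
	debugfs_create_u32("delay_ms", 0600, dir, &bio_endio_delay_ms);
	return 0;
}
late_initcall(setup_fail_bio_endio);

/* would be called from bio_endio() ahead of the normal completion path */
static bool bio_should_delay(struct bio *bio)
{
	return should_fail(&fail_bio_endio, bio->bi_iter.bi_size);
}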
Jeff Moyer Jan. 3, 2018, 8:37 p.m. UTC | #6
Jan Kara <jack@suse.cz> writes:

> On Tue 02-01-18 13:51:49, Dan Williams wrote:
>> On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote:
>> > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote:
>> >> In support of testing truncate colliding with dma add a mechanism that
>> >> delays the completion of block I/O requests by a programmable number of
>> >> seconds. This allows a truncate operation to be issued while page
>> >> references are held for direct-I/O.
>> >>
>> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> >
>> > Why not put this in the generic bio layer code and then write a
>> > generic fstest to exercise this truncate vs direct IO completion
>> > race condition on all types of storage and filesystems?
>> >
>> > i.e. if it sits in a nvdimm test suite, it's never going to be run
>> > by filesystem developers....
>> 
>> I do want to get it into xfstests eventually. I picked the nvdimm
>> infrastructure for expediency of getting the fix developed. Also, I
>> consider the collision in the non-dax case a solved problem since the
>> core mm will keep the page out of circulation indefinitely.
>
> Yes, but there are different races that could happen even for regular page
> cache pages. So I also think it would be worthwhile to have this inside the
> block layer possibly as part of the generic fault-injection framework which
> is already there for fail_make_request. That already supports various
> filtering, frequency, and other options that could be useful.

Or consider extending the dm-delay target (which delays the queuing of
bios) to support delaying the completions.  I'm not sure I'm a fan of
sticking all sorts of debug code into the generic I/O submission path.

Cheers,
Jeff

Patch

diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index db33b28c5ef3..afc070c961cd 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -16,6 +16,7 @@  ldflags-y += --wrap=insert_resource
 ldflags-y += --wrap=remove_resource
 ldflags-y += --wrap=acpi_evaluate_object
 ldflags-y += --wrap=acpi_evaluate_dsm
+ldflags-y += --wrap=bio_endio
 
 DRIVERS := ../../../drivers
 NVDIMM_SRC := $(DRIVERS)/nvdimm
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ff9d3a5825e1..dd90060c0004 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -10,6 +10,7 @@ 
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/workqueue.h>
 #include <linux/memremap.h>
 #include <linux/rculist.h>
 #include <linux/export.h>
@@ -18,6 +19,7 @@ 
 #include <linux/types.h>
 #include <linux/pfn_t.h>
 #include <linux/acpi.h>
+#include <linux/bio.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include "nfit_test.h"
@@ -387,4 +389,64 @@  union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g
 }
 EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm);
 
+static DEFINE_SPINLOCK(bio_lock);
+static struct bio *biolist;
+int bio_do_queue;
+
+static void run_bio(struct work_struct *work)
+{
+	struct delayed_work *dw = container_of(work, typeof(*dw), work);
+	struct bio *bio, *next;
+
+	pr_info("%s\n", __func__);
+	spin_lock(&bio_lock);
+	bio_do_queue = 0;
+	bio = biolist;
+	biolist = NULL;
+	spin_unlock(&bio_lock);
+
+	while (bio) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+		bio_endio(bio);
+		bio = next;
+	}
+	kfree(dw);
+}
+
+void nfit_test_inject_bio_delay(int sec)
+{
+	struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL);
+
+	spin_lock(&bio_lock);
+	if (!bio_do_queue) {
+		pr_info("%s: %d seconds\n", __func__, sec);
+		INIT_DELAYED_WORK(dw, run_bio);
+		bio_do_queue = 1;
+		schedule_delayed_work(dw, sec * HZ);
+		dw = NULL;
+	}
+	spin_unlock(&bio_lock);
+}
+EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay);
+
+void __wrap_bio_endio(struct bio *bio)
+{
+	int did_q = 0;
+
+	spin_lock(&bio_lock);
+	if (bio_do_queue) {
+		bio->bi_next = biolist;
+		biolist = bio;
+		did_q = 1;
+	}
+	spin_unlock(&bio_lock);
+
+	if (did_q)
+		return;
+
+	bio_endio(bio);
+}
+EXPORT_SYMBOL_GPL(__wrap_bio_endio);
+
 MODULE_LICENSE("GPL v2");
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 7217b2b953b5..9362b01e9a8f 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -872,6 +872,39 @@  static const struct attribute_group *nfit_test_dimm_attribute_groups[] = {
 	NULL,
 };
 
+static ssize_t bio_delay_show(struct device_driver *drv, char *buf)
+{
+	return sprintf(buf, "0\n");
+}
+
+static ssize_t bio_delay_store(struct device_driver *drv, const char *buf,
+		size_t count)
+{
+	unsigned long delay;
+	int rc = kstrtoul(buf, 0, &delay);
+
+	if (rc < 0)
+		return rc;
+
+	nfit_test_inject_bio_delay(delay);
+	return count;
+}
+DRIVER_ATTR_RW(bio_delay);
+
+static struct attribute *nfit_test_driver_attributes[] = {
+	&driver_attr_bio_delay.attr,
+	NULL,
+};
+
+static struct attribute_group nfit_test_driver_attribute_group = {
+	.attrs = nfit_test_driver_attributes,
+};
+
+static const struct attribute_group *nfit_test_driver_attribute_groups[] = {
+	&nfit_test_driver_attribute_group,
+	NULL,
+};
+
 static int nfit_test0_alloc(struct nfit_test *t)
 {
 	size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA
@@ -2151,6 +2184,7 @@  static struct platform_driver nfit_test_driver = {
 	.remove = nfit_test_remove,
 	.driver = {
 		.name = KBUILD_MODNAME,
+		.groups = nfit_test_driver_attribute_groups,
 	},
 	.id_table = nfit_test_id,
 };
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 113b44675a71..744740a76dee 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -98,4 +98,5 @@  void nfit_test_setup(nfit_test_lookup_fn lookup,
 		nfit_test_evaluate_dsm_fn evaluate);
 void nfit_test_teardown(void);
 struct nfit_test_resource *get_nfit_res(resource_size_t resource);
+void nfit_test_inject_bio_delay(int sec);
 #endif
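
For context, a hypothetical userspace sketch of the collision this knob
is meant to set up: a file on a DAX filesystem is mmap()ed and used as
the buffer for a direct write to a pmem block device, so the pages are
pinned for DMA while the parked bio is in flight and the truncate runs.
The device and file paths and the sysfs attribute location are all
assumptions for illustration.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define BIO_DELAY_ATTR "/sys/bus/platform/drivers/nfit_test/bio_delay"

int main(void)
{
	/* file on a filesystem mounted with -o dax */
	int dax_fd = open("/mnt/dax/victim", O_RDWR | O_CREAT, 0644);
	/* pmem namespace in block (non-dax) mode, so bios are issued */
	int dio_fd = open("/dev/pmem1", O_RDWR | O_DIRECT);
	int attr = open(BIO_DELAY_ATTR, O_WRONLY);
	char *buf;

	if (dax_fd < 0 || dio_fd < 0 || attr < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(dax_fd, 4096))
		return 1;
	buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			dax_fd, 0);
	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 'x', 4096);		/* fault the DAX page in */

	if (write(attr, "5", 1) != 1)	/* park bio completions for 5s */
		return 1;

	if (fork() == 0) {
		/* child: direct write sourced from the DAX mapping; the
		 * page references taken for the I/O stay held while the
		 * bio completion sits in the delayed workqueue */
		pwrite(dio_fd, buf, 4096, 0);
		_exit(0);
	}
	sleep(1);
	/* parent: truncate the mapped file while its page is pinned */
	ftruncate(dax_fd, 0);
	wait(NULL);
	return 0;
}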