| Field | Value |
|---|---|
| Message ID | 151407700353.38751.17609507477918854876.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive) |
| State | New, archived |
| Headers | show |
On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: > In support of testing truncate colliding with dma add a mechanism that > delays the completion of block I/O requests by a programmable number of > seconds. This allows a truncate operation to be issued while page > references are held for direct-I/O. > > Signed-off-by: Dan Williams <dan.j.williams@intel.com> > @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g > } > EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm); > > +static DEFINE_SPINLOCK(bio_lock); > +static struct bio *biolist; > +int bio_do_queue; > + > +static void run_bio(struct work_struct *work) > +{ > + struct delayed_work *dw = container_of(work, typeof(*dw), work); > + struct bio *bio, *next; > + > + pr_info("%s\n", __func__); Did you mean to leave this print in, or was it part of your debug while developing? I don't see any other prints in the rest of the nvdimm testing code? > + spin_lock(&bio_lock); > + bio_do_queue = 0; > + bio = biolist; > + biolist = NULL; > + spin_unlock(&bio_lock); > + > + while (bio) { > + next = bio->bi_next; > + bio->bi_next = NULL; > + bio_endio(bio); > + bio = next; > + } > + kfree(dw); > +} > + > +void nfit_test_inject_bio_delay(int sec) > +{ > + struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL); > + > + spin_lock(&bio_lock); > + if (!bio_do_queue) { > + pr_info("%s: %d seconds\n", __func__, sec); Ditto with this print - did you mean to leave it in? > + INIT_DELAYED_WORK(dw, run_bio); > + bio_do_queue = 1; > + schedule_delayed_work(dw, sec * HZ); > + dw = NULL; Why set dw = NULL here? In the else case we leak dw - was this dw=NULL meant to allow a kfree(dw) after we get out of the if() (and probably after we drop the spinlock)? 
> + } > + spin_unlock(&bio_lock); > +} > +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay); > + > diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c > index 7217b2b953b5..9362b01e9a8f 100644 > --- a/tools/testing/nvdimm/test/nfit.c > +++ b/tools/testing/nvdimm/test/nfit.c > @@ -872,6 +872,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { > NULL, > }; > > +static ssize_t bio_delay_show(struct device_driver *drv, char *buf) > +{ > + return sprintf(buf, "0\n"); > +} It doesn't seem like this _show() routine adds much? We could have it print out the value of 'bio_do_queue' so we can see if we are currently queueing bios in a workqueue element, but that suffers pretty badly from a TOCTOU race. Otherwise we could just omit the _show() altogether and just use DRIVER_ATTR_WO(bio_delay). > + > +static ssize_t bio_delay_store(struct device_driver *drv, const char *buf, > + size_t count) > +{ > + unsigned long delay; > + int rc = kstrtoul(buf, 0, &delay); > + > + if (rc < 0) > + return rc; > + > + nfit_test_inject_bio_delay(delay); > + return count; > +} > +DRIVER_ATTR_RW(bio_delay); DRIVER_ATTR_WO(bio_delay); ?
On Wed, Dec 27, 2017 at 10:08 AM, Ross Zwisler <ross.zwisler@linux.intel.com> wrote: > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: >> In support of testing truncate colliding with dma add a mechanism that >> delays the completion of block I/O requests by a programmable number of >> seconds. This allows a truncate operation to be issued while page >> references are held for direct-I/O. >> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com> > >> @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g >> } >> EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm); >> >> +static DEFINE_SPINLOCK(bio_lock); >> +static struct bio *biolist; >> +int bio_do_queue; >> + >> +static void run_bio(struct work_struct *work) >> +{ >> + struct delayed_work *dw = container_of(work, typeof(*dw), work); >> + struct bio *bio, *next; >> + >> + pr_info("%s\n", __func__); > > Did you mean to leave this print in, or was it part of your debug while > developing? I don't see any other prints in the rest of the nvdimm testing > code? > >> + spin_lock(&bio_lock); >> + bio_do_queue = 0; >> + bio = biolist; >> + biolist = NULL; >> + spin_unlock(&bio_lock); >> + >> + while (bio) { >> + next = bio->bi_next; >> + bio->bi_next = NULL; >> + bio_endio(bio); >> + bio = next; >> + } >> + kfree(dw); >> +} >> + >> +void nfit_test_inject_bio_delay(int sec) >> +{ >> + struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL); >> + >> + spin_lock(&bio_lock); >> + if (!bio_do_queue) { >> + pr_info("%s: %d seconds\n", __func__, sec); > > Ditto with this print - did you mean to leave it in? Yes, this one plus the previous one are in there deliberately so that I can see the injection / completion of the delay relative to when the test is performing direct-i/o. > >> + INIT_DELAYED_WORK(dw, run_bio); >> + bio_do_queue = 1; >> + schedule_delayed_work(dw, sec * HZ); >> + dw = NULL; > > Why set dw = NULL here? 
In the else case we leak dw - was this dw=NULL meant > to allow a kfree(dw) after we get out of the if() (and probably after we drop > the spinlock)? Something like that, but now it's just a leftover from an initial version of the code, will delete. >> + } >> + spin_unlock(&bio_lock); >> +} >> +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay); >> + > >> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c >> index 7217b2b953b5..9362b01e9a8f 100644 >> --- a/tools/testing/nvdimm/test/nfit.c >> +++ b/tools/testing/nvdimm/test/nfit.c >> @@ -872,6 +872,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { >> NULL, >> }; >> >> +static ssize_t bio_delay_show(struct device_driver *drv, char *buf) >> +{ >> + return sprintf(buf, "0\n"); >> +} > > It doesn't seem like this _show() routine adds much? We could have it print > out the value of 'bio_do_queue' so we can see if we are currently queueing > bios in a workqueue element, but that suffers pretty badly from a TOCTOU race. > > Otherwise we could just omit the _show() altogether and just use > DRIVER_ATTR_WO(bio_delay). Sure.
On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: > In support of testing truncate colliding with dma add a mechanism that > delays the completion of block I/O requests by a programmable number of > seconds. This allows a truncate operation to be issued while page > references are held for direct-I/O. > > Signed-off-by: Dan Williams <dan.j.williams@intel.com> Why not put this in the generic bio layer code and then write a generic fstest to exercise this truncate vs direct IO completion race condition on all types of storage and filesystems? i.e. if it sits in a nvdimm test suite, it's never going to be run by filesystem developers.... Cheers, Dave.
On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote: > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: >> In support of testing truncate colliding with dma add a mechanism that >> delays the completion of block I/O requests by a programmable number of >> seconds. This allows a truncate operation to be issued while page >> references are held for direct-I/O. >> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com> > > Why not put this in the generic bio layer code and then write a > generic fstest to exercise this truncate vs direct IO completion > race condition on all types of storage and filesystems? > > i.e. if it sits in a nvdimm test suite, it's never going to be run > by filesystem developers.... I do want to get it into xfstests eventually. I picked the nvdimm infrastructure for expediency of getting the fix developed. Also, I consider the collision in the non-dax case a solved problem since the core mm will keep the page out of circulation indefinitely.
On Tue 02-01-18 13:51:49, Dan Williams wrote: > On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: > >> In support of testing truncate colliding with dma add a mechanism that > >> delays the completion of block I/O requests by a programmable number of > >> seconds. This allows a truncate operation to be issued while page > >> references are held for direct-I/O. > >> > >> Signed-off-by: Dan Williams <dan.j.williams@intel.com> > > > > Why not put this in the generic bio layer code and then write a > > generic fstest to exercise this truncate vs direct IO completion > > race condition on all types of storage and filesystems? > > > > i.e. if it sits in a nvdimm test suite, it's never going to be run > > by filesystem developers.... > > I do want to get it into xfstests eventually. I picked the nvdimm > infrastructure for expediency of getting the fix developed. Also, I > consider the collision in the non-dax case a solved problem since the > core mm will keep the page out of circulation indefinitely. Yes, but there are different races that could happen even for regular page cache pages. So I also think it would be worthwhile to have this inside the block layer possibly as part of the generic fault-injection framework which is already there for fail_make_request. That already supports various filtering, frequency, and other options that could be useful. Honza
Jan Kara <jack@suse.cz> writes: > On Tue 02-01-18 13:51:49, Dan Williams wrote: >> On Tue, Jan 2, 2018 at 1:44 PM, Dave Chinner <david@fromorbit.com> wrote: >> > On Sat, Dec 23, 2017 at 04:56:43PM -0800, Dan Williams wrote: >> >> In support of testing truncate colliding with dma add a mechanism that >> >> delays the completion of block I/O requests by a programmable number of >> >> seconds. This allows a truncate operation to be issued while page >> >> references are held for direct-I/O. >> >> >> >> Signed-off-by: Dan Williams <dan.j.williams@intel.com> >> > >> > Why not put this in the generic bio layer code and then write a >> > generic fstest to exercise this truncate vs direct IO completion >> > race condition on all types of storage and filesystems? >> > >> > i.e. if it sits in a nvdimm test suite, it's never going to be run >> > by filesystem developers.... >> >> I do want to get it into xfstests eventually. I picked the nvdimm >> infrastructure for expediency of getting the fix developed. Also, I >> consider the collision in the non-dax case a solved problem since the >> core mm will keep the page out of circulation indefinitely. > > Yes, but there are different races that could happen even for regular page > cache pages. So I also think it would be worthwhile to have this inside the > block layer possibly as part of the generic fault-injection framework which > is already there for fail_make_request. That already supports various > filtering, frequency, and other options that could be useful. Or consider extending the dm-delay target (which delays the queuing of bios) to support delaying the completions. I'm not sure I'm a fan of sticking all sorts of debug code into the generic I/O submission path. Cheers, Jeff
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index db33b28c5ef3..afc070c961cd 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -16,6 +16,7 @@ ldflags-y += --wrap=insert_resource ldflags-y += --wrap=remove_resource ldflags-y += --wrap=acpi_evaluate_object ldflags-y += --wrap=acpi_evaluate_dsm +ldflags-y += --wrap=bio_endio DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index ff9d3a5825e1..dd90060c0004 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -10,6 +10,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include <linux/workqueue.h> #include <linux/memremap.h> #include <linux/rculist.h> #include <linux/export.h> @@ -18,6 +19,7 @@ #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/acpi.h> +#include <linux/bio.h> #include <linux/io.h> #include <linux/mm.h> #include "nfit_test.h" @@ -387,4 +389,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g } EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm); +static DEFINE_SPINLOCK(bio_lock); +static struct bio *biolist; +int bio_do_queue; + +static void run_bio(struct work_struct *work) +{ + struct delayed_work *dw = container_of(work, typeof(*dw), work); + struct bio *bio, *next; + + pr_info("%s\n", __func__); + spin_lock(&bio_lock); + bio_do_queue = 0; + bio = biolist; + biolist = NULL; + spin_unlock(&bio_lock); + + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio); + bio = next; + } + kfree(dw); +} + +void nfit_test_inject_bio_delay(int sec) +{ + struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL); + + spin_lock(&bio_lock); + if (!bio_do_queue) { + pr_info("%s: %d seconds\n", __func__, sec); + INIT_DELAYED_WORK(dw, run_bio); + bio_do_queue = 1; + schedule_delayed_work(dw, sec * HZ); + dw = 
NULL; + } + spin_unlock(&bio_lock); +} +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay); + +void __wrap_bio_endio(struct bio *bio) +{ + int did_q = 0; + + spin_lock(&bio_lock); + if (bio_do_queue) { + bio->bi_next = biolist; + biolist = bio; + did_q = 1; + } + spin_unlock(&bio_lock); + + if (did_q) + return; + + bio_endio(bio); +} +EXPORT_SYMBOL_GPL(__wrap_bio_endio); + MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 7217b2b953b5..9362b01e9a8f 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -872,6 +872,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { NULL, }; +static ssize_t bio_delay_show(struct device_driver *drv, char *buf) +{ + return sprintf(buf, "0\n"); +} + +static ssize_t bio_delay_store(struct device_driver *drv, const char *buf, + size_t count) +{ + unsigned long delay; + int rc = kstrtoul(buf, 0, &delay); + + if (rc < 0) + return rc; + + nfit_test_inject_bio_delay(delay); + return count; +} +DRIVER_ATTR_RW(bio_delay); + +static struct attribute *nfit_test_driver_attributes[] = { + &driver_attr_bio_delay.attr, + NULL, +}; + +static struct attribute_group nfit_test_driver_attribute_group = { + .attrs = nfit_test_driver_attributes, +}; + +static const struct attribute_group *nfit_test_driver_attribute_groups[] = { + &nfit_test_driver_attribute_group, + NULL, +}; + static int nfit_test0_alloc(struct nfit_test *t) { size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA @@ -2151,6 +2184,7 @@ static struct platform_driver nfit_test_driver = { .remove = nfit_test_remove, .driver = { .name = KBUILD_MODNAME, + .groups = nfit_test_driver_attribute_groups, }, .id_table = nfit_test_id, }; diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 113b44675a71..744740a76dee 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -98,4 +98,5 
@@ void nfit_test_setup(nfit_test_lookup_fn lookup, nfit_test_evaluate_dsm_fn evaluate); void nfit_test_teardown(void); struct nfit_test_resource *get_nfit_res(resource_size_t resource); +void nfit_test_inject_bio_delay(int sec); #endif
In support of testing truncate colliding with dma add a mechanism that delays the completion of block I/O requests by a programmable number of seconds. This allows a truncate operation to be issued while page references are held for direct-I/O. Signed-off-by: Dan Williams <dan.j.williams@intel.com> --- tools/testing/nvdimm/Kbuild | 1 + tools/testing/nvdimm/test/iomap.c | 62 +++++++++++++++++++++++++++++++++ tools/testing/nvdimm/test/nfit.c | 34 ++++++++++++++++++ tools/testing/nvdimm/test/nfit_test.h | 1 + 4 files changed, 98 insertions(+)