diff mbox

[v4] libnvdimm: fix clear poison locking with spinlock and GFP_NOWAIT allocation

Message ID 149202093948.14586.10025145039608378870.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dave Jiang April 12, 2017, 6:15 p.m. UTC
The following warning results from holding a lane spinlock,
preempt_disable(), or the btt map spinlock and then trying to take the
reconfig_mutex to walk the poison list and potentially add new entries.

BUG: sleeping function called from invalid context at kernel/locking/mutex.
c:747
in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd
[..]
Call Trace:
dump_stack+0x85/0xc8
___might_sleep+0x184/0x250
__might_sleep+0x4a/0x90
__mutex_lock+0x58/0x9b0
? nvdimm_bus_lock+0x21/0x30 [libnvdimm]
? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm]
? acpi_nfit_forget_poison+0x79/0x80 [nfit]
? _raw_spin_unlock+0x27/0x40
mutex_lock_nested+0x1b/0x20
nvdimm_bus_lock+0x21/0x30 [libnvdimm]
nvdimm_forget_poison+0x25/0x50 [libnvdimm]
nvdimm_clear_poison+0x106/0x140 [libnvdimm]
nsio_rw_bytes+0x164/0x270 [libnvdimm]
btt_write_pg+0x1de/0x3e0 [nd_btt]
? blk_queue_enter+0x30/0x290
btt_make_request+0x11a/0x310 [nd_btt]
? blk_queue_enter+0xb7/0x290
? blk_queue_enter+0x30/0x290
generic_make_request+0x118/0x3b0

A spinlock is introduced to protect the poison list. This allows us to not
have to acquire the reconfig_mutex for touching the poison list. The
add_poison() function has been broken out into two helper functions. One to
allocate the poison entry and the other to append the entry. This allows us
to unlock the poison_lock in non-I/O path and continue to be able to allocate
the poison entry with GFP_KERNEL. We will use GFP_NOWAIT in the I/O path in
order to satisfy being in atomic context.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/nvdimm/bus.c      |    7 +++--
 drivers/nvdimm/core.c     |   61 ++++++++++++++++++++++++++++-----------------
 drivers/nvdimm/nd-core.h  |    1 +
 include/linux/libnvdimm.h |    2 -
 4 files changed, 43 insertions(+), 28 deletions(-)

Comments

Verma, Vishal L April 13, 2017, 12:39 a.m. UTC | #1
On 04/12, Dave Jiang wrote:
> The following warning results from holding a lane spinlock,
> preempt_disable(), or the btt map spinlock and then trying to take the
> reconfig_mutex to walk the poison list and potentially add new entries.
> 
> BUG: sleeping function called from invalid context at kernel/locking/mutex.
> c:747
> in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd
> [..]
> Call Trace:
> dump_stack+0x85/0xc8
> ___might_sleep+0x184/0x250
> __might_sleep+0x4a/0x90
> __mutex_lock+0x58/0x9b0
> ? nvdimm_bus_lock+0x21/0x30 [libnvdimm]
> ? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm]
> ? acpi_nfit_forget_poison+0x79/0x80 [nfit]
> ? _raw_spin_unlock+0x27/0x40
> mutex_lock_nested+0x1b/0x20
> nvdimm_bus_lock+0x21/0x30 [libnvdimm]
> nvdimm_forget_poison+0x25/0x50 [libnvdimm]
> nvdimm_clear_poison+0x106/0x140 [libnvdimm]
> nsio_rw_bytes+0x164/0x270 [libnvdimm]
> btt_write_pg+0x1de/0x3e0 [nd_btt]
> ? blk_queue_enter+0x30/0x290
> btt_make_request+0x11a/0x310 [nd_btt]
> ? blk_queue_enter+0xb7/0x290
> ? blk_queue_enter+0x30/0x290
> generic_make_request+0x118/0x3b0
> 
> A spinlock is introduced to protect the poison list. This allows us to not
> have to acquire the reconfig_mutex for touching the poison list. The
> add_poison() function has been broken out into two helper functions. One to
> allocate the poison entry and the other to append the entry. This allows us
> to unlock the poison_lock in non-I/O path and continue to be able to allocate
> the poison entry with GFP_KERNEL. We will use GFP_NOWAIT in the I/O path in
> order to satisfy being in atomic context.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/nvdimm/bus.c      |    7 +++--
>  drivers/nvdimm/core.c     |   61 ++++++++++++++++++++++++++++-----------------
>  drivers/nvdimm/nd-core.h  |    1 +
>  include/linux/libnvdimm.h |    2 -
>  4 files changed, 43 insertions(+), 28 deletions(-)
> 

Looks good!
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Dan Williams April 13, 2017, 12:51 a.m. UTC | #2
On Wed, Apr 12, 2017 at 11:15 AM, Dave Jiang <dave.jiang@intel.com> wrote:
> The following warning results from holding a lane spinlock,
> preempt_disable(), or the btt map spinlock and then trying to take the
> reconfig_mutex to walk the poison list and potentially add new entries.
>
> BUG: sleeping function called from invalid context at kernel/locking/mutex.
> c:747
> in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd
> [..]
> Call Trace:
> dump_stack+0x85/0xc8
> ___might_sleep+0x184/0x250
> __might_sleep+0x4a/0x90
> __mutex_lock+0x58/0x9b0
> ? nvdimm_bus_lock+0x21/0x30 [libnvdimm]
> ? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm]
> ? acpi_nfit_forget_poison+0x79/0x80 [nfit]
> ? _raw_spin_unlock+0x27/0x40
> mutex_lock_nested+0x1b/0x20
> nvdimm_bus_lock+0x21/0x30 [libnvdimm]
> nvdimm_forget_poison+0x25/0x50 [libnvdimm]
> nvdimm_clear_poison+0x106/0x140 [libnvdimm]
> nsio_rw_bytes+0x164/0x270 [libnvdimm]
> btt_write_pg+0x1de/0x3e0 [nd_btt]
> ? blk_queue_enter+0x30/0x290
> btt_make_request+0x11a/0x310 [nd_btt]
> ? blk_queue_enter+0xb7/0x290
> ? blk_queue_enter+0x30/0x290
> generic_make_request+0x118/0x3b0
>
> A spinlock is introduced to protect the poison list. This allows us to not
> have to acquire the reconfig_mutex for touching the poison list. The
> add_poison() function has been broken out into two helper functions. One to
> allocate the poison entry and the other to append the entry. This allows us
> to unlock the poison_lock in non-I/O path and continue to be able to allocate
> the poison entry with GFP_KERNEL. We will use GFP_NOWAIT in the I/O path in
> order to satisfy being in atomic context.
>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/nvdimm/bus.c      |    7 +++--
>  drivers/nvdimm/core.c     |   61 ++++++++++++++++++++++++++++-----------------
>  drivers/nvdimm/nd-core.h  |    1 +
>  include/linux/libnvdimm.h |    2 -
>  4 files changed, 43 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> index 5ad2e59..d214ac44 100644
> --- a/drivers/nvdimm/bus.c
> +++ b/drivers/nvdimm/bus.c
> @@ -296,6 +296,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
>         init_waitqueue_head(&nvdimm_bus->probe_wait);
>         nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
>         mutex_init(&nvdimm_bus->reconfig_mutex);
> +       spin_lock_init(&nvdimm_bus->poison_lock);
>         if (nvdimm_bus->id < 0) {
>                 kfree(nvdimm_bus);
>                 return NULL;
> @@ -364,9 +365,9 @@ static int nd_bus_remove(struct device *dev)
>         nd_synchronize();
>         device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
>
> -       nvdimm_bus_lock(&nvdimm_bus->dev);
> +       spin_lock(&nvdimm_bus->poison_lock);
>         free_poison_list(&nvdimm_bus->poison_list);
> -       nvdimm_bus_unlock(&nvdimm_bus->dev);
> +       spin_unlock(&nvdimm_bus->poison_lock);
>
>         nvdimm_bus_destroy_ndctl(nvdimm_bus);
>
> @@ -990,7 +991,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
>
>                 if (clear_err->cleared) {
>                         /* clearing the poison list we keep track of */
> -                       __nvdimm_forget_poison(nvdimm_bus, clear_err->address,
> +                       nvdimm_forget_poison(nvdimm_bus, clear_err->address,
>                                         clear_err->cleared);
>
>                         /* now sync the badblocks lists */
> diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
> index 40a3da0..dfd5ba6 100644
> --- a/drivers/nvdimm/core.c
> +++ b/drivers/nvdimm/core.c
> @@ -518,28 +518,47 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region,
>  }
>  EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
>
> +static struct nd_poison * alloc_poison_entry(struct nvdimm_bus *nvdimm_bus,
> +               gfp_t flags)
> +{
> +       return (struct nd_poison *)kzalloc(sizeof(struct nd_poison), flags);

No need to cast since kzalloc() returns (void *).

> +}
> +
> +static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
> +               struct nd_poison *pl, u64 addr, u64 length)
> +{
> +       lockdep_assert_held(&nvdimm_bus->poison_lock);
> +       pl->start = addr;
> +       pl->length = length;
> +       list_add_tail(&pl->list, &nvdimm_bus->poison_list);
> +}
> +
>  static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
>                         gfp_t flags)
>  {
>         struct nd_poison *pl;
>
> -       pl = kzalloc(sizeof(*pl), flags);
> +       pl = alloc_poison_entry(nvdimm_bus, flags);
>         if (!pl)
>                 return -ENOMEM;
>
> -       pl->start = addr;
> -       pl->length = length;
> -       list_add_tail(&pl->list, &nvdimm_bus->poison_list);
> -
> +       append_poison_entry(nvdimm_bus, pl, addr, length);
>         return 0;
>  }
>
>  static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
>  {
>         struct nd_poison *pl;
> +       int rc;
>
> -       if (list_empty(&nvdimm_bus->poison_list))
> -               return add_poison(nvdimm_bus, addr, length, GFP_KERNEL);
> +       if (list_empty(&nvdimm_bus->poison_list)) {
> +               spin_unlock(&nvdimm_bus->poison_lock);
> +               pl = alloc_poison_entry(nvdimm_bus, GFP_KERNEL);
> +               spin_lock(&nvdimm_bus->poison_lock);

As soon as we drop the lock the list_empty() check we did 2 lines back
is invalidated. In other words, how does this handle the case where
the new entry needs to be merged with one that another thread added?
This needs to re-validate assumptions after re-acquiring the lock.
diff mbox

Patch

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5ad2e59..d214ac44 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -296,6 +296,7 @@  struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
+	spin_lock_init(&nvdimm_bus->poison_lock);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
@@ -364,9 +365,9 @@  static int nd_bus_remove(struct device *dev)
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	spin_lock(&nvdimm_bus->poison_lock);
 	free_poison_list(&nvdimm_bus->poison_list);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
@@ -990,7 +991,7 @@  static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 		if (clear_err->cleared) {
 			/* clearing the poison list we keep track of */
-			__nvdimm_forget_poison(nvdimm_bus, clear_err->address,
+			nvdimm_forget_poison(nvdimm_bus, clear_err->address,
 					clear_err->cleared);
 
 			/* now sync the badblocks lists */
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 40a3da0..dfd5ba6 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -518,28 +518,47 @@  void nvdimm_badblocks_populate(struct nd_region *nd_region,
 }
 EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
 
+static struct nd_poison * alloc_poison_entry(struct nvdimm_bus *nvdimm_bus,
+		gfp_t flags)
+{
+	return (struct nd_poison *)kzalloc(sizeof(struct nd_poison), flags);
+}
+
+static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
+		struct nd_poison *pl, u64 addr, u64 length)
+{
+	lockdep_assert_held(&nvdimm_bus->poison_lock);
+	pl->start = addr;
+	pl->length = length;
+	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
+}
+
 static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
 			gfp_t flags)
 {
 	struct nd_poison *pl;
 
-	pl = kzalloc(sizeof(*pl), flags);
+	pl = alloc_poison_entry(nvdimm_bus, flags);
 	if (!pl)
 		return -ENOMEM;
 
-	pl->start = addr;
-	pl->length = length;
-	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
-
+	append_poison_entry(nvdimm_bus, pl, addr, length);
 	return 0;
 }
 
 static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
 	struct nd_poison *pl;
+	int rc;
 
-	if (list_empty(&nvdimm_bus->poison_list))
-		return add_poison(nvdimm_bus, addr, length, GFP_KERNEL);
+	if (list_empty(&nvdimm_bus->poison_list)) {
+		spin_unlock(&nvdimm_bus->poison_lock);
+		pl = alloc_poison_entry(nvdimm_bus, GFP_KERNEL);
+		spin_lock(&nvdimm_bus->poison_lock);
+		if (pl)
+			append_poison_entry(nvdimm_bus, pl, addr, length);
+		return rc;
+	}
 
 	/*
 	 * There is a chance this is a duplicate, check for those first.
@@ -559,30 +578,34 @@  static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 	 * as any overlapping ranges will get resolved when the list is consumed
 	 * and converted to badblocks
 	 */
-	return add_poison(nvdimm_bus, addr, length, GFP_KERNEL);
+	spin_unlock(&nvdimm_bus->poison_lock);
+	pl = alloc_poison_entry(nvdimm_bus, GFP_KERNEL);
+	spin_lock(&nvdimm_bus->poison_lock);
+	if (pl)
+		append_poison_entry(nvdimm_bus, pl, addr, length);
+	return rc;
 }
 
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
 	int rc;
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	spin_lock(&nvdimm_bus->poison_lock);
 	rc = bus_add_poison(nvdimm_bus, addr, length);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 
 	return rc;
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
 
-void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
+void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
 		unsigned int len)
 {
 	struct list_head *poison_list = &nvdimm_bus->poison_list;
 	u64 clr_end = start + len - 1;
 	struct nd_poison *pl, *next;
 
-	lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
-
+	spin_lock(&nvdimm_bus->poison_lock);
 	WARN_ON_ONCE(list_empty(poison_list));
 
 	/*
@@ -629,21 +652,13 @@  void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
 			u64 new_len = pl_end - new_start + 1;
 
 			/* Add new entry covering the right half */
-			add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO);
+			add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
 			/* Adjust this entry to cover the left half */
 			pl->length = start - pl->start;
 			continue;
 		}
 	}
-}
-EXPORT_SYMBOL_GPL(__nvdimm_forget_poison);
-
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len)
-{
-	nvdimm_bus_lock(&nvdimm_bus->dev);
-	__nvdimm_forget_poison(nvdimm_bus, start, len);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	spin_unlock(&nvdimm_bus->poison_lock);
 }
 EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
 
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 8623e57..4c4bd20 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -32,6 +32,7 @@  struct nvdimm_bus {
 	struct list_head poison_list;
 	struct list_head mapping_list;
 	struct mutex reconfig_mutex;
+	spinlock_t poison_lock;
 };
 
 struct nvdimm {
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 1c609e8..98b2076 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -122,8 +122,6 @@  static inline struct nd_blk_region_desc *to_blk_region_desc(
 int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
 void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t start, unsigned int len);
-void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len);
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);