diff mbox series

[v4,02/10] dax: Introduce holder for dax_device

Message ID 20210604011844.1756145-3-ruansy.fnst@fujitsu.com (mailing list archive)
State New, archived
Headers show
Series fsdax: introduce fs query to support reflink | expand

Commit Message

Shiyang Ruan June 4, 2021, 1:18 a.m. UTC
To easily track filesystem from a pmem device, we introduce a holder for
dax_device structure, and also its operation.  This holder is used to
remember who is using this dax_device:
 - When it is the backend of a filesystem, the holder will be the
   superblock of this filesystem.
 - When this pmem device is one of the targets in a mapped device, the
   holder will be this mapped device.  In this case, the mapped device
   has its own dax_device and it will follow the first rule.  So that we
   can finally track to the filesystem we needed.

The holder and holder_ops will be set when filesystem is being mounted,
or an target device is being activated.

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 drivers/dax/super.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h | 10 ++++++++++
 2 files changed, 48 insertions(+)

Comments

Dan Williams June 16, 2021, 12:46 a.m. UTC | #1
On Thu, Jun 3, 2021 at 6:19 PM Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
>
> To easily track filesystem from a pmem device, we introduce a holder for
> dax_device structure, and also its operation.  This holder is used to
> remember who is using this dax_device:
>  - When it is the backend of a filesystem, the holder will be the
>    superblock of this filesystem.
>  - When this pmem device is one of the targets in a mapped device, the
>    holder will be this mapped device.  In this case, the mapped device
>    has its own dax_device and it will follow the first rule.  So that we
>    can finally track to the filesystem we needed.
>
> The holder and holder_ops will be set when filesystem is being mounted,
> or an target device is being activated.
>
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  drivers/dax/super.c | 38 ++++++++++++++++++++++++++++++++++++++
>  include/linux/dax.h | 10 ++++++++++
>  2 files changed, 48 insertions(+)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 5fa6ae9dbc8b..d118e2a7dc70 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -222,8 +222,10 @@ struct dax_device {
>         struct cdev cdev;
>         const char *host;
>         void *private;

@private is likely too generic of a name now, it would be better to
call this @parent.

> +       void *holder;

This should probably be called holder_data, and this structure could
use some kernel-doc to clarify what the fields mean.

>         unsigned long flags;
>         const struct dax_operations *ops;
> +       const struct dax_holder_operations *holder_ops;
>  };
>
>  static ssize_t write_cache_show(struct device *dev,
> @@ -373,6 +375,24 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
>  }
>  EXPORT_SYMBOL_GPL(dax_zero_page_range);
>
> +int dax_corrupted_range(struct dax_device *dax_dev, struct block_device *bdev,
> +               loff_t offset, size_t size, void *data)

Why is @bdev an argument to this routine?  The primary motivation for
a 'struct dax_device' is to break the association with 'struct
block_device'. The filesystem may know that the logical addresses
associated with a given dax_dev alias with the logical addresses of a
given bdev, but that knowledge need not leak into the API.

> +{
> +       int rc = -ENXIO;
> +       if (!dax_dev)
> +               return rc;
> +
> +       if (dax_dev->holder) {
> +               rc = dax_dev->holder_ops->corrupted_range(dax_dev, bdev, offset,
> +                                                         size, data);

A bikeshed comment, but I do not like the name corrupted_range(),
because "corrupted" implies a permanent state. The source of this
notification is memory_failure() and that does not convey "permanent"
vs "transient" it just reports "failure". So, to keep the naming
consistent with the pgmap notification callback lets call this one
"notify_failure".

> +               if (rc == -ENODEV)
> +                       rc = -ENXIO;
> +       } else
> +               rc = -EOPNOTSUPP;
> +       return rc;
> +}
> +EXPORT_SYMBOL_GPL(dax_corrupted_range);

dax_holder_notify_failure() makes it clearer that this is
communicating a failure up the holder stack.

> +
>  #ifdef CONFIG_ARCH_HAS_PMEM_API
>  void arch_wb_cache_pmem(void *addr, size_t size);
>  void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
> @@ -624,6 +644,24 @@ void put_dax(struct dax_device *dax_dev)
>  }
>  EXPORT_SYMBOL_GPL(put_dax);
>
> +void dax_set_holder(struct dax_device *dax_dev, void *holder,
> +               const struct dax_holder_operations *ops)
> +{
> +       if (!dax_dev)
> +               return;
> +       dax_dev->holder = holder;
> +       dax_dev->holder_ops = ops;

I think there needs to be some synchronization here, perhaps a global
dax_dev_rwsem that is taken for read in the notification path and
write when adding a holder to the chain.

I also wonder if this should be an event that triggers a dax_dev stack
to re-report any failure notifications. For example the pmem driver
may have recorded a list of bad blocks at the beginning of time.
Likely the filesystem or other holder would like to get that
pre-existing list of failures at first registration. Have you given
thought about how the filesystem is told about pre-existing badblocks?

> +}
> +EXPORT_SYMBOL_GPL(dax_set_holder);
> +
> +void *dax_get_holder(struct dax_device *dax_dev)
> +{
> +       if (!dax_dev)
> +               return NULL;
> +       return dax_dev->holder;
> +}
> +EXPORT_SYMBOL_GPL(dax_get_holder);

Where is this used?
diff mbox series

Patch

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 5fa6ae9dbc8b..d118e2a7dc70 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -222,8 +222,10 @@  struct dax_device {
 	struct cdev cdev;
 	const char *host;
 	void *private;
+	void *holder;
 	unsigned long flags;
 	const struct dax_operations *ops;
+	const struct dax_holder_operations *holder_ops;
 };
 
 static ssize_t write_cache_show(struct device *dev,
@@ -373,6 +375,24 @@  int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
+int dax_corrupted_range(struct dax_device *dax_dev, struct block_device *bdev,
+		loff_t offset, size_t size, void *data)
+{
+	int rc = -ENXIO;
+	if (!dax_dev)
+		return rc;
+
+	if (dax_dev->holder) {
+		rc = dax_dev->holder_ops->corrupted_range(dax_dev, bdev, offset,
+							  size, data);
+		if (rc == -ENODEV)
+			rc = -ENXIO;
+	} else
+		rc = -EOPNOTSUPP;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(dax_corrupted_range);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -624,6 +644,24 @@  void put_dax(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(put_dax);
 
+void dax_set_holder(struct dax_device *dax_dev, void *holder,
+		const struct dax_holder_operations *ops)
+{
+	if (!dax_dev)
+		return;
+	dax_dev->holder = holder;
+	dax_dev->holder_ops = ops;
+}
+EXPORT_SYMBOL_GPL(dax_set_holder);
+
+void *dax_get_holder(struct dax_device *dax_dev)
+{
+	if (!dax_dev)
+		return NULL;
+	return dax_dev->holder;
+}
+EXPORT_SYMBOL_GPL(dax_get_holder);
+
 /**
  * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
  * @host: alternate name for the device registered by a dax driver
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..1ce343a960ab 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,10 +38,18 @@  struct dax_operations {
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
 
+struct dax_holder_operations {
+	int (*corrupted_range)(struct dax_device *, struct block_device *,
+			loff_t, size_t, void *);
+};
+
 extern struct attribute_group dax_attribute_group;
 
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
+void dax_set_holder(struct dax_device *dax_dev, void *holder,
+		const struct dax_holder_operations *ops);
+void *dax_get_holder(struct dax_device *dax_dev);
 struct dax_device *alloc_dax(void *private, const char *host,
 		const struct dax_operations *ops, unsigned long flags);
 void put_dax(struct dax_device *dax_dev);
@@ -226,6 +234,8 @@  size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 			size_t nr_pages);
+int dax_corrupted_range(struct dax_device *dax_dev, struct block_device *bdev,
+		loff_t offset, size_t size, void *data);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,