
[13/20] fuse, dax: Implement dax read/write operations

Message ID 20200304165845.3081-14-vgoyal@redhat.com (mailing list archive)
State New, archived
Series virtiofs: Add DAX support

Commit Message

Vivek Goyal March 4, 2020, 4:58 p.m. UTC
This patch implements basic DAX support. mmap() is not implemented
yet and will come in later patches. This patch covers read/write.

We make use of an interval tree to keep track of per-inode DAX mappings.

Do not use DAX for file-extending writes; instead just send a WRITE message
to the daemon (like we do for the direct I/O path). This keeps the write and
the i_size change atomic w.r.t. a crash.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
---
 fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h          |  23 ++
 fs/fuse/inode.c           |   6 +
 include/uapi/linux/fuse.h |   1 +
 4 files changed, 621 insertions(+), 6 deletions(-)

Comments

Miklos Szeredi March 12, 2020, 9:43 a.m. UTC | #1
On Wed, Mar 4, 2020 at 5:59 PM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> This patch implements basic DAX support. mmap() is not implemented
> yet and will come in later patches. This patch looks into implemeting
> read/write.
>
> We make use of interval tree to keep track of per inode dax mappings.
>
> Do not use dax for file extending writes, instead just send WRITE message
> to daemon (like we do for direct I/O path). This will keep write and
> i_size change atomic w.r.t crash.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
> ---
>  fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
>  fs/fuse/fuse_i.h          |  23 ++
>  fs/fuse/inode.c           |   6 +
>  include/uapi/linux/fuse.h |   1 +
>  4 files changed, 621 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 9d67b830fb7a..9effdd3dc6d6 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -18,6 +18,12 @@
>  #include <linux/swap.h>
>  #include <linux/falloc.h>
>  #include <linux/uio.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/interval_tree_generic.h>
> +
> +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> +                     START, LAST, static inline, fuse_dax_interval_tree);

Are you using this because of byte ranges (u64)?   Does it not make
more sense to use page offsets, which are unsigned long and so fit
nicely into the generic interval tree?
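
Roughly, a page-offset based variant using the generic interval tree
could look something like this (untested sketch; the helper names are
made up, only for illustration):

#include <linux/interval_tree.h>

struct fuse_dax_mapping {
        struct list_head list;
        /* generic interval tree node; .start/.last are page offsets */
        struct interval_tree_node itn;
        u64 window_offset;
        loff_t length;
        bool writable;
};

static void fuse_dmap_insert(struct rb_root_cached *tree,
                             struct fuse_dax_mapping *dmap,
                             pgoff_t first, pgoff_t last)
{
        dmap->itn.start = first;        /* first page covered */
        dmap->itn.last = last;          /* last page covered, inclusive */
        interval_tree_insert(&dmap->itn, tree);
}

static struct fuse_dax_mapping *fuse_dmap_find(struct rb_root_cached *tree,
                                               pgoff_t pgoff)
{
        struct interval_tree_node *node;

        node = interval_tree_iter_first(tree, pgoff, pgoff);
        return node ? container_of(node, struct fuse_dax_mapping, itn) : NULL;
}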

>
>  static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
>                                       struct fuse_page_desc **desc)
> @@ -187,6 +193,242 @@ static void fuse_link_write_file(struct file *file)
>         spin_unlock(&fi->lock);
>  }
>
> +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
> +{
> +       struct fuse_dax_mapping *dmap = NULL;
> +
> +       spin_lock(&fc->lock);
> +
> +       if (fc->nr_free_ranges <= 0) {
> +               spin_unlock(&fc->lock);
> +               return NULL;
> +       }
> +
> +       WARN_ON(list_empty(&fc->free_ranges));
> +
> +       /* Take a free range */
> +       dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping,
> +                                       list);
> +       list_del_init(&dmap->list);
> +       fc->nr_free_ranges--;
> +       spin_unlock(&fc->lock);
> +       return dmap;
> +}
> +
> +/* This assumes fc->lock is held */
> +static void __dmap_add_to_free_pool(struct fuse_conn *fc,
> +                               struct fuse_dax_mapping *dmap)
> +{
> +       list_add_tail(&dmap->list, &fc->free_ranges);
> +       fc->nr_free_ranges++;
> +}
> +
> +static void dmap_add_to_free_pool(struct fuse_conn *fc,
> +                               struct fuse_dax_mapping *dmap)
> +{
> +       /* Return fuse_dax_mapping to free list */
> +       spin_lock(&fc->lock);
> +       __dmap_add_to_free_pool(fc, dmap);
> +       spin_unlock(&fc->lock);
> +}
> +
> +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> +                                 struct fuse_dax_mapping *dmap, bool writable,
> +                                 bool upgrade)
> +{
> +       struct fuse_conn *fc = get_fuse_conn(inode);
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_setupmapping_in inarg;
> +       FUSE_ARGS(args);
> +       ssize_t err;
> +
> +       WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> +       WARN_ON(fc->nr_free_ranges < 0);
> +
> +       /* Ask fuse daemon to setup mapping */
> +       memset(&inarg, 0, sizeof(inarg));
> +       inarg.foffset = offset;
> +       inarg.fh = -1;
> +       inarg.moffset = dmap->window_offset;
> +       inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> +       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> +       if (writable)
> +               inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> +       args.opcode = FUSE_SETUPMAPPING;
> +       args.nodeid = fi->nodeid;
> +       args.in_numargs = 1;
> +       args.in_args[0].size = sizeof(inarg);
> +       args.in_args[0].value = &inarg;

args.force = true?

> +       err = fuse_simple_request(fc, &args);
> +       if (err < 0) {
> +               printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
> +                                __func__, dmap->window_offset, err);

Is this level of noisiness really needed?  AFAICS, the error will
reach the caller, in which case we don't usually need to print a
kernel error.

> +               return err;
> +       }
> +
> +       pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
> +                " err=%zd\n", offset, writable, err);
> +
> +       dmap->writable = writable;
> +       if (!upgrade) {
> +               dmap->start = offset;
> +               dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> +               /* Protected by fi->i_dmap_sem */
> +               fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> +               fi->nr_dmaps++;
> +       }
> +       return 0;
> +}
> +
> +static int
> +fuse_send_removemapping(struct inode *inode,
> +                       struct fuse_removemapping_in *inargp,
> +                       struct fuse_removemapping_one *remove_one)
> +{
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_conn *fc = get_fuse_conn(inode);
> +       FUSE_ARGS(args);
> +
> +       args.opcode = FUSE_REMOVEMAPPING;
> +       args.nodeid = fi->nodeid;
> +       args.in_numargs = 2;
> +       args.in_args[0].size = sizeof(*inargp);
> +       args.in_args[0].value = inargp;
> +       args.in_args[1].size = inargp->count * sizeof(*remove_one);
> +       args.in_args[1].value = remove_one;

args.force = true?

> +       return fuse_simple_request(fc, &args);
> +}
> +
> +static int dmap_removemapping_list(struct inode *inode, unsigned num,
> +                                  struct list_head *to_remove)
> +{
> +       struct fuse_removemapping_one *remove_one, *ptr;
> +       struct fuse_removemapping_in inarg;
> +       struct fuse_dax_mapping *dmap;
> +       int ret, i = 0, nr_alloc;
> +
> +       nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
> +       remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
> +       if (!remove_one)
> +               return -ENOMEM;
> +
> +       ptr = remove_one;
> +       list_for_each_entry(dmap, to_remove, list) {
> +               ptr->moffset = dmap->window_offset;
> +               ptr->len = dmap->length;
> +               ptr++;

Minor nit: ptr = &remove_one[i] at the start of the section would be
cleaner IMO.
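
Something like this, i.e. an untested rearrangement of the loop above
that drops the moving pointer:

        list_for_each_entry(dmap, to_remove, list) {
                ptr = &remove_one[i];   /* index into the batch buffer */
                ptr->moffset = dmap->window_offset;
                ptr->len = dmap->length;
                i++;
                num--;
                if (i >= nr_alloc || num == 0) {
                        memset(&inarg, 0, sizeof(inarg));
                        inarg.count = i;
                        ret = fuse_send_removemapping(inode, &inarg,
                                                      remove_one);
                        if (ret)
                                goto out;
                        i = 0;
                }
        }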


> +               i++;
> +               num--;
> +               if (i >= nr_alloc || num == 0) {
> +                       memset(&inarg, 0, sizeof(inarg));
> +                       inarg.count = i;
> +                       ret = fuse_send_removemapping(inode, &inarg,
> +                                                     remove_one);
> +                       if (ret)
> +                               goto out;
> +                       ptr = remove_one;
> +                       i = 0;
> +               }
> +       }
> +out:
> +       kfree(remove_one);
> +       return ret;
> +}
> +
> +/*
> + * Cleanup dmap entry and add back to free list. This should be called with
> + * fc->lock held.
> + */
> +static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
> +                                           struct fuse_dax_mapping *dmap)
> +{
> +       pr_debug("fuse: freeing memory range start=0x%llx end=0x%llx "
> +                "window_offset=0x%llx length=0x%llx\n", dmap->start,
> +                dmap->end, dmap->window_offset, dmap->length);
> +       dmap->start = dmap->end = 0;
> +       __dmap_add_to_free_pool(fc, dmap);
> +}
> +
> +/*
> + * Free inode dmap entries whose range falls entirely inside [start, end].
> + * Does not take any locks. At this point of time it should only be
> + * called from evict_inode() path where we know all dmap entries can be
> + * reclaimed.
> + */
> +static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
> +                                     loff_t start, loff_t end)
> +{
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_dax_mapping *dmap, *n;
> +       int err, num = 0;
> +       LIST_HEAD(to_remove);
> +
> +       pr_debug("fuse: %s: start=0x%llx, end=0x%llx\n", __func__, start, end);
> +
> +       /*
> +        * Interval tree search matches intersecting entries. Adjust the range
> +        * to avoid dropping partial valid entries.
> +        */
> +       start = ALIGN(start, FUSE_DAX_MEM_RANGE_SZ);
> +       end = ALIGN_DOWN(end, FUSE_DAX_MEM_RANGE_SZ);
> +
> +       while (1) {
> +               dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, start,
> +                                                        end);
> +               if (!dmap)
> +                       break;
> +               fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
> +               num++;
> +               list_add(&dmap->list, &to_remove);
> +       }
> +
> +       /* Nothing to remove */
> +       if (list_empty(&to_remove))
> +               return;
> +
> +       WARN_ON(fi->nr_dmaps < num);
> +       fi->nr_dmaps -= num;
> +       /*
> +        * During umount/shutdown, fuse connection is dropped first
> +        * and evict_inode() is called later. That means any
> +        * removemapping messages are going to fail. Send messages
> +        * only if connection is up. Otherwise fuse daemon is
> +        * responsible for cleaning up any leftover references and
> +        * mappings.
> +        */
> +       if (fc->connected) {
> +               err = dmap_removemapping_list(inode, num, &to_remove);
> +               if (err) {
> +                       pr_warn("Failed to removemappings. start=0x%llx"
> +                               " end=0x%llx\n", start, end);
> +               }
> +       }
> +       spin_lock(&fc->lock);
> +       list_for_each_entry_safe(dmap, n, &to_remove, list) {
> +               list_del_init(&dmap->list);
> +               dmap_reinit_add_to_free_pool(fc, dmap);
> +       }
> +       spin_unlock(&fc->lock);
> +}
> +
> +/*
> + * It is called from evict_inode() and by that time inode is going away. So
> + * this function does not take any locks like fi->i_dmap_sem for traversing
> + * that fuse inode interval tree. If that lock is taken then lock validator
> + * complains of deadlock situation w.r.t fs_reclaim lock.
> + */
> +void fuse_cleanup_inode_mappings(struct inode *inode)
> +{
> +       struct fuse_conn *fc = get_fuse_conn(inode);
> +       /*
> +        * fuse_evict_inode() has already called truncate_inode_pages_final()
> +        * before we arrive here. So we should not have to worry about
> +        * any pages/exception entries still associated with inode.
> +        */
> +       inode_reclaim_dmap_range(fc, inode, 0, -1);
> +}
> +
>  void fuse_finish_open(struct inode *inode, struct file *file)
>  {
>         struct fuse_file *ff = file->private_data;
> @@ -1562,32 +1804,364 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         return res;
>  }
>
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
>  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>         struct file *file = iocb->ki_filp;
>         struct fuse_file *ff = file->private_data;
> +       struct inode *inode = file->f_mapping->host;
>
>         if (is_bad_inode(file_inode(file)))
>                 return -EIO;
>
> -       if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -               return fuse_cache_read_iter(iocb, to);
> -       else
> +       if (IS_DAX(inode))
> +               return fuse_dax_read_iter(iocb, to);
> +
> +       if (ff->open_flags & FOPEN_DIRECT_IO)
>                 return fuse_direct_read_iter(iocb, to);
> +
> +       return fuse_cache_read_iter(iocb, to);
>  }
>
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>         struct file *file = iocb->ki_filp;
>         struct fuse_file *ff = file->private_data;
> +       struct inode *inode = file->f_mapping->host;
>
>         if (is_bad_inode(file_inode(file)))
>                 return -EIO;
>
> -       if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -               return fuse_cache_write_iter(iocb, from);
> -       else
> +       if (IS_DAX(inode))
> +               return fuse_dax_write_iter(iocb, from);
> +
> +       if (ff->open_flags & FOPEN_DIRECT_IO)
>                 return fuse_direct_write_iter(iocb, from);
> +
> +       return fuse_cache_write_iter(iocb, from);
> +}
> +
> +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
> +{
> +       iomap->addr = IOMAP_NULL_ADDR;
> +       iomap->length = length;
> +       iomap->type = IOMAP_HOLE;
> +}
> +
> +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> +                       struct iomap *iomap, struct fuse_dax_mapping *dmap,
> +                       unsigned flags)
> +{
> +       loff_t offset, len;
> +       loff_t i_size = i_size_read(inode);
> +
> +       offset = pos - dmap->start;
> +       len = min(length, dmap->length - offset);
> +
> +       /* If length is beyond end of file, truncate further */
> +       if (pos + len > i_size)
> +               len = i_size - pos;
> +
> +       if (len > 0) {
> +               iomap->addr = dmap->window_offset + offset;
> +               iomap->length = len;
> +               if (flags & IOMAP_FAULT)
> +                       iomap->length = ALIGN(len, PAGE_SIZE);
> +               iomap->type = IOMAP_MAPPED;
> +               pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +                               " length 0x%llx\n", __func__, iomap->addr,
> +                               iomap->offset, iomap->length);
> +       } else {
> +               /* Mapping beyond end of file is hole */
> +               fuse_fill_iomap_hole(iomap, length);
> +               pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +                               "length 0x%llx\n", __func__, iomap->addr,
> +                               iomap->offset, iomap->length);
> +       }
> +}
> +
> +static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> +                                        loff_t length, unsigned flags,
> +                                        struct iomap *iomap)
> +{
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_conn *fc = get_fuse_conn(inode);
> +       struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> +       int ret;
> +       bool writable = flags & IOMAP_WRITE;
> +
> +       alloc_dmap = alloc_dax_mapping(fc);
> +       if (!alloc_dmap)
> +               return -EBUSY;
> +
> +       /*
> +        * Take write lock so that only one caller can try to setup mapping
> +        * and other waits.
> +        */
> +       down_write(&fi->i_dmap_sem);
> +       /*
> +        * We dropped lock. Check again if somebody else setup
> +        * mapping already.
> +        */
> +       dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
> +                                               pos);
> +       if (dmap) {
> +               fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +               dmap_add_to_free_pool(fc, alloc_dmap);
> +               up_write(&fi->i_dmap_sem);
> +               return 0;
> +       }
> +
> +       /* Setup one mapping */
> +       ret = fuse_setup_one_mapping(inode,
> +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +                                    alloc_dmap, writable, false);
> +       if (ret < 0) {
> +               printk("fuse_setup_one_mapping() failed. err=%d"
> +                       " pos=0x%llx, writable=%d\n", ret, pos, writable);

More  unnecessary noise?

> +               dmap_add_to_free_pool(fc, alloc_dmap);
> +               up_write(&fi->i_dmap_sem);
> +               return ret;
> +       }
> +       fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
> +       up_write(&fi->i_dmap_sem);
> +       return 0;
> +}
> +
> +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> +                                        loff_t length, unsigned flags,
> +                                        struct iomap *iomap)
> +{
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_dax_mapping *dmap;
> +       int ret;
> +
> +       /*
> +        * Take exclusive lock so that only one caller can try to setup
> +        * mapping and others wait.
> +        */
> +       down_write(&fi->i_dmap_sem);
> +       dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +       /* We are holding either inode lock or i_mmap_sem, and that should
> +        * ensure that dmap can't reclaimed or truncated and it should still
> +        * be there in tree despite the fact we dropped and re-acquired the
> +        * lock.
> +        */
> +       ret = -EIO;
> +       if (WARN_ON(!dmap))
> +               goto out_err;
> +
> +       /* Maybe another thread already upgraded mapping while we were not
> +        * holding lock.
> +        */
> +       if (dmap->writable)
> +               goto out_fill_iomap;
> +
> +       ret = fuse_setup_one_mapping(inode,
> +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +                                    dmap, true, true);
> +       if (ret < 0) {
> +               printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> +                      ret, pos);

Again.

> +               goto out_err;
> +       }
> +
> +out_fill_iomap:
> +       fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +out_err:
> +       up_write(&fi->i_dmap_sem);
> +       return ret;
> +}
> +
> +/* This is just for DAX and the mapping is ephemeral, do not use it for other
> + * purposes since there is no block device with a permanent mapping.
> + */
> +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
> +                           unsigned flags, struct iomap *iomap,
> +                           struct iomap *srcmap)
> +{
> +       struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_conn *fc = get_fuse_conn(inode);
> +       struct fuse_dax_mapping *dmap;
> +       bool writable = flags & IOMAP_WRITE;
> +
> +       /* We don't support FIEMAP */
> +       BUG_ON(flags & IOMAP_REPORT);
> +
> +       pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
> +                       pos, length);
> +
> +       /*
> +        * Writes beyond end of file are not handled using dax path. Instead
> +        * a fuse write message is sent to daemon
> +        */
> +       if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
> +               return -EIO;

Okay, this will work fine if the host filesystem is not modified by
other entities.

What happens if there's a concurrent truncate going on on the host
with this write?  If the two are not in any way synchronized, then
either of the following two behaviors is allowed:

 1) Whole or partial data in the write is truncated. (If there are
complete pages from the write being truncated, then the writing
process will receive SIGBUS.  Does KVM handle that?  I remember that
being discussed, but don't remember the conclusion.)

 2) Write re-extends file size.

However, EIO is not a good result, so we need to do something with it.

> +
> +       iomap->offset = pos;
> +       iomap->flags = 0;
> +       iomap->bdev = NULL;
> +       iomap->dax_dev = fc->dax_dev;
> +
> +       /*
> +        * Both read/write and mmap path can race here. So we need something
> +        * to make sure if we are setting up mapping, then other path waits
> +        *
> +        * For now, use a semaphore for this. It probably needs to be
> +        * optimized later.
> +        */
> +       down_read(&fi->i_dmap_sem);
> +       dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +       if (dmap) {
> +               if (writable && !dmap->writable) {
> +                       /* Upgrade read-only mapping to read-write. This will
> +                        * require exclusive i_dmap_sem lock as we don't want
> +                        * two threads to be trying to this simultaneously
> +                        * for same dmap. So drop shared lock and acquire
> +                        * exclusive lock.
> +                        */
> +                       up_read(&fi->i_dmap_sem);
> +                       pr_debug("%s: Upgrading mapping at offset 0x%llx"
> +                                " length 0x%llx\n", __func__, pos, length);
> +                       return iomap_begin_upgrade_mapping(inode, pos, length,
> +                                                          flags, iomap);
> +               } else {
> +                       fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +                       up_read(&fi->i_dmap_sem);
> +                       return 0;
> +               }
> +       } else {
> +               up_read(&fi->i_dmap_sem);
> +               pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
> +                               __func__, pos, length);
> +               if (pos >= i_size_read(inode))
> +                       goto iomap_hole;
> +
> +               return iomap_begin_setup_new_mapping(inode, pos, length, flags,
> +                                                    iomap);
> +       }
> +
> +       /*
> +        * If read beyond end of file happens, fs code seems to return
> +        * it as hole
> +        */
> +iomap_hole:
> +       fuse_fill_iomap_hole(iomap, length);
> +       pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length);
> +       return 0;
> +}
> +
> +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> +                         ssize_t written, unsigned flags,
> +                         struct iomap *iomap)
> +{
> +       /* DAX writes beyond end-of-file aren't handled using iomap, so the
> +        * file size is unchanged and there is nothing to do here.
> +        */
> +       return 0;
> +}
> +
> +static const struct iomap_ops fuse_iomap_ops = {
> +       .iomap_begin = fuse_iomap_begin,
> +       .iomap_end = fuse_iomap_end,
> +};
> +
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +       struct inode *inode = file_inode(iocb->ki_filp);
> +       ssize_t ret;
> +
> +       if (iocb->ki_flags & IOCB_NOWAIT) {
> +               if (!inode_trylock_shared(inode))
> +                       return -EAGAIN;
> +       } else {
> +               inode_lock_shared(inode);
> +       }
> +
> +       ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
> +       inode_unlock_shared(inode);
> +
> +       /* TODO file_accessed(iocb->f_filp) */
> +       return ret;
> +}
> +
> +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +       struct inode *inode = file_inode(iocb->ki_filp);
> +
> +       return (iov_iter_rw(from) == WRITE &&
> +               ((iocb->ki_pos) >= i_size_read(inode)));
> +}
> +
> +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +       struct inode *inode = file_inode(iocb->ki_filp);
> +       struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
> +       ssize_t ret;
> +
> +       ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
> +       if (ret < 0)
> +               return ret;
> +
> +       fuse_invalidate_attr(inode);
> +       fuse_write_update_size(inode, iocb->ki_pos);
> +       return ret;
> +}
> +
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +       struct inode *inode = file_inode(iocb->ki_filp);
> +       ssize_t ret, count;
> +
> +       if (iocb->ki_flags & IOCB_NOWAIT) {
> +               if (!inode_trylock(inode))
> +                       return -EAGAIN;
> +       } else {
> +               inode_lock(inode);
> +       }
> +
> +       ret = generic_write_checks(iocb, from);
> +       if (ret <= 0)
> +               goto out;
> +
> +       ret = file_remove_privs(iocb->ki_filp);
> +       if (ret)
> +               goto out;
> +       /* TODO file_update_time() but we don't want metadata I/O */
> +
> +       /* Do not use dax for file extending writes as it's an mmap and
> +        * trying to write beyond end of existing page will generate
> +        * SIGBUS.

Ah, here it is.  So what happens in case of a race?  Does that
currently crash KVM?

> +        */
> +       if (file_extending_write(iocb, from)) {
> +               ret = fuse_dax_direct_write(iocb, from);
> +               goto out;
> +       }
> +
> +       ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
> +       if (ret < 0)
> +               goto out;
> +
> +       /*
> +        * If part of the write was file extending, fuse dax path will not
> +        * take care of that. Do direct write instead.
> +        */
> +       if (iov_iter_count(from) && file_extending_write(iocb, from)) {
> +               count = fuse_dax_direct_write(iocb, from);
> +               if (count < 0)
> +                       goto out;
> +               ret += count;
> +       }
> +
> +out:
> +       inode_unlock(inode);
> +
> +       if (ret > 0)
> +               ret = generic_write_sync(iocb, ret);
> +       return ret;
>  }
>
>  static void fuse_writepage_free(struct fuse_writepage_args *wpa)
> @@ -2318,6 +2892,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
>         return 0;
>  }
>
> +static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       return -EINVAL; /* TODO */
> +}
> +
>  static int convert_fuse_file_lock(struct fuse_conn *fc,
>                                   const struct fuse_file_lock *ffl,
>                                   struct file_lock *fl)
> @@ -3387,6 +3966,7 @@ static const struct address_space_operations fuse_file_aops  = {
>  void fuse_init_file_inode(struct inode *inode)
>  {
>         struct fuse_inode *fi = get_fuse_inode(inode);
> +       struct fuse_conn *fc = get_fuse_conn(inode);
>
>         inode->i_fop = &fuse_file_operations;
>         inode->i_data.a_ops = &fuse_file_aops;
> @@ -3396,4 +3976,9 @@ void fuse_init_file_inode(struct inode *inode)
>         fi->writectr = 0;
>         init_waitqueue_head(&fi->page_waitq);
>         INIT_LIST_HEAD(&fi->writepages);
> +       fi->dmap_tree = RB_ROOT_CACHED;
> +
> +       if (fc->dax_dev) {
> +               inode->i_flags |= S_DAX;
> +       }
>  }
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index b41275f73e4c..490549862bda 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -70,16 +70,29 @@ struct fuse_forget_link {
>         struct fuse_forget_link *next;
>  };
>
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->end)
> +
>  /** Translation information for file offsets to DAX window offsets */
>  struct fuse_dax_mapping {
>         /* Will connect in fc->free_ranges to keep track of free memory */
>         struct list_head list;
>
> +       /* For interval tree in file/inode */
> +       struct rb_node rb;
> +       /** Start Position in file */
> +       __u64 start;
> +       /** End Position in file */
> +       __u64 end;
> +       __u64 __subtree_last;
>         /** Position in DAX window */
>         u64 window_offset;
>
>         /** Length of mapping, in bytes */
>         loff_t length;
> +
> +       /* Is this mapping read-only or read-write */
> +       bool writable;
>  };
>
>  /** FUSE inode */
> @@ -167,6 +180,15 @@ struct fuse_inode {
>
>         /** Lock to protect write related fields */
>         spinlock_t lock;
> +
> +       /*
> +        * Semaphore to protect modifications to dmap_tree
> +        */
> +       struct rw_semaphore i_dmap_sem;
> +
> +       /** Sorted rb tree of struct fuse_dax_mapping elements */
> +       struct rb_root_cached dmap_tree;
> +       unsigned long nr_dmaps;
>  };
>
>  /** FUSE inode state bits */
> @@ -1127,5 +1149,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
>   */
>  u64 fuse_get_unique(struct fuse_iqueue *fiq);
>  void fuse_free_conn(struct fuse_conn *fc);
> +void fuse_cleanup_inode_mappings(struct inode *inode);
>
>  #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 36cb9c00bbe5..93bc65607a15 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -86,7 +86,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>         fi->attr_version = 0;
>         fi->orig_ino = 0;
>         fi->state = 0;
> +       fi->nr_dmaps = 0;
>         mutex_init(&fi->mutex);
> +       init_rwsem(&fi->i_dmap_sem);
>         spin_lock_init(&fi->lock);
>         fi->forget = fuse_alloc_forget();
>         if (!fi->forget) {
> @@ -114,6 +116,10 @@ static void fuse_evict_inode(struct inode *inode)
>         clear_inode(inode);
>         if (inode->i_sb->s_flags & SB_ACTIVE) {
>                 struct fuse_conn *fc = get_fuse_conn(inode);
> +               if (IS_DAX(inode)) {
> +                       fuse_cleanup_inode_mappings(inode);
> +                       WARN_ON(fi->nr_dmaps);
> +               }
>                 fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
>                 fi->forget = NULL;
>         }
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 62633555d547..36d824b82ebc 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -896,6 +896,7 @@ struct fuse_copy_file_range_in {
>
>  #define FUSE_SETUPMAPPING_ENTRIES 8
>  #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
> +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
>  struct fuse_setupmapping_in {
>         /* An already open handle */
>         uint64_t        fh;
> --
> 2.20.1
>
Vivek Goyal March 12, 2020, 4:02 p.m. UTC | #2
On Thu, Mar 12, 2020 at 10:43:10AM +0100, Miklos Szeredi wrote:
> On Wed, Mar 4, 2020 at 5:59 PM Vivek Goyal <vgoyal@redhat.com> wrote:
> >
> > This patch implements basic DAX support. mmap() is not implemented
> > yet and will come in later patches. This patch looks into implemeting
> > read/write.
> >
> > We make use of interval tree to keep track of per inode dax mappings.
> >
> > Do not use dax for file extending writes, instead just send WRITE message
> > to daemon (like we do for direct I/O path). This will keep write and
> > i_size change atomic w.r.t crash.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> > Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> > Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
> > ---
> >  fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
> >  fs/fuse/fuse_i.h          |  23 ++
> >  fs/fuse/inode.c           |   6 +
> >  include/uapi/linux/fuse.h |   1 +
> >  4 files changed, 621 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > index 9d67b830fb7a..9effdd3dc6d6 100644
> > --- a/fs/fuse/file.c
> > +++ b/fs/fuse/file.c
> > @@ -18,6 +18,12 @@
> >  #include <linux/swap.h>
> >  #include <linux/falloc.h>
> >  #include <linux/uio.h>
> > +#include <linux/dax.h>
> > +#include <linux/iomap.h>
> > +#include <linux/interval_tree_generic.h>
> > +
> > +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> > +                     START, LAST, static inline, fuse_dax_interval_tree);
> 
> Are you using this because of byte ranges (u64)?   Does it not make
> more sense to use page offsets, which are unsigned long and so fit
> nicely into the generic interval tree?

I think I should be able to use the generic interval tree. I will switch
to that.

[..]
> > +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> > +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > +                                 struct fuse_dax_mapping *dmap, bool writable,
> > +                                 bool upgrade)
> > +{
> > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > +       struct fuse_setupmapping_in inarg;
> > +       FUSE_ARGS(args);
> > +       ssize_t err;
> > +
> > +       WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> > +       WARN_ON(fc->nr_free_ranges < 0);
> > +
> > +       /* Ask fuse daemon to setup mapping */
> > +       memset(&inarg, 0, sizeof(inarg));
> > +       inarg.foffset = offset;
> > +       inarg.fh = -1;
> > +       inarg.moffset = dmap->window_offset;
> > +       inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> > +       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> > +       if (writable)
> > +               inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > +       args.opcode = FUSE_SETUPMAPPING;
> > +       args.nodeid = fi->nodeid;
> > +       args.in_numargs = 1;
> > +       args.in_args[0].size = sizeof(inarg);
> > +       args.in_args[0].value = &inarg;
> 
> args.force = true?

I can do that, but I am not sure what exactly args.force does and
why we need it in this case.

The first thing it does is that the request is allocated with the
__GFP_NOFAIL flag. The second thing is that the caller is forced to
wait for request completion, and it's not an interruptible sleep.

I am wondering what makes FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING requests
special such that we need to set the force flag.

> 
> > +       err = fuse_simple_request(fc, &args);
> > +       if (err < 0) {
> > +               printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
> > +                                __func__, dmap->window_offset, err);
> 
> Is this level of noisiness really needed?  AFAICS, the error will
> reach the caller, in which case we don't usually need to print a
> kernel error.

I will remove it. I think the code in general has quite a few printk() and
pr_debug() calls we can get rid of. Some of them were helpful for debugging
problems while the code was being developed. But now that the code is
working, we should be able to drop some of them.

[..]
> > +static int
> > +fuse_send_removemapping(struct inode *inode,
> > +                       struct fuse_removemapping_in *inargp,
> > +                       struct fuse_removemapping_one *remove_one)
> > +{
> > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > +       FUSE_ARGS(args);
> > +
> > +       args.opcode = FUSE_REMOVEMAPPING;
> > +       args.nodeid = fi->nodeid;
> > +       args.in_numargs = 2;
> > +       args.in_args[0].size = sizeof(*inargp);
> > +       args.in_args[0].value = inargp;
> > +       args.in_args[1].size = inargp->count * sizeof(*remove_one);
> > +       args.in_args[1].value = remove_one;
> 
> args.force = true?

FUSE_REMOVEMAPPING is an optional, nice-to-have request. Will it
help to set force?

> 
> > +       return fuse_simple_request(fc, &args);
> > +}
> > +
> > +static int dmap_removemapping_list(struct inode *inode, unsigned num,
> > +                                  struct list_head *to_remove)
> > +{
> > +       struct fuse_removemapping_one *remove_one, *ptr;
> > +       struct fuse_removemapping_in inarg;
> > +       struct fuse_dax_mapping *dmap;
> > +       int ret, i = 0, nr_alloc;
> > +
> > +       nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
> > +       remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
> > +       if (!remove_one)
> > +               return -ENOMEM;
> > +
> > +       ptr = remove_one;
> > +       list_for_each_entry(dmap, to_remove, list) {
> > +               ptr->moffset = dmap->window_offset;
> > +               ptr->len = dmap->length;
> > +               ptr++;
> 
> Minor nit: ptr = &remove_one[i] at the start of the section would be
> cleaner IMO.

Will do.

[..]
> > +static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > +                                        loff_t length, unsigned flags,
> > +                                        struct iomap *iomap)
> > +{
> > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > +       struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> > +       int ret;
> > +       bool writable = flags & IOMAP_WRITE;
> > +
> > +       alloc_dmap = alloc_dax_mapping(fc);
> > +       if (!alloc_dmap)
> > +               return -EBUSY;
> > +
> > +       /*
> > +        * Take write lock so that only one caller can try to setup mapping
> > +        * and other waits.
> > +        */
> > +       down_write(&fi->i_dmap_sem);
> > +       /*
> > +        * We dropped lock. Check again if somebody else setup
> > +        * mapping already.
> > +        */
> > +       dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
> > +                                               pos);
> > +       if (dmap) {
> > +               fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> > +               dmap_add_to_free_pool(fc, alloc_dmap);
> > +               up_write(&fi->i_dmap_sem);
> > +               return 0;
> > +       }
> > +
> > +       /* Setup one mapping */
> > +       ret = fuse_setup_one_mapping(inode,
> > +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> > +                                    alloc_dmap, writable, false);
> > +       if (ret < 0) {
> > +               printk("fuse_setup_one_mapping() failed. err=%d"
> > +                       " pos=0x%llx, writable=%d\n", ret, pos, writable);
> 
> More  unnecessary noise?

Will remove.

> 
> > +               dmap_add_to_free_pool(fc, alloc_dmap);
> > +               up_write(&fi->i_dmap_sem);
> > +               return ret;
> > +       }
> > +       fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
> > +       up_write(&fi->i_dmap_sem);
> > +       return 0;
> > +}
> > +
> > +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> > +                                        loff_t length, unsigned flags,
> > +                                        struct iomap *iomap)
> > +{
> > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > +       struct fuse_dax_mapping *dmap;
> > +       int ret;
> > +
> > +       /*
> > +        * Take exclusive lock so that only one caller can try to setup
> > +        * mapping and others wait.
> > +        */
> > +       down_write(&fi->i_dmap_sem);
> > +       dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> > +
> > +       /* We are holding either inode lock or i_mmap_sem, and that should
> > +        * ensure that dmap can't reclaimed or truncated and it should still
> > +        * be there in tree despite the fact we dropped and re-acquired the
> > +        * lock.
> > +        */
> > +       ret = -EIO;
> > +       if (WARN_ON(!dmap))
> > +               goto out_err;
> > +
> > +       /* Maybe another thread already upgraded mapping while we were not
> > +        * holding lock.
> > +        */
> > +       if (dmap->writable)
> > +               goto out_fill_iomap;
> > +
> > +       ret = fuse_setup_one_mapping(inode,
> > +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> > +                                    dmap, true, true);
> > +       if (ret < 0) {
> > +               printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> > +                      ret, pos);
> 
> Again.

Will remove. How about converting some of them to pr_debug() instead? It
can help with debugging if something is not working.

> 
> > +               goto out_err;
> > +       }
> > +
> > +out_fill_iomap:
> > +       fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> > +out_err:
> > +       up_write(&fi->i_dmap_sem);
> > +       return ret;
> > +}
> > +
> > +/* This is just for DAX and the mapping is ephemeral, do not use it for other
> > + * purposes since there is no block device with a permanent mapping.
> > + */
> > +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
> > +                           unsigned flags, struct iomap *iomap,
> > +                           struct iomap *srcmap)
> > +{
> > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > +       struct fuse_dax_mapping *dmap;
> > +       bool writable = flags & IOMAP_WRITE;
> > +
> > +       /* We don't support FIEMAP */
> > +       BUG_ON(flags & IOMAP_REPORT);
> > +
> > +       pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
> > +                       pos, length);
> > +
> > +       /*
> > +        * Writes beyond end of file are not handled using dax path. Instead
> > +        * a fuse write message is sent to daemon
> > +        */
> > +       if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
> > +               return -EIO;
> 
> Okay, this will work fine if the host filesystem is not modified by
> other entities.

This requires a little longer explanation. It took me a while to remember
what I did.

For file extending writes, we do not want to go through the dax path
because we want the written data and the file size change to be atomic
w.r.t. a guest crash. So in fuse_dax_write_iter() I detect that this is a
file extending write and call fuse_dax_direct_write() instead, to fall
back to a regular fuse write message and bypass dax.

But if the write partially overwrites existing data and the rest is file
extending, the current logic tries to use dax for the portion which is
being overwritten and falls back to a fuse write message for the
remaining file extending part. And that's why, after the call to
dax_iomap_rw(), I check one more time whether there are bytes that were
not written and use a fuse write to extend the file.

        /*
         * If part of the write was file extending, fuse dax path will not
         * take care of that. Do direct write instead.
         */
        if (iov_iter_count(from) && file_extending_write(iocb, from)) {
                count = fuse_dax_direct_write(iocb, from);
                if (count < 0)
                        goto out;
                ret += count;
        }

dax_iomap_rw() will do the dax operation for the bytes which are within
i_size. Then it will call iomap_apply() again with the file extending
portion of the write, and this time iomap_begin() will return -EIO. And
dax_iomap_rw() will return the number of bytes written (and not -EIO) to
the caller.

        while (iov_iter_count(iter)) {
                ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
                                iter, dax_iomap_actor);
                if (ret <= 0)
                        break;
                pos += ret;
                done += ret;
        }

I am beginning to think that this is way more complicated than it needs
to be. Probably I should detect whether any part of the write is file
extending and, if so, just fall back to using the fuse write path.
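
For illustration, that simplification could look roughly like this in
fuse_dax_write_iter() (hypothetical, untested sketch):

        /*
         * Hypothetical simplification: if any byte of the write would land
         * at or beyond the current i_size, skip dax entirely and send the
         * whole request as a regular fuse WRITE.
         */
        if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)) {
                ret = fuse_dax_direct_write(iocb, from);
                goto out;
        }

        ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);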

> What happens if there's a concurrent truncate going on on the host
> with this write?

For a regular fuse write, concurrent truncate is not a problem. But for
dax read/write/mmap, concurrent truncate is a problem. If another guest
truncates the file (after this guest has mapped the page), then
any attempt to access that page hangs the process. KVM is trying to
fault in a page on the host which does not exist anymore. Currently kvm
does not seem to have the logic to be able to deal with errors in the
async page fault path. And we will have to modify all that so that
we can somehow propagate errors (SIGBUS) to the guest and deliver them
to the process.

So if a process did mmap() and tried to access a truncated portion of the
file, then it should get SIGBUS. If we are doing read/write, then
we should have the logic to deal with this error (exception table
magic) and deliver -EIO to user space.

None of that is handled right now and it is a future TODO item. So
for now, this will work well only with a single guest, and we will
run into issues if we are sharing directories with another
guest.

> If the two are not in any way synchronized than
> either the two following behavior is allowed:
> 
>  1) Whole or partial data in write is truncated. (If there are
> complete pages from the write being truncated, then the writing
> process will receive SIGBUS.  Does KVM hande that?   I remember that
> being discussed, but don't remember the conclusion).
> 
>  2) Write re-extends file size.

Currently, for file extending writes, if the other guest truncates the file
first then the fuse write will extend the file again. If the fuse write
finishes first, then the other guest will truncate the file and reduce its
size.

I think we will have a problem when only part of the write is extending the
file. In that case, for the part of the file which is being overwritten we
are doing dax. And if the other guest truncates the file first, then kvm
will hang.

But that's a problem we have not just with file extending writes, but with
any read/write/mmap w.r.t. truncate by another guest. We will have to
fix that before we support virtiofs+dax for a shared directory.

> 
> However EIO is not a good result, so we need to do something with it.

This -EIO is not seen by the user. dax_iomap_rw() does not return it and
instead returns the number of bytes which have been written.

[..]
> > +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> > +{
> > +       struct inode *inode = file_inode(iocb->ki_filp);
> > +       ssize_t ret, count;
> > +
> > +       if (iocb->ki_flags & IOCB_NOWAIT) {
> > +               if (!inode_trylock(inode))
> > +                       return -EAGAIN;
> > +       } else {
> > +               inode_lock(inode);
> > +       }
> > +
> > +       ret = generic_write_checks(iocb, from);
> > +       if (ret <= 0)
> > +               goto out;
> > +
> > +       ret = file_remove_privs(iocb->ki_filp);
> > +       if (ret)
> > +               goto out;
> > +       /* TODO file_update_time() but we don't want metadata I/O */
> > +
> > +       /* Do not use dax for file extending writes as its an mmap and
> > +        * trying to write beyong end of existing page will generate
> > +        * SIGBUS.
> 
> Ah, here it is.  So what happens in case of a race?  Does that
> currently crash KVM?

In case of a race, yes, KVM hangs. So no shared directory operation yet,
until we have designed proper error handling in the kvm path.

Thanks
Vivek
Miklos Szeredi March 13, 2020, 10:18 a.m. UTC | #3
On Thu, Mar 12, 2020 at 5:02 PM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> On Thu, Mar 12, 2020 at 10:43:10AM +0100, Miklos Szeredi wrote:
> > On Wed, Mar 4, 2020 at 5:59 PM Vivek Goyal <vgoyal@redhat.com> wrote:
> > >
> > > This patch implements basic DAX support. mmap() is not implemented
> > > yet and will come in later patches. This patch looks into implemeting
> > > read/write.
> > >
> > > We make use of interval tree to keep track of per inode dax mappings.
> > >
> > > Do not use dax for file extending writes, instead just send WRITE message
> > > to daemon (like we do for direct I/O path). This will keep write and
> > > i_size change atomic w.r.t crash.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> > > Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> > > Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
> > > ---
> > >  fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
> > >  fs/fuse/fuse_i.h          |  23 ++
> > >  fs/fuse/inode.c           |   6 +
> > >  include/uapi/linux/fuse.h |   1 +
> > >  4 files changed, 621 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > > index 9d67b830fb7a..9effdd3dc6d6 100644
> > > --- a/fs/fuse/file.c
> > > +++ b/fs/fuse/file.c
> > > @@ -18,6 +18,12 @@
> > >  #include <linux/swap.h>
> > >  #include <linux/falloc.h>
> > >  #include <linux/uio.h>
> > > +#include <linux/dax.h>
> > > +#include <linux/iomap.h>
> > > +#include <linux/interval_tree_generic.h>
> > > +
> > > +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> > > +                     START, LAST, static inline, fuse_dax_interval_tree);
> >
> > Are you using this because of byte ranges (u64)?   Does it not make
> > more sense to use page offsets, which are unsigned long and so fit
> > nicely into the generic interval tree?
>
> I think I should be able to use generic interval tree. I will switch
> to that.
>
> [..]
> > > +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> > > +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > +                                 struct fuse_dax_mapping *dmap, bool writable,
> > > +                                 bool upgrade)
> > > +{
> > > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > > +       struct fuse_setupmapping_in inarg;
> > > +       FUSE_ARGS(args);
> > > +       ssize_t err;
> > > +
> > > +       WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> > > +       WARN_ON(fc->nr_free_ranges < 0);
> > > +
> > > +       /* Ask fuse daemon to setup mapping */
> > > +       memset(&inarg, 0, sizeof(inarg));
> > > +       inarg.foffset = offset;
> > > +       inarg.fh = -1;
> > > +       inarg.moffset = dmap->window_offset;
> > > +       inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> > > +       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> > > +       if (writable)
> > > +               inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > +       args.opcode = FUSE_SETUPMAPPING;
> > > +       args.nodeid = fi->nodeid;
> > > +       args.in_numargs = 1;
> > > +       args.in_args[0].size = sizeof(inarg);
> > > +       args.in_args[0].value = &inarg;
> >
> > args.force = true?
>
> I can do that but I am not sure what exactly does args.force do and
> why do we need it in this case.

Hm, it prevents interrupts.  Looking closely, however, it will only
prevent SIGKILL from immediately interrupting the request; otherwise
it will send an INTERRUPT request, and the filesystem can ignore that.
Might make sense to have an args.nonint flag to prevent the sending of
INTERRUPT...

> First thing it does is that request is allocated with flag __GFP_NOFAIL.
> Second thing it does is that caller is forced to wait for request
> completion and its not an interruptible sleep.
>
> I am wondering what makes FUSE_SETUPMAPING/FUSE_REMOVEMAPPING requests
> special that we need to set force flag.

Maybe not for SETUPMAPPING (I was confused by the error log).

However, if REMOVEMAPPING fails for some reason, then that dax mapping
will be leaked for the lifetime of the filesystem.  Or am I
misunderstanding it?

> > > +       ret = fuse_setup_one_mapping(inode,
> > > +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> > > +                                    dmap, true, true);
> > > +       if (ret < 0) {
> > > +               printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> > > +                      ret, pos);
> >
> > Again.
>
> Will remove. How about converting some of them to pr_debug() instead? It
> can help with debugging if something is not working.

Okay, and please move it to fuse_setup_one_mapping() where there's
already a pr_debug() for the success case.

> > > +
> > > +       /* Do not use dax for file extending writes as its an mmap and
> > > +        * trying to write beyong end of existing page will generate
> > > +        * SIGBUS.
> >
> > Ah, here it is.  So what happens in case of a race?  Does that
> > currently crash KVM?
>
> In case of race, yes, KVM hangs. So no shared directory operation yet
> till we have designed proper error handling in kvm path.

I think before this is merged we have to fix the KVM crash; that's not
acceptable even if we explicitly say that shared directory is not
supported for the time being.

Thanks,
Miklos
Vivek Goyal March 13, 2020, 1:41 p.m. UTC | #4
On Fri, Mar 13, 2020 at 11:18:15AM +0100, Miklos Szeredi wrote:

[..]
> > > > +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> > > > +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > > +                                 struct fuse_dax_mapping *dmap, bool writable,
> > > > +                                 bool upgrade)
> > > > +{
> > > > +       struct fuse_conn *fc = get_fuse_conn(inode);
> > > > +       struct fuse_inode *fi = get_fuse_inode(inode);
> > > > +       struct fuse_setupmapping_in inarg;
> > > > +       FUSE_ARGS(args);
> > > > +       ssize_t err;
> > > > +
> > > > +       WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> > > > +       WARN_ON(fc->nr_free_ranges < 0);
> > > > +
> > > > +       /* Ask fuse daemon to setup mapping */
> > > > +       memset(&inarg, 0, sizeof(inarg));
> > > > +       inarg.foffset = offset;
> > > > +       inarg.fh = -1;
> > > > +       inarg.moffset = dmap->window_offset;
> > > > +       inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> > > > +       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> > > > +       if (writable)
> > > > +               inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > > +       args.opcode = FUSE_SETUPMAPPING;
> > > > +       args.nodeid = fi->nodeid;
> > > > +       args.in_numargs = 1;
> > > > +       args.in_args[0].size = sizeof(inarg);
> > > > +       args.in_args[0].value = &inarg;
> > >
> > > args.force = true?
> >
> > I can do that but I am not sure what exactly does args.force do and
> > why do we need it in this case.
> 
> Hm, it prevents interrupts.  Looking closely, however it will only
> prevent SIGKILL from immediately interrupting the request, otherwise
> it will send an INTERRUPT request and the filesystem can ignore that.
> Might make sense to have a args.nonint flag to prevent the sending of
> INTERRUPT...

Hi Miklos,

virtiofs does not support interrupt requests yet. Its fiq interrupt
handler just does not do anything.

static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
        /*
         * TODO interrupts.
         *
         * Normal fs operations on a local filesystems aren't interruptible.
         * Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
         * with shared lock between host and guest.
         */
        spin_unlock(&fiq->lock);
}

So as of now, setting force or not will not make any difference. We will
still end up waiting for the request to finish.

In fact, I think there is no mechanism to set fc->no_interrupt in
virtio_fs. If I am reading request_wait_answer() correctly, it will
see that fc->no_interrupt is not set. That means the filesystem supports
interrupt requests, and it will do wait_event_interruptible() and
not even check the FR_FORCE bit.

Right now fc->no_interrupt is only set in response to an INTERRUPT
request reply. Would it make sense to also allow setting it as part of
the connection negotiation protocol, so that the filesystem can say up
front that it does not support interrupts and virtiofs can make use
of that?
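
Something like the below is what I have in mind (the flag is made up
just to illustrate the idea; bit value chosen arbitrarily):

/* hypothetical INIT flag: filesystem does not want INTERRUPT requests */
#define FUSE_NO_INTERRUPT	(1 << 31)

	/* in process_init_reply(), if the server advertised the flag */
	if (arg->flags & FUSE_NO_INTERRUPT)
		fc->no_interrupt = 1;

Then the virtiofs daemon could simply set that flag in its INIT reply
instead of the kernel learning it only from an INTERRUPT reply.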

So the force flag is only useful if the filesystem does not support
interrupts; in that case we do wait_event_killable() and, upon receiving
SIGKILL, cancel the request if it is still in the pending queue. For
virtiofs, we take the request out of the fiq->pending queue in the
submission path itself, and if it can't be dispatched it waits on a
virtiofs-specific queue with FR_PENDING cleared. That means setting
FR_FORCE for virtiofs does not mean anything, as the caller will end up
waiting for the request to finish anyway.

IOW, setting FR_FORCE will make sense once we have a mechanism to
detect that a request is still queued in the virtiofs queues and a
mechanism to cancel it. We don't have that. In fact, given that we are
a push model, we dispatch requests to the filesystem immediately unless
the virtqueue is full. So the probability of a request still sitting in
a virtiofs queue is low.

So maybe we can start setting force at some point later, when we have a
mechanism to detect and cancel pending requests in virtiofs.

> 
> > First thing it does is that request is allocated with flag __GFP_NOFAIL.
> > Second thing it does is that caller is forced to wait for request
> > completion and its not an interruptible sleep.
> >
> > I am wondering what makes FUSE_SETUPMAPING/FUSE_REMOVEMAPPING requests
> > special that we need to set force flag.
> 
> Maybe not for SETUPMAPPING (I was confused by the error log).
> 
> However if REMOVEMAPPING fails for some reason, than that dax mapping
> will be leaked for the lifetime of the filesystem.   Or am I
> misunderstanding it?

FUSE_REMOVEMAPPING is not a must. If we send another FUSE_SETUPMAPPING,
it will create the new mapping and free up the resources associated with
the previous mapping, IIUC.

So at one point we were wondering what the point of sending
FUSE_REMOVEMAPPING is at all. It helps a bit by freeing up filesystem
resources earlier. If the cache size is big, there will not be much
reclaim activity going on, and without FUSE_REMOVEMAPPING all these
filesystem resources will remain busy on the host for a long time.

> 
> > > > +       ret = fuse_setup_one_mapping(inode,
> > > > +                                    ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> > > > +                                    dmap, true, true);
> > > > +       if (ret < 0) {
> > > > +               printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> > > > +                      ret, pos);
> > >
> > > Again.
> >
> > Will remove. How about converting some of them to pr_debug() instead? It
> > can help with debugging if something is not working.
> 
> Okay, and please move it to fuse_setup_one_mapping() where there's
> already a pr_debug() for the success case.

Will do.

> 
>  > > +
> > > > +       /* Do not use dax for file extending writes as its an mmap and
> > > > +        * trying to write beyong end of existing page will generate
> > > > +        * SIGBUS.
> > >
> > > Ah, here it is.  So what happens in case of a race?  Does that
> > > currently crash KVM?
> >
> > In case of race, yes, KVM hangs. So no shared directory operation yet
> > till we have designed proper error handling in kvm path.
> 
> I think before this is merged we have to fix the KVM crash; that's not
> acceptable even if we explicitly say that shared directory is not
> supported for the time being.

Ok, I will look into it. I had done some work on this in the past and
realized it's not trivial to fix the kvm error paths. There are no users
yet, and propagating signals back into qemu instances and finding the
right process is going to be tricky.

Given the complexity of that work, I thought that for now we could say
that a shared directory is not supported, and once the basic dax patches
get merged, focus on the kvm work.

Thanks
Vivek
Liu Bo April 4, 2020, 12:25 a.m. UTC | #5
On Wed, Mar 04, 2020 at 11:58:38AM -0500, Vivek Goyal wrote:
> This patch implements basic DAX support. mmap() is not implemented
> yet and will come in later patches. This patch looks into implemeting
> read/write.
> 
> We make use of interval tree to keep track of per inode dax mappings.
> 
> Do not use dax for file extending writes, instead just send WRITE message
> to daemon (like we do for direct I/O path). This will keep write and
> i_size change atomic w.r.t crash.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
> ---
>  fs/fuse/file.c            | 597 +++++++++++++++++++++++++++++++++++++-
>  fs/fuse/fuse_i.h          |  23 ++
>  fs/fuse/inode.c           |   6 +
>  include/uapi/linux/fuse.h |   1 +
>  4 files changed, 621 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 9d67b830fb7a..9effdd3dc6d6 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -18,6 +18,12 @@
>  #include <linux/swap.h>
>  #include <linux/falloc.h>
>  #include <linux/uio.h>
> +#include <linux/dax.h>
> +#include <linux/iomap.h>
> +#include <linux/interval_tree_generic.h>
> +
> +INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
> +                     START, LAST, static inline, fuse_dax_interval_tree);
>  
>  static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
>  				      struct fuse_page_desc **desc)
> @@ -187,6 +193,242 @@ static void fuse_link_write_file(struct file *file)
>  	spin_unlock(&fi->lock);
>  }
>  
> +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
> +{
> +	struct fuse_dax_mapping *dmap = NULL;
> +
> +	spin_lock(&fc->lock);
> +
> +	if (fc->nr_free_ranges <= 0) {
> +		spin_unlock(&fc->lock);
> +		return NULL;
> +	}
> +
> +	WARN_ON(list_empty(&fc->free_ranges));
> +
> +	/* Take a free range */
> +	dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping,
> +					list);
> +	list_del_init(&dmap->list);
> +	fc->nr_free_ranges--;
> +	spin_unlock(&fc->lock);
> +	return dmap;
> +}
> +
> +/* This assumes fc->lock is held */
> +static void __dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	list_add_tail(&dmap->list, &fc->free_ranges);
> +	fc->nr_free_ranges++;
> +}
> +
> +static void dmap_add_to_free_pool(struct fuse_conn *fc,
> +				struct fuse_dax_mapping *dmap)
> +{
> +	/* Return fuse_dax_mapping to free list */
> +	spin_lock(&fc->lock);
> +	__dmap_add_to_free_pool(fc, dmap);
> +	spin_unlock(&fc->lock);
> +}
> +
> +/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> +static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> +				  struct fuse_dax_mapping *dmap, bool writable,
> +				  bool upgrade)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_setupmapping_in inarg;
> +	FUSE_ARGS(args);
> +	ssize_t err;
> +
> +	WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
> +	WARN_ON(fc->nr_free_ranges < 0);
> +
> +	/* Ask fuse daemon to setup mapping */
> +	memset(&inarg, 0, sizeof(inarg));
> +	inarg.foffset = offset;
> +	inarg.fh = -1;
> +	inarg.moffset = dmap->window_offset;
> +	inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> +	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> +	if (writable)
> +		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> +	args.opcode = FUSE_SETUPMAPPING;
> +	args.nodeid = fi->nodeid;
> +	args.in_numargs = 1;
> +	args.in_args[0].size = sizeof(inarg);
> +	args.in_args[0].value = &inarg;
> +	err = fuse_simple_request(fc, &args);
> +	if (err < 0) {
> +		printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
> +				 __func__, dmap->window_offset, err);
> +		return err;
> +	}
> +
> +	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
> +		 " err=%zd\n", offset, writable, err);
> +
> +	dmap->writable = writable;
> +	if (!upgrade) {
> +		dmap->start = offset;
> +		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> +		/* Protected by fi->i_dmap_sem */
> +		fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> +		fi->nr_dmaps++;
> +	}
> +	return 0;
> +}
> +
> +static int
> +fuse_send_removemapping(struct inode *inode,
> +			struct fuse_removemapping_in *inargp,
> +			struct fuse_removemapping_one *remove_one)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	FUSE_ARGS(args);
> +
> +	args.opcode = FUSE_REMOVEMAPPING;
> +	args.nodeid = fi->nodeid;
> +	args.in_numargs = 2;
> +	args.in_args[0].size = sizeof(*inargp);
> +	args.in_args[0].value = inargp;
> +	args.in_args[1].size = inargp->count * sizeof(*remove_one);
> +	args.in_args[1].value = remove_one;
> +	return fuse_simple_request(fc, &args);
> +}
> +
> +static int dmap_removemapping_list(struct inode *inode, unsigned num,
> +				   struct list_head *to_remove)
> +{
> +	struct fuse_removemapping_one *remove_one, *ptr;
> +	struct fuse_removemapping_in inarg;
> +	struct fuse_dax_mapping *dmap;
> +	int ret, i = 0, nr_alloc;
> +
> +	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
> +	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
> +	if (!remove_one)
> +		return -ENOMEM;
> +
> +	ptr = remove_one;
> +	list_for_each_entry(dmap, to_remove, list) {
> +		ptr->moffset = dmap->window_offset;
> +		ptr->len = dmap->length;
> +		ptr++;
> +		i++;
> +		num--;
> +		if (i >= nr_alloc || num == 0) {
> +			memset(&inarg, 0, sizeof(inarg));
> +			inarg.count = i;
> +			ret = fuse_send_removemapping(inode, &inarg,
> +						      remove_one);
> +			if (ret)
> +				goto out;
> +			ptr = remove_one;
> +			i = 0;
> +		}
> +	}
> +out:
> +	kfree(remove_one);
> +	return ret;
> +}
> +
> +/*
> + * Cleanup dmap entry and add back to free list. This should be called with
> + * fc->lock held.
> + */
> +static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
> +					    struct fuse_dax_mapping *dmap)
> +{
> +	pr_debug("fuse: freeing memory range start=0x%llx end=0x%llx "
> +		 "window_offset=0x%llx length=0x%llx\n", dmap->start,
> +		 dmap->end, dmap->window_offset, dmap->length);
> +	dmap->start = dmap->end = 0;
> +	__dmap_add_to_free_pool(fc, dmap);
> +}
> +
> +/*
> + * Free inode dmap entries whose range falls entirely inside [start, end].
> + * Does not take any locks. At this point of time it should only be
> + * called from evict_inode() path where we know all dmap entries can be
> + * reclaimed.
> + */
> +static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
> +				      loff_t start, loff_t end)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap, *n;
> +	int err, num = 0;
> +	LIST_HEAD(to_remove);
> +
> +	pr_debug("fuse: %s: start=0x%llx, end=0x%llx\n", __func__, start, end);
> +
> +	/*
> +	 * Interval tree search matches intersecting entries. Adjust the range
> +	 * to avoid dropping partial valid entries.
> +	 */
> +	start = ALIGN(start, FUSE_DAX_MEM_RANGE_SZ);
> +	end = ALIGN_DOWN(end, FUSE_DAX_MEM_RANGE_SZ);
> +
> +	while (1) {
> +		dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, start,
> +							 end);
> +		if (!dmap)
> +			break;
> +		fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
> +		num++;
> +		list_add(&dmap->list, &to_remove);
> +	}
> +
> +	/* Nothing to remove */
> +	if (list_empty(&to_remove))
> +		return;
> +
> +	WARN_ON(fi->nr_dmaps < num);
> +	fi->nr_dmaps -= num;
> +	/*
> +	 * During umount/shutdown, fuse connection is dropped first
> +	 * and evict_inode() is called later. That means any
> +	 * removemapping messages are going to fail. Send messages
> +	 * only if connection is up. Otherwise fuse daemon is
> +	 * responsible for cleaning up any leftover references and
> +	 * mappings.
> +	 */
> +	if (fc->connected) {
> +		err = dmap_removemapping_list(inode, num, &to_remove);
> +		if (err) {
> +			pr_warn("Failed to removemappings. start=0x%llx"
> +				" end=0x%llx\n", start, end);
> +		}
> +	}
> +	spin_lock(&fc->lock);
> +	list_for_each_entry_safe(dmap, n, &to_remove, list) {
> +		list_del_init(&dmap->list);
> +		dmap_reinit_add_to_free_pool(fc, dmap);
> +	}
> +	spin_unlock(&fc->lock);
> +}
> +
> +/*
> + * It is called from evict_inode() and by that time inode is going away. So
> + * this function does not take any locks like fi->i_dmap_sem for traversing
> + * that fuse inode interval tree. If that lock is taken then lock validator
> + * complains of deadlock situation w.r.t fs_reclaim lock.
> + */
> +void fuse_cleanup_inode_mappings(struct inode *inode)
> +{
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	/*
> +	 * fuse_evict_inode() has already called truncate_inode_pages_final()
> +	 * before we arrive here. So we should not have to worry about
> +	 * any pages/exception entries still associated with inode.
> +	 */
> +	inode_reclaim_dmap_range(fc, inode, 0, -1);
> +}
> +
>  void fuse_finish_open(struct inode *inode, struct file *file)
>  {
>  	struct fuse_file *ff = file->private_data;
> @@ -1562,32 +1804,364 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	return res;
>  }
>  
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
>  static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_read_iter(iocb, to);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_read_iter(iocb, to);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_read_iter(iocb, to);
> +
> +	return fuse_cache_read_iter(iocb, to);
>  }
>  
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
>  static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct fuse_file *ff = file->private_data;
> +	struct inode *inode = file->f_mapping->host;
>  
>  	if (is_bad_inode(file_inode(file)))
>  		return -EIO;
>  
> -	if (!(ff->open_flags & FOPEN_DIRECT_IO))
> -		return fuse_cache_write_iter(iocb, from);
> -	else
> +	if (IS_DAX(inode))
> +		return fuse_dax_write_iter(iocb, from);
> +
> +	if (ff->open_flags & FOPEN_DIRECT_IO)
>  		return fuse_direct_write_iter(iocb, from);
> +
> +	return fuse_cache_write_iter(iocb, from);
> +}
> +
> +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
> +{
> +	iomap->addr = IOMAP_NULL_ADDR;
> +	iomap->length = length;
> +	iomap->type = IOMAP_HOLE;
> +}
> +
> +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
> +			struct iomap *iomap, struct fuse_dax_mapping *dmap,
> +			unsigned flags)
> +{
> +	loff_t offset, len;
> +	loff_t i_size = i_size_read(inode);
> +
> +	offset = pos - dmap->start;
> +	len = min(length, dmap->length - offset);
> +
> +	/* If length is beyond end of file, truncate further */
> +	if (pos + len > i_size)
> +		len = i_size - pos;
> +
> +	if (len > 0) {
> +		iomap->addr = dmap->window_offset + offset;
> +		iomap->length = len;
> +		if (flags & IOMAP_FAULT)
> +			iomap->length = ALIGN(len, PAGE_SIZE);
> +		iomap->type = IOMAP_MAPPED;
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				" length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	} else {
> +		/* Mapping beyond end of file is hole */
> +		fuse_fill_iomap_hole(iomap, length);
> +		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
> +				"length 0x%llx\n", __func__, iomap->addr,
> +				iomap->offset, iomap->length);
> +	}
> +}
> +
> +static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> +	int ret;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	alloc_dmap = alloc_dax_mapping(fc);
> +	if (!alloc_dmap)
> +		return -EBUSY;
> +
> +	/*
> +	 * Take write lock so that only one caller can try to setup mapping
> +	 * and other waits.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	/*
> +	 * We dropped lock. Check again if somebody else setup
> +	 * mapping already.
> +	 */
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
> +						pos);
> +	if (dmap) {
> +		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return 0;
> +	}
> +
> +	/* Setup one mapping */
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     alloc_dmap, writable, false);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d"
> +			" pos=0x%llx, writable=%d\n", ret, pos, writable);
> +		dmap_add_to_free_pool(fc, alloc_dmap);
> +		up_write(&fi->i_dmap_sem);
> +		return ret;
> +	}
> +	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
> +	up_write(&fi->i_dmap_sem);
> +	return 0;
> +}
> +
> +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> +					 loff_t length, unsigned flags,
> +					 struct iomap *iomap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_dax_mapping *dmap;
> +	int ret;
> +
> +	/*
> +	 * Take exclusive lock so that only one caller can try to setup
> +	 * mapping and others wait.
> +	 */
> +	down_write(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	/* We are holding either inode lock or i_mmap_sem, and that should
> +	 * ensure that dmap can't be reclaimed or truncated and it should still
> +	 * be there in tree despite the fact we dropped and re-acquired the
> +	 * lock.
> +	 */
> +	ret = -EIO;
> +	if (WARN_ON(!dmap))
> +		goto out_err;
> +
> +	/* Maybe another thread already upgraded mapping while we were not
> +	 * holding lock.
> +	 */
> +	if (dmap->writable)

oops, looks like it's still returning -EIO here, %ret should be zero.

thanks,
liubo

> +		goto out_fill_iomap;
> +
> +	ret = fuse_setup_one_mapping(inode,
> +				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
> +				     dmap, true, true);
> +	if (ret < 0) {
> +		printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
> +		       ret, pos);
> +		goto out_err;
> +	}
> +
> +out_fill_iomap:
> +	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +out_err:
> +	up_write(&fi->i_dmap_sem);
> +	return ret;
> +}
> +
> +/* This is just for DAX and the mapping is ephemeral, do not use it for other
> + * purposes since there is no block device with a permanent mapping.
> + */
> +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
> +			    unsigned flags, struct iomap *iomap,
> +			    struct iomap *srcmap)
> +{
> +	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
> +	struct fuse_dax_mapping *dmap;
> +	bool writable = flags & IOMAP_WRITE;
> +
> +	/* We don't support FIEMAP */
> +	BUG_ON(flags & IOMAP_REPORT);
> +
> +	pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
> +			pos, length);
> +
> +	/*
> +	 * Writes beyond end of file are not handled using dax path. Instead
> +	 * a fuse write message is sent to daemon
> +	 */
> +	if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
> +		return -EIO;
> +
> +	iomap->offset = pos;
> +	iomap->flags = 0;
> +	iomap->bdev = NULL;
> +	iomap->dax_dev = fc->dax_dev;
> +
> +	/*
> +	 * Both read/write and mmap path can race here. So we need something
> +	 * to make sure if we are setting up mapping, then other path waits
> +	 *
> +	 * For now, use a semaphore for this. It probably needs to be
> +	 * optimized later.
> +	 */
> +	down_read(&fi->i_dmap_sem);
> +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> +
> +	if (dmap) {
> +		if (writable && !dmap->writable) {
> +			/* Upgrade read-only mapping to read-write. This will
> +			 * require exclusive i_dmap_sem lock as we don't want
> +			 * two threads to be trying to do this simultaneously
> +			 * for same dmap. So drop shared lock and acquire
> +			 * exclusive lock.
> +			 */
> +			up_read(&fi->i_dmap_sem);
> +			pr_debug("%s: Upgrading mapping at offset 0x%llx"
> +				 " length 0x%llx\n", __func__, pos, length);
> +			return iomap_begin_upgrade_mapping(inode, pos, length,
> +							   flags, iomap);
> +		} else {
> +			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
> +			up_read(&fi->i_dmap_sem);
> +			return 0;
> +		}
> +	} else {
> +		up_read(&fi->i_dmap_sem);
> +		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
> +				__func__, pos, length);
> +		if (pos >= i_size_read(inode))
> +			goto iomap_hole;
> +
> +		return iomap_begin_setup_new_mapping(inode, pos, length, flags,
> +						     iomap);
> +	}
> +
> +	/*
> +	 * If read beyond end of file happens, fs code seems to return
> +	 * it as hole
> +	 */
> +iomap_hole:
> +	fuse_fill_iomap_hole(iomap, length);
> +	pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length);
> +	return 0;
> +}
> +
> +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
> +			  ssize_t written, unsigned flags,
> +			  struct iomap *iomap)
> +{
> +	/* DAX writes beyond end-of-file aren't handled using iomap, so the
> +	 * file size is unchanged and there is nothing to do here.
> +	 */
> +	return 0;
> +}
> +
> +static const struct iomap_ops fuse_iomap_ops = {
> +	.iomap_begin = fuse_iomap_begin,
> +	.iomap_end = fuse_iomap_end,
> +};
> +
> +static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock_shared(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock_shared(inode);
> +	}
> +
> +	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
> +	inode_unlock_shared(inode);
> +
> +	/* TODO file_accessed(iocb->f_filp) */
> +	return ret;
> +}
> +
> +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	return (iov_iter_rw(from) == WRITE &&
> +		((iocb->ki_pos) >= i_size_read(inode)));
> +}
> +
> +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
> +	ssize_t ret;
> +
> +	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
> +	if (ret < 0)
> +		return ret;
> +
> +	fuse_invalidate_attr(inode);
> +	fuse_write_update_size(inode, iocb->ki_pos);
> +	return ret;
> +}
> +
> +static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ssize_t ret, count;
> +
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!inode_trylock(inode))
> +			return -EAGAIN;
> +	} else {
> +		inode_lock(inode);
> +	}
> +
> +	ret = generic_write_checks(iocb, from);
> +	if (ret <= 0)
> +		goto out;
> +
> +	ret = file_remove_privs(iocb->ki_filp);
> +	if (ret)
> +		goto out;
> +	/* TODO file_update_time() but we don't want metadata I/O */
> +
> +	/* Do not use dax for file extending writes as it's an mmap and
> +	 * trying to write beyond the end of an existing page will generate
> +	 * SIGBUS.
> +	 */
> +	if (file_extending_write(iocb, from)) {
> +		ret = fuse_dax_direct_write(iocb, from);
> +		goto out;
> +	}
> +
> +	ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
> +	if (ret < 0)
> +		goto out;
> +
> +	/*
> +	 * If part of the write was file extending, fuse dax path will not
> +	 * take care of that. Do direct write instead.
> +	 */
> +	if (iov_iter_count(from) && file_extending_write(iocb, from)) {
> +		count = fuse_dax_direct_write(iocb, from);
> +		if (count < 0)
> +			goto out;
> +		ret += count;
> +	}
> +
> +out:
> +	inode_unlock(inode);
> +
> +	if (ret > 0)
> +		ret = generic_write_sync(iocb, ret);
> +	return ret;
>  }
>  
>  static void fuse_writepage_free(struct fuse_writepage_args *wpa)
> @@ -2318,6 +2892,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
>  	return 0;
>  }
>  
> +static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	return -EINVAL; /* TODO */
> +}
> +
>  static int convert_fuse_file_lock(struct fuse_conn *fc,
>  				  const struct fuse_file_lock *ffl,
>  				  struct file_lock *fl)
> @@ -3387,6 +3966,7 @@ static const struct address_space_operations fuse_file_aops  = {
>  void fuse_init_file_inode(struct inode *inode)
>  {
>  	struct fuse_inode *fi = get_fuse_inode(inode);
> +	struct fuse_conn *fc = get_fuse_conn(inode);
>  
>  	inode->i_fop = &fuse_file_operations;
>  	inode->i_data.a_ops = &fuse_file_aops;
> @@ -3396,4 +3976,9 @@ void fuse_init_file_inode(struct inode *inode)
>  	fi->writectr = 0;
>  	init_waitqueue_head(&fi->page_waitq);
>  	INIT_LIST_HEAD(&fi->writepages);
> +	fi->dmap_tree = RB_ROOT_CACHED;
> +
> +	if (fc->dax_dev) {
> +		inode->i_flags |= S_DAX;
> +	}
>  }
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index b41275f73e4c..490549862bda 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -70,16 +70,29 @@ struct fuse_forget_link {
>  	struct fuse_forget_link *next;
>  };
>  
> +#define START(node) ((node)->start)
> +#define LAST(node) ((node)->end)
> +
>  /** Translation information for file offsets to DAX window offsets */
>  struct fuse_dax_mapping {
>  	/* Will connect in fc->free_ranges to keep track of free memory */
>  	struct list_head list;
>  
> +	/* For interval tree in file/inode */
> +	struct rb_node rb;
> +	/** Start Position in file */
> +	__u64 start;
> +	/** End Position in file */
> +	__u64 end;
> +	__u64 __subtree_last;
>  	/** Position in DAX window */
>  	u64 window_offset;
>  
>  	/** Length of mapping, in bytes */
>  	loff_t length;
> +
> +	/* Is this mapping read-only or read-write */
> +	bool writable;
>  };
>  
>  /** FUSE inode */
> @@ -167,6 +180,15 @@ struct fuse_inode {
>  
>  	/** Lock to protect write related fields */
>  	spinlock_t lock;
> +
> +	/*
> +	 * Semaphore to protect modifications to dmap_tree
> +	 */
> +	struct rw_semaphore i_dmap_sem;
> +
> +	/** Sorted rb tree of struct fuse_dax_mapping elements */
> +	struct rb_root_cached dmap_tree;
> +	unsigned long nr_dmaps;
>  };
>  
>  /** FUSE inode state bits */
> @@ -1127,5 +1149,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
>   */
>  u64 fuse_get_unique(struct fuse_iqueue *fiq);
>  void fuse_free_conn(struct fuse_conn *fc);
> +void fuse_cleanup_inode_mappings(struct inode *inode);
>  
>  #endif /* _FS_FUSE_I_H */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 36cb9c00bbe5..93bc65607a15 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -86,7 +86,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  	fi->attr_version = 0;
>  	fi->orig_ino = 0;
>  	fi->state = 0;
> +	fi->nr_dmaps = 0;
>  	mutex_init(&fi->mutex);
> +	init_rwsem(&fi->i_dmap_sem);
>  	spin_lock_init(&fi->lock);
>  	fi->forget = fuse_alloc_forget();
>  	if (!fi->forget) {
> @@ -114,6 +116,10 @@ static void fuse_evict_inode(struct inode *inode)
>  	clear_inode(inode);
>  	if (inode->i_sb->s_flags & SB_ACTIVE) {
>  		struct fuse_conn *fc = get_fuse_conn(inode);
> +		if (IS_DAX(inode)) {
> +			fuse_cleanup_inode_mappings(inode);
> +			WARN_ON(fi->nr_dmaps);
> +		}
>  		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
>  		fi->forget = NULL;
>  	}
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 62633555d547..36d824b82ebc 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -896,6 +896,7 @@ struct fuse_copy_file_range_in {
>  
>  #define FUSE_SETUPMAPPING_ENTRIES 8
>  #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
> +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
>  struct fuse_setupmapping_in {
>  	/* An already open handle */
>  	uint64_t	fh;
> -- 
> 2.20.1
Vivek Goyal April 14, 2020, 12:54 p.m. UTC | #6
On Sat, Apr 04, 2020 at 08:25:21AM +0800, Liu Bo wrote:

[..]
> > +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
> > +					 loff_t length, unsigned flags,
> > +					 struct iomap *iomap)
> > +{
> > +	struct fuse_inode *fi = get_fuse_inode(inode);
> > +	struct fuse_dax_mapping *dmap;
> > +	int ret;
> > +
> > +	/*
> > +	 * Take exclusive lock so that only one caller can try to setup
> > +	 * mapping and others wait.
> > +	 */
> > +	down_write(&fi->i_dmap_sem);
> > +	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
> > +
> > +	/* We are holding either inode lock or i_mmap_sem, and that should
> > +	 * ensure that dmap can't reclaimed or truncated and it should still
> > +	 * be there in tree despite the fact we dropped and re-acquired the
> > +	 * lock.
> > +	 */
> > +	ret = -EIO;
> > +	if (WARN_ON(!dmap))
> > +		goto out_err;
> > +
> > +	/* Maybe another thread already upgraded mapping while we were not
> > +	 * holding lock.
> > +	 */
> > +	if (dmap->writable)
> 
> oops, looks like it's still returning -EIO here, %ret should be zero.
> 

Good catch. Will fix it.
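
Probably just something like this on top (untested):

	/* Maybe another thread already upgraded mapping while we were not
	 * holding lock.
	 */
	if (dmap->writable) {
		ret = 0;
		goto out_fill_iomap;
	}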

Vivek
diff mbox series

Patch

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9d67b830fb7a..9effdd3dc6d6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -18,6 +18,12 @@ 
 #include <linux/swap.h>
 #include <linux/falloc.h>
 #include <linux/uio.h>
+#include <linux/dax.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree_generic.h>
+
+INTERVAL_TREE_DEFINE(struct fuse_dax_mapping, rb, __u64, __subtree_last,
+                     START, LAST, static inline, fuse_dax_interval_tree);
 
 static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
 				      struct fuse_page_desc **desc)
@@ -187,6 +193,242 @@  static void fuse_link_write_file(struct file *file)
 	spin_unlock(&fi->lock);
 }
 
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn *fc)
+{
+	struct fuse_dax_mapping *dmap = NULL;
+
+	spin_lock(&fc->lock);
+
+	if (fc->nr_free_ranges <= 0) {
+		spin_unlock(&fc->lock);
+		return NULL;
+	}
+
+	WARN_ON(list_empty(&fc->free_ranges));
+
+	/* Take a free range */
+	dmap = list_first_entry(&fc->free_ranges, struct fuse_dax_mapping,
+					list);
+	list_del_init(&dmap->list);
+	fc->nr_free_ranges--;
+	spin_unlock(&fc->lock);
+	return dmap;
+}
+
+/* This assumes fc->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn *fc,
+				struct fuse_dax_mapping *dmap)
+{
+	list_add_tail(&dmap->list, &fc->free_ranges);
+	fc->nr_free_ranges++;
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn *fc,
+				struct fuse_dax_mapping *dmap)
+{
+	/* Return fuse_dax_mapping to free list */
+	spin_lock(&fc->lock);
+	__dmap_add_to_free_pool(fc, dmap);
+	spin_unlock(&fc->lock);
+}
+
+/* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
+static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
+				  struct fuse_dax_mapping *dmap, bool writable,
+				  bool upgrade)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_setupmapping_in inarg;
+	FUSE_ARGS(args);
+	ssize_t err;
+
+	WARN_ON(offset % FUSE_DAX_MEM_RANGE_SZ);
+	WARN_ON(fc->nr_free_ranges < 0);
+
+	/* Ask fuse daemon to setup mapping */
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.foffset = offset;
+	inarg.fh = -1;
+	inarg.moffset = dmap->window_offset;
+	inarg.len = FUSE_DAX_MEM_RANGE_SZ;
+	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+	if (writable)
+		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+	args.opcode = FUSE_SETUPMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err < 0) {
+		printk(KERN_ERR "%s request failed at mem_offset=0x%llx %zd\n",
+				 __func__, dmap->window_offset, err);
+		return err;
+	}
+
+	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d"
+		 " err=%zd\n", offset, writable, err);
+
+	dmap->writable = writable;
+	if (!upgrade) {
+		dmap->start = offset;
+		dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
+		/* Protected by fi->i_dmap_sem */
+		fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
+		fi->nr_dmaps++;
+	}
+	return 0;
+}
+
+static int
+fuse_send_removemapping(struct inode *inode,
+			struct fuse_removemapping_in *inargp,
+			struct fuse_removemapping_one *remove_one)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+
+	args.opcode = FUSE_REMOVEMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(*inargp);
+	args.in_args[0].value = inargp;
+	args.in_args[1].size = inargp->count * sizeof(*remove_one);
+	args.in_args[1].value = remove_one;
+	return fuse_simple_request(fc, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned num,
+				   struct list_head *to_remove)
+{
+	struct fuse_removemapping_one *remove_one, *ptr;
+	struct fuse_removemapping_in inarg;
+	struct fuse_dax_mapping *dmap;
+	int ret, i = 0, nr_alloc;
+
+	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+	if (!remove_one)
+		return -ENOMEM;
+
+	ptr = remove_one;
+	list_for_each_entry(dmap, to_remove, list) {
+		ptr->moffset = dmap->window_offset;
+		ptr->len = dmap->length;
+		ptr++;
+		i++;
+		num--;
+		if (i >= nr_alloc || num == 0) {
+			memset(&inarg, 0, sizeof(inarg));
+			inarg.count = i;
+			ret = fuse_send_removemapping(inode, &inarg,
+						      remove_one);
+			if (ret)
+				goto out;
+			ptr = remove_one;
+			i = 0;
+		}
+	}
+out:
+	kfree(remove_one);
+	return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fc->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn *fc,
+					    struct fuse_dax_mapping *dmap)
+{
+	pr_debug("fuse: freeing memory range start=0x%llx end=0x%llx "
+		 "window_offset=0x%llx length=0x%llx\n", dmap->start,
+		 dmap->end, dmap->window_offset, dmap->length);
+	dmap->start = dmap->end = 0;
+	__dmap_add_to_free_pool(fc, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls entirely inside [start, end].
+ * Does not take any locks. At this point of time it should only be
+ * called from evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn *fc, struct inode *inode,
+				      loff_t start, loff_t end)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap, *n;
+	int err, num = 0;
+	LIST_HEAD(to_remove);
+
+	pr_debug("fuse: %s: start=0x%llx, end=0x%llx\n", __func__, start, end);
+
+	/*
+	 * Interval tree search matches intersecting entries. Adjust the range
+	 * to avoid dropping partial valid entries.
+	 */
+	start = ALIGN(start, FUSE_DAX_MEM_RANGE_SZ);
+	end = ALIGN_DOWN(end, FUSE_DAX_MEM_RANGE_SZ);
+
+	while (1) {
+		dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, start,
+							 end);
+		if (!dmap)
+			break;
+		fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
+		num++;
+		list_add(&dmap->list, &to_remove);
+	}
+
+	/* Nothing to remove */
+	if (list_empty(&to_remove))
+		return;
+
+	WARN_ON(fi->nr_dmaps < num);
+	fi->nr_dmaps -= num;
+	/*
+	 * During umount/shutdown, fuse connection is dropped first
+	 * and evict_inode() is called later. That means any
+	 * removemapping messages are going to fail. Send messages
+	 * only if connection is up. Otherwise fuse daemon is
+	 * responsible for cleaning up any leftover references and
+	 * mappings.
+	 */
+	if (fc->connected) {
+		err = dmap_removemapping_list(inode, num, &to_remove);
+		if (err) {
+			pr_warn("Failed to removemappings. start=0x%llx"
+				" end=0x%llx\n", start, end);
+		}
+	}
+	spin_lock(&fc->lock);
+	list_for_each_entry_safe(dmap, n, &to_remove, list) {
+		list_del_init(&dmap->list);
+		dmap_reinit_add_to_free_pool(fc, dmap);
+	}
+	spin_unlock(&fc->lock);
+}
+
+/*
+ * It is called from evict_inode() and by that time inode is going away. So
+ * this function does not take any locks like fi->i_dmap_sem for traversing
+ * that fuse inode interval tree. If that lock is taken then lock validator
+ * complains of deadlock situation w.r.t fs_reclaim lock.
+ */
+void fuse_cleanup_inode_mappings(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	/*
+	 * fuse_evict_inode() has already called truncate_inode_pages_final()
+	 * before we arrive here. So we should not have to worry about
+	 * any pages/exception entries still associated with inode.
+	 */
+	inode_reclaim_dmap_range(fc, inode, 0, -1);
+}
+
 void fuse_finish_open(struct inode *inode, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
@@ -1562,32 +1804,364 @@  static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return res;
 }
 
+static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file->f_mapping->host;
 
 	if (is_bad_inode(file_inode(file)))
 		return -EIO;
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
-		return fuse_cache_read_iter(iocb, to);
-	else
+	if (IS_DAX(inode))
+		return fuse_dax_read_iter(iocb, to);
+
+	if (ff->open_flags & FOPEN_DIRECT_IO)
 		return fuse_direct_read_iter(iocb, to);
+
+	return fuse_cache_read_iter(iocb, to);
 }
 
+static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file->f_mapping->host;
 
 	if (is_bad_inode(file_inode(file)))
 		return -EIO;
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
-		return fuse_cache_write_iter(iocb, from);
-	else
+	if (IS_DAX(inode))
+		return fuse_dax_write_iter(iocb, from);
+
+	if (ff->open_flags & FOPEN_DIRECT_IO)
 		return fuse_direct_write_iter(iocb, from);
+
+	return fuse_cache_write_iter(iocb, from);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->length = length;
+	iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+			struct iomap *iomap, struct fuse_dax_mapping *dmap,
+			unsigned flags)
+{
+	loff_t offset, len;
+	loff_t i_size = i_size_read(inode);
+
+	offset = pos - dmap->start;
+	len = min(length, dmap->length - offset);
+
+	/* If length is beyond end of file, truncate further */
+	if (pos + len > i_size)
+		len = i_size - pos;
+
+	if (len > 0) {
+		iomap->addr = dmap->window_offset + offset;
+		iomap->length = len;
+		if (flags & IOMAP_FAULT)
+			iomap->length = ALIGN(len, PAGE_SIZE);
+		iomap->type = IOMAP_MAPPED;
+		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
+				" length 0x%llx\n", __func__, iomap->addr,
+				iomap->offset, iomap->length);
+	} else {
+		/* Mapping beyond end of file is hole */
+		fuse_fill_iomap_hole(iomap, length);
+		pr_debug("%s: returns iomap: addr 0x%llx offset 0x%llx"
+				"length 0x%llx\n", __func__, iomap->addr,
+				iomap->offset, iomap->length);
+	}
+}
+
+static int iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
+					 loff_t length, unsigned flags,
+					 struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+	int ret;
+	bool writable = flags & IOMAP_WRITE;
+
+	alloc_dmap = alloc_dax_mapping(fc);
+	if (!alloc_dmap)
+		return -EBUSY;
+
+	/*
+	 * Take write lock so that only one caller can try to setup mapping
+	 * and other waits.
+	 */
+	down_write(&fi->i_dmap_sem);
+	/*
+	 * We dropped lock. Check again if somebody else setup
+	 * mapping already.
+	 */
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos,
+						pos);
+	if (dmap) {
+		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+		dmap_add_to_free_pool(fc, alloc_dmap);
+		up_write(&fi->i_dmap_sem);
+		return 0;
+	}
+
+	/* Setup one mapping */
+	ret = fuse_setup_one_mapping(inode,
+				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
+				     alloc_dmap, writable, false);
+	if (ret < 0) {
+		printk("fuse_setup_one_mapping() failed. err=%d"
+			" pos=0x%llx, writable=%d\n", ret, pos, writable);
+		dmap_add_to_free_pool(fc, alloc_dmap);
+		up_write(&fi->i_dmap_sem);
+		return ret;
+	}
+	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+	up_write(&fi->i_dmap_sem);
+	return 0;
+}
+
+static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos,
+					 loff_t length, unsigned flags,
+					 struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+
+	/*
+	 * Take exclusive lock so that only one caller can try to setup
+	 * mapping and others wait.
+	 */
+	down_write(&fi->i_dmap_sem);
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
+
+	/* We are holding either inode lock or i_mmap_sem, and that should
+	 * ensure that dmap can't be reclaimed or truncated and it should still
+	 * be there in tree despite the fact we dropped and re-acquired the
+	 * lock.
+	 */
+	ret = -EIO;
+	if (WARN_ON(!dmap))
+		goto out_err;
+
+	/* Maybe another thread already upgraded mapping while we were not
+	 * holding lock.
+	 */
+	if (dmap->writable)
+		goto out_fill_iomap;
+
+	ret = fuse_setup_one_mapping(inode,
+				     ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ),
+				     dmap, true, true);
+	if (ret < 0) {
+		printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n",
+		       ret, pos);
+		goto out_err;
+	}
+
+out_fill_iomap:
+	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+	up_write(&fi->i_dmap_sem);
+	return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+			    unsigned flags, struct iomap *iomap,
+			    struct iomap *srcmap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_dax_mapping *dmap;
+	bool writable = flags & IOMAP_WRITE;
+
+	/* We don't support FIEMAP */
+	BUG_ON(flags & IOMAP_REPORT);
+
+	pr_debug("fuse_iomap_begin() called. pos=0x%llx length=0x%llx\n",
+			pos, length);
+
+	/*
+	 * Writes beyond end of file are not handled using dax path. Instead
+	 * a fuse write message is sent to daemon
+	 */
+	if (flags & IOMAP_WRITE && pos >= i_size_read(inode))
+		return -EIO;
+
+	iomap->offset = pos;
+	iomap->flags = 0;
+	iomap->bdev = NULL;
+	iomap->dax_dev = fc->dax_dev;
+
+	/*
+	 * Both read/write and mmap path can race here. So we need something
+	 * to make sure if we are setting up mapping, then other path waits
+	 *
+	 * For now, use a semaphore for this. It probably needs to be
+	 * optimized later.
+	 */
+	down_read(&fi->i_dmap_sem);
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos);
+
+	if (dmap) {
+		if (writable && !dmap->writable) {
+			/* Upgrade read-only mapping to read-write. This will
+			 * require exclusive i_dmap_sem lock as we don't want
+			 * two threads to be trying to do this simultaneously
+			 * for same dmap. So drop shared lock and acquire
+			 * exclusive lock.
+			 */
+			up_read(&fi->i_dmap_sem);
+			pr_debug("%s: Upgrading mapping at offset 0x%llx"
+				 " length 0x%llx\n", __func__, pos, length);
+			return iomap_begin_upgrade_mapping(inode, pos, length,
+							   flags, iomap);
+		} else {
+			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+			up_read(&fi->i_dmap_sem);
+			return 0;
+		}
+	} else {
+		up_read(&fi->i_dmap_sem);
+		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+				__func__, pos, length);
+		if (pos >= i_size_read(inode))
+			goto iomap_hole;
+
+		return iomap_begin_setup_new_mapping(inode, pos, length, flags,
+						     iomap);
+	}
+
+	/*
+	 * If read beyond end of file happens, fs code seems to return
+	 * it as hole
+	 */
+iomap_hole:
+	fuse_fill_iomap_hole(iomap, length);
+	pr_debug("fuse_iomap_begin() returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", pos, length, iomap->length);
+	return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned flags,
+			  struct iomap *iomap)
+{
+	/* DAX writes beyond end-of-file aren't handled using iomap, so the
+	 * file size is unchanged and there is nothing to do here.
+	 */
+	return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+	.iomap_begin = fuse_iomap_begin,
+	.iomap_end = fuse_iomap_end,
+};
+
+static ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock_shared(inode);
+	}
+
+	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+	inode_unlock_shared(inode);
+
+	/* TODO file_accessed(iocb->f_filp) */
+	return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return (iov_iter_rw(from) == WRITE &&
+		((iocb->ki_pos) >= i_size_read(inode)));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+	ssize_t ret;
+
+	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+	if (ret < 0)
+		return ret;
+
+	fuse_invalidate_attr(inode);
+	fuse_write_update_size(inode, iocb->ki_pos);
+	return ret;
+}
+
+static ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret, count;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock(inode);
+	}
+
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		goto out;
+	/* TODO file_update_time() but we don't want metadata I/O */
+
+	/* Do not use dax for file extending writes as it's an mmap and
+	 * trying to write beyond the end of an existing page will generate
+	 * SIGBUS.
+	 */
+	if (file_extending_write(iocb, from)) {
+		ret = fuse_dax_direct_write(iocb, from);
+		goto out;
+	}
+
+	ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * If part of the write was file extending, fuse dax path will not
+	 * take care of that. Do direct write instead.
+	 */
+	if (iov_iter_count(from) && file_extending_write(iocb, from)) {
+		count = fuse_dax_direct_write(iocb, from);
+		if (count < 0)
+			goto out;
+		ret += count;
+	}
+
+out:
+	inode_unlock(inode);
+
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
 }
 
 static void fuse_writepage_free(struct fuse_writepage_args *wpa)
@@ -2318,6 +2892,11 @@  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return -EINVAL; /* TODO */
+}
+
 static int convert_fuse_file_lock(struct fuse_conn *fc,
 				  const struct fuse_file_lock *ffl,
 				  struct file_lock *fl)
@@ -3387,6 +3966,7 @@  static const struct address_space_operations fuse_file_aops  = {
 void fuse_init_file_inode(struct inode *inode)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	inode->i_fop = &fuse_file_operations;
 	inode->i_data.a_ops = &fuse_file_aops;
@@ -3396,4 +3976,9 @@  void fuse_init_file_inode(struct inode *inode)
 	fi->writectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
 	INIT_LIST_HEAD(&fi->writepages);
+	fi->dmap_tree = RB_ROOT_CACHED;
+
+	if (fc->dax_dev) {
+		inode->i_flags |= S_DAX;
+	}
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b41275f73e4c..490549862bda 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -70,16 +70,29 @@  struct fuse_forget_link {
 	struct fuse_forget_link *next;
 };
 
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->end)
+
 /** Translation information for file offsets to DAX window offsets */
 struct fuse_dax_mapping {
 	/* Will connect in fc->free_ranges to keep track of free memory */
 	struct list_head list;
 
+	/* For interval tree in file/inode */
+	struct rb_node rb;
+	/** Start Position in file */
+	__u64 start;
+	/** End Position in file */
+	__u64 end;
+	__u64 __subtree_last;
 	/** Position in DAX window */
 	u64 window_offset;
 
 	/** Length of mapping, in bytes */
 	loff_t length;
+
+	/* Is this mapping read-only or read-write */
+	bool writable;
 };
 
 /** FUSE inode */
@@ -167,6 +180,15 @@  struct fuse_inode {
 
 	/** Lock to protect write related fields */
 	spinlock_t lock;
+
+	/*
+	 * Semaphore to protect modifications to dmap_tree
+	 */
+	struct rw_semaphore i_dmap_sem;
+
+	/** Sorted rb tree of struct fuse_dax_mapping elements */
+	struct rb_root_cached dmap_tree;
+	unsigned long nr_dmaps;
 };
 
 /** FUSE inode state bits */
@@ -1127,5 +1149,6 @@  unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
  */
 u64 fuse_get_unique(struct fuse_iqueue *fiq);
 void fuse_free_conn(struct fuse_conn *fc);
+void fuse_cleanup_inode_mappings(struct inode *inode);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 36cb9c00bbe5..93bc65607a15 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -86,7 +86,9 @@  static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->attr_version = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
+	fi->nr_dmaps = 0;
 	mutex_init(&fi->mutex);
+	init_rwsem(&fi->i_dmap_sem);
 	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget) {
@@ -114,6 +116,10 @@  static void fuse_evict_inode(struct inode *inode)
 	clear_inode(inode);
 	if (inode->i_sb->s_flags & SB_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
+		if (IS_DAX(inode)) {
+			fuse_cleanup_inode_mappings(inode);
+			WARN_ON(fi->nr_dmaps);
+		}
 		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
 		fi->forget = NULL;
 	}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 62633555d547..36d824b82ebc 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -896,6 +896,7 @@  struct fuse_copy_file_range_in {
 
 #define FUSE_SETUPMAPPING_ENTRIES 8
 #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
 struct fuse_setupmapping_in {
 	/* An already open handle */
 	uint64_t	fh;