diff mbox series

[v2,18/30] virtio_fs, dax: Set up virtio_fs dax_device

Message ID 20190515192715.18000-19-vgoyal@redhat.com (mailing list archive)
State New, archived
Headers show
Series virtio-fs: shared file system for virtual machines | expand

Commit Message

Vivek Goyal May 15, 2019, 7:27 p.m. UTC
From: Stefan Hajnoczi <stefanha@redhat.com>

Setup a dax device.

Use the shm capability to find the cache entry and map it.

The DAX window is accessed by the fs/dax.c infrastructure and must have
struct pages (at least on x86).  Use devm_memremap_pages() to map the
DAX window PCI BAR and allocate struct page.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
---
 fs/fuse/fuse_i.h               |   1 +
 fs/fuse/inode.c                |   8 ++
 fs/fuse/virtio_fs.c            | 173 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/virtio_fs.h |   3 +
 4 files changed, 183 insertions(+), 2 deletions(-)

Comments

Halil Pasic July 17, 2019, 5:27 p.m. UTC | #1
On Wed, 15 May 2019 15:27:03 -0400
Vivek Goyal <vgoyal@redhat.com> wrote:

> From: Stefan Hajnoczi <stefanha@redhat.com>
> 
> Setup a dax device.
> 
> Use the shm capability to find the cache entry and map it.
> 
> The DAX window is accessed by the fs/dax.c infrastructure and must have
> struct pages (at least on x86).  Use devm_memremap_pages() to map the
> DAX window PCI BAR and allocate struct page.
>

Sorry for being this late. I don't see any more recent version so I will
comment here.

I'm trying to figure out how this is supposed to work on s390. My concern
is, that on s390 PCI memory needs to be accessed by special
instructions. This is taken care of by the stuff defined in
arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
the appropriate s390 instruction. However if the code does not use the
linux abstractions for accessing PCI memory, but assumes it can be
accessed like RAM, we have a problem.

Looking at this patch, it seems to me, that we might end up with exactly
the case described. For example AFAICT copy_to_iter() (3) resolves to
the function in lib/iov_iter.c which does not seem to cater for s390
oddities.

I didn't have the time to investigate this properly, and since virtio-fs
is virtual, we may be able to get around what is otherwise a
limitation on s390. My understanding of these areas is admittedly
shallow, and since I'm not sure I'll have much more time to
invest in the near future I decided to raise concern.

Any opinions?

[CCing some s390 people who are probably more knowledgeable than me on
these matters.]

Regards,
Halil


> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
> Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> ---

[..]
  
> +/* Map a window offset to a page frame number.  The window offset will have
> + * been produced by .iomap_begin(), which maps a file offset to a window
> + * offset.
> + */
> +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
> +				    long nr_pages, void **kaddr, pfn_t *pfn)
> +{
> +	struct virtio_fs *fs = dax_get_private(dax_dev);
> +	phys_addr_t offset = PFN_PHYS(pgoff);
> +	size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
> +
> +	if (kaddr)
> +		*kaddr = fs->window_kaddr + offset;

(2) Here we use fs->window_kaddr, basically directing the access to the
virtio shared memory region.

> +	if (pfn)
> +		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
> +					PFN_DEV | PFN_MAP);
> +	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
> +}
> +
> +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
> +				       pgoff_t pgoff, void *addr,
> +				       size_t bytes, struct iov_iter *i)
> +{
> +	return copy_from_iter(addr, bytes, i);
> +}
> +
> +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
> +				       pgoff_t pgoff, void *addr,
> +				       size_t bytes, struct iov_iter *i)
> +{
> +	return copy_to_iter(addr, bytes, i);

(3) And this should be the access to it, which does not seem to use the
s390 abstractions for accessing PCI memory.

> +}
> +
> +static const struct dax_operations virtio_fs_dax_ops = {
> +	.direct_access = virtio_fs_direct_access,
> +	.copy_from_iter = virtio_fs_copy_from_iter,
> +	.copy_to_iter = virtio_fs_copy_to_iter,
> +};
> +
> +static void virtio_fs_percpu_release(struct percpu_ref *ref)
> +{
> +	struct virtio_fs_memremap_info *mi =
> +		container_of(ref, struct virtio_fs_memremap_info, ref);
> +
> +	complete(&mi->completion);
> +}
> +
> +static void virtio_fs_percpu_exit(void *data)
> +{
> +	struct virtio_fs_memremap_info *mi = data;
> +
> +	wait_for_completion(&mi->completion);
> +	percpu_ref_exit(&mi->ref);
> +}
> +
> +static void virtio_fs_percpu_kill(struct percpu_ref *ref)
> +{
> +	percpu_ref_kill(ref);
> +}
> +
> +static void virtio_fs_cleanup_dax(void *data)
> +{
> +	struct virtio_fs *fs = data;
> +
> +	kill_dax(fs->dax_dev);
> +	put_dax(fs->dax_dev);
> +}
> +
> +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> +{
> +	struct virtio_shm_region cache_reg;
> +	struct virtio_fs_memremap_info *mi;
> +	struct dev_pagemap *pgmap;
> +	bool have_cache;
> +	int ret;
> +
> +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> +		return 0;
> +
> +	/* Get cache region */
> +	have_cache = virtio_get_shm_region(vdev,
> +					   &cache_reg,
> +					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
> +	if (!have_cache) {
> +		dev_err(&vdev->dev, "%s: No cache capability\n", __func__);
> +		return -ENXIO;
> +	} else {
> +		dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n",
> +			   cache_reg.len, cache_reg.addr);
> +	}
> +
> +	mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL);
> +	if (!mi)
> +		return -ENOMEM;
> +
> +	init_completion(&mi->completion);
> +	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
> +			      GFP_KERNEL);
> +	if (ret < 0) {
> +		dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
> +			__func__, ret);
> +		return ret;
> +	}
> +
> +	ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi);
> +	if (ret < 0) {
> +		percpu_ref_exit(&mi->ref);
> +		return ret;
> +	}
> +
> +	pgmap = &mi->pgmap;
> +	pgmap->altmap_valid = false;
> +	pgmap->ref = &mi->ref;
> +	pgmap->kill = virtio_fs_percpu_kill;
> +	pgmap->type = MEMORY_DEVICE_FS_DAX;
> +
> +	/* Ideally we would directly use the PCI BAR resource but
> +	 * devm_memremap_pages() wants its own copy in pgmap.  So
> +	 * initialize a struct resource from scratch (only the start
> +	 * and end fields will be used).
> +	 */
> +	pgmap->res = (struct resource){
> +		.name = "virtio-fs dax window",
> +		.start = (phys_addr_t) cache_reg.addr,
> +		.end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
> +	};
> +
> +	fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);

(1) Here we assign fs->window_kaddr basically from the virtio shm region.

> +	if (IS_ERR(fs->window_kaddr))
> +		return PTR_ERR(fs->window_kaddr);
> +
> +	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
> +	fs->window_len = (phys_addr_t) cache_reg.len;
> +
> +	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx"
> +		" len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr,
> +		cache_reg.len);
> +
> +	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
> +	if (!fs->dax_dev)
> +		return -ENOMEM;
> +
> +	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
> +}
> +

[..]
Cornelia Huck July 18, 2019, 9:04 a.m. UTC | #2
On Wed, 17 Jul 2019 19:27:25 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Wed, 15 May 2019 15:27:03 -0400
> Vivek Goyal <vgoyal@redhat.com> wrote:
> 
> > From: Stefan Hajnoczi <stefanha@redhat.com>
> > 
> > Setup a dax device.
> > 
> > Use the shm capability to find the cache entry and map it.
> > 
> > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > DAX window PCI BAR and allocate struct page.
> >  
> 
> Sorry for being this late. I don't see any more recent version so I will
> comment here.

[Yeah, this one has been sitting in my to-review queue far too long as
well :(]

> 
> I'm trying to figure out how this is supposed to work on s390. My concern
> is, that on s390 PCI memory needs to be accessed by special
> instructions. This is taken care of by the stuff defined in
> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> the appropriate s390 instruction. However if the code does not use the
> linux abstractions for accessing PCI memory, but assumes it can be
> accessed like RAM, we have a problem.
> 
> Looking at this patch, it seems to me, that we might end up with exactly
> the case described. For example AFAICT copy_to_iter() (3) resolves to
> the function in lib/iov_iter.c which does not seem to cater for s390
> oddities.

What about the new pci instructions recently introduced? Not sure how
they differ from the old ones (which are currently the only ones
supported in QEMU...), but I'm pretty sure they are supposed to solve
an issue :)

> 
> I didn't have the time to investigate this properly, and since virtio-fs
> is virtual, we may be able to get around what is otherwise a
> limitation on s390. My understanding of these areas is admittedly
> shallow, and since I'm not sure I'll have much more time to
> invest in the near future I decided to raise concern.
> 
> Any opinions?

Let me point to the thread starting at
https://marc.info/?l=linux-s390&m=155048406205221&w=2 as well. That
memory region stuff is still unsolved for ccw, and I'm not sure if we
need to do something for zpci as well.

Does s390 work with DAX at all? ISTR that DAX evolved from XIP, so I
thought it did?

> 
> [CCing some s390 people who are probably more knowledgeable than me on
> these matters.]
> 
> Regards,
> Halil
> 
> 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> > Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
> > Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> > ---  
> 
> [..]
>   
> > +/* Map a window offset to a page frame number.  The window offset will have
> > + * been produced by .iomap_begin(), which maps a file offset to a window
> > + * offset.
> > + */
> > +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
> > +				    long nr_pages, void **kaddr, pfn_t *pfn)
> > +{
> > +	struct virtio_fs *fs = dax_get_private(dax_dev);
> > +	phys_addr_t offset = PFN_PHYS(pgoff);
> > +	size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
> > +
> > +	if (kaddr)
> > +		*kaddr = fs->window_kaddr + offset;  
> 
> (2) Here we use fs->window_kaddr, basically directing the access to the
> virtio shared memory region.
> 
> > +	if (pfn)
> > +		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
> > +					PFN_DEV | PFN_MAP);
> > +	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
> > +}
> > +
> > +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
> > +				       pgoff_t pgoff, void *addr,
> > +				       size_t bytes, struct iov_iter *i)
> > +{
> > +	return copy_from_iter(addr, bytes, i);
> > +}
> > +
> > +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
> > +				       pgoff_t pgoff, void *addr,
> > +				       size_t bytes, struct iov_iter *i)
> > +{
> > +	return copy_to_iter(addr, bytes, i);  
> 
> (3) And this should be the access to it, which does not seem to use the
> s390 abstractions for accessing PCI memory.
> 
> > +}
> > +
> > +static const struct dax_operations virtio_fs_dax_ops = {
> > +	.direct_access = virtio_fs_direct_access,
> > +	.copy_from_iter = virtio_fs_copy_from_iter,
> > +	.copy_to_iter = virtio_fs_copy_to_iter,
> > +};
> > +
> > +static void virtio_fs_percpu_release(struct percpu_ref *ref)
> > +{
> > +	struct virtio_fs_memremap_info *mi =
> > +		container_of(ref, struct virtio_fs_memremap_info, ref);
> > +
> > +	complete(&mi->completion);
> > +}
> > +
> > +static void virtio_fs_percpu_exit(void *data)
> > +{
> > +	struct virtio_fs_memremap_info *mi = data;
> > +
> > +	wait_for_completion(&mi->completion);
> > +	percpu_ref_exit(&mi->ref);
> > +}
> > +
> > +static void virtio_fs_percpu_kill(struct percpu_ref *ref)
> > +{
> > +	percpu_ref_kill(ref);
> > +}
> > +
> > +static void virtio_fs_cleanup_dax(void *data)
> > +{
> > +	struct virtio_fs *fs = data;
> > +
> > +	kill_dax(fs->dax_dev);
> > +	put_dax(fs->dax_dev);
> > +}
> > +
> > +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > +{
> > +	struct virtio_shm_region cache_reg;
> > +	struct virtio_fs_memremap_info *mi;
> > +	struct dev_pagemap *pgmap;
> > +	bool have_cache;
> > +	int ret;
> > +
> > +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> > +		return 0;
> > +
> > +	/* Get cache region */
> > +	have_cache = virtio_get_shm_region(vdev,
> > +					   &cache_reg,
> > +					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
> > +	if (!have_cache) {
> > +		dev_err(&vdev->dev, "%s: No cache capability\n", __func__);
> > +		return -ENXIO;
> > +	} else {
> > +		dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n",
> > +			   cache_reg.len, cache_reg.addr);
> > +	}
> > +
> > +	mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL);
> > +	if (!mi)
> > +		return -ENOMEM;
> > +
> > +	init_completion(&mi->completion);
> > +	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
> > +			      GFP_KERNEL);
> > +	if (ret < 0) {
> > +		dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
> > +			__func__, ret);
> > +		return ret;
> > +	}
> > +
> > +	ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi);
> > +	if (ret < 0) {
> > +		percpu_ref_exit(&mi->ref);
> > +		return ret;
> > +	}
> > +
> > +	pgmap = &mi->pgmap;
> > +	pgmap->altmap_valid = false;
> > +	pgmap->ref = &mi->ref;
> > +	pgmap->kill = virtio_fs_percpu_kill;
> > +	pgmap->type = MEMORY_DEVICE_FS_DAX;
> > +
> > +	/* Ideally we would directly use the PCI BAR resource but
> > +	 * devm_memremap_pages() wants its own copy in pgmap.  So
> > +	 * initialize a struct resource from scratch (only the start
> > +	 * and end fields will be used).
> > +	 */
> > +	pgmap->res = (struct resource){
> > +		.name = "virtio-fs dax window",
> > +		.start = (phys_addr_t) cache_reg.addr,
> > +		.end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
> > +	};
> > +
> > +	fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);  
> 
> (1) Here we assign fs->window_kaddr basically from the virtio shm region.
> 
> > +	if (IS_ERR(fs->window_kaddr))
> > +		return PTR_ERR(fs->window_kaddr);
> > +
> > +	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
> > +	fs->window_len = (phys_addr_t) cache_reg.len;
> > +
> > +	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx"
> > +		" len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr,
> > +		cache_reg.len);
> > +
> > +	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
> > +	if (!fs->dax_dev)
> > +		return -ENOMEM;
> > +
> > +	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
> > +}
> > +  
> 
> [..]
>
Halil Pasic July 18, 2019, 11:20 a.m. UTC | #3
On Thu, 18 Jul 2019 11:04:17 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

> On Wed, 17 Jul 2019 19:27:25 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Wed, 15 May 2019 15:27:03 -0400
> > Vivek Goyal <vgoyal@redhat.com> wrote:
> > 
> > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > > 
> > > Setup a dax device.
> > > 
> > > Use the shm capability to find the cache entry and map it.
> > > 
> > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > > DAX window PCI BAR and allocate struct page.
> > >  
> > 
> > Sorry for being this late. I don't see any more recent version so I will
> > comment here.
> 
> [Yeah, this one has been sitting in my to-review queue far too long as
> well :(]
> 
> > 
> > I'm trying to figure out how this is supposed to work on s390. My concern
> > is, that on s390 PCI memory needs to be accessed by special
> > instructions. This is taken care of by the stuff defined in
> > arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> > the appropriate s390 instruction. However if the code does not use the
> > linux abstractions for accessing PCI memory, but assumes it can be
> > accessed like RAM, we have a problem.
> > 
> > Looking at this patch, it seems to me, that we might end up with exactly
> > the case described. For example AFAICT copy_to_iter() (3) resolves to
> > the function in lib/iov_iter.c which does not seem to cater for s390
> > oddities.
> 
> What about the new pci instructions recently introduced? Not sure how
> they differ from the old ones (which are currently the only ones
> supported in QEMU...), but I'm pretty sure they are supposed to solve
> an issue :)
> 

I'm struggling to find the connection between this topic and the new pci
instructions. Can you please explain in more detail?

> > 
> > I didn't have the time to investigate this properly, and since virtio-fs
> > is virtual, we may be able to get around what is otherwise a
> > limitation on s390. My understanding of these areas is admittedly
> > shallow, and since I'm not sure I'll have much more time to
> > invest in the near future I decided to raise concern.
> > 
> > Any opinions?
> 
> Let me point to the thread starting at
> https://marc.info/?l=linux-s390&m=155048406205221&w=2 as well. That
> memory region stuff is still unsolved for ccw, and I'm not sure if we
> need to do something for zpci as well.
> 

Right virtio-ccw is another problem, but at least there we don't have the
need to limit ourselves to a very specific set of instructions (for
accessing memory).

zPCI i.e. virtio-pci on z should require much less dedicated love if any
at all. Unfortunately I'm not very knowledgeable on either PCI in general
or its s390 variant.

> Does s390 work with DAX at all? ISTR that DAX evolved from XIP, so I
> thought it did?
> 

Documentation/filesystems/dax.txt even mentions dcssblk: s390 dcss block
device driver as a source of inspiration. So I suppose it does work.

Regards,
Halil

> > 
> > [CCing some s390 people who are probably more knowledgeable than me
> > on these matters.]
> > 
> > Regards,
> > Halil
> > 
> > 
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> > > Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
> > > Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
> > > ---  
> > 
> > [..]
> >   
> > > +/* Map a window offset to a page frame number.  The window offset
> > > will have
> > > + * been produced by .iomap_begin(), which maps a file offset to a
> > > window
> > > + * offset.
> > > + */
> > > +static long virtio_fs_direct_access(struct dax_device *dax_dev,
> > > pgoff_t pgoff,
> > > +				    long nr_pages, void **kaddr,
> > > pfn_t *pfn) +{
> > > +	struct virtio_fs *fs = dax_get_private(dax_dev);
> > > +	phys_addr_t offset = PFN_PHYS(pgoff);
> > > +	size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
> > > +
> > > +	if (kaddr)
> > > +		*kaddr = fs->window_kaddr + offset;  
> > 
> > (2) Here we use fs->window_kaddr, basically directing the access to
> > the virtio shared memory region.
> > 
> > > +	if (pfn)
> > > +		*pfn = phys_to_pfn_t(fs->window_phys_addr +
> > > offset,
> > > +					PFN_DEV | PFN_MAP);
> > > +	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
> > > +}
> > > +
> > > +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
> > > +				       pgoff_t pgoff, void *addr,
> > > +				       size_t bytes, struct
> > > iov_iter *i) +{
> > > +	return copy_from_iter(addr, bytes, i);
> > > +}
> > > +
> > > +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
> > > +				       pgoff_t pgoff, void *addr,
> > > +				       size_t bytes, struct
> > > iov_iter *i) +{
> > > +	return copy_to_iter(addr, bytes, i);  
> > 
> > (3) And this should be the access to it, which does not seem to use the
> > s390 abstractions for accessing PCI memory.
> > 
> > > +}
> > > +
> > > +static const struct dax_operations virtio_fs_dax_ops = {
> > > +	.direct_access = virtio_fs_direct_access,
> > > +	.copy_from_iter = virtio_fs_copy_from_iter,
> > > +	.copy_to_iter = virtio_fs_copy_to_iter,
> > > +};
> > > +
> > > +static void virtio_fs_percpu_release(struct percpu_ref *ref)
> > > +{
> > > +	struct virtio_fs_memremap_info *mi =
> > > +		container_of(ref, struct virtio_fs_memremap_info,
> > > ref); +
> > > +	complete(&mi->completion);
> > > +}
> > > +
> > > +static void virtio_fs_percpu_exit(void *data)
> > > +{
> > > +	struct virtio_fs_memremap_info *mi = data;
> > > +
> > > +	wait_for_completion(&mi->completion);
> > > +	percpu_ref_exit(&mi->ref);
> > > +}
> > > +
> > > +static void virtio_fs_percpu_kill(struct percpu_ref *ref)
> > > +{
> > > +	percpu_ref_kill(ref);
> > > +}
> > > +
> > > +static void virtio_fs_cleanup_dax(void *data)
> > > +{
> > > +	struct virtio_fs *fs = data;
> > > +
> > > +	kill_dax(fs->dax_dev);
> > > +	put_dax(fs->dax_dev);
> > > +}
> > > +
> > > +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct
> > > virtio_fs *fs) +{
> > > +	struct virtio_shm_region cache_reg;
> > > +	struct virtio_fs_memremap_info *mi;
> > > +	struct dev_pagemap *pgmap;
> > > +	bool have_cache;
> > > +	int ret;
> > > +
> > > +	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
> > > +		return 0;
> > > +
> > > +	/* Get cache region */
> > > +	have_cache = virtio_get_shm_region(vdev,
> > > +					   &cache_reg,
> > > +
> > > (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
> > > +	if (!have_cache) {
> > > +		dev_err(&vdev->dev, "%s: No cache capability\n",
> > > __func__);
> > > +		return -ENXIO;
> > > +	} else {
> > > +		dev_notice(&vdev->dev, "Cache len: 0x%llx @
> > > 0x%llx\n",
> > > +			   cache_reg.len, cache_reg.addr);
> > > +	}
> > > +
> > > +	mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL);
> > > +	if (!mi)
> > > +		return -ENOMEM;
> > > +
> > > +	init_completion(&mi->completion);
> > > +	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release,
> > > 0,
> > > +			      GFP_KERNEL);
> > > +	if (ret < 0) {
> > > +		dev_err(&vdev->dev, "%s: percpu_ref_init failed
> > > (%d)\n",
> > > +			__func__, ret);
> > > +		return ret;
> > > +	}
> > > +
> > > +	ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit,
> > > mi);
> > > +	if (ret < 0) {
> > > +		percpu_ref_exit(&mi->ref);
> > > +		return ret;
> > > +	}
> > > +
> > > +	pgmap = &mi->pgmap;
> > > +	pgmap->altmap_valid = false;
> > > +	pgmap->ref = &mi->ref;
> > > +	pgmap->kill = virtio_fs_percpu_kill;
> > > +	pgmap->type = MEMORY_DEVICE_FS_DAX;
> > > +
> > > +	/* Ideally we would directly use the PCI BAR resource but
> > > +	 * devm_memremap_pages() wants its own copy in pgmap.  So
> > > +	 * initialize a struct resource from scratch (only the
> > > start
> > > +	 * and end fields will be used).
> > > +	 */
> > > +	pgmap->res = (struct resource){
> > > +		.name = "virtio-fs dax window",
> > > +		.start = (phys_addr_t) cache_reg.addr,
> > > +		.end = (phys_addr_t) cache_reg.addr +
> > > cache_reg.len - 1,
> > > +	};
> > > +
> > > +	fs->window_kaddr = devm_memremap_pages(&vdev->dev,
> > > pgmap);  
> > 
> > (1) Here we assign fs->window_kaddr basically from the virtio shm
> > region.
> > 
> > > +	if (IS_ERR(fs->window_kaddr))
> > > +		return PTR_ERR(fs->window_kaddr);
> > > +
> > > +	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
> > > +	fs->window_len = (phys_addr_t) cache_reg.len;
> > > +
> > > +	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr
> > > 0x%llx"
> > > +		" len 0x%llx\n", __func__, fs->window_kaddr,
> > > cache_reg.addr,
> > > +		cache_reg.len);
> > > +
> > > +	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
> > > +	if (!fs->dax_dev)
> > > +		return -ENOMEM;
> > > +
> > > +	return devm_add_action_or_reset(&vdev->dev,
> > > virtio_fs_cleanup_dax, fs); +}
> > > +  
> > 
> > [..]
> > 
>
Vivek Goyal July 18, 2019, 1:15 p.m. UTC | #4
On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:
> On Wed, 15 May 2019 15:27:03 -0400
> Vivek Goyal <vgoyal@redhat.com> wrote:
> 
> > From: Stefan Hajnoczi <stefanha@redhat.com>
> > 
> > Setup a dax device.
> > 
> > Use the shm capability to find the cache entry and map it.
> > 
> > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > DAX window PCI BAR and allocate struct page.
> >
> 
> Sorry for being this late. I don't see any more recent version so I will
> comment here.
> 
> I'm trying to figure out how this is supposed to work on s390. My concern
> is, that on s390 PCI memory needs to be accessed by special
> instructions. This is taken care of by the stuff defined in
> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> the appropriate s390 instruction. However if the code does not use the
> linux abstractions for accessing PCI memory, but assumes it can be
> accessed like RAM, we have a problem.
> 
> Looking at this patch, it seems to me, that we might end up with exactly
> the case described. For example AFAICT copy_to_iter() (3) resolves to
> the function in lib/iov_iter.c which does not seem to cater for s390
> oddities.
> 
> I didn't have the time to investigate this properly, and since virtio-fs
> is virtual, we may be able to get around what is otherwise a
> limitation on s390. My understanding of these areas is admittedly
> shallow, and since I'm not sure I'll have much more time to
> invest in the near future I decided to raise concern.
> 
> Any opinions?

Hi Halil,

I don't understand s390 and how PCI works there as well. Is there any
other transport we can use there to map IO memory directly and access
using DAX?

BTW, is DAX supported for s390?

I am also hoping somebody who knows better can chip in. Till that time,
we could still use virtio-fs on s390 without DAX.

Thanks
Vivek
Dan Williams July 18, 2019, 2:30 p.m. UTC | #5
On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:
> > On Wed, 15 May 2019 15:27:03 -0400
> > Vivek Goyal <vgoyal@redhat.com> wrote:
> >
> > > From: Stefan Hajnoczi <stefanha@redhat.com>
> > >
> > > Setup a dax device.
> > >
> > > Use the shm capability to find the cache entry and map it.
> > >
> > > The DAX window is accessed by the fs/dax.c infrastructure and must have
> > > struct pages (at least on x86).  Use devm_memremap_pages() to map the
> > > DAX window PCI BAR and allocate struct page.
> > >
> >
> > Sorry for being this late. I don't see any more recent version so I will
> > comment here.
> >
> > I'm trying to figure out how is this supposed to work on s390. My concern
> > is, that on s390 PCI memory needs to be accessed by special
> > instructions. This is taken care of by the stuff defined in
> > arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> > the appropriate s390 instruction. However if the code does not use the
> > linux abstractions for accessing PCI memory, but assumes it can be
> > accessed like RAM, we have a problem.
> >
> > Looking at this patch, it seems to me, that we might end up with exactly
> > the case described. For example AFAICT copy_to_iter() (3) resolves to
> > the function in lib/iov_iter.c which does not seem to cater for s390
> > oddities.
> >
> > I didn't have the time to investigate this properly, and since virtio-fs
> > is virtual, we may be able to get around what is otherwise a
> > limitation on s390. My understanding of these areas is admittedly
> > shallow, and since I'm not sure I'll have much more time to
> > invest in the near future I decided to raise concern.
> >
> > Any opinions?
>
> Hi Halil,
>
> I don't understand s390 and how PCI works there as well. Is there any
> other transport we can use there to map IO memory directly and access
> using DAX?
>
> BTW, is DAX supported for s390.
>
> I am also hoping somebody who knows better can chip in. Till that time,
> we could still use virtio-fs on s390 without DAX.

s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
In practice that means that support for PTE_DEVMAP is missing which
means no get_user_pages() support for dax mappings. Effectively it's
only useful for execute-in-place as operations like fork() and ptrace
of dax mappings will fail.
Cornelia Huck July 18, 2019, 2:47 p.m. UTC | #6
On Thu, 18 Jul 2019 13:20:49 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Thu, 18 Jul 2019 11:04:17 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Wed, 17 Jul 2019 19:27:25 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:

> > > I'm trying to figure out how is this supposed to work on s390. My concern
> > > is, that on s390 PCI memory needs to be accessed by special
> > > instructions. This is taken care of by the stuff defined in
> > > arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> > > the appropriate s390 instruction. However if the code does not use the
> > > linux abstractions for accessing PCI memory, but assumes it can be
> > > accessed like RAM, we have a problem.
> > > 
> > > Looking at this patch, it seems to me, that we might end up with exactly
> > > the case described. For example AFAICT copy_to_iter() (3) resolves to
> > > the function in lib/iov_iter.c which does not seem to cater for s390
> > > oddities.  
> > 
> > What about the new pci instructions recently introduced? Not sure how
> > they differ from the old ones (which are currently the only ones
> > supported in QEMU...), but I'm pretty sure they are supposed to solve
> > an issue :)
> >   
> 
> I'm struggling to find the connection between this topic and the new pci
> instructions. Can you please explain in more detail?

The problem is that I'm lacking detail myself... if the new approach is
handling some things substantially differently (e.g. you set up
something and then do read/writes instead of going through
instructions), things will probably work out differently.

> 
> > > 
> > > I didn't have the time to investigate this properly, and since virtio-fs
> > > is virtual, we may be able to get around what is otherwise a
> > > limitation on s390. My understanding of these areas is admittedly
> > > shallow, and since I'm not sure I'll have much more time to
> > > invest in the near future I decided to raise concern.
> > > 
> > > Any opinions?  
> > 
> > Let me point to the thread starting at
> > https://marc.info/?l=linux-s390&m=155048406205221&w=2 as well. That
> > memory region stuff is still unsolved for ccw, and I'm not sure if we
> > need to do something for zpci as well.
> >   
> 
> Right virtio-ccw is another problem, but at least there we don't have the
> need to limit ourselves to a very specific set of instructions (for
> accessing memory).
> 
> zPCI i.e. virtio-pci on z should require much less dedicated love if any

s/virtio-pci/pci/

> at all. Unfortunately I'm not very knowledgeable on either PCI in general
> or its s390 variant.

Right, the biggest issue with zpci and shared regions is the
interaction with ccw using shared regions as well.

Unfortunately, I can't judge any zpci details from here, either :(

If virtio-fs is working in its non-dax version, we'll at least have
something on s390. (Has anyone tried that, btw?) It seems that s390 is
only supporting a limited subset of dax anyway.
Christian Borntraeger July 22, 2019, 10:51 a.m. UTC | #7
On 18.07.19 16:30, Dan Williams wrote:
> On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>>
>> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:
>>> On Wed, 15 May 2019 15:27:03 -0400
>>> Vivek Goyal <vgoyal@redhat.com> wrote:
>>>
>>>> From: Stefan Hajnoczi <stefanha@redhat.com>
>>>>
>>>> Setup a dax device.
>>>>
>>>> Use the shm capability to find the cache entry and map it.
>>>>
>>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
>>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
>>>> DAX window PCI BAR and allocate struct page.
>>>>
>>>
>>> Sorry for being this late. I don't see any more recent version so I will
>>> comment here.
>>>
>>> I'm trying to figure out how is this supposed to work on s390. My concern
>>> is, that on s390 PCI memory needs to be accessed by special
>>> instructions. This is taken care of by the stuff defined in
>>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
>>> the appropriate s390 instruction. However if the code does not use the
>>> linux abstractions for accessing PCI memory, but assumes it can be
>>> accessed like RAM, we have a problem.
>>>
>>> Looking at this patch, it seems to me, that we might end up with exactly
>>> the case described. For example AFAICT copy_to_iter() (3) resolves to
>>> the function in lib/iov_iter.c which does not seem to cater for s390
>>> oddities.
>>>
>>> I didn't have the time to investigate this properly, and since virtio-fs
>>> is virtual, we may be able to get around what is otherwise a
>>> limitation on s390. My understanding of these areas is admittedly
>>> shallow, and since I'm not sure I'll have much more time to
>>> invest in the near future I decided to raise concern.
>>>
>>> Any opinions?
>>
>> Hi Halil,
>>
>> I don't understand s390 and how PCI works there as well. Is there any
>> other transport we can use there to map IO memory directly and access
>> using DAX?
>>
>> BTW, is DAX supported for s390.
>>
>> I am also hoping somebody who knows better can chip in. Till that time,
>> we could still use virtio-fs on s390 without DAX.
> 
> s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
> In practice that means that support for PTE_DEVMAP is missing which
> means no get_user_pages() support for dax mappings. Effectively it's
> only useful for execute-in-place as operations like fork() and ptrace
> of dax mappings will fail.


This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
and arch/s390/mm/extmem.c). 

For what it's worth, the dcssblk looks to Linux like normal memory (just above the
previously detected memory) that can be used like normal memory. In previous time
we even had struct pages for this memory - this was removed long ago (when it was
still xip) to reduce the memory footprint for large dcss blocks and small memory
guests.
Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?

Now some observations: 
- dcssblk is z/VM only (not KVM)
- Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on whether
  a device driver is compiled in or not seems not flexible enough in case you
  have a device driver that does have struct pages and another one that doesn't
- I do not see a reason why we should not be able to map anything from QEMU
  into the guest real memory via an additional KVM memory slot. 
  We would need to handle that in the guest somehow (and not as a PCI bar),
  register this with struct pages etc.
- we must then look how we can create the link between the guest memory and the
  virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
  whatever. Maybe we could also piggy-back on some memory hotplug work from David
  Hildenbrand (add cc).

Regarding limitations on the platform:
- while we do have PCI, the virtio devices are usually plugged via the ccw bus.
  That implies no PCI bars. I assume you use those PCI bars only to implicitly 
  have the location of the shared memory
  Correct?
- no real memory mapped I/O. Instead there are instructions that work on the mmio.
  As I understand things, this is of no concern regarding virtio-fs as you do not
  need mmio in the sense that a memory access of the guest to such an address 
  triggers an exit. You just need the shared memory as a mean to have the data
  inside the guest. Any notification is done via normal virtqueue mechanisms
  Correct?


Adding Heiko, maybe he remembers some details of the dcssblk/xip history.
Dr. David Alan Gilbert July 22, 2019, 10:56 a.m. UTC | #8
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> 
> 
> On 18.07.19 16:30, Dan Williams wrote:
> > On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> >>
> >> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:
> >>> On Wed, 15 May 2019 15:27:03 -0400
> >>> Vivek Goyal <vgoyal@redhat.com> wrote:
> >>>
> >>>> From: Stefan Hajnoczi <stefanha@redhat.com>
> >>>>
> >>>> Setup a dax device.
> >>>>
> >>>> Use the shm capability to find the cache entry and map it.
> >>>>
> >>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
> >>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
> >>>> DAX window PCI BAR and allocate struct page.
> >>>>
> >>>
> >>> Sorry for being this late. I don't see any more recent version so I will
> >>> comment here.
> >>>
> >>> I'm trying to figure out how is this supposed to work on s390. My concern
> >>> is, that on s390 PCI memory needs to be accessed by special
> >>> instructions. This is taken care of by the stuff defined in
> >>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> >>> the appropriate s390 instruction. However if the code does not use the
> >>> linux abstractions for accessing PCI memory, but assumes it can be
> >>> accessed like RAM, we have a problem.
> >>>
> >>> Looking at this patch, it seems to me, that we might end up with exactly
> >>> the case described. For example AFAICT copy_to_iter() (3) resolves to
> >>> the function in lib/iov_iter.c which does not seem to cater for s390
> >>> oddities.
> >>>
> >>> I didn't have the time to investigate this properly, and since virtio-fs
> >>> is virtual, we may be able to get around what is otherwise a
> >>> limitation on s390. My understanding of these areas is admittedly
> >>> shallow, and since I'm not sure I'll have much more time to
> >>> invest in the near future I decided to raise concern.
> >>>
> >>> Any opinions?
> >>
> >> Hi Halil,
> >>
> >> I don't understand s390 and how PCI works there as well. Is there any
> >> other transport we can use there to map IO memory directly and access
> >> using DAX?
> >>
> >> BTW, is DAX supported for s390.
> >>
> >> I am also hoping somebody who knows better can chip in. Till that time,
> >> we could still use virtio-fs on s390 without DAX.
> > 
> > s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
> > In practice that means that support for PTE_DEVMAP is missing which
> > means no get_user_pages() support for dax mappings. Effectively it's
> > only useful for execute-in-place as operations like fork() and ptrace
> > of dax mappings will fail.
> 
> 
> This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
> and arch/s390/mm/extmem.c). 
> 
> For what its worth, the dcssblk looks to Linux like normal memory (just above the
> previously detected memory) that can be used like normal memory. In previous time
> we even had struct pages for this memory - this was removed long ago (when it was
> still xip) to reduce the memory footprint for large dcss blocks and small memory
> guests.
> Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?
> 
> Now some observations: 
> - dcssblk is z/VM only (not KVM)
> - Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on wether
>   a device driver is compiled in or not seems not flexible enough in case if you
>   have device driver that does have struct pages and another one that doesn't
> - I do not see a reason why we should not be able to map anything from QEMU
>   into the guest real memory via an additional KVM memory slot. 
>   We would need to handle that in the guest somehow (and not as a PCI bar),
>   register this with struct pages etc.
> - we must then look how we can create the link between the guest memory and the
>   virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
>   whatever. Maybe we could also piggy-back on some memory hotplug work from David
>   Hildenbrand (add cc).
> 
> Regarding limitations on the platform:
> - while we do have PCI, the virtio devices are usually plugged via the ccw bus.
>   That implies no PCI bars. I assume you use those PCI bars only to implicitely 
>   have the location of the shared memory
>   Correct?

Right.

> - no real memory mapped I/O. Instead there are instructions that work on the mmio.
>   As I understand things, this is of no concern regarding virtio-fs as you do not
>   need mmio in the sense that a memory access of the guest to such an address 
>   triggers an exit. You just need the shared memory as a mean to have the data
>   inside the guest. Any notification is done via normal virtqueue mechanisms
>   Correct?

Yep.

> 
> Adding Heiko, maybe he remembers some details of the dcssblk/xip history.
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger July 22, 2019, 11:20 a.m. UTC | #9
On 22.07.19 12:56, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>
>>
>> On 18.07.19 16:30, Dan Williams wrote:
>>> On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>>>>
>>>> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:
>>>>> On Wed, 15 May 2019 15:27:03 -0400
>>>>> Vivek Goyal <vgoyal@redhat.com> wrote:
>>>>>
>>>>>> From: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>>
>>>>>> Setup a dax device.
>>>>>>
>>>>>> Use the shm capability to find the cache entry and map it.
>>>>>>
>>>>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
>>>>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
>>>>>> DAX window PCI BAR and allocate struct page.
>>>>>>
>>>>>
>>>>> Sorry for being this late. I don't see any more recent version so I will
>>>>> comment here.
>>>>>
>>>>> I'm trying to figure out how is this supposed to work on s390. My concern
>>>>> is, that on s390 PCI memory needs to be accessed by special
>>>>> instructions. This is taken care of by the stuff defined in
>>>>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
>>>>> the appropriate s390 instruction. However if the code does not use the
>>>>> linux abstractions for accessing PCI memory, but assumes it can be
>>>>> accessed like RAM, we have a problem.
>>>>>
>>>>> Looking at this patch, it seems to me, that we might end up with exactly
>>>>> the case described. For example AFAICT copy_to_iter() (3) resolves to
>>>>> the function in lib/iov_iter.c which does not seem to cater for s390
>>>>> oddities.
>>>>>
>>>>> I didn't have the time to investigate this properly, and since virtio-fs
>>>>> is virtual, we may be able to get around what is otherwise a
>>>>> limitation on s390. My understanding of these areas is admittedly
>>>>> shallow, and since I'm not sure I'll have much more time to
>>>>> invest in the near future I decided to raise concern.
>>>>>
>>>>> Any opinions?
>>>>
>>>> Hi Halil,
>>>>
>>>> I don't understand s390 and how PCI works there as well. Is there any
>>>> other transport we can use there to map IO memory directly and access
>>>> using DAX?
>>>>
>>>> BTW, is DAX supported for s390.
>>>>
>>>> I am also hoping somebody who knows better can chip in. Till that time,
>>>> we could still use virtio-fs on s390 without DAX.
>>>
>>> s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
>>> In practice that means that support for PTE_DEVMAP is missing which
>>> means no get_user_pages() support for dax mappings. Effectively it's
>>> only useful for execute-in-place as operations like fork() and ptrace
>>> of dax mappings will fail.
>>
>>
>> This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
>> and arch/s390/mm/extmem.c). 
>>
>> For what its worth, the dcssblk looks to Linux like normal memory (just above the
>> previously detected memory) that can be used like normal memory. In previous time
>> we even had struct pages for this memory - this was removed long ago (when it was
>> still xip) to reduce the memory footprint for large dcss blocks and small memory
>> guests.
>> Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?
>>
>> Now some observations: 
>> - dcssblk is z/VM only (not KVM)
>> - Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on wether
>>   a device driver is compiled in or not seems not flexible enough in case if you
>>   have device driver that does have struct pages and another one that doesn't
>> - I do not see a reason why we should not be able to map anything from QEMU
>>   into the guest real memory via an additional KVM memory slot. 
>>   We would need to handle that in the guest somehow (and not as a PCI bar),
>>   register this with struct pages etc.
>> - we must then look how we can create the link between the guest memory and the
>>   virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
>>   whatever. Maybe we could also piggy-back on some memory hotplug work from David
>>   Hildenbrand (add cc).
>>
>> Regarding limitations on the platform:
>> - while we do have PCI, the virtio devices are usually plugged via the ccw bus.
>>   That implies no PCI bars. I assume you use those PCI bars only to implicitely 
>>   have the location of the shared memory
>>   Correct?
> 
> Right.

So in essence we just have to provide a vm_get_shm_region callback in the virtio-ccw
guest code?

How many regions do we have to support? One region per device? Or many?
Even if we need more, this should be possible with 2 new CCWs, e.g. READ_SHM_BASE(id)
and READ_SHM_SIZE(id)


> 
>> - no real memory mapped I/O. Instead there are instructions that work on the mmio.
>>   As I understand things, this is of no concern regarding virtio-fs as you do not
>>   need mmio in the sense that a memory access of the guest to such an address 
>>   triggers an exit. You just need the shared memory as a mean to have the data
>>   inside the guest. Any notification is done via normal virtqueue mechanisms
>>   Correct?
> 
> Yep.
Cornelia Huck July 22, 2019, 11:43 a.m. UTC | #10
On Mon, 22 Jul 2019 13:20:18 +0200
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> On 22.07.19 12:56, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntraeger@de.ibm.com) wrote:  
> >>
> >>
> >> On 18.07.19 16:30, Dan Williams wrote:  
> >>> On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:  
> >>>>
> >>>> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:  
> >>>>> On Wed, 15 May 2019 15:27:03 -0400
> >>>>> Vivek Goyal <vgoyal@redhat.com> wrote:
> >>>>>  
> >>>>>> From: Stefan Hajnoczi <stefanha@redhat.com>
> >>>>>>
> >>>>>> Setup a dax device.
> >>>>>>
> >>>>>> Use the shm capability to find the cache entry and map it.
> >>>>>>
> >>>>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
> >>>>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
> >>>>>> DAX window PCI BAR and allocate struct page.
> >>>>>>  
> >>>>>
> >>>>> Sorry for being this late. I don't see any more recent version so I will
> >>>>> comment here.
> >>>>>
> >>>>> I'm trying to figure out how is this supposed to work on s390. My concern
> >>>>> is, that on s390 PCI memory needs to be accessed by special
> >>>>> instructions. This is taken care of by the stuff defined in
> >>>>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
> >>>>> the appropriate s390 instruction. However if the code does not use the
> >>>>> linux abstractions for accessing PCI memory, but assumes it can be
> >>>>> accessed like RAM, we have a problem.
> >>>>>
> >>>>> Looking at this patch, it seems to me, that we might end up with exactly
> >>>>> the case described. For example AFAICT copy_to_iter() (3) resolves to
> >>>>> the function in lib/iov_iter.c which does not seem to cater for s390
> >>>>> oddities.
> >>>>>
> >>>>> I didn't have the time to investigate this properly, and since virtio-fs
> >>>>> is virtual, we may be able to get around what is otherwise a
> >>>>> limitation on s390. My understanding of these areas is admittedly
> >>>>> shallow, and since I'm not sure I'll have much more time to
> >>>>> invest in the near future I decided to raise concern.
> >>>>>
> >>>>> Any opinions?  
> >>>>
> >>>> Hi Halil,
> >>>>
> >>>> I don't understand s390 and how PCI works there as well. Is there any
> >>>> other transport we can use there to map IO memory directly and access
> >>>> using DAX?
> >>>>
> >>>> BTW, is DAX supported for s390.
> >>>>
> >>>> I am also hoping somebody who knows better can chip in. Till that time,
> >>>> we could still use virtio-fs on s390 without DAX.  
> >>>
> >>> s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
> >>> In practice that means that support for PTE_DEVMAP is missing which
> >>> means no get_user_pages() support for dax mappings. Effectively it's
> >>> only useful for execute-in-place as operations like fork() and ptrace
> >>> of dax mappings will fail.  
> >>
> >>
> >> This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
> >> and arch/s390/mm/extmem.c). 
> >>
> >> For what its worth, the dcssblk looks to Linux like normal memory (just above the
> >> previously detected memory) that can be used like normal memory. In previous time
> >> we even had struct pages for this memory - this was removed long ago (when it was
> >> still xip) to reduce the memory footprint for large dcss blocks and small memory
> >> guests.
> >> Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?
> >>
> >> Now some observations: 
> >> - dcssblk is z/VM only (not KVM)
> >> - Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on wether
> >>   a device driver is compiled in or not seems not flexible enough in case if you
> >>   have device driver that does have struct pages and another one that doesn't
> >> - I do not see a reason why we should not be able to map anything from QEMU
> >>   into the guest real memory via an additional KVM memory slot. 
> >>   We would need to handle that in the guest somehow (and not as a PCI bar),
> >>   register this with struct pages etc.

You mean for ccw, right? I don't think we want pci to behave
differently than everywhere else.

> >> - we must then look how we can create the link between the guest memory and the
> >>   virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
> >>   whatever. Maybe we could also piggy-back on some memory hotplug work from David
> >>   Hildenbrand (add cc).
> >>
> >> Regarding limitations on the platform:
> >> - while we do have PCI, the virtio devices are usually plugged via the ccw bus.
> >>   That implies no PCI bars. I assume you use those PCI bars only to implicitely 
> >>   have the location of the shared memory
> >>   Correct?  
> > 
> > Right.  
> 
> So in essence we just have to provide a vm_get_shm_region callback in the virtio-ccw
> guest code?
> 
> How many regions do we have to support? One region per device? Or many?
> Even if we need more, this should be possible with a 2 new CCWs, e.g READ_SHM_BASE(id)
> and READ_SHM_SIZE(id)

I'd just add a single CCW with a control block containing id and size.

The main issue is where we put those regions, and what happens if we
use both virtio-pci and virtio-ccw on the same machine.

> 
> 
> >   
> >> - no real memory mapped I/O. Instead there are instructions that work on the mmio.
> >>   As I understand things, this is of no concern regarding virtio-fs as you do not
> >>   need mmio in the sense that a memory access of the guest to such an address 
> >>   triggers an exit. You just need the shared memory as a mean to have the data
> >>   inside the guest. Any notification is done via normal virtqueue mechanisms
> >>   Correct?  
> > 
> > Yep.  
>
Christian Borntraeger July 22, 2019, noon UTC | #11
On 22.07.19 13:43, Cornelia Huck wrote:
> On Mon, 22 Jul 2019 13:20:18 +0200
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> On 22.07.19 12:56, Dr. David Alan Gilbert wrote:
>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:  
>>>>
>>>>
>>>> On 18.07.19 16:30, Dan Williams wrote:  
>>>>> On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:  
>>>>>>
>>>>>> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:  
>>>>>>> On Wed, 15 May 2019 15:27:03 -0400
>>>>>>> Vivek Goyal <vgoyal@redhat.com> wrote:
>>>>>>>  
>>>>>>>> From: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>>>>
>>>>>>>> Setup a dax device.
>>>>>>>>
>>>>>>>> Use the shm capability to find the cache entry and map it.
>>>>>>>>
>>>>>>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
>>>>>>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
>>>>>>>> DAX window PCI BAR and allocate struct page.
>>>>>>>>  
>>>>>>>
>>>>>>> Sorry for being this late. I don't see any more recent version so I will
>>>>>>> comment here.
>>>>>>>
>>>>>>> I'm trying to figure out how is this supposed to work on s390. My concern
>>>>>>> is, that on s390 PCI memory needs to be accessed by special
>>>>>>> instructions. This is taken care of by the stuff defined in
>>>>>>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
>>>>>>> the appropriate s390 instruction. However if the code does not use the
>>>>>>> linux abstractions for accessing PCI memory, but assumes it can be
>>>>>>> accessed like RAM, we have a problem.
>>>>>>>
>>>>>>> Looking at this patch, it seems to me, that we might end up with exactly
>>>>>>> the case described. For example AFAICT copy_to_iter() (3) resolves to
>>>>>>> the function in lib/iov_iter.c which does not seem to cater for s390
>>>>>>> oddities.
>>>>>>>
>>>>>>> I didn't have the time to investigate this properly, and since virtio-fs
>>>>>>> is virtual, we may be able to get around what is otherwise a
>>>>>>> limitation on s390. My understanding of these areas is admittedly
>>>>>>> shallow, and since I'm not sure I'll have much more time to
>>>>>>> invest in the near future I decided to raise concern.
>>>>>>>
>>>>>>> Any opinions?  
>>>>>>
>>>>>> Hi Halil,
>>>>>>
>>>>>> I don't understand s390 and how PCI works there as well. Is there any
>>>>>> other transport we can use there to map IO memory directly and access
>>>>>> using DAX?
>>>>>>
>>>>>> BTW, is DAX supported for s390.
>>>>>>
>>>>>> I am also hoping somebody who knows better can chip in. Till that time,
>>>>>> we could still use virtio-fs on s390 without DAX.  
>>>>>
>>>>> s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
>>>>> In practice that means that support for PTE_DEVMAP is missing which
>>>>> means no get_user_pages() support for dax mappings. Effectively it's
>>>>> only useful for execute-in-place as operations like fork() and ptrace
>>>>> of dax mappings will fail.  
>>>>
>>>>
>>>> This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
>>>> and arch/s390/mm/extmem.c). 
>>>>
>>>> For what its worth, the dcssblk looks to Linux like normal memory (just above the
>>>> previously detected memory) that can be used like normal memory. In previous time
>>>> we even had struct pages for this memory - this was removed long ago (when it was
>>>> still xip) to reduce the memory footprint for large dcss blocks and small memory
>>>> guests.
>>>> Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?
>>>>
>>>> Now some observations: 
>>>> - dcssblk is z/VM only (not KVM)
>>>>> - Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on whether
>>>>   a device driver is compiled in or not seems not flexible enough in case if you
>>>>   have device driver that does have struct pages and another one that doesn't
>>>> - I do not see a reason why we should not be able to map anything from QEMU
>>>>   into the guest real memory via an additional KVM memory slot. 
>>>>   We would need to handle that in the guest somehow (and not as a PCI bar),
>>>>   register this with struct pages etc.
> 
> You mean for ccw, right? I don't think we want pci to behave
> differently than everywhere else.

Yes for virtio-ccw. We would need to have a look at how virtio-ccw can create a memory
mapping with struct pages, so that DAX will work.(Dan, it is just struct pages that 
you need, correct?)


> 
>>>> - we must then look how we can create the link between the guest memory and the
>>>>   virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
>>>>   whatever. Maybe we could also piggy-back on some memory hotplug work from David
>>>>   Hildenbrand (add cc).
>>>>
>>>> Regarding limitations on the platform:
>>>> - while we do have PCI, the virtio devices are usually plugged via the ccw bus.
>>>>   That implies no PCI bars. I assume you use those PCI bars only to implicitly 
>>>>   have the location of the shared memory
>>>>   Correct?  
>>>
>>> Right.  
>>
>> So in essence we just have to provide a vm_get_shm_region callback in the virtio-ccw
>> guest code?
>>
>> How many regions do we have to support? One region per device? Or many?
>> Even if we need more, this should be possible with a 2 new CCWs, e.g READ_SHM_BASE(id)
>> and READ_SHM_SIZE(id)
> 
> I'd just add a single CCW with a control block containing id and size.
> 
> The main issue is where we put those regions, and what happens if we
> use both virtio-pci and virtio-ccw on the same machine.

Then these 2 devices should get independent memory regions that are added in an
independent (but still exclusive) way.
> 
>>
>>
>>>   
>>>> - no real memory mapped I/O. Instead there are instructions that work on the mmio.
>>>>   As I understand things, this is of no concern regarding virtio-fs as you do not
>>>>   need mmio in the sense that a memory access of the guest to such an address 
>>>>   triggers an exit. You just need the shared memory as a mean to have the data
>>>>   inside the guest. Any notification is done via normal virtqueue mechanisms
>>>>   Correct?  
>>>
>>> Yep.  
>>
>
David Hildenbrand July 22, 2019, 12:08 p.m. UTC | #12
On 22.07.19 14:00, Christian Borntraeger wrote:
> 
> 
> On 22.07.19 13:43, Cornelia Huck wrote:
>> On Mon, 22 Jul 2019 13:20:18 +0200
>> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>>
>>> On 22.07.19 12:56, Dr. David Alan Gilbert wrote:
>>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:  
>>>>>
>>>>>
>>>>> On 18.07.19 16:30, Dan Williams wrote:  
>>>>>> On Thu, Jul 18, 2019 at 6:15 AM Vivek Goyal <vgoyal@redhat.com> wrote:  
>>>>>>>
>>>>>>> On Wed, Jul 17, 2019 at 07:27:25PM +0200, Halil Pasic wrote:  
>>>>>>>> On Wed, 15 May 2019 15:27:03 -0400
>>>>>>>> Vivek Goyal <vgoyal@redhat.com> wrote:
>>>>>>>>  
>>>>>>>>> From: Stefan Hajnoczi <stefanha@redhat.com>
>>>>>>>>>
>>>>>>>>> Setup a dax device.
>>>>>>>>>
>>>>>>>>> Use the shm capability to find the cache entry and map it.
>>>>>>>>>
>>>>>>>>> The DAX window is accessed by the fs/dax.c infrastructure and must have
>>>>>>>>> struct pages (at least on x86).  Use devm_memremap_pages() to map the
>>>>>>>>> DAX window PCI BAR and allocate struct page.
>>>>>>>>>  
>>>>>>>>
>>>>>>>> Sorry for being this late. I don't see any more recent version so I will
>>>>>>>> comment here.
>>>>>>>>
>>>>>>>> I'm trying to figure out how is this supposed to work on s390. My concern
>>>>>>>> is, that on s390 PCI memory needs to be accessed by special
>>>>>>>> instructions. This is taken care of by the stuff defined in
>>>>>>>> arch/s390/include/asm/io.h. E.g. we 'override' __raw_writew so it uses
>>>>>>>> the appropriate s390 instruction. However if the code does not use the
>>>>>>>> linux abstractions for accessing PCI memory, but assumes it can be
>>>>>>>> accessed like RAM, we have a problem.
>>>>>>>>
>>>>>>>> Looking at this patch, it seems to me, that we might end up with exactly
>>>>>>>> the case described. For example AFAICT copy_to_iter() (3) resolves to
>>>>>>>> the function in lib/iov_iter.c which does not seem to cater for s390
>>>>>>>> oddities.
>>>>>>>>
>>>>>>>> I didn't have the time to investigate this properly, and since virtio-fs
>>>>>>>> is virtual, we may be able to get around what is otherwise a
>>>>>>>> limitation on s390. My understanding of these areas is admittedly
>>>>>>>> shallow, and since I'm not sure I'll have much more time to
>>>>>>>> invest in the near future I decided to raise concern.
>>>>>>>>
>>>>>>>> Any opinions?  
>>>>>>>
>>>>>>> Hi Halil,
>>>>>>>
>>>>>>> I don't understand s390 and how PCI works there as well. Is there any
>>>>>>> other transport we can use there to map IO memory directly and access
>>>>>>> using DAX?
>>>>>>>
>>>>>>> BTW, is DAX supported for s390.
>>>>>>>
>>>>>>> I am also hoping somebody who knows better can chip in. Till that time,
>>>>>>> we could still use virtio-fs on s390 without DAX.  
>>>>>>
>>>>>> s390 has so-called "limited" dax support, see CONFIG_FS_DAX_LIMITED.
>>>>>> In practice that means that support for PTE_DEVMAP is missing which
>>>>>> means no get_user_pages() support for dax mappings. Effectively it's
>>>>>> only useful for execute-in-place as operations like fork() and ptrace
>>>>>> of dax mappings will fail.  
>>>>>
>>>>>
>>>>> This is only true for the dcssblk device driver (drivers/s390/block/dcssblk.c
>>>>> and arch/s390/mm/extmem.c). 
>>>>>
>>>>> For what its worth, the dcssblk looks to Linux like normal memory (just above the
>>>>> previously detected memory) that can be used like normal memory. In previous time
>>>>> we even had struct pages for this memory - this was removed long ago (when it was
>>>>> still xip) to reduce the memory footprint for large dcss blocks and small memory
>>>>> guests.
>>>>> Can the CONFIG_FS_DAX_LIMITED go away if we have struct pages for that memory?
>>>>>
>>>>> Now some observations: 
>>>>> - dcssblk is z/VM only (not KVM)
>>>>>> - Setting CONFIG_FS_DAX_LIMITED globally as a Kconfig option depending on whether
>>>>>   a device driver is compiled in or not seems not flexible enough in case if you
>>>>>   have device driver that does have struct pages and another one that doesn't
>>>>> - I do not see a reason why we should not be able to map anything from QEMU
>>>>>   into the guest real memory via an additional KVM memory slot. 
>>>>>   We would need to handle that in the guest somehow (and not as a PCI bar),
>>>>>   register this with struct pages etc.
>>
>> You mean for ccw, right? I don't think we want pci to behave
>> differently than everywhere else.
> 
> Yes for virtio-ccw. We would need to have a look at how virtio-ccw can create a memory
> mapping with struct pages, so that DAX will work.(Dan, it is just struct pages that 
> you need, correct?)
> 
> 
>>
>>>>> - we must then look how we can create the link between the guest memory and the
>>>>>   virtio-fs driver. For virtio-ccw we might be able to add a new ccw command or
>>>>>   whatever. Maybe we could also piggy-back on some memory hotplug work from David
>>>>>   Hildenbrand (add cc).
>>>>>
>>>>> Regarding limitations on the platform:
>>>>> - while we do have PCI, the virtio devices are usually plugged via the ccw bus.
>>>>>   That implies no PCI bars. I assume you use those PCI bars only to implicitly 
>>>>>   have the location of the shared memory
>>>>>   Correct?  
>>>>
>>>> Right.  
>>>
>>> So in essence we just have to provide a vm_get_shm_region callback in the virtio-ccw
>>> guest code?
>>>
>>> How many regions do we have to support? One region per device? Or many?
>>> Even if we need more, this should be possible with a 2 new CCWs, e.g READ_SHM_BASE(id)
>>> and READ_SHM_SIZE(id)
>>
>> I'd just add a single CCW with a control block containing id and size.
>>
>> The main issue is where we put those regions, and what happens if we
>> use both virtio-pci and virtio-ccw on the same machine.
> 
> Then these 2 devices should get independent memory regions that are added in an
> independent (but still exclusive) way.

I remember that one discussion was who dictates the physical address
mapping. If I'm not wrong, PCI bars can be mapped freely by the guest
into the address space. So it would not just be querying the start+size.
Unless we want a pre-determined mapping (which might make more sense for
s390x).
diff mbox series

Patch

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 46fc1a454084..840c88af711c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -70,6 +70,7 @@  struct fuse_mount_data {
 	unsigned group_id_present:1;
 	unsigned default_permissions:1;
 	unsigned allow_other:1;
+	unsigned dax:1;
 	unsigned destroy:1;
 	unsigned max_read;
 	unsigned blksize;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 42f3ac5b7521..97d218a7daa8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -442,6 +442,7 @@  enum {
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+	OPT_DAX,
 	OPT_ERR
 };
 
@@ -455,6 +456,7 @@  static const match_table_t tokens = {
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_BLKSIZE,			"blksize=%u"},
+	{OPT_DAX,			"dax"},
 	{OPT_ERR,			NULL}
 };
 
@@ -546,6 +548,10 @@  int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
 			d->blksize = value;
 			break;
 
+		case OPT_DAX:
+			d->dax = 1;
+			break;
+
 		default:
 			return 0;
 		}
@@ -574,6 +580,8 @@  static int fuse_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
 		seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+	if (fc->dax_dev)
+		seq_printf(m, ",dax");
 	return 0;
 }
 
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index a23a1fb67217..2b790865dc21 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,6 +5,9 @@ 
  */
 
 #include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_fs.h>
@@ -31,6 +34,18 @@  struct virtio_fs_vq {
 	char name[24];
 } ____cacheline_aligned_in_smp;
 
+/* State needed for devm_memremap_pages().  This API is called on the
+ * underlying pci_dev instead of struct virtio_fs (layering violation).  Since
+ * the memremap release function only gets called when the pci_dev is released,
+ * keep the associated state separate from struct virtio_fs (it has a different
+ * lifecycle from pci_dev).
+ */
+struct virtio_fs_memremap_info {
+	struct dev_pagemap pgmap;
+	struct percpu_ref ref;
+	struct completion completion;
+};
+
 /* A virtio-fs device instance */
 struct virtio_fs {
 	struct list_head list;    /* on virtio_fs_instances */
@@ -38,6 +53,12 @@  struct virtio_fs {
 	struct virtio_fs_vq *vqs;
 	unsigned nvqs;            /* number of virtqueues */
 	unsigned num_queues;      /* number of request queues */
+	struct dax_device *dax_dev;
+
+	/* DAX memory window where file contents are mapped */
+	void *window_kaddr;
+	phys_addr_t window_phys_addr;
+	size_t window_len;
 };
 
 struct virtio_fs_forget {
@@ -421,6 +442,151 @@  static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
 	vdev->config->del_vqs(vdev);
 }
 
+/* Map a window offset to a page frame number.  The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+				    long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct virtio_fs *fs = dax_get_private(dax_dev);
+	phys_addr_t offset = PFN_PHYS(pgoff);
+	size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+
+	if (kaddr)
+		*kaddr = fs->window_kaddr + offset;
+	if (pfn)
+		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+					PFN_DEV | PFN_MAP);
+	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+				       pgoff_t pgoff, void *addr,
+				       size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter(addr, bytes, i);
+}
+
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+				       pgoff_t pgoff, void *addr,
+				       size_t bytes, struct iov_iter *i)
+{
+	return copy_to_iter(addr, bytes, i);
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+	.direct_access = virtio_fs_direct_access,
+	.copy_from_iter = virtio_fs_copy_from_iter,
+	.copy_to_iter = virtio_fs_copy_to_iter,
+};
+
+static void virtio_fs_percpu_release(struct percpu_ref *ref)
+{
+	struct virtio_fs_memremap_info *mi =
+		container_of(ref, struct virtio_fs_memremap_info, ref);
+
+	complete(&mi->completion);
+}
+
+static void virtio_fs_percpu_exit(void *data)
+{
+	struct virtio_fs_memremap_info *mi = data;
+
+	wait_for_completion(&mi->completion);
+	percpu_ref_exit(&mi->ref);
+}
+
+static void virtio_fs_percpu_kill(struct percpu_ref *ref)
+{
+	percpu_ref_kill(ref);
+}
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+	struct virtio_fs *fs = data;
+
+	kill_dax(fs->dax_dev);
+	put_dax(fs->dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	struct virtio_shm_region cache_reg;
+	struct virtio_fs_memremap_info *mi;
+	struct dev_pagemap *pgmap;
+	bool have_cache;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_DAX_DRIVER))
+		return 0;
+
+	/* Get cache region */
+	have_cache = virtio_get_shm_region(vdev,
+					   &cache_reg,
+					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+	if (!have_cache) {
+		dev_err(&vdev->dev, "%s: No cache capability\n", __func__);
+		return -ENXIO;
+	} else {
+		dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n",
+			   cache_reg.len, cache_reg.addr);
+	}
+
+	mi = devm_kzalloc(&vdev->dev, sizeof(*mi), GFP_KERNEL);
+	if (!mi)
+		return -ENOMEM;
+
+	init_completion(&mi->completion);
+	ret = percpu_ref_init(&mi->ref, virtio_fs_percpu_release, 0,
+			      GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(&vdev->dev, "%s: percpu_ref_init failed (%d)\n",
+			__func__, ret);
+		return ret;
+	}
+
+	ret = devm_add_action(&vdev->dev, virtio_fs_percpu_exit, mi);
+	if (ret < 0) {
+		percpu_ref_exit(&mi->ref);
+		return ret;
+	}
+
+	pgmap = &mi->pgmap;
+	pgmap->altmap_valid = false;
+	pgmap->ref = &mi->ref;
+	pgmap->kill = virtio_fs_percpu_kill;
+	pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+	/* Ideally we would directly use the PCI BAR resource but
+	 * devm_memremap_pages() wants its own copy in pgmap.  So
+	 * initialize a struct resource from scratch (only the start
+	 * and end fields will be used).
+	 */
+	pgmap->res = (struct resource){
+		.name = "virtio-fs dax window",
+		.start = (phys_addr_t) cache_reg.addr,
+		.end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+	};
+
+	fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+	if (IS_ERR(fs->window_kaddr))
+		return PTR_ERR(fs->window_kaddr);
+
+	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+	fs->window_len = (phys_addr_t) cache_reg.len;
+
+	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx"
+		" len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr,
+		cache_reg.len);
+
+	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
+	if (!fs->dax_dev)
+		return -ENOMEM;
+
+	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs);
+}
+
 static int virtio_fs_probe(struct virtio_device *vdev)
 {
 	struct virtio_fs *fs;
@@ -442,6 +608,10 @@  static int virtio_fs_probe(struct virtio_device *vdev)
 	/* TODO vq affinity */
 	/* TODO populate notifications vq */
 
+	ret = virtio_fs_setup_dax(vdev, fs);
+	if (ret < 0)
+		goto out_vqs;
+
 	/* Bring the device online in case the filesystem is mounted and
 	 * requests need to be sent before we return.
 	 */
@@ -456,7 +626,6 @@  static int virtio_fs_probe(struct virtio_device *vdev)
 out_vqs:
 	vdev->config->reset(vdev);
 	virtio_fs_cleanup_vqs(vdev, fs);
-
 out:
 	vdev->priv = NULL;
 	return ret;
@@ -866,7 +1035,7 @@  static int virtio_fs_fill_super(struct super_block *sb, void *data,
  		goto err_free_fuse_devs;
 	__set_bit(FR_BACKGROUND, &init_req->flags);
 
-	d.dax_dev = NULL;
+	d.dax_dev = d.dax ? fs->dax_dev : NULL;
 	d.fiq_ops = &virtio_fs_fiq_ops;
 	d.fiq_priv = fs;
 	d.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
index 48f3590dcfbe..d4bb549568eb 100644
--- a/include/uapi/linux/virtio_fs.h
+++ b/include/uapi/linux/virtio_fs.h
@@ -38,4 +38,7 @@  struct virtio_fs_config {
 	__u32 num_queues;
 } __attribute__((packed));
 
+/* For the id field in virtio_pci_shm_cap */
+#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+
 #endif /* _UAPI_LINUX_VIRTIO_FS_H */