diff mbox series

[18/52] virtio-fs: Map cache using the values from the capabilities

Message ID 20181210171318.16998-19-vgoyal@redhat.com (mailing list archive)
State New, archived
Headers show
Series virtio-fs: shared file system for virtual machines | expand

Commit Message

Vivek Goyal Dec. 10, 2018, 5:12 p.m. UTC
Instead of assuming we had the fixed bar for the cache, use the
value from the capabilities.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

Comments

David Hildenbrand Dec. 13, 2018, 9:10 a.m. UTC | #1
On 10.12.18 18:12, Vivek Goyal wrote:
> Instead of assuming we had the fixed bar for the cache, use the
> value from the capabilities.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>  1 file changed, 17 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 60d496c16841..55bac1465536 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -14,11 +14,6 @@
>  #include <uapi/linux/virtio_pci.h>
>  #include "fuse_i.h"
>  
> -enum {
> -	/* PCI BAR number of the virtio-fs DAX window */
> -	VIRTIO_FS_WINDOW_BAR = 2,
> -};
> -
>  /* List of virtio-fs device instances and a lock for the list */
>  static DEFINE_MUTEX(virtio_fs_mutex);
>  static LIST_HEAD(virtio_fs_instances);
> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>  	struct dev_pagemap *pgmap;
>  	struct pci_dev *pci_dev;
>  	phys_addr_t phys_addr;
> -	size_t len;
> +	size_t bar_len;
>  	int ret;
>  	u8 have_cache, cache_bar;
>  	u64 cache_offset, cache_len;
> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>          }
>  
>  	/* TODO handle case where device doesn't expose BAR? */

For virtio-pmem we decided to not go via BARs as this would effectively
make it only usable for virtio-pci implementers. Instead, we are going
to export the applicable physical device region directly (e.g.
phys_start, phys_size in virtio config), so it is decoupled from PCI
details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
to eventually make use of this.

> -	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> -				 "virtio-fs-window");
> +	ret = pci_request_region(pci_dev, cache_bar, "virtio-fs-window");
>  	if (ret < 0) {
>  		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
>  			__func__);
>  		return ret;
>  	}
>  
> -	phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
> -	len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
> -
>  	mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
>  	if (!mi)
>  		return -ENOMEM;
> @@ -586,6 +577,17 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>  	pgmap->ref = &mi->ref;
>  	pgmap->type = MEMORY_DEVICE_FS_DAX;
>  
> +	phys_addr = pci_resource_start(pci_dev, cache_bar);
> +	bar_len = pci_resource_len(pci_dev, cache_bar);
> +
> +	if (cache_offset + cache_len > bar_len) {
> +		dev_err(&vdev->dev,
> +			"%s: cache bar shorter than cap offset+len\n",
> +			__func__);
> +		return -EINVAL;
> +	}
> +	phys_addr += cache_offset;
> +
>  	/* Ideally we would directly use the PCI BAR resource but
>  	 * devm_memremap_pages() wants its own copy in pgmap.  So
>  	 * initialize a struct resource from scratch (only the start
> @@ -594,7 +596,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>  	pgmap->res = (struct resource){
>  		.name = "virtio-fs dax window",
>  		.start = phys_addr,
> -		.end = phys_addr + len,
> +		.end = phys_addr + cache_len,
>  	};
>  
>  	fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
> @@ -607,10 +609,10 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>  		return ret;
>  
>  	fs->window_phys_addr = phys_addr;
> -	fs->window_len = len;
> +	fs->window_len = cache_len;
>  
> -	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
> -		__func__, fs->window_kaddr, phys_addr, len);
> +	dev_dbg(&vdev->dev, "%s: cache kaddr 0x%px phys_addr 0x%llx len %llx\n",
> +		__func__, fs->window_kaddr, phys_addr, cache_len);
>  
>  	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
>  	if (!fs->dax_dev)
>
Dr. David Alan Gilbert Dec. 13, 2018, 9:13 a.m. UTC | #2
* David Hildenbrand (david@redhat.com) wrote:
> On 10.12.18 18:12, Vivek Goyal wrote:
> > Instead of assuming we had the fixed bar for the cache, use the
> > value from the capabilities.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >  1 file changed, 17 insertions(+), 15 deletions(-)
> > 
> > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > index 60d496c16841..55bac1465536 100644
> > --- a/fs/fuse/virtio_fs.c
> > +++ b/fs/fuse/virtio_fs.c
> > @@ -14,11 +14,6 @@
> >  #include <uapi/linux/virtio_pci.h>
> >  #include "fuse_i.h"
> >  
> > -enum {
> > -	/* PCI BAR number of the virtio-fs DAX window */
> > -	VIRTIO_FS_WINDOW_BAR = 2,
> > -};
> > -
> >  /* List of virtio-fs device instances and a lock for the list */
> >  static DEFINE_MUTEX(virtio_fs_mutex);
> >  static LIST_HEAD(virtio_fs_instances);
> > @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >  	struct dev_pagemap *pgmap;
> >  	struct pci_dev *pci_dev;
> >  	phys_addr_t phys_addr;
> > -	size_t len;
> > +	size_t bar_len;
> >  	int ret;
> >  	u8 have_cache, cache_bar;
> >  	u64 cache_offset, cache_len;
> > @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >          }
> >  
> >  	/* TODO handle case where device doesn't expose BAR? */
> 
> For virtio-pmem we decided to not go via BARs as this would effectively
> make it only usable for virtio-pci implementers. Instead, we are going
> to export the applicable physical device region directly (e.g.
> phys_start, phys_size in virtio config), so it is decoupled from PCI
> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> to make eventually use of this.

That makes it a very odd looking PCI device;  I can see that with
virtio-pmem it makes some sense, given that its job is to expose
arbitrary chunks of memory.

Dave


> > -	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> > -				 "virtio-fs-window");
> > +	ret = pci_request_region(pci_dev, cache_bar, "virtio-fs-window");
> >  	if (ret < 0) {
> >  		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> >  			__func__);
> >  		return ret;
> >  	}
> >  
> > -	phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
> > -	len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
> > -
> >  	mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
> >  	if (!mi)
> >  		return -ENOMEM;
> > @@ -586,6 +577,17 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >  	pgmap->ref = &mi->ref;
> >  	pgmap->type = MEMORY_DEVICE_FS_DAX;
> >  
> > +	phys_addr = pci_resource_start(pci_dev, cache_bar);
> > +	bar_len = pci_resource_len(pci_dev, cache_bar);
> > +
> > +	if (cache_offset + cache_len > bar_len) {
> > +		dev_err(&vdev->dev,
> > +			"%s: cache bar shorter than cap offset+len\n",
> > +			__func__);
> > +		return -EINVAL;
> > +	}
> > +	phys_addr += cache_offset;
> > +
> >  	/* Ideally we would directly use the PCI BAR resource but
> >  	 * devm_memremap_pages() wants its own copy in pgmap.  So
> >  	 * initialize a struct resource from scratch (only the start
> > @@ -594,7 +596,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >  	pgmap->res = (struct resource){
> >  		.name = "virtio-fs dax window",
> >  		.start = phys_addr,
> > -		.end = phys_addr + len,
> > +		.end = phys_addr + cache_len,
> >  	};
> >  
> >  	fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
> > @@ -607,10 +609,10 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >  		return ret;
> >  
> >  	fs->window_phys_addr = phys_addr;
> > -	fs->window_len = len;
> > +	fs->window_len = cache_len;
> >  
> > -	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
> > -		__func__, fs->window_kaddr, phys_addr, len);
> > +	dev_dbg(&vdev->dev, "%s: cache kaddr 0x%px phys_addr 0x%llx len %llx\n",
> > +		__func__, fs->window_kaddr, phys_addr, cache_len);
> >  
> >  	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
> >  	if (!fs->dax_dev)
> > 
> 
> 
> -- 
> 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
David Hildenbrand Dec. 13, 2018, 9:34 a.m. UTC | #3
On 13.12.18 10:13, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (david@redhat.com) wrote:
>> On 10.12.18 18:12, Vivek Goyal wrote:
>>> Instead of assuming we had the fixed bar for the cache, use the
>>> value from the capabilities.
>>>
>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>> ---
>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>>>  1 file changed, 17 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>>> index 60d496c16841..55bac1465536 100644
>>> --- a/fs/fuse/virtio_fs.c
>>> +++ b/fs/fuse/virtio_fs.c
>>> @@ -14,11 +14,6 @@
>>>  #include <uapi/linux/virtio_pci.h>
>>>  #include "fuse_i.h"
>>>  
>>> -enum {
>>> -	/* PCI BAR number of the virtio-fs DAX window */
>>> -	VIRTIO_FS_WINDOW_BAR = 2,
>>> -};
>>> -
>>>  /* List of virtio-fs device instances and a lock for the list */
>>>  static DEFINE_MUTEX(virtio_fs_mutex);
>>>  static LIST_HEAD(virtio_fs_instances);
>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>  	struct dev_pagemap *pgmap;
>>>  	struct pci_dev *pci_dev;
>>>  	phys_addr_t phys_addr;
>>> -	size_t len;
>>> +	size_t bar_len;
>>>  	int ret;
>>>  	u8 have_cache, cache_bar;
>>>  	u64 cache_offset, cache_len;
>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>          }
>>>  
>>>  	/* TODO handle case where device doesn't expose BAR? */
>>
>> For virtio-pmem we decided to not go via BARs as this would effectively
>> make it only usable for virtio-pci implementers. Instead, we are going
>> to export the applicable physical device region directly (e.g.
>> phys_start, phys_size in virtio config), so it is decoupled from PCI
>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
>> to make eventually use of this.
> 
> That makes it a very odd looking PCI device;  I can see that with
> virtio-pmem it makes some sense, given that it's job is to expose
> arbitrary chunks of memory.
> 
> Dave

Well, the fact that you are

- including <uapi/linux/virtio_pci.h>
- adding pci related code

in/to fs/fuse/virtio_fs.c

tells me that these properties might be better communicated on the
virtio layer, not on the PCI layer.

Or do you really want to glue virtio-fs to virtio-pci for all eternity?
Dr. David Alan Gilbert Dec. 13, 2018, 10 a.m. UTC | #4
* David Hildenbrand (david@redhat.com) wrote:
> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:
> > * David Hildenbrand (david@redhat.com) wrote:
> >> On 10.12.18 18:12, Vivek Goyal wrote:
> >>> Instead of assuming we had the fixed bar for the cache, use the
> >>> value from the capabilities.
> >>>
> >>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>> ---
> >>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >>>  1 file changed, 17 insertions(+), 15 deletions(-)
> >>>
> >>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> >>> index 60d496c16841..55bac1465536 100644
> >>> --- a/fs/fuse/virtio_fs.c
> >>> +++ b/fs/fuse/virtio_fs.c
> >>> @@ -14,11 +14,6 @@
> >>>  #include <uapi/linux/virtio_pci.h>
> >>>  #include "fuse_i.h"
> >>>  
> >>> -enum {
> >>> -	/* PCI BAR number of the virtio-fs DAX window */
> >>> -	VIRTIO_FS_WINDOW_BAR = 2,
> >>> -};
> >>> -
> >>>  /* List of virtio-fs device instances and a lock for the list */
> >>>  static DEFINE_MUTEX(virtio_fs_mutex);
> >>>  static LIST_HEAD(virtio_fs_instances);
> >>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>  	struct dev_pagemap *pgmap;
> >>>  	struct pci_dev *pci_dev;
> >>>  	phys_addr_t phys_addr;
> >>> -	size_t len;
> >>> +	size_t bar_len;
> >>>  	int ret;
> >>>  	u8 have_cache, cache_bar;
> >>>  	u64 cache_offset, cache_len;
> >>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>          }
> >>>  
> >>>  	/* TODO handle case where device doesn't expose BAR? */
> >>
> >> For virtio-pmem we decided to not go via BARs as this would effectively
> >> make it only usable for virtio-pci implementers. Instead, we are going
> >> to export the applicable physical device region directly (e.g.
> >> phys_start, phys_size in virtio config), so it is decoupled from PCI
> >> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> >> to make eventually use of this.
> > 
> > That makes it a very odd looking PCI device;  I can see that with
> > virtio-pmem it makes some sense, given that it's job is to expose
> > arbitrary chunks of memory.
> > 
> > Dave
> 
> Well, the fact that your are
> 
> - including <uapi/linux/virtio_pci.h>
> - adding pci related code
> 
> in/to fs/fuse/virtio_fs.c
> 
> tells me that these properties might be better communicated on the
> virtio layer, not on the PCI layer.
> 
> Or do you really want to glue virtio-fs to virtio-pci for all eternity?

No, these need cleaning up; and the split within the bar
is probably going to change to be communicated via virtio layer
rather than pci capabilities.  However, I don't want to make our PCI
device look odd, just to make portability to non-PCI devices - so it's
right to make the split appropriately, but still to use PCI bars
for what they were designed for.

Dave

> 
> -- 
> 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
David Hildenbrand Dec. 13, 2018, 11:26 a.m. UTC | #5
On 13.12.18 11:00, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (david@redhat.com) wrote:
>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:
>>> * David Hildenbrand (david@redhat.com) wrote:
>>>> On 10.12.18 18:12, Vivek Goyal wrote:
>>>>> Instead of assuming we had the fixed bar for the cache, use the
>>>>> value from the capabilities.
>>>>>
>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>> ---
>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>>>>> index 60d496c16841..55bac1465536 100644
>>>>> --- a/fs/fuse/virtio_fs.c
>>>>> +++ b/fs/fuse/virtio_fs.c
>>>>> @@ -14,11 +14,6 @@
>>>>>  #include <uapi/linux/virtio_pci.h>
>>>>>  #include "fuse_i.h"
>>>>>  
>>>>> -enum {
>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
>>>>> -};
>>>>> -
>>>>>  /* List of virtio-fs device instances and a lock for the list */
>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
>>>>>  static LIST_HEAD(virtio_fs_instances);
>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>  	struct dev_pagemap *pgmap;
>>>>>  	struct pci_dev *pci_dev;
>>>>>  	phys_addr_t phys_addr;
>>>>> -	size_t len;
>>>>> +	size_t bar_len;
>>>>>  	int ret;
>>>>>  	u8 have_cache, cache_bar;
>>>>>  	u64 cache_offset, cache_len;
>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>          }
>>>>>  
>>>>>  	/* TODO handle case where device doesn't expose BAR? */
>>>>
>>>> For virtio-pmem we decided to not go via BARs as this would effectively
>>>> make it only usable for virtio-pci implementers. Instead, we are going
>>>> to export the applicable physical device region directly (e.g.
>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
>>>> to make eventually use of this.
>>>
>>> That makes it a very odd looking PCI device;  I can see that with
>>> virtio-pmem it makes some sense, given that it's job is to expose
>>> arbitrary chunks of memory.
>>>
>>> Dave
>>
>> Well, the fact that your are
>>
>> - including <uapi/linux/virtio_pci.h>
>> - adding pci related code
>>
>> in/to fs/fuse/virtio_fs.c
>>
>> tells me that these properties might be better communicated on the
>> virtio layer, not on the PCI layer.
>>
>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?
> 
> No, these need cleaning up; and the split within the bar
> is probably going to change to be communicated via virtio layer
> rather than pci capabilities.  However, I don't want to make our PCI
> device look odd, just to make portability to non-PCI devices - so it's
> right to make the split appropriately, but still to use PCI bars
> for what they were designed for.
> 
> Dave

Let's discuss after the cleanup. In general I am not convinced this is
the right thing to do. Using virtio-pci for anything other than pure
transport smells like bad design to me (well, I am no virtio expert
after all ;) ). No matter what PCI bars were designed for. If we can't
get the same running with e.g. virtio-ccw or virtio-whatever, it is
broken by design (or an addon that is tightly glued to virtio-pci, if
that is the general idea).
Dr. David Alan Gilbert Dec. 13, 2018, 12:15 p.m. UTC | #6
* David Hildenbrand (david@redhat.com) wrote:
> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:
> > * David Hildenbrand (david@redhat.com) wrote:
> >> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:
> >>> * David Hildenbrand (david@redhat.com) wrote:
> >>>> On 10.12.18 18:12, Vivek Goyal wrote:
> >>>>> Instead of assuming we had the fixed bar for the cache, use the
> >>>>> value from the capabilities.
> >>>>>
> >>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>>>> ---
> >>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> >>>>> index 60d496c16841..55bac1465536 100644
> >>>>> --- a/fs/fuse/virtio_fs.c
> >>>>> +++ b/fs/fuse/virtio_fs.c
> >>>>> @@ -14,11 +14,6 @@
> >>>>>  #include <uapi/linux/virtio_pci.h>
> >>>>>  #include "fuse_i.h"
> >>>>>  
> >>>>> -enum {
> >>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> >>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> >>>>> -};
> >>>>> -
> >>>>>  /* List of virtio-fs device instances and a lock for the list */
> >>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> >>>>>  static LIST_HEAD(virtio_fs_instances);
> >>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>  	struct dev_pagemap *pgmap;
> >>>>>  	struct pci_dev *pci_dev;
> >>>>>  	phys_addr_t phys_addr;
> >>>>> -	size_t len;
> >>>>> +	size_t bar_len;
> >>>>>  	int ret;
> >>>>>  	u8 have_cache, cache_bar;
> >>>>>  	u64 cache_offset, cache_len;
> >>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>          }
> >>>>>  
> >>>>>  	/* TODO handle case where device doesn't expose BAR? */
> >>>>
> >>>> For virtio-pmem we decided to not go via BARs as this would effectively
> >>>> make it only usable for virtio-pci implementers. Instead, we are going
> >>>> to export the applicable physical device region directly (e.g.
> >>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> >>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> >>>> to make eventually use of this.
> >>>
> >>> That makes it a very odd looking PCI device;  I can see that with
> >>> virtio-pmem it makes some sense, given that it's job is to expose
> >>> arbitrary chunks of memory.
> >>>
> >>> Dave
> >>
> >> Well, the fact that your are
> >>
> >> - including <uapi/linux/virtio_pci.h>
> >> - adding pci related code
> >>
> >> in/to fs/fuse/virtio_fs.c
> >>
> >> tells me that these properties might be better communicated on the
> >> virtio layer, not on the PCI layer.
> >>
> >> Or do you really want to glue virtio-fs to virtio-pci for all eternity?
> > 
> > No, these need cleaning up; and the split within the bar
> > is probably going to change to be communicated via virtio layer
> > rather than pci capabilities.  However, I don't want to make our PCI
> > device look odd, just to make portability to non-PCI devices - so it's
> > right to make the split appropriately, but still to use PCI bars
> > for what they were designed for.
> > 
> > Dave
> 
> Let's discuss after the cleanup. In general I am not convinced this is
> the right thing to do. Using virtio-pci for anything else than pure
> transport smells like bad design to me (well, I am no virtio expert
> after all ;) ). No matter what PCI bars were designed for. If we can't
> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> broken by design (or an addon that is tightly glued to virtio-pci, if
> that is the general idea).

I'm sure we can find alternatives for virtio-*, so I wouldn't expect
it to be glued to virtio-pci.

Dave

> -- 
> 
> Thanks,
> 
> David / dhildenb
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
David Hildenbrand Dec. 13, 2018, 12:24 p.m. UTC | #7
On 13.12.18 13:15, Dr. David Alan Gilbert wrote:
> * David Hildenbrand (david@redhat.com) wrote:
>> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:
>>> * David Hildenbrand (david@redhat.com) wrote:
>>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:
>>>>> * David Hildenbrand (david@redhat.com) wrote:
>>>>>> On 10.12.18 18:12, Vivek Goyal wrote:
>>>>>>> Instead of assuming we had the fixed bar for the cache, use the
>>>>>>> value from the capabilities.
>>>>>>>
>>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>>>> ---
>>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>>>>>>> index 60d496c16841..55bac1465536 100644
>>>>>>> --- a/fs/fuse/virtio_fs.c
>>>>>>> +++ b/fs/fuse/virtio_fs.c
>>>>>>> @@ -14,11 +14,6 @@
>>>>>>>  #include <uapi/linux/virtio_pci.h>
>>>>>>>  #include "fuse_i.h"
>>>>>>>  
>>>>>>> -enum {
>>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
>>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
>>>>>>> -};
>>>>>>> -
>>>>>>>  /* List of virtio-fs device instances and a lock for the list */
>>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
>>>>>>>  static LIST_HEAD(virtio_fs_instances);
>>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>>>  	struct dev_pagemap *pgmap;
>>>>>>>  	struct pci_dev *pci_dev;
>>>>>>>  	phys_addr_t phys_addr;
>>>>>>> -	size_t len;
>>>>>>> +	size_t bar_len;
>>>>>>>  	int ret;
>>>>>>>  	u8 have_cache, cache_bar;
>>>>>>>  	u64 cache_offset, cache_len;
>>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>>>          }
>>>>>>>  
>>>>>>>  	/* TODO handle case where device doesn't expose BAR? */
>>>>>>
>>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
>>>>>> make it only usable for virtio-pci implementers. Instead, we are going
>>>>>> to export the applicable physical device region directly (e.g.
>>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
>>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
>>>>>> to make eventually use of this.
>>>>>
>>>>> That makes it a very odd looking PCI device;  I can see that with
>>>>> virtio-pmem it makes some sense, given that it's job is to expose
>>>>> arbitrary chunks of memory.
>>>>>
>>>>> Dave
>>>>
>>>> Well, the fact that your are
>>>>
>>>> - including <uapi/linux/virtio_pci.h>
>>>> - adding pci related code
>>>>
>>>> in/to fs/fuse/virtio_fs.c
>>>>
>>>> tells me that these properties might be better communicated on the
>>>> virtio layer, not on the PCI layer.
>>>>
>>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?
>>>
>>> No, these need cleaning up; and the split within the bar
>>> is probably going to change to be communicated via virtio layer
>>> rather than pci capabilities.  However, I don't want to make our PCI
>>> device look odd, just to make portability to non-PCI devices - so it's
>>> right to make the split appropriately, but still to use PCI bars
>>> for what they were designed for.
>>>
>>> Dave
>>
>> Let's discuss after the cleanup. In general I am not convinced this is
>> the right thing to do. Using virtio-pci for anything else than pure
>> transport smells like bad design to me (well, I am no virtio expert
>> after all ;) ). No matter what PCI bars were designed for. If we can't
>> get the same running with e.g. virtio-ccw or virtio-whatever, it is
>> broken by design (or an addon that is tightly glued to virtio-pci, if
>> that is the general idea).
> 
> I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> it to be glued to virtio-pci.
> 
> Dave

As s390x does not have the concept of memory mapped io (RAM is RAM,
nothing else), this is not architected. virtio-ccw can therefore not
define anything similar like that. However, in virtual environments we
can do whatever we want on top of the pure transport (e.g. on the virtio
layer).

Conny can correct me if I am wrong.
Cornelia Huck Dec. 13, 2018, 12:38 p.m. UTC | #8
On Thu, 13 Dec 2018 13:24:31 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 13.12.18 13:15, Dr. David Alan Gilbert wrote:
> > * David Hildenbrand (david@redhat.com) wrote:  
> >> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:  
> >>> * David Hildenbrand (david@redhat.com) wrote:  
> >>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:  
> >>>>> * David Hildenbrand (david@redhat.com) wrote:  
> >>>>>> On 10.12.18 18:12, Vivek Goyal wrote:  
> >>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> >>>>>>> value from the capabilities.
> >>>>>>>
> >>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>>>>>> ---
> >>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> >>>>>>> index 60d496c16841..55bac1465536 100644
> >>>>>>> --- a/fs/fuse/virtio_fs.c
> >>>>>>> +++ b/fs/fuse/virtio_fs.c
> >>>>>>> @@ -14,11 +14,6 @@
> >>>>>>>  #include <uapi/linux/virtio_pci.h>
> >>>>>>>  #include "fuse_i.h"
> >>>>>>>  
> >>>>>>> -enum {
> >>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> >>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> >>>>>>> -};
> >>>>>>> -
> >>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> >>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> >>>>>>>  static LIST_HEAD(virtio_fs_instances);
> >>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>>>  	struct dev_pagemap *pgmap;
> >>>>>>>  	struct pci_dev *pci_dev;
> >>>>>>>  	phys_addr_t phys_addr;
> >>>>>>> -	size_t len;
> >>>>>>> +	size_t bar_len;
> >>>>>>>  	int ret;
> >>>>>>>  	u8 have_cache, cache_bar;
> >>>>>>>  	u64 cache_offset, cache_len;
> >>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>>>          }
> >>>>>>>  
> >>>>>>>  	/* TODO handle case where device doesn't expose BAR? */  
> >>>>>>
> >>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> >>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> >>>>>> to export the applicable physical device region directly (e.g.
> >>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> >>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> >>>>>> to make eventually use of this.  
> >>>>>
> >>>>> That makes it a very odd looking PCI device;  I can see that with
> >>>>> virtio-pmem it makes some sense, given that it's job is to expose
> >>>>> arbitrary chunks of memory.
> >>>>>
> >>>>> Dave  
> >>>>
> >>>> Well, the fact that your are
> >>>>
> >>>> - including <uapi/linux/virtio_pci.h>
> >>>> - adding pci related code
> >>>>
> >>>> in/to fs/fuse/virtio_fs.c
> >>>>
> >>>> tells me that these properties might be better communicated on the
> >>>> virtio layer, not on the PCI layer.
> >>>>
> >>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?  
> >>>
> >>> No, these need cleaning up; and the split within the bar
> >>> is probably going to change to be communicated via virtio layer
> >>> rather than pci capabilities.  However, I don't want to make our PCI
> >>> device look odd, just to make portability to non-PCI devices - so it's
> >>> right to make the split appropriately, but still to use PCI bars
> >>> for what they were designed for.
> >>>
> >>> Dave  
> >>
> >> Let's discuss after the cleanup. In general I am not convinced this is
> >> the right thing to do. Using virtio-pci for anything else than pure
> >> transport smells like bad design to me (well, I am no virtio expert
> >> after all ;) ). No matter what PCI bars were designed for. If we can't
> >> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> >> broken by design (or an addon that is tightly glued to virtio-pci, if
> >> that is the general idea).  
> > 
> > I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> > it to be glued to virtio-pci.
> > 
> > Dave  
> 
> As s390x does not have the concept of memory mapped io (RAM is RAM,
> nothing else), this is not architectured. vitio-ccw can therefore not
> define anything similar like that. However, in virtual environments we
> can do whatever we want on top of the pure transport (e.g. on the virtio
> layer).
> 
> Conny can correct me if I am wrong.

I don't think you're wrong, but I haven't read the code yet and I'm
therefore not aware of the purpose of this BAR.

Generally, if there is a memory location shared between host and guest,
we need a way to communicate its location, which will likely differ
between transports. For ccw, I could imagine a new channel command
dedicated to exchanging configuration information (similar to what
exists today to communicate the locations of virtqueues), but I'd
rather not go down this path.

Without reading the code/design further, can we use one of the
following instead of a BAR:
- a virtqueue;
- something in config space?
That would be implementable by any virtio transport.
Stefan Hajnoczi Dec. 14, 2018, 1:44 p.m. UTC | #9
On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
> On Thu, 13 Dec 2018 13:24:31 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
> > On 13.12.18 13:15, Dr. David Alan Gilbert wrote:
> > > * David Hildenbrand (david@redhat.com) wrote:  
> > >> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:  
> > >>> * David Hildenbrand (david@redhat.com) wrote:  
> > >>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:  
> > >>>>> * David Hildenbrand (david@redhat.com) wrote:  
> > >>>>>> On 10.12.18 18:12, Vivek Goyal wrote:  
> > >>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> > >>>>>>> value from the capabilities.
> > >>>>>>>
> > >>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > >>>>>>> ---
> > >>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> > >>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> > >>>>>>>
> > >>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > >>>>>>> index 60d496c16841..55bac1465536 100644
> > >>>>>>> --- a/fs/fuse/virtio_fs.c
> > >>>>>>> +++ b/fs/fuse/virtio_fs.c
> > >>>>>>> @@ -14,11 +14,6 @@
> > >>>>>>>  #include <uapi/linux/virtio_pci.h>
> > >>>>>>>  #include "fuse_i.h"
> > >>>>>>>  
> > >>>>>>> -enum {
> > >>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> > >>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> > >>>>>>> -};
> > >>>>>>> -
> > >>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> > >>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> > >>>>>>>  static LIST_HEAD(virtio_fs_instances);
> > >>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > >>>>>>>  	struct dev_pagemap *pgmap;
> > >>>>>>>  	struct pci_dev *pci_dev;
> > >>>>>>>  	phys_addr_t phys_addr;
> > >>>>>>> -	size_t len;
> > >>>>>>> +	size_t bar_len;
> > >>>>>>>  	int ret;
> > >>>>>>>  	u8 have_cache, cache_bar;
> > >>>>>>>  	u64 cache_offset, cache_len;
> > >>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > >>>>>>>          }
> > >>>>>>>  
> > >>>>>>>  	/* TODO handle case where device doesn't expose BAR? */  
> > >>>>>>
> > >>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> > >>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> > >>>>>> to export the applicable physical device region directly (e.g.
> > >>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> > >>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> > >>>>>> to make eventually use of this.  
> > >>>>>
> > >>>>> That makes it a very odd looking PCI device;  I can see that with
> > >>>>> virtio-pmem it makes some sense, given that its job is to expose
> > >>>>> arbitrary chunks of memory.
> > >>>>>
> > >>>>> Dave  
> > >>>>
> > >>>> Well, the fact that you are
> > >>>>
> > >>>> - including <uapi/linux/virtio_pci.h>
> > >>>> - adding pci related code
> > >>>>
> > >>>> in/to fs/fuse/virtio_fs.c
> > >>>>
> > >>>> tells me that these properties might be better communicated on the
> > >>>> virtio layer, not on the PCI layer.
> > >>>>
> > >>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?  
> > >>>
> > >>> No, these need cleaning up; and the split within the bar
> > >>> is probably going to change to be communicated via virtio layer
> > >>> rather than pci capabilities.  However, I don't want to make our PCI
> > >>> device look odd, just to make portability to non-PCI devices - so it's
> > >>> right to make the split appropriately, but still to use PCI bars
> > >>> for what they were designed for.
> > >>>
> > >>> Dave  
> > >>
> > >> Let's discuss after the cleanup. In general I am not convinced this is
> > >> the right thing to do. Using virtio-pci for anything else than pure
> > >> transport smells like bad design to me (well, I am no virtio expert
> > >> after all ;) ). No matter what PCI bars were designed for. If we can't
> > >> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> > >> broken by design (or an addon that is tightly glued to virtio-pci, if
> > >> that is the general idea).  
> > > 
> > > I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> > > it to be glued to virtio-pci.
> > > 
> > > Dave  
> > 
> > As s390x does not have the concept of memory mapped io (RAM is RAM,
> > nothing else), this is not architected. virtio-ccw can therefore not
> > define anything similar like that. However, in virtual environments we
> > can do whatever we want on top of the pure transport (e.g. on the virtio
> > layer).
> > 
> > Conny can correct me if I am wrong.
> 
> I don't think you're wrong, but I haven't read the code yet and I'm
> therefore not aware of the purpose of this BAR.
> 
> Generally, if there is a memory location shared between host and guest,
> we need a way to communicate its location, which will likely differ
> between transports. For ccw, I could imagine a new channel command
> dedicated to exchanging configuration information (similar to what
> exists today to communicate the locations of virtqueues), but I'd
> rather not go down this path.
> 
> Without reading the code/design further, can we use one of the
> following instead of a BAR:
> - a virtqueue;
> - something in config space?
> That would be implementable by any virtio transport.

The way I think about this is that we wish to extend the VIRTIO device
model with the concept of shared memory.  virtio-fs, virtio-gpu, and
virtio-vhost-user all have requirements for shared memory.

This seems like a transport-level issue to me.  PCI supports
memory-mapped I/O and that's the right place to do it.  If you try to
put it into config space or the virtqueue, you'll end up with something
that cannot be realized as a PCI device because it bypasses PCI bus
address translation.

If CCW needs a side-channel, that's fine.  But that side-channel is a
CCW-specific mechanism and probably doesn't apply to all other
transports.

Stefan
Cornelia Huck Dec. 14, 2018, 1:50 p.m. UTC | #10
On Fri, 14 Dec 2018 13:44:34 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
> > On Thu, 13 Dec 2018 13:24:31 +0100
> > David Hildenbrand <david@redhat.com> wrote:
> >   
> > > On 13.12.18 13:15, Dr. David Alan Gilbert wrote:  
> > > > * David Hildenbrand (david@redhat.com) wrote:    
> > > >> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:    
> > > >>> * David Hildenbrand (david@redhat.com) wrote:    
> > > >>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:    
> > > >>>>> * David Hildenbrand (david@redhat.com) wrote:    
> > > >>>>>> On 10.12.18 18:12, Vivek Goyal wrote:    
> > > >>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> > > >>>>>>> value from the capabilities.
> > > >>>>>>>
> > > >>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > >>>>>>> ---
> > > >>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> > > >>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> > > >>>>>>>
> > > >>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > > >>>>>>> index 60d496c16841..55bac1465536 100644
> > > >>>>>>> --- a/fs/fuse/virtio_fs.c
> > > >>>>>>> +++ b/fs/fuse/virtio_fs.c
> > > >>>>>>> @@ -14,11 +14,6 @@
> > > >>>>>>>  #include <uapi/linux/virtio_pci.h>
> > > >>>>>>>  #include "fuse_i.h"
> > > >>>>>>>  
> > > >>>>>>> -enum {
> > > >>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> > > >>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> > > >>>>>>> -};
> > > >>>>>>> -
> > > >>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> > > >>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> > > >>>>>>>  static LIST_HEAD(virtio_fs_instances);
> > > >>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > >>>>>>>  	struct dev_pagemap *pgmap;
> > > >>>>>>>  	struct pci_dev *pci_dev;
> > > >>>>>>>  	phys_addr_t phys_addr;
> > > >>>>>>> -	size_t len;
> > > >>>>>>> +	size_t bar_len;
> > > >>>>>>>  	int ret;
> > > >>>>>>>  	u8 have_cache, cache_bar;
> > > >>>>>>>  	u64 cache_offset, cache_len;
> > > >>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > >>>>>>>          }
> > > >>>>>>>  
> > > >>>>>>>  	/* TODO handle case where device doesn't expose BAR? */    
> > > >>>>>>
> > > >>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> > > >>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> > > >>>>>> to export the applicable physical device region directly (e.g.
> > > >>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> > > >>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> > > >>>>>> to make eventually use of this.    
> > > >>>>>
> > > >>>>> That makes it a very odd looking PCI device;  I can see that with
> > > >>>>> virtio-pmem it makes some sense, given that its job is to expose
> > > >>>>> arbitrary chunks of memory.
> > > >>>>>
> > > >>>>> Dave    
> > > >>>>
> > > >>>> Well, the fact that you are
> > > >>>>
> > > >>>> - including <uapi/linux/virtio_pci.h>
> > > >>>> - adding pci related code
> > > >>>>
> > > >>>> in/to fs/fuse/virtio_fs.c
> > > >>>>
> > > >>>> tells me that these properties might be better communicated on the
> > > >>>> virtio layer, not on the PCI layer.
> > > >>>>
> > > >>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?    
> > > >>>
> > > >>> No, these need cleaning up; and the split within the bar
> > > >>> is probably going to change to be communicated via virtio layer
> > > >>> rather than pci capabilities.  However, I don't want to make our PCI
> > > >>> device look odd, just to make portability to non-PCI devices - so it's
> > > >>> right to make the split appropriately, but still to use PCI bars
> > > >>> for what they were designed for.
> > > >>>
> > > >>> Dave    
> > > >>
> > > >> Let's discuss after the cleanup. In general I am not convinced this is
> > > >> the right thing to do. Using virtio-pci for anything else than pure
> > > >> transport smells like bad design to me (well, I am no virtio expert
> > > >> after all ;) ). No matter what PCI bars were designed for. If we can't
> > > >> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> > > >> broken by design (or an addon that is tightly glued to virtio-pci, if
> > > >> that is the general idea).    
> > > > 
> > > > I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> > > > it to be glued to virtio-pci.
> > > > 
> > > > Dave    
> > > 
> > > As s390x does not have the concept of memory mapped io (RAM is RAM,
> > > > nothing else), this is not architected. virtio-ccw can therefore not
> > > define anything similar like that. However, in virtual environments we
> > > can do whatever we want on top of the pure transport (e.g. on the virtio
> > > layer).
> > > 
> > > Conny can correct me if I am wrong.  
> > 
> > I don't think you're wrong, but I haven't read the code yet and I'm
> > therefore not aware of the purpose of this BAR.
> > 
> > Generally, if there is a memory location shared between host and guest,
> > we need a way to communicate its location, which will likely differ
> > between transports. For ccw, I could imagine a new channel command
> > dedicated to exchanging configuration information (similar to what
> > exists today to communicate the locations of virtqueues), but I'd
> > rather not go down this path.
> > 
> > Without reading the code/design further, can we use one of the
> > following instead of a BAR:
> > - a virtqueue;
> > - something in config space?
> > That would be implementable by any virtio transport.  
> 
> The way I think about this is that we wish to extend the VIRTIO device
> model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> virtio-vhost-user all have requirements for shared memory.
> 
> This seems like a transport-level issue to me.  PCI supports
> memory-mapped I/O and that's the right place to do it.  If you try to
> put it into config space or the virtqueue, you'll end up with something
> that cannot be realized as a PCI device because it bypasses PCI bus
> address translation.
> 
> If CCW needs a side-channel, that's fine.  But that side-channel is a
> CCW-specific mechanism and probably doesn't apply to all other
> transports.

But virtio-gpu works with ccw right now (I haven't checked what it
uses); can virtio-fs use an equivalent method?

If there's a more generic case to be made for extending virtio devices
with a way to handle shared memory, a ccw for that would be fine. I
just want to avoid adding new ccws for everything as the namespace is
not infinite.
Dr. David Alan Gilbert Dec. 14, 2018, 2:06 p.m. UTC | #11
* Cornelia Huck (cohuck@redhat.com) wrote:
> On Fri, 14 Dec 2018 13:44:34 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> > On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
> > > On Thu, 13 Dec 2018 13:24:31 +0100
> > > David Hildenbrand <david@redhat.com> wrote:
> > >   
> > > > On 13.12.18 13:15, Dr. David Alan Gilbert wrote:  
> > > > > * David Hildenbrand (david@redhat.com) wrote:    
> > > > >> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:    
> > > > >>> * David Hildenbrand (david@redhat.com) wrote:    
> > > > >>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:    
> > > > >>>>> * David Hildenbrand (david@redhat.com) wrote:    
> > > > >>>>>> On 10.12.18 18:12, Vivek Goyal wrote:    
> > > > >>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> > > > >>>>>>> value from the capabilities.
> > > > >>>>>>>
> > > > >>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > > >>>>>>> ---
> > > > >>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> > > > >>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> > > > >>>>>>>
> > > > >>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > > > >>>>>>> index 60d496c16841..55bac1465536 100644
> > > > >>>>>>> --- a/fs/fuse/virtio_fs.c
> > > > >>>>>>> +++ b/fs/fuse/virtio_fs.c
> > > > >>>>>>> @@ -14,11 +14,6 @@
> > > > >>>>>>>  #include <uapi/linux/virtio_pci.h>
> > > > >>>>>>>  #include "fuse_i.h"
> > > > >>>>>>>  
> > > > >>>>>>> -enum {
> > > > >>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> > > > >>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> > > > >>>>>>> -};
> > > > >>>>>>> -
> > > > >>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> > > > >>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> > > > >>>>>>>  static LIST_HEAD(virtio_fs_instances);
> > > > >>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > > >>>>>>>  	struct dev_pagemap *pgmap;
> > > > >>>>>>>  	struct pci_dev *pci_dev;
> > > > >>>>>>>  	phys_addr_t phys_addr;
> > > > >>>>>>> -	size_t len;
> > > > >>>>>>> +	size_t bar_len;
> > > > >>>>>>>  	int ret;
> > > > >>>>>>>  	u8 have_cache, cache_bar;
> > > > >>>>>>>  	u64 cache_offset, cache_len;
> > > > >>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > > >>>>>>>          }
> > > > >>>>>>>  
> > > > >>>>>>>  	/* TODO handle case where device doesn't expose BAR? */    
> > > > >>>>>>
> > > > >>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> > > > >>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> > > > >>>>>> to export the applicable physical device region directly (e.g.
> > > > >>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> > > > >>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> > > > >>>>>> to make eventually use of this.    
> > > > >>>>>
> > > > >>>>> That makes it a very odd looking PCI device;  I can see that with
> > > > >>>>> virtio-pmem it makes some sense, given that its job is to expose
> > > > >>>>> arbitrary chunks of memory.
> > > > >>>>>
> > > > >>>>> Dave    
> > > > >>>>
> > > > >>>> Well, the fact that you are
> > > > >>>>
> > > > >>>> - including <uapi/linux/virtio_pci.h>
> > > > >>>> - adding pci related code
> > > > >>>>
> > > > >>>> in/to fs/fuse/virtio_fs.c
> > > > >>>>
> > > > >>>> tells me that these properties might be better communicated on the
> > > > >>>> virtio layer, not on the PCI layer.
> > > > >>>>
> > > > >>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?    
> > > > >>>
> > > > >>> No, these need cleaning up; and the split within the bar
> > > > >>> is probably going to change to be communicated via virtio layer
> > > > >>> rather than pci capabilities.  However, I don't want to make our PCI
> > > > >>> device look odd, just to make portability to non-PCI devices - so it's
> > > > >>> right to make the split appropriately, but still to use PCI bars
> > > > >>> for what they were designed for.
> > > > >>>
> > > > >>> Dave    
> > > > >>
> > > > >> Let's discuss after the cleanup. In general I am not convinced this is
> > > > >> the right thing to do. Using virtio-pci for anything else than pure
> > > > >> transport smells like bad design to me (well, I am no virtio expert
> > > > >> after all ;) ). No matter what PCI bars were designed for. If we can't
> > > > >> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> > > > >> broken by design (or an addon that is tightly glued to virtio-pci, if
> > > > >> that is the general idea).    
> > > > > 
> > > > > I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> > > > > it to be glued to virtio-pci.
> > > > > 
> > > > > Dave    
> > > > 
> > > > As s390x does not have the concept of memory mapped io (RAM is RAM,
> > > > nothing else), this is not architected. virtio-ccw can therefore not
> > > > define anything similar like that. However, in virtual environments we
> > > > can do whatever we want on top of the pure transport (e.g. on the virtio
> > > > layer).
> > > > 
> > > > Conny can correct me if I am wrong.  
> > > 
> > > I don't think you're wrong, but I haven't read the code yet and I'm
> > > therefore not aware of the purpose of this BAR.
> > > 
> > > Generally, if there is a memory location shared between host and guest,
> > > we need a way to communicate its location, which will likely differ
> > > between transports. For ccw, I could imagine a new channel command
> > > dedicated to exchanging configuration information (similar to what
> > > exists today to communicate the locations of virtqueues), but I'd
> > > rather not go down this path.
> > > 
> > > Without reading the code/design further, can we use one of the
> > > following instead of a BAR:
> > > - a virtqueue;
> > > - something in config space?
> > > That would be implementable by any virtio transport.  
> > 
> > The way I think about this is that we wish to extend the VIRTIO device
> > model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> > virtio-vhost-user all have requirements for shared memory.
> > 
> > This seems like a transport-level issue to me.  PCI supports
> > memory-mapped I/O and that's the right place to do it.  If you try to
> > put it into config space or the virtqueue, you'll end up with something
> > that cannot be realized as a PCI device because it bypasses PCI bus
> > address translation.
> > 
> > If CCW needs a side-channel, that's fine.  But that side-channel is a
> > CCW-specific mechanism and probably doesn't apply to all other
> > transports.
> 
> But virtio-gpu works with ccw right now (I haven't checked what it
> uses); can virtio-fs use an equivalent method?
> 
> If there's a more generic case to be made for extending virtio devices
> with a way to handle shared memory, a ccw for that would be fine. I
> just want to avoid adding new ccws for everything as the namespace is
> not infinite.

In our case we've got somewhere between 0..3 ranges of memory, and I was
specifying them as PCI capabilities; however Gerd's suggestion was that
it would be better to just use 1 bar and then have something as part of
virtio or the like to split them up.
If we do that, then we could have something of the form
  (index, base, length)

for each of the regions, where in the PCI case 'index' means BAR and
in CCW it means something else. (For mmio it's probably irrelevant and
the base is probably a physical address).

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
David Hildenbrand Dec. 17, 2018, 10:53 a.m. UTC | #12
On 14.12.18 14:44, Stefan Hajnoczi wrote:
> On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
>> On Thu, 13 Dec 2018 13:24:31 +0100
>> David Hildenbrand <david@redhat.com> wrote:
>>
>>> On 13.12.18 13:15, Dr. David Alan Gilbert wrote:
>>>> * David Hildenbrand (david@redhat.com) wrote:  
>>>>> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:  
>>>>>> * David Hildenbrand (david@redhat.com) wrote:  
>>>>>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:  
>>>>>>>> * David Hildenbrand (david@redhat.com) wrote:  
>>>>>>>>> On 10.12.18 18:12, Vivek Goyal wrote:  
>>>>>>>>>> Instead of assuming we had the fixed bar for the cache, use the
>>>>>>>>>> value from the capabilities.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>>>>>>> ---
>>>>>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>>>>>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>>>>>>>>>> index 60d496c16841..55bac1465536 100644
>>>>>>>>>> --- a/fs/fuse/virtio_fs.c
>>>>>>>>>> +++ b/fs/fuse/virtio_fs.c
>>>>>>>>>> @@ -14,11 +14,6 @@
>>>>>>>>>>  #include <uapi/linux/virtio_pci.h>
>>>>>>>>>>  #include "fuse_i.h"
>>>>>>>>>>  
>>>>>>>>>> -enum {
>>>>>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
>>>>>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
>>>>>>>>>> -};
>>>>>>>>>> -
>>>>>>>>>>  /* List of virtio-fs device instances and a lock for the list */
>>>>>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
>>>>>>>>>>  static LIST_HEAD(virtio_fs_instances);
>>>>>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>>>>>>  	struct dev_pagemap *pgmap;
>>>>>>>>>>  	struct pci_dev *pci_dev;
>>>>>>>>>>  	phys_addr_t phys_addr;
>>>>>>>>>> -	size_t len;
>>>>>>>>>> +	size_t bar_len;
>>>>>>>>>>  	int ret;
>>>>>>>>>>  	u8 have_cache, cache_bar;
>>>>>>>>>>  	u64 cache_offset, cache_len;
>>>>>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>>>>>>>>>>          }
>>>>>>>>>>  
>>>>>>>>>>  	/* TODO handle case where device doesn't expose BAR? */  
>>>>>>>>>
>>>>>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
>>>>>>>>> make it only usable for virtio-pci implementers. Instead, we are going
>>>>>>>>> to export the applicable physical device region directly (e.g.
>>>>>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
>>>>>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
>>>>>>>>> to make eventually use of this.  
>>>>>>>>
>>>>>>>> That makes it a very odd looking PCI device;  I can see that with
>>>>>>>> virtio-pmem it makes some sense, given that its job is to expose
>>>>>>>> arbitrary chunks of memory.
>>>>>>>>
>>>>>>>> Dave  
>>>>>>>
>>>>>>> Well, the fact that you are
>>>>>>>
>>>>>>> - including <uapi/linux/virtio_pci.h>
>>>>>>> - adding pci related code
>>>>>>>
>>>>>>> in/to fs/fuse/virtio_fs.c
>>>>>>>
>>>>>>> tells me that these properties might be better communicated on the
>>>>>>> virtio layer, not on the PCI layer.
>>>>>>>
>>>>>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?  
>>>>>>
>>>>>> No, these need cleaning up; and the split within the bar
>>>>>> is probably going to change to be communicated via virtio layer
>>>>>> rather than pci capabilities.  However, I don't want to make our PCI
>>>>>> device look odd, just to make portability to non-PCI devices - so it's
>>>>>> right to make the split appropriately, but still to use PCI bars
>>>>>> for what they were designed for.
>>>>>>
>>>>>> Dave  
>>>>>
>>>>> Let's discuss after the cleanup. In general I am not convinced this is
>>>>> the right thing to do. Using virtio-pci for anything else than pure
>>>>> transport smells like bad design to me (well, I am no virtio expert
>>>>> after all ;) ). No matter what PCI bars were designed for. If we can't
>>>>> get the same running with e.g. virtio-ccw or virtio-whatever, it is
>>>>> broken by design (or an addon that is tightly glued to virtio-pci, if
>>>>> that is the general idea).  
>>>>
>>>> I'm sure we can find alternatives for virtio-*, so I wouldn't expect
>>>> it to be glued to virtio-pci.
>>>>
>>>> Dave  
>>>
>>> As s390x does not have the concept of memory mapped io (RAM is RAM,
>>> nothing else), this is not architected. virtio-ccw can therefore not
>>> define anything similar like that. However, in virtual environments we
>>> can do whatever we want on top of the pure transport (e.g. on the virtio
>>> layer).
>>>
>>> Conny can correct me if I am wrong.
>>
>> I don't think you're wrong, but I haven't read the code yet and I'm
>> therefore not aware of the purpose of this BAR.
>>
>> Generally, if there is a memory location shared between host and guest,
>> we need a way to communicate its location, which will likely differ
>> between transports. For ccw, I could imagine a new channel command
>> dedicated to exchanging configuration information (similar to what
>> exists today to communicate the locations of virtqueues), but I'd
>> rather not go down this path.
>>
>> Without reading the code/design further, can we use one of the
>> following instead of a BAR:
>> - a virtqueue;
>> - something in config space?
>> That would be implementable by any virtio transport.
> 
> The way I think about this is that we wish to extend the VIRTIO device
> model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> virtio-vhost-user all have requirements for shared memory.
> 
> This seems like a transport-level issue to me.  PCI supports
> memory-mapped I/O and that's the right place to do it.  If you try to
> put it into config space or the virtqueue, you'll end up with something
> that cannot be realized as a PCI device because it bypasses PCI bus
> address translation.
> 
> If CCW needs a side-channel, that's fine.  But that side-channel is a
> CCW-specific mechanism and probably doesn't apply to all other
> transports.
> 
> Stefan
> 

I think the problem is more fundamental. There is no iommu. Whatever
shared region you want to indicate, you want it to be assigned a memory
region in guest physical memory. Like a DIMM/NVDIMM. And this should be
different to the concept of a BAR. Or am I missing something?

I am ok with using whatever other channel to transport such information.
But I believe this is different to a typical BAR. (I wish I knew more
about PCI internals ;) ).

I would also like to know how shared memory works as of now for e.g.
virtio-gpu.
Stefan Hajnoczi Dec. 17, 2018, 11:25 a.m. UTC | #13
On Fri, Dec 14, 2018 at 02:50:58PM +0100, Cornelia Huck wrote:
> On Fri, 14 Dec 2018 13:44:34 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> > On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
> > > On Thu, 13 Dec 2018 13:24:31 +0100
> > > David Hildenbrand <david@redhat.com> wrote:
> > >   
> > > > On 13.12.18 13:15, Dr. David Alan Gilbert wrote:  
> > > > > * David Hildenbrand (david@redhat.com) wrote:    
> > > > >> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:    
> > > > >>> * David Hildenbrand (david@redhat.com) wrote:    
> > > > >>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:    
> > > > >>>>> * David Hildenbrand (david@redhat.com) wrote:    
> > > > >>>>>> On 10.12.18 18:12, Vivek Goyal wrote:    
> > > > >>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> > > > >>>>>>> value from the capabilities.
> > > > >>>>>>>
> > > > >>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > > >>>>>>> ---
> > > > >>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> > > > >>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> > > > >>>>>>>
> > > > >>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > > > >>>>>>> index 60d496c16841..55bac1465536 100644
> > > > >>>>>>> --- a/fs/fuse/virtio_fs.c
> > > > >>>>>>> +++ b/fs/fuse/virtio_fs.c
> > > > >>>>>>> @@ -14,11 +14,6 @@
> > > > >>>>>>>  #include <uapi/linux/virtio_pci.h>
> > > > >>>>>>>  #include "fuse_i.h"
> > > > >>>>>>>  
> > > > >>>>>>> -enum {
> > > > >>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> > > > >>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> > > > >>>>>>> -};
> > > > >>>>>>> -
> > > > >>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> > > > >>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> > > > >>>>>>>  static LIST_HEAD(virtio_fs_instances);
> > > > >>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > > >>>>>>>  	struct dev_pagemap *pgmap;
> > > > >>>>>>>  	struct pci_dev *pci_dev;
> > > > >>>>>>>  	phys_addr_t phys_addr;
> > > > >>>>>>> -	size_t len;
> > > > >>>>>>> +	size_t bar_len;
> > > > >>>>>>>  	int ret;
> > > > >>>>>>>  	u8 have_cache, cache_bar;
> > > > >>>>>>>  	u64 cache_offset, cache_len;
> > > > >>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> > > > >>>>>>>          }
> > > > >>>>>>>  
> > > > >>>>>>>  	/* TODO handle case where device doesn't expose BAR? */    
> > > > >>>>>>
> > > > >>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> > > > >>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> > > > >>>>>> to export the applicable physical device region directly (e.g.
> > > > >>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> > > > >>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> > > > >>>>>> to make eventually use of this.    
> > > > >>>>>
> > > > >>>>> That makes it a very odd looking PCI device;  I can see that with
> > > > >>>>> virtio-pmem it makes some sense, given that its job is to expose
> > > > >>>>> arbitrary chunks of memory.
> > > > >>>>>
> > > > >>>>> Dave    
> > > > >>>>
> > > > >>>> Well, the fact that you are
> > > > >>>>
> > > > >>>> - including <uapi/linux/virtio_pci.h>
> > > > >>>> - adding pci related code
> > > > >>>>
> > > > >>>> in/to fs/fuse/virtio_fs.c
> > > > >>>>
> > > > >>>> tells me that these properties might be better communicated on the
> > > > >>>> virtio layer, not on the PCI layer.
> > > > >>>>
> > > > >>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?    
> > > > >>>
> > > > >>> No, these need cleaning up; and the split within the bar
> > > > >>> is probably going to change to be communicated via virtio layer
> > > > >>> rather than pci capabilities.  However, I don't want to make our PCI
> > > > >>> device look odd, just to make portability to non-PCI devices - so it's
> > > > >>> right to make the split appropriately, but still to use PCI bars
> > > > >>> for what they were designed for.
> > > > >>>
> > > > >>> Dave    
> > > > >>
> > > > >> Let's discuss after the cleanup. In general I am not convinced this is
> > > > >> the right thing to do. Using virtio-pci for anything else than pure
> > > > >> transport smells like bad design to me (well, I am no virtio expert
> > > > >> after all ;) ). No matter what PCI bars were designed for. If we can't
> > > > >> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> > > > >> broken by design (or an addon that is tightly glued to virtio-pci, if
> > > > >> that is the general idea).    
> > > > > 
> > > > > I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> > > > > it to be glued to virtio-pci.
> > > > > 
> > > > > Dave    
> > > > 
> > > > As s390x does not have the concept of memory mapped io (RAM is RAM,
> > > > nothing else), this is not architected. virtio-ccw can therefore not
> > > > define anything similar like that. However, in virtual environments we
> > > > can do whatever we want on top of the pure transport (e.g. on the virtio
> > > > layer).
> > > > 
> > > > Conny can correct me if I am wrong.  
> > > 
> > > I don't think you're wrong, but I haven't read the code yet and I'm
> > > therefore not aware of the purpose of this BAR.
> > > 
> > > Generally, if there is a memory location shared between host and guest,
> > > we need a way to communicate its location, which will likely differ
> > > between transports. For ccw, I could imagine a new channel command
> > > dedicated to exchanging configuration information (similar to what
> > > exists today to communicate the locations of virtqueues), but I'd
> > > rather not go down this path.
> > > 
> > > Without reading the code/design further, can we use one of the
> > > following instead of a BAR:
> > > - a virtqueue;
> > > - something in config space?
> > > That would be implementable by any virtio transport.  
> > 
> > The way I think about this is that we wish to extend the VIRTIO device
> > model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> > virtio-vhost-user all have requirements for shared memory.
> > 
> > This seems like a transport-level issue to me.  PCI supports
> > memory-mapped I/O and that's the right place to do it.  If you try to
> > put it into config space or the virtqueue, you'll end up with something
> > that cannot be realized as a PCI device because it bypasses PCI bus
> > address translation.
> > 
> > If CCW needs a side-channel, that's fine.  But that side-channel is a
> > CCW-specific mechanism and probably doesn't apply to all other
> > transports.
> 
> But virtio-gpu works with ccw right now (I haven't checked what it
> uses); can virtio-fs use an equivalent method?

virtio-gpu does not use shared memory yet but it needs to in the future.

> If there's a more generic case to be made for extending virtio devices
> with a way to handle shared memory, a ccw for that would be fine. I
> just want to avoid adding new ccws for everything as the namespace is
> not infinite.

Yes, virtio-vhost-user needs it too.  I think it makes sense for shared
memory resources to be part of the VIRTIO device model.

Stefan
Stefan Hajnoczi Dec. 17, 2018, 2:56 p.m. UTC | #14
On Mon, Dec 17, 2018 at 11:53:46AM +0100, David Hildenbrand wrote:
> On 14.12.18 14:44, Stefan Hajnoczi wrote:
> > On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:
> >> On Thu, 13 Dec 2018 13:24:31 +0100
> >> David Hildenbrand <david@redhat.com> wrote:
> >>
> >>> On 13.12.18 13:15, Dr. David Alan Gilbert wrote:
> >>>> * David Hildenbrand (david@redhat.com) wrote:  
> >>>>> On 13.12.18 11:00, Dr. David Alan Gilbert wrote:  
> >>>>>> * David Hildenbrand (david@redhat.com) wrote:  
> >>>>>>> On 13.12.18 10:13, Dr. David Alan Gilbert wrote:  
> >>>>>>>> * David Hildenbrand (david@redhat.com) wrote:  
> >>>>>>>>> On 10.12.18 18:12, Vivek Goyal wrote:  
> >>>>>>>>>> Instead of assuming we had the fixed bar for the cache, use the
> >>>>>>>>>> value from the capabilities.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>>>>>>>>> ---
> >>>>>>>>>>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >>>>>>>>>>  1 file changed, 17 insertions(+), 15 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> >>>>>>>>>> index 60d496c16841..55bac1465536 100644
> >>>>>>>>>> --- a/fs/fuse/virtio_fs.c
> >>>>>>>>>> +++ b/fs/fuse/virtio_fs.c
> >>>>>>>>>> @@ -14,11 +14,6 @@
> >>>>>>>>>>  #include <uapi/linux/virtio_pci.h>
> >>>>>>>>>>  #include "fuse_i.h"
> >>>>>>>>>>  
> >>>>>>>>>> -enum {
> >>>>>>>>>> -	/* PCI BAR number of the virtio-fs DAX window */
> >>>>>>>>>> -	VIRTIO_FS_WINDOW_BAR = 2,
> >>>>>>>>>> -};
> >>>>>>>>>> -
> >>>>>>>>>>  /* List of virtio-fs device instances and a lock for the list */
> >>>>>>>>>>  static DEFINE_MUTEX(virtio_fs_mutex);
> >>>>>>>>>>  static LIST_HEAD(virtio_fs_instances);
> >>>>>>>>>> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>>>>>>  	struct dev_pagemap *pgmap;
> >>>>>>>>>>  	struct pci_dev *pci_dev;
> >>>>>>>>>>  	phys_addr_t phys_addr;
> >>>>>>>>>> -	size_t len;
> >>>>>>>>>> +	size_t bar_len;
> >>>>>>>>>>  	int ret;
> >>>>>>>>>>  	u8 have_cache, cache_bar;
> >>>>>>>>>>  	u64 cache_offset, cache_len;
> >>>>>>>>>> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >>>>>>>>>>          }
> >>>>>>>>>>  
> >>>>>>>>>>  	/* TODO handle case where device doesn't expose BAR? */  
> >>>>>>>>>
> >>>>>>>>> For virtio-pmem we decided to not go via BARs as this would effectively
> >>>>>>>>> make it only usable for virtio-pci implementers. Instead, we are going
> >>>>>>>>> to export the applicable physical device region directly (e.g.
> >>>>>>>>> phys_start, phys_size in virtio config), so it is decoupled from PCI
> >>>>>>>>> details. Doing the same for virtio-fs would allow e.g. also virtio-ccw
> >>>>>>>>> to make eventually use of this.  
> >>>>>>>>
> >>>>>>>> That makes it a very odd looking PCI device;  I can see that with
> >>>>>>>> virtio-pmem it makes some sense, given that it's job is to expose
> >>>>>>>> arbitrary chunks of memory.
> >>>>>>>>
> >>>>>>>> Dave  
> >>>>>>>
> >>>>>>> Well, the fact that your are
> >>>>>>>
> >>>>>>> - including <uapi/linux/virtio_pci.h>
> >>>>>>> - adding pci related code
> >>>>>>>
> >>>>>>> in/to fs/fuse/virtio_fs.c
> >>>>>>>
> >>>>>>> tells me that these properties might be better communicated on the
> >>>>>>> virtio layer, not on the PCI layer.
> >>>>>>>
> >>>>>>> Or do you really want to glue virtio-fs to virtio-pci for all eternity?  
> >>>>>>
> >>>>>> No, these need cleaning up; and the split within the bar
> >>>>>> is probably going to change to be communicated via virtio layer
> >>>>>> rather than pci capabilities.  However, I don't want to make our PCI
> >>>>>> device look odd, just to make portability to non-PCI devices - so it's
> >>>>>> right to make the split appropriately, but still to use PCI bars
> >>>>>> for what they were designed for.
> >>>>>>
> >>>>>> Dave  
> >>>>>
> >>>>> Let's discuss after the cleanup. In general I am not convinced this is
> >>>>> the right thing to do. Using virtio-pci for anything else than pure
> >>>>> transport smells like bad design to me (well, I am no virtio expert
> >>>>> after all ;) ). No matter what PCI bars were designed for. If we can't
> >>>>> get the same running with e.g. virtio-ccw or virtio-whatever, it is
> >>>>> broken by design (or an addon that is tightly glued to virtio-pci, if
> >>>>> that is the general idea).  
> >>>>
> >>>> I'm sure we can find alternatives for virtio-*, so I wouldn't expect
> >>>> it to be glued to virtio-pci.
> >>>>
> >>>> Dave  
> >>>
> >>> As s390x does not have the concept of memory mapped io (RAM is RAM,
> >>> nothing else), this is not architectured. virtio-ccw can therefore not
> >>> define anything similar like that. However, in virtual environments we
> >>> can do whatever we want on top of the pure transport (e.g. on the virtio
> >>> layer).
> >>>
> >>> Conny can correct me if I am wrong.
> >>
> >> I don't think you're wrong, but I haven't read the code yet and I'm
> >> therefore not aware of the purpose of this BAR.
> >>
> >> Generally, if there is a memory location shared between host and guest,
> >> we need a way to communicate its location, which will likely differ
> >> between transports. For ccw, I could imagine a new channel command
> >> dedicated to exchanging configuration information (similar to what
> >> exists today to communicate the locations of virtqueues), but I'd
> >> rather not go down this path.
> >>
> >> Without reading the code/design further, can we use one of the
> >> following instead of a BAR:
> >> - a virtqueue;
> >> - something in config space?
> >> That would be implementable by any virtio transport.
> > 
> > The way I think about this is that we wish to extend the VIRTIO device
> > model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> > virtio-vhost-user all have requirements for shared memory.
> > 
> > This seems like a transport-level issue to me.  PCI supports
> > memory-mapped I/O and that's the right place to do it.  If you try to
> > put it into config space or the virtqueue, you'll end up with something
> > that cannot be realized as a PCI device because it bypasses PCI bus
> > address translation.
> > 
> > If CCW needs a side-channel, that's fine.  But that side-channel is a
> > CCW-specific mechanism and probably doesn't apply to all other
> > transports.
> > 
> > Stefan
> > 
> 
> I think the problem is more fundamental. There is no iommu. Whatever
> shared region you want to indicate, you want it to be assigned a memory
> region in guest physical memory. Like a DIMM/NVDIMM. And this should be
> different to the concept of a BAR. Or am I missing something?

If you implement a physical virtio PCI adapter then there is bus
addressing and an IOMMU and VIRTIO has support for that.  I'm not sure I
understand what you mean by "there is no iommu"?

> I am ok with using whatever other channel to transport such information.
> But I believe this is different to a typical BAR. (I wish I knew more
> about PCI internals ;) ).
> 
> I would also like to know how shared memory works as of now for e.g.
> virtio-gpu.

virtio-gpu currently does not use shared memory, it needs it for future
features.

Stefan
Cornelia Huck Dec. 18, 2018, 5:13 p.m. UTC | #15
On Mon, 17 Dec 2018 14:56:38 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Mon, Dec 17, 2018 at 11:53:46AM +0100, David Hildenbrand wrote:
> > On 14.12.18 14:44, Stefan Hajnoczi wrote:  
> > > On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:  
> > >> On Thu, 13 Dec 2018 13:24:31 +0100
> > >> David Hildenbrand <david@redhat.com> wrote:

> > >>> As s390x does not have the concept of memory mapped io (RAM is RAM,
> > >>> nothing else), this is not architectured. virtio-ccw can therefore not
> > >>> define anything similar like that. However, in virtual environments we
> > >>> can do whatever we want on top of the pure transport (e.g. on the virtio
> > >>> layer).
> > >>>
> > >>> Conny can correct me if I am wrong.  
> > >>
> > >> I don't think you're wrong, but I haven't read the code yet and I'm
> > >> therefore not aware of the purpose of this BAR.
> > >>
> > >> Generally, if there is a memory location shared between host and guest,
> > >> we need a way to communicate its location, which will likely differ
> > >> between transports. For ccw, I could imagine a new channel command
> > >> dedicated to exchanging configuration information (similar to what
> > >> exists today to communicate the locations of virtqueues), but I'd
> > >> rather not go down this path.
> > >>
> > >> Without reading the code/design further, can we use one of the
> > >> following instead of a BAR:
> > >> - a virtqueue;
> > >> - something in config space?
> > >> That would be implementable by any virtio transport.  
> > > 
> > > The way I think about this is that we wish to extend the VIRTIO device
> > > model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> > > virtio-vhost-user all have requirements for shared memory.
> > > 
> > > This seems like a transport-level issue to me.  PCI supports
> > > memory-mapped I/O and that's the right place to do it.  If you try to
> > > put it into config space or the virtqueue, you'll end up with something
> > > that cannot be realized as a PCI device because it bypasses PCI bus
> > > address translation.
> > > 
> > > If CCW needs a side-channel, that's fine.  But that side-channel is a
> > > CCW-specific mechanism and probably doesn't apply to all other
> > > transports.
> > > 
> > > Stefan
> > >   
> > 
> > I think the problem is more fundamental. There is no iommu. Whatever
> > shared region you want to indicate, you want it to be assigned a memory
> > region in guest physical memory. Like a DIMM/NVDIMM. And this should be
> > different to the concept of a BAR. Or am I missing something?  
> 
> If you implement a physical virtio PCI adapter then there is bus
> addressing and an IOMMU and VIRTIO has support for that.  I'm not sure I
> understand what you mean by "there is no iommu"?

For ccw, there is no iommu; channel-program translation is doing
similar things. (I hope that is what David meant :)

> 
> > I am ok with using whatever other channel to transport such information.
> > But I believe this is different to a typical BAR. (I wish I knew more
> > about PCI internals ;) ).
> > 
> > I would also like to know how shared memory works as of now for e.g.
> > virtio-gpu.  
> 
> virtio-gpu currently does not use shared memory, it needs it for future
> features.

OK, that all sounds like we need to define a generic, per transport,
device agnostic way to specify shared memory.

Where is that memory situated? Is it something in guest memory (like
virtqueues)? If it is something provided by the device, things will get
tricky for ccw (remember that there's no mmio on s390; pci on s390 uses
special instructions for that.)
David Hildenbrand Dec. 18, 2018, 5:25 p.m. UTC | #16
On 18.12.18 18:13, Cornelia Huck wrote:
> On Mon, 17 Dec 2018 14:56:38 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
>> On Mon, Dec 17, 2018 at 11:53:46AM +0100, David Hildenbrand wrote:
>>> On 14.12.18 14:44, Stefan Hajnoczi wrote:  
>>>> On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:  
>>>>> On Thu, 13 Dec 2018 13:24:31 +0100
>>>>> David Hildenbrand <david@redhat.com> wrote:
> 
>>>>>> As s390x does not have the concept of memory mapped io (RAM is RAM,
>>>>>> nothing else), this is not architectured. vitio-ccw can therefore not
>>>>>> define anything similar like that. However, in virtual environments we
>>>>>> can do whatever we want on top of the pure transport (e.g. on the virtio
>>>>>> layer).
>>>>>>
>>>>>> Conny can correct me if I am wrong.  
>>>>>
>>>>> I don't think you're wrong, but I haven't read the code yet and I'm
>>>>> therefore not aware of the purpose of this BAR.
>>>>>
>>>>> Generally, if there is a memory location shared between host and guest,
>>>>> we need a way to communicate its location, which will likely differ
>>>>> between transports. For ccw, I could imagine a new channel command
>>>>> dedicated to exchanging configuration information (similar to what
>>>>> exists today to communicate the locations of virtqueues), but I'd
>>>>> rather not go down this path.
>>>>>
>>>>> Without reading the code/design further, can we use one of the
>>>>> following instead of a BAR:
>>>>> - a virtqueue;
>>>>> - something in config space?
>>>>> That would be implementable by any virtio transport.  
>>>>
>>>> The way I think about this is that we wish to extend the VIRTIO device
>>>> model with the concept of shared memory.  virtio-fs, virtio-gpu, and
>>>> virtio-vhost-user all have requirements for shared memory.
>>>>
>>>> This seems like a transport-level issue to me.  PCI supports
>>>> memory-mapped I/O and that's the right place to do it.  If you try to
>>>> put it into config space or the virtqueue, you'll end up with something
>>>> that cannot be realized as a PCI device because it bypasses PCI bus
>>>> address translation.
>>>>
>>>> If CCW needs a side-channel, that's fine.  But that side-channel is a
>>>> CCW-specific mechanism and probably doesn't apply to all other
>>>> transports.
>>>>
>>>> Stefan
>>>>   
>>>
>>> I think the problem is more fundamental. There is no iommu. Whatever
>>> shared region you want to indicate, you want it to be assigned a memory
>>> region in guest physical memory. Like a DIMM/NVDIMM. And this should be
>>> different to the concept of a BAR. Or am I missing something?  
>>
>> If you implement a physical virtio PCI adapter then there is bus
>> addressing and an IOMMU and VIRTIO has support for that.  I'm not sure I
>> understand what you mean by "there is no iommu"?
> 
> For ccw, there is no iommu; channel-program translation is doing
> similar things. (I hope that is what David meant :)
> 
>>
>>> I am ok with using whatever other channel to transport such information.
>>> But I believe this is different to a typical BAR. (I wish I knew more
>>> about PCI internals ;) ).
>>>
>>> I would also like to know how shared memory works as of now for e.g.
>>> virtio-gpu.  
>>
>> virtio-gpu currently does not use shared memory, it needs it for future
>> features.
> 
> OK, that all sounds like we need to define a generic, per transport,
> device agnostic way to specify shared memory.
> 
> Where is that memory situated? Is it something in guest memory (like
> virtqueues)? If it is something provided by the device, things will get
> tricky for ccw (remember that there's no mmio on s390; pci on s390 uses
> special instructions for that.)
> 

I am just very very confused right now. What I am struggling with right
now (Stefan, hope you can clarify it for me):

We need some place where this shared memory is located in the guest
physical memory. On x86 - if I am not wrong - this BAR is placed into
the reserved memory area between 3 and 4 GB. There is no such thing on
s390x. Because we don't have IO via memory (yet). All we have is one or
two KVM memory slots filled with all memory.

So what we will need on s390x is on the QEMU side such a reserved memory
region where devices like virtio-fs can reserve a region for shared memory.

So it is something like a dimm/nvdimm except that it is smaller and not
visible to the user directly (via memory backends).
Stefan Hajnoczi Jan. 2, 2019, 10:24 a.m. UTC | #17
On Tue, Dec 18, 2018 at 06:25:27PM +0100, David Hildenbrand wrote:
> On 18.12.18 18:13, Cornelia Huck wrote:
> > On Mon, 17 Dec 2018 14:56:38 +0000
> > Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> >> On Mon, Dec 17, 2018 at 11:53:46AM +0100, David Hildenbrand wrote:
> >>> On 14.12.18 14:44, Stefan Hajnoczi wrote:  
> >>>> On Thu, Dec 13, 2018 at 01:38:23PM +0100, Cornelia Huck wrote:  
> >>>>> On Thu, 13 Dec 2018 13:24:31 +0100
> >>>>> David Hildenbrand <david@redhat.com> wrote:
> > 
> >>>>>> As s390x does not have the concept of memory mapped io (RAM is RAM,
> >>>>>> nothing else), this is not architectured. virtio-ccw can therefore not
> >>>>>> define anything similar like that. However, in virtual environments we
> >>>>>> can do whatever we want on top of the pure transport (e.g. on the virtio
> >>>>>> layer).
> >>>>>>
> >>>>>> Conny can correct me if I am wrong.  
> >>>>>
> >>>>> I don't think you're wrong, but I haven't read the code yet and I'm
> >>>>> therefore not aware of the purpose of this BAR.
> >>>>>
> >>>>> Generally, if there is a memory location shared between host and guest,
> >>>>> we need a way to communicate its location, which will likely differ
> >>>>> between transports. For ccw, I could imagine a new channel command
> >>>>> dedicated to exchanging configuration information (similar to what
> >>>>> exists today to communicate the locations of virtqueues), but I'd
> >>>>> rather not go down this path.
> >>>>>
> >>>>> Without reading the code/design further, can we use one of the
> >>>>> following instead of a BAR:
> >>>>> - a virtqueue;
> >>>>> - something in config space?
> >>>>> That would be implementable by any virtio transport.  
> >>>>
> >>>> The way I think about this is that we wish to extend the VIRTIO device
> >>>> model with the concept of shared memory.  virtio-fs, virtio-gpu, and
> >>>> virtio-vhost-user all have requirements for shared memory.
> >>>>
> >>>> This seems like a transport-level issue to me.  PCI supports
> >>>> memory-mapped I/O and that's the right place to do it.  If you try to
> >>>> put it into config space or the virtqueue, you'll end up with something
> >>>> that cannot be realized as a PCI device because it bypasses PCI bus
> >>>> address translation.
> >>>>
> >>>> If CCW needs a side-channel, that's fine.  But that side-channel is a
> >>>> CCW-specific mechanism and probably doesn't apply to all other
> >>>> transports.
> >>>>
> >>>> Stefan
> >>>>   
> >>>
> >>> I think the problem is more fundamental. There is no iommu. Whatever
> >>> shared region you want to indicate, you want it to be assigned a memory
> >>> region in guest physical memory. Like a DIMM/NVDIMM. And this should be
> >>> different to the concept of a BAR. Or am I missing something?  
> >>
> >> If you implement a physical virtio PCI adapter then there is bus
> >> addressing and an IOMMU and VIRTIO has support for that.  I'm not sure I
> >> understand what you mean by "there is no iommu"?
> > 
> > For ccw, there is no iommu; channel-program translation is doing
> > similar things. (I hope that is what David meant :)
> > 
> >>
> >>> I am ok with using whatever other channel to transport such information.
> >>> But I believe this is different to a typical BAR. (I wish I knew more
> >>> about PCI internals ;) ).
> >>>
> >>> I would also like to know how shared memory works as of now for e.g.
> >>> virtio-gpu.  
> >>
> >> virtio-gpu currently does not use shared memory, it needs it for future
> >> features.
> > 
> > OK, that all sounds like we need to define a generic, per transport,
> > device agnostic way to specify shared memory.
> > 
> > Where is that memory situated? Is it something in guest memory (like
> > virtqueues)? If it is something provided by the device, things will get
> > tricky for ccw (remember that there's no mmio on s390; pci on s390 uses
> > special instructions for that.)
> > 
> 
> I am just very very confused right now. What I am struggling with right
> now (Stefan, hope you can clarify it for me):
> 
> We need some place where this shared memory is located in the guest
> physical memory. On x86 - if I am not wrong - this BAR is placed into
> the reserved memory area between 3 and 4 GB.

Right, the shared memory is provided by the device and does not live in
guest RAM.

> There is no such thing on
> s390x. Because we don't have IO via memory (yet). All we have is one or
> two KVM memory slots filled with all memory.
> 
> So what we will need on s390x is on the QEMU side such a reserved memory
> region where devices like virtio-fs can reserve a region for shared memory.
> 
> So it is something like a dimm/nvdimm except that it is smaller and not
> visible to the user directly (via memory backends).

I see.  That makes sense.

Stefan
Liu Bo March 17, 2019, 12:33 a.m. UTC | #18
On Mon, Dec 10, 2018 at 9:57 AM Vivek Goyal <vgoyal@redhat.com> wrote:
>
> Instead of assuming we had the fixed bar for the cache, use the
> value from the capabilities.
>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
>  1 file changed, 17 insertions(+), 15 deletions(-)
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 60d496c16841..55bac1465536 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -14,11 +14,6 @@
>  #include <uapi/linux/virtio_pci.h>
>  #include "fuse_i.h"
>
> -enum {
> -       /* PCI BAR number of the virtio-fs DAX window */
> -       VIRTIO_FS_WINDOW_BAR = 2,
> -};
> -
>  /* List of virtio-fs device instances and a lock for the list */
>  static DEFINE_MUTEX(virtio_fs_mutex);
>  static LIST_HEAD(virtio_fs_instances);
> @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>         struct dev_pagemap *pgmap;
>         struct pci_dev *pci_dev;
>         phys_addr_t phys_addr;
> -       size_t len;
> +       size_t bar_len;
>         int ret;
>         u8 have_cache, cache_bar;
>         u64 cache_offset, cache_len;
> @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>          }
>
>         /* TODO handle case where device doesn't expose BAR? */
> -       ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> -                                "virtio-fs-window");
> +       ret = pci_request_region(pci_dev, cache_bar, "virtio-fs-window");
>         if (ret < 0) {
>                 dev_err(&vdev->dev, "%s: failed to request window BAR\n",
>                         __func__);
>                 return ret;
>         }
>
> -       phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
> -       len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
> -
>         mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
>         if (!mi)
>                 return -ENOMEM;
> @@ -586,6 +577,17 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>         pgmap->ref = &mi->ref;
>         pgmap->type = MEMORY_DEVICE_FS_DAX;
>
> +       phys_addr = pci_resource_start(pci_dev, cache_bar);
> +       bar_len = pci_resource_len(pci_dev, cache_bar);
> +
> +       if (cache_offset + cache_len > bar_len) {
> +               dev_err(&vdev->dev,
> +                       "%s: cache bar shorter than cap offset+len\n",
> +                       __func__);
> +               return -EINVAL;
> +       }
> +       phys_addr += cache_offset;
> +
>         /* Ideally we would directly use the PCI BAR resource but
>          * devm_memremap_pages() wants its own copy in pgmap.  So
>          * initialize a struct resource from scratch (only the start
> @@ -594,7 +596,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>         pgmap->res = (struct resource){
>                 .name = "virtio-fs dax window",
>                 .start = phys_addr,
> -               .end = phys_addr + len,
> +               .end = phys_addr + cache_len,

Just in case you haven't noticed/fixed this problem, it should be

+ .end = phys_addr + cache_len - 1,

because resource_size() counts %size as "end - start + 1".
The end result of the above is a "conflicting page map" warning when
specifying a second virtio-fs pci device.

I'll send a patch for this, and feel free to take it along with the
patchset if needed.

thanks,
liubo

>         };
>
>         fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
> @@ -607,10 +609,10 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
>                 return ret;
>
>         fs->window_phys_addr = phys_addr;
> -       fs->window_len = len;
> +       fs->window_len = cache_len;
>
> -       dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
> -               __func__, fs->window_kaddr, phys_addr, len);
> +       dev_dbg(&vdev->dev, "%s: cache kaddr 0x%px phys_addr 0x%llx len %llx\n",
> +               __func__, fs->window_kaddr, phys_addr, cache_len);
>
>         fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
>         if (!fs->dax_dev)
> --
> 2.13.6
>
Dr. David Alan Gilbert March 20, 2019, 10:42 a.m. UTC | #19
* Liu Bo (obuil.liubo@gmail.com) wrote:
> On Mon, Dec 10, 2018 at 9:57 AM Vivek Goyal <vgoyal@redhat.com> wrote:
> >
> > Instead of assuming we had the fixed bar for the cache, use the
> > value from the capabilities.
> >
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  fs/fuse/virtio_fs.c | 32 +++++++++++++++++---------------
> >  1 file changed, 17 insertions(+), 15 deletions(-)
> >
> > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> > index 60d496c16841..55bac1465536 100644
> > --- a/fs/fuse/virtio_fs.c
> > +++ b/fs/fuse/virtio_fs.c
> > @@ -14,11 +14,6 @@
> >  #include <uapi/linux/virtio_pci.h>
> >  #include "fuse_i.h"
> >
> > -enum {
> > -       /* PCI BAR number of the virtio-fs DAX window */
> > -       VIRTIO_FS_WINDOW_BAR = 2,
> > -};
> > -
> >  /* List of virtio-fs device instances and a lock for the list */
> >  static DEFINE_MUTEX(virtio_fs_mutex);
> >  static LIST_HEAD(virtio_fs_instances);
> > @@ -518,7 +513,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >         struct dev_pagemap *pgmap;
> >         struct pci_dev *pci_dev;
> >         phys_addr_t phys_addr;
> > -       size_t len;
> > +       size_t bar_len;
> >         int ret;
> >         u8 have_cache, cache_bar;
> >         u64 cache_offset, cache_len;
> > @@ -551,17 +546,13 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >          }
> >
> >         /* TODO handle case where device doesn't expose BAR? */
> > -       ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
> > -                                "virtio-fs-window");
> > +       ret = pci_request_region(pci_dev, cache_bar, "virtio-fs-window");
> >         if (ret < 0) {
> >                 dev_err(&vdev->dev, "%s: failed to request window BAR\n",
> >                         __func__);
> >                 return ret;
> >         }
> >
> > -       phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
> > -       len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
> > -
> >         mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
> >         if (!mi)
> >                 return -ENOMEM;
> > @@ -586,6 +577,17 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >         pgmap->ref = &mi->ref;
> >         pgmap->type = MEMORY_DEVICE_FS_DAX;
> >
> > +       phys_addr = pci_resource_start(pci_dev, cache_bar);
> > +       bar_len = pci_resource_len(pci_dev, cache_bar);
> > +
> > +       if (cache_offset + cache_len > bar_len) {
> > +               dev_err(&vdev->dev,
> > +                       "%s: cache bar shorter than cap offset+len\n",
> > +                       __func__);
> > +               return -EINVAL;
> > +       }
> > +       phys_addr += cache_offset;
> > +
> >         /* Ideally we would directly use the PCI BAR resource but
> >          * devm_memremap_pages() wants its own copy in pgmap.  So
> >          * initialize a struct resource from scratch (only the start
> > @@ -594,7 +596,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >         pgmap->res = (struct resource){
> >                 .name = "virtio-fs dax window",
> >                 .start = phys_addr,
> > -               .end = phys_addr + len,
> > +               .end = phys_addr + cache_len,
> 
> Just in case you haven't noticed/fixed this problem, it should be
> 
> + .end = phys_addr + cache_len - 1,
> 
> because resource_size() counts %size as "end - start + 1".
> The end result of the above is a "conflicting page map" warning when
> specifying a second virtio-fs pci device.

Thanks for spotting this! I think we'd seen that message once but not
noticed where from.

> I'll send a patch for this, and feel free to take it along with the
> patchset if needed.
> 

Dave

> thanks,
> liubo
> 
> >         };
> >
> >         fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
> > @@ -607,10 +609,10 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
> >                 return ret;
> >
> >         fs->window_phys_addr = phys_addr;
> > -       fs->window_len = len;
> > +       fs->window_len = cache_len;
> >
> > -       dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
> > -               __func__, fs->window_kaddr, phys_addr, len);
> > +       dev_dbg(&vdev->dev, "%s: cache kaddr 0x%px phys_addr 0x%llx len %llx\n",
> > +               __func__, fs->window_kaddr, phys_addr, cache_len);
> >
> >         fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
> >         if (!fs->dax_dev)
> > --
> > 2.13.6
> >
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 60d496c16841..55bac1465536 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -14,11 +14,6 @@ 
 #include <uapi/linux/virtio_pci.h>
 #include "fuse_i.h"
 
-enum {
-	/* PCI BAR number of the virtio-fs DAX window */
-	VIRTIO_FS_WINDOW_BAR = 2,
-};
-
 /* List of virtio-fs device instances and a lock for the list */
 static DEFINE_MUTEX(virtio_fs_mutex);
 static LIST_HEAD(virtio_fs_instances);
@@ -518,7 +513,7 @@  static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 	struct dev_pagemap *pgmap;
 	struct pci_dev *pci_dev;
 	phys_addr_t phys_addr;
-	size_t len;
+	size_t bar_len;
 	int ret;
 	u8 have_cache, cache_bar;
 	u64 cache_offset, cache_len;
@@ -551,17 +546,13 @@  static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
         }
 
 	/* TODO handle case where device doesn't expose BAR? */
-	ret = pci_request_region(pci_dev, VIRTIO_FS_WINDOW_BAR,
-				 "virtio-fs-window");
+	ret = pci_request_region(pci_dev, cache_bar, "virtio-fs-window");
 	if (ret < 0) {
 		dev_err(&vdev->dev, "%s: failed to request window BAR\n",
 			__func__);
 		return ret;
 	}
 
-	phys_addr = pci_resource_start(pci_dev, VIRTIO_FS_WINDOW_BAR);
-	len = pci_resource_len(pci_dev, VIRTIO_FS_WINDOW_BAR);
-
 	mi = devm_kzalloc(&pci_dev->dev, sizeof(*mi), GFP_KERNEL);
 	if (!mi)
 		return -ENOMEM;
@@ -586,6 +577,17 @@  static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 	pgmap->ref = &mi->ref;
 	pgmap->type = MEMORY_DEVICE_FS_DAX;
 
+	phys_addr = pci_resource_start(pci_dev, cache_bar);
+	bar_len = pci_resource_len(pci_dev, cache_bar);
+
+	if (cache_offset + cache_len > bar_len) {
+		dev_err(&vdev->dev,
+			"%s: cache bar shorter than cap offset+len\n",
+			__func__);
+		return -EINVAL;
+	}
+	phys_addr += cache_offset;
+
 	/* Ideally we would directly use the PCI BAR resource but
 	 * devm_memremap_pages() wants its own copy in pgmap.  So
 	 * initialize a struct resource from scratch (only the start
@@ -594,7 +596,7 @@  static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 	pgmap->res = (struct resource){
 		.name = "virtio-fs dax window",
 		.start = phys_addr,
-		.end = phys_addr + len,
+		.end = phys_addr + cache_len,
 	};
 
 	fs->window_kaddr = devm_memremap_pages(&pci_dev->dev, pgmap);
@@ -607,10 +609,10 @@  static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 		return ret;
 
 	fs->window_phys_addr = phys_addr;
-	fs->window_len = len;
+	fs->window_len = cache_len;
 
-	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len %zu\n",
-		__func__, fs->window_kaddr, phys_addr, len);
+	dev_dbg(&vdev->dev, "%s: cache kaddr 0x%px phys_addr 0x%llx len %llx\n",
+		__func__, fs->window_kaddr, phys_addr, cache_len);
 
 	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops);
 	if (!fs->dax_dev)