
[RFC,04/12] s390/cio: introduce cio DMA pool

Message ID 20190404231622.52531-5-pasic@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series s390: virtio: support protected virtualization | expand

Commit Message

Halil Pasic April 4, 2019, 11:16 p.m. UTC
To support protected virtualization cio will need to make sure the
memory used for communication with the hypervisor is DMA memory.

Let us introduce a DMA pool to cio that will help us in allocating
deallocating those chunks of memory.

We use a gen_pool backed with DMA pages to avoid each allocation
effectively wasting a page, as we typically allocate much less
than PAGE_SIZE.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
---
 arch/s390/Kconfig           |  1 +
 arch/s390/include/asm/cio.h |  3 +++
 drivers/s390/cio/css.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
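
As a usage illustration (not part of the patch), a cio-internal caller of the two new helpers could look roughly like the sketch below; the choice of a struct ccw1 as the allocated object is only an example:

#include <asm/cio.h>

/* Hypothetical caller: grab a small chunk of zeroed DMA memory from
 * the cio pool, use it for a channel program, then give it back. */
static int example_use_cio_dma_pool(void)
{
	struct ccw1 *ccw = cio_dma_zalloc(sizeof(*ccw));

	if (!ccw)
		return -ENOMEM;
	/* ... fill in the CCW and start the I/O ... */
	cio_dma_free(ccw, sizeof(*ccw));
	return 0;
}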

Comments

Cornelia Huck April 9, 2019, 10:44 a.m. UTC | #1
On Fri,  5 Apr 2019 01:16:14 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> To support protected virtualization cio will need to make sure the
> memory used for communication with the hypervisor is DMA memory.
> 
> Let us introduce a DMA pool to cio that will help us in allocating

missing 'and'

> deallocating those chunks of memory.
> 
> We use a gen_pool backed with DMA pages to avoid each allocation
> effectively wasting a page, as we typically allocate much less
> than PAGE_SIZE.
> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> ---
>  arch/s390/Kconfig           |  1 +
>  arch/s390/include/asm/cio.h |  3 +++
>  drivers/s390/cio/css.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 61 insertions(+)
> 
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 46c69283a67b..e8099ab47368 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -194,6 +194,7 @@ config S390
>  	select VIRT_TO_BUS
>  	select HAVE_NMI
>  	select SWIOTLB
> +	select GENERIC_ALLOCATOR
>  
>  
>  config SCHED_OMIT_FRAME_POINTER
> diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
> index 1727180e8ca1..4510e418614a 100644
> --- a/arch/s390/include/asm/cio.h
> +++ b/arch/s390/include/asm/cio.h
> @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask)
>  void channel_subsystem_reinit(void);
>  extern void css_schedule_reprobe(void);
>  
> +extern void *cio_dma_zalloc(size_t size);
> +extern void cio_dma_free(void *cpu_addr, size_t size);
> +
>  /* Function from drivers/s390/cio/chsc.c */
>  int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
>  int chsc_sstpi(void *page, void *result, size_t size);
> diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
> index aea502922646..72629d99d8e4 100644
> --- a/drivers/s390/cio/css.c
> +++ b/drivers/s390/cio/css.c
> @@ -20,6 +20,8 @@
>  #include <linux/reboot.h>
>  #include <linux/suspend.h>
>  #include <linux/proc_fs.h>
> +#include <linux/genalloc.h>
> +#include <linux/dma-mapping.h>
>  #include <asm/isc.h>
>  #include <asm/crw.h>
>  
> @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = {
>  	NULL,
>  };
>  
> +static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
> +
>  static int __init setup_css(int nr)
>  {
>  	struct channel_subsystem *css;
> @@ -899,6 +903,9 @@ static int __init setup_css(int nr)
>  	dev_set_name(&css->device, "css%x", nr);
>  	css->device.groups = cssdev_attr_groups;
>  	css->device.release = channel_subsystem_release;
> +	/* some cio DMA memory needs to be 31 bit addressable */
> +	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
> +	css->device.dma_mask = &css_dev_dma_mask;

Question: Does this percolate down to the child devices eventually?
E.g., you have a ccw_device getting the mask from its parent
subchannel, which gets it from its parent css? If so, does that clash
with the mask you used for the virtio_ccw_device in a previous
patch? Or are they two really different things?

>  
>  	mutex_init(&css->mutex);
>  	css->cssid = chsc_get_cssid(nr);
> @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = {
>  	.notifier_call = css_power_event,
>  };
>  
> +#define POOL_INIT_PAGES 1
> +static struct gen_pool *cio_dma_pool;
> +/* Currently cio supports only a single css */
> +static struct device *cio_dma_css;

That global variable feels wrong, especially if you plan to support
MCSS-E in the future. (Do you? :) If yes, should the dma pool be global
or per-css? As css0 currently is the root device for the channel
subsystem stuff, you'd either need a new parent to hang this off from
or size this with the number of css images.)

For now, just grab channel_subsystems[0]->device directly?

> +static gfp_t  cio_dma_flags;
> +
> +static void __init cio_dma_pool_init(void)
> +{
> +	void *cpu_addr;
> +	dma_addr_t dma_addr;
> +	int i;
> +
> +	cio_dma_css = &channel_subsystems[0]->device;
> +	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
> +	cio_dma_pool = gen_pool_create(3, -1);
> +	/* No need to free up the resources: compiled in */
> +	for (i = 0; i < POOL_INIT_PAGES; ++i) {
> +		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
> +					      cio_dma_flags);
> +		if (!cpu_addr)
> +			return;
> +		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
> +				  dma_addr, PAGE_SIZE, -1);
> +	}
> +
> +}
> +
> +void *cio_dma_zalloc(size_t size)
> +{
> +	dma_addr_t dma_addr;
> +	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
> +
> +	if (!addr) {
> +		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
> +					PAGE_SIZE, &dma_addr, cio_dma_flags);
> +		if (!addr)
> +			return NULL;
> +		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
> +		addr = gen_pool_alloc(cio_dma_pool, size);
> +	}
> +	return (void *) addr;

At this point, you're always going via the css0 device. I'm wondering
whether you should pass in the cssid here and use css_by_id(cssid) to
make this future proof. But then, I'm not quite clear from which
context this will be called.

> +}
> +
> +void cio_dma_free(void *cpu_addr, size_t size)
> +{
> +	memset(cpu_addr, 0, size);
> +	gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size);
> +}
> +
>  /*
>   * Now that the driver core is running, we can setup our channel subsystem.
>   * The struct subchannel's are created during probing.
> @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void)
>  		unregister_reboot_notifier(&css_reboot_notifier);
>  		goto out_unregister;
>  	}
> +	cio_dma_pool_init();
>  	css_init_done = 1;
>  
>  	/* Enable default isc for I/O subchannels. */
Halil Pasic April 9, 2019, 12:11 p.m. UTC | #2
On Tue, 9 Apr 2019 12:44:58 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

> On Fri,  5 Apr 2019 01:16:14 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > To support protected virtualization cio will need to make sure the
> > memory used for communication with the hypervisor is DMA memory.
> > 
> > Let us introduce a DMA pool to cio that will help us in allocating
> 
> missing 'and'
> 

Nod.

> > deallocating those chunks of memory.
> > 
> > We use a gen_pool backed with DMA pages to avoid each allocation
> > effectively wasting a page, as we typically allocate much less
> > than PAGE_SIZE.
> > 
> > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > ---
> >  arch/s390/Kconfig           |  1 +
> >  arch/s390/include/asm/cio.h |  3 +++
> >  drivers/s390/cio/css.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 61 insertions(+)
> > 
> > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> > index 46c69283a67b..e8099ab47368 100644
> > --- a/arch/s390/Kconfig
> > +++ b/arch/s390/Kconfig
> > @@ -194,6 +194,7 @@ config S390
> >  	select VIRT_TO_BUS
> >  	select HAVE_NMI
> >  	select SWIOTLB
> > +	select GENERIC_ALLOCATOR
> >  
> >  
> >  config SCHED_OMIT_FRAME_POINTER
> > diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
> > index 1727180e8ca1..4510e418614a 100644
> > --- a/arch/s390/include/asm/cio.h
> > +++ b/arch/s390/include/asm/cio.h
> > @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask)
> >  void channel_subsystem_reinit(void);
> >  extern void css_schedule_reprobe(void);
> >  
> > +extern void *cio_dma_zalloc(size_t size);
> > +extern void cio_dma_free(void *cpu_addr, size_t size);
> > +
> >  /* Function from drivers/s390/cio/chsc.c */
> >  int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
> >  int chsc_sstpi(void *page, void *result, size_t size);
> > diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
> > index aea502922646..72629d99d8e4 100644
> > --- a/drivers/s390/cio/css.c
> > +++ b/drivers/s390/cio/css.c
> > @@ -20,6 +20,8 @@
> >  #include <linux/reboot.h>
> >  #include <linux/suspend.h>
> >  #include <linux/proc_fs.h>
> > +#include <linux/genalloc.h>
> > +#include <linux/dma-mapping.h>
> >  #include <asm/isc.h>
> >  #include <asm/crw.h>
> >  
> > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = {
> >  	NULL,
> >  };
> >  
> > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
> > +
> >  static int __init setup_css(int nr)
> >  {
> >  	struct channel_subsystem *css;
> > @@ -899,6 +903,9 @@ static int __init setup_css(int nr)
> >  	dev_set_name(&css->device, "css%x", nr);
> >  	css->device.groups = cssdev_attr_groups;
> >  	css->device.release = channel_subsystem_release;
> > +	/* some cio DMA memory needs to be 31 bit addressable */
> > +	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
> > +	css->device.dma_mask = &css_dev_dma_mask;
> 
> Question: Does this percolate down to the child devices eventually?
> E.g., you have a ccw_device getting the mask from its parent
> subchannel, which gets it from its parent css? If so, does that clash
> with the mask you used for the virtio_ccw_device in a previous
> patch? Or are they two really different things?
> 

AFAIK it does not percolate. I could not find anything pointing in this
direction in the driver core at least. AFAIU dma_mask is also about the
addressing limitations of the device itself (in the good old pci world,
not really applicable for ccw devices).

Regarding virtio_ccw, I need to set the DMA stuff on the parent device
because that is how the stuff in virtio_ring.c works. There we can
get away with DMA_BIT_MASK(64) as that stuff is not used in places where
31 bit addresses are required. For the actual ccw stuff I used the
cio DMA pool introduced here.

FWIW the allocation mechanisms provided by the DMA API (as documented) are
not very well suited to what we need for ccw. This is why I chose to
do this gen_pool thing. The gist of it is that, as is, at the moment we get
page granularity allocations from the DMA API. Of course we could use
dma_pool which is kind of perfect for uniformly sized objects. As you
will see in the rest of the series, we have a comparatively small number
of smallish objects of varying sizes. And some of these are short lived.

So the child devices like subchannels and ccw devices do not use DMA API
directly, except for virtio_ccw.

Does all that make any sense to you?


> >  
> >  	mutex_init(&css->mutex);
> >  	css->cssid = chsc_get_cssid(nr);
> > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = {
> >  	.notifier_call = css_power_event,
> >  };
> >  
> > +#define POOL_INIT_PAGES 1
> > +static struct gen_pool *cio_dma_pool;
> > +/* Currently cio supports only a single css */
> > +static struct device *cio_dma_css;
> 
> That global variable feels wrong, especially if you plan to support
> MCSS-E in the future. (Do you? :) 

Not that I'm aware of any plans to add support for MCSS-E.

> If yes, should the dma pool be global
> or per-css? As css0 currently is the root device for the channel
> subsystem stuff, you'd either need a new parent to hang this off from
> or size this with the number of css images.)
> 
> For now, just grab channel_subsystems[0]->device directly?
> 

Patch 6 gets rid of this variable and adds an accessor instead:

+struct device *cio_get_dma_css_dev(void)
+{
+	return &channel_subsystems[0]->device;
+}
+

I can move that here if you like (for v1).

Yes it is kind of unfortunate that some pieces of this code look
like there could be more than one css, but actually MCSS-E is not
supported. I would prefer sorting these problems out when they arise, if
they ever arise.

> > +static gfp_t  cio_dma_flags;
> > +
> > +static void __init cio_dma_pool_init(void)
> > +{
> > +	void *cpu_addr;
> > +	dma_addr_t dma_addr;
> > +	int i;
> > +
> > +	cio_dma_css = &channel_subsystems[0]->device;
> > +	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
> > +	cio_dma_pool = gen_pool_create(3, -1);
> > +	/* No need to free up the resources: compiled in */
> > +	for (i = 0; i < POOL_INIT_PAGES; ++i) {
> > +		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
> > +					      cio_dma_flags);
> > +		if (!cpu_addr)
> > +			return;
> > +		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
> > +				  dma_addr, PAGE_SIZE, -1);
> > +	}
> > +
> > +}
> > +
> > +void *cio_dma_zalloc(size_t size)
> > +{
> > +	dma_addr_t dma_addr;
> > +	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
> > +
> > +	if (!addr) {
> > +		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
> > +					PAGE_SIZE, &dma_addr, cio_dma_flags);
> > +		if (!addr)
> > +			return NULL;
> > +		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
> > +		addr = gen_pool_alloc(cio_dma_pool, size);
> > +	}
> > +	return (void *) addr;
> 
> At this point, you're always going via the css0 device. I'm wondering
> whether you should pass in the cssid here and use css_by_id(cssid) to
> make this future proof. But then, I'm not quite clear from which
> context this will be called.

As said before I don't see MCSS-E coming. Regarding the client code,
please check out the rest of the series. Especially patch 6. 

From my perspective it would be at this stage saner to make it more
obvious that only one css is supported (at the moment), than to implement
MCSS-E, or to make this (kind of) MCSS-E ready.

Regards,
Halil

> 
> > +}
> > +
> > +void cio_dma_free(void *cpu_addr, size_t size)
> > +{
> > +	memset(cpu_addr, 0, size);
> > +	gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size);
> > +}
> > +
> >  /*
> >   * Now that the driver core is running, we can setup our channel subsystem.
> >   * The struct subchannel's are created during probing.
> > @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void)
> >  		unregister_reboot_notifier(&css_reboot_notifier);
> >  		goto out_unregister;
> >  	}
> > +	cio_dma_pool_init();
> >  	css_init_done = 1;
> >  
> >  	/* Enable default isc for I/O subchannels. */
>
Cornelia Huck April 9, 2019, 5:14 p.m. UTC | #3
On Tue, 9 Apr 2019 14:11:14 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 12:44:58 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri,  5 Apr 2019 01:16:14 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:

> > > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = {
> > >  	NULL,
> > >  };
> > >  
> > > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
> > > +
> > >  static int __init setup_css(int nr)
> > >  {
> > >  	struct channel_subsystem *css;
> > > @@ -899,6 +903,9 @@ static int __init setup_css(int nr)
> > >  	dev_set_name(&css->device, "css%x", nr);
> > >  	css->device.groups = cssdev_attr_groups;
> > >  	css->device.release = channel_subsystem_release;
> > > +	/* some cio DMA memory needs to be 31 bit addressable */
> > > +	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
> > > +	css->device.dma_mask = &css_dev_dma_mask;  
> > 
> > Question: Does this percolate down to the child devices eventually?
> > E.g., you have a ccw_device getting the mask from its parent
> > subchannel, which gets it from its parent css? If so, does that clash
> > with the mask you used for the virtio_ccw_device in a previous
> > patch? Or are they two really different things?
> >   
> 
> AFAIK it does not percolate. I could not find anything pointing in this
> direction in the driver core at least. AFAIU dma_mask is also about the
> addressing limitations of the device itself (in the good old pci world,
> not really applicable for ccw devices).
> 
> Regarding virtio_ccw, I need to set the DMA stuff on the parent device
> because that is how the stuff in virtio_ring.c works. There we can
> get away with DMA_BIT_MASK(64) as that stuff is not used in places where
> 31 bit addresses are required. For the actual ccw stuff I used the
> cio DMA pool introduced here.

Ok, so those are two different users then.

> 
> FWIW the allocation mechanisms provided by the DMA API (as documented) are
> not very well suited to what we need for ccw. This is why I chose to
> do this gen_pool thing. The gist of it is that, as is, at the moment we get
> page granularity allocations from the DMA API. Of course we could use
> dma_pool which is kind of perfect for uniformly sized objects. As you
> will see in the rest of the series, we have a comparatively small number
> of smallish objects of varying sizes. And some of these are short lived.
> 
> So the child devices like subchannels and ccw devices do not use DMA API
> directly, except for virtio_ccw.
> 
> Does all that make any sense to you?

I think I need to read more of the series, but that should be enough
info to get me started :)

> 
> 
> > >  
> > >  	mutex_init(&css->mutex);
> > >  	css->cssid = chsc_get_cssid(nr);
> > > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = {
> > >  	.notifier_call = css_power_event,
> > >  };
> > >  
> > > +#define POOL_INIT_PAGES 1
> > > +static struct gen_pool *cio_dma_pool;
> > > +/* Currently cio supports only a single css */
> > > +static struct device *cio_dma_css;  
> > 
> > That global variable feels wrong, especially if you plan to support
> > MCSS-E in the future. (Do you? :)   
> 
> Not that I'm aware of any plans to add support for MCSS-E.
> 
> > If yes, should the dma pool be global
> > or per-css? As css0 currently is the root device for the channel
> > subsystem stuff, you'd either need a new parent to hang this off from
> > or size this with the number of css images.)
> > 
> > For now, just grab channel_subsystems[0]->device directly?
> >   
> 
> Patch 6 gets rid of this variable and adds an accessor instead:
> 
> +struct device *cio_get_dma_css_dev(void)
> +{
> +	return &channel_subsystems[0]->device;
> +}
> +
> 
> I can move that here if you like (for v1).

An accessor looks more sane than a global variable, yes.

> 
> Yes it is kind of unfortunate that some pieces of this code look
> like there could be more than one css, but actually MCSS-E is not
> supported. I would prefer sorting these problems out when they arise, if
> they ever arise.

As long as it's not too unreasonable, I think we should keep the
infrastructure for multiple css instances in place.

> 
> > > +static gfp_t  cio_dma_flags;
> > > +
> > > +static void __init cio_dma_pool_init(void)
> > > +{
> > > +	void *cpu_addr;
> > > +	dma_addr_t dma_addr;
> > > +	int i;
> > > +
> > > +	cio_dma_css = &channel_subsystems[0]->device;
> > > +	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
> > > +	cio_dma_pool = gen_pool_create(3, -1);
> > > +	/* No need to free up the resources: compiled in */
> > > +	for (i = 0; i < POOL_INIT_PAGES; ++i) {
> > > +		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
> > > +					      cio_dma_flags);
> > > +		if (!cpu_addr)
> > > +			return;
> > > +		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
> > > +				  dma_addr, PAGE_SIZE, -1);
> > > +	}
> > > +
> > > +}
> > > +
> > > +void *cio_dma_zalloc(size_t size)
> > > +{
> > > +	dma_addr_t dma_addr;
> > > +	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
> > > +
> > > +	if (!addr) {
> > > +		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
> > > +					PAGE_SIZE, &dma_addr, cio_dma_flags);
> > > +		if (!addr)
> > > +			return NULL;
> > > +		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
> > > +		addr = gen_pool_alloc(cio_dma_pool, size);
> > > +	}
> > > +	return (void *) addr;  
> > 
> > At this point, you're always going via the css0 device. I'm wondering
> > whether you should pass in the cssid here and use css_by_id(cssid) to
> > make this future proof. But then, I'm not quite clear from which
> > context this will be called.  
> 
> As said before I don't see MCSS-E coming. Regarding the client code,
> please check out the rest of the series. Especially patch 6. 
> 
> From my perspective it would be at this stage saner to make it more
> obvious that only one css is supported (at the moment), than to implement
> MCSS-E, or to make this (kind of) MCSS-E ready.

I disagree. I think there's value in keeping the interfaces clean
(within reasonable bounds, of course.) Even if there is no
implementation of MCSS-E other than QEMU... it is probably a good idea
to spend some brain cycles to make this conceptually clean.
Halil Pasic April 10, 2019, 3:31 p.m. UTC | #4
On Tue, 9 Apr 2019 19:14:53 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

[..]

> > > At this point, you're always going via the css0 device. I'm
> > > wondering whether you should pass in the cssid here and use
> > > css_by_id(cssid) to make this future proof. But then, I'm not
> > > quite clear from which context this will be called.    
> > 
> > As said before I don't see MCSS-E coming. Regarding the client code,
> > please check out the rest of the series. Especially patch 6. 
> > 
> > From my perspective it would be at this stage saner to make it more
> > obvious that only one css is supported (at the moment), than to
> > implement MCSS-E, or to make this (kind of) MCSS-E ready.  
> 
> I disagree. I think there's value in keeping the interfaces clean
> (within reasonable bounds, of course.) Even if there is no
> implementation of MCSS-E other than QEMU... it is probably a good idea
> to spend some brain cycles to make this conceptually clean.

AFAIU Linux currently does not support MCSS-E. I don't have the
bandwidth to implement MCSS-E support in the kernel.

I fully agree that external interfaces should be MCSS-E conform, so
should we ever decide to implement it we don't have to overcome self-made
obstacles.

Kernel internal stuff however, IMHO, should avoid carrying the ballast of
a 20%-30% implemented MCSS-E support. I see no benefit.

But I don't insist. If you have a good idea how to make this more MCSS-E
conform, you are welcome. I think something like this is best done on
top of a series that provides a basic working solution. Especially if the
conceptually clean thing is conceptually or code-wise more complex than
the basic solution. If you have something in mind that is simpler and
more lightweight than what I did here, I would be more than happy to make
that happen.

Regards,
Halil
Cornelia Huck April 10, 2019, 4:07 p.m. UTC | #5
On Wed, 10 Apr 2019 17:31:48 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 19:14:53 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> [..]
> 
> > > > At this point, you're always going via the css0 device. I'm
> > > > wondering whether you should pass in the cssid here and use
> > > > css_by_id(cssid) to make this future proof. But then, I'm not
> > > > quite clear from which context this will be called.      
> > > 
> > > As said before I don't see MCSS-E coming. Regarding the client code,
> > > please check out the rest of the series. Especially patch 6. 
> > > 
> > > From my perspective it would be at this stage saner to make it more
> > > obvious that only one css is supported (at the moment), than to
> > > implement MCSS-E, or to make this (kind of) MCSS-E ready.    
> > 
> > I disagree. I think there's value in keeping the interfaces clean
> > (within reasonable bounds, of course.) Even if there is no
> > implementation of MCSS-E other than QEMU... it is probably a good idea
> > to spend some brain cycles to make this conceptually clean.  
> 
> AFAIU Linux currently does not support MCSS-E. I don't have the
> bandwidth to implement MCSS-E support in the kernel.
> 
> I fully agree that external interfaces should be MCSS-E conform, so
> should we ever decide to implement it we don't have to overcome self-made
> obstacles.
> 
> Kernel internal stuff however, IMHO, should avoid carrying the ballast of
> a 20%-30% implemented MCSS-E support. I see no benefit.
> 
> But I don't insist. If you have a good idea how to make this more MCSS-E
> conform, you are welcome. I think something like this is best done on
> top of a series that provides a basic working solution. Especially if the
> conceptually clean thing is conceptually or code-wise more complex than
> the basic solution. If you have something in mind that is simpler and
> more lightweight than what I did here, I would be more than happy to make
> that happen.

I haven't asked for adding complete support for MCSS-E, just to keep it
in the back of our minds...

Back to the issue at hand. You're currently hard-wiring this to the
css0 device. My question is: Does it make sense to have a per-css area?
From my limited perspective, I think it does (more css images would
mean more devices, and just adding smaller areas per-css is probably
easier than adding one big area scaling with the number of css images).
In that case, what about tacking the area to the actual css object,
instead of having the global variable? Should be an easy change, and
feels cleaner.
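
For illustration, the suggestion boils down to something like this sketch (the dma_pool member is hypothetical and not part of any posted patch):

/* Sketch only: keep the pool in the css object rather than in a
 * file-scope variable. */
struct channel_subsystem {
	/* ... existing members, including struct device device ... */
	struct gen_pool *dma_pool;	/* per-css DMA pool instead of a global */
};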
Halil Pasic April 10, 2019, 4:52 p.m. UTC | #6
On Wed, 10 Apr 2019 18:07:19 +0200
Cornelia Huck <cohuck@redhat.com> wrote:

> On Wed, 10 Apr 2019 17:31:48 +0200
> Halil Pasic <pasic@linux.ibm.com> wrote:
> 
> > On Tue, 9 Apr 2019 19:14:53 +0200
> > Cornelia Huck <cohuck@redhat.com> wrote:
> > 
> > [..]
> > 
> > > > > At this point, you're always going via the css0 device. I'm
> > > > > wondering whether you should pass in the cssid here and use
> > > > > css_by_id(cssid) to make this future proof. But then, I'm not
> > > > > quite clear from which context this will be called.      
> > > > 
> > > > As said before I don't see MCSS-E coming. Regarding the client code,
> > > > please check out the rest of the series. Especially patch 6. 
> > > > 
> > > > From my perspective it would be at this stage saner to make it more
> > > > obvious that only one css is supported (at the moment), than to
> > > > implement MCSS-E, or to make this (kind of) MCSS-E ready.    
> > > 
> > > I disagree. I think there's value in keeping the interfaces clean
> > > (within reasonable bounds, of course.) Even if there is no
> > > implementation of MCSS-E other than QEMU... it is probably a good idea
> > > to spend some brain cycles to make this conceptually clean.  
> > 
> > AFAIU Linux currently does not support MCSS-E. I don't have the
> > bandwidth to implement MCSS-E support in the kernel.
> > 
> > I fully agree that external interfaces should be MCSS-E conform, so
> > should we ever decide to implement it we don't have to overcome self-made
> > obstacles.
> > 
> > Kernel internal stuff however, IMHO, should avoid carrying the ballast of
> > a 20%-30% implemented MCSS-E support. I see no benefit.
> > 
> > But I don't insist. If you have a good idea how to make this more MCSS-E
> > conform, you are welcome. I think something like this is best done on
> > top of a series that provides a basic working solution. Especially if the
> > conceptually clean thing is conceptually or code-wise more complex than
> > the basic solution. If you have something in mind that is simpler and
> > more lightweight than what I did here, I would be more than happy to make
> > that happen.
> 
> I haven't asked for adding complete support for MCSS-E, just to keep it
> in the back of our minds...

I will, I promise ;)!

> 
> Back to the issue at hand. You're currently hard-wiring this to the
> css0 device. My question is: Does it make sense to have a per-css area?

What do you mean by area?

> From my limited perspective, I think it does (more css images would
> mean more devices, and just adding smaller areas per-css is probably
> easier than adding one big area scaling with the number of css images).

Currently all devices are hooked up to css0. This is also hard-wired:

int css_register_subchannel(struct subchannel *sch)
{
	int ret;

	/* Initialize the subchannel structure */
	sch->dev.parent = &channel_subsystems[0]->device;

I think hard-wiring the DMA properties of the CSS to
channel_subsystems[0] is consistent with prior art.

> In that case, what about tacking the area to the actual css object,
> instead of having the global variable? Should be an easy change, and
> feels cleaner.
> 

I already got rid of the global variable. Instead we have an accessor
now.

My problem with a gen_pool per css approach is the airq iv vectors.
Those vectors are currently shared between devices (at least for
virtio-ccw), and those vectors must be allocated as shared (DMA) memory.
Please take another look at patch #6. Would you like 
airq_iv_create(unsigned long bits, unsigned long flags) to take
a struct channel_subsystem argument?
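
In other words (just to make the question concrete, not a proposal from any patch), the current prototype

struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);

would grow an owner argument along these lines:

/* hypothetical variant, sketch only */
struct airq_iv *airq_iv_create(struct channel_subsystem *css,
			       unsigned long bits, unsigned long flags);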

Regards,
Halil
Sebastian Ott April 11, 2019, 6:25 p.m. UTC | #7
On Fri, 5 Apr 2019, Halil Pasic wrote:
> To support protected virtualization cio will need to make sure the
> memory used for communication with the hypervisor is DMA memory.
> 
> Let us introduce a DMA pool to cio that will help us in allocating
> deallocating those chunks of memory.
> 
> We use a gen_pool backed with DMA pages to avoid each allocation
> effectively wasting a page, as we typically allocate much less
> than PAGE_SIZE.

I don't think we should use this global DMA pool. I guess it's OK for
stuff like airq (where we don't have a struct device at hand) but for
CCW we should use the device we have. Yes, this way we waste some memory
but all dma memory a device uses should fit in a page - so the wastage
is not too much.

Sebastian
Halil Pasic April 12, 2019, 11:20 a.m. UTC | #8
On Thu, 11 Apr 2019 20:25:01 +0200 (CEST)
Sebastian Ott <sebott@linux.ibm.com> wrote:

> On Fri, 5 Apr 2019, Halil Pasic wrote:
> > To support protected virtualization cio will need to make sure the
> > memory used for communication with the hypervisor is DMA memory.
> > 
> > Let us introduce a DMA pool to cio that will help us in allocating
> > deallocating those chunks of memory.
> > 
> > We use a gen_pool backed with DMA pages to avoid each allocation
> > effectively wasting a page, as we typically allocate much less
> > than PAGE_SIZE.
> 
> I don't think we should use this global DMA pool. I guess it's OK for
> stuff like airq (where we don't have a struct device at hand) but for
> CCW we should use the device we have. Yes, this way we waste some memory
> but all dma memory a device uses should fit in a page - so the wastage
> is not too much.
> 

Is what you envision a gen_pool of its own for each subchannel (e.g. a
struct io_subchannel member)?

I'm struggling with understanding the expected benefits of a
per-subchannel pool/allocator. Can you please tell me what benefits
you expect (over the current approach)?

I understand your idea is to keep the CIO global pool for stuff that can
not be tied to a single device (i.e. airq). So the per-device stuff would
also mean more code. Would you be OK with postponing this alleged
enhancement (i.e. implement it as a patch on top of this series)?

Regards,
Halil
Sebastian Ott April 12, 2019, 12:12 p.m. UTC | #9
On Fri, 12 Apr 2019, Halil Pasic wrote:
> On Thu, 11 Apr 2019 20:25:01 +0200 (CEST)
> Sebastian Ott <sebott@linux.ibm.com> wrote:
> > I don't think we should use this global DMA pool. I guess it's OK for
> > stuff like airq (where we don't have a struct device at hand) but for
> > CCW we should use the device we have. Yes, this way we waste some memory
> > but all dma memory a device uses should fit in a page - so the wastage
> > is not too much.
> > 
> 
> Is what you envision a gen_pool of its own for each subchannel (e.g. a
> struct io_subchannel member)?

Either that or if that's too much overhead simply map a page and create
a struct containing the few dma areas for that device.
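
For illustration, such "a struct containing the few dma areas" could look roughly like the sketch below; all names are made up and this is not code from this or any other posted patch:

/* Hypothetical per-device DMA area: the handful of control blocks the
 * common I/O layer shares with the hypervisor for one device, kept
 * together in a single piece of coherent memory. */
struct io_subchannel_dma_area {
	struct ccw1 sense_ccw;	/* CCW for basic sense */
	struct irb irb;		/* interruption response block */
	/* ... further small per-device areas ... */
};

static int io_subchannel_dma_setup(struct subchannel *sch)
{
	/* sch->dma_area / sch->dma_area_dma are assumed new members */
	sch->dma_area = dma_alloc_coherent(&sch->dev, sizeof(*sch->dma_area),
					   &sch->dma_area_dma, GFP_KERNEL);
	return sch->dma_area ? 0 : -ENOMEM;
}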

> I'm struggling with understanding the expected benefits of a
> per-subchannel pool/allocator. Can you please tell me what benefits
> you expect (over the current approach)?

Logically DMA is a capability of a device and the whole DMA API is built
around devices. Working around that just feels wrong. For practical
matters: DMA debugging will complain about misuse of a specific device or
driver.

> I understand your idea is to keep the CIO global pool for stuff that can
> not be tied to a single device (i.e. airq). So the per-device stuff would
> also mean more code. Would you be OK with postponing this alleged
> enhancement (i.e. implement it as a patch on top of this series)?

I don't like it but it's just in-kernel usage which we can change at any
time. So if it helps you to do it that way, why not..
Halil Pasic April 12, 2019, 3:30 p.m. UTC | #10
On Fri, 12 Apr 2019 14:12:31 +0200 (CEST)
Sebastian Ott <sebott@linux.ibm.com> wrote:

> On Fri, 12 Apr 2019, Halil Pasic wrote:
> > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST)
> > Sebastian Ott <sebott@linux.ibm.com> wrote:
> > > I don't think we should use this global DMA pool. I guess it's OK for
> > > stuff like airq (where we don't have a struct device at hand) but for
> > > CCW we should use the device we have. Yes, this way we waste some memory
> > > but all dma memory a device uses should fit in a page - so the wastage
> > > is not too much.

Regarding the wastage. Let us do the math together in search for an
upper (wastage) limit.

#define __MAX_CSSID 0
#define __MAX_SUBCHANNEL 65535
#define __MAX_SSID 3 

that is a maximum of
2^16 devices per subchannel set x 2^2 subchannel sets * 2^0 css
== 2^18 devices at most.

With your page-per-device approach we have at most 2^18 * 2^12 = 2^30 bytes.

That is exactly 1GB. And because some of the stuff needs to be
31 bit addressable all of that 1GB would have to be under 2GB. 

Currently we need at least 224 bytes per device that is ~ 6%
of a PAGE_SIZE.

> > > 
> > 
> > Is what you envision a gen_pool of its own for each subchannel (e.g. a
> > struct io_subchannel member)?
> 
> Either that or if that's too much overhead simply map a page and create
> a struct containing the few dma areas for that device.

In virtio-ccw we do dynamic allocations of DMA memory. And we could
theoretically do the same elsewhere. So I don't think a struct with
a few dma areas would do (assuming I understood you correctly, which
isn't very likely).

> 
> > I'm struggling with understanding the expected benefits of a
> > per-subchannel pool/allocator. Can you please tell me what benefits
> > you expect (over the current approach)?
> 
> Logically DMA is a capability of a device and the whole DMA API is built
> around devices. 

I agree the whole DMA API is built around devices. IMHO DMA is indeed on
one hand a capability of a device, but it is also a capability of a
machine (system).

Normally (PCI) limitations can come either from the device side (e.g.
device supports only 32 bit addressing) or from the machine/bus side.

In our case however all ccw devices have the same
limitations/requirements. E.g. if you want to do a SSCH your ORB and
your CCWs will have to be 31 bit addressable (physical). Your
data can be above 2G if MIDA is available and used. None of this depends
on the device we are talking to, but is dictated by the properties of
the architecture and the machine. Another important thing to notice is
that for CCW IO isolation is done very differently than for PCI. 

> Working around that just feels wrong.

Then the airq iv->vector should be a per-device thing as well, or?

> For practical
> matters: DMA debugging will complain about misuse of a specific device or
> driver.
> 

Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been
running with those and did not see any complaints. Maybe we should
clarify this one offline...

> > I understand your idea is to keep the CIO global pool for stuff that can
> > not be tied to a single device (i.e. airq). So the per-device stuff would
> > also mean more code. Would you be OK with postponing this alleged
> > enhancement (i.e. implement it as a patch on top of this series)?
> 
> I don't like it but it's just in-kernel usage which we can change at any
> time. So if it helps you to do it that way, why not..
> 

Right. There should be no external interface implications to changing
this, AFAICT. I prefer to treat this matter as not essential to what I'm
trying to accomplish with this series.

Regards,
Halil
Sebastian Ott April 16, 2019, 12:50 p.m. UTC | #11
On Fri, 12 Apr 2019, Halil Pasic wrote:
> On Fri, 12 Apr 2019 14:12:31 +0200 (CEST)
> Sebastian Ott <sebott@linux.ibm.com> wrote:
> > On Fri, 12 Apr 2019, Halil Pasic wrote:
> > > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST)
> > > Sebastian Ott <sebott@linux.ibm.com> wrote:
> > > > I don't think we should use this global DMA pool. I guess it's OK for
> > > > stuff like airq (where we don't have a struct device at hand) but for
> > > > CCW we should use the device we have. Yes, this way we waste some memory
> > > > but all dma memory a device uses should fit in a page - so the wastage
> > > > is not too much.
> 
> Regarding the wastage. Let us do the math together in search for an
> upper (wastage) limit.
[...]
> Currently we need at least 224 bytes per device that is ~ 6%
> of a PAGE_SIZE.

Yes, we basically waste the whole page. I'm ok with that if the benefit is
to play nice with the kernel APIs.

> > For practical
> > matters: DMA debugging will complain about misuse of a specific device or
> > driver.
> > 
> 
> Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been
> running with those and did not see any complaints. Maybe we should
> clarify this one offline...

I didn't mean to imply that there are bugs already - just that when used
as intended, DMA_API_DEBUG can complain about stuff like "your device
is gone but you still have DMA memory set up for it", which will not work
if you don't use the correct device...

Sebastian
Halil Pasic April 16, 2019, 1:31 p.m. UTC | #12
On Tue, 16 Apr 2019 14:50:14 +0200 (CEST)
Sebastian Ott <sebott@linux.ibm.com> wrote:

> On Fri, 12 Apr 2019, Halil Pasic wrote:
> > On Fri, 12 Apr 2019 14:12:31 +0200 (CEST)
> > Sebastian Ott <sebott@linux.ibm.com> wrote:
> > > On Fri, 12 Apr 2019, Halil Pasic wrote:
> > > > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST)
> > > > Sebastian Ott <sebott@linux.ibm.com> wrote:
> > > > > I don't think we should use this global DMA pool. I guess it's OK for
> > > > > stuff like airq (where we don't have a struct device at hand) but for
> > > > > CCW we should use the device we have. Yes, this way we waste some memory
> > > > > but all dma memory a device uses should fit in a page - so the wastage
> > > > > is not too much.
> > 
> > Regarding the wastage. Let us do the math together in search for an
> > upper (wastage) limit.
> [...]
> > Currently we need at least 224 bytes per device that is ~ 6%
> > of a PAGE_SIZE.
> 
> Yes, we basically waste the whole page. I'm ok with that if the benefit is
> to play nice with the kernel APIs.
> 
> > > For practical
> > > matters: DMA debugging will complain about misuse of a specific device or
> > > driver.
> > > 
> > 
> > Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been
> > running with those and did not see any complaints. Maybe we should
> > clarify this one offline...
> 
> I didn't mean to imply that there are bugs already - just that when used
> as intended, DMA_API_DEBUG can complain about stuff like "your device
> is gone but you still have DMA memory set up for it", which will not work
> if you don't use the correct device...
> 

Right. In fact the 'real' allocations happen using gen_pool, and the pool
never shrinks.

IMHO, as so often in software engineering, we have a trade-off here. I'm
still not convinced I traded badly here, but I will take your request to
tie the dma allocations more closely to the device requiring the dma
as a maintainer's request, and accommodate it in v1.

Are you fine with having a similar gen_pool backed with dma_pages on a
per struct io_subchannel_private basis?

Based on our offline chat I think you are, but better safe than sorry.

Regards,
Halil

Patch

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 46c69283a67b..e8099ab47368 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -194,6 +194,7 @@  config S390
 	select VIRT_TO_BUS
 	select HAVE_NMI
 	select SWIOTLB
+	select GENERIC_ALLOCATOR
 
 
 config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index 1727180e8ca1..4510e418614a 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -328,6 +328,9 @@  static inline u8 pathmask_to_pos(u8 mask)
 void channel_subsystem_reinit(void);
 extern void css_schedule_reprobe(void);
 
+extern void *cio_dma_zalloc(size_t size);
+extern void cio_dma_free(void *cpu_addr, size_t size);
+
 /* Function from drivers/s390/cio/chsc.c */
 int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
 int chsc_sstpi(void *page, void *result, size_t size);
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index aea502922646..72629d99d8e4 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -20,6 +20,8 @@ 
 #include <linux/reboot.h>
 #include <linux/suspend.h>
 #include <linux/proc_fs.h>
+#include <linux/genalloc.h>
+#include <linux/dma-mapping.h>
 #include <asm/isc.h>
 #include <asm/crw.h>
 
@@ -886,6 +888,8 @@  static const struct attribute_group *cssdev_attr_groups[] = {
 	NULL,
 };
 
+static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
+
 static int __init setup_css(int nr)
 {
 	struct channel_subsystem *css;
@@ -899,6 +903,9 @@  static int __init setup_css(int nr)
 	dev_set_name(&css->device, "css%x", nr);
 	css->device.groups = cssdev_attr_groups;
 	css->device.release = channel_subsystem_release;
+	/* some cio DMA memory needs to be 31 bit addressable */
+	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
+	css->device.dma_mask = &css_dev_dma_mask;
 
 	mutex_init(&css->mutex);
 	css->cssid = chsc_get_cssid(nr);
@@ -1018,6 +1025,55 @@  static struct notifier_block css_power_notifier = {
 	.notifier_call = css_power_event,
 };
 
+#define POOL_INIT_PAGES 1
+static struct gen_pool *cio_dma_pool;
+/* Currently cio supports only a single css */
+static struct device *cio_dma_css;
+static gfp_t  cio_dma_flags;
+
+static void __init cio_dma_pool_init(void)
+{
+	void *cpu_addr;
+	dma_addr_t dma_addr;
+	int i;
+
+	cio_dma_css = &channel_subsystems[0]->device;
+	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
+	cio_dma_pool = gen_pool_create(3, -1);
+	/* No need to free up the resources: compiled in */
+	for (i = 0; i < POOL_INIT_PAGES; ++i) {
+		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
+					      cio_dma_flags);
+		if (!cpu_addr)
+			return;
+		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
+				  dma_addr, PAGE_SIZE, -1);
+	}
+
+}
+
+void *cio_dma_zalloc(size_t size)
+{
+	dma_addr_t dma_addr;
+	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
+
+	if (!addr) {
+		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
+					PAGE_SIZE, &dma_addr, cio_dma_flags);
+		if (!addr)
+			return NULL;
+		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
+		addr = gen_pool_alloc(cio_dma_pool, size);
+	}
+	return (void *) addr;
+}
+
+void cio_dma_free(void *cpu_addr, size_t size)
+{
+	memset(cpu_addr, 0, size);
+	gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size);
+}
+
 /*
  * Now that the driver core is running, we can setup our channel subsystem.
  * The struct subchannel's are created during probing.
@@ -1063,6 +1119,7 @@  static int __init css_bus_init(void)
 		unregister_reboot_notifier(&css_reboot_notifier);
 		goto out_unregister;
 	}
+	cio_dma_pool_init();
 	css_init_done = 1;
 
 	/* Enable default isc for I/O subchannels. */