Message ID | 20190404231622.52531-5-pasic@linux.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | s390: virtio: support protected virtualization | expand |
On Fri, 5 Apr 2019 01:16:14 +0200 Halil Pasic <pasic@linux.ibm.com> wrote: > To support protected virtualization cio will need to make sure the > memory used for communication with the hypervisor is DMA memory. > > Let us introduce a DMA pool to cio that will help us in allocating missing 'and' > deallocating those chunks of memory. > > We use a gen_pool backed with DMA pages to avoid each allocation > effectively wasting a page, as we typically allocate much less > than PAGE_SIZE. > > Signed-off-by: Halil Pasic <pasic@linux.ibm.com> > --- > arch/s390/Kconfig | 1 + > arch/s390/include/asm/cio.h | 3 +++ > drivers/s390/cio/css.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 61 insertions(+) > > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > index 46c69283a67b..e8099ab47368 100644 > --- a/arch/s390/Kconfig > +++ b/arch/s390/Kconfig > @@ -194,6 +194,7 @@ config S390 > select VIRT_TO_BUS > select HAVE_NMI > select SWIOTLB > + select GENERIC_ALLOCATOR > > > config SCHED_OMIT_FRAME_POINTER > diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h > index 1727180e8ca1..4510e418614a 100644 > --- a/arch/s390/include/asm/cio.h > +++ b/arch/s390/include/asm/cio.h > @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask) > void channel_subsystem_reinit(void); > extern void css_schedule_reprobe(void); > > +extern void *cio_dma_zalloc(size_t size); > +extern void cio_dma_free(void *cpu_addr, size_t size); > + > /* Function from drivers/s390/cio/chsc.c */ > int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); > int chsc_sstpi(void *page, void *result, size_t size); > diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c > index aea502922646..72629d99d8e4 100644 > --- a/drivers/s390/cio/css.c > +++ b/drivers/s390/cio/css.c > @@ -20,6 +20,8 @@ > #include <linux/reboot.h> > #include <linux/suspend.h> > #include <linux/proc_fs.h> > +#include <linux/genalloc.h> > +#include <linux/dma-mapping.h> > #include 
<asm/isc.h> > #include <asm/crw.h> > > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = { > NULL, > }; > > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31); > + > static int __init setup_css(int nr) > { > struct channel_subsystem *css; > @@ -899,6 +903,9 @@ static int __init setup_css(int nr) > dev_set_name(&css->device, "css%x", nr); > css->device.groups = cssdev_attr_groups; > css->device.release = channel_subsystem_release; > + /* some cio DMA memory needs to be 31 bit addressable */ > + css->device.coherent_dma_mask = DMA_BIT_MASK(31), > + css->device.dma_mask = &css_dev_dma_mask; Question: Does this percolate down to the child devices eventually? E.g., you have a ccw_device getting the mask from its parent subchannel, which gets it from its parent css? If so, does that clash with the the mask you used for the virtio_ccw_device in a previous patch? Or are they two really different things? > > mutex_init(&css->mutex); > css->cssid = chsc_get_cssid(nr); > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = { > .notifier_call = css_power_event, > }; > > +#define POOL_INIT_PAGES 1 > +static struct gen_pool *cio_dma_pool; > +/* Currently cio supports only a single css */ > +static struct device *cio_dma_css; That global variable feels wrong, especially if you plan to support MCSS-E in the future. (Do you? :) If yes, should the dma pool be global or per-css? As css0 currently is the root device for the channel subsystem stuff, you'd either need a new parent to hang this off from or size this with the number of css images.) For now, just grab channel_subsystems[0]->device directly? 
> +static gfp_t cio_dma_flags; > + > +static void __init cio_dma_pool_init(void) > +{ > + void *cpu_addr; > + dma_addr_t dma_addr; > + int i; > + > + cio_dma_css = &channel_subsystems[0]->device; > + cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO; > + cio_dma_pool = gen_pool_create(3, -1); > + /* No need to free up the resources: compiled in */ > + for (i = 0; i < POOL_INIT_PAGES; ++i) { > + cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr, > + cio_dma_flags); > + if (!cpu_addr) > + return; > + gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr, > + dma_addr, PAGE_SIZE, -1); > + } > + > +} > + > +void *cio_dma_zalloc(size_t size) > +{ > + dma_addr_t dma_addr; > + unsigned long addr = gen_pool_alloc(cio_dma_pool, size); > + > + if (!addr) { > + addr = (unsigned long) dma_alloc_coherent(cio_dma_css, > + PAGE_SIZE, &dma_addr, cio_dma_flags); > + if (!addr) > + return NULL; > + gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1); > + addr = gen_pool_alloc(cio_dma_pool, size); > + } > + return (void *) addr; At this point, you're always going via the css0 device. I'm wondering whether you should pass in the cssid here and use css_by_id(cssid) to make this future proof. But then, I'm not quite clear from which context this will be called. > +} > + > +void cio_dma_free(void *cpu_addr, size_t size) > +{ > + memset(cpu_addr, 0, size); > + gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size); > +} > + > /* > * Now that the driver core is running, we can setup our channel subsystem. > * The struct subchannel's are created during probing. > @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void) > unregister_reboot_notifier(&css_reboot_notifier); > goto out_unregister; > } > + cio_dma_pool_init(); > css_init_done = 1; > > /* Enable default isc for I/O subchannels. */
On Tue, 9 Apr 2019 12:44:58 +0200 Cornelia Huck <cohuck@redhat.com> wrote: > On Fri, 5 Apr 2019 01:16:14 +0200 > Halil Pasic <pasic@linux.ibm.com> wrote: > > > To support protected virtualization cio will need to make sure the > > memory used for communication with the hypervisor is DMA memory. > > > > Let us introduce a DMA pool to cio that will help us in allocating > > missing 'and' > Nod. > > deallocating those chunks of memory. > > > > We use a gen_pool backed with DMA pages to avoid each allocation > > effectively wasting a page, as we typically allocate much less > > than PAGE_SIZE. > > > > Signed-off-by: Halil Pasic <pasic@linux.ibm.com> > > --- > > arch/s390/Kconfig | 1 + > > arch/s390/include/asm/cio.h | 3 +++ > > drivers/s390/cio/css.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 61 insertions(+) > > > > diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig > > index 46c69283a67b..e8099ab47368 100644 > > --- a/arch/s390/Kconfig > > +++ b/arch/s390/Kconfig > > @@ -194,6 +194,7 @@ config S390 > > select VIRT_TO_BUS > > select HAVE_NMI > > select SWIOTLB > > + select GENERIC_ALLOCATOR > > > > > > config SCHED_OMIT_FRAME_POINTER > > diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h > > index 1727180e8ca1..4510e418614a 100644 > > --- a/arch/s390/include/asm/cio.h > > +++ b/arch/s390/include/asm/cio.h > > @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask) > > void channel_subsystem_reinit(void); > > extern void css_schedule_reprobe(void); > > > > +extern void *cio_dma_zalloc(size_t size); > > +extern void cio_dma_free(void *cpu_addr, size_t size); > > + > > /* Function from drivers/s390/cio/chsc.c */ > > int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); > > int chsc_sstpi(void *page, void *result, size_t size); > > diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c > > index aea502922646..72629d99d8e4 100644 > > --- a/drivers/s390/cio/css.c > > +++ b/drivers/s390/cio/css.c 
> > @@ -20,6 +20,8 @@ > > #include <linux/reboot.h> > > #include <linux/suspend.h> > > #include <linux/proc_fs.h> > > +#include <linux/genalloc.h> > > +#include <linux/dma-mapping.h> > > #include <asm/isc.h> > > #include <asm/crw.h> > > > > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = { > > NULL, > > }; > > > > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31); > > + > > static int __init setup_css(int nr) > > { > > struct channel_subsystem *css; > > @@ -899,6 +903,9 @@ static int __init setup_css(int nr) > > dev_set_name(&css->device, "css%x", nr); > > css->device.groups = cssdev_attr_groups; > > css->device.release = channel_subsystem_release; > > + /* some cio DMA memory needs to be 31 bit addressable */ > > + css->device.coherent_dma_mask = DMA_BIT_MASK(31), > > + css->device.dma_mask = &css_dev_dma_mask; > > Question: Does this percolate down to the child devices eventually? > E.g., you have a ccw_device getting the mask from its parent > subchannel, which gets it from its parent css? If so, does that clash > with the the mask you used for the virtio_ccw_device in a previous > patch? Or are they two really different things? > AFAIK id does not percolate. I could not find anything showing in this direction in drivers core at least. AFAIU dma_mask is also about the addressing limitations of the device itself (in the good old pci world, not really applicable for ccw devices). Regarding virtio_ccw, I need to set the DMA stuff on the parent device because that is how the stuff in virtio_ring.c works. There I we can get away with DMA_BIT_MASK(64) as that stuff is not used in places where 31 bit addresses are required. For the actual ccw stuff I used the cio DMA pool introduced here. FWIW the allocation mechanisms provided by DMA API (as documented) is not very well suited with what we need for ccw. This is why I choose to do this gen_pool thing. The gist of it is as-is at the moment we get page granularity allocations from DMA API. 
Of course we could use dma_pool which is kind of perfect for uniformly sized objects. As you will see in the rest of the series, we have a comparatively small number of smallish objects of varying sizes. And some of these are short lived. So the child devices like subchannels and ccw devices do not use DMA API directly, except for virtio_ccw. Does all that make any sense to you? > > > > mutex_init(&css->mutex); > > css->cssid = chsc_get_cssid(nr); > > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = { > > .notifier_call = css_power_event, > > }; > > > > +#define POOL_INIT_PAGES 1 > > +static struct gen_pool *cio_dma_pool; > > +/* Currently cio supports only a single css */ > > +static struct device *cio_dma_css; > > That global variable feels wrong, especially if you plan to support > MCSS-E in the future. (Do you? :) Not that I'm aware of any plans to add support MCSS-E. > If yes, should the dma pool be global > or per-css? As css0 currently is the root device for the channel > subsystem stuff, you'd either need a new parent to hang this off from > or size this with the number of css images.) > > For now, just grab channel_subsystems[0]->device directly? > Patch 6 gets rid of this variable and adds an accessor instead: +struct device *cio_get_dma_css_dev(void) +{ + return &channel_subsystems[0]->device; +} + I can move that here if you like (for v1). Yes it is kind of unfortunate that some pieces of this code look like there could be more than one css, but actually MCSS-E is not supported. I would prefer sorting these problems out when they arise, if they ever arise. 
> > +static gfp_t cio_dma_flags; > > + > > +static void __init cio_dma_pool_init(void) > > +{ > > + void *cpu_addr; > > + dma_addr_t dma_addr; > > + int i; > > + > > + cio_dma_css = &channel_subsystems[0]->device; > > + cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO; > > + cio_dma_pool = gen_pool_create(3, -1); > > + /* No need to free up the resources: compiled in */ > > + for (i = 0; i < POOL_INIT_PAGES; ++i) { > > + cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr, > > + cio_dma_flags); > > + if (!cpu_addr) > > + return; > > + gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr, > > + dma_addr, PAGE_SIZE, -1); > > + } > > + > > +} > > + > > +void *cio_dma_zalloc(size_t size) > > +{ > > + dma_addr_t dma_addr; > > + unsigned long addr = gen_pool_alloc(cio_dma_pool, size); > > + > > + if (!addr) { > > + addr = (unsigned long) dma_alloc_coherent(cio_dma_css, > > + PAGE_SIZE, &dma_addr, cio_dma_flags); > > + if (!addr) > > + return NULL; > > + gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1); > > + addr = gen_pool_alloc(cio_dma_pool, size); > > + } > > + return (void *) addr; > > At this point, you're always going via the css0 device. I'm wondering > whether you should pass in the cssid here and use css_by_id(cssid) to > make this future proof. But then, I'm not quite clear from which > context this will be called. As said before I don't see MCSS-E coming. Regarding the client code, please check out the rest of the series. Especially patch 6. From my perspective it would be at this stage saner to make it more obvious that only one css is supported (at the moment), than to implement MCSS-E, or to make this (kind of) MCSS-E ready. Regards, Halil > > > +} > > + > > +void cio_dma_free(void *cpu_addr, size_t size) > > +{ > > + memset(cpu_addr, 0, size); > > + gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size); > > +} > > + > > /* > > * Now that the driver core is running, we can setup our channel subsystem. 
> > * The struct subchannel's are created during probing. > > @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void) > > unregister_reboot_notifier(&css_reboot_notifier); > > goto out_unregister; > > } > > + cio_dma_pool_init(); > > css_init_done = 1; > > > > /* Enable default isc for I/O subchannels. */ >
On Tue, 9 Apr 2019 14:11:14 +0200 Halil Pasic <pasic@linux.ibm.com> wrote: > On Tue, 9 Apr 2019 12:44:58 +0200 > Cornelia Huck <cohuck@redhat.com> wrote: > > > On Fri, 5 Apr 2019 01:16:14 +0200 > > Halil Pasic <pasic@linux.ibm.com> wrote: > > > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = { > > > NULL, > > > }; > > > > > > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31); > > > + > > > static int __init setup_css(int nr) > > > { > > > struct channel_subsystem *css; > > > @@ -899,6 +903,9 @@ static int __init setup_css(int nr) > > > dev_set_name(&css->device, "css%x", nr); > > > css->device.groups = cssdev_attr_groups; > > > css->device.release = channel_subsystem_release; > > > + /* some cio DMA memory needs to be 31 bit addressable */ > > > + css->device.coherent_dma_mask = DMA_BIT_MASK(31), > > > + css->device.dma_mask = &css_dev_dma_mask; > > > > Question: Does this percolate down to the child devices eventually? > > E.g., you have a ccw_device getting the mask from its parent > > subchannel, which gets it from its parent css? If so, does that clash > > with the the mask you used for the virtio_ccw_device in a previous > > patch? Or are they two really different things? > > > > AFAIK id does not percolate. I could not find anything showing in this > direction in drivers core at least. AFAIU dma_mask is also about the > addressing limitations of the device itself (in the good old pci world, > not really applicable for ccw devices). > > Regarding virtio_ccw, I need to set the DMA stuff on the parent device > because that is how the stuff in virtio_ring.c works. There I we can > get away with DMA_BIT_MASK(64) as that stuff is not used in places where > 31 bit addresses are required. For the actual ccw stuff I used the > cio DMA pool introduced here. Ok, so those are two different users then. > > FWIW the allocation mechanisms provided by DMA API (as documented) is > not very well suited with what we need for ccw. 
This is why I choose to > do this gen_pool thing. The gist of it is as-is at the moment we get > page granularity allocations from DMA API. Of course we could use > dma_pool which is kind of perfect for uniformly sized objects. As you > will see in the rest of the series, we have a comparatively small number > of smallish objects of varying sizes. And some of these are short lived. > > So the child devices like subchannels and ccw devices do not use DMA API > directly, except for virtio_ccw. > > Does all that make any sense to you? I think I need to read more of the series, but that should be enough info to get me started :) > > > > > > > > mutex_init(&css->mutex); > > > css->cssid = chsc_get_cssid(nr); > > > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = { > > > .notifier_call = css_power_event, > > > }; > > > > > > +#define POOL_INIT_PAGES 1 > > > +static struct gen_pool *cio_dma_pool; > > > +/* Currently cio supports only a single css */ > > > +static struct device *cio_dma_css; > > > > That global variable feels wrong, especially if you plan to support > > MCSS-E in the future. (Do you? :) > > Not that I'm aware of any plans to add support MCSS-E. > > > If yes, should the dma pool be global > > or per-css? As css0 currently is the root device for the channel > > subsystem stuff, you'd either need a new parent to hang this off from > > or size this with the number of css images.) > > > > For now, just grab channel_subsystems[0]->device directly? > > > > Patch 6 gets rid of this variable and adds an accessor instead: > > +struct device *cio_get_dma_css_dev(void) > +{ > + return &channel_subsystems[0]->device; > +} > + > > I can move that here if you like (for v1). An accessor looks more sane than a global variable, yes. > > Yes it is kind of unfortunate that some pieces of this code look > like there could be more than one css, but actually MCSS-E is not > supported. 
I would prefer sorting these problems out when they arise, if > they ever arise. As long as it's not too unreasonable, I think we should keep the infrastructure for multiple css instances in place. > > > > +static gfp_t cio_dma_flags; > > > + > > > +static void __init cio_dma_pool_init(void) > > > +{ > > > + void *cpu_addr; > > > + dma_addr_t dma_addr; > > > + int i; > > > + > > > + cio_dma_css = &channel_subsystems[0]->device; > > > + cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO; > > > + cio_dma_pool = gen_pool_create(3, -1); > > > + /* No need to free up the resources: compiled in */ > > > + for (i = 0; i < POOL_INIT_PAGES; ++i) { > > > + cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr, > > > + cio_dma_flags); > > > + if (!cpu_addr) > > > + return; > > > + gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr, > > > + dma_addr, PAGE_SIZE, -1); > > > + } > > > + > > > +} > > > + > > > +void *cio_dma_zalloc(size_t size) > > > +{ > > > + dma_addr_t dma_addr; > > > + unsigned long addr = gen_pool_alloc(cio_dma_pool, size); > > > + > > > + if (!addr) { > > > + addr = (unsigned long) dma_alloc_coherent(cio_dma_css, > > > + PAGE_SIZE, &dma_addr, cio_dma_flags); > > > + if (!addr) > > > + return NULL; > > > + gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1); > > > + addr = gen_pool_alloc(cio_dma_pool, size); > > > + } > > > + return (void *) addr; > > > > At this point, you're always going via the css0 device. I'm wondering > > whether you should pass in the cssid here and use css_by_id(cssid) to > > make this future proof. But then, I'm not quite clear from which > > context this will be called. > > As said before I don't see MCSS-E coming. Regarding the client code, > please check out the rest of the series. Especially patch 6. > > From my perspective it would be at this stage saner to make it more > obvious that only one css is supported (at the moment), than to implement > MCSS-E, or to make this (kind of) MCSS-E ready. 
I disagree. I think there's value in keeping the interfaces clean (within reasonable bounds, of course.) Even if there is no implementation of MCSS-E other than QEMU... it is probably a good idea to spend some brain cycles to make this conceptually clean.
On Tue, 9 Apr 2019 19:14:53 +0200 Cornelia Huck <cohuck@redhat.com> wrote: [..] > > > At this point, you're always going via the css0 device. I'm > > > wondering whether you should pass in the cssid here and use > > > css_by_id(cssid) to make this future proof. But then, I'm not > > > quite clear from which context this will be called. > > > > As said before I don't see MCSS-E coming. Regarding the client code, > > please check out the rest of the series. Especially patch 6. > > > > From my perspective it would be at this stage saner to make it more > > obvious that only one css is supported (at the moment), than to > > implement MCSS-E, or to make this (kind of) MCSS-E ready. > > I disagree. I think there's value in keeping the interfaces clean > (within reasonable bounds, of course.) Even if there is no > implementation of MCSS-E other than QEMU... it is probably a good idea > to spend some brain cycles to make this conceptually clean. AFAIU Linux currently does not support MCSS-E. I don't have the bandwidth to implement MCSS-E support in the kernel. I fully agree for external interfaces should be MCSS-E conform, so should we ever decide to implement we don't have to overcome self-made obstacles. Kernel internal stuff however, IMHO, should avoid carrying a ballast of an 20%-30% implemented MCSS-E support. I see no benefit. But I don't insist. If you have a good idea how to make this more MCSS-E conform, you are welcome. In think something like this is best done on top of a series that provides a basic working solution. Especially if the conceptually clean thing is conceptually or code-wise more complex than the basic solution. If you have something in mind that is simpler and more lightweight than what I did here, I would be more than happy to make that happen. Regards, Halil
On Wed, 10 Apr 2019 17:31:48 +0200 Halil Pasic <pasic@linux.ibm.com> wrote: > On Tue, 9 Apr 2019 19:14:53 +0200 > Cornelia Huck <cohuck@redhat.com> wrote: > > [..] > > > > > At this point, you're always going via the css0 device. I'm > > > > wondering whether you should pass in the cssid here and use > > > > css_by_id(cssid) to make this future proof. But then, I'm not > > > > quite clear from which context this will be called. > > > > > > As said before I don't see MCSS-E coming. Regarding the client code, > > > please check out the rest of the series. Especially patch 6. > > > > > > From my perspective it would be at this stage saner to make it more > > > obvious that only one css is supported (at the moment), than to > > > implement MCSS-E, or to make this (kind of) MCSS-E ready. > > > > I disagree. I think there's value in keeping the interfaces clean > > (within reasonable bounds, of course.) Even if there is no > > implementation of MCSS-E other than QEMU... it is probably a good idea > > to spend some brain cycles to make this conceptually clean. > > AFAIU Linux currently does not support MCSS-E. I don't have the > bandwidth to implement MCSS-E support in the kernel. > > I fully agree for external interfaces should be MCSS-E conform, so > should we ever decide to implement we don't have to overcome self-made > obstacles. > > Kernel internal stuff however, IMHO, should avoid carrying a ballast of > an 20%-30% implemented MCSS-E support. I see no benefit. > > But I don't insist. If you have a good idea how to make this more MCSS-E > conform, you are welcome. In think something like this is best done on > top of a series that provides a basic working solution. Especially if the > conceptually clean thing is conceptually or code-wise more complex than > the basic solution. If you have something in mind that is simpler and > more lightweight than what I did here, I would be more than happy to make > that happen. 
I haven't asked for adding complete support for MCSS-E, just to keep it in the back of our minds... Back to the issue at hand. You're currently hard-wiring this to the css0 device. My question is: Does it make sense to have a per-css area? From my limited perspective, I think it does (more css images would mean more devices, and just adding smaller areas per-css is probably easier than adding one big area scaling with the number of css images). In that case, what about tacking the area to the actual css object, instead of having the global variable? Should be an easy change, and feels cleaner.
On Wed, 10 Apr 2019 18:07:19 +0200 Cornelia Huck <cohuck@redhat.com> wrote: > On Wed, 10 Apr 2019 17:31:48 +0200 > Halil Pasic <pasic@linux.ibm.com> wrote: > > > On Tue, 9 Apr 2019 19:14:53 +0200 > > Cornelia Huck <cohuck@redhat.com> wrote: > > > > [..] > > > > > > > At this point, you're always going via the css0 device. I'm > > > > > wondering whether you should pass in the cssid here and use > > > > > css_by_id(cssid) to make this future proof. But then, I'm not > > > > > quite clear from which context this will be called. > > > > > > > > As said before I don't see MCSS-E coming. Regarding the client code, > > > > please check out the rest of the series. Especially patch 6. > > > > > > > > From my perspective it would be at this stage saner to make it more > > > > obvious that only one css is supported (at the moment), than to > > > > implement MCSS-E, or to make this (kind of) MCSS-E ready. > > > > > > I disagree. I think there's value in keeping the interfaces clean > > > (within reasonable bounds, of course.) Even if there is no > > > implementation of MCSS-E other than QEMU... it is probably a good idea > > > to spend some brain cycles to make this conceptually clean. > > > > AFAIU Linux currently does not support MCSS-E. I don't have the > > bandwidth to implement MCSS-E support in the kernel. > > > > I fully agree for external interfaces should be MCSS-E conform, so > > should we ever decide to implement we don't have to overcome self-made > > obstacles. > > > > Kernel internal stuff however, IMHO, should avoid carrying a ballast of > > an 20%-30% implemented MCSS-E support. I see no benefit. > > > > But I don't insist. If you have a good idea how to make this more MCSS-E > > conform, you are welcome. In think something like this is best done on > > top of a series that provides a basic working solution. Especially if the > > conceptually clean thing is conceptually or code-wise more complex than > > the basic solution. 
If you have something in mind that is simpler and > > more lightweight than what I did here, I would be more than happy to make > > that happen. > > I haven't asked for adding complete support for MCSS-E, just to keep it > in the back of our minds... I will, I promise ;)! > > Back to the issue at hand. You're currently hard-wiring this to the > css0 device. My question is: Does it make sense to have a per-css area? What do you mean by area? > From my limited perspective, I think it does (more css images would > mean more devices, and just adding smaller areas per-css is probably > easier than adding one big area scaling with the number of css images). Currently all devices are hooked up to css0. This is also hard-wired: int css_register_subchannel(struct subchannel *sch) { int ret; /* Initialize the subchannel structure */ sch->dev.parent = &channel_subsystems[0]->device; I think the hard-wiring the DMA properties of the CSS to channel_subsystems[0] is consistent with prior art. > In that case, what about tacking the area to the actual css object, > instead of having the global variable? Should be an easy change, and > feels cleaner. > I already got rid of the global variable. Instead we have an accessor now. My problem with a gen_pool per ccs approach is the airq aiv vectors. Those vectors are currently shared between devices (at least for virtio-ccw), and those vectors must be allocated as shared (DMA) memory. Please take another look at patch #6. Would you like airq_iv_create(unsigned long bits, unsigned long flags) to take a struct channel_subsystem argument? Regards, Halil
On Fri, 5 Apr 2019, Halil Pasic wrote: > To support protected virtualization cio will need to make sure the > memory used for communication with the hypervisor is DMA memory. > > Let us introduce a DMA pool to cio that will help us in allocating > deallocating those chunks of memory. > > We use a gen_pool backed with DMA pages to avoid each allocation > effectively wasting a page, as we typically allocate much less > than PAGE_SIZE. I don't think we should use this global DMA pool. I guess it's OK for stuff like airq (where we don't have a struct device at hand) but for CCW we should use the device we have. Yes, this way we waste some memory but all dma memory a device uses should fit in a page - so the wastage is not too much. Sebastian
On Thu, 11 Apr 2019 20:25:01 +0200 (CEST) Sebastian Ott <sebott@linux.ibm.com> wrote: > On Fri, 5 Apr 2019, Halil Pasic wrote: > > To support protected virtualization cio will need to make sure the > > memory used for communication with the hypervisor is DMA memory. > > > > Let us introduce a DMA pool to cio that will help us in allocating > > deallocating those chunks of memory. > > > > We use a gen_pool backed with DMA pages to avoid each allocation > > effectively wasting a page, as we typically allocate much less > > than PAGE_SIZE. > > I don't think we should use this global DMA pool. I guess it's OK for > stuff like airq (where we don't have a struct device at hand) but for > CCW we should use the device we have. Yes, this way we waste some memory > but all dma memory a device uses should fit in a page - so the wastage > is not too much. > Is what you envision an own gen_pool on for each subchannel (e.g. a struct io_subchannel member)? I'm struggling with understanding the expected benefits of a per-subchannel pool/allocator. Can you please tell me what benefits do you expect (over the current approach)? I understand you idea is to keep the CIO global pool for stuff that can not be tied to a single device (i.e. ariq). So the per device stuff would also mean more code. Would you be OK with postponing this alleged enhancement (i.e. implement it as a patch on top of this series)? Regards, Halil
On Fri, 12 Apr 2019, Halil Pasic wrote: > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST) > Sebastian Ott <sebott@linux.ibm.com> wrote: > > I don't think we should use this global DMA pool. I guess it's OK for > > stuff like airq (where we don't have a struct device at hand) but for > > CCW we should use the device we have. Yes, this way we waste some memory > > but all dma memory a device uses should fit in a page - so the wastage > > is not too much. > > > > Is what you envision an own gen_pool on for each subchannel (e.g. a > struct io_subchannel member)? Either that or if that's too much overhead simply map a page and create a struct containing the few dma areas for that device. > I'm struggling with understanding the expected benefits of a > per-subchannel pool/allocator. Can you please tell me what benefits do > you expect (over the current approach)? Logically DMA is a capability of a device and the whole DMA API is build around devices. Working around that just feels wrong. For practical matters: DMA debugging will complain about misuse of a specific device or driver. > I understand you idea is to keep the CIO global pool for stuff that can > not be tied to a single device (i.e. ariq). So the per device stuff would > also mean more code. Would you be OK with postponing this alleged > enhancement (i.e. implement it as a patch on top of this series)? I don't like it but it's just in-kernel usage which we can change at any time. So if it helps you to do it that way, why not..
On Fri, 12 Apr 2019 14:12:31 +0200 (CEST) Sebastian Ott <sebott@linux.ibm.com> wrote: > On Fri, 12 Apr 2019, Halil Pasic wrote: > > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST) > > Sebastian Ott <sebott@linux.ibm.com> wrote: > > > I don't think we should use this global DMA pool. I guess it's OK for > > > stuff like airq (where we don't have a struct device at hand) but for > > > CCW we should use the device we have. Yes, this way we waste some memory > > > but all dma memory a device uses should fit in a page - so the wastage > > > is not too much. Regarding the wastage. Let us do the math together in search for an upper (wastage) limit. #define __MAX_CSSID 0 #define __MAX_SUBCHANNEL 65535 #define __MAX_SSID 3 that is a maximum of 2^16 devices per subchannel set x 2^2 subchannel sets * 2^0 css == 2^18 devices at most. With your a page per device we have max 2^30 = 2^18 + 2^12 bytes. That is exactly 1GB. And because some of the stuff needs to be 31 bit addressable all of that 1GB would have to be under 2GB. Currently we need at least 224 bytes per device that is ~ 6% of a PAGE_SIZE. > > > > > > > Is what you envision an own gen_pool on for each subchannel (e.g. a > > struct io_subchannel member)? > > Either that or if that's too much overhead simply map a page and create > a struct containing the few dma areas for that device. In virtio-ccw we do dynamic allocations of DMA memory. And we could theoretically do the same elsewhere. So I don't think a struct with a few dma areas would do (assumed I understood you correctly, which isn't very likely). > > > I'm struggling with understanding the expected benefits of a > > per-subchannel pool/allocator. Can you please tell me what benefits do > > you expect (over the current approach)? > > Logically DMA is a capability of a device and the whole DMA API is build > around devices. I agree the whole DMA API is built around devices. 
IMHO DMA is indeed on one hand a capability of a device, but it is also a capability of a machine (system). Normally (PCI) limitations can come either from the device side (e.g. device supports only 32 bit addressing) or from the machine/bus side. In our case however all ccw devices have the same limitations/requirements. E.g. if you want to do a SSCH your ORB and your CCWs will have to be 31 bit addressable (physical). Your data can be above 2G if MIDA is available and used. None of this depends on the device we are talking to, but is dictated by the properties of the architecture and the machine. Another important thing to notice is that for CCW IO isolation is done very differently than for PCI. > Working around that just feels wrong. Then the airq iv->vector should be a per-device thing as well, or? > For practical > matters: DMA debugging will complain about misuse of a specific device or > driver. > Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been running with those and did not see any complaints. Maybe we should clarify this one offline... > > I understand you idea is to keep the CIO global pool for stuff that can > > not be tied to a single device (i.e. ariq). So the per device stuff would > > also mean more code. Would you be OK with postponing this alleged > > enhancement (i.e. implement it as a patch on top of this series)? > > I don't like it but it's just in-kernel usage which we can change at any > time. So if it helps you to do it that way, why not.. > Right. There should be no external interface implications to changing this AFAICT. I prefer seeing this matter as not substantial for what I'm trying to accomplish with this series. Regards, Halil
On Fri, 12 Apr 2019, Halil Pasic wrote: > On Fri, 12 Apr 2019 14:12:31 +0200 (CEST) > Sebastian Ott <sebott@linux.ibm.com> wrote: > > On Fri, 12 Apr 2019, Halil Pasic wrote: > > > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST) > > > Sebastian Ott <sebott@linux.ibm.com> wrote: > > > > I don't think we should use this global DMA pool. I guess it's OK for > > > > stuff like airq (where we don't have a struct device at hand) but for > > > > CCW we should use the device we have. Yes, this way we waste some memory > > > > but all dma memory a device uses should fit in a page - so the wastage > > > > is not too much. > > Regarding the wastage. Let us do the math together in search for an > upper (wastage) limit. [...] > Currently we need at least 224 bytes per device that is ~ 6% > of a PAGE_SIZE. Yes, we basically waste the whole page. I'm ok with that if the benefit is to play nice with the kernel APIs. > > For practical > > matters: DMA debugging will complain about misuse of a specific device or > > driver. > > > > Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been > running with those and did not see any complaints. Maybe we should > clarify this one offline... I didn't mean to imply that there are bugs already - just that when used as intended the DMA_DEBUG_API can complain about stuff like "your device is gone but you have still DMA memory set up for it" which will not work if you don't use the correct device... Sebastian
On Tue, 16 Apr 2019 14:50:14 +0200 (CEST) Sebastian Ott <sebott@linux.ibm.com> wrote: > On Fri, 12 Apr 2019, Halil Pasic wrote: > > On Fri, 12 Apr 2019 14:12:31 +0200 (CEST) > > Sebastian Ott <sebott@linux.ibm.com> wrote: > > > On Fri, 12 Apr 2019, Halil Pasic wrote: > > > > On Thu, 11 Apr 2019 20:25:01 +0200 (CEST) > > > > Sebastian Ott <sebott@linux.ibm.com> wrote: > > > > > I don't think we should use this global DMA pool. I guess it's OK for > > > > > stuff like airq (where we don't have a struct device at hand) but for > > > > > CCW we should use the device we have. Yes, this way we waste some memory > > > > > but all dma memory a device uses should fit in a page - so the wastage > > > > > is not too much. > > > > Regarding the wastage. Let us do the math together in search for an > > upper (wastage) limit. > [...] > > Currently we need at least 224 bytes per device that is ~ 6% > > of a PAGE_SIZE. > > Yes, we basically waste the whole page. I'm ok with that if the benefit is > to play nice with the kernel APIs. > > > > For practical > > > matters: DMA debugging will complain about misuse of a specific device or > > > driver. > > > > > > > Do you mean CONFIG_DMA_API_DEBUG and CONFIG_DMA_API_DEBUG_SG? I've been > > running with those and did not see any complaints. Maybe we should > > clarify this one offline... > > I didn't mean to imply that there are bugs already - just that when used > as intended the DMA_DEBUG_API can complain about stuff like "your device > is gone but you have still DMA memory set up for it" which will not work > if you don't use the correct device... > Right. In fact the 'real' allocations happen using gen_pool, and the pool never shrinks. IMHO as so often in software engineering we have a trade-off here. I'm still not convinced I traded badly here, but I will take the request of yours to tie the dma allocations more to the device requiring the dma as a maintainer's request, and accommodate it in v1. 
Are you fine with having a similar gen_pool backed with dma_pages on a per struct io_subchannel_private basis? Based on our offline chat I think you are, but better safe than sorry. Regards, Halil
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 46c69283a67b..e8099ab47368 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -194,6 +194,7 @@ config S390 select VIRT_TO_BUS select HAVE_NMI select SWIOTLB + select GENERIC_ALLOCATOR config SCHED_OMIT_FRAME_POINTER diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 1727180e8ca1..4510e418614a 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask) void channel_subsystem_reinit(void); extern void css_schedule_reprobe(void); +extern void *cio_dma_zalloc(size_t size); +extern void cio_dma_free(void *cpu_addr, size_t size); + /* Function from drivers/s390/cio/chsc.c */ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index aea502922646..72629d99d8e4 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -20,6 +20,8 @@ #include <linux/reboot.h> #include <linux/suspend.h> #include <linux/proc_fs.h> +#include <linux/genalloc.h> +#include <linux/dma-mapping.h> #include <asm/isc.h> #include <asm/crw.h> @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = { NULL, }; +static u64 css_dev_dma_mask = DMA_BIT_MASK(31); + static int __init setup_css(int nr) { struct channel_subsystem *css; @@ -899,6 +903,9 @@ static int __init setup_css(int nr) dev_set_name(&css->device, "css%x", nr); css->device.groups = cssdev_attr_groups; css->device.release = channel_subsystem_release; + /* some cio DMA memory needs to be 31 bit addressable */ + css->device.coherent_dma_mask = DMA_BIT_MASK(31), + css->device.dma_mask = &css_dev_dma_mask; mutex_init(&css->mutex); css->cssid = chsc_get_cssid(nr); @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = { .notifier_call = css_power_event, }; +#define POOL_INIT_PAGES 1 +static struct 
gen_pool *cio_dma_pool; +/* Currently cio supports only a single css */ +static struct device *cio_dma_css; +static gfp_t cio_dma_flags; + +static void __init cio_dma_pool_init(void) +{ + void *cpu_addr; + dma_addr_t dma_addr; + int i; + + cio_dma_css = &channel_subsystems[0]->device; + cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO; + cio_dma_pool = gen_pool_create(3, -1); + /* No need to free up the resources: compiled in */ + for (i = 0; i < POOL_INIT_PAGES; ++i) { + cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr, + cio_dma_flags); + if (!cpu_addr) + return; + gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr, + dma_addr, PAGE_SIZE, -1); + } + +} + +void *cio_dma_zalloc(size_t size) +{ + dma_addr_t dma_addr; + unsigned long addr = gen_pool_alloc(cio_dma_pool, size); + + if (!addr) { + addr = (unsigned long) dma_alloc_coherent(cio_dma_css, + PAGE_SIZE, &dma_addr, cio_dma_flags); + if (!addr) + return NULL; + gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1); + addr = gen_pool_alloc(cio_dma_pool, size); + } + return (void *) addr; +} + +void cio_dma_free(void *cpu_addr, size_t size) +{ + memset(cpu_addr, 0, size); + gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size); +} + /* * Now that the driver core is running, we can setup our channel subsystem. * The struct subchannel's are created during probing. @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void) unregister_reboot_notifier(&css_reboot_notifier); goto out_unregister; } + cio_dma_pool_init(); css_init_done = 1; /* Enable default isc for I/O subchannels. */
To support protected virtualization cio will need to make sure the memory used for communication with the hypervisor is DMA memory. Let us introduce a DMA pool to cio that will help us in allocating and deallocating those chunks of memory. We use a gen_pool backed with DMA pages to avoid each allocation effectively wasting a page, as we typically allocate much less than PAGE_SIZE. Signed-off-by: Halil Pasic <pasic@linux.ibm.com> --- arch/s390/Kconfig | 1 + arch/s390/include/asm/cio.h | 3 +++ drivers/s390/cio/css.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+)