diff mbox series

udmabuf: revert 'Add support for mapping hugepages (v4)'

Message ID 20230608204927.88711-1-mike.kravetz@oracle.com (mailing list archive)
State New, archived
Headers show
Series udmabuf: revert 'Add support for mapping hugepages (v4)' | expand

Commit Message

Mike Kravetz June 8, 2023, 8:49 p.m. UTC
This effectively reverts commit 16c243e99d33 ("udmabuf: Add support
for mapping hugepages (v4)").  Recently, Junxiao Chang found a BUG
with page map counting as described here [1].  This issue pointed out
that the udmabuf driver was making direct use of subpages of hugetlb
pages.  This is not a good idea, and no other mm code attempts such use.
In addition to the mapcount issue, this also causes issues with hugetlb
vmemmap optimization and page poisoning.

For now, remove hugetlb support.

If udmabuf wants to be used on hugetlb mappings, it should be changed to
only use complete hugetlb pages.  This will require different alignment
and size requirements on the UDMABUF_CREATE API.

[1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.chang@intel.com/

Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)")
Cc: <stable@vger.kernel.org>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 drivers/dma-buf/udmabuf.c | 47 +++++----------------------------------
 1 file changed, 6 insertions(+), 41 deletions(-)

Comments

Vivek Kasireddy June 12, 2023, 7:10 a.m. UTC | #1
Hi Mike,

Sorry for the late reply; I just got back from vacation.
If it is unsafe to directly use the subpages of a hugetlb page, then reverting
this patch seems like the only option for addressing this issue immediately.
So, this patch is
Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>

As far as the use-case is concerned, there are two main users of the udmabuf
driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only one
that uses hugetlb pages (when hugetlb=on is set) as the backing store for
Guest (Linux, Android and Windows) system memory. The main goal is to
share the pages associated with the Guest allocated framebuffer (FB) with
the Host GPU driver and other components in a zero-copy way. To that end,
the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
the FB) and pins them before sharing the (guest) physical (or dma) addresses
(and lengths) with Qemu. Qemu then translates the addresses into file
offsets and shares these offsets with udmabuf. 

The udmabuf driver obtains the pages associated with the file offsets and
uses these pages to eventually populate a scatterlist. It also creates a 
dmabuf fd and acts as the exporter. AFAIK, it should be possible to populate
the scatterlist with physical/dma addresses (of huge pages) instead of using
subpages but this might limit the capabilities of some (dmabuf) importers.
I'll try to figure out a solution using physical/dma addresses and see if it
works as expected, and will share the patch on linux-mm to request
feedback once it is ready.

Thanks,
Vivek

> 
> This effectively reverts commit 16c243e99d33 ("udmabuf: Add support
> for mapping hugepages (v4)").  Recently, Junxiao Chang found a BUG
> with page map counting as described here [1].  This issue pointed out
> that the udmabuf driver was making direct use of subpages of hugetlb
> pages.  This is not a good idea, and no other mm code attempts such use.
> In addition to the mapcount issue, this also causes issues with hugetlb
> vmemmap optimization and page poisoning.
> 
> For now, remove hugetlb support.
> 
> If udmabuf wants to be used on hugetlb mappings, it should be changed to
> only use complete hugetlb pages.  This will require different alignment
> and size requirements on the UDMABUF_CREATE API.
> 
> [1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-
> junxiao.chang@intel.com/
> 
> Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  drivers/dma-buf/udmabuf.c | 47 +++++----------------------------------
>  1 file changed, 6 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 01f2e86f3f7c..12cf6bb2e3ce 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -12,7 +12,6 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  #include <linux/udmabuf.h>
> -#include <linux/hugetlb.h>
>  #include <linux/vmalloc.h>
>  #include <linux/iosys-map.h>
> 
> @@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice
> *device,
>  	struct udmabuf *ubuf;
>  	struct dma_buf *buf;
>  	pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> -	struct page *page, *hpage = NULL;
> -	pgoff_t subpgoff, maxsubpgs;
> -	struct hstate *hpstate;
> +	struct page *page;
>  	int seals, ret = -EINVAL;
>  	u32 i, flags;
> 
> @@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice
> *device,
>  		if (!memfd)
>  			goto err;
>  		mapping = memfd->f_mapping;
> -		if (!shmem_mapping(mapping) &&
> !is_file_hugepages(memfd))
> +		if (!shmem_mapping(mapping))
>  			goto err;
>  		seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
>  		if (seals == -EINVAL)
> @@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice
> *device,
>  			goto err;
>  		pgoff = list[i].offset >> PAGE_SHIFT;
>  		pgcnt = list[i].size   >> PAGE_SHIFT;
> -		if (is_file_hugepages(memfd)) {
> -			hpstate = hstate_file(memfd);
> -			pgoff = list[i].offset >> huge_page_shift(hpstate);
> -			subpgoff = (list[i].offset &
> -				    ~huge_page_mask(hpstate)) >>
> PAGE_SHIFT;
> -			maxsubpgs = huge_page_size(hpstate) >>
> PAGE_SHIFT;
> -		}
>  		for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> -			if (is_file_hugepages(memfd)) {
> -				if (!hpage) {
> -					hpage =
> find_get_page_flags(mapping, pgoff,
> -
> FGP_ACCESSED);
> -					if (!hpage) {
> -						ret = -EINVAL;
> -						goto err;
> -					}
> -				}
> -				page = hpage + subpgoff;
> -				get_page(page);
> -				subpgoff++;
> -				if (subpgoff == maxsubpgs) {
> -					put_page(hpage);
> -					hpage = NULL;
> -					subpgoff = 0;
> -					pgoff++;
> -				}
> -			} else {
> -				page =
> shmem_read_mapping_page(mapping,
> -							       pgoff + pgidx);
> -				if (IS_ERR(page)) {
> -					ret = PTR_ERR(page);
> -					goto err;
> -				}
> +			page = shmem_read_mapping_page(mapping, pgoff
> + pgidx);
> +			if (IS_ERR(page)) {
> +				ret = PTR_ERR(page);
> +				goto err;
>  			}
>  			ubuf->pages[pgbuf++] = page;
>  		}
>  		fput(memfd);
>  		memfd = NULL;
> -		if (hpage) {
> -			put_page(hpage);
> -			hpage = NULL;
> -		}
>  	}
> 
>  	exp_info.ops  = &udmabuf_ops;
> --
> 2.40.1
David Hildenbrand June 12, 2023, 7:46 a.m. UTC | #2
On 12.06.23 09:10, Kasireddy, Vivek wrote:
> Hi Mike,

Hi Vivek,

> 
> Sorry for the late reply; I just got back from vacation.
> If it is unsafe to directly use the subpages of a hugetlb page, then reverting
> this patch seems like the only option for addressing this issue immediately.
> So, this patch is
> Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> 
> As far as the use-case is concerned, there are two main users of the udmabuf
> driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only one
> that uses hugetlb pages (when hugetlb=on is set) as the backing store for
> Guest (Linux, Android and Windows) system memory. The main goal is to
> share the pages associated with the Guest allocated framebuffer (FB) with
> the Host GPU driver and other components in a zero-copy way. To that end,
> the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
> the FB) and pins them before sharing the (guest) physical (or dma) addresses
> (and lengths) with Qemu. Qemu then translates the addresses into file
> offsets and shares these offsets with udmabuf.

Is my understanding correct, that we can effectively long-term pin 
(worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually !root 
users

ll /dev/udmabuf
crw-rw---- 1 root kvm 10, 125 12. Jun 08:12 /dev/udmabuf

to bypass there effective MEMLOCK limit, fragmenting physical memory and 
breaking swap?


Regarding the udmabuf_vm_fault(), I assume we're mapping pages we 
obtained from the memfd ourselves into a special VMA (mmap() of the 
udmabuf). I'm not sure how well shmem pages are prepared for getting 
mapped by someone else into an arbitrary VMA (page->index?).

... also, just imagine someone doing FALLOC_FL_PUNCH_HOLE / ftruncate() 
on the memfd. What's mapped into the memfd no longer corresponds to 
what's pinned / mapped into the VMA.


Was linux-mm (and especially shmem maintainers, ccing Hugh) involved in 
the upstreaming of udmabuf?
Vivek Kasireddy June 13, 2023, 8:26 a.m. UTC | #3
Hi David,

> 
> On 12.06.23 09:10, Kasireddy, Vivek wrote:
> > Hi Mike,
> 
> Hi Vivek,
> 
> >
> > Sorry for the late reply; I just got back from vacation.
> > If it is unsafe to directly use the subpages of a hugetlb page, then reverting
> > this patch seems like the only option for addressing this issue immediately.
> > So, this patch is
> > Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> >
> > As far as the use-case is concerned, there are two main users of the
> udmabuf
> > driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only
> one
> > that uses hugetlb pages (when hugetlb=on is set) as the backing store for
> > Guest (Linux, Android and Windows) system memory. The main goal is to
> > share the pages associated with the Guest allocated framebuffer (FB) with
> > the Host GPU driver and other components in a zero-copy way. To that
> end,
> > the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
> > the FB) and pins them before sharing the (guest) physical (or dma)
> addresses
> > (and lengths) with Qemu. Qemu then translates the addresses into file
> > offsets and shares these offsets with udmabuf.
> 
> Is my understanding correct, that we can effectively long-term pin
> (worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually !root
The 64 MiB limit is the theoretical upper bound that we have not seen hit in 
practice. Typically, for a 1920x1080 resolution (commonly used in Guests),
the size of the FB is ~8 MB (1920x1080x4). And, most modern Graphics
compositors flip between two FBs.

> users
> 
> ll /dev/udmabuf
> crw-rw---- 1 root kvm 10, 125 12. Jun 08:12 /dev/udmabuf
> 
> to bypass there effective MEMLOCK limit, fragmenting physical memory and
> breaking swap?
Right, it does not look like the mlock limits are honored.

> 
> Regarding the udmabuf_vm_fault(), I assume we're mapping pages we
> obtained from the memfd ourselves into a special VMA (mmap() of the
mmap operation is really needed only if any component on the Host needs
CPU access to the buffer. But in most scenarios, we try to ensure direct GPU
access (h/w acceleration via gl) to these pages.

> udmabuf). I'm not sure how well shmem pages are prepared for getting
> mapped by someone else into an arbitrary VMA (page->index?).
Most drm/gpu drivers use shmem pages as the backing store for FBs and
other buffers and also provide mmap capability. What concerns do you see
with this approach?

> 
> ... also, just imagine someone doing FALLOC_FL_PUNCH_HOLE / ftruncate()
> on the memfd. What's mapped into the memfd no longer corresponds to
> what's pinned / mapped into the VMA.
IIUC, making use of the DMA_BUF_IOCTL_SYNC ioctl would help with any
coherency issues:
https://www.kernel.org/doc/html/v6.2/driver-api/dma-buf.html#c.dma_buf_sync

> 
> 
> Was linux-mm (and especially shmem maintainers, ccing Hugh) involved in
> the upstreaming of udmabuf?
It does not appear so from the link below although other key lists were cc'd:
https://patchwork.freedesktop.org/patch/246100/?series=39879&rev=7

Thanks,
Vivek
> 
> --
> Cheers,
> 
> David / dhildenb
David Laight June 13, 2023, 12:25 p.m. UTC | #4
From: Kasireddy, Vivek <vivek.kasireddy@intel.com>
> Sent: 13 June 2023 09:26
...
> > Is my understanding correct, that we can effectively long-term pin
> > (worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually !root
> > users
>
> The 64 MiB limit is the theoretical upper bound that we have not seen hit in
> practice. Typically, for a 1920x1080 resolution (commonly used in Guests),
> the size of the FB is ~8 MB (1920x1080x4). And, most modern Graphics
> compositors flip between two FBs.

What code does and what potentially malicious code might do
are entirely different things.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
David Hildenbrand June 13, 2023, 5:51 p.m. UTC | #5
On 13.06.23 10:26, Kasireddy, Vivek wrote:
> Hi David,
> 
>>
>> On 12.06.23 09:10, Kasireddy, Vivek wrote:
>>> Hi Mike,
>>
>> Hi Vivek,
>>
>>>
>>> Sorry for the late reply; I just got back from vacation.
>>> If it is unsafe to directly use the subpages of a hugetlb page, then reverting
>>> this patch seems like the only option for addressing this issue immediately.
>>> So, this patch is
>>> Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
>>>
>>> As far as the use-case is concerned, there are two main users of the
>> udmabuf
>>> driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only
>> one
>>> that uses hugetlb pages (when hugetlb=on is set) as the backing store for
>>> Guest (Linux, Android and Windows) system memory. The main goal is to
>>> share the pages associated with the Guest allocated framebuffer (FB) with
>>> the Host GPU driver and other components in a zero-copy way. To that
>> end,
>>> the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
>>> the FB) and pins them before sharing the (guest) physical (or dma)
>> addresses
>>> (and lengths) with Qemu. Qemu then translates the addresses into file
>>> offsets and shares these offsets with udmabuf.
>>
>> Is my understanding correct, that we can effectively long-term pin
>> (worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually !root
> The 64 MiB limit is the theoretical upper bound that we have not seen hit in
> practice. Typically, for a 1920x1080 resolution (commonly used in Guests),
> the size of the FB is ~8 MB (1920x1080x4). And, most modern Graphics
> compositors flip between two FBs.
> 

Okay, but users with privileges to open that file can just create as 
many as they want? I think I'll have to play with it.

>> users
>>
>> ll /dev/udmabuf
>> crw-rw---- 1 root kvm 10, 125 12. Jun 08:12 /dev/udmabuf
>>
>> to bypass there effective MEMLOCK limit, fragmenting physical memory and
>> breaking swap?
> Right, it does not look like the mlock limits are honored.
> 

That should be added.

>>
>> Regarding the udmabuf_vm_fault(), I assume we're mapping pages we
>> obtained from the memfd ourselves into a special VMA (mmap() of the
> mmap operation is really needed only if any component on the Host needs
> CPU access to the buffer. But in most scenarios, we try to ensure direct GPU
> access (h/w acceleration via gl) to these pages.
> 
>> udmabuf). I'm not sure how well shmem pages are prepared for getting
>> mapped by someone else into an arbitrary VMA (page->index?).
> Most drm/gpu drivers use shmem pages as the backing store for FBs and
> other buffers and also provide mmap capability. What concerns do you see
> with this approach?

Are these mmaping the pages the way udmabuf maps these pages (IOW, 
on-demand fault where we core-mm will adjust the mapcount etc)?

Skimming over at shmem_read_mapping_page() users, I assume most of them 
use a VM_PFNMAP mapping (or don't mmap them at all), where we won't be 
messing with the struct page at all.

(That might even allow you to mmap hugetlb sub-pages, because the struct 
page -- and mapcount -- will be ignored completely and not touched.)

> 
>>
>> ... also, just imagine someone doing FALLOC_FL_PUNCH_HOLE / ftruncate()
>> on the memfd. What's mapped into the memfd no longer corresponds to
>> what's pinned / mapped into the VMA.
> IIUC, making use of the DMA_BUF_IOCTL_SYNC ioctl would help with any
> coherency issues:
> https://www.kernel.org/doc/html/v6.2/driver-api/dma-buf.html#c.dma_buf_sync
> 

Would it as of now? udmabuf_create() pulls the shmem pages out of the 
memfd, not sure how DMA_BUF_IOCTL_SYNC would help to update that 
whenever the pages inside the memfd would change (e.g., 
FALLOC_FL_PUNCH_HOLE + realloc).

But that's most probably simply "not supported".

>>
>>
>> Was linux-mm (and especially shmem maintainers, ccing Hugh) involved in
>> the upstreaming of udmabuf?
> It does not appear so from the link below although other key lists were cc'd:
> https://patchwork.freedesktop.org/patch/246100/?series=39879&rev=7

That's unfortunate :(
Hugh Dickins June 14, 2023, 3:40 a.m. UTC | #6
On Tue, 13 Jun 2023, David Hildenbrand wrote:
> On 13.06.23 10:26, Kasireddy, Vivek wrote:
> >> On 12.06.23 09:10, Kasireddy, Vivek wrote:
> >>> Sorry for the late reply; I just got back from vacation.
> >>> If it is unsafe to directly use the subpages of a hugetlb page, then
> >>> reverting
> >>> this patch seems like the only option for addressing this issue
> >>> immediately.
> >>> So, this patch is
> >>> Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> >>>
> >>> As far as the use-case is concerned, there are two main users of the
> >> udmabuf
> >>> driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only
> >> one
> >>> that uses hugetlb pages (when hugetlb=on is set) as the backing store for
> >>> Guest (Linux, Android and Windows) system memory. The main goal is to
> >>> share the pages associated with the Guest allocated framebuffer (FB) with
> >>> the Host GPU driver and other components in a zero-copy way. To that
> >> end,
> >>> the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
> >>> the FB) and pins them before sharing the (guest) physical (or dma)
> >> addresses
> >>> (and lengths) with Qemu. Qemu then translates the addresses into file
> >>> offsets and shares these offsets with udmabuf.
> >>
> >> Is my understanding correct, that we can effectively long-term pin
> >> (worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually !root
> > The 64 MiB limit is the theoretical upper bound that we have not seen hit in
> > practice. Typically, for a 1920x1080 resolution (commonly used in Guests),
> > the size of the FB is ~8 MB (1920x1080x4). And, most modern Graphics
> > compositors flip between two FBs.
> > 
> 
> Okay, but users with privileges to open that file can just create as many as
> they want? I think I'll have to play with it.
> 
> >> users
> >>
> >> ll /dev/udmabuf
> >> crw-rw---- 1 root kvm 10, 125 12. Jun 08:12 /dev/udmabuf
> >>
> >> to bypass there effective MEMLOCK limit, fragmenting physical memory and
> >> breaking swap?
> > Right, it does not look like the mlock limits are honored.
> > 
> 
> That should be added.

Agreed.

> 
> >>
> >> Regarding the udmabuf_vm_fault(), I assume we're mapping pages we
> >> obtained from the memfd ourselves into a special VMA (mmap() of the
> > mmap operation is really needed only if any component on the Host needs
> > CPU access to the buffer. But in most scenarios, we try to ensure direct GPU
> > access (h/w acceleration via gl) to these pages.
> > 
> >> udmabuf). I'm not sure how well shmem pages are prepared for getting
> >> mapped by someone else into an arbitrary VMA (page->index?).
> > Most drm/gpu drivers use shmem pages as the backing store for FBs and
> > other buffers and also provide mmap capability. What concerns do you see
> > with this approach?
> 
> Are these mmaping the pages the way udmabuf maps these pages (IOW, on-demand
> fault where we core-mm will adjust the mapcount etc)?
> 
> Skimming over at shmem_read_mapping_page() users, I assume most of them use a
> VM_PFNMAP mapping (or don't mmap them at all), where we won't be messing with
> the struct page at all.
> 
> (That might even allow you to mmap hugetlb sub-pages, because the struct page
> -- and mapcount -- will be ignored completely and not touched.)

You're well ahead of me: I didn't reach an understanding of whether or not
mapcount would get manipulated here - though if Junxiao's original patch
did fix the immediate hugetlb symptoms, presumably it is (and without much
point, since udmabuf holds on to that extra reference which pins each
page for the duration).

> 
> > 
> >>
> >> ... also, just imagine someone doing FALLOC_FL_PUNCH_HOLE / ftruncate()
> >> on the memfd. What's mapped into the memfd no longer corresponds to
> >> what's pinned / mapped into the VMA.
> > IIUC, making use of the DMA_BUF_IOCTL_SYNC ioctl would help with any
> > coherency issues:
> > https://www.kernel.org/doc/html/v6.2/driver-api/dma-buf.html#c.dma_buf_sync
> > 
> 
> Would it as of now? udmabuf_create() pulls the shmem pages out of the memfd,
> not sure how DMA_BUF_IOCTL_SYNC would help to update that whenever the pages
> inside the memfd would change (e.g., FALLOC_FL_PUNCH_HOLE + realloc).
> 
> But that's most probably simply "not supported".

Yes, the pages which udmabuf is holding would be the originals: they will
then be detached from the hole-punched file, and subsequent faults or writes
to that backing file (through shmem, rather than through udmabuf) can fill
in the holes with new, different pages.  So long as that's well understood,
then it's not necessarily a disaster.

I see udmabuf asks for SEAL_SHRINK (I guess to keep away from SIGBUS),
but refuses SEAL_WRITE - so hole-punching remains permitted.

> 
> >>
> >>
> >> Was linux-mm (and especially shmem maintainers, ccing Hugh) involved in
> >> the upstreaming of udmabuf?

Thanks for the Cc, David.  No, I wasn't involved at all; but I probably
would not have understood their needs much better then than now.

I don't see anything obviously wrong with its use of shmem, aside from
the unlimited pinning of pages which you pointed out; and I'll tend to
assume that it's okay, from its five years of use.  But certainly the
more recent addition of hugetlb was mistaken, and needs to be reverted.

> > It does not appear so from the link below although other key lists were
> > cc'd:
> > https://patchwork.freedesktop.org/patch/246100/?series=39879&rev=7

The i915 folks (looks like Daniel Vetter was involved there) have been
using shmem_read_mapping_page() for a very long time: but they take care
to register a shrinker and swap out under pressure, rather than holding
pins indefinitely.

I wonder, if we're taking MFD_HUGETLB away from them, whether this
would be another call for MFD_HUGEPAGE (shmem memfd using THPs):
https://lore.kernel.org/linux-mm/c140f56a-1aa3-f7ae-b7d1-93da7d5a3572@google.com/

And that series did also support F_MEM_LOCK, which could be used to
help with the accounting of the locked pages.  (But IIRC the necessary
way of accounting changed just afterwards - or was it just before? -
so that old series may not show what's needed today.)

I was happy with using fcntls in that series; but could not decide the
right restrictionss for F_MEM_UNLOCK (how assured is a memlock if anyone
can unlock it?) - maybe F_MEM_UNLOCK should be refused while pins are
outstanding.

But I digress.  Yes, please do revert that hugetlb usage from udmabuf.

Hugh

> 
> That's unfortunate :(
> 
> -- 
> Cheers,
> 
> David / dhildenb
Vivek Kasireddy June 14, 2023, 7:51 a.m. UTC | #7
Hi David,

> 
> On 13.06.23 10:26, Kasireddy, Vivek wrote:
> > Hi David,
> >
> >>
> >> On 12.06.23 09:10, Kasireddy, Vivek wrote:
> >>> Hi Mike,
> >>
> >> Hi Vivek,
> >>
> >>>
> >>> Sorry for the late reply; I just got back from vacation.
> >>> If it is unsafe to directly use the subpages of a hugetlb page, then
> reverting
> >>> this patch seems like the only option for addressing this issue
> immediately.
> >>> So, this patch is
> >>> Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
> >>>
> >>> As far as the use-case is concerned, there are two main users of the
> >> udmabuf
> >>> driver: Qemu and CrosVM VMMs. However, it appears Qemu is the only
> >> one
> >>> that uses hugetlb pages (when hugetlb=on is set) as the backing store for
> >>> Guest (Linux, Android and Windows) system memory. The main goal is
> to
> >>> share the pages associated with the Guest allocated framebuffer (FB)
> with
> >>> the Host GPU driver and other components in a zero-copy way. To that
> >> end,
> >>> the guest GPU driver (virtio-gpu) allocates 4k size pages (associated with
> >>> the FB) and pins them before sharing the (guest) physical (or dma)
> >> addresses
> >>> (and lengths) with Qemu. Qemu then translates the addresses into file
> >>> offsets and shares these offsets with udmabuf.
> >>
> >> Is my understanding correct, that we can effectively long-term pin
> >> (worse than mlock) 64 MiB per UDMABUF_CREATE, allowing eventually
> !root
> > The 64 MiB limit is the theoretical upper bound that we have not seen hit in
> > practice. Typically, for a 1920x1080 resolution (commonly used in Guests),
> > the size of the FB is ~8 MB (1920x1080x4). And, most modern Graphics
> > compositors flip between two FBs.
> >
> 
> Okay, but users with privileges to open that file can just create as
> many as they want? I think I'll have to play with it.
Yeah, unfortunately, we are not restricting the total number of FBs or other
buffers that are mapped by udambuf per user. We'll definitely try to add a
patch to align it with mlock limits.

> 
> >> users
> >>
> >> ll /dev/udmabuf
> >> crw-rw---- 1 root kvm 10, 125 12. Jun 08:12 /dev/udmabuf
> >>
> >> to bypass there effective MEMLOCK limit, fragmenting physical memory
> and
> >> breaking swap?
> > Right, it does not look like the mlock limits are honored.
> >
> 
> That should be added.
> 
> >>
> >> Regarding the udmabuf_vm_fault(), I assume we're mapping pages we
> >> obtained from the memfd ourselves into a special VMA (mmap() of the
> > mmap operation is really needed only if any component on the Host needs
> > CPU access to the buffer. But in most scenarios, we try to ensure direct GPU
> > access (h/w acceleration via gl) to these pages.
> >
> >> udmabuf). I'm not sure how well shmem pages are prepared for getting
> >> mapped by someone else into an arbitrary VMA (page->index?).
> > Most drm/gpu drivers use shmem pages as the backing store for FBs and
> > other buffers and also provide mmap capability. What concerns do you see
> > with this approach?
> 
> Are these mmaping the pages the way udmabuf maps these pages (IOW,
> on-demand fault where we core-mm will adjust the mapcount etc)?
> 
> Skimming over at shmem_read_mapping_page() users, I assume most of
> them
> use a VM_PFNMAP mapping (or don't mmap them at all), where we won't be
> messing with the struct page at all.
> 
> (That might even allow you to mmap hugetlb sub-pages, because the struct
> page -- and mapcount -- will be ignored completely and not touched.)
Oh, are you suggesting that if we do vma->vm_flags |= VM_PFNMAP
in the mmap handler (mmap_udmabuf) and also do
vmf_insert_pfn(vma, vmf->address, page_to_pfn(page))
instead of
vmf->page = ubuf->pages[pgoff];
get_page(vmf->page);

in the vma fault handler (udmabuf_vm_fault), we can avoid most of the
pitfalls you have identified -- including with the usage of hugetlb subpages? 

> 
> >
> >>
> >> ... also, just imagine someone doing FALLOC_FL_PUNCH_HOLE /
> ftruncate()
> >> on the memfd. What's mapped into the memfd no longer corresponds to
> >> what's pinned / mapped into the VMA.
> > IIUC, making use of the DMA_BUF_IOCTL_SYNC ioctl would help with any
> > coherency issues:
> > https://www.kernel.org/doc/html/v6.2/driver-api/dma-
> buf.html#c.dma_buf_sync
> >
> 
> Would it as of now? udmabuf_create() pulls the shmem pages out of the
> memfd, not sure how DMA_BUF_IOCTL_SYNC would help to update that
> whenever the pages inside the memfd would change (e.g.,
> FALLOC_FL_PUNCH_HOLE + realloc).
AFAIU, Qemu owns the memfd and if it were to somehow punch a hole in 
its memfd, it is not clear to me if that would affect the Guest pinned FB
pages as well. Regardless, what do you suggest is the right thing to do in
this case on the udmabuf side?

Thanks,
Vivek

> 
> But that's most probably simply "not supported".
> 
> >>
> >>
> >> Was linux-mm (and especially shmem maintainers, ccing Hugh) involved
> in
> >> the upstreaming of udmabuf?
> > It does not appear so from the link below although other key lists were
> cc'd:
> > https://patchwork.freedesktop.org/patch/246100/?series=39879&rev=7
> 
> That's unfortunate :(
> 
> --
> Cheers,
> 
> David / dhildenb
David Hildenbrand June 15, 2023, 9:48 a.m. UTC | #8
>> Skimming over at shmem_read_mapping_page() users, I assume most of
>> them
>> use a VM_PFNMAP mapping (or don't mmap them at all), where we won't be
>> messing with the struct page at all.
>>
>> (That might even allow you to mmap hugetlb sub-pages, because the struct
>> page -- and mapcount -- will be ignored completely and not touched.)
> Oh, are you suggesting that if we do vma->vm_flags |= VM_PFNMAP
> in the mmap handler (mmap_udmabuf) and also do
> vmf_insert_pfn(vma, vmf->address, page_to_pfn(page))
> instead of
> vmf->page = ubuf->pages[pgoff];
> get_page(vmf->page);
> 
> in the vma fault handler (udmabuf_vm_fault), we can avoid most of the
> pitfalls you have identified -- including with the usage of hugetlb subpages?

Yes, that's my thinking, but I have to do my homework first to see if 
that would really work for hugetlb.

The thing is, I kind-of consider what udmabuf does a layer violation: we 
have a filesystem (shmem/hugetlb) that should handle mappings to user 
space. Yet, a driver decides to bypass that and simply map the pages 
ordinarily to user space. (revealed by the fact that hugetlb does never 
map sub-pages but udmabuf decides to do so)

In an ideal world everybody would simply mmap() the original memfd, but 
thinking about offset+size configuration within the memfd that might not 
always be desirable. As a workaround, we could mmap() only the PFNs, 
leaving the struct page unaffected.

I'll have to look closer into that.
Gerd Hoffmann June 19, 2023, 12:35 p.m. UTC | #9
On Thu, Jun 08, 2023 at 01:49:27PM -0700, Mike Kravetz wrote:
> This effectively reverts commit 16c243e99d33 ("udmabuf: Add support
> for mapping hugepages (v4)").  Recently, Junxiao Chang found a BUG
> with page map counting as described here [1].  This issue pointed out
> that the udmabuf driver was making direct use of subpages of hugetlb
> pages.  This is not a good idea, and no other mm code attempts such use.
> In addition to the mapcount issue, this also causes issues with hugetlb
> vmemmap optimization and page poisoning.
> 
> For now, remove hugetlb support.
> 
> If udmabuf wants to be used on hugetlb mappings, it should be changed to
> only use complete hugetlb pages.  This will require different alignment
> and size requirements on the UDMABUF_CREATE API.
> 
> [1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.chang@intel.com/
> 
> Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>

Acked-by: Gerd Hoffmann <kraxel@redhat.com>
diff mbox series

Patch

diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
index 01f2e86f3f7c..12cf6bb2e3ce 100644
--- a/drivers/dma-buf/udmabuf.c
+++ b/drivers/dma-buf/udmabuf.c
@@ -12,7 +12,6 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/udmabuf.h>
-#include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
 #include <linux/iosys-map.h>
 
@@ -207,9 +206,7 @@  static long udmabuf_create(struct miscdevice *device,
 	struct udmabuf *ubuf;
 	struct dma_buf *buf;
 	pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
-	struct page *page, *hpage = NULL;
-	pgoff_t subpgoff, maxsubpgs;
-	struct hstate *hpstate;
+	struct page *page;
 	int seals, ret = -EINVAL;
 	u32 i, flags;
 
@@ -245,7 +242,7 @@  static long udmabuf_create(struct miscdevice *device,
 		if (!memfd)
 			goto err;
 		mapping = memfd->f_mapping;
-		if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
+		if (!shmem_mapping(mapping))
 			goto err;
 		seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
 		if (seals == -EINVAL)
@@ -256,48 +253,16 @@  static long udmabuf_create(struct miscdevice *device,
 			goto err;
 		pgoff = list[i].offset >> PAGE_SHIFT;
 		pgcnt = list[i].size   >> PAGE_SHIFT;
-		if (is_file_hugepages(memfd)) {
-			hpstate = hstate_file(memfd);
-			pgoff = list[i].offset >> huge_page_shift(hpstate);
-			subpgoff = (list[i].offset &
-				    ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
-			maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
-		}
 		for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-			if (is_file_hugepages(memfd)) {
-				if (!hpage) {
-					hpage = find_get_page_flags(mapping, pgoff,
-								    FGP_ACCESSED);
-					if (!hpage) {
-						ret = -EINVAL;
-						goto err;
-					}
-				}
-				page = hpage + subpgoff;
-				get_page(page);
-				subpgoff++;
-				if (subpgoff == maxsubpgs) {
-					put_page(hpage);
-					hpage = NULL;
-					subpgoff = 0;
-					pgoff++;
-				}
-			} else {
-				page = shmem_read_mapping_page(mapping,
-							       pgoff + pgidx);
-				if (IS_ERR(page)) {
-					ret = PTR_ERR(page);
-					goto err;
-				}
+			page = shmem_read_mapping_page(mapping, pgoff + pgidx);
+			if (IS_ERR(page)) {
+				ret = PTR_ERR(page);
+				goto err;
 			}
 			ubuf->pages[pgbuf++] = page;
 		}
 		fput(memfd);
 		memfd = NULL;
-		if (hpage) {
-			put_page(hpage);
-			hpage = NULL;
-		}
 	}
 
 	exp_info.ops  = &udmabuf_ops;