diff mbox series

[v4,bpf-next,06/10] lib/buildid: implement sleepable build_id_parse() API

Message ID 20240807234029.456316-7-andrii@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Harden and extend ELF build ID parsing logic | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 42 this patch: 42
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 2 of 2 maintainers
netdev/build_clang success Errors and warnings before: 43 this patch: 43
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 15724 this patch: 43
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Andrii Nakryiko Aug. 7, 2024, 11:40 p.m. UTC
Extend freader with a flag specifying whether it's OK to cause page
fault to fetch file data that is not already physically present in
memory. With this, it's now easy to wait for data if the caller is
running in sleepable (faultable) context.

We utilize read_cache_folio() to bring the desired folio into page
cache, after which the rest of the logic works just the same at folio level.

Suggested-by: Omar Sandoval <osandov@fb.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

Comments

Shakeel Butt Aug. 8, 2024, 6:40 p.m. UTC | #1
On Wed, Aug 07, 2024 at 04:40:25PM GMT, Andrii Nakryiko wrote:
> Extend freader with a flag specifying whether it's OK to cause page
> fault to fetch file data that is not already physically present in
> memory. With this, it's now easy to wait for data if the caller is
> running in sleepable (faultable) context.
> 
> We utilize read_cache_folio() to bring the desired folio into page
> cache, after which the rest of the logic works just the same at folio level.
> 
> Suggested-by: Omar Sandoval <osandov@fb.com>
> Cc: Shakeel Butt <shakeel.butt@linux.dev>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> ---
>  lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
>  1 file changed, 28 insertions(+), 16 deletions(-)
> 
> diff --git a/lib/buildid.c b/lib/buildid.c
> index 5e6f842f56f0..e1c01b23efd8 100644
> --- a/lib/buildid.c
> +++ b/lib/buildid.c
> @@ -20,6 +20,7 @@ struct freader {
>  			struct folio *folio;
>  			void *addr;
>  			loff_t folio_off;
> +			bool may_fault;
>  		};
>  		struct {
>  			const char *data;
> @@ -29,12 +30,13 @@ struct freader {
>  };
>  
>  static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
> -				   struct address_space *mapping)
> +				   struct address_space *mapping, bool may_fault)
>  {
>  	memset(r, 0, sizeof(*r));
>  	r->buf = buf;
>  	r->buf_sz = buf_sz;
>  	r->mapping = mapping;
> +	r->may_fault = may_fault;
>  }
>  
>  static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
> @@ -63,6 +65,11 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
>  	freader_put_folio(r);
>  
>  	r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
> +
> +	/* if sleeping is allowed, wait for the page, if necessary */
> +	if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
> +		r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);

Willy's network fs comment is bugging me. If we pass NULL for filler,
the kernel is going to use the fs's read_folio() callback. I have checked
read_folio() for fuse and nfs and it seems that, for at least these two
filesystems, the callback accesses file->private_data. So, if the ELF
file is on one of these filesystems, we might see NULL pointer dereferences.
Andrii Nakryiko Aug. 8, 2024, 8:15 p.m. UTC | #2
On Thu, Aug 8, 2024 at 11:40 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, Aug 07, 2024 at 04:40:25PM GMT, Andrii Nakryiko wrote:
> > Extend freader with a flag specifying whether it's OK to cause page
> > fault to fetch file data that is not already physically present in
> > memory. With this, it's now easy to wait for data if the caller is
> > running in sleepable (faultable) context.
> >
> > We utilize read_cache_folio() to bring the desired folio into page
> > cache, after which the rest of the logic works just the same at folio level.
> >
> > Suggested-by: Omar Sandoval <osandov@fb.com>
> > Cc: Shakeel Butt <shakeel.butt@linux.dev>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > ---
> >  lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
> >  1 file changed, 28 insertions(+), 16 deletions(-)
> >
> > diff --git a/lib/buildid.c b/lib/buildid.c
> > index 5e6f842f56f0..e1c01b23efd8 100644
> > --- a/lib/buildid.c
> > +++ b/lib/buildid.c
> > @@ -20,6 +20,7 @@ struct freader {
> >                       struct folio *folio;
> >                       void *addr;
> >                       loff_t folio_off;
> > +                     bool may_fault;
> >               };
> >               struct {
> >                       const char *data;
> > @@ -29,12 +30,13 @@ struct freader {
> >  };
> >
> >  static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
> > -                                struct address_space *mapping)
> > +                                struct address_space *mapping, bool may_fault)
> >  {
> >       memset(r, 0, sizeof(*r));
> >       r->buf = buf;
> >       r->buf_sz = buf_sz;
> >       r->mapping = mapping;
> > +     r->may_fault = may_fault;
> >  }
> >
> >  static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
> > @@ -63,6 +65,11 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
> >       freader_put_folio(r);
> >
> >       r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
> > +
> > +     /* if sleeping is allowed, wait for the page, if necessary */
> > +     if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
> > +             r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);
>
> Willy's network fs comment is bugging me. If we pass NULL for filler,
> the kernel will going to use fs's read_folio() callback. I have checked
> read_folio() for fuse and nfs and it seems like for at least these two
> filesystems the callback is accessing file->private_data. So, if the elf
> file is on these filesystems, we might see null accesses.
>

Isn't that just a huge problem with the read_cache_folio() interface
then? That file is optional, in general, but for some specific FS
types it's not. How is generic code supposed to know this?

Or maybe it's a bug with the nfs_read_folio() and fuse_read_folio()
implementation that they can't handle NULL file argument?
netfs_read_folio(), for example, seems to be working with file == NULL
just fine.

Matthew, can you please advise what's the right approach here? I can,
of course, always get file refcount, but most of the time it will be
just an unnecessary overhead, so ideally I'd like to avoid that. But
if I have to check each read_folio callback implementation to know
whether it's required or not, then that's not great...
Shakeel Butt Aug. 8, 2024, 9:02 p.m. UTC | #3
On Thu, Aug 08, 2024 at 01:15:52PM GMT, Andrii Nakryiko wrote:
> On Thu, Aug 8, 2024 at 11:40 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Wed, Aug 07, 2024 at 04:40:25PM GMT, Andrii Nakryiko wrote:
> > > Extend freader with a flag specifying whether it's OK to cause page
> > > fault to fetch file data that is not already physically present in
> > > memory. With this, it's now easy to wait for data if the caller is
> > > running in sleepable (faultable) context.
> > >
> > > We utilize read_cache_folio() to bring the desired folio into page
> > > cache, after which the rest of the logic works just the same at folio level.
> > >
> > > Suggested-by: Omar Sandoval <osandov@fb.com>
> > > Cc: Shakeel Butt <shakeel.butt@linux.dev>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > ---
> > >  lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
> > >  1 file changed, 28 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/lib/buildid.c b/lib/buildid.c
> > > index 5e6f842f56f0..e1c01b23efd8 100644
> > > --- a/lib/buildid.c
> > > +++ b/lib/buildid.c
> > > @@ -20,6 +20,7 @@ struct freader {
> > >                       struct folio *folio;
> > >                       void *addr;
> > >                       loff_t folio_off;
> > > +                     bool may_fault;
> > >               };
> > >               struct {
> > >                       const char *data;
> > > @@ -29,12 +30,13 @@ struct freader {
> > >  };
> > >
> > >  static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
> > > -                                struct address_space *mapping)
> > > +                                struct address_space *mapping, bool may_fault)
> > >  {
> > >       memset(r, 0, sizeof(*r));
> > >       r->buf = buf;
> > >       r->buf_sz = buf_sz;
> > >       r->mapping = mapping;
> > > +     r->may_fault = may_fault;
> > >  }
> > >
> > >  static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
> > > @@ -63,6 +65,11 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
> > >       freader_put_folio(r);
> > >
> > >       r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
> > > +
> > > +     /* if sleeping is allowed, wait for the page, if necessary */
> > > +     if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
> > > +             r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);
> >
> > Willy's network fs comment is bugging me. If we pass NULL for filler,
> > the kernel will going to use fs's read_folio() callback. I have checked
> > read_folio() for fuse and nfs and it seems like for at least these two
> > filesystems the callback is accessing file->private_data. So, if the elf
> > file is on these filesystems, we might see null accesses.
> >
> 
> Isn't that just a huge problem with the read_cache_folio() interface
> then? That file is optional, in general, but for some specific FS
> types it's not. How generic code is supposed to know this?
> 
> Or maybe it's a bug with the nfs_read_folio() and fuse_read_folio()
> implementation that they can't handle NULL file argument?
> netfs_read_folio(), for example, seems to be working with file == NULL
> just fine.

If you go a bit down in netfs_alloc_request() there is the following
code:

	if (rreq->netfs_ops->init_request) {
		ret = rreq->netfs_ops->init_request(rreq, file);
		...
	...

I think this init_request is pointing to nfs_netfs_init_request, which
calls nfs_file_open_context(file) and accesses filp->private_data.

> 
> Matthew, can you please advise what's the right approach here? I can,
> of course, always get file refcount, but most of the time it will be
> just an unnecessary overhead, so ideally I'd like to avoid that. But
> if I have to check each read_folio callback implementation to know
> whether it's required or not, then that's not great...

I don't think we will need file refcnt. We have mmap lock in read mode
in this context because we are accessing vma and this vma has reference
to the file. So, this file can not go away under us here.
Andrii Nakryiko Aug. 8, 2024, 9:21 p.m. UTC | #4
On Thu, Aug 8, 2024 at 2:02 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Aug 08, 2024 at 01:15:52PM GMT, Andrii Nakryiko wrote:
> > On Thu, Aug 8, 2024 at 11:40 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Wed, Aug 07, 2024 at 04:40:25PM GMT, Andrii Nakryiko wrote:
> > > > Extend freader with a flag specifying whether it's OK to cause page
> > > > fault to fetch file data that is not already physically present in
> > > > memory. With this, it's now easy to wait for data if the caller is
> > > > running in sleepable (faultable) context.
> > > >
> > > > We utilize read_cache_folio() to bring the desired folio into page
> > > > cache, after which the rest of the logic works just the same at folio level.
> > > >
> > > > Suggested-by: Omar Sandoval <osandov@fb.com>
> > > > Cc: Shakeel Butt <shakeel.butt@linux.dev>
> > > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > ---
> > > >  lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
> > > >  1 file changed, 28 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/lib/buildid.c b/lib/buildid.c
> > > > index 5e6f842f56f0..e1c01b23efd8 100644
> > > > --- a/lib/buildid.c
> > > > +++ b/lib/buildid.c
> > > > @@ -20,6 +20,7 @@ struct freader {
> > > >                       struct folio *folio;
> > > >                       void *addr;
> > > >                       loff_t folio_off;
> > > > +                     bool may_fault;
> > > >               };
> > > >               struct {
> > > >                       const char *data;
> > > > @@ -29,12 +30,13 @@ struct freader {
> > > >  };
> > > >
> > > >  static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
> > > > -                                struct address_space *mapping)
> > > > +                                struct address_space *mapping, bool may_fault)
> > > >  {
> > > >       memset(r, 0, sizeof(*r));
> > > >       r->buf = buf;
> > > >       r->buf_sz = buf_sz;
> > > >       r->mapping = mapping;
> > > > +     r->may_fault = may_fault;
> > > >  }
> > > >
> > > >  static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
> > > > @@ -63,6 +65,11 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
> > > >       freader_put_folio(r);
> > > >
> > > >       r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
> > > > +
> > > > +     /* if sleeping is allowed, wait for the page, if necessary */
> > > > +     if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
> > > > +             r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);
> > >
> > > Willy's network fs comment is bugging me. If we pass NULL for filler,
> > > the kernel will going to use fs's read_folio() callback. I have checked
> > > read_folio() for fuse and nfs and it seems like for at least these two
> > > filesystems the callback is accessing file->private_data. So, if the elf
> > > file is on these filesystems, we might see null accesses.
> > >
> >
> > Isn't that just a huge problem with the read_cache_folio() interface
> > then? That file is optional, in general, but for some specific FS
> > types it's not. How generic code is supposed to know this?
> >
> > Or maybe it's a bug with the nfs_read_folio() and fuse_read_folio()
> > implementation that they can't handle NULL file argument?
> > netfs_read_folio(), for example, seems to be working with file == NULL
> > just fine.
>
> If you go a bit down in netfs_alloc_request() there is the following
> code:
>
>         if (rreq->netfs_ops->init_request) {
>                 ret = rreq->netfs_ops->init_request(rreq, file);
>                 ...
>         ...
>
> I think this init_request is pointing to nfs_netfs_init_request which
> calls nfs_file_open_context(file) and access filp->private_data.

That's "nfs", which we know requires a file. For netfs implementations
(cifs_init_request() and v9fs_init_request()), they both treat file as
optional consistently.

But regardless, that's just pointless code archeology, I'll just pass
the file reference unconditionally.

>
> >
> > Matthew, can you please advise what's the right approach here? I can,
> > of course, always get file refcount, but most of the time it will be
> > just an unnecessary overhead, so ideally I'd like to avoid that. But
> > if I have to check each read_folio callback implementation to know
> > whether it's required or not, then that's not great...
>
> I don't think we will need file refcnt. We have mmap lock in read mode
> in this context because we are accessing vma and this vma has reference
> to the file. So, this file can not go away under us here.

Yep, good point, then it's not a problem, thanks! Will update.
Andrii Nakryiko Aug. 8, 2024, 9:23 p.m. UTC | #5
On Thu, Aug 8, 2024 at 1:58 PM Jann Horn <jannh@google.com> wrote:
>
> On Thu, Aug 8, 2024 at 10:16 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> > On Thu, Aug 8, 2024 at 11:40 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Wed, Aug 07, 2024 at 04:40:25PM GMT, Andrii Nakryiko wrote:
> > > > Extend freader with a flag specifying whether it's OK to cause page
> > > > fault to fetch file data that is not already physically present in
> > > > memory. With this, it's now easy to wait for data if the caller is
> > > > running in sleepable (faultable) context.
> > > >
> > > > We utilize read_cache_folio() to bring the desired folio into page
> > > > cache, after which the rest of the logic works just the same at folio level.
> > > >
> > > > Suggested-by: Omar Sandoval <osandov@fb.com>
> > > > Cc: Shakeel Butt <shakeel.butt@linux.dev>
> > > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > > ---
> > > >  lib/buildid.c | 44 ++++++++++++++++++++++++++++----------------
> > > >  1 file changed, 28 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/lib/buildid.c b/lib/buildid.c
> > > > index 5e6f842f56f0..e1c01b23efd8 100644
> > > > --- a/lib/buildid.c
> > > > +++ b/lib/buildid.c
> > > > @@ -20,6 +20,7 @@ struct freader {
> > > >                       struct folio *folio;
> > > >                       void *addr;
> > > >                       loff_t folio_off;
> > > > +                     bool may_fault;
> > > >               };
> > > >               struct {
> > > >                       const char *data;
> > > > @@ -29,12 +30,13 @@ struct freader {
> > > >  };
> > > >
> > > >  static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
> > > > -                                struct address_space *mapping)
> > > > +                                struct address_space *mapping, bool may_fault)
> > > >  {
> > > >       memset(r, 0, sizeof(*r));
> > > >       r->buf = buf;
> > > >       r->buf_sz = buf_sz;
> > > >       r->mapping = mapping;
> > > > +     r->may_fault = may_fault;
> > > >  }
> > > >
> > > >  static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
> > > > @@ -63,6 +65,11 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
> > > >       freader_put_folio(r);
> > > >
> > > >       r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
> > > > +
> > > > +     /* if sleeping is allowed, wait for the page, if necessary */
> > > > +     if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
> > > > +             r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);
> > >
> > > Willy's network fs comment is bugging me. If we pass NULL for filler,
> > > the kernel will going to use fs's read_folio() callback. I have checked
> > > read_folio() for fuse and nfs and it seems like for at least these two
> > > filesystems the callback is accessing file->private_data. So, if the elf
> > > file is on these filesystems, we might see null accesses.
> > >
> >
> > Isn't that just a huge problem with the read_cache_folio() interface
> > then? That file is optional, in general, but for some specific FS
> > types it's not. How generic code is supposed to know this?
>
> I think you have to think about it the other way around. The file is

Fair enough:

  > @file: Passed to filler function, may be NULL if not required.

But then you look at mapping_read_folio_gfp() which *always*
unconditionally passes NULL for filler and file, and that makes you
think that file is some special *extra* parameter.

But regardless, as you pointed out, I won't have to take extra ref, so
my concerns about performance are wrong. I'll pass the file.

> required, unless you know the filler function that will be used
> doesn't use the file. Which you don't know when you're coming from
> generic code, so generic code has to pass in a file.
>
> As far as I can tell, most of the callers of read_cache_folio() (via
> read_mapping_folio()) are inside filesystem implementations, not
> generic code, so they know what the filler function will do. You're
> generic code, so I think you have to pass in a file.
>

Yep, I guess this is a bit of a trailblazing use case. I was confused by
some other helpers passing NULL for file unconditionally, which made
me think that NULL is a supported default use case. Clearly I was
wrong.

> > Or maybe it's a bug with the nfs_read_folio() and fuse_read_folio()
> > implementation that they can't handle NULL file argument?
> > netfs_read_folio(), for example, seems to be working with file == NULL
> > just fine.
> >
> > Matthew, can you please advise what's the right approach here? I can,
> > of course, always get file refcount, but most of the time it will be
> > just an unnecessary overhead, so ideally I'd like to avoid that. But
> > if I have to check each read_folio callback implementation to know
> > whether it's required or not, then that's not great...
>
> Why would you need to increment the file refcount? As far as I can
> tell, all your accesses to the file would happen under
> __build_id_parse(), which is borrowing the refcounted reference from
> vma->vm_file; the file can't go away as long as your caller is holding
> the mmap lock.

Yep, agreed.
diff mbox series

Patch

diff --git a/lib/buildid.c b/lib/buildid.c
index 5e6f842f56f0..e1c01b23efd8 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -20,6 +20,7 @@  struct freader {
 			struct folio *folio;
 			void *addr;
 			loff_t folio_off;
+			bool may_fault;
 		};
 		struct {
 			const char *data;
@@ -29,12 +30,13 @@  struct freader {
 };
 
 static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
-				   struct address_space *mapping)
+				   struct address_space *mapping, bool may_fault)
 {
 	memset(r, 0, sizeof(*r));
 	r->buf = buf;
 	r->buf_sz = buf_sz;
 	r->mapping = mapping;
+	r->may_fault = may_fault;
 }
 
 static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
@@ -63,6 +65,11 @@  static int freader_get_folio(struct freader *r, loff_t file_off)
 	freader_put_folio(r);
 
 	r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT);
+
+	/* if sleeping is allowed, wait for the page, if necessary */
+	if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)))
+		r->folio = read_cache_folio(r->mapping, file_off >> PAGE_SHIFT, NULL, NULL);
+
 	if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) {
 		if (!IS_ERR(r->folio))
 			folio_put(r->folio);
@@ -284,18 +291,8 @@  static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si
 /* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */
 #define MAX_FREADER_BUF_SZ 64
 
-/*
- * Parse build ID of ELF file mapped to vma
- * @vma:      vma object
- * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
- * @size:     returns actual build id size in case of success
- *
- * Assumes no page fault can be taken, so if relevant portions of ELF file are
- * not already paged in, fetching of build ID fails.
- *
- * Return: 0 on success; negative error, otherwise
- */
-int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
+static int __build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
+			    __u32 *size, bool may_fault)
 {
 	const Elf32_Ehdr *ehdr;
 	struct freader r;
@@ -306,7 +303,7 @@  int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id,
 	if (!vma->vm_file)
 		return -EINVAL;
 
-	freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file->f_mapping);
+	freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file->f_mapping, may_fault);
 
 	/* fetch first 18 bytes of ELF header for checks */
 	ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type));
@@ -334,6 +331,22 @@  int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id,
 	return ret;
 }
 
+/*
+ * Parse build ID of ELF file mapped to vma
+ * @vma:      vma object
+ * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
+ * @size:     returns actual build id size in case of success
+ *
+ * Assumes no page fault can be taken, so if relevant portions of ELF file are
+ * not already paged in, fetching of build ID fails.
+ *
+ * Return: 0 on success; negative error, otherwise
+ */
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
+{
+	return __build_id_parse(vma, build_id, size, false /* !may_fault */);
+}
+
 /*
  * Parse build ID of ELF file mapped to VMA
  * @vma:      vma object
@@ -347,8 +360,7 @@  int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id,
  */
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
 {
-	/* fallback to non-faultable version for now */
-	return build_id_parse_nofault(vma, build_id, size);
+	return __build_id_parse(vma, build_id, size, true /* may_fault */);
 }
 
 /**