From patchwork Sat May 4 00:30:02 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrii Nakryiko X-Patchwork-Id: 13653682 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D33FEA920; Sat, 4 May 2024 00:30:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782611; cv=none; b=DkFht93wM3DXZiweqZ+TXKR6JaytkrurX1L4jiVIDPcd063d6ANcn8lzpnuVR0FLFcT74mLYkZ47eiryKm7lv510cpFn1WGZdIPWiEBmDlHpCU4VobBBT7MJg2an6blCcIMz/jhohNx9DSPgZ1SdxAS1NkalqtgacwNnjcDrzjM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782611; c=relaxed/simple; bh=OsmoaOeW12SHNusJpkc/eLJf6KgrIDQoVNy4kZW2PTw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=NOAHvphfW5hQeXNi3TQekDQHkk4IwW4wS3602DyjN8S68C8eyYJy3plVdtfdTWUAyWwSgDfsNiBnssqgP5/Z9ngJnZNy7aLAKjXF8dstjPHzdGhBSgPLmoFKkD+TQd3GcJ2ToDFDpk/DFQ+/Ejjcq6Mf5NZRZr9s8z/vmnCJHdo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=irYCdfxY; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="irYCdfxY" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3F0E4C116B1; Sat, 4 May 2024 00:30:11 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1714782611; bh=OsmoaOeW12SHNusJpkc/eLJf6KgrIDQoVNy4kZW2PTw=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=irYCdfxYU679AgBBu/x1QTRLnZGEvoju3sMUjuymybtvTRyAXO34Tu5LrJt7S78GY 
OYoR1xjPv5idsNALPBfT7OYYvZAVRvGMyGBnqgsAFCh5iwoF6Zj9+pOMlsxs3vOp35 ImKr2la/Wynbx9YuuVLBqCV/jZZkBNJ4eDaZdYQbwS4A3J1cMls5s+EgtK5ozteiCq /tYaIw5mCGNRgjh3VARmvmzCzLCsA9n6fr/3fi0mORhOWDmjf8DQfjEiOYKVkIq2ts GlWEUVLdgIHoayTxpXLIO+ah6hUvLFFp4Ih08ljZbPW1C39TkwvEBJbUta+rtjK4Vl QEt+UszXK9gRg== From: Andrii Nakryiko To: linux-fsdevel@vger.kernel.org, brauner@kernel.org, viro@zeniv.linux.org.uk, akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, gregkh@linuxfoundation.org, linux-mm@kvack.org, Andrii Nakryiko Subject: [PATCH 1/5] fs/procfs: extract logic for getting VMA name constituents Date: Fri, 3 May 2024 17:30:02 -0700 Message-ID: <20240504003006.3303334-2-andrii@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240504003006.3303334-1-andrii@kernel.org> References: <20240504003006.3303334-1-andrii@kernel.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Extract generic logic to fetch relevant pieces of data to describe VMA name. This could be just some string (either special constant or user-provided), or a string with some formatted wrapping text (e.g., "[anon_shmem:]"), or, commonly, file path. seq_file-based logic has different methods to handle all three cases, but they are currently mixed in with extracting underlying sources of data. This patch splits this into data fetching and data formatting, so that data fetching can be reused later on. There should be no functional changes. 
Signed-off-by: Andrii Nakryiko --- fs/proc/task_mmu.c | 125 +++++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 54 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e5a5f015ff03..8e503a1635b7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -239,6 +239,67 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } +static void get_vma_name(struct vm_area_struct *vma, + const struct path **path, + const char **name, + const char **name_fmt) +{ + struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; + + *name = NULL; + *path = NULL; + *name_fmt = NULL; + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (vma->vm_file) { + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) { + *name_fmt = "[anon_shmem:%s]"; + *name = anon_name->name; + } else { + *path = file_user_path(vma->vm_file); + } + return; + } + + if (vma->vm_ops && vma->vm_ops->name) { + *name = vma->vm_ops->name(vma); + if (*name) + return; + } + + *name = arch_vma_name(vma); + if (*name) + return; + + if (!vma->vm_mm) { + *name = "[vdso]"; + return; + } + + if (vma_is_initial_heap(vma)) { + *name = "[heap]"; + return; + } + + if (vma_is_initial_stack(vma)) { + *name = "[stack]"; + return; + } + + if (anon_name) { + *name_fmt = "[anon:%s]"; + *name = anon_name->name; + return; + } +} + static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -262,17 +323,15 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { - struct anon_vma_name *anon_name = NULL; - struct mm_struct *mm = vma->vm_mm; - struct file *file = vma->vm_file; + const struct path *path; + const char *name_fmt, *name; vm_flags_t flags = 
vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - const char *name = NULL; - if (file) { + if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -283,57 +342,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); - if (mm) - anon_name = anon_vma_name(vma); - /* - * Print the dentry name for named mappings, and a - * special [heap] marker for the heap: - */ - if (file) { + get_vma_name(vma, &path, &name, &name_fmt); + if (path) { seq_pad(m, ' '); - /* - * If user named this anon shared memory via - * prctl(PR_SET_VMA ..., use the provided name. - */ - if (anon_name) - seq_printf(m, "[anon_shmem:%s]", anon_name->name); - else - seq_path(m, file_user_path(file), "\n"); - goto done; - } - - if (vma->vm_ops && vma->vm_ops->name) { - name = vma->vm_ops->name(vma); - if (name) - goto done; - } - - name = arch_vma_name(vma); - if (!name) { - if (!mm) { - name = "[vdso]"; - goto done; - } - - if (vma_is_initial_heap(vma)) { - name = "[heap]"; - goto done; - } - - if (vma_is_initial_stack(vma)) { - name = "[stack]"; - goto done; - } - - if (anon_name) { - seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name->name); - } - } - -done: - if (name) { + seq_path(m, path, "\n"); + } else if (name_fmt) { + seq_pad(m, ' '); + seq_printf(m, name_fmt, name); + } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } From patchwork Sat May 4 00:30:03 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrii Nakryiko X-Patchwork-Id: 13653683 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 
2D726F9CC; Sat, 4 May 2024 00:30:14 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782615; cv=none; b=ll/IaOKf2b7sY1thk9MRAe20gBFz3hGC8dzEqUioVGhOFgXAmjBtKXda9HwDh1oHaXShD3APfDAEYEDhXUaCRbJYa+gPbxNaltmA9lYg/84f2H4wIWf4DRD0TNCmcE6aWEf26Ir8AYMY3q+y7GdpSsZIZYJF0b6RYSeFTw0Z+Z8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782615; c=relaxed/simple; bh=bA1/zohcH7zJsizG/omlxjWAua+BN8I1Y4tME/v2yWE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=L8t+biKDGVUDAoxOA4J6xyBJJD1n+Q7vcEzxN4+k3XYYCf2y5MT876Zj95PstU9uxBlGbixBX51s//ZQFL19kNSHdcsysqLltRuAthPsaj/Rk1am0Lqyl9hI3dNezIyRA61l0WPp5CDuqW1w+mx0wZ9Ln8nJy9JKWRSpPw4ZNyA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=WrZP0Xur; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="WrZP0Xur" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8A6F0C4AF1A; Sat, 4 May 2024 00:30:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1714782614; bh=bA1/zohcH7zJsizG/omlxjWAua+BN8I1Y4tME/v2yWE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=WrZP0Xurp/ZbvPWVA9809rCs+L7l7mQdgZ6yIM7b3QF7cm1v8nXt0fYOnCQot2oYV Htd5hqWJbw/65G5zv7QzELJ1Bvtnm7BYXkZvSA3tEF4KIdTGk2Dk1b3JJV2Cb3JuB6 zpi/B66/gqNai2l5MacYx/9thT+2dV7eazaB5jcGfLFlFKkbzp6DAzwWCAmT6+69Fl ZoEWzDkmQ/ilrjCyJOy9DvQp3Kqxrgk22MYfhSewopFKprmBeeGEq7It8VkQzvdQLB /518wopvnYHsQ7bGeYM666wy2ecaLcENtma7+7VIQHj1kTX+FXzxhPuwXxqXWh/tgY HoNfZSLtDEqJA== From: Andrii Nakryiko To: linux-fsdevel@vger.kernel.org, brauner@kernel.org, viro@zeniv.linux.org.uk, akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, gregkh@linuxfoundation.org, 
linux-mm@kvack.org, Andrii Nakryiko Subject: [PATCH 2/5] fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps Date: Fri, 3 May 2024 17:30:03 -0700 Message-ID: <20240504003006.3303334-3-andrii@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240504003006.3303334-1-andrii@kernel.org> References: <20240504003006.3303334-1-andrii@kernel.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 /proc/<pid>/maps file is extremely useful in practice for various tasks involving figuring out process memory layout, what files are backing any given memory range, etc. One important class of applications that absolutely rely on this are profilers/stack symbolizers. They would normally capture stack trace containing absolute memory addresses of some functions, and would then use /proc/<pid>/maps file to find corresponding backing ELF files, file offsets within them, and then continue from there to get yet more information (ELF symbols, DWARF information) to get human-readable symbolic information. As such, there are both performance and correctness requirements involved. This address to VMA information translation has to be done as efficiently as possible, but also not miss any VMA (especially in the case of loading/unloading shared libraries). Unfortunately, for all the /proc/<pid>/maps file universality and usefulness, it doesn't fit the above 100%. First, it's text based, which makes its programmatic use from applications and libraries unnecessarily cumbersome and slow due to the need to do text parsing to get necessary pieces of information. Second, its main purpose is to emit all VMAs sequentially, but in practice captured addresses would fall only into a small subset of all process' VMAs, mainly containing executable text. Yet, library would need to parse most or all of the contents to find needed VMAs, as there is no way to skip VMAs that are of no use. 
Efficient library can do the linear pass and it is still relatively efficient, but it's definitely an overhead that can be avoided, if there was a way to do more targeted querying of the relevant VMA information. Another problem when writing generic stack trace symbolization library is an unfortunate performance-vs-correctness tradeoff that needs to be made. Library has to make a decision to either cache parsed contents of /proc/<pid>/maps to service future requests (if application requests to symbolize another set of addresses, captured at some later time, which is typical for periodic/continuous profiling cases), avoiding the higher cost of re-parsing this file, or to re-read and re-parse the file on every request. In the former case, more memory is used for the cache and there is a risk of getting stale data if application loaded/unloaded shared libraries, or otherwise changed its set of VMAs through additional mmap() calls (and other means of altering memory address space). In the latter case, it's the performance hit that comes from re-opening the file and re-reading/re-parsing its contents all over again. This patch aims to solve this problem by providing a new API built on top of /proc/<pid>/maps. It is ioctl()-based and built as a binary interface, avoiding the cost and awkwardness of textual representation for programmatic use. It's designed to be extensible and forward/backward compatible by including user-specified field size and using copy_struct_from_user() approach. But, most importantly, it allows to do point queries for specific single address, specified by user. And this is done efficiently using VMA iterator. User has a choice to pick either getting VMA that covers provided address or -ENOENT if none is found (exact, least surprising, case). Or, with an extra query flag (PROCFS_PROCMAP_EXACT_OR_NEXT_VMA), they can get either VMA that covers the address (if there is one), or the closest next VMA (i.e., VMA with the smallest vm_start > addr). 
The latter allows more efficient use, but, given it could be a surprising behavior, requires an explicit opt-in. Basing this ioctl()-based API on top of /proc/<pid>/maps's FD makes sense given it's querying the same set of VMA data. All the permissions checks performed on /proc/<pid>/maps opening fit here as well. ioctl-based implementation is fetching remembered mm_struct reference, but otherwise doesn't interfere with seq_file-based implementation of /proc/<pid>/maps textual interface, and so could be used together or independently without paying any price for that. There is one extra thing that /proc/<pid>/maps doesn't currently provide, and that's an ability to fetch ELF build ID, if present. User has control over whether this piece of information is requested or not by either setting build_id_size field to zero or non-zero maximum buffer size they provided through build_id_addr field (which encodes user pointer as __u64 field). The need to get ELF build ID reliably is an important aspect when dealing with profiling and stack trace symbolization, and /proc/<pid>/maps textual representation doesn't help with this, requiring applications to open underlying ELF binary through /proc/<pid>/map_files/<start>-<end> symlink, which adds extra permission implications due to giving full access to the binary from (potentially) another process, while all the application is interested in is build ID. Giving an ability to request just build ID doesn't introduce any additional security concerns, on top of what /proc/<pid>/maps is already concerned with, simplifying the overall logic. Kernel already implements build ID fetching, which is used from BPF subsystem. We are reusing this code here, but plan follow-up changes to make it work better under more relaxed assumption (compared to what existing code assumes) of being called from user process context, in which page faults are allowed. 
BPF-specific implementation currently bails out if necessary part of ELF file is not paged in, all due to extra BPF-specific restrictions (like the need to fetch build ID in restrictive contexts such as NMI handler). Note also, that fetching VMA name (e.g., backing file path, or special hard-coded or user-provided names) is optional just like build ID. If user sets vma_name_size to zero, kernel code won't attempt to retrieve it, saving resources. Signed-off-by: Andrii Nakryiko --- fs/proc/task_mmu.c | 165 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 32 ++++++++ 2 files changed, 197 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8e503a1635b7..cb7b1ff1a144 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -375,11 +376,175 @@ static int pid_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } +static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) +{ + struct procfs_procmap_query karg; + struct vma_iterator iter; + struct vm_area_struct *vma; + struct mm_struct *mm; + const char *name = NULL; + char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; + __u64 usize; + int err; + + if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) + return -EFAULT; + if (usize > PAGE_SIZE) + return -E2BIG; + if (usize < offsetofend(struct procfs_procmap_query, query_addr)) + return -EINVAL; + err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + if (err) + return err; + + if (karg.query_flags & ~PROCFS_PROCMAP_EXACT_OR_NEXT_VMA) + return -EINVAL; + if (!!karg.vma_name_size != !!karg.vma_name_addr) + return -EINVAL; + if (!!karg.build_id_size != !!karg.build_id_addr) + return -EINVAL; + + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) + return -ESRCH; + if (mmap_read_lock_killable(mm)) { + mmput(mm); + return -EINTR; + } + + vma_iter_init(&iter, mm, 
karg.query_addr); + vma = vma_next(&iter); + if (!vma) { + err = -ENOENT; + goto out; + } + /* user wants covering VMA, not the closest next one */ + if (!(karg.query_flags & PROCFS_PROCMAP_EXACT_OR_NEXT_VMA) && + vma->vm_start > karg.query_addr) { + err = -ENOENT; + goto out; + } + + karg.vma_start = vma->vm_start; + karg.vma_end = vma->vm_end; + + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + + karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; + karg.dev_major = MAJOR(inode->i_sb->s_dev); + karg.dev_minor = MINOR(inode->i_sb->s_dev); + karg.inode = inode->i_ino; + } else { + karg.vma_offset = 0; + karg.dev_major = 0; + karg.dev_minor = 0; + karg.inode = 0; + } + + karg.vma_flags = 0; + if (vma->vm_flags & VM_READ) + karg.vma_flags |= PROCFS_PROCMAP_VMA_READABLE; + if (vma->vm_flags & VM_WRITE) + karg.vma_flags |= PROCFS_PROCMAP_VMA_WRITABLE; + if (vma->vm_flags & VM_EXEC) + karg.vma_flags |= PROCFS_PROCMAP_VMA_EXECUTABLE; + if (vma->vm_flags & VM_MAYSHARE) + karg.vma_flags |= PROCFS_PROCMAP_VMA_SHARED; + + if (karg.build_id_size) { + __u32 build_id_sz = BUILD_ID_SIZE_MAX; + + err = build_id_parse(vma, build_id_buf, &build_id_sz); + if (!err) { + if (karg.build_id_size < build_id_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.build_id_size = build_id_sz; + } + } + + if (karg.vma_name_size) { + size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); + const struct path *path; + const char *name_fmt; + size_t name_sz = 0; + + get_vma_name(vma, &path, &name, &name_fmt); + + if (path || name_fmt || name) { + name_buf = kmalloc(name_buf_sz, GFP_KERNEL); + if (!name_buf) { + err = -ENOMEM; + goto out; + } + } + if (path) { + name = d_path(path, name_buf, name_buf_sz); + if (IS_ERR(name)) { + err = PTR_ERR(name); + goto out; + } + name_sz = name_buf + name_buf_sz - name; + } else if (name || name_fmt) { + name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); + name = name_buf; + } + if (name_sz > 
name_buf_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.vma_name_size = name_sz; + } + + /* unlock and put mm_struct before copying data to user */ + mmap_read_unlock(mm); + mmput(mm); + + if (karg.vma_name_size && copy_to_user((void __user *)karg.vma_name_addr, + name, karg.vma_name_size)) { + kfree(name_buf); + return -EFAULT; + } + kfree(name_buf); + + if (karg.build_id_size && copy_to_user((void __user *)karg.build_id_addr, + build_id_buf, karg.build_id_size)) + return -EFAULT; + + if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) + return -EFAULT; + + return 0; + +out: + mmap_read_unlock(mm); + mmput(mm); + kfree(name_buf); + return err; +} + +static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + switch (cmd) { + case PROCFS_PROCMAP_QUERY: + return do_procmap_query(priv, (void __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, + .unlocked_ioctl = procfs_procmap_ioctl, + .compat_ioctl = procfs_procmap_ioctl, }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd664..fe8924a8d916 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -393,4 +393,36 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc//maps ioctl */ +#define PROCFS_IOCTL_MAGIC 0x9f +#define PROCFS_PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 1, struct procfs_procmap_query) + +enum procmap_query_flags { + PROCFS_PROCMAP_EXACT_OR_NEXT_VMA = 0x01, +}; + +enum procmap_vma_flags { + PROCFS_PROCMAP_VMA_READABLE = 0x01, + PROCFS_PROCMAP_VMA_WRITABLE = 0x02, + PROCFS_PROCMAP_VMA_EXECUTABLE = 0x04, + PROCFS_PROCMAP_VMA_SHARED = 0x08, +}; + +struct procfs_procmap_query { + __u64 size; + __u64 query_flags; /* in */ + __u64 query_addr; /* in */ + __u64 
vma_start; /* out */ + __u64 vma_end; /* out */ + __u64 vma_flags; /* out */ + __u64 vma_offset; /* out */ + __u64 inode; /* out */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + __u32 vma_name_size; /* in/out */ + __u32 build_id_size; /* in/out */ + __u64 vma_name_addr; /* in */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ From patchwork Sat May 4 00:30:04 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrii Nakryiko X-Patchwork-Id: 13653684 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6CC8A107B3; Sat, 4 May 2024 00:30:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782618; cv=none; b=KwaFp7iFLQJp0+fofwC/kGaIAm6SsI0q8I7X83fz6wmXm9wGdOUQsEgTrAsnobGpBvUqDAYXsapUWETX7ObMKyxEzmw8IPNE6iIxiC/d7GQJaJDolvxAGk+ECcVRcJrGgXXAzJDKlHagAAuB0IeiDwZHunebqU4QLh5UpJyur1k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782618; c=relaxed/simple; bh=akE5P6xvZT48f5LRQWKFAyRAluTFD6SKWbtJB6gxXDE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=BwNU+mmdmc3nwF+dQdhcgHkVGaqtuw0vf26fd+xsmPvzz3uJyZPPZQLTb6i056tNOPe4kBFa2yoh8waaOEpsKImE5tGph1Jj1iqUn74YD7pLQlV/Zr1H/DwGfCa9mPIV6tYZAi/axZMmf4wXRUFuyeyoqCn9KmeY38nBB/Z/AL8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=mEFOCn6M; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="mEFOCn6M" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 
D2A35C4AF1A; Sat, 4 May 2024 00:30:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1714782618; bh=akE5P6xvZT48f5LRQWKFAyRAluTFD6SKWbtJB6gxXDE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=mEFOCn6M0/9FUvNcIDMh645SuA6hG0akoQKvB+ogesqfmOVkkryzy317thFixlYaT jT0pIGyWZec5FtbBb2fvSdERaUl+MA2M1GiODZsWdBqLdR2zAzt2TR1GUqAn8TgJ6c jtGuzWWJ8q5/TpHhLac5Df+dU4ptxTmZiiBA1VOXmgXkQ0+5v8eSxYTuoLpj92koNh aLG4yZedRRylNI0sepW39uw9NDJcJBauAHNY0kq9ZhmbPUPAQ5HWjbrP+A/QWCMhh4 UlFI5mCAFWYuZr40FZlblG//YUIcPds3/uSqo0HtePPOViVrumGVaDLsbMz2j7riB+ zH166DwWwqaVw== From: Andrii Nakryiko To: linux-fsdevel@vger.kernel.org, brauner@kernel.org, viro@zeniv.linux.org.uk, akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, gregkh@linuxfoundation.org, linux-mm@kvack.org, Andrii Nakryiko Subject: [PATCH 3/5] tools: sync uapi/linux/fs.h header into tools subdir Date: Fri, 3 May 2024 17:30:04 -0700 Message-ID: <20240504003006.3303334-4-andrii@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240504003006.3303334-1-andrii@kernel.org> References: <20240504003006.3303334-1-andrii@kernel.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Keep them in sync for use from BPF selftests. 
Signed-off-by: Andrii Nakryiko --- .../perf/trace/beauty/include/uapi/linux/fs.h | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 45e4e64fd664..fe8924a8d916 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -393,4 +393,36 @@ struct pm_scan_arg { __u64 return_mask; }; +/* /proc//maps ioctl */ +#define PROCFS_IOCTL_MAGIC 0x9f +#define PROCFS_PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 1, struct procfs_procmap_query) + +enum procmap_query_flags { + PROCFS_PROCMAP_EXACT_OR_NEXT_VMA = 0x01, +}; + +enum procmap_vma_flags { + PROCFS_PROCMAP_VMA_READABLE = 0x01, + PROCFS_PROCMAP_VMA_WRITABLE = 0x02, + PROCFS_PROCMAP_VMA_EXECUTABLE = 0x04, + PROCFS_PROCMAP_VMA_SHARED = 0x08, +}; + +struct procfs_procmap_query { + __u64 size; + __u64 query_flags; /* in */ + __u64 query_addr; /* in */ + __u64 vma_start; /* out */ + __u64 vma_end; /* out */ + __u64 vma_flags; /* out */ + __u64 vma_offset; /* out */ + __u64 inode; /* out */ + __u32 dev_major; /* out */ + __u32 dev_minor; /* out */ + __u32 vma_name_size; /* in/out */ + __u32 build_id_size; /* in/out */ + __u64 vma_name_addr; /* in */ + __u64 build_id_addr; /* in */ +}; + #endif /* _UAPI_LINUX_FS_H */ From patchwork Sat May 4 00:30:05 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrii Nakryiko X-Patchwork-Id: 13653685 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 619EE1DDF4; Sat, 4 May 2024 00:30:21 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; 
t=1714782621; cv=none; b=Py2DDxc/4crp+LjGr5l3GD6Zvqen1RAwtdS4VEbAGL0AyBa4kLLsy/maZz6vccdwu5venoXm3mV7+bW1O7m+ZI/q3KYh2TBay2dmlknZlFGengHp9fmF180Q1vyQeFhegTOfUezNanbEM2J8pCcdXzqxChMs92ln1ItGMIR8s3M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782621; c=relaxed/simple; bh=YxeS+sLpQRnaSgu/9E3ClWUqEPF8Db9NHuIyW1Jsti8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=C2yud+DDaPi3a9pcaeIOSAfyVGJQryQZm50Dj/ROrUbKMGED8yCBiBrCL6f2TDvxxKFslxZnfkTFK3M/UqIv/rzHrjVCs3QNkDVOp37NIpZT3CTi4Mp1FyAhJ7Mdv6/HhytONNkiAQhlLKCYAqsbb/fktiLoidT9Jpzi4sEN14M= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=KEgnPpHB; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KEgnPpHB" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 08AAFC32789; Sat, 4 May 2024 00:30:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1714782621; bh=YxeS+sLpQRnaSgu/9E3ClWUqEPF8Db9NHuIyW1Jsti8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=KEgnPpHBAUag30KLKXRbcwwv5eSBfqoQqRe1IJe1bSxSom7XCLuXPyPJlx+VCp2A7 FRJ9upZx8QAsxZqb0ywEIr12qdiefeoYMQgEcQE3sFCkVzlo4BwwLTfVTS17gQ8pXS MJSfEPbXXyRUsJXYDTuy39AISOPZoOmP6UH+97Msw1OdtCErksIrOyWu9A8bOBR8DN gP20QGIvODzLTCLXHQhPR11wcdaZ5z/5ThA6YuXsRqoaLQMyy2i5LQBWwstFu91JBd 1cd08eKVWgqk1Iwc1kfGCnC7+S49U5eFUZNjZekrVZn2eM9uxF5rk8IdO/qkuaaxga x3SBOYcifgczw== From: Andrii Nakryiko To: linux-fsdevel@vger.kernel.org, brauner@kernel.org, viro@zeniv.linux.org.uk, akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, gregkh@linuxfoundation.org, linux-mm@kvack.org, Andrii Nakryiko Subject: [PATCH 4/5] selftests/bpf: make use of PROCFS_PROCMAP_QUERY ioctl, if available Date: Fri, 3 May 2024 17:30:05 -0700 Message-ID: 
<20240504003006.3303334-5-andrii@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240504003006.3303334-1-andrii@kernel.org> References: <20240504003006.3303334-1-andrii@kernel.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Instead of parsing text-based /proc//maps file, try to use PROCFS_PROCMAP_QUERY ioctl() to simplify and speed up data fetching. This logic is used to do uprobe file offset calculation, so any bugs in this logic would manifest as failing uprobe BPF selftests. This also serves as a simple demonstration of one of the intended uses. Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/test_progs.c | 3 + tools/testing/selftests/bpf/test_progs.h | 2 + tools/testing/selftests/bpf/trace_helpers.c | 105 +++++++++++++++++--- 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 89ff704e9dad..6a19970f2531 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -19,6 +19,8 @@ #include #include "json_writer.h" +int env_verbosity = 0; + static bool verbose(void) { return env.verbosity > VERBOSE_NONE; @@ -848,6 +850,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) return -EINVAL; } } + env_verbosity = env->verbosity; if (verbose()) { if (setenv("SELFTESTS_VERBOSE", "1", 1) == -1) { diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 0ba5a20b19ba..6eae7fdab0d7 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -95,6 +95,8 @@ struct test_state { FILE *stdout; }; +extern int env_verbosity; + struct test_env { struct test_selector test_selector; struct test_selector subtest_selector; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 
70e29f316fe7..8ac71e73d173 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "trace_helpers.h" #include @@ -233,29 +235,92 @@ int kallsyms_find(const char *sym, unsigned long long *addr) return err; } +#ifdef PROCFS_PROCMAP_QUERY +int env_verbosity __weak = 0; + +int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +{ + char path_buf[PATH_MAX], build_id_buf[20]; + struct procfs_procmap_query q; + int err; + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_addr = (__u64)addr; + q.vma_name_addr = (__u64)path_buf; + q.vma_name_size = sizeof(path_buf); + q.build_id_addr = (__u64)build_id_buf; + q.build_id_size = sizeof(build_id_buf); + + err = ioctl(fd, PROCFS_PROCMAP_QUERY, &q); + if (err < 0) { + err = -errno; + if (err == -ENOTTY) + return -EOPNOTSUPP; /* ioctl() not implemented yet */ + if (err == -ENOENT) + return -ESRCH; /* vma not found */ + return err; + } + + if (env_verbosity >= 1) { + printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n", + (long)addr, (long)q.vma_start, (long)q.vma_end, + (q.vma_flags & PROCFS_PROCMAP_VMA_READABLE) ? 'r' : '-', + (q.vma_flags & PROCFS_PROCMAP_VMA_WRITABLE) ? 'w' : '-', + (q.vma_flags & PROCFS_PROCMAP_VMA_EXECUTABLE) ? 'x' : '-', + (q.vma_flags & PROCFS_PROCMAP_VMA_SHARED) ? 's' : 'p', + (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode, + q.vma_name_size ? path_buf : "", + q.build_id_size ? 
"YES" : "NO", + q.build_id_size); + } + + *start = q.vma_start; + *offset = q.vma_offset; + *flags = q.vma_flags; + return 0; +} +#else +int procmap_query(int fd, const void *addr, size_t *start, size_t *offset, int *flags) +{ + return -EOPNOTSUPP; +} +#endif + ssize_t get_uprobe_offset(const void *addr) { - size_t start, end, base; - char buf[256]; - bool found = false; + size_t start, base, end; FILE *f; + char buf[256]; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { - if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { - found = true; - break; + err = procmap_query(fileno(f), addr, &start, &base, &flags); + if (err == 0) { + if (!(flags & PROCFS_PROCMAP_VMA_EXECUTABLE)) + return -ESRCH; + } else if (err != -EOPNOTSUPP) { + fclose(f); + return err; + } else if (err) { + bool found = false; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + if (!found) { + fclose(f); + return -ESRCH; } } - fclose(f); - if (!found) - return -ESRCH; - #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 #define OP_RT_RA_MASK 0xffff0000UL @@ -296,15 +361,25 @@ ssize_t get_rel_offset(uintptr_t addr) size_t start, end, offset; char buf[256]; FILE *f; + int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; - while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { - if (addr >= start && addr < end) { - fclose(f); - return (size_t)addr - start + offset; + err = procmap_query(fileno(f), (const void *)addr, &start, &offset, &flags); + if (err == 0) { + fclose(f); + return (size_t)addr - start + offset; + } else if (err != -EOPNOTSUPP) { + fclose(f); + return err; + } else if (err) { + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) 
{ + if (addr >= start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } } } From patchwork Sat May 4 00:30:06 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrii Nakryiko X-Patchwork-Id: 13653686 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id ED6E0225CB; Sat, 4 May 2024 00:30:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782625; cv=none; b=Kc9Gdcjd1lspTkS9eGxtkt7VFmcI0KI/wF9zx5oN3D2yBJe8kgVblA/hGVflR345GexS+/kKQXE3mXMTajnWKuJNqcTkRHrNiDAhL9eJV4MQwPVoVsrEtQsy8RAKvF2TeZwXEdpYGY1iSDQ4BpjydB0OcXNh4mJf3DeciuWFAms= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714782625; c=relaxed/simple; bh=21/1lr/MosDNysUEDGD3CRROcRa+YFl3m2iHec0XNMo=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=PIWW+4ovtYOrI4T2UFdfXFGY9orfmBSZcnxx19CeoE1x8kbRZkNiv9nM+HVk7MpHSlr+p3td5Hbc3L8qhpLOA1kPRj++qlPVPGuk3duSEpki38brYNRoJasxcFLe2UkO0R0TrRRb/44/5kqJjojnetb1kkSpDL/dR3jFCY7ohpI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=p/zUIjd+; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="p/zUIjd+" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 49A96C4AF19; Sat, 4 May 2024 00:30:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1714782624; bh=21/1lr/MosDNysUEDGD3CRROcRa+YFl3m2iHec0XNMo=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; 
b=p/zUIjd+EJG/uhzDDuN/1qq7S7QRwP4UoIbpJliZGUPyuB85/sY34OXgwKbpg+63u wQrnAtuKyV0HMkJm7QY4NpNlq32eKut7Gcb+myozTQVNUs6kAVesEe7SCDya5synuE fEvh9ODYpYPzHQXKnccnWIEKYJvBp9p7evPuODQU8bOytoQ2Tbp6LsgkQ6c05Tge4K rXGdkZrN9UC10Y7RxiVSDj++99Yv+ldhmPKQ9wO+2DG48DTgJFQgowmjtu4jc7DRo4 7DfleH/tM1K7jBw3mFAPPA5HdwSMpT3p1ijeUsM1eBrnHUm46B+ll7HtY63kU/6In2 1VBohECG2aqEg== From: Andrii Nakryiko To: linux-fsdevel@vger.kernel.org, brauner@kernel.org, viro@zeniv.linux.org.uk, akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, gregkh@linuxfoundation.org, linux-mm@kvack.org, Andrii Nakryiko Subject: [PATCH 5/5] selftests/bpf: a simple benchmark tool for /proc/<pid>/maps APIs Date: Fri, 3 May 2024 17:30:06 -0700 Message-ID: <20240504003006.3303334-6-andrii@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240504003006.3303334-1-andrii@kernel.org> References: <20240504003006.3303334-1-andrii@kernel.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Implement a simple tool/benchmark for comparing address "resolution" logic based on the textual /proc/<pid>/maps interface and the new binary ioctl-based PROCFS_PROCMAP_QUERY command. The tool expects a file with a list of hex addresses, relevant PID, and then provides control over whether textual or binary ioctl-based ways to process VMAs should be used. The overall logic implements an efficient way to do batched processing of a given set of (unsorted) addresses. We first sort them in increasing order (remembering their original position to restore original order, if necessary), and then process all VMAs from /proc/<pid>/maps, matching addresses to VMAs and calculating file offsets, if matched. For the ioctl-based approach the idea is similar, but is implemented even more efficiently, requesting only VMAs that cover all given addresses, skipping all the irrelevant VMAs altogether. To be able to compare the efficiency of both APIs, the tool has a "benchmark" mode.
The user provides a number of processing runs to run in a tight loop, timing specifically the /proc/<pid>/maps parsing and processing parts of the logic only. Address sorting and re-sorting is excluded. This gives a more direct way to compare ioctl- vs text-based APIs. We used a medium-sized production application to do a representative benchmark. A bunch of stack traces were captured, resulting in 4435 user space addresses (699 unique ones, but we didn't deduplicate them). The application itself had 702 VMAs reported in /proc/<pid>/maps. Averaging the time taken to process all addresses over 10000 runs showed that: - text-based approach took 380 microseconds *per one batch run*; - ioctl-based approach took 10 microseconds *per identical batch run*. This gives a ~35x speed up to do exactly the same amount of work (build IDs were not fetched for ioctl-based benchmark; fetching build IDs resulted in 2x slowdown compared to no-build-ID case). I also did an strace run of both cases. In the text-based one the tool did 68 read() syscalls, fetching up to 4KB of data in one go. In comparison, the ioctl-based implementation had to do only 6 ioctl() calls to fetch all relevant VMAs. It is projected that savings from processing big production applications would only widen the gap in favor of the binary-based querying ioctl API, as bigger applications will tend to have even more non-executable VMA mappings relative to executable ones.
Signed-off-by: Andrii Nakryiko --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/procfs_query.c | 366 +++++++++++++++++++++ 3 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/procfs_query.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1aebabfb017..7eaa8f417278 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -45,6 +45,7 @@ test_cpp /veristat /sign-file /uprobe_multi +/procfs_query *.ko *.tmp xskxceiver diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ba28d42b74db..07e17bb89767 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -131,7 +131,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ - xdp_features bpf_test_no_cfi.ko + xdp_features bpf_test_no_cfi.ko procfs_query TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi diff --git a/tools/testing/selftests/bpf/procfs_query.c b/tools/testing/selftests/bpf/procfs_query.c new file mode 100644 index 000000000000..8ca3978244ad --- /dev/null +++ b/tools/testing/selftests/bpf/procfs_query.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool verbose; +static bool quiet; +static bool use_ioctl; +static bool request_build_id; +static char *addrs_path; +static int pid; +static int bench_runs; + +const char *argp_program_version = "procfs_query 0.0"; +const char *argp_program_bug_address = ""; + +static inline 
uint64_t get_time_ns(void) +{ + struct timespec t; + + clock_gettime(CLOCK_MONOTONIC, &t); + + return (uint64_t)t.tv_sec * 1000000000 + t.tv_nsec; +} + +static const struct argp_option opts[] = { + { "verbose", 'v', NULL, 0, "Verbose mode" }, + { "quiet", 'q', NULL, 0, "Quiet mode (no output)" }, + { "pid", 'p', "PID", 0, "PID of the process" }, + { "addrs-path", 'f', "PATH", 0, "File with addresses to resolve" }, + { "benchmark", 'B', "RUNS", 0, "Benchmark mode" }, + { "query", 'Q', NULL, 0, "Use ioctl()-based point query API (by default text parsing is done)" }, + { "build-id", 'b', NULL, 0, "Fetch build ID, if available (only for ioctl mode)" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case 'v': + verbose = true; + break; + case 'q': + quiet = true; + break; + case 'Q': /* must match the key of the "query" entry in opts[] */ + use_ioctl = true; + break; + case 'b': + request_build_id = true; + break; + case 'p': + pid = strtol(arg, NULL, 10); + break; + case 'f': + addrs_path = strdup(arg); + break; + case 'B': + bench_runs = strtol(arg, NULL, 10); + if (bench_runs <= 0) { + fprintf(stderr, "Invalid benchmark run count: %s\n", arg); + return -EINVAL; + } + break; + case ARGP_KEY_ARG: + argp_usage(state); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static const struct argp argp = { + .options = opts, + .parser = parse_arg, +}; + +struct addr { + unsigned long long addr; + int idx; +}; + +static struct addr *addrs; +static size_t addr_cnt, addr_cap; + +struct resolved_addr { + unsigned long long file_off; + const char *vma_name; + int build_id_sz; + char build_id[20]; +}; + +static struct resolved_addr *resolved; + +static int resolve_addrs_ioctl(void) +{ + char buf[32], build_id_buf[20], vma_name[PATH_MAX]; + struct procfs_procmap_query q; + int fd, err, i; + struct addr *a = &addrs[0]; + struct resolved_addr *r; + + snprintf(buf, sizeof(buf), "/proc/%d/maps", pid); + fd = open(buf, O_RDONLY); + if (fd < 0) { + err = -errno; +
fprintf(stderr, "Failed to open process map file (%s): %d\n", buf, err); + return err; + } + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + q.query_flags = PROCFS_PROCMAP_EXACT_OR_NEXT_VMA; + q.vma_name_addr = (__u64)vma_name; + if (request_build_id) + q.build_id_addr = (__u64)build_id_buf; + + for (i = 0; i < addr_cnt; ) { + char *name = NULL; + + q.query_addr = (__u64)a->addr; + q.vma_name_size = sizeof(vma_name); + if (request_build_id) + q.build_id_size = sizeof(build_id_buf); + + err = ioctl(fd, PROCFS_PROCMAP_QUERY, &q); + if (err < 0 && errno == ENOTTY) { + close(fd); + fprintf(stderr, "PROCFS_PROCMAP_QUERY ioctl() command is not supported on this kernel!\n"); + return -EOPNOTSUPP; /* ioctl() not implemented yet */ + } + if (err < 0 && errno == ENOENT) { + fprintf(stderr, "ENOENT\n"); + i++; + a++; + continue; /* unresolved address */ + } + if (err < 0) { + err = -errno; + close(fd); + fprintf(stderr, "PROCFS_PROCMAP_QUERY ioctl() returned error: %d\n", err); + return err; + } + + /* skip addrs falling before current VMA */ + for (; i < addr_cnt && a->addr < q.vma_start; i++, a++) { + } + /* process addrs covered by current VMA */ + for (; i < addr_cnt && a->addr < q.vma_end; i++, a++) { + r = &resolved[a->idx]; + r->file_off = a->addr - q.vma_start + q.vma_offset; + + /* reuse name, if it was already strdup()'ed */ + if (q.vma_name_size) + name = name ?: strdup(vma_name); + r->vma_name = name; + + if (q.build_id_size) { + r->build_id_sz = q.build_id_size; + memcpy(r->build_id, build_id_buf, q.build_id_size); + } + } + } + + close(fd); + return 0; +} + +static int resolve_addrs_parse(void) +{ + size_t vma_start, vma_end, vma_offset, ino; + uint32_t dev_major, dev_minor; + char perms[4], buf[32], vma_name[PATH_MAX]; + FILE *f; + int err, idx = 0; + struct addr *a = &addrs[idx]; + struct resolved_addr *r; + + snprintf(buf, sizeof(buf), "/proc/%d/maps", pid); + f = fopen(buf, "r"); + if (!f) { + err = -errno; + fprintf(stderr, "Failed to open process map 
file (%s): %d\n", buf, err); + return err; + } + + while ((err = fscanf(f, "%zx-%zx %c%c%c%c %zx %x:%x %zu %[^\n]\n", + &vma_start, &vma_end, + &perms[0], &perms[1], &perms[2], &perms[3], + &vma_offset, &dev_major, &dev_minor, &ino, vma_name)) >= 10) { + const char *name = NULL; + + /* skip addrs before current vma, they stay unresolved */ + for (; idx < addr_cnt && a->addr < vma_start; idx++, a++) { + } + + /* resolve all addrs within current vma now */ + for (; idx < addr_cnt && a->addr < vma_end; idx++, a++) { + r = &resolved[a->idx]; + r->file_off = a->addr - vma_start + vma_offset; + + /* reuse name, if it was already strdup()'ed */ + if (err > 10) + name = name ?: strdup(vma_name); + else + name = NULL; + r->vma_name = name; + } + + /* ran out of addrs to resolve, stop early */ + if (idx >= addr_cnt) + break; + } + + fclose(f); + return 0; +} + +static int cmp_by_addr(const void *a, const void *b) +{ + const struct addr *x = a, *y = b; + + if (x->addr != y->addr) + return x->addr < y->addr ? -1 : 1; + return x->idx < y->idx ? -1 : 1; +} + +static int cmp_by_idx(const void *a, const void *b) +{ + const struct addr *x = a, *y = b; + + return x->idx < y->idx ? -1 : 1; +} + +int main(int argc, char **argv) +{ + FILE* f; + int err, i; + unsigned long long addr; + uint64_t start_ns; + double total_ns; + + /* Parse command line arguments */ + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + if (pid <= 0 || !addrs_path) { + fprintf(stderr, "Please provide PID and file with addresses to process!\n"); + exit(1); + } + + if (verbose) { + fprintf(stderr, "PID: %d\n", pid); + fprintf(stderr, "PATH: %s\n", addrs_path); + } + + f = fopen(addrs_path, "r"); + if (!f) { + err = -errno; + fprintf(stderr, "Failed to open '%s': %d\n", addrs_path, err); + goto out; + } + + while ((err = fscanf(f, "%llx\n", &addr)) == 1) { + if (addr_cnt == addr_cap) { + addr_cap = addr_cap == 0 ? 
16 : (addr_cap * 3 / 2); + addrs = realloc(addrs, sizeof(*addrs) * addr_cap); + memset(addrs + addr_cnt, 0, (addr_cap - addr_cnt) * sizeof(*addrs)); + } + + addrs[addr_cnt].addr = addr; + addrs[addr_cnt].idx = addr_cnt; + + addr_cnt++; + } + if (verbose) + fprintf(stderr, "READ %zu addrs!\n", addr_cnt); + if (!feof(f)) { + fprintf(stderr, "Failure parsing full list of addresses at '%s'!\n", addrs_path); + err = -EINVAL; + fclose(f); + goto out; + } + fclose(f); + if (addr_cnt == 0) { + fprintf(stderr, "No addresses provided, bailing out!\n"); + err = -ENOENT; + goto out; + } + + resolved = calloc(addr_cnt, sizeof(*resolved)); + + qsort(addrs, addr_cnt, sizeof(*addrs), cmp_by_addr); + if (verbose) { + fprintf(stderr, "SORTED ADDRS (%zu):\n", addr_cnt); + for (i = 0; i < addr_cnt; i++) { + fprintf(stderr, "ADDR #%d: %#llx\n", addrs[i].idx, addrs[i].addr); + } + } + + start_ns = get_time_ns(); + for (i = bench_runs ?: 1; i > 0; i--) { + if (use_ioctl) { + err = resolve_addrs_ioctl(); + } else { + err = resolve_addrs_parse(); + } + if (err) { + fprintf(stderr, "Failed to resolve addrs: %d!\n", err); + goto out; + } + } + total_ns = get_time_ns() - start_ns; + + if (bench_runs) { + fprintf(stderr, "BENCHMARK MODE. 
RUNS: %d TOTAL TIME (ms): %.3lf TIME/RUN (ms): %.3lf TIME/ADDR (us): %.3lf\n", + bench_runs, total_ns / 1000000.0, total_ns / bench_runs / 1000000.0, + total_ns / bench_runs / addr_cnt / 1000.0); + } + + /* sort them back into the original order */ + qsort(addrs, addr_cnt, sizeof(*addrs), cmp_by_idx); + + if (!quiet) { + printf("RESOLVED ADDRS (%zu):\n", addr_cnt); + for (i = 0; i < addr_cnt; i++) { + const struct addr *a = &addrs[i]; + const struct resolved_addr *r = &resolved[a->idx]; + + if (r->file_off) { + printf("RESOLVED #%d: %#llx -> OFF %#llx", + a->idx, a->addr, r->file_off); + if (r->vma_name) + printf(" NAME %s", r->vma_name); + if (r->build_id_sz) { + char build_id_str[41]; + int j; + + for (j = 0; j < r->build_id_sz; j++) + sprintf(&build_id_str[j * 2], "%02hhx", r->build_id[j]); + printf(" BUILDID %s", build_id_str); + } + printf("\n"); + } else { + printf("UNRESOLVED #%d: %#llx\n", a->idx, a->addr); + } + } + } +out: + free(addrs); + free(addrs_path); + free(resolved); + + return err < 0 ? -err : 0; +}