From patchwork Wed Mar 6 01:59:11 2024
Message-ID: <20240306020005.461538701@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:11 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 1/8] ring-buffer: Allow mapped field to be set without mapping

In preparation for having the ring buffer mapped to a dedicated location,
which will have the same restrictions as user space memory mapped buffers,
allow it to use the "mapped" field of the ring_buffer_per_cpu structure
without having the user space meta page mapping.

When this starts using the mapped field, it will need to handle adding a
user space mapping (and removing it) from a ring buffer that is using a
dedicated memory range.
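After this change, "mapped" being non-zero no longer implies that a user
space meta page exists. As an illustration only (a stand-alone model, not
part of the patch; the field names mirror the kernel code below), the
resulting states look like this:

    #include <stdio.h>

    /* Illustrative model: "mapped" acts as a reference count that a
     * range-backed buffer can hold without a user-space meta page. */
    struct cpu_buffer_model {
            unsigned int mapped;   /* > 0 while anything pins the mapping */
            void *meta_page;       /* NULL unless user space mmap'd it */
    };

    static const char *state(const struct cpu_buffer_model *b)
    {
            if (!b->mapped)
                    return "not mapped";
            return b->meta_page ? "user-space mapped" : "range-backed only";
    }

    int main(void)
    {
            struct cpu_buffer_model b = { .mapped = 1, .meta_page = NULL };
            printf("%s\n", state(&b));   /* prints "range-backed only" */
            return 0;
    }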
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d7d7a701867..524b2c185c88 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5171,6 +5171,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
     struct trace_buffer_meta *meta = cpu_buffer->meta_page;

+    if (!meta)
+        return;
+
     meta->reader.read = cpu_buffer->reader_page->read;
     meta->reader.id = cpu_buffer->reader_page->id;
     meta->reader.lost_events = cpu_buffer->lost_events;
@@ -6159,7 +6162,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)

     mutex_lock(&cpu_buffer->mapping_lock);

-    if (!cpu_buffer->mapped) {
+    if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
         mutex_unlock(&cpu_buffer->mapping_lock);
         return ERR_PTR(-ENODEV);
     }
@@ -6217,7 +6220,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu)

     mutex_lock(&cpu_buffer->mapping_lock);

-    if (cpu_buffer->mapped) {
+    if (cpu_buffer->meta_page) {
         err = __rb_inc_dec_mapped(buffer, cpu_buffer, true);
         mutex_unlock(&cpu_buffer->mapping_lock);
         return err;
@@ -6247,7 +6250,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu)
     raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
     rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
-    cpu_buffer->mapped = 1;
+    cpu_buffer->mapped++;
     raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);

 unlock:

From patchwork Wed Mar 6 01:59:12 2024
Message-ID: <20240306020005.621589841@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:12 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 2/8] ring-buffer: Add ring_buffer_alloc_range()

In preparation for allowing the trace ring buffer to be allocated in a
range of memory that is persistent across reboots, add
ring_buffer_alloc_range(). It takes a contiguous range of memory and will
split it up evenly for the per CPU ring buffers. If there is not enough
memory to handle all CPUs with the minimum size, it will fail to allocate
the ring buffer.

Signed-off-by: Steven Rostedt (Google)
---
 include/linux/ring_buffer.h |  17 +++
 kernel/trace/ring_buffer.c  | 221 ++++++++++++++++++++++++++++++------
 2 files changed, 203 insertions(+), 35 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 0841ba8bab14..17b5508f042c 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 struct trace_buffer *
 __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);

+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+                           int order, unsigned long start,
+                           unsigned long range_size,
+                           struct lock_class_key *key);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
@@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
     __ring_buffer_alloc((size), (flags), &__key);   \
 })

+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc_range(size, flags, order, start, range_size)  \
+({                                                                      \
+    static struct lock_class_key __key;                                 \
+    __ring_buffer_alloc_range((size), (flags), (order), (start),        \
+                  (range_size), &__key);                                \
+})
+
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
 __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
                   struct file *filp, poll_table *poll_table, int full);
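As a usage sketch (the caller and its variables are hypothetical, not part
of this series), a subsystem that has already reserved a contiguous region
with a kernel virtual address would invoke the new macro like this:

    /* Hypothetical caller: buf_start/buf_size describe an already
     * reserved, contiguous region (kernel virtual address + length). */
    struct trace_buffer *tb;

    tb = ring_buffer_alloc_range(0, RB_FL_OVERWRITE, 0,
                                 buf_start, buf_size);
    if (!tb)
            pr_err("range too small for the minimum per-CPU buffers\n");

Note that when a range is supplied, the @size argument is ignored: the
per-CPU buffer sizes are derived from the range itself (see alloc_buffer()
below).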
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 524b2c185c88..367597dc766b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -41,6 +41,9 @@
 static void update_pages_handler(struct work_struct *work);

+struct ring_buffer_meta {
+};
+
 /*
  * The ring buffer header is special. We must manually up keep it.
  */
@@ -339,7 +342,8 @@ struct buffer_page {
     local_t      entries;   /* entries on this page */
     unsigned long real_end; /* real end of data */
     unsigned     order;     /* order of the page */
-    u32          id;        /* ID for external mapping */
+    u32          id:30;     /* ID for external mapping */
+    u32          range:1;   /* Mapped via a range */
     struct buffer_data_page *page;  /* Actual data page */
 };

@@ -370,7 +374,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)

 static void free_buffer_page(struct buffer_page *bpage)
 {
-    free_pages((unsigned long)bpage->page, bpage->order);
+    /* Range pages are not to be freed */
+    if (!bpage->range)
+        free_pages((unsigned long)bpage->page, bpage->order);
     kfree(bpage);
 }

@@ -520,6 +526,9 @@ struct trace_buffer {
     struct rb_irq_work  irq_work;
     bool                time_stamp_abs;

+    unsigned long       range_addr_start;
+    unsigned long       range_addr_end;
+
     unsigned int        subbuf_size;
     unsigned int        subbuf_order;
     unsigned int        max_data_size;
@@ -1431,9 +1440,67 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
     }
 }

+/*
+ * Take an address, add the meta data size as well as the array of
+ * subbuffer indexes, then align it to a subbuffer size.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+    addr += sizeof(struct ring_buffer_meta) +
+        sizeof(int) * nr_subbufs;
+    return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx)
+{
+    unsigned long ptr;
+    int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+    int nr_subbufs;
+
+    /* Include the reader page */
+    nr_subbufs = nr_pages + 1;
+
+    /*
+     * The first chunk may not be subbuffer aligned, whereas
+     * the rest of the chunks are.
+     */
+    ptr = buffer->range_addr_start;
+    ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+    if (cpu) {
+        unsigned long p;
+
+        ptr += subbuf_size * nr_subbufs;
+
+        /* Save the beginning of this CPU chunk */
+        p = ptr;
+
+        ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+
+        if (cpu > 1) {
+            unsigned long size;
+
+            ptr += subbuf_size * nr_subbufs;
+
+            /* Now all chunks after this are the same size */
+            size = ptr - p;
+            ptr += size * (cpu - 2);
+
+            ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+        }
+    }
+    if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end)
+        return NULL;
+    return (void *)ptr;
+}
+
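To make the chunk layout concrete, here is a stand-alone model of how the
range is carved up (illustrative only; simplified types, not kernel code).
Each CPU chunk is a meta header plus an index array, padded up to
sub-buffer alignment, followed by nr_subbufs sub-buffers; only the first
chunk may start unaligned:

    /* Illustrative layout of one per-CPU chunk:
     *
     *   [meta][int index array][pad][subbuf 0][subbuf 1]...[subbuf N]
     *
     * ALIGN() rounds up to the next subbuf_size boundary, like the
     * kernel macro of the same name. */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    static unsigned long chunk_start(unsigned long range_start, int cpu,
                                     unsigned long meta_size,
                                     int subbuf_size, int nr_subbufs)
    {
            unsigned long ptr = range_start;

            /* skip over the chunks of all lower-numbered CPUs */
            for (int i = 0; i < cpu; i++) {
                    ptr += meta_size + sizeof(int) * nr_subbufs;
                    ptr = ALIGN(ptr, subbuf_size);
                    ptr += (unsigned long)subbuf_size * nr_subbufs;
            }
            return ptr;
    }

The diff continues: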
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
         long nr_pages, struct list_head *pages)
 {
+    struct trace_buffer *buffer = cpu_buffer->buffer;
     struct buffer_page *bpage, *tmp;
     bool user_thread = current->mm != NULL;
     gfp_t mflags;
@@ -1470,6 +1537,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
     set_current_oom_origin();
     for (i = 0; i < nr_pages; i++) {
         struct page *page;
+        int cpu = cpu_buffer->cpu;

         bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                      mflags, cpu_to_node(cpu_buffer->cpu));
@@ -1478,14 +1546,22 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,

         rb_check_bpage(cpu_buffer, bpage);

-        list_add(&bpage->list, pages);
+        list_add_tail(&bpage->list, pages);

-        page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
-                    mflags | __GFP_ZERO,
-                    cpu_buffer->buffer->subbuf_order);
-        if (!page)
-            goto free_pages;
-        bpage->page = page_address(page);
+        if (buffer->range_addr_start) {
+            /* A range was given. Use that for the buffer page */
+            bpage->page = rb_range_buffer(buffer, cpu, nr_pages, i + 1);
+            if (!bpage->page)
+                goto free_pages;
+            bpage->range = 1;
+        } else {
+            page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+                        mflags | __GFP_ZERO,
+                        cpu_buffer->buffer->subbuf_order);
+            if (!page)
+                goto free_pages;
+            bpage->page = page_address(page);
+        }
         bpage->order = cpu_buffer->buffer->subbuf_order;
         rb_init_page(bpage->page);

@@ -1567,11 +1643,18 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)

     cpu_buffer->reader_page = bpage;

-    page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
-                cpu_buffer->buffer->subbuf_order);
-    if (!page)
-        goto fail_free_reader;
-    bpage->page = page_address(page);
+    if (buffer->range_addr_start) {
+        bpage->page = rb_range_buffer(buffer, cpu, nr_pages, 0);
+        if (!bpage->page)
+            goto fail_free_reader;
+        bpage->range = 1;
+    } else {
+        page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
+                    cpu_buffer->buffer->subbuf_order);
+        if (!page)
+            goto fail_free_reader;
+        bpage->page = page_address(page);
+    }
     rb_init_page(bpage->page);

     INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1622,22 +1705,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
     kfree(cpu_buffer);
 }

-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
-                     struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+                     int order, unsigned long start,
+                     unsigned long end,
+                     struct lock_class_key *key)
 {
     struct trace_buffer *buffer;
     long nr_pages;
+    int subbuf_size;
     int bsize;
     int cpu;
     int ret;
@@ -1651,14 +1726,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
     if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
         goto fail_free_buffer;

-    /* Default buffer page size - one system page */
-    buffer->subbuf_order = 0;
-    buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+    buffer->subbuf_order = order;
+    subbuf_size = (PAGE_SIZE << order);
+    buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;

     /* Max payload is buffer page size - header (8bytes) */
     buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);

-    nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
     buffer->flags = flags;
     buffer->clock = trace_clock_local;
     buffer->reader_lock_key = key;
@@ -1666,10 +1740,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
     init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
     init_waitqueue_head(&buffer->irq_work.waiters);

-    /* need at least two pages */
-    if (nr_pages < 2)
-        nr_pages = 2;
-
     buffer->cpus = nr_cpu_ids;

     bsize = sizeof(void *) * nr_cpu_ids;
@@ -1678,6 +1748,46 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
     if (!buffer->buffers)
         goto fail_free_cpumask;

+    /* If start/end are specified, then that overrides size */
+    if (start && end) {
+        unsigned long ptr;
+        int n;
+
+        size = end - start;
+        size = size / nr_cpu_ids;
+        nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+            (subbuf_size + sizeof(int));
+        /* Need at least two pages plus the reader page */
+        if (nr_pages < 3)
+            goto fail_free_buffers;
+
+ again:
+        /* Make sure that the size fits aligned */
+        for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
+            ptr += sizeof(struct ring_buffer_meta) +
+                sizeof(int) * nr_pages;
+            ptr = ALIGN(ptr, subbuf_size);
+            ptr += subbuf_size * nr_pages;
+        }
+        if (ptr > end) {
+            if (nr_pages <= 3)
+                goto fail_free_buffers;
+            nr_pages--;
+            goto again;
+        }
+
+        /* nr_pages should not count the reader page */
+        nr_pages--;
+        buffer->range_addr_start = start;
+        buffer->range_addr_end = end;
+    } else {
+
+        /* need at least two pages */
+        nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+        if (nr_pages < 2)
+            nr_pages = 2;
+    }
+
     cpu = raw_smp_processor_id();
     cpumask_set_cpu(cpu, buffer->cpumask);
     buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
@@ -1706,8 +1816,49 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
     kfree(buffer);
     return NULL;
 }
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+                     struct lock_class_key *key)
+{
+    /* Default buffer page size - one system page */
+    return alloc_buffer(size, flags, 0, 0, 0, key);
+}
 EXPORT_SYMBOL_GPL(__ring_buffer_alloc);

+/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @order: sub-buffer order
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+                           int order, unsigned long start,
+                           unsigned long range_size,
+                           struct lock_class_key *key)
+{
+    return alloc_buffer(size, flags, order, start, start + range_size, key);
+}
+
 /**
  * ring_buffer_free - free a ring buffer.
  * @buffer: the buffer to free.
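A worked example of the sizing logic in alloc_buffer() above, with
invented numbers: given a 16 MB range, 4 possible CPUs, and order-0 (4 KB)
sub-buffers, each CPU chunk gets size = 16 MB / 4 = 4 MB, and the first
guess is

    nr_pages = (4 MB - sizeof(struct ring_buffer_meta)) / (4096 + sizeof(int))
            ~= 1023 sub-buffers (still counting the reader page)

The "again:" loop then walks all four chunks with the alignment applied
and decrements nr_pages until the last chunk ends at or before "end";
finally nr_pages is decremented once more so that it no longer counts the
reader page.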
From patchwork Wed Mar 6 01:59:13 2024
Message-ID: <20240306020005.782553145@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:13 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 3/8] tracing: Create "boot_mapped" instance for memory mapped buffer

Add two global variables trace_buffer_start and trace_buffer_size. If they
are both set, then a "boot_mapped" instance will be created using the
memory specified by these variables as its ring buffer. The instance will
exist in:

  /sys/kernel/tracing/instances/boot_mapped

Note, because the ring buffer is using a defined memory range, it will act
just like a memory mapped ring buffer. It will not have a snapshot buffer,
as it can't swap out the buffer. The snapshot files, as well as any tracers
that use a snapshot, will not be present in the boot_mapped instance.
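For context, a minimal sketch of how boot code would feed these variables
(patch 4 of this series hard-codes one concrete version of this; the
reservation itself is up to the platform, and reserved_phys_addr and
reserved_size below are hypothetical names):

    /* Hypothetical boot-time setup: both variables must be populated
     * before fs_initcall() time, when the instance is created. */
    trace_buffer_start = (unsigned long)__va(reserved_phys_addr);
    trace_buffer_size  = reserved_size;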
Signed-off-by: Steven Rostedt (Google)
---
 include/linux/trace.h |  7 +++++
 kernel/trace/trace.c  | 65 ++++++++++++++++++++++++++++++++++++-------
 kernel/trace/trace.h  |  3 ++
 3 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/include/linux/trace.h b/include/linux/trace.h
index fdcd76b7be83..75dab6bb88c9 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -33,6 +33,13 @@ struct trace_array;
 int register_ftrace_export(struct trace_export *export);
 int unregister_ftrace_export(struct trace_export *export);

+/*
+ * If the below are set, then a "boot_mapped" tracing instance will
+ * be created using this memory for its ring buffer.
+ */
+extern unsigned long trace_buffer_start;
+extern unsigned long trace_buffer_size;
+
 /**
  * trace_array_puts - write a constant string into the trace buffer.
  * @tr: The trace array to write to
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff0b0a999171..ff986d2a4bd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4900,6 +4900,11 @@ static int tracing_open(struct inode *inode, struct file *file)

 static bool
 trace_ok_for_array(struct tracer *t, struct trace_array *tr)
 {
+#ifdef CONFIG_TRACER_SNAPSHOT
+    /* arrays with mapped buffer range do not have snapshots */
+    if (tr->range_addr_start && t->use_max_tr)
+        return false;
+#endif
     return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
 }

@@ -8670,11 +8675,13 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
             tr, cpu, &tracing_entries_fops);

 #ifdef CONFIG_TRACER_SNAPSHOT
-    trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
-            tr, cpu, &snapshot_fops);
+    if (!tr->range_addr_start) {
+        trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+                tr, cpu, &snapshot_fops);

-    trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
-            tr, cpu, &snapshot_raw_fops);
+        trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+                tr, cpu, &snapshot_raw_fops);
+    }
 #endif
 }

@@ -9211,7 +9218,18 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size

     buf->tr = tr;

-    buf->buffer = ring_buffer_alloc(size, rb_flags);
+    if (tr->range_addr_start && tr->range_addr_size) {
+        buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+                              tr->range_addr_start,
+                              tr->range_addr_size);
+        /*
+         * This is basically the same as a mapped buffer,
+         * with the same restrictions.
+         */
+        tr->mapped++;
+    } else {
+        buf->buffer = ring_buffer_alloc(size, rb_flags);
+    }
     if (!buf->buffer)
         return -ENOMEM;

@@ -9248,6 +9266,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
         return ret;

 #ifdef CONFIG_TRACER_MAX_TRACE
+    /* Fixed mapped buffer trace arrays do not have snapshot buffers */
+    if (tr->range_addr_start)
+        return 0;
+
     ret = allocate_trace_buffer(tr, &tr->max_buffer,
                     allocate_snapshot ? size : 1);
     if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9348,7 +9370,9 @@ static int trace_array_create_dir(struct trace_array *tr)
 }

 static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+               unsigned long range_addr_start,
+               unsigned long range_addr_size)
 {
     struct trace_array *tr;
     int ret;
@@ -9374,6 +9398,10 @@ trace_array_create_systems(const char *name, const char *systems)
         goto out_free_tr;
     }

+    /* Only for boot up memory mapped ring buffers */
+    tr->range_addr_start = range_addr_start;
+    tr->range_addr_size = range_addr_size;
+
     tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;

     cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9431,9 +9459,24 @@ trace_array_create_systems(const char *name, const char *systems)

 static struct trace_array *trace_array_create(const char *name)
 {
-    return trace_array_create_systems(name, NULL);
+    return trace_array_create_systems(name, NULL, 0, 0);
+}
+
+unsigned long trace_buffer_start;
+unsigned long trace_buffer_size;
+
+static int __init trace_range_tr(void)
+{
+    if (!trace_buffer_start || !trace_buffer_size)
+        return 0;
+
+    trace_array_create_systems("boot_mapped", NULL,
+                   trace_buffer_start, trace_buffer_size);
+    return 0;
 }
+fs_initcall(trace_range_tr);

 static int instance_mkdir(const char *name)
 {
     struct trace_array *tr;
@@ -9485,7 +9528,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
         goto out_unlock;
     }

-    tr = trace_array_create_systems(name, systems);
+    tr = trace_array_create_systems(name, systems, 0, 0);

     if (IS_ERR(tr))
         tr = NULL;
@@ -9678,8 +9721,10 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
         MEM_FAIL(1, "Could not allocate function filter files");

 #ifdef CONFIG_TRACER_SNAPSHOT
-    trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
-              tr, &snapshot_fops);
+    if (!tr->range_addr_start) {
+        trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+                  tr, &snapshot_fops);
+    }
 #endif

     trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 749a182dab48..d22d7c3b770a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -338,6 +338,9 @@ struct trace_array {
     unsigned int        snapshot;
     unsigned int        mapped;
     unsigned long       max_latency;
+    /* The below is for memory mapped ring buffer */
+    unsigned long       range_addr_start;
+    unsigned long       range_addr_size;
 #ifdef CONFIG_FSNOTIFY
     struct dentry       *d_max_latency;
     struct work_struct  fsnotify_work;

From patchwork Wed Mar 6 01:59:14 2024
Message-ID: <20240306020005.944149230@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:14 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 4/8] HACK: Hard code in mapped tracing buffer address

Do not submit! This is for testing purposes only. It hard codes an address
that I was using to store the ring buffer range. How the memory actually
gets mapped will be another project.

Signed-off-by: Steven Rostedt (Google)
---
 arch/x86/kernel/setup.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 84201071dfac..dcba729349d3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -26,6 +26,8 @@
 #include <...>
 #include <...>

+#include <linux/trace.h>
+
 #include <...>
 #include <...>

@@ -1106,6 +1108,24 @@ void __init setup_arch(char **cmdline_p)
      */
     arch_reserve_crashkernel();

+    trace_buffer_size = 12582912;
+    {
+        phys_addr_t ftrace_addr;
+        unsigned long phys_start = 0x285400000;
+        unsigned long phys_end = phys_start + trace_buffer_size + 1024*1024;
+
+        ftrace_addr = memblock_phys_alloc_range(trace_buffer_size, 4096,
+                            phys_start, phys_end);
+        if (ftrace_addr) {
+            printk("MEMORY ALLOC %lx-%lx\n", (long)ftrace_addr,
+                   (long)ftrace_addr + trace_buffer_size);
+            trace_buffer_start = (unsigned long)__va(ftrace_addr);
+            printk("MEMORY ADDR %lx-%lx\n", trace_buffer_start,
+                   trace_buffer_start + trace_buffer_size);
+        } else
+            printk("MEMORY FAILED\n");
+    }
+
     memblock_find_dma_reserve();

     if (!early_xdbc_setup_hardware())

From patchwork Wed Mar 6 01:59:15 2024
Message-ID: <20240306020006.100449500@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:15 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 5/8] ring-buffer: Add ring_buffer_meta data

Populate the ring_buffer_meta array. It holds the pointer to the
head_buffer (next to read), the commit_buffer (next to write), the size of
the sub-buffers, the number of sub-buffers, and an array that keeps track
of the order of the sub-buffers. This information will be stored in the
persistent memory to help on reboot to reconstruct the ring buffer.
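The buffers[] index array is what lets the reader page be swapped without
copying any data. A small stand-alone simulation (indexes invented, not
kernel code) of what the reader swap in this patch does to the array:

    #include <stdio.h>

    int main(void)
    {
            /* logical slot -> physical sub-buffer; slot 0 is the reader */
            int buffers[4] = { 0, 1, 2, 3 };
            int reader_slot = 0, swapped_slot = 3;

            /* swap the reader page with the page holding id 3 */
            int tmp = buffers[reader_slot];
            buffers[reader_slot] = buffers[swapped_slot];
            buffers[swapped_slot] = tmp;

            for (int i = 0; i < 4; i++)
                    printf("buffer[%d]: %d\n", i, buffers[i]);
            return 0;
    }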
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 207 ++++++++++++++++++++++++++++++++-----
 1 file changed, 182 insertions(+), 25 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 367597dc766b..5a90ada49366 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -42,6 +42,11 @@ static void update_pages_handler(struct work_struct *work);

 struct ring_buffer_meta {
+    unsigned long   head_buffer;
+    unsigned long   commit_buffer;
+    __u32           subbuf_size;
+    __u32           nr_subbufs;
+    int             buffers[];
 };

 /*
@@ -497,6 +502,7 @@ struct ring_buffer_per_cpu {
     struct mutex            mapping_lock;
     unsigned long           *subbuf_ids;    /* ID to subbuf addr */
     struct trace_buffer_meta    *meta_page;
+    struct ring_buffer_meta     *ring_meta;

     /* ring buffer pages to update, > 0 to add, < 0 to remove */
     long                nr_pages_to_update;
@@ -1206,6 +1212,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
      * Set the previous list pointer to have the HEAD flag.
      */
     rb_set_list_to_head(head->list.prev);
+
+    if (cpu_buffer->ring_meta) {
+        struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+        meta->head_buffer = (unsigned long)head->page;
+    }
 }

 static void rb_list_head_clear(struct list_head *list)
@@ -1453,50 +1464,124 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
 }

 /*
- * Return a specific sub-buffer for a given @cpu defined by @idx.
+ * Return the ring_buffer_meta for a given @cpu.
  */
-static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx)
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
 {
-    unsigned long ptr;
     int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+    unsigned long ptr = buffer->range_addr_start;
+    struct ring_buffer_meta *meta;
     int nr_subbufs;

-    /* Include the reader page */
-    nr_subbufs = nr_pages + 1;
+    if (!ptr)
+        return NULL;
+
+    /* When nr_pages passed in is zero, the first meta has already been initialized */
+    if (!nr_pages) {
+        meta = (struct ring_buffer_meta *)ptr;
+        nr_subbufs = meta->nr_subbufs;
+    } else {
+        meta = NULL;
+        /* Include the reader page */
+        nr_subbufs = nr_pages + 1;
+    }

     /*
      * The first chunk may not be subbuffer aligned, where as
      * the rest of the chunks are.
      */
-    ptr = buffer->range_addr_start;
-    ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
     if (cpu) {
-        unsigned long p;
-
-        ptr += subbuf_size * nr_subbufs;
-
-        /* Save the beginning of this CPU chunk */
-        p = ptr;
-
         ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+        ptr += subbuf_size * nr_subbufs;

         if (cpu > 1) {
             unsigned long size;
+            unsigned long p;

+            /* Save the beginning of this CPU chunk */
+            p = ptr;
+            ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
             ptr += subbuf_size * nr_subbufs;

             /* Now all chunks after this are the same size */
             size = ptr - p;
             ptr += size * (cpu - 2);
-
-            ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
         }
     }
-    if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end)
+    return (void *)ptr;
+}
+
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+    int subbuf_size = meta->subbuf_size;
+    unsigned long ptr;
+
+    ptr = (unsigned long)meta;
+    ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+    return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+    struct ring_buffer_meta *meta;
+    unsigned long ptr;
+    int subbuf_size;
+
+    meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+    if (!meta)
+        return NULL;
+
+    if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
         return NULL;
+
+    subbuf_size = meta->subbuf_size;
+
+    /* Map this buffer to the order that's in meta->buffers[] */
+    idx = meta->buffers[idx];
+
+    ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+    ptr += subbuf_size * idx;
+    if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+        return NULL;
+
     return (void *)ptr;
 }

+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+{
+    struct ring_buffer_meta *meta;
+    void *subbuf;
+    int cpu;
+
+    for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+        meta = rb_range_meta(buffer, nr_pages, cpu);
+
+        meta->nr_subbufs = nr_pages + 1;
+        meta->subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+
+        subbuf = rb_subbufs_from_meta(meta);
+
+        /*
+         * The buffers[] array holds the order of the sub-buffers
+         * that are after the meta data. The sub-buffers may
+         * be swapped out when read and inserted into a different
+         * location of the ring buffer. Although their addresses
+         * remain the same, the buffers[] array contains the
+         * index into the sub-buffers holding their actual order.
+         */
+        for (int i = 0; i < meta->nr_subbufs; i++) {
+            meta->buffers[i] = i;
+            rb_init_page(subbuf);
+            subbuf += meta->subbuf_size;
+        }
+    }
+}
+
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
         long nr_pages, struct list_head *pages)
 {
@@ -1537,7 +1622,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
     set_current_oom_origin();
     for (i = 0; i < nr_pages; i++) {
         struct page *page;
-        int cpu = cpu_buffer->cpu;

         bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                      mflags, cpu_to_node(cpu_buffer->cpu));
@@ -1550,10 +1634,11 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,

         if (buffer->range_addr_start) {
             /* A range was given. Use that for the buffer page */
-            bpage->page = rb_range_buffer(buffer, cpu, nr_pages, i + 1);
+            bpage->page = rb_range_buffer(cpu_buffer, i + 1);
             if (!bpage->page)
                 goto free_pages;
             bpage->range = 1;
+            bpage->id = i + 1;
         } else {
             page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
                         mflags | __GFP_ZERO,
@@ -1561,9 +1646,9 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
             if (!page)
                 goto free_pages;
             bpage->page = page_address(page);
+            rb_init_page(bpage->page);
         }
         bpage->order = cpu_buffer->buffer->subbuf_order;
-        rb_init_page(bpage->page);

         if (user_thread && fatal_signal_pending(current))
             goto free_pages;
@@ -1644,7 +1729,13 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
     cpu_buffer->reader_page = bpage;

     if (buffer->range_addr_start) {
-        bpage->page = rb_range_buffer(buffer, cpu, nr_pages, 0);
+        /*
+         * Range mapped buffers have the same restrictions as memory
+         * mapped ones do.
+         */
+        cpu_buffer->mapped = 1;
+        cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+        bpage->page = rb_range_buffer(cpu_buffer, 0);
         if (!bpage->page)
             goto fail_free_reader;
         bpage->range = 1;
@@ -1654,8 +1745,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
         if (!page)
             goto fail_free_reader;
         bpage->page = page_address(page);
+        rb_init_page(bpage->page);
     }
-    rb_init_page(bpage->page);

     INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
     INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1669,6 +1760,10 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
     cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;

     rb_head_page_activate(cpu_buffer);
+    if (cpu_buffer->ring_meta) {
+        struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+        meta->commit_buffer = meta->head_buffer;
+    }

     return cpu_buffer;

@@ -1780,6 +1875,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
         nr_pages--;
         buffer->range_addr_start = start;
         buffer->range_addr_end = end;
+
+        rb_range_meta_init(buffer, nr_pages);
     } else {

         /* need at least two pages */
@@ -2464,6 +2561,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
     iter->next_event = 0;
 }

+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+{
+    void *subbuf_array;
+
+    subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+    subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+    return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+                struct buffer_page *next_page)
+{
+    struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+    unsigned long old_head = (unsigned long)next_page->page;
+    unsigned long new_head;
+
+    rb_inc_page(&next_page);
+    new_head = (unsigned long)next_page->page;
+
+    /*
+     * Only move it forward once, if something else came in and
+     * moved it forward, then we don't want to touch it.
+     */
+    (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct buffer_page *reader)
+{
+    struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+    void *old_reader = cpu_buffer->reader_page->page;
+    void *new_reader = reader->page;
+    int id;
+
+    id = reader->id;
+    cpu_buffer->reader_page->id = id;
+    reader->id = 0;
+
+    meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+    meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+    /* The head pointer is the one after the reader */
+    rb_update_meta_head(cpu_buffer, reader);
+}
+
 /*
  * rb_handle_head_page - writer hit the head page
  *
@@ -2513,6 +2656,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
         local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
         local_inc(&cpu_buffer->pages_lost);

+        if (cpu_buffer->ring_meta)
+            rb_update_meta_head(cpu_buffer, next_page);
         /*
          * The entries will be zeroed out when we move the
          * tail page.
@@ -3074,6 +3219,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
         local_set(&cpu_buffer->commit_page->page->commit,
               rb_page_write(cpu_buffer->commit_page));
         rb_inc_page(&cpu_buffer->commit_page);
+        if (cpu_buffer->ring_meta) {
+            struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+            meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+        }
         /* add barrier to keep gcc from optimizing too much */
         barrier();
     }
@@ -4691,6 +4840,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
     if (!ret)
         goto spin;

+    if (cpu_buffer->ring_meta)
+        rb_update_meta_reader(cpu_buffer, reader);
+
     /*
      * Yay! We succeeded in replacing the page.
      *
@@ -5381,11 +5533,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
     cpu_buffer->lost_events = 0;
     cpu_buffer->last_overrun = 0;

-    if (cpu_buffer->mapped)
-        rb_update_meta_page(cpu_buffer);
-
     rb_head_page_activate(cpu_buffer);
     cpu_buffer->pages_removed = 0;
+
+    if (cpu_buffer->mapped) {
+        rb_update_meta_page(cpu_buffer);
+        if (cpu_buffer->ring_meta) {
+            struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+            meta->commit_buffer = meta->head_buffer;
+        }
+    }
 }

 /* Must have disabled the cpu buffer then done a synchronize_rcu */

From patchwork Wed Mar 6 01:59:16 2024
Message-ID: <20240306020006.261750712@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:16 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 6/8] ring-buffer: Add output of ring buffer meta page

Add a buffer_meta per-cpu file for the trace instance that is mapped to
boot memory. This shows the current meta-data and can be used by user
space tools to record off the current mappings to help reconstruct the
ring buffer after a reboot.

It does not expose any virtual addresses, just indexes into the sub-buffer
pages.
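Based on the rbm_show() seq_file callback in the diff below, the file
contents would look roughly like this (all values invented for
illustration; head_buffer and commit_buffer are printed as sub-buffer
indexes, not addresses):

    head_buffer: 4
    commit_buffer: 4
    subbuf_size: 4096
    nr_subbufs: 13
    buffer[0]: 11
    buffer[1]: 1
    buffer[2]: 2
    ...
    buffer[12]: 4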
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 77 ++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c       | 30 ++++++++++++++-
 kernel/trace/trace.h       |  2 +
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a90ada49366..1e06ebe36ad1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -31,6 +31,8 @@
 #include <...>
 #include <...>

+#include "trace.h"
+
 /*
  * The "absolute" timestamp in the buffer is only 59 bits.
  * If a clock has the 5 MSBs set, it needs to be saved and
@@ -1582,6 +1584,81 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
     }
 }

+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+    struct ring_buffer_per_cpu *cpu_buffer = m->private;
+    struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+    unsigned long val;
+
+    if (!meta)
+        return NULL;
+
+    if (*pos > meta->nr_subbufs)
+        return NULL;
+
+    val = *pos;
+    val++;
+
+    return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+    (*pos)++;
+
+    return rbm_start(m, pos);
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+    struct ring_buffer_per_cpu *cpu_buffer = m->private;
+    struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+    unsigned long val = (unsigned long)v;
+
+    if (val == 1) {
+        seq_printf(m, "head_buffer: %d\n",
+               rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+        seq_printf(m, "commit_buffer: %d\n",
+               rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+        seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+        seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+        return 0;
+    }
+
+    val -= 2;
+    seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+    return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+    .start      = rbm_start,
+    .next       = rbm_next,
+    .show       = rbm_show,
+    .stop       = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+    struct seq_file *m;
+    int ret;
+
+    ret = seq_open(file, &rb_meta_seq_ops);
+    if (ret)
+        return ret;
+
+    m = file->private_data;
+    m->private = buffer->buffers[cpu];
+
+    return 0;
+}
+
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
         long nr_pages, struct list_head *pages)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff986d2a4bd0..b4a7960aed98 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4997,7 +4997,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
     return 0;
 }

-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
 {
     struct trace_array *tr = inode->i_private;

@@ -5038,7 +5038,7 @@ static const struct file_operations show_traces_fops = {
     .open       = show_traces_open,
     .read       = seq_read,
     .llseek     = seq_lseek,
-    .release    = show_traces_release,
+    .release    = tracing_seq_release,
 };

 static ssize_t
@@ -6840,6 +6840,22 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
     return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }

+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+    struct trace_array *tr = inode->i_private;
+    int cpu = tracing_get_cpu(inode);
+    int ret;
+
+    ret = tracing_check_open_get_tr(tr);
+    if (ret)
+        return ret;
+
+    ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+    if (ret < 0)
+        __trace_array_put(tr);
+    return ret;
+}
+
 static ssize_t
 tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
               size_t cnt, loff_t *ppos)
@@ -7416,6 +7432,13 @@ static const struct file_operations tracing_entries_fops = {
     .release    = tracing_release_generic_tr,
 };

+static const struct file_operations tracing_buffer_meta_fops = {
+    .open       = tracing_buffer_meta_open,
+    .read       = seq_read,
+    .llseek     = seq_lseek,
+    .release    = tracing_seq_release,
+};
+
 static const struct file_operations tracing_total_entries_fops = {
     .open       = tracing_open_generic_tr,
     .read       = tracing_total_entries_read,
@@ -8674,6 +8697,9 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
     trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
             tr, cpu, &tracing_entries_fops);

+    if (tr->range_addr_start)
+        trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+                      tr, cpu, &tracing_buffer_meta_fops);
 #ifdef CONFIG_TRACER_SNAPSHOT
     if (!tr->range_addr_start) {
         trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d22d7c3b770a..ccff4891c2ac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -644,6 +644,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
               unsigned long len,
               unsigned int trace_ctx);

+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
                         struct trace_array_cpu *data);
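A hypothetical user-space reader for the new file (the path assumes the
"boot_mapped" instance from patch 3 and the standard tracefs per_cpu
layout):

    #include <stdio.h>

    int main(void)
    {
            char line[128];
            FILE *f = fopen("/sys/kernel/tracing/instances/boot_mapped/"
                            "per_cpu/cpu0/buffer_meta", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);    /* dump the meta data */
            fclose(f);
            return 0;
    }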
From patchwork Wed Mar 6 01:59:17 2024
Message-ID: <20240306020006.423802400@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:17 -0500
From: "Steven Rostedt (Google)"
Subject: [PATCH 7/8] ring-buffer: Add test if range of boot buffer is valid

Add a test against the ring buffer memory range to see if it has valid
data. The ring_buffer_meta structure is given a new field called
"first_buffer" which holds the address of the first sub-buffer. This is
used both to determine if the other fields are valid and to find the
offset between the old addresses of the sub-buffers from the previous boot
and the new addresses of the current boot.

Since the values for nr_subbufs and subbuf_size are expected to be the
same, check if the values in the meta page match the values calculated.

Take the range of the first_buffer and the total size of all the buffers
and make sure the saved head_buffer and commit_buffer fall in the range.

Iterate through all the sub-buffers to make sure that the value in each
sub-buffer's "commit" field (the field that holds the amount of data on
the sub-buffer) is within the end of the sub-buffer. Also check the index
array to make sure that all the indexes are within nr_subbufs.
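A small illustration of the address rebase performed when a previous
boot's meta data passes these checks (addresses invented; the fields are
the ones added by this series). The range may land at a different virtual
address on this boot, so every saved pointer is shifted by the same delta:

    /* Illustrative only -- mirrors the delta logic in the diff below. */
    unsigned long old_first = 0xffff888285400000UL; /* saved first_buffer */
    unsigned long new_first = 0xffff888285600000UL; /* this boot's address */
    long delta = new_first - old_first;             /* may be negative */

    meta->first_buffer  += delta;
    meta->head_buffer   += delta;
    meta->commit_buffer += delta;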
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1e06ebe36ad1..e74185a4d864 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -44,6 +44,7 @@ static void update_pages_handler(struct work_struct *work);
 struct ring_buffer_meta {
+	unsigned long	first_buffer;
 	unsigned long	head_buffer;
 	unsigned long	commit_buffer;
 	__u32		subbuf_size;
@@ -1554,20 +1555,101 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
 	return (void *)ptr;
 }
 
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+			  struct trace_buffer *buffer, int nr_pages)
+{
+	int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+	struct buffer_data_page *subbuf;
+	unsigned long buffers_start;
+	unsigned long buffers_end;
+
+	/* The subbuffer's size and number of subbuffers must match */
+	if (meta->subbuf_size != subbuf_size ||
+	    meta->nr_subbufs != nr_pages + 1) {
+		pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
+		return false;
+	}
+
+	buffers_start = meta->first_buffer;
+	buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+	/* Are the head and commit buffers within the range of buffers? */
+	if (meta->head_buffer < buffers_start ||
+	    meta->head_buffer >= buffers_end) {
+		pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+		return false;
+	}
+
+	if (meta->commit_buffer < buffers_start ||
+	    meta->commit_buffer >= buffers_end) {
+		pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+		return false;
+	}
+
+	subbuf = rb_subbufs_from_meta(meta);
+
+	/* Do the meta buffers and the subbufs themselves have correct data? */
+	for (int i = 0; i < meta->nr_subbufs; i++) {
+		if (meta->buffers[i] < 0 ||
+		    meta->buffers[i] >= meta->nr_subbufs) {
+			pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+			return false;
+		}
+
+		if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+			pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+			return false;
+		}
+
+		subbuf = (void *)subbuf + subbuf_size;
+	}
+
+	pr_info("Ring buffer meta is from previous boot!\n");
+	return true;
+}
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
 	struct ring_buffer_meta *meta;
+	unsigned long delta;
 	void *subbuf;
 	int cpu;
 
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+		void *next_meta;
+
 		meta = rb_range_meta(buffer, nr_pages, cpu);
 
+		if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+			/* Make the mappings match the current address */
+			subbuf = rb_subbufs_from_meta(meta);
+			delta = (unsigned long)subbuf - meta->first_buffer;
+			meta->first_buffer += delta;
+			meta->head_buffer += delta;
+			meta->commit_buffer += delta;
+			continue;
+		}
+
+		if (cpu < nr_cpu_ids - 1)
+			next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+		else
+			next_meta = (void *)buffer->range_addr_end;
+
+		memset(meta, 0, next_meta - (void *)meta);
+
 		meta->nr_subbufs = nr_pages + 1;
 		meta->subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
 
 		subbuf = rb_subbufs_from_meta(meta);
 
+		meta->first_buffer = (unsigned long)subbuf;
+
 		/*
 		 * The buffers[] array holds the order of the sub-buffers
 		 * that are after the meta data. The sub-buffers may
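[Editor's note: the next_meta/memset logic above sizes each wipe so that resetting one CPU's region cannot touch its neighbor's. A small sketch of that carving follows; rb_range_meta() is modeled here as plain offset arithmetic, since the real helper is not shown in this series excerpt.]

#include <string.h>

struct range {
	void *start;
	void *end;
	unsigned long per_cpu_size;	/* meta page plus all sub-buffers */
};

static void *range_meta(struct range *r, int cpu)
{
	return (char *)r->start + (unsigned long)cpu * r->per_cpu_size;
}

static void reset_cpu_meta(struct range *r, int cpu, int nr_cpus)
{
	void *meta = range_meta(r, cpu);
	void *next_meta;

	/* The region ends at the next CPU's meta, or at the range end */
	if (cpu < nr_cpus - 1)
		next_meta = range_meta(r, cpu + 1);
	else
		next_meta = r->end;

	/* Wipe everything belonging to this CPU in one go */
	memset(meta, 0, (char *)next_meta - (char *)meta);
}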
@@ -1659,10 +1741,26 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in
 	return 0;
 }
 
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+				  struct buffer_page *bpage)
+{
+	struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+
+	if (meta->head_buffer == (unsigned long)bpage->page)
+		cpu_buffer->head_page = bpage;
+
+	if (meta->commit_buffer == (unsigned long)bpage->page) {
+		cpu_buffer->commit_page = bpage;
+		cpu_buffer->tail_page = bpage;
+	}
+}
+
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
			       long nr_pages, struct list_head *pages)
 {
 	struct trace_buffer *buffer = cpu_buffer->buffer;
+	struct ring_buffer_meta *meta = NULL;
 	struct buffer_page *bpage, *tmp;
 	bool user_thread = current->mm != NULL;
 	gfp_t mflags;
@@ -1697,6 +1795,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	 */
 	if (user_thread)
 		set_current_oom_origin();
+
+	if (buffer->range_addr_start)
+		meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 
@@ -1709,11 +1811,14 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 
 		list_add_tail(&bpage->list, pages);
 
-		if (buffer->range_addr_start) {
+		if (meta) {
 			/* A range was given. Use that for the buffer page */
 			bpage->page = rb_range_buffer(cpu_buffer, i + 1);
 			if (!bpage->page)
 				goto free_pages;
+			/* If this is valid from a previous boot */
+			if (meta->head_buffer)
+				rb_meta_buffer_update(cpu_buffer, bpage);
 			bpage->range = 1;
 			bpage->id = i + 1;
 		} else {
@@ -1775,6 +1880,7 @@ static struct ring_buffer_per_cpu *
 rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_meta *meta;
 	struct buffer_page *bpage;
 	struct page *page;
 	int ret;
@@ -1815,6 +1921,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 		bpage->page = rb_range_buffer(cpu_buffer, 0);
 		if (!bpage->page)
 			goto fail_free_reader;
+		if (cpu_buffer->ring_meta->head_buffer)
+			rb_meta_buffer_update(cpu_buffer, bpage);
 		bpage->range = 1;
 	} else {
 		page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
@@ -1832,14 +1940,32 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	if (ret < 0)
 		goto fail_free_reader;
 
-	cpu_buffer->head_page
-		= list_entry(cpu_buffer->pages, struct buffer_page, list);
-	cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+	/* If the boot meta was valid then this has already been updated */
+	meta = cpu_buffer->ring_meta;
+	if (!meta || !meta->head_buffer ||
+	    !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+		if (meta && meta->head_buffer &&
+		    (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+			pr_warn("Ring buffer meta buffers not all mapped\n");
+			if (!cpu_buffer->head_page)
+				pr_warn("   Missing head_page\n");
+			if (!cpu_buffer->commit_page)
+				pr_warn("   Missing commit_page\n");
+			if (!cpu_buffer->tail_page)
+				pr_warn("   Missing tail_page\n");
+		}
 
-	rb_head_page_activate(cpu_buffer);
-	if (cpu_buffer->ring_meta) {
-		struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
-		meta->commit_buffer = meta->head_buffer;
+		cpu_buffer->head_page
+			= list_entry(cpu_buffer->pages, struct buffer_page, list);
+		cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+		rb_head_page_activate(cpu_buffer);
+
+		if (cpu_buffer->ring_meta)
+			meta->commit_buffer = meta->head_buffer;
+	} else {
+		/* The valid meta buffer still needs to activate the head page */
+		rb_head_page_activate(cpu_buffer);
 	}
 
 	return cpu_buffer;
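[Editor's note: the effect of rb_meta_buffer_update() during the allocation loops above can be seen in miniature below. The fake page array and types are inventions for illustration, but the matching rule is the same: whichever freshly allocated page lands on the saved head or commit address becomes the head or commit (and tail) page.]

#include <stdio.h>

struct fake_page { unsigned long addr; };
struct fake_cpu_buffer {
	struct fake_page *head, *commit, *tail;
	unsigned long head_addr, commit_addr;	/* from the (rebased) meta */
};

static void meta_buffer_update(struct fake_cpu_buffer *cb, struct fake_page *p)
{
	if (cb->head_addr == p->addr)
		cb->head = p;
	if (cb->commit_addr == p->addr) {
		/* Writing resumes where the previous boot committed */
		cb->commit = p;
		cb->tail = p;
	}
}

int main(void)
{
	struct fake_page pages[4] = { {0x1000}, {0x2000}, {0x3000}, {0x4000} };
	struct fake_cpu_buffer cb = { .head_addr = 0x2000, .commit_addr = 0x4000 };

	for (int i = 0; i < 4; i++)
		meta_buffer_update(&cb, &pages[i]);

	printf("head=%#lx commit=%#lx\n", cb.head->addr, cb.commit->addr);
	return 0;
}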
From patchwork Wed Mar 6 01:59:18 2024
Message-ID: <20240306020006.586558735@goodmis.org>
Date: Tue, 05 Mar 2024 20:59:18 -0500
From: Steven Rostedt
To: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
    Vincent Donnefort, Joel Fernandes, Daniel Bristot de Oliveira,
    Ingo Molnar, Peter Zijlstra, suleiman@google.com, Thomas Gleixner,
    Vineeth Pillai, Youssef Esmat, Beau Belgrave, Alexander Graf,
    Baoquan He, Borislav Petkov, "Paul E. McKenney", David Howells
Subject: [PATCH 8/8] ring-buffer: Validate boot range memory events
References: <20240306015910.766510873@goodmis.org>

From: "Steven Rostedt (Google)"

Make sure all the events in each of the sub-buffers that were mapped in
a memory region are valid. This moves the code that walks the buffers for
time-stamp validation out of the CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
ifdef block and uses it to validate the content. Only the ring buffer
event meta data is checked, not the data payload.

This also has a second purpose. The buffer_page structure that points to
the data sub-buffers has accounting that keeps track of the number of
events that are on the sub-buffer. This updates that counter as well.
That counter is used when reading the buffer to know whether the ring
buffer is empty or not.

Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 222 +++++++++++++++++++++++++++----------
 1 file changed, 165 insertions(+), 57 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e74185a4d864..f7b511935fcf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1610,10 +1610,171 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
 		subbuf = (void *)subbuf + subbuf_size;
 	}
 
-	pr_info("Ring buffer meta is from previous boot!\n");
 	return true;
 }
 
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+static DEFINE_PER_CPU(atomic_t, checking);
+static atomic_t ts_dump;
+
+#define buffer_warn_return(fmt, ...)					\
+	do {								\
+		/* If another report is happening, ignore this one */	\
+		if (atomic_inc_return(&ts_dump) != 1) {			\
+			atomic_dec(&ts_dump);				\
+			goto out;					\
+		}							\
+		atomic_inc(&cpu_buffer->record_disabled);		\
+		pr_warn(fmt, ##__VA_ARGS__);				\
+		dump_buffer_page(bpage, info, tail);			\
+		atomic_dec(&ts_dump);					\
+		/* There are some cases in boot up that this can happen */ \
+		if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING))	\
+			/* Do not re-enable checking */			\
+			return;						\
+	} while (0)
+#else
+#define buffer_warn_return(fmt, ...) do { } while (0)
+#endif
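[Editor's note: buffer_warn_return() above layers a once-only guard on top of the warning so that concurrent reports do not interleave their dumps. Stripped of the kernel specifics (disabling recording, dumping the page), the guard pattern looks like this in portable C11; the names are invented for illustration.]

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dump_running;

static void report_once(const char *msg)
{
	/* Only one reporter at a time; later callers back out and bail */
	if (atomic_fetch_add(&dump_running, 1) != 0) {
		atomic_fetch_sub(&dump_running, 1);
		return;
	}
	fprintf(stderr, "%s\n", msg);
	atomic_fetch_sub(&dump_running, 1);
}

The kernel macro additionally returns from the calling function once the system is past boot, so that checking stays disabled after a corruption has been reported.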
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+			       unsigned long long *timestamp, bool warn)
+{
+	struct ring_buffer_event *event;
+	u64 ts, delta;
+	int events = 0;
+	int e;
+
+	ts = dpage->time_stamp;
+
+	for (e = 0; e < tail; e += rb_event_length(event)) {
+
+		event = (struct ring_buffer_event *)(dpage->data + e);
+
+		switch (event->type_len) {
+
+		case RINGBUF_TYPE_TIME_EXTEND:
+			delta = rb_event_time_stamp(event);
+			ts += delta;
+			break;
+
+		case RINGBUF_TYPE_TIME_STAMP:
+			delta = rb_event_time_stamp(event);
+			delta = rb_fix_abs_ts(delta, ts);
+			if (warn && delta < ts) {
+				buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+						   cpu, ts, delta);
+			}
+			ts = delta;
+			break;
+
+		case RINGBUF_TYPE_PADDING:
+			if (event->time_delta == 1)
+				break;
+			fallthrough;
+		case RINGBUF_TYPE_DATA:
+			events++;
+			ts += event->time_delta;
+			break;
+
+		default:
+			return -1;
+		}
+	}
+	*timestamp = ts;
+	return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+	unsigned long long ts;
+	int tail;
+
+	tail = local_read(&dpage->commit);
+	return rb_read_data_buffer(dpage, tail, cpu, &ts, false);
+}
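[Editor's note: rb_validate_buffer() trusts the commit offset only if the walk lands exactly on record boundaries. A simplified stand-alone model of that walk follows; the two-byte length prefix is an invention standing in for the real 32-bit event header, so this is illustrative only.]

#include <stdint.h>
#include <string.h>

/* Returns the number of records, or -1 if the walk overruns the commit */
static int validate_page(const uint8_t *data, size_t commit)
{
	size_t off = 0;
	int records = 0;

	while (off < commit) {
		uint16_t len;

		if (off + sizeof(len) > commit)
			return -1;	/* truncated header */
		memcpy(&len, data + off, sizeof(len));
		if (len == 0 || off + len > commit)
			return -1;	/* corrupt length */
		off += len;
		records++;
	}
	return records;
}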
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+	struct buffer_page *head_page;
+	unsigned long entry_bytes = 0;
+	unsigned long entries = 0;
+	int ret;
+	int i;
+
+	if (!meta || !meta->head_buffer)
+		return;
+
+	head_page = cpu_buffer->head_page;
+
+	/* Do the reader page first */
+	ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+	if (ret < 0) {
+		pr_info("Ring buffer meta [%d] invalid reader page\n", cpu_buffer->cpu);
+		goto invalid;
+	}
+	entries += ret;
+	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+	local_set(&cpu_buffer->reader_page->entries, ret);
+
+	/* If both the head and commit are on the reader_page then we are done */
+	if (head_page == cpu_buffer->reader_page &&
+	    head_page == cpu_buffer->commit_page)
+		goto done;
+
+	/* Iterate until finding the commit page */
+	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+		/* Reader page has already been done */
+		if (head_page == cpu_buffer->reader_page)
+			continue;
+
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		if (ret < 0) {
+			pr_info("Ring buffer meta [%d] invalid buffer page\n",
+				cpu_buffer->cpu);
+			goto invalid;
+		}
+		entries += ret;
+		entry_bytes += local_read(&head_page->page->commit);
+		local_set(&head_page->entries, ret);
+
+		if (head_page == cpu_buffer->commit_page)
+			break;
+	}
+
+	if (head_page != cpu_buffer->commit_page) {
+		pr_info("Ring buffer meta [%d] commit page not found\n",
+			cpu_buffer->cpu);
+		goto invalid;
+	}
+ done:
+	local_set(&cpu_buffer->entries, entries);
+	local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+	pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+	return;
+
+ invalid:
+	/* The content of the buffers is invalid, reset the meta data */
+	meta->head_buffer = 0;
+	meta->commit_buffer = 0;
+
+	/* Reset the reader page */
+	local_set(&cpu_buffer->reader_page->entries, 0);
+	local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+	/* Reset all the subbuffers */
+	for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+		local_set(&head_page->entries, 0);
+		local_set(&head_page->page->commit, 0);
+	}
+}
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
 	struct ring_buffer_meta *meta;
@@ -1691,8 +1852,6 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
 	return rbm_start(m, pos);
 }
 
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
-
 static int rbm_show(struct seq_file *m, void *v)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = m->private;
@@ -1940,6 +2099,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	if (ret < 0)
 		goto fail_free_reader;
 
+	rb_meta_validate_events(cpu_buffer);
+
 	/* If the boot meta was valid then this has already been updated */
 	meta = cpu_buffer->ring_meta;
 	if (!meta || !meta->head_buffer ||
@@ -3844,26 +4005,6 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 		pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
 }
 
-static DEFINE_PER_CPU(atomic_t, checking);
-static atomic_t ts_dump;
-
-#define buffer_warn_return(fmt, ...)					\
-	do {								\
-		/* If another report is happening, ignore this one */	\
-		if (atomic_inc_return(&ts_dump) != 1) {			\
-			atomic_dec(&ts_dump);				\
-			goto out;					\
-		}							\
-		atomic_inc(&cpu_buffer->record_disabled);		\
-		pr_warn(fmt, ##__VA_ARGS__);				\
-		dump_buffer_page(bpage, info, tail);			\
-		atomic_dec(&ts_dump);					\
-		/* There are some cases in boot up that this can happen */ \
-		if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING))	\
-			/* Do not re-enable checking */			\
-			return;						\
-	} while (0)
-
 /*
  * Check if the current event time stamp matches the deltas on
  * the buffer page.
@@ -3902,41 +4043,8 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
 
-	ts = bpage->time_stamp;
-
-	for (e = 0; e < tail; e += rb_event_length(event)) {
-
-		event = (struct ring_buffer_event *)(bpage->data + e);
-
-		switch (event->type_len) {
-
-		case RINGBUF_TYPE_TIME_EXTEND:
-			delta = rb_event_time_stamp(event);
-			ts += delta;
-			break;
-
-		case RINGBUF_TYPE_TIME_STAMP:
-			delta = rb_event_time_stamp(event);
-			delta = rb_fix_abs_ts(delta, ts);
-			if (delta < ts) {
-				buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
-						   cpu_buffer->cpu, ts, delta);
-			}
-			ts = delta;
-			break;
+	ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, true);
 
-		case RINGBUF_TYPE_PADDING:
-			if (event->time_delta == 1)
-				break;
-			fallthrough;
-		case RINGBUF_TYPE_DATA:
-			ts += event->time_delta;
-			break;
-
-		default:
-			RB_WARN_ON(cpu_buffer, 1);
-		}
-	}
 	if ((full && ts > info->ts) ||
 	    (!full && ts + info->delta != info->ts)) {
 		buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",