From patchwork Tue Jun 16 07:49:27 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606743 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 6A1796A2 for ; Tue, 16 Jun 2020 07:49:49 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 4E7FF207C4 for ; Tue, 16 Jun 2020 07:49:49 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="FG9PpeCl" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726303AbgFPHts (ORCPT ); Tue, 16 Jun 2020 03:49:48 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56984 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726428AbgFPHtq (ORCPT ); Tue, 16 Jun 2020 03:49:46 -0400 Received: from mail-pf1-x443.google.com (mail-pf1-x443.google.com [IPv6:2607:f8b0:4864:20::443]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D3C4FC03E97C for ; Tue, 16 Jun 2020 00:49:44 -0700 (PDT) Received: by mail-pf1-x443.google.com with SMTP id d66so9107946pfd.6 for ; Tue, 16 Jun 2020 00:49:44 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=mjcyOk28N/gDNh+lNJI3lw4LaLVWwmKl55JA3X6Dw/I=; b=FG9PpeCl4IGZS0FoNmzdMakw2RdED+vA0suD67BJd2Q1POY++LxiBHhwe4plC4cga8 iDPgxWJGUXT92vT8ZuWCJtOCWA1ztsAt8azrEh5ct14BZQG2ahACYirwzmmPk7G6j8pv DaCnf8hnlAC4JZ0uKMWM+knX75nXFBb223vWw= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; 
bh=mjcyOk28N/gDNh+lNJI3lw4LaLVWwmKl55JA3X6Dw/I=; b=rV+4UTx2xQUEBLADg097xGXwIB1xrct4517Zr0mY0XgpH60RSIB8DUISsMq6KuwMrJ MPeXVuAgGpsV4PDbU1ooWh7ssk7HDvJfu/YHHGrUlgRVcy9rBTHNCfW/U8TYyS0lZYJm iJ3nDH9J0XJalzvsA5GpUdiwDSS+FPMgltd/Yh2DNBSrrTDb/E+ljQj2oM/qutDZrpPX dNOgHq56BY9MWXXZMt3GLth0zpazrO/Ajq+KMU+ZUN9qhu82jap71wW/KqZXEcYiFlnG YSOVnCLTkhPG79ZLufjD2QCu1Y/fPznWGUCR6w0IM8zcn7SybH06b7F5DJjYd4B1oBFv I9Yg== X-Gm-Message-State: AOAM530uwn9VOnroHEXzmtuYxYX9RGhrMGjiJjZk6Ha6i5hU28UJCq5K ifWxxA/JtZYmVDG9OEJPWao+tQ== X-Google-Smtp-Source: ABdhPJyGg39N+96ssceO/9HxF6jdYQbkHmDZ4rA7Tk6OOphboXmqXiNDvckhAdVMS9jNEwdwSrEJpg== X-Received: by 2002:a63:1862:: with SMTP id 34mr1181225pgy.246.1592293783735; Tue, 16 Jun 2020 00:49:43 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. [198.145.64.163]) by smtp.gmail.com with ESMTPSA id a7sm1587517pjd.2.2020.06.16.00.49.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:42 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 1/8] selftests/seccomp: Improve calibration loop Date: Tue, 16 Jun 2020 00:49:27 -0700 Message-Id: <20200616074934.1600036-2-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: The seccomp benchmark calibration loop did not need to take so long. Instead, use a simple 1 second timeout and multiply up to target. It does not need to be accurate. 
Signed-off-by: Kees Cook --- .../selftests/seccomp/seccomp_benchmark.c | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index eca13fe1fba9..91f5a89cadac 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -18,9 +18,9 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples) { - pid_t pid, ret; - unsigned long long i; struct timespec start, finish; + unsigned long long i; + pid_t pid, ret; pid = getpid(); assert(clock_gettime(clk_id, &start) == 0); @@ -31,30 +31,43 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples) assert(clock_gettime(clk_id, &finish) == 0); i = finish.tv_sec - start.tv_sec; - i *= 1000000000; + i *= 1000000000ULL; i += finish.tv_nsec - start.tv_nsec; - printf("%lu.%09lu - %lu.%09lu = %llu\n", + printf("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n", finish.tv_sec, finish.tv_nsec, start.tv_sec, start.tv_nsec, - i); + i, (double)i / 1000000000.0); return i; } unsigned long long calibrate(void) { - unsigned long long i; - - printf("Calibrating reasonable sample size...\n"); + struct timespec start, finish; + unsigned long long i, samples, step = 9973; + pid_t pid, ret; + int seconds = 15; - for (i = 5; ; i++) { - unsigned long long samples = 1 << i; + printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds); - /* Find something that takes more than 5 seconds to run. 
*/ - if (timing(CLOCK_REALTIME, samples) / 1000000000ULL > 5) - return samples; - } + samples = 0; + pid = getpid(); + assert(clock_gettime(CLOCK_MONOTONIC, &start) == 0); + do { + for (i = 0; i < step; i++) { + ret = syscall(__NR_getpid); + assert(pid == ret); + } + assert(clock_gettime(CLOCK_MONOTONIC, &finish) == 0); + + samples += step; + i = finish.tv_sec - start.tv_sec; + i *= 1000000000ULL; + i += finish.tv_nsec - start.tv_nsec; + } while (i < 1000000000ULL); + + return samples * seconds; } int main(int argc, char *argv[]) @@ -70,15 +83,16 @@ int main(int argc, char *argv[]) unsigned long long samples; unsigned long long native, filter1, filter2; + printf("Current BPF sysctl settings:\n"); + system("sysctl net.core.bpf_jit_enable"); + system("sysctl net.core.bpf_jit_harden"); + if (argc > 1) samples = strtoull(argv[1], NULL, 0); else samples = calibrate(); - printf("Current BPF sysctl settings:\n"); - system("sysctl net.core.bpf_jit_enable"); - system("sysctl net.core.bpf_jit_harden"); - printf("Benchmarking %llu samples...\n", samples); + printf("Benchmarking %llu syscalls...\n", samples); /* Native call */ native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; From patchwork Tue Jun 16 07:49:28 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606745 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 3814A913 for ; Tue, 16 Jun 2020 07:49:55 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 21078207BC for ; Tue, 16 Jun 2020 07:49:55 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="Mha/1v4Z" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1725896AbgFPHtw (ORCPT ); 
Tue, 16 Jun 2020 03:49:52 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56992 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726496AbgFPHtr (ORCPT ); Tue, 16 Jun 2020 03:49:47 -0400 Received: from mail-pf1-x444.google.com (mail-pf1-x444.google.com [IPv6:2607:f8b0:4864:20::444]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id E6695C08C5C5 for ; Tue, 16 Jun 2020 00:49:45 -0700 (PDT) Received: by mail-pf1-x444.google.com with SMTP id 23so9087563pfw.10 for ; Tue, 16 Jun 2020 00:49:45 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=aNHAhNI6ZTS0Tgwm1Tz8nA62WMbjyUFexQ8Av4x5DPE=; b=Mha/1v4ZCwL99EzxQ318yAgdODuTalg/C5a5af5exiokd8c5p7ofhUjPq9F2KXq+2Z y7yP3+k5eNCRMrakWOPlm6eIxB24HvrLbf760fTeCcQu5b/qlXMhpecyVB/F0VkMtoBL zMVf+teQCmyVijKGcfvhfnSjCaUXm3PFBJyUc= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=aNHAhNI6ZTS0Tgwm1Tz8nA62WMbjyUFexQ8Av4x5DPE=; b=C1AkGryBJV5JJpUZ53a/oupaPlLa95y0sqtDu3lyEhbYFw/GpjBZGFgJZFn5Bmdtxf +0utXrZ9dPRiBcFWimSEHsvVRGnpiadcAeH9yn3aFfm/IXsnezeCClY/Ri8szBjSWxSX fYacm21GqYvMDPtW/uhwiEhhL6DGfjcrFPGytWkT3Y2WAdW6geA5gtvHicsxoX7PlHHV 81zr2FnXPFqEI7Y7nqKafGklJgZ/1pro6G8EZZ9/jkjPqgHkEpdp17t+3HSsPKjj3cKT PHwjKjqJda+dbsYKIZqmSK2ckCfvjRO+349jGEfr60Pmf5SaRYuQMgYiKXZCtFkUgQq9 ypCg== X-Gm-Message-State: AOAM530tjQFlFg1b5nKdEAzyHmYcemFTfsShls/YvJG56YVyZ57eNXmw 5HnRY+MVEa6z/XMw8PfSoVn1Zg== X-Google-Smtp-Source: ABdhPJxI3CY4yrnJgeIk7sRjLPfxIjZxfINYy/6N9uGIlb1gaIy+PlZdT78JgpuwV7WO9L49REPdQw== X-Received: by 2002:aa7:972b:: with SMTP id k11mr1003541pfg.299.1592293785373; Tue, 16 Jun 2020 00:49:45 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. 
[198.145.64.163]) by smtp.gmail.com with ESMTPSA id t76sm14171409pfc.220.2020.06.16.00.49.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:42 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 2/8] seccomp: Use pr_fmt Date: Tue, 16 Jun 2020 00:49:28 -0700 Message-Id: <20200616074934.1600036-3-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: Avoid open-coding "seccomp: " prefixes for pr_*() calls. Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0016cad0e605..a319700c04c4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. 
*/ +#define pr_fmt(fmt) "seccomp: " fmt #include #include @@ -1873,7 +1874,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); From patchwork Tue Jun 16 07:49:29 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606751 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 86BA3913 for ; Tue, 16 Jun 2020 07:50:06 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 6D79C208E4 for ; Tue, 16 Jun 2020 07:50:06 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="IcgNnBPc" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726881AbgFPHuA (ORCPT ); Tue, 16 Jun 2020 03:50:00 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57014 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726467AbgFPHtt (ORCPT ); Tue, 16 Jun 2020 03:49:49 -0400 Received: from mail-pl1-x633.google.com (mail-pl1-x633.google.com [IPv6:2607:f8b0:4864:20::633]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0376BC03E97C for ; Tue, 16 Jun 2020 00:49:47 -0700 (PDT) Received: by mail-pl1-x633.google.com with SMTP id d8so8013451plo.12 for ; Tue, 16 Jun 2020 00:49:47 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=CiCnefOEfqSi7wYDZ0Ly9qFUXTY1JQW6i7HQc69RyoQ=; b=IcgNnBPcUI4XxaHRZv+68h1umX7cyvIMCP36Q7y+O5a5sA6ytY5XtslKRv21o3+cf9 
X+FMmnNEztj2FpFUzVzclsbAf//w6fXPH21coPI4d9FweeYzdb0yM25T7mcIhl6EITHx x7tTwGEQtIjtktlXFsLdB31FXf9xbkZG5U6Uk= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=CiCnefOEfqSi7wYDZ0Ly9qFUXTY1JQW6i7HQc69RyoQ=; b=pC0iQw/qUl9FbhYKiadyL/93bgMn+SBGV/DQgOsFMUiVtv0PQba7A6SbNAjA38zaYq V7pZdgmcxJBKtdQGwKJSVALoH0+DEzeTV2EKczkfZRtnGQcQFyWdGm+tOTlQjy6NMML7 wgRsqqYEK1AfTM2r3Rct7xiY9eh0VoFq2/9Eh/gdHQKlRliF7rYwVi5oSgte8cIxdLns GNGBLiDpntC7o15BnMoeHazHB73Delu5f0IOrQ87n+BiRpLyNBJHav0WZ7+YNA/iHYvH M0d1Oaz7tWQl+7Bzx0iSIAs+hqser4ej/PwEH+N1gmIxV9OQ2rV/kxI7mvMtoM/DuvKD 6TVw== X-Gm-Message-State: AOAM531hKfQNA7Xr59DImOdmS2U6JryBhGOJgnoWRJWHt0GhJoWOSNyr e532tRZI9980t18+TugVr8bsLg== X-Google-Smtp-Source: ABdhPJxhW3y6afh5FWceB3QtLHRl8svdq02t3gc51fHES1LTZe4hExYi0JoBBUl7YmojYiOSd5noTw== X-Received: by 2002:a17:90a:4d4e:: with SMTP id l14mr1619537pjh.10.1592293784832; Tue, 16 Jun 2020 00:49:44 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. 
[198.145.64.163]) by smtp.gmail.com with ESMTPSA id z1sm1612258pjz.10.2020.06.16.00.49.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:42 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Andy Lutomirski , Will Drewry , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 3/8] seccomp: Introduce SECCOMP_PIN_ARCHITECTURE Date: Tue, 16 Jun 2020 00:49:29 -0700 Message-Id: <20200616074934.1600036-4-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: For systems that provide multiple syscall maps based on architectures (e.g. AUDIT_ARCH_X86_64 and AUDIT_ARCH_I386 via CONFIG_COMPAT), allow a fast way to pin the process to a specific syscall mapping, instead of needing to generate all filters with an architecture check as the first filter action. 
Cc: Andy Lutomirski Cc: Will Drewry Signed-off-by: Kees Cook --- include/linux/seccomp.h | 3 +++ include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 37 ++++++++++++++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index babcd6c02d09..6525ddec177a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -30,6 +30,9 @@ struct seccomp_filter; */ struct seccomp { int mode; +#ifdef CONFIG_COMPAT + u32 arch; +#endif atomic_t filter_count; struct seccomp_filter *filter; }; diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index c1735455bc53..84e89bb201ae 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -16,6 +16,7 @@ #define SECCOMP_SET_MODE_FILTER 1 #define SECCOMP_GET_ACTION_AVAIL 2 #define SECCOMP_GET_NOTIF_SIZES 3 +#define SECCOMP_PIN_ARCHITECTURE 4 /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index a319700c04c4..43edf53c2d84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -268,9 +268,16 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { u32 ret = SECCOMP_RET_ALLOW; + struct seccomp_filter *f; + +#ifdef CONFIG_COMPAT + /* Block mismatched architectures. */ + if (current->seccomp.arch && current->seccomp.arch != sd->arch) + return SECCOMP_RET_KILL_PROCESS; +#endif + /* Make sure cross-thread synced filter points somewhere sane. */ - struct seccomp_filter *f = - READ_ONCE(current->seccomp.filter); + f = READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(f == NULL)) @@ -478,6 +485,11 @@ static inline void seccomp_sync_threads(unsigned long flags) if (task_no_new_privs(caller)) task_set_no_new_privs(thread); +#ifdef CONFIG_COMPAT + /* Copy any pinned architecture. 
*/ + thread->seccomp.arch = caller->seccomp.arch; +#endif + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm @@ -1456,6 +1468,20 @@ static long seccomp_get_notif_sizes(void __user *usizes) return 0; } +static long seccomp_pin_architecture(void) +{ +#ifdef CONFIG_COMPAT + u32 arch = syscall_get_arch(current); + + /* How did you even get here? */ + if (current->seccomp.arch && current->seccomp.arch != arch) + return -EBUSY; + + current->seccomp.arch = arch; +#endif + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, void __user *uargs) @@ -1477,6 +1503,13 @@ static long do_seccomp(unsigned int op, unsigned int flags, return -EINVAL; return seccomp_get_notif_sizes(uargs); + case SECCOMP_PIN_ARCHITECTURE: + if (flags != 0) + return -EINVAL; + if (uargs != NULL) + return -EINVAL; + + return seccomp_pin_architecture(); default: return -EINVAL; } From patchwork Tue Jun 16 07:49:30 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606759 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 878F860D for ; Tue, 16 Jun 2020 07:50:40 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 5DB96206D7 for ; Tue, 16 Jun 2020 07:50:40 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="f9r7BI5F" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726526AbgFPHuY (ORCPT ); Tue, 16 Jun 2020 03:50:24 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57004 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726531AbgFPHts (ORCPT ); Tue, 
16 Jun 2020 03:49:48 -0400 Received: from mail-pg1-x542.google.com (mail-pg1-x542.google.com [IPv6:2607:f8b0:4864:20::542]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5AC43C08C5C9 for ; Tue, 16 Jun 2020 00:49:47 -0700 (PDT) Received: by mail-pg1-x542.google.com with SMTP id d4so1812986pgk.4 for ; Tue, 16 Jun 2020 00:49:47 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=hymiFj42whdy2D05bhHghrqvNPgonrcbk2GbFm3EsyI=; b=f9r7BI5FO+bkXtD3hSIV/gO/wykpcs5Nzg5uxvgwjdkEuE1YiLJ6bjd62HL+z1G7wf 9mJpAuojdEY7w8SJooynYRV1iiolNxDFS/JPxuueIwKEPebi8V9NBBP+WhY1V1A2Whbc r5Cq2o+4beGeb3ZnFnvnJTb4H0+uzqyzh9VZk= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=hymiFj42whdy2D05bhHghrqvNPgonrcbk2GbFm3EsyI=; b=pUB834TCefp6RxMaqQoj2pBYb9GtUUsxxHCgDb7NH6pxy1M8xoPvhKUbsYLxm9dALZ i9Sk8KmKwM719Ey1OOeQysO+3I1ijhkrVdeP7fSioBipBXhNRWc3GjvxSO76twadiEp2 7UsB90flOle0sYc0F5kOqVuoosC1b2YwhdzLLCGcXN8s6HTYSsJMXB/ULYqPFMhOxUKD dpkvUWkrrEK5xuFdw1un32sMgEdxon+/vkJFzs7DJT0xNjNIIu05d9LHtwisl44kR2Ft 2Fd1bgYWiLRmDc+9DGbvkaNdsWlayl9wN0edfwfnuRmsZkz41GqTL4sz2Jv+oushKe3t MOAg== X-Gm-Message-State: AOAM531lpERMGBz8xoJ27RODBaP/Ba+/R52K3zHjwAW0yiu66rqEKSPy aqDiqwf0+/7No3fOh3Bwlu30+w== X-Google-Smtp-Source: ABdhPJyJE2ecymVAnnpMx8JwXvxakKaTc+cHmhmj6xoqwH9R6Zju+NCUJsZ1oJjUnMfzMDAdkGqUiA== X-Received: by 2002:aa7:9289:: with SMTP id j9mr1077179pfa.124.1592293786493; Tue, 16 Jun 2020 00:49:46 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. 
[198.145.64.163]) by smtp.gmail.com with ESMTPSA id 140sm16357711pfz.154.2020.06.16.00.49.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:42 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 4/8] seccomp: Implement constant action bitmaps Date: Tue, 16 Jun 2020 00:49:30 -0700 Message-Id: <20200616074934.1600036-5-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: One of the most common pain points with seccomp filters has been dealing with the overhead of processing the filters, especially for "always allow" or "always reject" cases. While BPF is extremely fast[1], it will always have overhead associated with it. Additionally, due to seccomp's design, filters are layered, which means processing time goes up as the number of filters attached goes up. In the past, efforts have been focused on making filter execution complete in a shorter amount of time. For example, filters were rewritten from using linear if/then/else syscall search to using balanced binary trees, or moving tests for syscalls common to the process's workload to the front of the filter. However, there are limits to this, especially when some processes are dealing with tens of filters[2], or when some architectures have a less efficient BPF engine[3]. 
The most common use of seccomp, constructing syscall block/allow-lists, where syscalls are always allowed or always rejected (without regard to any arguments), also tends to produce the most pathological runtime problems, in that a large number of syscall checks in the filter need to be performed to come to a determination. In order to optimize these cases from O(n) to O(1), seccomp can use bitmaps to immediately determine the desired action. A critical observation in the prior paragraph bears repeating: in the common case, syscall tests do not check arguments. For any given filter, there is a constant mapping from the combination of architecture and syscall to the seccomp action result. (For kernels/architectures without CONFIG_COMPAT, there is a single architecture.) As such, it is possible to construct a mapping of arch/syscall to action, which can be updated as new filters are attached to a process. In order to build this mapping at filter attach time, each filter is executed for every syscall (under each possible architecture), and checked for any accesses of struct seccomp_data that are neither the "arch" nor the "nr" (syscall) members. If only "arch" and "nr" are examined, then there is a constant mapping for that syscall, and bitmaps can be updated accordingly. If any accesses happen outside of those struct members, seccomp must not bypass filter execution for that syscall, since program state will be used to determine the filter action result. During syscall action probing, in order to determine whether other members of struct seccomp_data are being accessed during a filter execution, the struct is placed across a page boundary with the "arch" and "nr" members in the first page, and everything else in the second page. The "page accessed" flag is cleared in the second page's PTE, and the filter is run. 
If the "page accessed" flag appears as set after running the filter, we can determine that the filter looked beyond the "arch" and "nr" members, and exclude that syscall from the constant action bitmaps. For architectures to support this optimization, they must declare their architectures for seccomp to see (via SECCOMP_ARCH and SECCOMP_ARCH_COMPAT macros), and provide a way to perform efficient CPU-local kernel TLB flushes (via local_flush_tlb_kernel_range()), and then set HAVE_ARCH_SECCOMP_BITMAP in their Kconfig. Areas needing more attention: On x86, this currently adds 168 bytes (or 336 bytes under CONFIG_COMPAT) to the size of task_struct. Allocating these on demand may be a better use of memory, but may not result in good cache locality. For architectures with "synthetic" architectures, like x86_x32, additional work is needed. It should be possible to define a simple mechanism based on the masking done in the x86 syscall entry path to create another set of bitmaps for seccomp to key off of. I am, however, considering just leaving HAVE_ARCH_SECCOMP_BITMAP depend on !X86_X32. [1] https://lore.kernel.org/bpf/20200531171915.wsxvdjeetmhpsdv2@ast-mbp.dhcp.thefacebook.com/ [2] https://lore.kernel.org/bpf/20200601101137.GA121847@gardel-login/ [3] https://lore.kernel.org/bpf/717a06e7f35740ccb4c70470ec70fb2f@huawei.com/ Signed-off-by: Kees Cook --- arch/Kconfig | 7 ++ include/linux/seccomp.h | 15 +++ kernel/seccomp.c | 227 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 246 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 8cc35dc556c7..4e692b7a4435 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -465,6 +465,13 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. 
+config HAVE_ARCH_SECCOMP_BITMAP + bool + help + An arch should select this symbol if it provides all of these things: + - SECCOMP_ARCH (and SECCOMP_ARCH_COMPAT if appropriate) + - local_flush_tlb_kernel_range() + config HAVE_ARCH_STACKLEAK bool help diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 6525ddec177a..31ee2d6f4ec0 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -16,6 +16,17 @@ #include #include +/* When no bits are set for a syscall, filters are run. */ +struct seccomp_bitmaps { +#ifdef CONFIG_HAVE_ARCH_SECCOMP_BITMAP + /* "allow" are initialized to set and only ever get cleared. */ + DECLARE_BITMAP(allow, NR_syscalls); + /* These are initialized to clear and only ever get set. */ + DECLARE_BITMAP(kill_thread, NR_syscalls); + DECLARE_BITMAP(kill_process, NR_syscalls); +#endif +}; + struct seccomp_filter; /** * struct seccomp - the state of a seccomp'ed process @@ -35,6 +46,10 @@ struct seccomp { #endif atomic_t filter_count; struct seccomp_filter *filter; + struct seccomp_bitmaps native; +#ifdef CONFIG_COMPAT + struct seccomp_bitmaps compat; +#endif }; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 43edf53c2d84..2fbe7d2260f7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,11 @@ #include #include +#ifdef CONFIG_HAVE_ARCH_SECCOMP_BITMAP +#include +#include +#endif + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -476,6 +481,16 @@ static inline void seccomp_sync_threads(unsigned long flags) atomic_set(&thread->seccomp.filter_count, atomic_read(&thread->seccomp.filter_count)); + /* Copy syscall filter bitmaps. 
*/ + memcpy(&thread->seccomp.native, + &caller->seccomp.native, + sizeof(caller->seccomp.native)); +#ifdef CONFIG_COMPAT + memcpy(&thread->seccomp.compat, + &caller->seccomp.compat, + sizeof(caller->seccomp.compat)); +#endif + /* * Don't let an unprivileged task work around * the no_new_privs restriction by creating @@ -578,6 +593,144 @@ seccomp_prepare_user_filter(const char __user *user_filter) return filter; } +static inline bool sd_touched(pte_t *ptep) +{ + return !!pte_young(*(READ_ONCE(ptep))); +} + +#ifdef CONFIG_HAVE_ARCH_SECCOMP_BITMAP +/* + * We can build bitmaps only when an arch/nr combination reads nothing more + * than sd->nr and sd->arch, since those have a constant mapping to the + * syscall. To do this, we can run the filters for each syscall number, and + * examine the page table entry that is aligned to everything past sd->arch, + * checking for the ACCESSED flag. + * + * This approach could also be used to test for access to sd->arch too, + * if we wanted to warn about compat-unsafe filters. + */ +static void seccomp_update_bitmap(struct seccomp_filter *filter, + void *pagepair, u32 arch, + struct seccomp_bitmaps *bitmaps) +{ + struct seccomp_data *sd; + unsigned long vaddr; + u32 nr, ret; + pte_t *ptep; + u64 check; + + /* Initialize bitmaps for first filter. */ + if (!filter->prev) + bitmap_fill(bitmaps->allow, NR_syscalls); + /* + * Prepare to detect memory accesses: find the PTE for the second page + * in the page pair. + */ + vaddr = (unsigned long)(pagepair + PAGE_SIZE); + ptep = virt_to_kpte(vaddr); + /* + * Split struct seccomp_data across two pages, with everything after + * sd->arch (i.e. starting with sd->instruction_pointer), in the second + * page of the page pair. + */ + sd = pagepair + PAGE_SIZE - offsetof(struct seccomp_data, instruction_pointer); + + /* Mark the second page as untouched (i.e. 
"old") */ + preempt_disable(); + set_pte_at(&init_mm, vaddr, ptep, pte_mkold(*(READ_ONCE(ptep)))); + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); + preempt_enable(); + /* Make sure the PTE agrees that it is untouched. */ + if (WARN_ON_ONCE(sd_touched(ptep))) + return; + /* Read a portion of struct seccomp_data from the second page. */ + check = sd->instruction_pointer; + /* First, verify the contents are zero from vzalloc(). */ + if (WARN_ON_ONCE(check)) + return; + /* Now make sure the ACCESSED bit has been set after the read. */ + if (!sd_touched(ptep)) { + /* + * If autodetection fails, fall back to standard behavior by + * clearing the entire "allow" bitmap. + */ + pr_warn_once("seccomp: cannot build automatic syscall filters\n"); + bitmap_zero(bitmaps->allow, NR_syscalls); + return; + } + + /* + * For every syscall, if we don't already know we need to run + * the full filter, simulate the filter with our static values. + */ + for (nr = 0; nr < NR_syscalls; nr++) { + /* Are we already at the maximal rejection state? */ + if (test_bit(nr, bitmaps->kill_process)) + continue; + + sd->nr = nr; + sd->arch = arch; + + /* Do we need to reset the ACCESSED bit? */ + if (sd_touched(ptep)) { + preempt_disable(); + set_pte_at(&init_mm, vaddr, ptep, pte_mkold(*(READ_ONCE(ptep)))); + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); + preempt_enable(); + } + + /* Evaluate filter for this syscall. */ + ret = bpf_prog_run_pin_on_cpu(filter->prog, sd); + /* + * If this run through the filter didn't access + * beyond "arch", we know the result is a constant + * mapping for arch/nr -> ret. + */ + if (!sd_touched(ptep)) { + /* Constant evaluation. Mark appropriate bitmaps. 
*/ + switch (ret) { + case SECCOMP_RET_KILL_PROCESS: + set_bit(nr, bitmaps->kill_process); + break; + case SECCOMP_RET_KILL_THREAD: + set_bit(nr, bitmaps->kill_thread); + break; + default: + break; + case SECCOMP_RET_ALLOW: + /* + * If we always map to allow, there are + * no changes needed to the bitmaps. + */ + continue; + } + } + + /* + * Dynamic evaluation of syscall, or non-allow constant + * mapping to something other than SECCOMP_RET_ALLOW: we + * must not short-circuit-allow it anymore. + */ + clear_bit(nr, bitmaps->allow); + } +} + +static void seccomp_update_bitmaps(struct seccomp_filter *filter, + void *pagepair) +{ + seccomp_update_bitmap(filter, pagepair, SECCOMP_ARCH, + &current->seccomp.native); +#ifdef CONFIG_COMPAT + seccomp_update_bitmap(filter, pagepair, SECCOMP_ARCH_COMPAT, + &current->seccomp.compat); +#endif +} +#else +static void seccomp_update_bitmaps(struct seccomp_filter *filter, + void *pagepair) +{ } +#endif + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -591,7 +744,8 @@ seccomp_prepare_user_filter(const char __user *user_filter) * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, - struct seccomp_filter *filter) + struct seccomp_filter *filter, + void *pagepair) { unsigned long total_insns; struct seccomp_filter *walker; @@ -630,6 +784,9 @@ static long seccomp_attach_filter(unsigned int flags, current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); + /* Evaluate filter for new known-outcome syscalls */ + seccomp_update_bitmaps(filter, pagepair); + /* Now that the new filter is in place, synchronize to all threads. 
*/ if (flags & SECCOMP_FILTER_FLAG_TSYNC) seccomp_sync_threads(flags); @@ -857,6 +1014,56 @@ static int seccomp_do_user_notification(int this_syscall, return -1; } +#ifdef CONFIG_HAVE_ARCH_SECCOMP_BITMAP +static inline bool __bypass_filter(struct seccomp_bitmaps *bitmaps, + u32 nr, u32 *filter_ret) +{ + if (nr < NR_syscalls) { + if (test_bit(nr, current->seccomp.native.allow)) { + *filter_ret = SECCOMP_RET_ALLOW; + return true; + } + if (test_bit(nr, current->seccomp.native.kill_process)) { + *filter_ret = SECCOMP_RET_KILL_PROCESS; + return true; + } + if (test_bit(nr, current->seccomp.native.kill_thread)) { + *filter_ret = SECCOMP_RET_KILL_THREAD; + return true; + } + } + return false; +} + +static inline u32 check_syscall(const struct seccomp_data *sd, + struct seccomp_filter **match) +{ + u32 filter_ret = SECCOMP_RET_KILL_PROCESS; + +#ifdef CONFIG_COMPAT + if (sd->arch == SECCOMP_ARCH) { +#endif + if (__bypass_filter(&current->seccomp.native, sd->nr, &filter_ret)) + return filter_ret; +#ifdef CONFIG_COMPAT + } else if (sd->arch == SECCOMP_ARCH_COMPAT) { + if (__bypass_filter(&current->seccomp.compat, sd->nr, &filter_ret)) + return filter_ret; + } else { + WARN_ON_ONCE(1); + return filter_ret; + } +#endif + return seccomp_run_filters(sd, match); +} +#else +static inline u32 check_syscall(const struct seccomp_data *sd, + struct seccomp_filter **match) +{ + return seccomp_run_filters(sd, match); +} +#endif + static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { @@ -876,7 +1083,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, sd = &sd_local; } - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = check_syscall(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1346,6 +1553,7 @@ static long seccomp_set_mode_filter(unsigned int flags, long ret = -EINVAL; int listener = -1; struct file *listener_f = NULL; + void *pagepair; /* 
Validate flags. */ if (flags & ~SECCOMP_FILTER_FLAG_MASK) @@ -1391,12 +1599,24 @@ static long seccomp_set_mode_filter(unsigned int flags, mutex_lock_killable(&current->signal->cred_guard_mutex)) goto out_put_fd; + /* + * This memory will be needed for bitmap testing, but we'll + * be holding a spinlock at that point. Do the allocation + * (and free) outside of the lock. + * + * Alternative: we could do the bitmap update before attach + * to avoid spending too much time under lock. + */ + pagepair = vzalloc(PAGE_SIZE * 2); + if (!pagepair) + goto out_put_fd; + spin_lock_irq(&current->sighand->siglock); if (!seccomp_may_assign_mode(seccomp_mode)) goto out; - ret = seccomp_attach_filter(flags, prepared); + ret = seccomp_attach_filter(flags, prepared, pagepair); if (ret) goto out; /* Do not free the successfully attached filter. */ @@ -1405,6 +1625,7 @@ static long seccomp_set_mode_filter(unsigned int flags, seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(&current->sighand->siglock); + vfree(pagepair); if (flags & SECCOMP_FILTER_FLAG_TSYNC) mutex_unlock(&current->signal->cred_guard_mutex); out_put_fd: From patchwork Tue Jun 16 07:49:31 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606747 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id E10DE913 for ; Tue, 16 Jun 2020 07:50:01 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id BFDD920857 for ; Tue, 16 Jun 2020 07:50:01 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="IaxMJQ7n" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727029AbgFPHuA (ORCPT ); Tue, 16 Jun 2020 03:50:00 -0400 Received: from lindbergh.monkeyblade.net 
([23.128.96.19]:56992 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726755AbgFPHtt (ORCPT ); Tue, 16 Jun 2020 03:49:49 -0400 Received: from mail-pl1-x642.google.com (mail-pl1-x642.google.com [IPv6:2607:f8b0:4864:20::642]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 12D1EC08C5CB for ; Tue, 16 Jun 2020 00:49:48 -0700 (PDT) Received: by mail-pl1-x642.google.com with SMTP id 35so2683106ple.0 for ; Tue, 16 Jun 2020 00:49:48 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=wt0dnmhR56V/P6llfsBTHQQvjngTx36J9Qjg5uVymmU=; b=IaxMJQ7nBNiW5yMR3uEv+C0sfD9cWFXioN0idRkD1TyTDHTD3tjSieVkLQPYYJXrN9 UG6DPRLyd6b0Nkl+H0xAFSlOV80Bgr+l1zUu6QPhgWP8WQRiciWN8PPoIcTdZS1q5VcL Z/2hOwYi9tyaYY0xQbA4ecFT+gFRiaKCQId3A= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=wt0dnmhR56V/P6llfsBTHQQvjngTx36J9Qjg5uVymmU=; b=WITQQUsB5CXoerPJx/3HZbA9zt9s5zVdnYCUkHmfawSVBDo+7NGl1xgGCLhcUbI88y uBSx3ArQekiySuqgmTWZhOoRFgztgVBY1CJ0UQrHDogGwPWZ54WmtSAIbeevSS4SHOwj PmB/5VB9BSpL0R1Heu0MnH7dB9pcgQuiyEYUweITrFi+4OhB0UWAku2LZ1gqdg33RQyA eitcdDtCuvQqxvIk2BA+hRWexJY7uUMJd8DwlKJE8yXD9rqupkvahLh/lBlC+U2uKGf2 EljBEgG75WriKGOn+aV9wziSYeya9Bxxt0ZmBMKKyiaWuirmPIsK8K8ACLlo15L48uPc LHMA== X-Gm-Message-State: AOAM5339b9jdEa4UzIuwuVuVy4VXtHvoCkRTekqJgwwti2qR+AzjNw25 D+d57SdINsFPnMpo0Z++Xitqbw== X-Google-Smtp-Source: ABdhPJz51Cj3QahaU02gjrHM4b4Cws12LNOG8GBUTha+3SWwT0/3aI1Q636W96Qx6LeNZBs7QwuWyA== X-Received: by 2002:a17:902:b710:: with SMTP id d16mr1054700pls.28.1592293787567; Tue, 16 Jun 2020 00:49:47 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. 
[198.145.64.163]) by smtp.gmail.com with ESMTPSA id n65sm15622327pfn.17.2020.06.16.00.49.44 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:46 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 5/8] selftests/seccomp: Compare bitmap vs filter overhead Date: Tue, 16 Jun 2020 00:49:31 -0700 Message-Id: <20200616074934.1600036-6-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: As part of the seccomp benchmarking, include the expectations with regard to the timing behavior of the constant action bitmaps, and report inconsistencies better. Example output with constant action bitmaps on x86: $ sudo ./seccomp_benchmark 30344920 Current BPF sysctl settings: net.core.bpf_jit_enable = 1 net.core.bpf_jit_harden = 0 Benchmarking 30344920 syscalls... 
22.113430452 - 0.005691205 = 22107739247 (22.1s) getpid native: 728 ns 44.867669556 - 22.113755935 = 22753913621 (22.8s) getpid RET_ALLOW 1 filter (bitmap): 749 ns 67.649040358 - 44.868003056 = 22781037302 (22.8s) getpid RET_ALLOW 2 filters (bitmap): 750 ns 92.555661414 - 67.650328959 = 24905332455 (24.9s) getpid RET_ALLOW 3 filters (full): 820 ns 118.170831065 - 92.556057543 = 25614773522 (25.6s) getpid RET_ALLOW 4 filters (full): 844 ns Estimated total seccomp overhead for 1 bitmapped filter: 21 ns Estimated total seccomp overhead for 2 bitmapped filters: 22 ns Estimated total seccomp overhead for 3 full filters: 92 ns Estimated total seccomp overhead for 4 full filters: 116 ns Estimated seccomp entry overhead: 20 ns Estimated seccomp per-filter overhead (last 2 diff): 24 ns Estimated seccomp per-filter overhead (filters / 4): 24 ns Expectations: native ≤ 1 bitmap (728 ≤ 749): ✔️ native ≤ 1 filter (728 ≤ 820): ✔️ per-filter (last 2 diff) ≈ per-filter (filters / 4) (24 ≈ 24): ✔️ 1 bitmapped ≈ 2 bitmapped (21 ≈ 22): ✔️ entry ≈ 1 bitmapped (20 ≈ 21): ✔️ entry ≈ 2 bitmapped (20 ≈ 22): ✔️ native + entry + (per filter * 4) ≈ 4 filters total (844 ≈ 844): ✔️ Signed-off-by: Kees Cook --- .../selftests/seccomp/seccomp_benchmark.c | 151 +++++++++++++++--- tools/testing/selftests/seccomp/settings | 2 +- 2 files changed, 130 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 91f5a89cadac..fcc806585266 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -4,12 +4,16 @@ */ #define _GNU_SOURCE #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include #include @@ -70,18 +74,74 @@ unsigned long long calibrate(void) return samples * seconds; } +bool approx(int i_one, int i_two) +{ + double one = i_one, one_bump = one * 0.01; + double two = 
i_two, two_bump = two * 0.01; + + one_bump = one + MAX(one_bump, 2.0); + two_bump = two + MAX(two_bump, 2.0); + + /* Equal to, or within 1% or 2 digits */ + if (one == two || + (one > two && one <= two_bump) || + (two > one && two <= one_bump)) + return true; + return false; +} + +bool le(int i_one, int i_two) +{ + if (i_one <= i_two) + return true; + return false; +} + +long compare(const char *name_one, const char *name_eval, const char *name_two, + unsigned long long one, bool (*eval)(int, int), unsigned long long two) +{ + bool good; + + printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, + (long long)one, name_eval, (long long)two); + if (one > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); + return 1; + } + if (two > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); + return 1; + } + + good = eval(one, two); + printf("%s\n", good ? "✔️" : "❌"); + + return good ? 0 : 1; +} + int main(int argc, char *argv[]) { + struct sock_filter bitmap_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog bitmap_prog = { + .len = (unsigned short)ARRAY_SIZE(bitmap_filter), + .filter = bitmap_filter, + }; struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; - long ret; - unsigned long long samples; - unsigned long long native, filter1, filter2; + + long ret, bits; + unsigned long long samples, calc; + unsigned long long native, filter1, filter2, bitmap1, bitmap2; + unsigned long long entry, per_filter1, per_filter2; printf("Current BPF sysctl settings:\n"); system("sysctl net.core.bpf_jit_enable"); @@ -101,35 +161,82 @@ int main(int argc, char *argv[]) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 
0); assert(ret == 0); - /* One filter */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + /* One filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); - filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); + bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); + + /* Second filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - if (filter1 == native) - printf("No overhead measured!? Try running again with more samples.\n"); + bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); - /* Two filters */ + /* Third filter, can no longer be converted to bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); - filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); - - /* Calculations */ - printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", - filter1 - native); + filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); - printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", - filter2 - native); + /* Fourth filter, can not be converted to bitmap because of filter 3 */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); - printf("Estimated seccomp per-filter overhead: %llu ns\n", - filter2 - filter1); + filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); + + /* Estimations */ +#define ESTIMATE(fmt, var, what) do { \ + var = (what); \ + printf("Estimated " fmt ": %llu ns\n", var); \ + if (var > INT_MAX) \ + goto 
more_samples; \ + } while (0) + + ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, + bitmap1 - native); + ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc, + bitmap2 - native); + ESTIMATE("total seccomp overhead for 3 full filters", calc, + filter1 - native); + ESTIMATE("total seccomp overhead for 4 full filters", calc, + filter2 - native); + ESTIMATE("seccomp entry overhead", entry, + bitmap1 - native - (bitmap2 - bitmap1)); + ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1, + filter2 - filter1); + ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, + (filter2 - native - entry) / 4); + + printf("Expectations:\n"); + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); + bits = compare("native", "≤", "1 filter", native, le, filter1); + if (bits) + goto more_samples; + + ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", + per_filter1, approx, per_filter2); + + bits = compare("1 bitmapped", "≈", "2 bitmapped", + bitmap1 - native, approx, bitmap2 - native); + if (bits) { + printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); + goto out; + } - printf("Estimated seccomp entry overhead: %llu ns\n", - filter1 - native - (filter2 - filter1)); + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); + ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", + entry + (per_filter1 * 4) + native, approx, filter2); + if (ret == 0) + goto out; +more_samples: + printf("Saw unexpected benchmark result. 
Try running again with more samples?\n"); +out: return 0; } diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings index ba4d85f74cd6..6091b45d226b 100644 --- a/tools/testing/selftests/seccomp/settings +++ b/tools/testing/selftests/seccomp/settings @@ -1 +1 @@ -timeout=90 +timeout=120 From patchwork Tue Jun 16 07:49:32 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606749 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id B6FF160D for ; Tue, 16 Jun 2020 07:50:03 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id A07E2207C4 for ; Tue, 16 Jun 2020 07:50:03 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="iqIaqckV" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726879AbgFPHuB (ORCPT ); Tue, 16 Jun 2020 03:50:01 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57004 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726912AbgFPHt5 (ORCPT ); Tue, 16 Jun 2020 03:49:57 -0400 Received: from mail-pj1-x1041.google.com (mail-pj1-x1041.google.com [IPv6:2607:f8b0:4864:20::1041]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D4B3DC08C5C5 for ; Tue, 16 Jun 2020 00:49:49 -0700 (PDT) Received: by mail-pj1-x1041.google.com with SMTP id i4so1168889pjd.0 for ; Tue, 16 Jun 2020 00:49:49 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=ZajOneOdlQKQYlo8Wv9k+OekhqsO1R3/YXgaBEECIjE=; b=iqIaqckVAQUEvZtp4+/qsmQGE37BQNoO7PPWo/5IR2umOLzuafxrc36ojsqisBA3bi 
4zDJkmA8POCcTYB6xD7jmfglS/ATrJpAhXrtlCt2PRv4l86gJXFiKqXPxzD8cJrtBfyx OCYp+xbwpnhRS2dDLpCWXdlx5v5ovlS/TJnjk= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=ZajOneOdlQKQYlo8Wv9k+OekhqsO1R3/YXgaBEECIjE=; b=ANhMfjCkrICPhNtn0NxeAWhY2gx+y5P8vZNL4m9WwPpgNC9jfmAPN8uPlIfkUxYPWN Ri0AHUf2sG056eny9tWAKCTp/L0JzzcFvTtfW+WI2knWJVwTSW5JHoZQzv1fdmHvZ45w K1PuG/5V2fCGpi2ruC+BeRVPbjHqijIzR+XYAVg+uPtAETW9qvtTU6X7XlYY/j5q4tGR EY8ncOEuFW1ftSZzvBYoumgsKV30xrcrFdMhJZCCc44XMq1gaquYsxBEzjA1PKYJhT5+ LF2/0TDlYMpfu0+mIsxQ0fInqkaA7qGzVNRYANua23enk6OGkruczKDfCusmQBMS8O5g qGcg== X-Gm-Message-State: AOAM530Ad/HL44h/OUy8IQaIRux25n1KXak31reW4IjJYhIxccByjIUH PyVEk2Qw+Zg8RPjYpBU1/E9R/A== X-Google-Smtp-Source: ABdhPJzFGf3zdAH2lmOL+oV+0KAfuNbBFFTLvoCLw3Vpzi0gDjZEe/FxPo59Sd/M81CmJjESIzVUVw== X-Received: by 2002:a17:90a:f508:: with SMTP id cs8mr1719058pjb.16.1592293789402; Tue, 16 Jun 2020 00:49:49 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. 
[198.145.64.163]) by smtp.gmail.com with ESMTPSA id m22sm17139899pfk.216.2020.06.16.00.49.44 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:46 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 6/8] x86: Provide API for local kernel TLB flushing Date: Tue, 16 Jun 2020 00:49:32 -0700 Message-Id: <20200616074934.1600036-7-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: The seccomp constant action bitmap filter evaluation routine depends on being able to quickly clear the PTE "accessed" bit for a temporary allocation. Provide access to the existing CPU-local kernel memory TLB flushing routines. 
Signed-off-by: Kees Cook --- arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/tlb.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8c87a2e0b660..ae853e77d6bc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -228,6 +228,8 @@ extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables); +extern void local_flush_tlb_kernel_range(unsigned long start, + unsigned long end); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1a3569b43aa5..ffcf2bd0ce1c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -959,16 +959,22 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } -static void do_kernel_range_flush(void *info) +void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) { - struct flush_tlb_info *f = info; unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->start; addr < f->end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) flush_tlb_one_kernel(addr); } +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + + local_flush_tlb_kernel_range(f->start, f->end); +} + void flush_tlb_kernel_range(unsigned long start, unsigned long end) { /* Balance as user space task's flush, a bit conservative */ From patchwork Tue Jun 16 07:49:33 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606755 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 
A60A560D for ; Tue, 16 Jun 2020 07:50:21 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 8A297206D7 for ; Tue, 16 Jun 2020 07:50:21 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="mBMsu/H3" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726947AbgFPHuI (ORCPT ); Tue, 16 Jun 2020 03:50:08 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57000 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726868AbgFPHt5 (ORCPT ); Tue, 16 Jun 2020 03:49:57 -0400 Received: from mail-pf1-x441.google.com (mail-pf1-x441.google.com [IPv6:2607:f8b0:4864:20::441]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id ACCDAC0A88B4 for ; Tue, 16 Jun 2020 00:49:48 -0700 (PDT) Received: by mail-pf1-x441.google.com with SMTP id s23so9104473pfh.7 for ; Tue, 16 Jun 2020 00:49:48 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=LpGj0LlWhL2sX6NdVIScVPST1vvxv4HP5Xz6PQqXsUs=; b=mBMsu/H37l5/82lF78S3TFrEkOg5oaDeSPvONr2tsAmEU8cRaPFfhq+CvSo+fcCXID IaaYZg3VqQimghB/9or2O9e8aLGitYBJVNAhSN6fjjNL8HkLB1nz+Zgd5nThB33PKaA+ TdE8nGpBIfY9arDLD7VJCBRTPkHDuKf0fytoA= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=LpGj0LlWhL2sX6NdVIScVPST1vvxv4HP5Xz6PQqXsUs=; b=mWsjVUOSkwvC8HRq8ZOdRerci/wwocktrC1vekpU85mg93vYm8F2t8pwvQZ3YnQ3Mw XT6Hgk2x1vrPx72RVHuldX0dcGIhCut9A/eRERxbecVCtcdaEDYSUwjvEK7HwVCqIZ9c Kb9JxvE+y9XhlDOxR6FIWh39+LdO7oP8gWZkqOFAECr921t1CJiwyneBcQUP/Z4iE9ge S2FD7IWr2AduJIBZqFFVkw94oDlI5zOTJp+7IQnNKbVcHFHnVbGgbj5pE/SAKdhnNr5a 
3o86e1ppRoeo1e48flME+9jJwrKTPDI7fMaTEMC+DNls36QKZHBxk1FSbWMyucVTZMSm JU/Q== X-Gm-Message-State: AOAM532eNAccX3EZTgr6OB1vd4GYqLbXgTbz0WuLzE4pbpDzDWhUDlxv OUFrRQmndATHOOY0Y2zso7D66w== X-Google-Smtp-Source: ABdhPJySFd7sCS5BogiTiTKCUdeWwjE8zpkNMModdzfibmP1Pshjj5uSx9ng6G7MEky9Ar+74tfsDQ== X-Received: by 2002:a62:1bc5:: with SMTP id b188mr970686pfb.119.1592293788235; Tue, 16 Jun 2020 00:49:48 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. [198.145.64.163]) by smtp.gmail.com with ESMTPSA id l23sm13485938pgc.55.2020.06.16.00.49.44 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:46 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 7/8] x86: Enable seccomp constant action bitmaps Date: Tue, 16 Jun 2020 00:49:33 -0700 Message-Id: <20200616074934.1600036-8-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: Now that CPU-local kernel TLB flushes are available to seccomp, define the specific architectures seccomp should be expected to reason about, so that constant action bitmaps can be enabled for x86. TODO: handle x32 via a "synthetic architecture" check, like done in syscall entry. 
Signed-off-by: Kees Cook --- arch/x86/Kconfig | 1 + arch/x86/include/asm/syscall.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a0cc524882d..0f7a0abab88f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,6 +149,7 @@ config X86 select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SECCOMP_BITMAP if !X86_X32 select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7cbf733d11af..b89e86f4c061 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -97,6 +97,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs->bx + i, args, n * sizeof(args[0])); } +#define SECCOMP_ARCH AUDIT_ARCH_I386 static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; @@ -152,6 +153,10 @@ static inline void syscall_set_arguments(struct task_struct *task, } } +#define SECCOMP_ARCH AUDIT_ARCH_X86_64 +#ifdef CONFIG_COMPAT +#define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 +#endif static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. 
*/ From patchwork Tue Jun 16 07:49:34 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Kees Cook X-Patchwork-Id: 11606757 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 9972C60D for ; Tue, 16 Jun 2020 07:50:22 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 829EF206D7 for ; Tue, 16 Jun 2020 07:50:22 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=chromium.org header.i=@chromium.org header.b="h2BrY8BK" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726464AbgFPHuH (ORCPT ); Tue, 16 Jun 2020 03:50:07 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57006 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726950AbgFPHt6 (ORCPT ); Tue, 16 Jun 2020 03:49:58 -0400 Received: from mail-pl1-x641.google.com (mail-pl1-x641.google.com [IPv6:2607:f8b0:4864:20::641]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6F746C008630 for ; Tue, 16 Jun 2020 00:49:50 -0700 (PDT) Received: by mail-pl1-x641.google.com with SMTP id n9so8031540plk.1 for ; Tue, 16 Jun 2020 00:49:50 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=chromium.org; s=google; h=from:to:cc:subject:date:message-id:in-reply-to:references :mime-version:content-transfer-encoding; bh=UXEYLD7af7LJOjB8xnPNTPrt+WaWXkTb11rPY8DaG9o=; b=h2BrY8BKMYCkCsU3ZiGHQqvWYu+MzqkLnnp8a6bzu9p+/NEaFXUfSdN4wBcE1aqXeV LA1oJJ0czwCl5t+MiPZRo6sJHzWOJHE16oztLq7Xpft20dvdV3hag8ApW2Y4RoJ5MLKb wvY/4ueFcil/Y8V4OtWUc2a6E2mKkX2HP0OWs= X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; 
bh=UXEYLD7af7LJOjB8xnPNTPrt+WaWXkTb11rPY8DaG9o=; b=TH9O2T2gdVsiYXJP31iKWdXfBl6MV846nvM2d3wZGBFc6WUXDqCjTOUtiIdb6460qI tF/AhUaYrqOAHCkQ4MqCFVNnXV6n+raULODlXDhGIryfPdov67CcxG5T1qQ1eXPpfHlL ek3meCuWXZSWVr7Vyat1GpV7/kp459D6MxJoBC4mezO5cyic7xdjCK5Pxd8UsMeuee9U pb+234p4TijmahWMF5QZCC2Tq43jQibiQ7KgLn9gJTokezgaxOrDUqPv/V5ohDeLd58O AQUXVpoj1GxfgIwwycEL6TheaPJ651aWqyElpfAkrrdnYrpNs1FMcnxgHSb1fFlJQ/rm AwwQ== X-Gm-Message-State: AOAM533vigwcGL5t2eo/cyxC/rBOhOWIqikkop4Wgr+4cNY1CgtnjUkF jadYfnchVg6FAx1S08znNDGzbQ== X-Google-Smtp-Source: ABdhPJws+IxzhfQqZ/B7NKXZPC6tuHYhP5yRNSlhuhhCXbsMlnjbimxI9zgc2yJDjJa4JxGBDknPEA== X-Received: by 2002:a17:902:aa92:: with SMTP id d18mr1033453plr.210.1592293789983; Tue, 16 Jun 2020 00:49:49 -0700 (PDT) Received: from www.outflux.net (smtp.outflux.net. [198.145.64.163]) by smtp.gmail.com with ESMTPSA id x8sm1650796pje.31.2020.06.16.00.49.45 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 16 Jun 2020 00:49:46 -0700 (PDT) From: Kees Cook To: linux-kernel@vger.kernel.org Cc: Kees Cook , Christian Brauner , Sargun Dhillon , Tycho Andersen , Jann Horn , "zhujianwei (C)" , Dave Hansen , Matthew Wilcox , Andy Lutomirski , Will Drewry , Shuah Khan , Matt Denton , Chris Palmer , Jeffrey Vander Stoep , Aleksa Sarai , Hehuazhen , x86@kernel.org, Linux Containers , linux-security-module@vger.kernel.org, linux-api@vger.kernel.org Subject: [PATCH 8/8] [DEBUG] seccomp: Report bitmap coverage ranges Date: Tue, 16 Jun 2020 00:49:34 -0700 Message-Id: <20200616074934.1600036-9-keescook@chromium.org> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20200616074934.1600036-1-keescook@chromium.org> References: <20200616074934.1600036-1-keescook@chromium.org> MIME-Version: 1.0 Sender: owner-linux-security-module@vger.kernel.org Precedence: bulk List-ID: This is what I've been using to explore actual bitmap results for real-world filters. 
Signed-off-by: Kees Cook --- kernel/seccomp.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 2fbe7d2260f7..370b7ed9273b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -715,6 +715,85 @@ static void seccomp_update_bitmap(struct seccomp_filter *filter, } } +static void __report_bitmap(const char *arch, u32 ret, int start, int finish) +{ + int gap; + char *name; + + if (finish == -1) + return; + + switch (ret) { + case UINT_MAX: + name = "filter"; + break; + case SECCOMP_RET_ALLOW: + name = "SECCOMP_RET_ALLOW"; + break; + case SECCOMP_RET_KILL_PROCESS: + name = "SECCOMP_RET_KILL_PROCESS"; + break; + case SECCOMP_RET_KILL_THREAD: + name = "SECCOMP_RET_KILL_THREAD"; + break; + default: + WARN_ON_ONCE(1); + name = "unknown"; + break; + } + + gap = 0; + if (start < 100) + gap++; + if (start < 10) + gap++; + if (finish < 100) + gap++; + if (finish < 10) + gap++; + + if (start == finish) + pr_info("%s %3d: %s\n", arch, start, name); + else if (start + 1 == finish) + pr_info("%s %*s%d,%d: %s\n", arch, gap, "", start, finish, name); + else + pr_info("%s %*s%d-%d: %s\n", arch, gap, "", start, finish, name); +} + +static void report_bitmap(struct seccomp_bitmaps *bitmaps, const char *arch) +{ + u32 nr; + int start = 0, finish = -1; + u32 ret = UINT_MAX; + struct report_states { + unsigned long *bitmap; + u32 ret; + } states[] = { + { .bitmap = bitmaps->allow, .ret = SECCOMP_RET_ALLOW, }, + { .bitmap = bitmaps->kill_process, .ret = SECCOMP_RET_KILL_PROCESS, }, + { .bitmap = bitmaps->kill_thread, .ret = SECCOMP_RET_KILL_THREAD, }, + { .bitmap = NULL, .ret = UINT_MAX, }, + }; + + for (nr = 0; nr < NR_syscalls; nr++) { + int i; + + for (i = 0; i < ARRAY_SIZE(states); i++) { + if (!states[i].bitmap || test_bit(nr, states[i].bitmap)) { + if (ret != states[i].ret) { + __report_bitmap(arch, ret, start, finish); + ret = states[i].ret; + start = nr; + } + finish = nr; + break; + } + } 
+ } + if (start != nr) + __report_bitmap(arch, ret, start, finish); +} + static void seccomp_update_bitmaps(struct seccomp_filter *filter, void *pagepair) { @@ -724,6 +803,20 @@ static void seccomp_update_bitmaps(struct seccomp_filter *filter, seccomp_update_bitmap(filter, pagepair, SECCOMP_ARCH_COMPAT, &current->seccomp.compat); #endif + if (strncmp(current->comm, "test-", 5) == 0 || + strcmp(current->comm, "seccomp_bpf") == 0 || + /* + * Why are systemd's process names head-truncated to 8 bytes + * and wrapped in parens!? + */ + (current->comm[0] == '(' && strrchr(current->comm, ')') != NULL)) { + pr_info("reporting syscall bitmap usage for %d (%s):\n", + task_pid_nr(current), current->comm); + report_bitmap(&current->seccomp.native, "native"); +#ifdef CONFIG_COMPAT + report_bitmap(&current->seccomp.compat, "compat"); +#endif + } } #else static void seccomp_update_bitmaps(struct seccomp_filter *filter, @@ -783,6 +876,10 @@ static long seccomp_attach_filter(unsigned int flags, filter->prev = current->seccomp.filter; current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); + if (atomic_read(&current->seccomp.filter_count) > 10) + pr_info("%d filters: %d (%s)\n", + atomic_read(&current->seccomp.filter_count), + task_pid_nr(current), current->comm); /* Evaluate filter for new known-outcome syscalls */ seccomp_update_bitmaps(filter, pagepair); @@ -2131,6 +2228,16 @@ static int __init seccomp_sysctl_init(void) pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); +#ifndef CONFIG_HAVE_ARCH_SECCOMP_BITMAP + pr_info("arch lacks support for constant action bitmaps\n"); +#else + pr_info("NR_syscalls: %d\n", NR_syscalls); + pr_info("arch: 0x%x\n", SECCOMP_ARCH); +#ifdef CONFIG_COMPAT + pr_info("compat arch: 0x%x\n", SECCOMP_ARCH_COMPAT); +#endif +#endif + pr_info("sizeof(struct seccomp_bitmaps): %zu\n", sizeof(struct seccomp_bitmaps)); return 0; }