From patchwork Wed Mar  8 18:41:14 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166305
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 80C4EC678D5
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:32 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229653AbjCHSlb convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:31 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38646 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229529AbjCHSla (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:30 -0500
Received: from mx0b-00082601.pphosted.com (mx0b-00082601.pphosted.com
 [67.231.153.30])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 93C90ACE36
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:28 -0800 (PST)
Received: from pps.filterd (m0109332.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0h2f017413
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:27 -0800
Received: from mail.thefacebook.com ([163.114.132.120])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6ffbdwd7-1
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:27 -0800
Received: from twshared29091.48.prn1.facebook.com (2620:10d:c085:208::11) by
 mail.thefacebook.com (2620:10d:c085:21d::6) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:26 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id 52EDA299B7569; Wed,  8 Mar 2023 10:41:24 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 1/8] bpf: factor out fetching basic kfunc metadata
Date: Wed, 8 Mar 2023 10:41:14 -0800
Message-ID: <20230308184121.1165081-2-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: PCYTcOYstSJDVzCPGBWUfWR5LMAA7EZN
X-Proofpoint-ORIG-GUID: PCYTcOYstSJDVzCPGBWUfWR5LMAA7EZN
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Factor out logic to fetch basic kfunc metadata based on struct bpf_insn.
This is not exactly short or trivial code to just copy/paste and this
information is sometimes necessary in other parts of the verifier logic.
Subsequent patches will rely on this to determine if an instruction is
a kfunc call to iterator next method.

No functional changes intended, including that verbose() warning
behavior when kfunc is not allowed for a particular program type.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/bpf/verifier.c | 92 +++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 33 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b2116ca78d9a..8d40fba6a1c0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10079,24 +10079,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 	return 0;
 }
 
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			    int *insn_idx_p)
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn,
+			    struct bpf_kfunc_call_arg_meta *meta,
+			    const char **kfunc_name)
 {
-	const struct btf_type *t, *func, *func_proto, *ptr_type;
-	u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
-	struct bpf_reg_state *regs = cur_regs(env);
-	const char *func_name, *ptr_type_name;
-	bool sleepable, rcu_lock, rcu_unlock;
-	struct bpf_kfunc_call_arg_meta meta;
-	int err, insn_idx = *insn_idx_p;
-	const struct btf_param *args;
-	const struct btf_type *ret_t;
+	const struct btf_type *func, *func_proto;
+	u32 func_id, *kfunc_flags;
+	const char *func_name;
 	struct btf *desc_btf;
-	u32 *kfunc_flags;
 
-	/* skip for now, but return error when we find this in fixup_kfunc_call */
+	if (kfunc_name)
+		*kfunc_name = NULL;
+
 	if (!insn->imm)
-		return 0;
+		return -EINVAL;
 
 	desc_btf = find_kfunc_desc_btf(env, insn->off);
 	if (IS_ERR(desc_btf))
@@ -10105,22 +10102,51 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	func_id = insn->imm;
 	func = btf_type_by_id(desc_btf, func_id);
 	func_name = btf_name_by_offset(desc_btf, func->name_off);
+	if (kfunc_name)
+		*kfunc_name = func_name;
 	func_proto = btf_type_by_id(desc_btf, func->type);
 
 	kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
 	if (!kfunc_flags) {
-		verbose(env, "calling kernel function %s is not allowed\n",
-			func_name);
 		return -EACCES;
 	}
 
-	/* Prepare kfunc call metadata */
-	memset(&meta, 0, sizeof(meta));
-	meta.btf = desc_btf;
-	meta.func_id = func_id;
-	meta.kfunc_flags = *kfunc_flags;
-	meta.func_proto = func_proto;
-	meta.func_name = func_name;
+	memset(meta, 0, sizeof(*meta));
+	meta->btf = desc_btf;
+	meta->func_id = func_id;
+	meta->kfunc_flags = *kfunc_flags;
+	meta->func_proto = func_proto;
+	meta->func_name = func_name;
+
+	return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			    int *insn_idx_p)
+{
+	const struct btf_type *t, *ptr_type;
+	u32 i, nargs, ptr_type_id, release_ref_obj_id;
+	struct bpf_reg_state *regs = cur_regs(env);
+	const char *func_name, *ptr_type_name;
+	bool sleepable, rcu_lock, rcu_unlock;
+	struct bpf_kfunc_call_arg_meta meta;
+	struct bpf_insn_aux_data *insn_aux;
+	int err, insn_idx = *insn_idx_p;
+	const struct btf_param *args;
+	const struct btf_type *ret_t;
+	struct btf *desc_btf;
+
+	/* skip for now, but return error when we find this in fixup_kfunc_call */
+	if (!insn->imm)
+		return 0;
+
+	err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+	if (err == -EACCES && func_name)
+		verbose(env, "calling kernel function %s is not allowed\n", func_name);
+	if (err)
+		return err;
+	desc_btf = meta.btf;
+	insn_aux = &env->insn_aux_data[insn_idx];
 
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -10173,7 +10199,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		err = release_reference(env, regs[meta.release_regno].ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10185,14 +10211,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 
 		err = release_reference(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10202,7 +10228,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 					set_rbtree_add_callback_state);
 		if (err) {
 			verbose(env, "kfunc %s#%d failed callback verification\n",
-				func_name, func_id);
+				func_name, meta.func_id);
 			return err;
 		}
 	}
@@ -10211,7 +10237,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
 	/* Check return type */
-	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+	t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
 
 	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
 		/* Only exception is bpf_obj_new_impl */
@@ -10260,11 +10286,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].btf = ret_btf;
 				regs[BPF_REG_0].btf_id = ret_btf_id;
 
-				env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
-				env->insn_aux_data[insn_idx].kptr_struct_meta =
+				insn_aux->obj_new_size = ret_t->size;
+				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(ret_btf, ret_btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
-				env->insn_aux_data[insn_idx].kptr_struct_meta =
+				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(meta.arg_obj_drop.btf,
 							     meta.arg_obj_drop.btf_id);
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
@@ -10397,8 +10423,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].id = ++env->id_gen;
 	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
 
-	nargs = btf_type_vlen(func_proto);
-	args = (const struct btf_param *)(func_proto + 1);
+	nargs = btf_type_vlen(meta.func_proto);
+	args = (const struct btf_param *)(meta.func_proto + 1);
 	for (i = 0; i < nargs; i++) {
 		u32 regno = i + 1;
 

From patchwork Wed Mar  8 18:41:15 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166307
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 58946C742A7
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:37 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229686AbjCHSlf convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:35 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38728 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229761AbjCHSlf (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:35 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6E1EFAFB96
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:33 -0800 (PST)
Received: from pps.filterd (m0109334.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0P6D009105
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:33 -0800
Received: from mail.thefacebook.com ([163.114.132.120])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6feu63e2-1
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:32 -0800
Received: from twshared21709.17.frc2.facebook.com (2620:10d:c085:208::11) by
 mail.thefacebook.com (2620:10d:c085:11d::7) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:32 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id 61592299B757F; Wed,  8 Mar 2023 10:41:26 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 2/8] bpf: add iterator kfuncs registration and
 validation logic
Date: Wed, 8 Mar 2023 10:41:15 -0800
Message-ID: <20230308184121.1165081-3-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: jx_Xt_ZFacE9OBUuAfUMucDW34JZsBR1
X-Proofpoint-ORIG-GUID: jx_Xt_ZFacE9OBUuAfUMucDW34JZsBR1
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Add ability to register kfuncs that implement BPF open-coded iterator
contract and enforce naming and function proto convention. Enforcement
happens at the time of kfunc registration and significantly simplifies
the rest of iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistenly
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have arbitrary extra
  number of arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
  type and should have exactly one argument: `struct bpf_iter_<type> *`
  (const/volatile/restrict and typedefs are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
  should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
  a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allows to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf_verifier.h |   2 +
 include/linux/btf.h          |   4 ++
 kernel/bpf/btf.c             | 112 ++++++++++++++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };
 
+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a8cb09e5973b..71758cd15b07 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+				 const struct btf_type *func, u32 func_flags)
+{
+	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+	const char *name, *sfx, *iter_name;
+	const struct btf_param *arg;
+	const struct btf_type *t;
+	char exp_name[128];
+	u32 nr_args;
+
+	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+	if (!flags || (flags & (flags - 1)))
+		return -EINVAL;
+
+	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+	nr_args = btf_type_vlen(func);
+	if (nr_args < 1)
+		return -EINVAL;
+
+	arg = &btf_params(func)[0];
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!t || !btf_type_is_ptr(t))
+		return -EINVAL;
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+	if (!t || !__btf_type_is_struct(t))
+		return -EINVAL;
+
+	name = btf_name_by_offset(btf, t->name_off);
+	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+		return -EINVAL;
+
+	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+	 * fit nicely in stack slots
+	 */
+	if (t->size == 0 || (t->size % 8))
+		return -EINVAL;
+
+	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+	 * naming pattern
+	 */
+	iter_name = name + sizeof(ITER_PREFIX) - 1;
+	if (flags & KF_ITER_NEW)
+		sfx = "new";
+	else if (flags & KF_ITER_NEXT)
+		sfx = "next";
+	else /* (flags & KF_ITER_DESTROY) */
+		sfx = "destroy";
+
+	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+	if (strcmp(func_name, exp_name))
+		return -EINVAL;
+
+	/* only iter constructor should have extra arguments */
+	if (!(flags & KF_ITER_NEW) && nr_args != 1)
+		return -EINVAL;
+
+	if (flags & KF_ITER_NEXT) {
+		/* bpf_iter_<type>_next() should return pointer */
+		t = btf_type_skip_modifiers(btf, func->type, NULL);
+		if (!t || !btf_type_is_ptr(t))
+			return -EINVAL;
+	}
+
+	if (flags & KF_ITER_DESTROY) {
+		/* bpf_iter_<type>_destroy() should return void */
+		t = btf_type_by_id(btf, func->type);
+		if (!t || !btf_type_is_void(t))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+	const struct btf_type *func;
+	const char *func_name;
+	int err;
+
+	/* any kfunc should be FUNC -> FUNC_PROTO */
+	func = btf_type_by_id(btf, func_id);
+	if (!func || !btf_type_is_func(func))
+		return -EINVAL;
+
+	/* sanity check kfunc name */
+	func_name = btf_name_by_offset(btf, func->name_off);
+	if (!func_name || !func_name[0])
+		return -EINVAL;
+
+	func = btf_type_by_id(btf, func->type);
+	if (!func || !btf_type_is_func_proto(func))
+		return -EINVAL;
+
+	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Kernel Function (kfunc) BTF ID set registration API */
 
 static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 				       const struct btf_kfunc_id_set *kset)
 {
 	struct btf *btf;
-	int ret;
+	int ret, i;
 
 	btf = btf_get_module_btf(kset->owner);
 	if (!btf) {
@@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
+	for (i = 0; i < kset->set->cnt; i++) {
+		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+					     kset->set->pairs[i].flags);
+		if (ret)
+			goto err_out;
+	}
+
 	ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
 	btf_put(btf);
 	return ret;
 }

From patchwork Wed Mar  8 18:41:16 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166308
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 6B613C678D5
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:41 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229761AbjCHSlj convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:39 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38734 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229764AbjCHSli (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:38 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9F877B1A45
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:34 -0800 (PST)
Received: from pps.filterd (m0148461.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0MxI008334
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:34 -0800
Received: from maileast.thefacebook.com ([163.114.130.16])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6fekp2t2-2
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:34 -0800
Received: from twshared38955.16.prn3.facebook.com (2620:10d:c0a8:1b::d) by
 mail.thefacebook.com (2620:10d:c0a8:82::f) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:30 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id 71C91299B759F; Wed,  8 Mar 2023 10:41:28 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 3/8] bpf: add support for open-coded iterator
 loops
Date: Wed, 8 Mar 2023 10:41:16 -0800
Message-ID: <20230308184121.1165081-4-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: X8bYKAf-H3wy7OCYSVo5tQNfj-tb1qFh
X-Proofpoint-ORIG-GUID: X8bYKAf-H3wy7OCYSVo5tQNfj-tb1qFh
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Teach verifier about the concept of the open-coded (or inline) iterators.

This patch adds generic iterator loop verification logic, new STACK_ITER
stack slot type to contain iterator state, and necessary kfunc plumbing
for iterator's constructor, destructor and next methods. Next patch
implements first specific iterator (numbers iterator for implementing
for() loop logic). Such split allows to have more focused commits for
verifier logic and separate commit that we could point later to
demonstrating  what does it take to add a new kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct bpf_iter_<type>
state is supposed to live on BPF program stack, so there will be no way
to change its size later on without breaking backwards compatibility, so
choose wisely! But given this struct is specific to a given <type> of
iterator, this allows a lot of flexibility: simple iterators could be
fine with just one stack slot (8 bytes), like numbers iterator in the
next patch, while some other more complicated iterators might need way
more to keep their iterator state. Either way, such design allows to
avoid runtime memory allocations, which otherwise would be necessary if
we fixed on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way BPF verifier logic is implemented, there are no artificial
restrictions on a number of active iterators, it should work correctly
using multiple active iterators at the same time. This also means you
can have multiple nested iteration loops. struct bpf_iter_<type>
reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using
number iterator implemented in next patch. Here's the simplest possible
loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is
exclusive and is not returned. This matches similar APIs (e.g., slices
in Go or Rust) that implement a range of elements, where end index is
non-inclusive.

In the above example, we see a trio of function:
  - constructor, bpf_iter_num_new(), which initializes iterator state
  (struct bpf_iter_num it) on the stack. If any of the input arguments
  are invalid, constructor should make sure to still initialize it such
  that subsequent bpf_iter_num_next() calls will return NULL. I.e., on
  error, return error and construct empty iterator.
  - next method, bpf_iter_num_next(), which accepts pointer to iterator
  state and produces an element. Next method should always return
  a pointer. The contract between BPF verifier is that next method will
  always eventually return NULL when elements are exhausted. Once NULL is
  returned, subsequent next calls should keep returning NULL. In the
  case of numbers iterator, bpf_iter_num_next() returns a pointer to an int
  (storage for this integer is inside the iterator state itself),
  which can be dereferenced after corresponding NULL check.
  - once done with the iterator, it's mandated that user cleans up its
  state with the call to destructor, bpf_iter_num_destroy() in this
  case. Destructor frees up any resources and marks stack space used by
  struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
applicable constructor/destructor/next are callable. I.e., verifier
ensures you can't pass number iterator state into, say, cgroup
iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one
of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h
in selftests, which allows to utilize iterator "trio" nicely without
having to code the above somewhat tedious loop explicitly every time.
This is enforced at kfunc registration point by one of the previous
patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add STACK_ITER stack slot type,
reserve necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of necessary extra state
in the "main" slot, which is marked with non-zero ref_obj_id. Other
slots are also marked as STACK_ITER, but have zero ref_obj_id. This is
simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*, which
simplifies implementation without sacrificing usability. So no need for
extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new
constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive comments:
in process_iter_next_call() and iter_active_depths_differ() explaining
some important and sometimes subtle aspects. Please refer to them for
details.

But from 10,000-foot point of view, next methods are the points of
forking a verification state, which are conceptually similar to what
verifier is doing when validating conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (new element is returned). NULL is
simulated first and is supposed to reach exit without looping. After
that non-NULL case is validated and it either reaches exit (for trivial
examples with no real loop), or reaches another `call bpf_iter_<type>_next`
instruction with the state equivalent to already (partially) validated
one. State equivalency at that point means we technically are going to
be looping forever without "breaking out" out of established "state
envelope" (i.e., subsequent iterations don't add any new knowledge or
constraints to the verifier state, so running 1, 2, 10, or a million of
them doesn't matter). But taking into account the contract stating that
iterator next method *has to* return NULL eventually, we can conclude
that loop body is safe and will eventually terminate. Given we validated
logic outside of the loop (NULL case), and concluded that loop body is
safe (though potentially looping many times), verifier can claim safety
of the overall program logic.

The rest of the patch is necessary plumbing for state tracking, marking,
validation, and necessary further kfunc plumbing to allow implementing
iterator constructor, destructor, and next methods.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf_verifier.h |  23 ++
 kernel/bpf/verifier.c        | 595 ++++++++++++++++++++++++++++++++++-
 2 files changed, 610 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {
 
 #define ITER_PREFIX "bpf_iter_"
 
+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;
 
+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d40fba6a1c0..45a082284464 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -302,6 +302,10 @@ struct bpf_kfunc_call_arg_meta {
 		enum bpf_dynptr_type type;
 		u32 id;
 	} initialized_dynptr;
+	struct {
+		u8 spi;
+		u8 frameno;
+	} iter;
 	u64 mem_size;
 };
 
@@ -668,6 +672,7 @@ static char slot_type_char[] = {
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -742,6 +747,11 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re
 	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
 }
 
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
 static const char *kernel_type_name(const struct btf* btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -766,6 +776,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 	}
 }
 
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
 static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
 {
 	env->scratched_regs |= 1U << regno;
@@ -1118,6 +1152,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 	}
 }
 
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_reg_state *reg, int insn_idx,
+				 struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j, id;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	id = acquire_reference_state(env, insn_idx);
+	if (id < 0)
+		return id;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		__mark_reg_known_zero(st);
+		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		st->live |= REG_LIVE_WRITTEN;
+		st->ref_obj_id = i == 0 ? id : 0;
+		st->iter.btf = btf;
+		st->iter.btf_id = btf_id;
+		st->iter.state = BPF_ITER_STATE_ACTIVE;
+		st->iter.depth = 0;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_ITER;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		if (i == 0)
+			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+		__mark_reg_not_init(env, st);
+
+		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+		st->live |= REG_LIVE_WRITTEN;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_INVALID;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+				     struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+	 * will do check_mem_access to check and update stack bounds later, so
+	 * return true for that case.
+	 */
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi == -ERANGE)
+		return true;
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] == STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		/* only main (first) slot has ref_obj_id set */
+		if (i == 0 && !st->ref_obj_id)
+			return false;
+		if (i != 0 && st->ref_obj_id)
+			return false;
+		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+			return false;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] != STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+/* Check if given stack slot is "special":
+ *   - spilled register state (STACK_SPILL);
+ *   - dynptr state (STACK_DYNPTR);
+ *   - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+	switch (type) {
+	case STACK_SPILL:
+	case STACK_DYNPTR:
+	case STACK_ITER:
+		return true;
+	case STACK_INVALID:
+	case STACK_MISC:
+	case STACK_ZERO:
+		return false;
+	default:
+		WARN_ONCE(1, "unknown stack slot type %d\n", type);
+		return true;
+	}
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -1267,6 +1452,19 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (reg->ref_obj_id)
 				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
 			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		default:
@@ -2710,6 +2908,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
 			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
 }
 
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			  int spi, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int err, i;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+		if (err)
+			return err;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
 /* This function is supposed to be used by the following 32-bit optimization
  * code only. It returns TRUE if the source or destination register operates
  * on 64-bit, otherwise return FALSE.
@@ -3691,8 +3908,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
-		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
-		if (is_spilled_reg(&state->stack[spi]))
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+		if (is_stack_slot_special(&state->stack[spi]))
 			for (i = 0; i < BPF_REG_SIZE; i++)
 				scrub_spilled_slot(&state->stack[spi].slot_type[i]);
 
@@ -6506,6 +6723,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	return err;
 }
 
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+	struct bpf_func_state *state = func(env, reg);
+
+	return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+	/* btf_check_iter_kfuncs() guarantees that first argument of any iter
+	 * kfunc is iter state pointer
+	 */
+	return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+			    struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	const struct btf_type *t;
+	const struct btf_param *arg;
+	int spi, err, i, nr_slots;
+	u32 btf_id;
+
+	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+	arg = &btf_params(meta->func_proto)[0];
+	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */
+	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
+	nr_slots = t->size / BPF_REG_SIZE;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0 && spi != -ERANGE)
+		return spi;
+
+	meta->iter.spi = spi;
+	meta->iter.frameno = reg->frameno;
+
+	if (is_iter_new_kfunc(meta)) {
+		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
+		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+			err = check_mem_access(env, insn_idx, regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		if (err)
+			return err;
+	} else {
+		/* iter_next() or iter_destroy() expect initialized iter state*/
+		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+			verbose(env, "expected an initialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		err = mark_iter_read(env, reg, spi, nr_slots);
+		if (err)
+			return err;
+
+		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+		if (is_iter_destroy_kfunc(meta)) {
+			err = unmark_stack_slots_iter(env, reg, nr_slots);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+				  struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+	struct bpf_reg_state *cur_iter, *queued_iter;
+	int iter_frameno = meta->iter.frameno;
+	int iter_spi = meta->iter.spi;
+
+	BTF_TYPE_EMIT(struct bpf_iter);
+
+	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+			cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+		return -EFAULT;
+	}
+
+	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* branch out active iter state */
+		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+		if (!queued_st)
+			return -ENOMEM;
+
+		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+		queued_iter->iter.depth++;
+
+		queued_fr = queued_st->frame[queued_st->curframe];
+		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+	}
+
+	/* switch to DRAINED state, but keep the depth unchanged */
+	/* mark current iter state as drained and assume returned NULL */
+	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+	return 0;
+}
+
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
 	return type == ARG_CONST_SIZE ||
@@ -9099,6 +9513,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */
 	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
+	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
 	KF_ARG_PTR_TO_LIST_NODE,
 	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */
@@ -9220,6 +9635,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
+	if (is_kfunc_arg_iter(meta, argno))
+		return KF_ARG_PTR_TO_ITER;
+
 	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_LIST_HEAD;
 
@@ -9848,6 +10266,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
+		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
 		case KF_ARG_PTR_TO_RB_ROOT:
@@ -9944,6 +10363,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			break;
 		}
+		case KF_ARG_PTR_TO_ITER:
+			ret = process_iter_arg(env, regno, insn_idx, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		case KF_ARG_PTR_TO_LIST_HEAD:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -10148,6 +10572,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	desc_btf = meta.btf;
 	insn_aux = &env->insn_aux_data[insn_idx];
 
+	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
 		return -EACCES;
@@ -10436,6 +10862,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			mark_btf_func_reg_size(env, regno, t->size);
 	}
 
+	if (is_iter_next_kfunc(&meta)) {
+		err = process_iter_next_call(env, insn_idx, &meta);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -13548,6 +13980,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			struct bpf_kfunc_call_arg_meta meta;
+
+			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+			if (ret == 0 && is_iter_next_kfunc(&meta))
+				mark_prune_point(env, t);
+		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
 	case BPF_JA:
@@ -14301,6 +14740,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	 * didn't use them
 	 */
 	for (i = 0; i < old->allocated_stack; i++) {
+		struct bpf_reg_state *old_reg, *cur_reg;
+
 		spi = i / BPF_REG_SIZE;
 
 		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -14357,9 +14798,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 				return false;
 			break;
 		case STACK_DYNPTR:
-		{
-			const struct bpf_reg_state *old_reg, *cur_reg;
-
 			old_reg = &old->stack[spi].spilled_ptr;
 			cur_reg = &cur->stack[spi].spilled_ptr;
 			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -14367,7 +14805,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
 				return false;
 			break;
-		}
+		case STACK_ITER:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			/* iter.depth is not compared between states as it
+			 * doesn't matter for correctness and would otherwise
+			 * prevent convergence; we maintain it only to prevent
+			 * infinite loop check triggering, see
+			 * iter_active_depths_differ()
+			 */
+			if (old_reg->iter.btf != cur_reg->iter.btf ||
+			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+			    old_reg->iter.state != cur_reg->iter.state ||
+			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
@@ -14626,6 +15079,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
 	return true;
 }
 
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ *   0: again:                          ; set up iter_next() call args
+ *   1:   r1 = &it                      ; <CHECKPOINT HERE>
+ *   2:   call bpf_iter_num_next        ; this is iter_next() call
+ *   3:   if r0 == 0 goto done
+ *   4:   ... something useful here ...
+ *   5:   goto again                    ; another iteration
+ *   6: done:
+ *   7:   r1 = &it
+ *   8:   call bpf_iter_num_destroy     ; clean up iter state
+ *   9:   exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ *   struct bpf_iter_num it;
+ *   int *p, x;
+ *
+ *   bpf_iter_num_new(&it, 0, 10);
+ *   while ((p = bpf_iter_num_next(&t))) {
+ *       x = p;
+ *       while (x--) {} // <<-- infinite loop here
+ *   }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+	struct bpf_reg_state *slot, *cur_slot;
+	struct bpf_func_state *state;
+	int i, fr;
+
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_ITER)
+				continue;
+
+			slot = &state->stack[i].spilled_ptr;
+			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+				continue;
+
+			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+			if (cur_slot->iter.depth != slot->iter.depth)
+				return true;
+		}
+	}
+	return false;
+}
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
@@ -14673,8 +15212,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				 * Since the verifier still needs to catch infinite loops
 				 * inside async callbacks.
 				 */
-			} else if (states_maybe_looping(&sl->state, cur) &&
-				   states_equal(env, &sl->state, cur)) {
+				goto skip_inf_loop_check;
+			}
+			/* BPF open-coded iterators loop detection is special.
+			 * states_maybe_looping() logic is too simplistic in detecting
+			 * states that *might* be equivalent, because it doesn't know
+			 * about ID remapping, so don't even perform it.
+			 * See process_iter_next_call() and iter_active_depths_differ()
+			 * for overview of the logic. When current and one of parent
+			 * states are detected as equivalent, it's a good thing: we prove
+			 * convergence and can stop simulating further iterations.
+			 * It's safe to assume that iterator loop will finish, taking into
+			 * account iter_next() contract of eventually returning
+			 * sticky NULL result.
+			 */
+			if (is_iter_next_insn(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur)) {
+					struct bpf_func_state *cur_frame;
+					struct bpf_reg_state *iter_state, *iter_reg;
+					int spi;
+
+					cur_frame = cur->frame[cur->curframe];
+					/* btf_check_iter_kfuncs() enforces that
+					 * iter state pointer is always the first arg
+					 */
+					iter_reg = &cur_frame->regs[BPF_REG_1];
+					/* current state is valid due to states_equal(),
+					 * so we can assume valid iter and reg state,
+					 * no need for extra (re-)validations
+					 */
+					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+						goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
+			/* attempt to detect infinite loop to avoid unnecessary doomed work */
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur) &&
+			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				return -EINVAL;
@@ -14691,6 +15268,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * This threshold shouldn't be too high either, since states
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
+skip_inf_loop_check:
 			if (!env->test_state_freq &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
@@ -14698,6 +15276,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			goto miss;
 		}
 		if (states_equal(env, &sl->state, cur)) {
+hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.

From patchwork Wed Mar  8 18:41:17 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166309
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 21C18C6FD1F
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:45 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229716AbjCHSln convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:43 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38822 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229701AbjCHSlm (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:42 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 44742B1A45
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:40 -0800 (PST)
Received: from pps.filterd (m0148461.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0QAw008405
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:40 -0800
Received: from mail.thefacebook.com ([163.114.132.120])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6fekp2tw-2
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:39 -0800
Received: from twshared29091.48.prn1.facebook.com (2620:10d:c085:108::4) by
 mail.thefacebook.com (2620:10d:c085:11d::7) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:38 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id 9485C299B75C0; Wed,  8 Mar 2023 10:41:30 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 4/8] bpf: implement numbers iterator
Date: Wed, 8 Mar 2023 10:41:17 -0800
Message-ID: <20230308184121.1165081-5-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: zh4Adg3nIV6M5hHK-HPzvkgVlVAn5PS6
X-Proofpoint-ORIG-GUID: zh4Adg3nIV6M5hHK-HPzvkgVlVAn5PS6
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Implement the first open-coded iterator type over a range of integers.

It's public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next() which will keep returning read-only pointer to int
    until the range is exhausted, at which point NULL will be returned.
    If bpf_iter_num_next() is kept calling after this, NULL will be
    persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known and enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known or verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h            |  8 +++-
 include/uapi/linux/bpf.h       |  8 ++++
 kernel/bpf/bpf_iter.c          | 70 ++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c           |  3 ++
 tools/include/uapi/linux/bpf.h |  8 ++++
 5 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+struct bpf_iter_num_kern {
+	int cur; /* current value, inclusive */
+	int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+	BTF_TYPE_EMIT(struct btf_iter_num);
+
+	/* start == end is legit, it's an empty range and we'll just get NULL
+	 * on first (and any subsequent) bpf_iter_num_next() call
+	 */
+	if (start > end) {
+		s->cur = s->end = 0;
+		return -EINVAL;
+	}
+
+	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+		s->cur = s->end = 0;
+		return -E2BIG;
+	}
+
+	/* user will call bpf_iter_num_next() first,
+	 * which will set s->cur to exactly start value;
+	 * underflow shouldn't matter
+	 */
+	s->cur = start - 1;
+	s->end = end;
+
+	return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	/* check failed initialization or if we are done (same behavior);
+	 * need to be careful about overflow, so convert to s64 for checks,
+	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
+	 * s->cur + 1 >= s->end
+	 */
+	if ((s64)(s->cur + 1) >= s->end) {
+		s->cur = s->end = 0;
+		return NULL;
+	}
+
+	s->cur++;
+
+	return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 637ac4e92e75..f9b7eeedce08 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */

From patchwork Wed Mar  8 18:41:18 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166313
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id D7197C678D5
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:51 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229776AbjCHSlt convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:49 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38912 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229815AbjCHSlr (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:47 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6E1FBAF69E
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:45 -0800 (PST)
Received: from pps.filterd (m0109334.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0EVU008189
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:45 -0800
Received: from maileast.thefacebook.com ([163.114.130.16])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6feu63fj-2
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:45 -0800
Received: from twshared16996.15.frc2.facebook.com (2620:10d:c0a8:1b::d) by
 mail.thefacebook.com (2620:10d:c0a8:82::d) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:43 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id A2E4C299B75C5; Wed,  8 Mar 2023 10:41:32 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 5/8] selftests/bpf: add bpf_for_each(), bpf_for(),
 and bpf_repeat() macros
Date: Wed, 8 Mar 2023 10:41:18 -0800
Message-ID: <20230308184121.1165081-6-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: inZPlKUiAygShEPsT2BriBOHIkKpcfzp
X-Proofpoint-ORIG-GUID: inZPlKUiAygShEPsT2BriBOHIkKpcfzp
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Add bpf_for_each(), bpf_for(), and bpf_repeat() macros that make writing
open-coded iterator-based loops much more convenient and natural. These
macros utilize cleanup attribute to ensure proper destruction of the
iterator and thanks to that manage to provide the ergonomics that is
very close to C language's for() construct. Typical loop would look like:

  int i;
  int arr[N];

  bpf_for(i, 0, N) {
      /* verifier will know that i >= 0 && i < N, so could be used to
       * directly access array elements with no extra checks
       */
       arr[i] = i;
  }

bpf_repeat() is very similar, but it doesn't expose iteration number and
is meant as a simple "repeat action N times" loop:

  bpf_repeat(N) { /* whatever, N times */ }

Note that `break` and `continue` statements inside the {} block work as
expected.

bpf_for_each() is a generalization over any kind of BPF open-coded
iterator allowing to use for-each-like approach instead of calling
low-level bpf_iter_<type>_{new,next,destroy}() APIs explicitly. E.g.:

  struct cgroup *cg;

  bpf_for_each(cgroup, cg, some, input, args) {
      /* do something with each cg */
  }

would call (not-yet-implemented) bpf_iter_cgroup_{new,next,destroy}()
functions to form a loop over cgroups, where `some, input, args` are
passed verbatim into constructor as

  bpf_iter_cgroup_new(&it, some, input, args).

As a first demonstration, add pyperf variant based on the bpf_for() loop.

Also clean up a few tests that either included bpf_misc.h header
unnecessarily from the user-space, which is unsupported, or included it
before any common types are defined (and thus leading to unnecessary
compilation warnings, potentially).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../bpf/prog_tests/bpf_verif_scale.c          |  6 ++
 .../bpf/prog_tests/uprobe_autoattach.c        |  1 -
 tools/testing/selftests/bpf/progs/bpf_misc.h  | 99 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/lsm.c       |  4 +-
 tools/testing/selftests/bpf/progs/pyperf.h    | 14 ++-
 .../selftests/bpf/progs/pyperf600_iter.c      |  7 ++
 .../selftests/bpf/progs/pyperf600_nounroll.c  |  3 -
 7 files changed, 124 insertions(+), 10 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/pyperf600_iter.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 5ca252823294..731c343897d8 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -144,6 +144,12 @@ void test_verif_scale_pyperf600_nounroll()
 	scale_test("pyperf600_nounroll.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
 }
 
+void test_verif_scale_pyperf600_iter()
+{
+	/* open-coded BPF iterator version */
+	scale_test("pyperf600_iter.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
+}
+
 void test_verif_scale_loop1()
 {
 	scale_test("loop1.bpf.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false);
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
index 6558c857e620..d5b3377aa33c 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
@@ -3,7 +3,6 @@
 
 #include <test_progs.h>
 #include "test_uprobe_autoattach.skel.h"
-#include "progs/bpf_misc.h"
 
 /* uprobe attach point */
 static noinline int autoattach_trigger_func(int arg1, int arg2, int arg3,
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index f704885aa534..597688a188ae 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -75,5 +75,104 @@
 #define FUNC_REG_ARG_CNT 5
 #endif
 
+struct bpf_iter_num;
+
+extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
+extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
+extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;
+
+#ifndef bpf_for_each
+/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
+ * using BPF open-coded iterators without having to write mundane explicit
+ * low-level loop logic. Instead, it provides for()-like generic construct
+ * that can be used pretty naturally. E.g., for some hypothetical cgroup
+ * iterator, you'd write:
+ *
+ * struct cgroup *cg, *parent_cg = <...>;
+ *
+ * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) {
+ *     bpf_printk("Child cgroup id = %d", cg->cgroup_id);
+ *     if (cg->cgroup_id == 123)
+ *         break;
+ * }
+ *
+ * I.e., it looks almost like high-level for each loop in other languages,
+ * supports continue/break, and is verifiable by BPF verifier.
+ *
+ * For iterating integers, the difference betwen bpf_for_each(num, i, N, M)
+ * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to
+ * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int
+ * *`, not just `int`. So for integers bpf_for() is more convenient.
+ *
+ * Note: this macro relies on C99 feature of allowing to declare variables
+ * inside for() loop, bound to for() loop lifetime. It also utilizes GCC
+ * extension: __attribute__((cleanup(<func>))), supported by both GCC and
+ * Clang.
+ */
+#define bpf_for_each(type, cur, args...) for (							\
+	/* initialize and define destructor */							\
+	struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */,	\
+						    cleanup(bpf_iter_##type##_destroy))),	\
+	/* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */		\
+			       *___p = (bpf_iter_##type##_new(&___it, ##args),			\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_##type##_destroy() when used from cleanup() attribute */		\
+					(void)bpf_iter_##type##_destroy, (void *)0);		\
+	/* iteration and termination check */							\
+	(((cur) = bpf_iter_##type##_next(&___it)));						\
+)
+#endif /* bpf_for_each */
+
+#ifndef bpf_for
+/* bpf_for(i, start, end) implements a for()-like looping construct that sets
+ * provided integer variable *i* to values starting from *start* through,
+ * but not including, *end*. It also proves to BPF verifier that *i* belongs
+ * to range [start, end), so this can be used for accessing arrays without
+ * extra checks.
+ *
+ * Note: *start* and *end* are assumed to be expressions with no side effects
+ * and whose values do not change throughout bpf_for() loop execution. They do
+ * not have to be statically known or constant, though.
+ *
+ * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for()
+ * loop bound variables and cleanup attribute, supported by GCC and Clang.
+ */
+#define bpf_for(i, start, end) for (								\
+	/* initialize and define destructor */							\
+	struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */	\
+						 cleanup(bpf_iter_num_destroy))),		\
+	/* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */		\
+			    *___p = (bpf_iter_num_new(&___it, (start), (end)),			\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_num_destroy() when used from cleanup() attribute */			\
+				(void)bpf_iter_num_destroy, (void *)0);				\
+	({											\
+		/* iteration step */								\
+		int *___t = bpf_iter_num_next(&___it);						\
+		/* termination and bounds check */						\
+		(___t && ((i) = *___t, (i) >= (start) && (i) < (end)));				\
+	});											\
+)
+#endif /* bpf_for */
+
+#ifndef bpf_repeat
+/* bpf_repeat(N) performs N iterations without exposing iteration number
+ *
+ * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for()
+ * loop bound variables and cleanup attribute, supported by GCC and Clang.
+ */
+#define bpf_repeat(N) for (									\
+	/* initialize and define destructor */							\
+	struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */	\
+						 cleanup(bpf_iter_num_destroy))),		\
+	/* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */		\
+			    *___p = (bpf_iter_num_new(&___it, 0, (N)),				\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_num_destroy() when used from cleanup() attribute */			\
+				(void)bpf_iter_num_destroy, (void *)0);				\
+	bpf_iter_num_next(&___it);								\
+	/* nothing here  */									\
+)
+#endif /* bpf_repeat */
 
 #endif
diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
index dc93887ed34c..fadfdd98707c 100644
--- a/tools/testing/selftests/bpf/progs/lsm.c
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -4,12 +4,12 @@
  * Copyright 2020 Google LLC.
  */
 
-#include "bpf_misc.h"
 #include "vmlinux.h"
+#include <errno.h>
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
-#include <errno.h>
+#include "bpf_misc.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h
index 6c7b1fb268d6..f2e7a31c8d75 100644
--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -7,6 +7,7 @@
 #include <stdbool.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
 
 #define FUNCTION_NAME_LEN 64
 #define FILE_NAME_LEN 128
@@ -294,17 +295,22 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 	if (ctx.done)
 		return 0;
 #else
-#ifdef NO_UNROLL
+#if defined(USE_ITER)
+/* no for loop, no unrolling */
+#elif defined(NO_UNROLL)
 #pragma clang loop unroll(disable)
-#else
-#ifdef UNROLL_COUNT
+#elif defined(UNROLL_COUNT)
 #pragma clang loop unroll_count(UNROLL_COUNT)
 #else
 #pragma clang loop unroll(full)
-#endif
 #endif /* NO_UNROLL */
 		/* Unwind python stack */
+#ifdef USE_ITER
+		int i;
+		bpf_for(i, 0, STACK_MAX_LEN) {
+#else /* !USE_ITER */
 		for (int i = 0; i < STACK_MAX_LEN; ++i) {
+#endif
 			if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) {
 				int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu;
 				int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, &sym);
diff --git a/tools/testing/selftests/bpf/progs/pyperf600_iter.c b/tools/testing/selftests/bpf/progs/pyperf600_iter.c
new file mode 100644
index 000000000000..d62e1b200c30
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf600_iter.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+#define STACK_MAX_LEN 600
+#define SUBPROGS
+#define NO_UNROLL
+#define USE_ITER
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c b/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
index 6beff7502f4d..520b58c4f8db 100644
--- a/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
+++ b/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
@@ -2,7 +2,4 @@
 // Copyright (c) 2019 Facebook
 #define STACK_MAX_LEN 600
 #define NO_UNROLL
-/* clang will not unroll at all.
- * Total program size is around 2k insns
- */
 #include "pyperf.h"

From patchwork Wed Mar  8 18:41:19 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166311
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id E90F6C64EC4
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:49 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229764AbjCHSlr convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:47 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38820 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229701AbjCHSlq (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:46 -0500
Received: from mx0b-00082601.pphosted.com (mx0b-00082601.pphosted.com
 [67.231.153.30])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 43006ADC11
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:42 -0800 (PST)
Received: from pps.filterd (m0109332.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0FhL014157
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:41 -0800
Received: from maileast.thefacebook.com ([163.114.130.16])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6ffbdweq-1
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:41 -0800
Received: from twshared38955.16.prn3.facebook.com (2620:10d:c0a8:1b::d) by
 mail.thefacebook.com (2620:10d:c0a8:83::6) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:40 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id B157C299B75ED; Wed,  8 Mar 2023 10:41:34 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 6/8] selftests/bpf: add iterators tests
Date: Wed, 8 Mar 2023 10:41:19 -0800
Message-ID: <20230308184121.1165081-7-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: 8PSgJsrsXpjk8w3bUuRQyQCLt29taAVi
X-Proofpoint-ORIG-GUID: 8PSgJsrsXpjk8w3bUuRQyQCLt29taAVi
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Add various tests for open-coded iterators. Some of them excercise
various possible coding patterns in C, some go down to low-level
assembly for more control over various conditions, especially invalid
ones.

We also make use of bpf_for(), bpf_for_each(), bpf_repeat() macros in
some of these tests.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/iters.c  |  15 +
 tools/testing/selftests/bpf/progs/bpf_misc.h  |   1 +
 tools/testing/selftests/bpf/progs/iters.c     | 720 ++++++++++++++++++
 .../selftests/bpf/progs/iters_looping.c       | 163 ++++
 .../selftests/bpf/progs/iters_state_safety.c  | 426 +++++++++++
 5 files changed, 1325 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/iters.c
 create mode 100644 tools/testing/selftests/bpf/progs/iters.c
 create mode 100644 tools/testing/selftests/bpf/progs/iters_looping.c
 create mode 100644 tools/testing/selftests/bpf/progs/iters_state_safety.c

diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
new file mode 100644
index 000000000000..414fb8d82145
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+
+#include "iters.skel.h"
+#include "iters_state_safety.skel.h"
+#include "iters_looping.skel.h"
+
+void test_iters(void)
+{
+	RUN_TESTS(iters_state_safety);
+	RUN_TESTS(iters_looping);
+	RUN_TESTS(iters);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 597688a188ae..43b154a639e7 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -36,6 +36,7 @@
 #define __clobber_common "r0", "r1", "r2", "r3", "r4", "r5", "memory"
 #define __imm(name) [name]"i"(name)
 #define __imm_addr(name) [name]"i"(&name)
+#define __imm_ptr(name) [name]"p"(&name)
 
 #if defined(__TARGET_ARCH_x86)
 #define SYSCALL_WRAPPER 1
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
new file mode 100644
index 000000000000..84e5dc10243c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+static volatile int zero = 0;
+
+int my_pid;
+int arr[256];
+int small_arr[16] SEC(".data.small_arr");
+
+#ifdef REAL_TEST
+#define MY_PID_GUARD() if (my_pid != (bpf_get_current_pid_tgid() >> 32)) return 0
+#else
+#define MY_PID_GUARD() ({ })
+#endif
+
+SEC("?raw_tp")
+__failure __msg("math between map_value pointer and register with unbounded min value is not allowed")
+int iter_err_unsafe_c_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i = zero; /* obscure initial value of i */
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 1000);
+	while ((v = bpf_iter_num_next(&it))) {
+		i++;
+	}
+	bpf_iter_num_destroy(&it);
+
+	small_arr[i] = 123; /* invalid */
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("unbounded memory access")
+int iter_err_unsafe_asm_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i = 0;
+
+	MY_PID_GUARD();
+
+	asm volatile (
+		"r6 = %[zero];" /* iteration counter */
+		"r1 = %[it];" /* iterator state */
+		"r2 = 0;"
+		"r3 = 1000;"
+		"r4 = 1;"
+		"call %[bpf_iter_num_new];"
+	"loop:"
+		"r1 = %[it];"
+		"call %[bpf_iter_num_next];"
+		"if r0 == 0 goto out;"
+		"r6 += 1;"
+		"goto loop;"
+	"out:"
+		"r1 = %[it];"
+		"call %[bpf_iter_num_destroy];"
+		"r1 = %[small_arr];"
+		"r2 = r6;"
+		"r2 <<= 2;"
+		"r1 += r2;"
+		"*(u32 *)(r1 + 0) = r6;" /* invalid */
+		:
+		: [it]"r"(&it),
+		  [small_arr]"p"(small_arr),
+		  [zero]"p"(zero),
+		  __imm(bpf_iter_num_new),
+		  __imm(bpf_iter_num_next),
+		  __imm(bpf_iter_num_destroy)
+		: __clobber_common, "r6"
+	);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_while_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 3);
+	while ((v = bpf_iter_num_next(&it))) {
+		bpf_printk("ITER_BASIC: E1 VAL: v=%d", *v);
+	}
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_while_loop_auto_cleanup(const void *ctx)
+{
+	__attribute__((cleanup(bpf_iter_num_destroy))) struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 3);
+	while ((v = bpf_iter_num_next(&it))) {
+		bpf_printk("ITER_BASIC: E1 VAL: v=%d", *v);
+	}
+	/* (!) no explicit bpf_iter_num_destroy() */
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_for_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 5, 10);
+	for (v = bpf_iter_num_next(&it); v; v = bpf_iter_num_next(&it)) {
+		bpf_printk("ITER_BASIC: E2 VAL: v=%d", *v);
+	}
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_bpf_for_each_macro(const void *ctx)
+{
+	int *v;
+
+	MY_PID_GUARD();
+
+	bpf_for_each(num, v, 5, 10) {
+		bpf_printk("ITER_BASIC: E2 VAL: v=%d", *v);
+	}
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_bpf_for_macro(const void *ctx)
+{
+	int i;
+
+	MY_PID_GUARD();
+
+	bpf_for(i, 5, 10) {
+		bpf_printk("ITER_BASIC: E2 VAL: v=%d", i);
+	}
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_pragma_unroll_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 2);
+#pragma nounroll
+	for (i = 0; i < 3; i++) {
+		v = bpf_iter_num_next(&it);
+		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
+	}
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_manual_unroll_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 100, 200);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d\n", v ? *v : -1);
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_multiple_sequential_loops(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 3);
+	while ((v = bpf_iter_num_next(&it))) {
+		bpf_printk("ITER_BASIC: E1 VAL: v=%d", *v);
+	}
+	bpf_iter_num_destroy(&it);
+
+	bpf_iter_num_new(&it, 5, 10);
+	for (v = bpf_iter_num_next(&it); v; v = bpf_iter_num_next(&it)) {
+		bpf_printk("ITER_BASIC: E2 VAL: v=%d", *v);
+	}
+	bpf_iter_num_destroy(&it);
+
+	bpf_iter_num_new(&it, 0, 2);
+#pragma nounroll
+	for (i = 0; i < 3; i++) {
+		v = bpf_iter_num_next(&it);
+		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
+	}
+	bpf_iter_num_destroy(&it);
+
+	bpf_iter_num_new(&it, 100, 200);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d", v ? *v : -1);
+	v = bpf_iter_num_next(&it);
+	bpf_printk("ITER_BASIC: E4 VAL: v=%d\n", v ? *v : -1);
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_limit_cond_break_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, i = 0, sum = 0;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 10);
+	while ((v = bpf_iter_num_next(&it))) {
+		bpf_printk("ITER_SIMPLE: i=%d v=%d", i, *v);
+		sum += *v;
+
+		i++;
+		if (i > 3)
+			break;
+	}
+	bpf_iter_num_destroy(&it);
+
+	bpf_printk("ITER_SIMPLE: sum=%d\n", sum);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_obfuscate_counter(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, sum = 0;
+	/* Make i's initial value unknowable for verifier to prevent it from
+	 * pruning if/else branch inside the loop body and marking i as precise.
+	 */
+	int i = zero;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 10);
+	while ((v = bpf_iter_num_next(&it))) {
+		int x;
+
+		i += 1;
+
+		/* If we initialized i as `int i = 0;` above, verifier would
+		 * track that i becomes 1 on first iteration after increment
+		 * above, and here verifier would eagerly prune else branch
+		 * and mark i as precise, ruining open-coded iterator logic
+		 * completely, as each next iteration would have a different
+		 * *precise* value of i, and thus there would be no
+		 * convergence of state. This would result in reaching maximum
+		 * instruction limit, no matter what the limit is.
+		 */
+		if (i == 1)
+			x = 123;
+		else
+			x = i * 3 + 1;
+
+		bpf_printk("ITER_OBFUSCATE_COUNTER: i=%d v=%d x=%d", i, *v, x);
+
+		sum += x;
+	}
+	bpf_iter_num_destroy(&it);
+
+	bpf_printk("ITER_OBFUSCATE_COUNTER: sum=%d\n", sum);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_search_loop(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int *v, *elem = NULL;
+	bool found = false;
+
+	MY_PID_GUARD();
+
+	bpf_iter_num_new(&it, 0, 10);
+
+	while ((v = bpf_iter_num_next(&it))) {
+		bpf_printk("ITER_SEARCH_LOOP: v=%d", *v);
+
+		if (*v == 2) {
+			found = true;
+			elem = v;
+			barrier_var(elem);
+		}
+	}
+
+	/* should fail to verify if bpf_iter_num_destroy() is here */
+
+	if (found)
+		/* here found element will be wrong, we should have copied
+		 * value to a variable, but here we want to make sure we can
+		 * access memory after the loop anyways
+		 */
+		bpf_printk("ITER_SEARCH_LOOP: FOUND IT = %d!\n", *elem);
+	else
+		bpf_printk("ITER_SEARCH_LOOP: NOT FOUND IT!\n");
+
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_array_fill(const void *ctx)
+{
+	int sum, i;
+
+	MY_PID_GUARD();
+
+	bpf_for(i, 0, ARRAY_SIZE(arr)) {
+		arr[i] = i * 2;
+	}
+
+	sum = 0;
+	bpf_for(i, 0, ARRAY_SIZE(arr)) {
+		sum += arr[i];
+	}
+
+	bpf_printk("ITER_ARRAY_FILL: sum=%d (should be %d)\n", sum, 255 * 256);
+
+	return 0;
+}
+
+static int arr2d[4][5];
+static int arr2d_row_sums[4];
+static int arr2d_col_sums[5];
+
+SEC("raw_tp")
+__success
+int iter_nested_iters(const void *ctx)
+{
+	int sum, row, col;
+
+	MY_PID_GUARD();
+
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		bpf_for( col, 0, ARRAY_SIZE(arr2d[0])) {
+			arr2d[row][col] = row * col;
+		}
+	}
+
+	/* zero-initialize sums */
+	sum = 0;
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		arr2d_row_sums[row] = 0;
+	}
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		arr2d_col_sums[col] = 0;
+	}
+
+	/* calculate sums */
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+			sum += arr2d[row][col];
+			arr2d_row_sums[row] += arr2d[row][col];
+			arr2d_col_sums[col] += arr2d[row][col];
+		}
+	}
+
+	bpf_printk("ITER_NESTED_ITERS: total sum=%d", sum);
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		bpf_printk("ITER_NESTED_ITERS: row #%d sum=%d", row, arr2d_row_sums[row]);
+	}
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		bpf_printk("ITER_NESTED_ITERS: col #%d sum=%d%s",
+			   col, arr2d_col_sums[col],
+			   col == ARRAY_SIZE(arr2d[0]) - 1 ? "\n" : "");
+	}
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_nested_deeply_iters(const void *ctx)
+{
+	int sum = 0;
+
+	MY_PID_GUARD();
+
+	bpf_repeat(10) {
+		bpf_repeat(10) {
+			bpf_repeat(10) {
+				bpf_repeat(10) {
+					bpf_repeat(10) {
+						sum += 1;
+					}
+				}
+			}
+		}
+		/* validate that we can break from inside bpf_repeat() */
+		break;
+	}
+
+	return sum;
+}
+
+static __noinline void fill_inner_dimension(int row)
+{
+	int col;
+
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		arr2d[row][col] = row * col;
+	}
+}
+
+static __noinline int sum_inner_dimension(int row)
+{
+	int sum = 0, col;
+
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		sum += arr2d[row][col];
+		arr2d_row_sums[row] += arr2d[row][col];
+		arr2d_col_sums[col] += arr2d[row][col];
+	}
+
+	return sum;
+}
+
+SEC("raw_tp")
+__success
+int iter_subprog_iters(const void *ctx)
+{
+	int sum, row, col;
+
+	MY_PID_GUARD();
+
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		fill_inner_dimension(row);
+	}
+
+	/* zero-initialize sums */
+	sum = 0;
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		arr2d_row_sums[row] = 0;
+	}
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		arr2d_col_sums[col] = 0;
+	}
+
+	/* calculate sums */
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		sum += sum_inner_dimension(row);
+	}
+
+	bpf_printk("ITER_SUBPROG_ITERS: total sum=%d", sum);
+	bpf_for(row, 0, ARRAY_SIZE(arr2d)) {
+		bpf_printk("ITER_SUBPROG_ITERS: row #%d sum=%d",
+			   row, arr2d_row_sums[row]);
+	}
+	bpf_for(col, 0, ARRAY_SIZE(arr2d[0])) {
+		bpf_printk("ITER_SUBPROG_ITERS: col #%d sum=%d%s",
+			   col, arr2d_col_sums[col],
+			   col == ARRAY_SIZE(arr2d[0]) - 1 ? "\n" : "");
+	}
+
+	return 0;
+}
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, int);
+	__uint(max_entries, 1000);
+} arr_map SEC(".maps");
+
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'scalar'")
+int iter_err_too_permissive1(const void *ctx)
+{
+	int *map_val = NULL;
+	int key = 0;
+
+	MY_PID_GUARD();
+
+	map_val = bpf_map_lookup_elem(&arr_map, &key);
+	if (!map_val)
+		return 0;
+
+	bpf_repeat(1000000) {
+		map_val = NULL;
+	}
+
+	*map_val = 123;
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'map_value_or_null'")
+int iter_err_too_permissive2(const void *ctx)
+{
+	int *map_val = NULL;
+	int key = 0;
+
+	MY_PID_GUARD();
+
+	map_val = bpf_map_lookup_elem(&arr_map, &key);
+	if (!map_val)
+		return 0;
+
+	bpf_repeat(1000000) {
+		map_val = bpf_map_lookup_elem(&arr_map, &key);
+	}
+
+	*map_val = 123;
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("invalid mem access 'map_value_or_null'")
+int iter_err_too_permissive3(const void *ctx)
+{
+	int *map_val = NULL;
+	int key = 0;
+	bool found = false;
+
+	MY_PID_GUARD();
+
+	bpf_repeat(1000000) {
+		map_val = bpf_map_lookup_elem(&arr_map, &key);
+		found = true;
+	}
+
+	if (found)
+		*map_val = 123;
+
+	return 0;
+}
+
+SEC("raw_tp")
+__success
+int iter_tricky_but_fine(const void *ctx)
+{
+	int *map_val = NULL;
+	int key = 0;
+	bool found = false;
+
+	MY_PID_GUARD();
+
+	bpf_repeat(1000000) {
+		map_val = bpf_map_lookup_elem(&arr_map, &key);
+		if (map_val) {
+			found = true;
+			break;
+		}
+	}
+
+	if (found)
+		*map_val = 123;
+
+	return 0;
+}
+
+#define __bpf_memzero(p, sz) bpf_probe_read_kernel((p), (sz), 0)
+
+SEC("raw_tp")
+__success
+int iter_stack_array_loop(const void *ctx)
+{
+	long arr1[16], arr2[16], sum = 0;
+	int *v, i;
+
+	MY_PID_GUARD();
+
+	/* zero-init arr1 and arr2 in such a way that verifier doesn't know
+	 * it's all zeros; if we don't do that, we'll make BPF verifier track
+	 * all combination of zero/non-zero stack slots for arr1/arr2, which
+	 * will lead to O(2^(ARRAY_SIZE(arr1)+ARRAY_SIZE(arr2))) different
+	 * states
+	 */
+	__bpf_memzero(arr1, sizeof(arr1));
+	__bpf_memzero(arr2, sizeof(arr1));
+
+	/* validate that we can break and continue when using bpf_for() */
+	bpf_for(i, 0, ARRAY_SIZE(arr1)) {
+		if (i & 1) {
+			arr1[i] = i;
+			continue;
+		} else {
+			arr2[i] = i;
+			break;
+		}
+	}
+
+	bpf_for(i, 0, ARRAY_SIZE(arr1)) {
+		sum += arr1[i] + arr2[i];
+	}
+
+	return sum;
+}
+
+static __noinline void fill(struct bpf_iter_num *it, int *arr, __u32 n, int mul)
+{
+	int *t, i;
+
+	while ((t = bpf_iter_num_next(it))) {
+		i = *t;
+		if (i >= n)
+			break;
+		arr[i] =  i * mul;
+	}
+}
+
+static __noinline int sum(struct bpf_iter_num *it, int *arr, __u32 n)
+{
+	int *t, i, sum = 0;;
+
+	while ((t = bpf_iter_num_next(it))) {
+		i = *t;
+		if (i >= n)
+			break;
+		sum += arr[i];
+	}
+
+	return sum;
+}
+
+SEC("raw_tp")
+__success
+int iter_pass_iter_ptr_to_subprog(const void *ctx)
+{
+	int arr1[16], arr2[32];
+	struct bpf_iter_num it;
+	int n, sum1, sum2;
+
+	MY_PID_GUARD();
+
+	/* fill arr1 */
+	n = ARRAY_SIZE(arr1);
+	bpf_iter_num_new(&it, 0, n);
+	fill(&it, arr1, n, 2);
+	bpf_iter_num_destroy(&it);
+
+	/* fill arr2 */
+	n = ARRAY_SIZE(arr2);
+	bpf_iter_num_new(&it, 0, n);
+	fill(&it, arr2, n, 10);
+	bpf_iter_num_destroy(&it);
+
+	/* sum arr1 */
+	n = ARRAY_SIZE(arr1);
+	bpf_iter_num_new(&it, 0, n);
+	sum1 = sum(&it, arr1, n);
+	bpf_iter_num_destroy(&it);
+
+	/* sum arr2 */
+	n = ARRAY_SIZE(arr2);
+	bpf_iter_num_new(&it, 0, n);
+	sum2 = sum(&it, arr2, n);
+	bpf_iter_num_destroy(&it);
+
+	bpf_printk("sum1=%d, sum2=%d", sum1, sum2);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c
new file mode 100644
index 000000000000..05fa5ce7fc59
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters_looping.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define ITER_HELPERS						\
+	  __imm(bpf_iter_num_new),				\
+	  __imm(bpf_iter_num_next),				\
+	  __imm(bpf_iter_num_destroy)
+
+SEC("?raw_tp")
+__success
+int force_clang_to_emit_btf_for_externs(void *ctx)
+{
+	/* we need this as a workaround to enforce compiler emitting BTF
+	 * information for bpf_iter_num_{new,next,destroy}() kfuncs,
+	 * as, apparently, it doesn't emit it for symbols only referenced from
+	 * assembly (or cleanup attribute, for that matter, as well)
+	 */
+	bpf_repeat(0);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__success
+int consume_first_item_only(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* consume first item */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+
+		"if r0 == 0 goto +1;"
+		"r0 = *(u32 *)(r0 + 0);"
+
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("R0 invalid mem access 'scalar'")
+int missing_null_check_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* consume first element */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+
+		/* FAIL: deref with no NULL check */
+		"r1 = *(u32 *)(r0 + 0);"
+
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure
+__msg("invalid access to memory, mem_size=4 off=0 size=8")
+__msg("R0 min value is outside of the allowed memory range")
+int wrong_sized_read_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* consume first element */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+
+		"if r0 == 0 goto +1;"
+		/* FAIL: deref more than available 4 bytes */
+		"r0 = *(u64 *)(r0 + 0);"
+
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+int simplest_loop(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		"r6 = 0;" /* init sum */
+
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 10;"
+		"call %[bpf_iter_num_new];"
+
+	"1:"
+		/* consume next item */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+
+		"if r0 == 0 goto 2f;"
+		"r0 = *(u32 *)(r0 + 0);"
+		"r6 += r0;" /* accumulate sum */
+		"goto 1b;"
+
+	"2:"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common, "r6"
+	);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c
new file mode 100644
index 000000000000..d47e59aba6de
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Facebook */
+
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define ITER_HELPERS						\
+	  __imm(bpf_iter_num_new),				\
+	  __imm(bpf_iter_num_next),				\
+	  __imm(bpf_iter_num_destroy)
+
+SEC("?raw_tp")
+__success
+int force_clang_to_emit_btf_for_externs(void *ctx)
+{
+	/* we need this as a workaround to enforce compiler emitting BTF
+	 * information for bpf_iter_num_{new,next,destroy}() kfuncs,
+	 * as, apparently, it doesn't emit it for symbols only referenced from
+	 * assembly (or cleanup attribute, for that matter, as well)
+	 */
+	bpf_repeat(0);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)")
+int create_and_destroy(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("Unreleased reference id=1")
+int create_and_forget_to_destroy_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int destroy_without_creating_fail(void *ctx)
+{
+	/* init with zeros to stop verifier complaining about uninit stack */
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int compromise_iter_w_direct_write_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* directly write over first half of iter state */
+		"*(u64 *)(%[iter] + 0) = r0;"
+
+		/* (attempt to) destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("Unreleased reference id=1")
+int compromise_iter_w_direct_write_and_skip_destroy_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* directly write over first half of iter state */
+		"*(u64 *)(%[iter] + 0) = r0;"
+
+		/* don't destroy iter, leaking ref, which should fail */
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int compromise_iter_w_helper_write_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* overwrite 8th byte with bpf_probe_read_kernel() */
+		"r1 = %[iter];"
+		"r1 += 7;"
+		"r2 = 1;"
+		"r3 = 0;" /* NULL */
+		"call %[bpf_probe_read_kernel];"
+
+		/* (attempt to) destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS, __imm(bpf_probe_read_kernel)
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+static __noinline void subprog_with_iter(void)
+{
+	struct bpf_iter_num iter;
+
+	bpf_iter_num_new(&iter, 0, 1);
+
+	return;
+}
+
+SEC("?raw_tp")
+__failure
+/* ensure there was a call to subprog, which might happen without __noinline */
+__msg("returning from callee:")
+__msg("Unreleased reference id=1")
+int leak_iter_from_subprog_fail(void *ctx)
+{
+	subprog_with_iter();
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__success __log_level(2)
+__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)")
+int valid_stack_reuse(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+
+		/* now reuse same stack slots */
+
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected uninitialized iter_num as arg #1")
+int double_create_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* (attempt to) create iterator again */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int double_destroy_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		/* (attempt to) destroy iterator again */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int next_without_new_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* don't create iterator and try to iterate*/
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("expected an initialized iter_num as arg #1")
+int next_after_destroy_fail(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* create iterator */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+		/* destroy iterator */
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_destroy];"
+		/* don't create iterator and try to iterate*/
+		"r1 = %[iter];"
+		"call %[bpf_iter_num_next];"
+		:
+		: __imm_ptr(iter), ITER_HELPERS
+		: __clobber_common
+	);
+
+	return 0;
+}
+
+SEC("?raw_tp")
+__failure __msg("invalid read from stack")
+int __naked read_from_iter_slot_fail(void)
+{
+	asm volatile (
+		/* r6 points to struct bpf_iter_num on the stack */
+		"r6 = r10;"
+		"r6 += -24;"
+
+		/* create iterator */
+		"r1 = r6;"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* attemp to leak bpf_iter_num state */
+		"r7 = *(u64 *)(r6 + 0);"
+		"r8 = *(u64 *)(r6 + 8);"
+
+		/* destroy iterator */
+		"r1 = r6;"
+		"call %[bpf_iter_num_destroy];"
+
+		/* leak bpf_iter_num state */
+		"r0 = r7;"
+		"if r7 > r8 goto +1;"
+		"r0 = r8;"
+		"exit;"
+		:
+		: ITER_HELPERS
+		: __clobber_common, "r6", "r7", "r8"
+	);
+}
+
+int zero;
+
+SEC("?raw_tp")
+__failure
+__flag(BPF_F_TEST_STATE_FREQ)
+__msg("Unreleased reference")
+int stacksafe_should_not_conflate_stack_spill_and_iter(void *ctx)
+{
+	struct bpf_iter_num iter;
+
+	asm volatile (
+		/* Create a fork in logic, with general setup as follows:
+		 *   - fallthrough (first) path is valid;
+		 *   - branch (second) path is invalid.
+		 * Then depending on what we do in fallthrough vs branch path,
+		 * we try to detect bugs in func_states_equal(), regsafe(),
+		 * refsafe(), stack_safe(), and similar by tricking verifier
+		 * into believing that branch state is a valid subset of
+		 * a fallthrough state. Verifier should reject overall
+		 * validation, unless there is a bug somewhere in verifier
+		 * logic.
+		 */
+		"call %[bpf_get_prandom_u32];"
+		"r6 = r0;"
+		"call %[bpf_get_prandom_u32];"
+		"r7 = r0;"
+
+		"if r6 > r7 goto bad;" /* fork */
+
+		/* spill r6 into stack slot of bpf_iter_num var */
+		"*(u64 *)(%[iter] + 0) = r6;"
+
+		"goto skip_bad;"
+
+	"bad:"
+		/* create iterator in the same stack slot */
+		"r1 = %[iter];"
+		"r2 = 0;"
+		"r3 = 1000;"
+		"call %[bpf_iter_num_new];"
+
+		/* but then forget about it and overwrite it back to r6 spill */
+		"*(u64 *)(%[iter] + 0) = r6;"
+
+	"skip_bad:"
+		"goto +0;" /* force checkpoint */
+
+		/* corrupt stack slots, if they are really dynptr */
+		"*(u64 *)(%[iter] + 0) = r6;"
+		:
+		: __imm_ptr(iter),
+		  __imm_addr(zero),
+		  __imm(bpf_get_prandom_u32),
+		  __imm(bpf_dynptr_from_mem),
+		  ITER_HELPERS
+		: __clobber_common, "r6", "r7"
+	);
+
+	return 0;
+}

From patchwork Wed Mar  8 18:41:20 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166312
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id DFFB1C6FD1F
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:50 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229701AbjCHSls convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:48 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38912 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229776AbjCHSlq (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:46 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2129CB32B8
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:45 -0800 (PST)
Received: from pps.filterd (m0109334.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I0FRg008233
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:45 -0800
Received: from maileast.thefacebook.com ([163.114.130.16])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6feu63fh-2
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:44 -0800
Received: from twshared13315.14.prn3.facebook.com (2620:10d:c0a8:1b::d) by
 mail.thefacebook.com (2620:10d:c0a8:83::5) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:43 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id C1B2F299B760F; Wed,  8 Mar 2023 10:41:36 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 7/8] selftests/bpf: add number iterator tests
Date: Wed, 8 Mar 2023 10:41:20 -0800
Message-ID: <20230308184121.1165081-8-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: 39ijf2xoxNh17Vp-YgwEKXCj3RsDW2va
X-Proofpoint-ORIG-GUID: 39ijf2xoxNh17Vp-YgwEKXCj3RsDW2va
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Add number iterator (bpf_iter_num_{new,next,destroy}()) tests,
validating the correct handling of various corner and common cases
*at runtime*.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/iters.c  |  49 ++++
 tools/testing/selftests/bpf/progs/iters_num.c | 242 ++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/iters_num.c

diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
index 414fb8d82145..2e7caff9523e 100644
--- a/tools/testing/selftests/bpf/prog_tests/iters.c
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -6,10 +6,59 @@
 #include "iters.skel.h"
 #include "iters_state_safety.skel.h"
 #include "iters_looping.skel.h"
+#include "iters_num.skel.h"
+
+static void subtest_num_iters(void)
+{
+	struct iters_num *skel;
+	int err;
+
+	skel = iters_num__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	err = iters_num__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	usleep(1);
+	iters_num__detach(skel);
+
+#define VALIDATE_CASE(case_name)					\
+	ASSERT_EQ(skel->bss->res_##case_name,				\
+		  skel->rodata->exp_##case_name,			\
+		  #case_name)
+
+	VALIDATE_CASE(empty_zero);
+	VALIDATE_CASE(empty_int_min);
+	VALIDATE_CASE(empty_int_max);
+	VALIDATE_CASE(empty_minus_one);
+
+	VALIDATE_CASE(simple_sum);
+	VALIDATE_CASE(neg_sum);
+	VALIDATE_CASE(very_neg_sum);
+	VALIDATE_CASE(neg_pos_sum);
+
+	VALIDATE_CASE(invalid_range);
+	VALIDATE_CASE(max_range);
+	VALIDATE_CASE(e2big_range);
+
+	VALIDATE_CASE(succ_elem_cnt);
+	VALIDATE_CASE(overfetched_elem_cnt);
+	VALIDATE_CASE(fail_elem_cnt);
+
+#undef VALIDATE_CASE
+
+cleanup:
+	iters_num__destroy(skel);
+}
 
 void test_iters(void)
 {
 	RUN_TESTS(iters_state_safety);
 	RUN_TESTS(iters_looping);
 	RUN_TESTS(iters);
+
+	if (test__start_subtest("num"))
+		subtest_num_iters();
 }
diff --git a/tools/testing/selftests/bpf/progs/iters_num.c b/tools/testing/selftests/bpf/progs/iters_num.c
new file mode 100644
index 000000000000..7a77a8daee0d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters_num.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include <limits.h>
+#include <linux/errno.h>
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+const volatile __s64 exp_empty_zero = 0 + 1;
+__s64 res_empty_zero;
+
+SEC("raw_tp/sys_enter")
+int num_empty_zero(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, 0, 0) sum += i;
+	res_empty_zero = 1 + sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_empty_int_min = 0 + 2;
+__s64 res_empty_int_min;
+
+SEC("raw_tp/sys_enter")
+int num_empty_int_min(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, INT_MIN, INT_MIN) sum += i;
+	res_empty_int_min = 2 + sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_empty_int_max = 0 + 3;
+__s64 res_empty_int_max;
+
+SEC("raw_tp/sys_enter")
+int num_empty_int_max(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, INT_MAX, INT_MAX) sum += i;
+	res_empty_int_max = 3 + sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_empty_minus_one = 0 + 4;
+__s64 res_empty_minus_one;
+
+SEC("raw_tp/sys_enter")
+int num_empty_minus_one(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, -1, -1) sum += i;
+	res_empty_minus_one = 4 + sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_simple_sum = 9 * 10 / 2;
+__s64 res_simple_sum;
+
+SEC("raw_tp/sys_enter")
+int num_simple_sum(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, 0, 10) sum += i;
+	res_simple_sum = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_neg_sum = -11 * 10 / 2;
+__s64 res_neg_sum;
+
+SEC("raw_tp/sys_enter")
+int num_neg_sum(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, -10, 0) sum += i;
+	res_neg_sum = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_very_neg_sum = INT_MIN + (__s64)(INT_MIN + 1);
+__s64 res_very_neg_sum;
+
+SEC("raw_tp/sys_enter")
+int num_very_neg_sum(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, INT_MIN, INT_MIN + 2) sum += i;
+	res_very_neg_sum = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_very_big_sum = (__s64)(INT_MAX - 1) + (__s64)(INT_MAX - 2);
+__s64 res_very_big_sum;
+
+SEC("raw_tp/sys_enter")
+int num_very_big_sum(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, INT_MAX - 2, INT_MAX) sum += i;
+	res_very_big_sum = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_neg_pos_sum = -3;
+__s64 res_neg_pos_sum;
+
+SEC("raw_tp/sys_enter")
+int num_neg_pos_sum(const void *ctx)
+{
+	__s64 sum = 0, i;
+
+	bpf_for(i, -3, 3) sum += i;
+	res_neg_pos_sum = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_invalid_range = -EINVAL;
+__s64 res_invalid_range;
+
+SEC("raw_tp/sys_enter")
+int num_invalid_range(const void *ctx)
+{
+	struct bpf_iter_num it;
+
+	res_invalid_range = bpf_iter_num_new(&it, 1, 0);
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+const volatile __s64 exp_max_range = 0 + 10;
+__s64 res_max_range;
+
+SEC("raw_tp/sys_enter")
+int num_max_range(const void *ctx)
+{
+	struct bpf_iter_num it;
+
+	res_max_range = 10 + bpf_iter_num_new(&it, 0, BPF_MAX_LOOPS);
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+const volatile __s64 exp_e2big_range = -E2BIG;
+__s64 res_e2big_range;
+
+SEC("raw_tp/sys_enter")
+int num_e2big_range(const void *ctx)
+{
+	struct bpf_iter_num it;
+
+	res_e2big_range = bpf_iter_num_new(&it, -1, BPF_MAX_LOOPS);
+	bpf_iter_num_destroy(&it);
+
+	return 0;
+}
+
+const volatile __s64 exp_succ_elem_cnt = 10;
+__s64 res_succ_elem_cnt;
+
+SEC("raw_tp/sys_enter")
+int num_succ_elem_cnt(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int cnt = 0, *v;
+
+	bpf_iter_num_new(&it, 0, 10);
+	while ((v = bpf_iter_num_next(&it))) {
+		cnt++;
+	}
+	bpf_iter_num_destroy(&it);
+
+	res_succ_elem_cnt = cnt;
+
+	return 0;
+}
+
+const volatile __s64 exp_overfetched_elem_cnt = 5;
+__s64 res_overfetched_elem_cnt;
+
+SEC("raw_tp/sys_enter")
+int num_overfetched_elem_cnt(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int cnt = 0, *v, i;
+
+	bpf_iter_num_new(&it, 0, 5);
+	for (i = 0; i < 10; i++) {
+		v = bpf_iter_num_next(&it);
+		if (v)
+			cnt++;
+	}
+	bpf_iter_num_destroy(&it);
+
+	res_overfetched_elem_cnt = cnt;
+
+	return 0;
+}
+
+const volatile __s64 exp_fail_elem_cnt = 20 + 0;
+__s64 res_fail_elem_cnt;
+
+SEC("raw_tp/sys_enter")
+int num_fail_elem_cnt(const void *ctx)
+{
+	struct bpf_iter_num it;
+	int cnt = 0, *v, i;
+
+	bpf_iter_num_new(&it, 100, 10);
+	for (i = 0; i < 10; i++) {
+		v = bpf_iter_num_next(&it);
+		if (v)
+			cnt++;
+	}
+	bpf_iter_num_destroy(&it);
+
+	res_fail_elem_cnt = 20 + cnt;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";

From patchwork Wed Mar  8 18:41:21 2023
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Andrii Nakryiko <andrii@kernel.org>
X-Patchwork-Id: 13166310
X-Patchwork-Delegate: bpf@iogearbox.net
Return-Path: <bpf-owner@vger.kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 97599C678D5
	for <bpf@archiver.kernel.org>; Wed,  8 Mar 2023 18:41:48 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S229809AbjCHSlr convert rfc822-to-8bit (ORCPT
        <rfc822;bpf@archiver.kernel.org>); Wed, 8 Mar 2023 13:41:47 -0500
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38822 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S229764AbjCHSlp (ORCPT <rfc822;bpf@vger.kernel.org>);
        Wed, 8 Mar 2023 13:41:45 -0500
Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com
 [67.231.145.42])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5C50DB4F64
        for <bpf@vger.kernel.org>; Wed,  8 Mar 2023 10:41:44 -0800 (PST)
Received: from pps.filterd (m0044012.ppops.net [127.0.0.1])
        by mx0a-00082601.pphosted.com (8.17.1.19/8.17.1.19) with ESMTP id
 328I1E8m024144
        for <bpf@vger.kernel.org>; Wed, 8 Mar 2023 10:41:44 -0800
Received: from mail.thefacebook.com ([163.114.132.120])
        by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3p6fftdvur-1
        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128
 verify=NOT)
        for <bpf@vger.kernel.org>; Wed, 08 Mar 2023 10:41:43 -0800
Received: from twshared21709.17.frc2.facebook.com (2620:10d:c085:108::8) by
 mail.thefacebook.com (2620:10d:c085:21d::7) with Microsoft SMTP Server
 (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id
 15.1.2507.17; Wed, 8 Mar 2023 10:41:43 -0800
Received: by devbig019.vll3.facebook.com (Postfix, from userid 137359)
        id D846A299B7615; Wed,  8 Mar 2023 10:41:38 -0800 (PST)
From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
CC: <andrii@kernel.org>, <kernel-team@meta.com>,
        Tejun Heo <tj@kernel.org>
Subject: [PATCH v5 bpf-next 8/8] selftests/bpf: implement and test custom
 testmod_seq iterator
Date: Wed, 8 Mar 2023 10:41:21 -0800
Message-ID: <20230308184121.1165081-9-andrii@kernel.org>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20230308184121.1165081-1-andrii@kernel.org>
References: <20230308184121.1165081-1-andrii@kernel.org>
MIME-Version: 1.0
X-FB-Internal: Safe
X-Proofpoint-GUID: 4ElseWWoLDw3F1Ytjgaw3YrfLOuv3ewL
X-Proofpoint-ORIG-GUID: 4ElseWWoLDw3F1Ytjgaw3YrfLOuv3ewL
X-Proofpoint-Virus-Version: vendor=baseguard
 engine=ICAP:2.0.254,Aquarius:18.0.942,Hydra:6.0.573,FMLib:17.11.170.22
 definitions=2023-03-08_12,2023-03-08_03,2023-02-09_01
Precedence: bulk
List-ID: <bpf.vger.kernel.org>
X-Mailing-List: bpf@vger.kernel.org
X-Patchwork-Delegate: bpf@iogearbox.net

Implement a trivial iterator returning same specified integer value
N times as part of bpf_testmod kernel module. Add selftests to validate
everything works end to end.

We also reuse these tests as "verification-only" tests to validate that
kernel prints the state of custom kernel module-defined iterator correctly:

  fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)

"testmod_seq" part is an iterator type, and is coming from module's BTF
data dynamically at runtime.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x    |  1 +
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 42 +++++++++-
 .../selftests/bpf/bpf_testmod/bpf_testmod.h   |  6 ++
 .../testing/selftests/bpf/prog_tests/iters.c  | 42 ++++++++++
 .../selftests/bpf/progs/iters_testmod_seq.c   | 79 +++++++++++++++++++
 5 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/iters_testmod_seq.c

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index a02a085e7f32..34cb8b2de8ca 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -8,6 +8,7 @@ dynptr/test_dynptr_skb_data
 dynptr/test_skb_readonly
 fexit_sleep                              # fexit_skel_load fexit skeleton failed                                       (trampoline)
 get_stack_raw_tp                         # user_stack corrupted user stack                                             (no backchain userspace)
+iters/testmod_seq*                       # s390x doesn't support kfuncs in modules yet
 kprobe_multi_bench_attach                # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 kprobe_multi_test                        # relies on fentry
 ksyms_module                             # test_ksyms_module__open_and_load unexpected error: -9                       (?)
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 46500636d8cd..5e6e85c8d77d 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -65,6 +65,34 @@ bpf_testmod_test_mod_kfunc(int i)
 	*(int *)this_cpu_ptr(&bpf_testmod_ksym_percpu) = i;
 }
 
+__bpf_kfunc int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt)
+{
+	if (cnt < 0) {
+		it->cnt = 0;
+		return -EINVAL;
+	}
+
+	it->value = value;
+	it->cnt = cnt;
+
+	return 0;
+}
+
+__bpf_kfunc s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq* it)
+{
+	if (it->cnt <= 0)
+		return NULL;
+
+	it->cnt--;
+
+	return &it->value;
+}
+
+__bpf_kfunc void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it)
+{
+	it->cnt = 0;
+}
+
 struct bpf_testmod_btf_type_tag_1 {
 	int a;
 };
@@ -220,6 +248,17 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 	.write = bpf_testmod_test_write,
 };
 
+BTF_SET8_START(bpf_testmod_common_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_testmod_seq_destroy, KF_ITER_DESTROY)
+BTF_SET8_END(bpf_testmod_common_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_testmod_common_kfunc_ids,
+};
+
 BTF_SET8_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
 BTF_SET8_END(bpf_testmod_check_kfunc_ids)
@@ -235,7 +274,8 @@ static int bpf_testmod_init(void)
 {
 	int ret;
 
-	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set);
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_testmod_common_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set);
 	if (ret < 0)
 		return ret;
 	if (bpf_fentry_test1(0) < 0)
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
index 0d71e2607832..f32793efe095 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
@@ -22,4 +22,10 @@ struct bpf_testmod_test_writable_ctx {
 	int val;
 };
 
+/* BPF iter that returns *value* *n* times in a row */
+struct bpf_iter_testmod_seq {
+	s64 value;
+	int cnt;
+};
+
 #endif /* _BPF_TESTMOD_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
index 2e7caff9523e..10804ae5ae97 100644
--- a/tools/testing/selftests/bpf/prog_tests/iters.c
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -7,6 +7,7 @@
 #include "iters_state_safety.skel.h"
 #include "iters_looping.skel.h"
 #include "iters_num.skel.h"
+#include "iters_testmod_seq.skel.h"
 
 static void subtest_num_iters(void)
 {
@@ -53,12 +54,53 @@ static void subtest_num_iters(void)
 	iters_num__destroy(skel);
 }
 
+static void subtest_testmod_seq_iters(void)
+{
+	struct iters_testmod_seq *skel;
+	int err;
+
+	if (!env.has_testmod) {
+		test__skip();
+		return;
+	}
+
+	skel = iters_testmod_seq__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	err = iters_testmod_seq__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	usleep(1);
+	iters_testmod_seq__detach(skel);
+
+#define VALIDATE_CASE(case_name)					\
+	ASSERT_EQ(skel->bss->res_##case_name,				\
+		  skel->rodata->exp_##case_name,			\
+		  #case_name)
+
+	VALIDATE_CASE(empty);
+	VALIDATE_CASE(full);
+	VALIDATE_CASE(truncated);
+
+#undef VALIDATE_CASE
+
+cleanup:
+	iters_testmod_seq__destroy(skel);
+}
+
 void test_iters(void)
 {
 	RUN_TESTS(iters_state_safety);
 	RUN_TESTS(iters_looping);
 	RUN_TESTS(iters);
 
+	if (env.has_testmod)
+		RUN_TESTS(iters_testmod_seq);
+
 	if (test__start_subtest("num"))
 		subtest_num_iters();
+	if (test__start_subtest("testmod_seq"))
+		subtest_testmod_seq_iters();
 }
diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c
new file mode 100644
index 000000000000..3873fb6c292a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct bpf_iter_testmod_seq {
+	u64 :64;
+	u64 :64;
+};
+
+extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym;
+extern s64 *bpf_iter_testmod_seq_next(struct bpf_iter_testmod_seq *it) __ksym;
+extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym;
+
+const volatile __s64 exp_empty = 0 + 1;
+__s64 res_empty;
+
+SEC("raw_tp/sys_enter")
+__success __log_level(2)
+__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("call bpf_iter_testmod_seq_destroy")
+int testmod_seq_empty(const void *ctx)
+{
+	__s64 sum = 0, *i;
+
+	bpf_for_each(testmod_seq, i, 1000, 0) sum += *i;
+	res_empty = 1 + sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_full = 1000000;
+__s64 res_full;
+
+SEC("raw_tp/sys_enter")
+__success __log_level(2)
+__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("call bpf_iter_testmod_seq_destroy")
+int testmod_seq_full(const void *ctx)
+{
+	__s64 sum = 0, *i;
+
+	bpf_for_each(testmod_seq, i, 1000, 1000) sum += *i;
+	res_full = sum;
+
+	return 0;
+}
+
+const volatile __s64 exp_truncated = 10 * 1000000;
+__s64 res_truncated;
+
+static volatile int zero = 0;
+
+SEC("raw_tp/sys_enter")
+__success __log_level(2)
+__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("call bpf_iter_testmod_seq_destroy")
+int testmod_seq_truncated(const void *ctx)
+{
+	__s64 sum = 0, *i;
+	int cnt = zero;
+
+	bpf_for_each(testmod_seq, i, 10, 2000000) {
+		sum += *i;
+		cnt++;
+		if (cnt >= 1000000)
+			break;
+	}
+	res_truncated = sum;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";