From patchwork Wed Nov 30 08:22:43 2022 X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059536 X-Patchwork-Delegate: bpf@iogearbox.net From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com,
haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 01/31] rhashtable: Allow rhashtable to be used from irq-safe contexts Date: Tue, 29 Nov 2022 22:22:43 -1000 Message-Id: <20221130082313.3241517-2-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org rhashtable currently only does bh-safe synchronization making it impossible to use from irq-safe contexts. Switch it to use irq-safe synchronization to remove the restriction. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/rhashtable.h | 51 ++++++++++++++++++++++---------------- lib/rhashtable.c | 16 +++++++----- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 68dab3e08aad..785fba3464f2 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -324,28 +324,31 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( */ static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long *flags) { - local_bh_disable(); + local_irq_save(*flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); } static inline void rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, - unsigned int subclass) + unsigned int subclass, + unsigned long *flags) { - local_bh_disable(); + local_irq_save(*flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long *flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); - local_bh_enable(); + local_irq_restore(*flags); } static inline struct rhash_head *__rht_ptr( @@ -393,7 +396,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) + struct rhash_head *obj, + unsigned long *flags) { if (rht_is_a_nulls(obj)) obj = NULL; @@ -401,7 +405,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); - local_bh_enable(); + local_irq_restore(*flags); } /** @@ -706,6 +710,7 @@ static inline void *__rhashtable_insert_fast( struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; + unsigned long flags; unsigned int hash; int elasticity; void *data; @@ -720,11 +725,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(tbl, bkt); + rht_lock(tbl, bkt, &flags); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -756,9 +761,9 @@ static inline void *__rhashtable_insert_fast( RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); } else - 
rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, &flags); data = NULL; goto out; } @@ -785,7 +790,7 @@ static inline void *__rhashtable_insert_fast( } atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, &flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -797,7 +802,7 @@ static inline void *__rhashtable_insert_fast( return data; out_unlock: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); goto out; } @@ -991,6 +996,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -999,7 +1005,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + rht_lock(tbl, bkt, &flags); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; @@ -1043,14 +1049,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); } else { - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, &flags); } goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1143,6 +1149,7 @@ static inline int __rhashtable_replace_fast( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -1158,7 +1165,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + rht_lock(tbl, bkt, &flags); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { @@ -1169,15 +1176,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); } else { - rht_assign_unlock(tbl, bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new, &flags); } err = 0; goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); unlocked: return err; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e12bbfb240b8..9781572b2f31 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -231,6 +231,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; + unsigned long flags; if (new_tbl->nest) goto out; @@ -253,13 +254,14 @@ static int rhashtable_rehash_one(struct rhashtable *ht, new_hash = head_hashfn(ht, new_tbl, entry); - rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); + rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], + SINGLE_DEPTH_NESTING, &flags); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, &flags); if (pprev) rcu_assign_pointer(*pprev, next); @@ -276,18 +278,19 @@ static int rhashtable_rehash_chain(struct rhashtable *ht, { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + unsigned long flags; int err; if (!bkt) return 0; - rht_lock(old_tbl, bkt); + rht_lock(old_tbl, bkt, &flags); while (!(err = rhashtable_rehash_one(ht, bkt, 
old_hash))) ; if (err == -ENOENT) err = 0; - rht_unlock(old_tbl, bkt); + rht_unlock(old_tbl, bkt, &flags); return err; } @@ -590,6 +593,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; + unsigned long flags; unsigned int hash; void *data; @@ -607,7 +611,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { - rht_lock(tbl, bkt); + rht_lock(tbl, bkt, &flags); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, @@ -615,7 +619,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, &flags); } } while (!IS_ERR_OR_NULL(new_tbl)); From patchwork Wed Nov 30 08:22:44 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059537 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A9C31C433FE for ; Wed, 30 Nov 2022 08:23:54 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234733AbiK3IXw (ORCPT ); Wed, 30 Nov 2022 03:23:52 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54236 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234763AbiK3IXr (ORCPT ); Wed, 30 Nov 2022 03:23:47 -0500 Received: from mail-pf1-x429.google.com (mail-pf1-x429.google.com [IPv6:2607:f8b0:4864:20::429]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 57C4243AEB; Wed, 30 Nov 2022 00:23:43 -0800 (PST) Received: by mail-pf1-x429.google.com with SMTP id k79so1631136pfd.7; Wed, 30 Nov 2022 00:23:43 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=K1j0A0QkOym5FfXdZ+NvM9qS4irkehDZAEMG1X/6jxM=; b=p031m+vQzbLBfL8ozACACJHzwZiZlwGL0XEKWMgbJLqVECiC1DzktHz+wtua/s/rpd AjEk+8BWWSxQrhJ06FGwz2hOwDKJxQWjrpbXQTDvzvlGDXku5XthSMzZgxZ2zVRK71Vt 2nQVXHgOs1LKMyZGCjUoIeuPkoFmvtAnYQAlwZ9EPnTnRs2tBO7EBPRc/MAFvhZFEFC4 BKW/hpV0Y4aJtSgp8tbhBre80DsR9sNf2JridkuIgJ0S2kcTX2RoNA306iBnG3crXmnH 5Xv/BvTYnnL1IAAudSfqREYp0EJpQ8mWtLd73RvYkhT5WMaa2R28iaL3UiD6PH88HXlC R+oQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=K1j0A0QkOym5FfXdZ+NvM9qS4irkehDZAEMG1X/6jxM=; b=Kq8I50sPCW1UaBkBOG4J7FB1PxhhMI5acoIMzgxmJ61ktAlUT3w/9VXKL9qdA3OgYh LPMBm/q1Qn7yE44GOjamKjk3Ma4+L7NX4yWuWOBx/jBublCVrhVT7Gs95B5SDElS0gCV 9eb4sN1Onhk3WzEl9jCtMBzmoiVdgQkBmlrWEIqJe13YOUzi0BJnDPRdASbx4viI/rRZ HsGh7bQH7RfHANyWCt7bpO0lmbLLSNl0n5rzGc8OUgHqbciGVShnGtEEQZdkuaLO6C3D WKvBIEG4ErHbPXO3J/4nmRTMuIBkKka+rZrAwzTrwqykebfxPvKjwk2tcBk+/oYvizMj NOWg== X-Gm-Message-State: ANoB5pkgS76Mkc3oyarxzRntK1Z+L+y36CazwDg1ohaeiQmlBFoZaEaw Z0NBVS5dqn0n8aiQ0ba+IoI= X-Google-Smtp-Source: 
AA0mqf4lIvT/nRGCo9RyXdTNb2HH4H5ipD+ppktOrDLuv65JRw9ceUvN1RH8I9d7NzciEPnb4jvAig== X-Received: by 2002:a63:fa49:0:b0:476:f92f:885b with SMTP id g9-20020a63fa49000000b00476f92f885bmr42322828pgk.31.1669796622606; Wed, 30 Nov 2022 00:23:42 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id c10-20020a170902c1ca00b0018991f3bfb2sm776692plc.3.2022.11.30.00.23.41 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:42 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 02/31] cgroup: Implement cgroup_show_cftypes() Date: Tue, 29 Nov 2022 22:22:44 -1000 Message-Id: <20221130082313.3241517-3-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Implement cgroup_show_cftypes() which shows and hides all cgroup files associated with the specified set of cgroup file types. CFTYPE_HIDDEN flag is added so that files can be created hidden from the get-go. cgroup_show_cftypes() can be used whether the cftypes are added or not. It also combines with cgroup_show_file() so that a given file is visible iff both its cftype and cfile are visible. This will be used by a new sched_class to selectively show and hide CPU controller interface files depending on whether they're supported. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/cgroup-defs.h | 8 +++ include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 97 ++++++++++++++++++++++++++++++++++--- 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be..8af1e7d487cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -127,12 +127,18 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; +enum cfile_flags { + CFILE_HIDDEN = (1 << 0), /* file instance hidden */ +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. 
This can @@ -140,7 +146,9 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ + struct cftype *cft; struct kernfs_node *kn; + unsigned int flags; unsigned long notified_at; struct timer_list notify_timer; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..14fd0902bd8b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_show_cftype(struct cftype *cft, bool show); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f1e6058089f5..bbfc9388bd7d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4192,10 +4192,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, return ret; } + kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); + if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + cfile->cft = cft; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; @@ -4474,6 +4477,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + return kn; +} + +static bool cfile_visible(struct cgroup_file *cfile) +{ + return !(cfile->cft->flags & CFTYPE_HIDDEN) && + !(cfile->flags & CFILE_HIDDEN); +} + /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset @@ -4483,15 +4504,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); - kn = cfile->kn; - kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + mutex_lock(&cgroup_mutex); - if (kn) - kernfs_show(kn, show); + if (show) + cfile->flags &= ~CFILE_HIDDEN; + else + cfile->flags |= CFILE_HIDDEN; - kernfs_put(kn); + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + + mutex_unlock(&cgroup_mutex); } /** @@ -5505,6 +5531,63 @@ static void offline_css(struct cgroup_subsys_state *css) wake_up_all(&css->cgroup->offline_waitq); } +/** + * cgroup_show_cftype - show or hide a cgroup file type + * @cft: cftype to show or hide + * @show: whether to show or hide + * + * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. + * @cft may or may not be added at the time of this call. After hiding, it's + * guaranteed that there are no in-flight operations on the hidden files. + */ +void cgroup_show_cftype(struct cftype *cft, bool show) +{ + struct cgroup_subsys *ss = cft->ss; + struct cgroup *root = ss ? 
&ss->root->cgrp : &cgrp_dfl_root.cgrp; + struct cgroup_subsys_state *css; + + mutex_lock(&cgroup_mutex); + + if (show) + cft->flags &= ~CFTYPE_HIDDEN; + else + cft->flags |= CFTYPE_HIDDEN; + + if (!(cft->flags & __CFTYPE_ADDED)) + goto out_unlock; + + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + struct kernfs_node *kn; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + if (cft->file_offset) { + struct cgroup_file *cfile = + (void *)css + cft->file_offset; + + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + } else { + char buf[CGROUP_FILE_NAME_MAX]; + + kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, buf)); + if (kn) { + kernfs_show(kn, show); + kernfs_put(kn); + } + } + } + +out_unlock: + mutex_unlock(&cgroup_mutex); +} + /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with From patchwork Wed Nov 30 08:22:45 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059538 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7C8B3C4321E for ; Wed, 30 Nov 2022 08:23:57 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234669AbiK3IXy (ORCPT ); Wed, 30 Nov 2022 03:23:54 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54282 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234654AbiK3IXs (ORCPT ); Wed, 30 Nov 2022 03:23:48 -0500 Received: from mail-pj1-x102a.google.com (mail-pj1-x102a.google.com [IPv6:2607:f8b0:4864:20::102a]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3ECD75FBBA; Wed, 30 Nov 2022 00:23:45 -0800 (PST) Received: by mail-pj1-x102a.google.com with SMTP id t11-20020a17090a024b00b0021932afece4so1193375pje.5; Wed, 30 Nov 2022 00:23:45 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=BRnwnSE0pgMdvenGsAIhpw8rC6avY4LYPdWHnC8Zd84=; b=KqVk2nFS12JlPcPUEdhLECvI1z71JEov8mUnf2l8ZwccxMz4bTbQ6mrVZSMMkV8cg/ d43NmAZd+QvARDxttlLPbhIhC4tzB60TXvTypj2hQ7NTz86ybp3FHqX7NPjXk7p0PDae mvNfsvGt6O8/9ceaL4jZiF+/cZOIHgtye/orITjLxgC1k6EqUoXzZjPjcDAxjy9fgg5u 0BuxteZw2FAvOSZsl1VrN7umTcFpVWEp3c/x8T+NflmyhhSlU6soHmHcnpjVrkjR/8Oh UBPQHQIDRdEn69bVO4wtu6OiGM0c0lZAoTLgIJGMQRI3uxOBV5GSdybtqJt5KMq3ccrC w5Yg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=BRnwnSE0pgMdvenGsAIhpw8rC6avY4LYPdWHnC8Zd84=; b=knn8nGTtShG3youZiy487IpowU1MNJMNJu6AT4hRmHbkvKsJDBzVQHDw7sfP/TyWxD y9yBlH5lwpzjPKeuFWOLzYQ6JsfRzl3SS/KB+IRSmP25PFNA6k/N1Fw2h0lM0lK2b0+q Dvdv3/KoNW5ipMq4frarR9kAvtMtWvFl3nuel2nATnYgQHTEnpgEr4FZY0ksQqy4cVQQ Q+vASQgaivSGi8RkVBlpw/Je88AZ5SdZP6Xdh5ZveK4WEoCn11HnmDksStnU8gRDKcJ9 OZJ4mH5n4XV2q0kZa+l3Rhhpc23t0iuPV/r4/+rkdOEwR767CTVtsl0k/CFVBjBWToJ4 34Pw== X-Gm-Message-State: ANoB5pldzp/7VBEdUJ6IbqTrUHWVs/u2M9qeB7enJuSvvNOSiKzeLZBu 
J+UkEkN0XihcvlbcIOG9r7g= X-Google-Smtp-Source: AA0mqf4lZvxlZSCJ9+DtY5osPPsA+cMeAN3oF5PNd+otYSYGGpOO+tfzw1EQjaQjf4nnHDVJDGbVDQ== X-Received: by 2002:a17:902:e886:b0:188:fb19:5f39 with SMTP id w6-20020a170902e88600b00188fb195f39mr44380705plg.21.1669796624541; Wed, 30 Nov 2022 00:23:44 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id e9-20020aa79809000000b0057507bbd704sm819767pfl.5.2022.11.30.00.23.43 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:44 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 03/31] BPF: Add @prog to bpf_struct_ops->check_member() Date: Tue, 29 Nov 2022 22:22:45 -1000 Message-Id: <20221130082313.3241517-4-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org X-Patchwork-Delegate: bpf@iogearbox.net Pass an extra @prog parameter to bpf_struct_ops->check_member(). This will be used in the future to verify @prog->aux->sleepable per-operation so that a subset of operations in a struct_ops can be sleepable. 
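As an illustration only (not part of this patch), a struct_ops implementation could use the extra parameter along these lines; struct example_ops and its .prep member are made up for the sketch:

/* Hypothetical check_member(): allow sleepable programs only for the
 * made-up .prep operation of a made-up struct example_ops. */
static int example_check_member(const struct btf_type *t,
				const struct btf_member *member,
				struct bpf_prog *prog)
{
	u32 moff = __btf_member_bit_offset(t, member) / 8;

	if (prog->aux->sleepable &&
	    moff != offsetof(struct example_ops, prep))
		return -EINVAL;

	return 0;
}
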
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/bpf.h | 3 ++- kernel/bpf/verifier.c | 2 +- net/ipv4/bpf_tcp_ca.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 798aec816970..82f5c30100fd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1337,7 +1337,8 @@ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(const struct btf_type *t, - const struct btf_member *member); + const struct btf_member *member, + struct bpf_prog *prog); int (*init_member)(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 07c0259dfc1a..b5dba33f8e7d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14928,7 +14928,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } if (st_ops->check_member) { - int err = st_ops->check_member(t, member); + int err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 6da16ae6a962..24c819509b98 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -247,7 +247,8 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, } static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member) + const struct btf_member *member, + struct bpf_prog *prog) { if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) return -ENOTSUPP; From patchwork Wed Nov 30 08:22:46 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059539 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id DEE0FC433FE for ; Wed, 30 Nov 2022 08:24:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234777AbiK3IYX (ORCPT ); Wed, 30 Nov 2022 03:24:23 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54984 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234768AbiK3IXy (ORCPT ); Wed, 30 Nov 2022 03:23:54 -0500 Received: from mail-pf1-x433.google.com (mail-pf1-x433.google.com [IPv6:2607:f8b0:4864:20::433]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3B687654C7; Wed, 30 Nov 2022 00:23:47 -0800 (PST) Received: by mail-pf1-x433.google.com with SMTP id z17so11344071pff.1; Wed, 30 Nov 2022 00:23:47 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=1JjbQFqfdsaVhZqtFfQSRXxMBSehwkQCfe7x4A4m6D0=; b=EqvLmZuOXH1Zu4wV8SRbBjPk6VDpBCHsg+vQDcTngJACJKrZ/EG8ThdJydry1OyED5 aqP1XIZbP3/xA1c0VRJ0+xB4iMIw9gMn3hd/uvGfPoJ4ajT2r3UdPTExbybW/OFOOq3I pnpUfWXJlUkihue1VSjbbqZGJiXLEbsFmDSnn2o6/MWcCL8LJliZeN1f6F6vn+W6+oCe MM3aMX+nDAOyB3Ct8ua63fXfF3b0oZNXvDtF499k7ElWAq4GSirE8n3nCtJeTwTQrr/p gBn+xpd1VyGomr4YSlAIPOisotD/Tsw98Oja+KxKyDDLwu5axVENTJIgtRzwIDJsQsbO rZfA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; 
From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 04/31] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() Date: Tue, 29 Nov 2022 22:22:46 -1000 Message-Id: <20221130082313.3241517-5-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org A new sched_class needs a bit more control over forking. This patch makes the following changes: * Add sched_cancel_fork() which is called if fork fails after sched_fork() succeeds so that the preparation can be undone. * Allow sched_cgroup_fork() to fail. Neither is used yet and this patch shouldn't cause any behavior changes.
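For readers skimming the diff, the resulting copy_process() flow is roughly the following (abridged from the fork.c hunks below; unrelated steps and labels omitted):

	retval = sched_cgroup_fork(p, args);	/* can now fail */
	if (retval)
		goto bad_fork_cancel_cgroup;
	...
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_sched_cancel_fork:
	sched_cancel_fork(p);	/* undo sched_fork() preparation */
bad_fork_cleanup_policy:
	lockdep_free_task(p);
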
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/task.h | 3 ++- kernel/fork.c | 15 ++++++++++----- kernel/sched/core.c | 8 +++++++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d6c48163c6de..b5ff1361ac8d 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -58,7 +58,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d..a90c6a4938c6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2226,7 +2226,7 @@ static __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2367,7 +2367,9 @@ static __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2419,13 +2421,13 @@ static __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } init_task_pid_links(p); @@ -2492,10 +2494,11 @@ static __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2534,6 +2537,8 @@ static __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cb2aa2b54c7a..85eb82ad2ffd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4604,7 +4604,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4631,6 +4631,12 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return 0; +} + +void sched_cancel_fork(struct task_struct *p) +{ } void sched_post_fork(struct task_struct *p) From patchwork Wed 
Nov 30 08:22:47 2022 X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059540 X-Patchwork-Delegate: bpf@iogearbox.net From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com,
dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 05/31] sched: Add sched_class->reweight_task() Date: Tue, 29 Nov 2022 22:22:47 -1000 Message-Id: <20221130082313.3241517-6-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Currently, during a task weight change, sched core directly calls reweight_task() defined in fair.c if @p is on CFS. Let's make it a proper sched_class operation instead. CFS's reweight_task() is renamed to reweight_task_fair() and now called through sched_class. While it turns a direct call into an indirect one, set_load_weight() isn't called from a hot path and this change shouldn't cause any noticeable difference. This will be used to implement reweight_task for other scheduler classes. This will be used by a new sched_class to track weight changes. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 85eb82ad2ffd..70ec74dbb45a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1275,8 +1275,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); + if (update_load && p->sched_class->reweight_task) { + p->sched_class->reweight_task(task_rq(p), p, prio); } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4a0b8bd941c..78263cef1ea8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3330,7 +3330,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -12176,6 +12176,7 @@ DEFINE_SCHED_CLASS(fair) = { .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a4a20046e586..08799b2a566e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2193,6 +2193,8 @@ struct sched_class { */ void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2345,8 +2347,6 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); From patchwork Wed Nov 30 08:22:48 2022 Content-Type: text/plain; charset="utf-8" 
X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059541 X-Patchwork-Delegate: bpf@iogearbox.net From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu,
riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 06/31] sched: Add sched_class->switching_to() and expose check_class_changing/changed() Date: Tue, 29 Nov 2022 22:22:48 -1000 Message-Id: <20221130082313.3241517-7-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org When a task switches to a new sched_class, the prev and new classes are notified through ->switched_from() and ->switched_to(), respectively, after the switching is done. However, a new sched_class needs to prepare the task state before it is enqueued on the new class for the first time. This patch adds ->switching_to() which is called during sched_class switch through check_class_changing() before the task is restored and exposes check_class_changing/changed() in kernel/sched/sched.h. This is a prep patch and doesn't cause any behavior changes. The new operation and exposed functions aren't used yet. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 20 +++++++++++++++++--- kernel/sched/sched.h | 7 +++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 70ec74dbb45a..d2247e8144e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2151,6 +2151,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. @@ -2158,9 +2169,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -6974,6 +6985,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7603,6 +7615,8 @@ static int __sched_setscheduler(struct task_struct *p, } __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); + if (queued) { /* * We enqueue to tail when the priority of a task is diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08799b2a566e..3f98773d66dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2191,6 +2191,7 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. 
*/ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, @@ -2427,6 +2428,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT From patchwork Wed Nov 30 08:22:49 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059542 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5F56FC433FE for ; Wed, 30 Nov 2022 08:24:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234856AbiK3IY1 (ORCPT ); Wed, 30 Nov 2022 03:24:27 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55366 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234832AbiK3IX7 (ORCPT ); Wed, 30 Nov 2022 03:23:59 -0500 Received: from mail-pl1-x634.google.com (mail-pl1-x634.google.com [IPv6:2607:f8b0:4864:20::634]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 329105ADE3; Wed, 30 Nov 2022 00:23:53 -0800 (PST) Received: by mail-pl1-x634.google.com with SMTP id y17so8057061plp.3; Wed, 30 Nov 2022 00:23:53 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=pPSOyYDQAZPjQr3nZNHPmM2AV/KaTKSr8+C9zQ01Fn8=; b=ctsgL0qNfr78PSmSyKd17UDDhrhbsslLQCw6oULN9EjajmQ9CwZuJ0pPlTTogyWu5P DN8QWkwS9hQ5chL74cvDCOfeY4q6hy3IPIW6bNgqdjZCIwHTOflnBeXxUJa50zdAvL7j Hdhz0zlHECfMJ4yjmBnEtaS04/rEbWCwrbFmN/zR2uHg8FWt6h78ItlR3jrgXl5IsahX +ddCcitqlzkXgoggWJ1G1G2TiLUGzewyNgJPitEjyC747c2nNbLNOWS8X1DKRPUoBXKl esyExhQz6ubqT1pMmW+otKXuDmJePQ5pb35bwVsEPe6YW8SI9XoFQnA7q+RD4Avez03i Z2xg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=pPSOyYDQAZPjQr3nZNHPmM2AV/KaTKSr8+C9zQ01Fn8=; b=E3hQJWd0am7xVi4V2+Oiz5bf74+aYB4x2CmmPJ/06HpSJazQPcbOgH8/LYtaVDQFhF qeVniWpcLTj/I8c62GeteZKkugTMvCHgj6DCaNfhgI0SViWEba4fl7UxiUyb0MEXOcxd TOQyhWAcYnbxOhFVFEmA4Z+a/HDpdrq3rq9G33pfWcE2RN0telysL4j1BOz64+wib8yW VpyMVpVqZMvI9rVprNo0r386l1Pr9DgxLdwXSTFQNQdDanX3xQGTcE8gNwstqqBmuDqr swOnzME2X/Rdm7+Lu9g143CrG3wOHPEZug/RiFSh8PSpzaqqSIRP1eoCzOYO8XUoMFQf 8VgA== X-Gm-Message-State: ANoB5pmQjAMM6xW2JY0Y9BuiEdhRHRvWoDcr3DDDE9BzkQgV1TxA4tO9 Pu/U//7TUR+YlRT8eSQjQms= X-Google-Smtp-Source: AA0mqf6TSiqAsJmOgLB1bR7+c2GdJ5ZMIEeQiHYjyDxdUosSey3eNQ6xWL6FHEbuinS6IAWW5LUeEA== X-Received: by 
2002:a17:90a:8b03:b0:213:16d2:4d4c with SMTP id y3-20020a17090a8b0300b0021316d24d4cmr64678734pjn.70.1669796632491; Wed, 30 Nov 2022 00:23:52 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id c29-20020aa7953d000000b0057524960947sm816402pfp.39.2022.11.30.00.23.51 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:52 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 07/31] sched: Factor out cgroup weight conversion functions Date: Tue, 29 Nov 2022 22:22:49 -1000 Message-Id: <20221130082313.3241517-8-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Factor out sched_weight_from/to_cgroup() which convert between scheduler shares and cgroup weight. No functional change. The factored out functions will be used by a new sched_class. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 28 +++++++++++++--------------- kernel/sched/sched.h | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2247e8144e3..cc5d1867bb4c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10977,29 +10977,27 @@ static int cpu_extra_stat_show(struct seq_file *sf, } #ifdef CONFIG_FAIR_GROUP_SCHED + +static unsigned long tg_weight(struct task_group *tg) +{ + return scale_load_down(tg->shares); +} + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. 
- */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); return sched_group_set_shares(css_tg(css), scale_load(weight)); } @@ -11007,7 +11005,7 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3f98773d66dd..31f008f1e0cc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -435,6 +435,24 @@ struct task_group { #define MAX_SHARES (1UL << 18) #endif +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + typedef int (*tg_visitor)(struct task_group *, void *); extern int walk_tg_tree_from(struct task_group *from, From patchwork Wed Nov 30 08:22:50 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059543 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 21630C433FE for ; Wed, 30 Nov 2022 08:24:41 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235070AbiK3IYi (ORCPT ); Wed, 30 Nov 2022 03:24:38 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54360 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234858AbiK3IYU (ORCPT ); Wed, 30 Nov 2022 03:24:20 -0500 Received: from mail-pf1-x435.google.com (mail-pf1-x435.google.com [IPv6:2607:f8b0:4864:20::435]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 291BF68C70; Wed, 30 Nov 2022 00:23:55 -0800 (PST) Received: by mail-pf1-x435.google.com with SMTP id q12so12076737pfn.10; Wed, 30 Nov 2022 00:23:55 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=5PkMg2emfhFv69qdb2Amxj+15eTI7uc+9BwnY9bGLjs=; b=YQYjuZ+fbLPlhNrwiRsXT4329Ax25y8medICOYHHg+y5W8AEbfbfBXz3pl3oQ+PGfD /g0ShPm0wNAdQNSzxxwPrbM/aYNfF3lorg9TJnCb9sSJbm9vs+6bbU1VTfmjy4ci0w3z zsPFWF+P0NO+kW4dvJxshAMcjLzwVKRbl5u870x962WqQ9H7gKPb1Ma5Mq/VPYFJubkF duPm8+RBMiOn0hs0Q4NjtM5+4UIh4noWtroB+QNLTAVPQFiAjRTfU5NLGbkBQtDTkgY9 fvHvz4KkpYoDFvG+X5qH4b/lp88k0zK/eHBKX+vRTt9JRcQQLTC2Q14DSURS649cd/By 1/Ww== 
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=5PkMg2emfhFv69qdb2Amxj+15eTI7uc+9BwnY9bGLjs=; b=6eVB7hyPDHm8/xHlQ/MC7HZ0c++llU4u9zjeJZnlilcUoVY5XavxaeTszR50dAkQYD MRBC41jgED055RpOWp/SlEy1QGEY1rSpGKMw9OiAAuLu3xzUDLaCt+1LLDsaMRWikDLp c+dHoN6cmKivm1XGpMPjw0BhO6xh9pz2stzm+B8utCtwPB+LA0yZ03lRb1D9073k+HW7 JehpYqYpTkK0xfEjtKB4hYexugJKvvrwACzPv2YHVxHPRWm28Fup/smQFO2aQOxicg1W 8tqMh+p3ZsC2whWgYv/lBg49f4Nm6cfV2f9MN40nne/3L/Sa0PHepUhJ2vDs56cknhkX eXcQ== X-Gm-Message-State: ANoB5pnni8cUAywCZf4Z5n0WKg9Q4BdoHze4QGUG5dg+eo+SwCOZku4E Xu5lbcAoCNoO2J9xYIC2nxM= X-Google-Smtp-Source: AA0mqf5khfJpBB2SGa+YzS1bCnTZccWpOPVLVk2luE01pJ/m4ZfD3x26EBjSK6mJ5P9b9AueQZ2ttQ== X-Received: by 2002:a05:6a00:1348:b0:56b:e27f:76ee with SMTP id k8-20020a056a00134800b0056be27f76eemr41494660pfu.31.1669796634425; Wed, 30 Nov 2022 00:23:54 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id w63-20020a626242000000b005624e2e0508sm785690pfb.207.2022.11.30.00.23.53 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:54 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 08/31] sched: Expose css_tg() and __setscheduler_prio() in kernel/sched/sched.h Date: Tue, 29 Nov 2022 22:22:50 -1000 Message-Id: <20221130082313.3241517-9-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org These will be used by a new sched_class. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 7 +------ kernel/sched/sched.h | 7 +++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc5d1867bb4c..ec3af657e8f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6845,7 +6845,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; @@ -10245,11 +10245,6 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, tsk, &rf); } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? 
container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31f008f1e0cc..5d12c386aac6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -469,6 +469,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); @@ -493,6 +498,8 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern long sched_group_rt_period(struct task_group *tg); extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); +extern void __setscheduler_prio(struct task_struct *p, int prio); + extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, struct task_group *parent); From patchwork Wed Nov 30 08:22:51 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059544 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 437B5C4321E for ; Wed, 30 Nov 2022 08:24:55 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234685AbiK3IYx (ORCPT ); Wed, 30 Nov 2022 03:24:53 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54960 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234890AbiK3IYW (ORCPT ); Wed, 30 Nov 2022 03:24:22 -0500 Received: from mail-pl1-x633.google.com (mail-pl1-x633.google.com [IPv6:2607:f8b0:4864:20::633]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 45E2866CB7; Wed, 30 Nov 2022 00:23:57 -0800 (PST) Received: by mail-pl1-x633.google.com with SMTP id w23so15930224ply.12; Wed, 30 Nov 2022 00:23:57 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=ezcFrNZOthARI59n5xWOuY0GggDE8K93SHPg6RLEJIk=; b=OAUSKTsMnw6D9ofbCbJTttCYAs8Ns2RXG/lIu5a97vR4xsJWDqL+NAcPIattVOiXkH AACPoBRFYcUI5CO/fefjTLdgKGlRFi4DCr38z+cgHVx7rWXeb/cnX1+4L1LNJCtZuWJz IXOkhF/9S0vEmnhXtC3cRtp/mlJf4dL/l6q8up1M8ErP+MdQCZD5lwBmaOiIiRq6OQjD 9zFKCZ+dXptdqYMp/VYiUyQF1GuFHkhiTt5a4ibxuupZgdBTR+KcP5W8k1Op7XrB6IFt I2foOGOuulMvc3Hb3kPTDQuJzRgElRI9YoYgWRWv+KXkj6gKw7Q3O+WM7j05chzMTHJq Qlow== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=ezcFrNZOthARI59n5xWOuY0GggDE8K93SHPg6RLEJIk=; b=6AqVpQssLXusMMHJ8Gyl+g9nvt97HVGsymYlIkhIBAUDdzlzBSv2OC6Tbhv4hYKcNs /cLKEFX84mceAwyozuOyRUNwNQZsxm3XHvv1Ot4T85EKo4HoDRBwJ7CgojcUoCH/g6lf haKw1k+2Yz3JnRtbdTVNktHmxOaLvAil/lOQIy2xzETtKOp6H3qbperV83Pp9B/sEdDS OHvDEWIwABuVfRORHCPsWDG3Nj+V09XnZU+5P6Xlegk3twcwtkEtS7j77RKYUdQwmg4U 
2jPXOE8xVoAV1Ko2plIWIEAlAO86e66/egwU4PLqXNLSSmNDOmzhJ7lNgBXbv0wTsZCx PL9w== X-Gm-Message-State: ANoB5pmvBN4W4jvlJTcRZsqZ4sMzKfWvUsiXF0STnk9tiqE8W3iVtki6 W1ZplWAGJP82Jz+KkAPDxSU= X-Google-Smtp-Source: AA0mqf63YY8Rxnd2rTCHKISEopuPr/dvBmgAxTwgensMQmlxphR+I1tgf8BAkmooo+b7L886iQkCZw== X-Received: by 2002:a17:902:8c8e:b0:188:d588:34f2 with SMTP id t14-20020a1709028c8e00b00188d58834f2mr54836041plo.15.1669796636539; Wed, 30 Nov 2022 00:23:56 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id pw9-20020a17090b278900b00210c84b8ae5sm689952pjb.35.2022.11.30.00.23.55 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:56 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 09/31] sched: Enumerate CPU cgroup file types Date: Tue, 29 Nov 2022 22:22:51 -1000 Message-Id: <20221130082313.3241517-10-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Rename cpu[_legacy]_files to cpu[_legacy]_cftypes for clarity and add cpu_cftype_id which enumerates every cgroup2 interface file type. This doesn't make any functional difference now. The enums will be used to access specific cftypes by a new sched_class to selectively show CPU controller interface files. 
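As a rough sketch of the intent (not part of this patch), once cpu_cftypes[] is laid out by cpu_cftype_id, other scheduler code can refer to a specific cgroup2 interface file by id instead of matching on its name string. The helper below is hypothetical and assumes CONFIG_FAIR_GROUP_SCHED so that CPU_CFTYPE_WEIGHT exists:

  /* Hypothetical helper, for illustration only. */
  static const char *cpu_cftype_name(enum cpu_cftype_id id)
  {
  	/* cpu_cftypes[] is indexed by cpu_cftype_id, see kernel/sched/core.c */
  	return cpu_cftypes[id].name;
  }

  /* e.g. cpu_cftype_name(CPU_CFTYPE_WEIGHT) evaluates to "weight" */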
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 22 +++++++++++----------- kernel/sched/sched.h | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec3af657e8f4..735e94bc7dbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10882,7 +10882,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif -static struct cftype cpu_legacy_files[] = { +static struct cftype cpu_legacy_cftypes[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -11089,21 +11089,21 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, } #endif -static struct cftype cpu_files[] = { +struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { #ifdef CONFIG_FAIR_GROUP_SCHED - { + [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, }, - { + [CPU_CFTYPE_WEIGHT_NICE] = { .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, - { + [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, @@ -11111,13 +11111,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { + [CPU_CFTYPE_MAX] = { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .write = cpu_max_write, }, - { + [CPU_CFTYPE_MAX_BURST] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, @@ -11125,13 +11125,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP - { + [CPU_CFTYPE_UCLAMP_MIN] = { .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, }, - { + [CPU_CFTYPE_UCLAMP_MAX] = { .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, @@ -11151,8 +11151,8 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_cftypes, + .dfl_cftypes = cpu_cftypes, .early_init = true, .threaded = true, }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5d12c386aac6..157eeabca5db 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3215,4 +3215,25 @@ static inline void update_current_exec_runtime(struct task_struct *curr, cgroup_account_cputime(curr, delta_exec); } +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#ifdef CONFIG_FAIR_GROUP_SCHED + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + #endif /* _KERNEL_SCHED_SCHED_H */ From patchwork Wed Nov 30 08:22:52 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059545 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 69413C4321E 
for ; Wed, 30 Nov 2022 08:25:00 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234794AbiK3IY6 (ORCPT ); Wed, 30 Nov 2022 03:24:58 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55066 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234939AbiK3IYY (ORCPT ); Wed, 30 Nov 2022 03:24:24 -0500 Received: from mail-pf1-x430.google.com (mail-pf1-x430.google.com [IPv6:2607:f8b0:4864:20::430]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 41E6D6B3A4; Wed, 30 Nov 2022 00:23:59 -0800 (PST) Received: by mail-pf1-x430.google.com with SMTP id o1so11607049pfp.12; Wed, 30 Nov 2022 00:23:59 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=Gxv++NBnFpdmWnlSJbtHUBmuUdRGOnW9DgpB9Iflias=; b=PUfmJVBIHHYPYEhAS6R2x9XGUCDE+Carvu4xp0kRrUUWyPtZhoudJ+UNr44LBXF2oZ /3uLF+zPKD3Tzwo5cqM6mOxsrTxC1Ln/8UALs9GvigOvKHwZFV3aP8kTZQAPdFk9gB41 Io4OSqlNQVrl3hJ5op9KpGnkk+70pBtHajl+DKn9S74EeBMk1+ngPJKEIHxtIXgfmPW0 HkBwVlgQrlPs/3acXQO3UtaoJ+0EvFDM7sIZDD74dVevV2wl6gTmesCPeOl5+e6q3mFj 3ImKlSYxpoIYgmmvMeyC7ZWpjE/VBGgCKp1FtHBm/3gJb/OOT3r++gNu8HvruNUKWiFM fedw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=Gxv++NBnFpdmWnlSJbtHUBmuUdRGOnW9DgpB9Iflias=; b=Yz8cA+SoBATZnM23r++pKdSiucUfdXug/oVkZbiHiziBwnkbyL9OVMBittXZYrZw3V xRvKZ9tX5vbQIWdCV7HiO7CFkokK/F1E0Z9gzfBl6bVc3h7+MMvZh1sIDzAWU9qGN/XG jPNA/M3qLMEDIVihEPAAOL53WrGjhGrTNuEi9A875nsgHjH0FNLSBigm+cZ6AyanOb5o sSuhZ733F7I0JJSdlP4a0EVzotmt4WeIqNwlqEMdBKFkx+VkBpFCKpFsw36QhBVhFH28 ikg1FulYcFyiOvKH87z9EbRQ83fc2RQmFxlk3asZUAAae+40UGb8+I5Wm5v1d9iRUecn YT9Q== X-Gm-Message-State: ANoB5pkJ0tR3Kj9m0xyq9UDDc0EX/XHuOnuQZZHoI7vCrEineUPqaWjI ZSn/HN3Rzck/BFffaZl55LA= X-Google-Smtp-Source: AA0mqf63XMQjNSEmX6yr63Qw7tRmJcXsN1K9X4Efcne2YjnQusUApnFeyHjLfZ9GCL1LF3xSRpu+tQ== X-Received: by 2002:a65:458e:0:b0:477:e813:1795 with SMTP id o14-20020a65458e000000b00477e8131795mr21863122pgq.54.1669796638582; Wed, 30 Nov 2022 00:23:58 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id d6-20020a170902654600b00172b87d9770sm761887pln.81.2022.11.30.00.23.57 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:23:58 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 10/31] sched: Add @reason to sched_class->rq_{on|off}line() Date: Tue, 29 Nov 2022 22:22:52 -1000 Message-Id: <20221130082313.3241517-11-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: 
<20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org ->rq_{on|off}line are called either during CPU hotplug or cpuset partition updates. Let's add an argument to distinguish the two cases. The argument will be used by a new sched_class to track CPU hotplug events. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 12 ++++++------ kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 13 +++++++++---- kernel/sched/topology.c | 4 ++-- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 735e94bc7dbb..0699b49b1a21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9286,7 +9286,7 @@ static inline void balance_hotplug_wait(void) #endif /* CONFIG_HOTPLUG_CPU */ -void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) { if (!rq->online) { const struct sched_class *class; @@ -9296,19 +9296,19 @@ void set_rq_online(struct rq *rq) for_each_class(class) { if (class->rq_online) - class->rq_online(rq); + class->rq_online(rq, reason); } } } -void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) { if (rq->online) { const struct sched_class *class; for_each_class(class) { if (class->rq_offline) - class->rq_offline(rq); + class->rq_offline(rq, reason); } cpumask_clear_cpu(rq->cpu, rq->rd->online); @@ -9404,7 +9404,7 @@ int sched_cpu_activate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9449,7 +9449,7 @@ int sched_cpu_deactivate(unsigned int cpu) if (rq->rd) { update_rq_clock(rq); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ae8f41e3372..f63e5d0c5fb1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2519,7 +2519,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, } /* Assumes rq->lock is held */ -static void rq_online_dl(struct rq *rq) +static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_set_overload(rq); @@ -2530,7 +2530,7 @@ static void rq_online_dl(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_dl(struct rq *rq) +static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_clear_overload(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78263cef1ea8..6cad7d07186b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11436,14 +11436,14 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } -static void rq_online_fair(struct rq *rq) +static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); update_runtime_enabled(rq); } -static void rq_offline_fair(struct rq *rq) +static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ed2a47e4ddae..0fb7ee087669 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2470,7 +2470,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) } /* Assumes rq->lock is held */ -static void 
rq_online_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -2481,7 +2481,7 @@ static void rq_online_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_clear_overload(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 157eeabca5db..3c6ea8296ae4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2169,6 +2169,11 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + struct sched_class { #ifdef CONFIG_UCLAMP_TASK @@ -2201,8 +2206,8 @@ struct sched_class { const struct cpumask *newmask, u32 flags); - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); + void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif @@ -2726,8 +2731,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) raw_spin_rq_unlock(rq1); } -extern void set_rq_online (struct rq *rq); -extern void set_rq_offline(struct rq *rq); +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); extern bool sched_smp_initialized; #else /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8739c2a5a54e..0e859bea1cb6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -493,7 +493,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); cpumask_clear_cpu(rq->cpu, old_rd->span); @@ -511,7 +511,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_TOPOLOGY); raw_spin_rq_unlock_irqrestore(rq, flags); From patchwork Wed Nov 30 08:22:53 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059546 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 773F6C4321E for ; Wed, 30 Nov 2022 08:25:10 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234812AbiK3IZJ (ORCPT ); Wed, 30 Nov 2022 03:25:09 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55836 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234768AbiK3IY0 (ORCPT ); Wed, 30 Nov 2022 03:24:26 -0500 Received: from mail-pg1-x534.google.com (mail-pg1-x534.google.com [IPv6:2607:f8b0:4864:20::534]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 5B577697F3; Wed, 30 Nov 2022 00:24:01 -0800 (PST) Received: by mail-pg1-x534.google.com with SMTP id 6so15376487pgm.6; Wed, 30 Nov 2022 00:24:01 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; 
h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=7y4gENgPynEKBUdFJwGmBAsFu1euUGtdtu24JG61qwA=; b=CJvW8ygBlZVDZ74fJVPYgS2dv/GKHVTIJmr5+sz1/txVMlnhk8dVxjsGp3Wxhkee+P 72OXixyd+hEQSEnN2v4DfzI6r6VaxPTpGRPiVAkgg8tz5g5EeD3CFhh5ncJVRWQWOker ZsrqidAXjSwZvnhsVs2P5a7/QYBNEYsfTWABSprkg8Xl2i2kAXlUEaoVh3FWYlFkUMSq 2H+p11u0QwvfSukVrlFAhqwLGJhLdHz1s9NohzuM9MpambkBDf3rvHnhDrRAnvt3evl6 WZpfmBeT/pLW5J8klGCZwCxA1QZOa/BtKgN7rCZGq9nFTo+oSw4mFUCwJDmTuFObEbk3 3RaA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=7y4gENgPynEKBUdFJwGmBAsFu1euUGtdtu24JG61qwA=; b=W2jkhv3jBJ6zbe1ItpR2Xv5Coy4LreYhq21gB3fe5KSpwzENagsVykE3I0KoJr0Nwm OvNVimq6EhDCDBf3cGlagHjSbi4GuZB9oWEd9Rzms4lqjm+gnoI9f0vkc8/kHaZ2KJpz 6lvMpAy5YJIOvCXGJDKahtYdUniVhmTDxlhdn8rml+pxzUBu7a6Q8KObn/L3xhxQQMZ7 689g+U7L2+LSt4cSISH6ZpANgnCGXOtMNh1SCPgQVy67YZ+nAONUv/v7RuepPnazwcDy UWi+kejEj+rX3VnJ0mAETxeKJRfO7agqFYf/CTBd28TyONz2NYF3xmOnsurN8Oy9hcu6 egOQ== X-Gm-Message-State: ANoB5plC7Z9i0o3TfcT+b2Xu1inKNWgciIPJNg9NlEjaQbJEHlNdUzrg CkbgyyJOsyGEbQFqeLvYBN4= X-Google-Smtp-Source: AA0mqf4bvkU+fnRZ8mV+saHzyKt7pe/mzs18ZpR5lZIbv6rSS0Vb8lLdcliU5/9E7HAY9a+REmm/RA== X-Received: by 2002:a05:6a00:c5:b0:56b:a4f6:e030 with SMTP id e5-20020a056a0000c500b0056ba4f6e030mr43147233pfj.85.1669796640619; Wed, 30 Nov 2022 00:24:00 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id o11-20020a1709026b0b00b00186616b8fbasm774285plk.10.2022.11.30.00.23.59 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:00 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 11/31] sched: Add @reason to sched_move_task() Date: Tue, 29 Nov 2022 22:22:53 -1000 Message-Id: <20221130082313.3241517-12-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org sched_move_task() can be called for both cgroup and autogroup moves. Add a parameter to distinguish the two cases. This will be used by a new sched_class to track cgroup migrations. 
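Purely for illustration (not part of this patch), a consumer of the new argument could react to cgroup migrations while ignoring autogroup churn; the function below is hypothetical:

  /* Hypothetical consumer, illustration only. */
  static void note_task_move(struct task_struct *tsk,
  			   enum sched_move_task_reason reason)
  {
  	/* only cgroup migrations are interesting here */
  	if (reason == SCHED_MOVE_TASK_CGROUP)
  		trace_printk("pid %d moved between cgroups\n", tsk->pid);
  }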
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/autogroup.c | 4 ++-- kernel/sched/core.c | 4 ++-- kernel/sched/sched.h | 7 ++++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..2be1b10ce93e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -151,7 +151,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, SCHED_MOVE_TASK_AUTOGROUP); } static void @@ -183,7 +183,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, SCHED_MOVE_TASK_AUTOGROUP); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0699b49b1a21..9c5bfeeb30ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10210,7 +10210,7 @@ static void sched_change_group(struct task_struct *tsk) * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, enum sched_move_task_reason reason) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -10321,7 +10321,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, SCHED_MOVE_TASK_CGROUP); } #ifdef CONFIG_UCLAMP_TASK_GROUP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c6ea8296ae4..ef8da88e677c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -506,7 +506,12 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +enum sched_move_task_reason { + SCHED_MOVE_TASK_CGROUP, + SCHED_MOVE_TASK_AUTOGROUP, +}; +extern void sched_move_task(struct task_struct *tsk, + enum sched_move_task_reason reason); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); From patchwork Wed Nov 30 08:22:54 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059547 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D6D5AC433FE for ; Wed, 30 Nov 2022 08:25:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235044AbiK3IZW (ORCPT ); Wed, 30 Nov 2022 03:25:22 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54360 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235041AbiK3IYh (ORCPT ); Wed, 30 Nov 2022 03:24:37 -0500 Received: from mail-pf1-x436.google.com (mail-pf1-x436.google.com [IPv6:2607:f8b0:4864:20::436]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 69DF069DC6; Wed, 30 Nov 2022 00:24:03 -0800 (PST) Received: by 
mail-pf1-x436.google.com with SMTP id w129so16152364pfb.5; Wed, 30 Nov 2022 00:24:03 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=CyCwssOUByk9u3EvR0soF76EBPw2aKcIpdc6kp+NXfg=; b=EtCiSwOIRsVfBunVEbjhjRdt6RlsAnVVQzGbQRtleJ2RxML28cNc0t3/0YbKwkFXsU 4fdlbdgJ5MOoWwAiRE+8jKpOYB/xvVazm/Tdl+GcSYLiqJA4NcwHOzRsx2oesPla8++a BGkLZoaixYeq7vxmOQY7snFvJoH0Y9phfrpGwWqGMXocIv/cu3hetCHhaCB6nk5TJePR XjeZVFr4YlRfQWX8iIom+me0e8zF85/GlIh73q2ljNB5J5mQB95hRoJL9r9b6E4vGpE+ aOCCyKFYQ82c4jdjFTt2OoMLGWExM8zR9EvSvJ4NLOuU0UIPIlaUN2OyaQig0nncKV/O jkXg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=CyCwssOUByk9u3EvR0soF76EBPw2aKcIpdc6kp+NXfg=; b=HUiHidjjopUnOYWujMUmpy2Jjc6fQL+5DUkcPlQk1b3JbgiNLPhWckGPSYklj9MX9/ yiWOtqKxN8k0cw36x1QkEuvCHqBiCkH8zlMu/SolP8eEZ3wQ0u3JXwXNr2W+0osfTOtH r4rfpH+BwGVGEhxMazrz1AmhXuZ15nJp7YaZ7lYioHTGT06zwVyrTh/aSvFzKlhzZy9P cS97gEI/CeIzgjYlPs164dqO0rsvdzb0Vq86LXfxL4zzHb7G4YR9Jt0j+hmZXDaF1D9w 04DZPaPlArjeKQy3M7JrJrp6D8D4VxYNOXM3oDHw4EjnB8fM5PuJ+aZCCWgTpBda2Ma6 JP9Q== X-Gm-Message-State: ANoB5pngOy+4RRnvGu1jMZFkq0Pueencu2NDL6BwOTZoBH7npDhcHoCk lZlNudnfEPFdAzK/AyPIgwg= X-Google-Smtp-Source: AA0mqf4yns/L88Azq/CVLHRiUU8IgmTZX6KXM9WlWzfRweU7VIZgnLWXOhWhzo1PmqyDFGEnmkZhdA== X-Received: by 2002:a62:e717:0:b0:572:df9e:d57d with SMTP id s23-20020a62e717000000b00572df9ed57dmr64124119pfh.10.1669796642714; Wed, 30 Nov 2022 00:24:02 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id q5-20020a17090311c500b00178b6ccc8a0sm776806plh.51.2022.11.30.00.24.01 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:02 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 12/31] sched: Add normal_policy() Date: Tue, 29 Nov 2022 22:22:54 -1000 Message-Id: <20221130082313.3241517-13-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Add normal_policy() which wraps testing for %SCHED_NORMAL. Makes no behavior change. Will be used by a new sched_class to expand what's considered normal policy. 
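To illustrate where this is headed (sketch only, not part of this patch), a later class could widen the check roughly as below; the SCHED_EXT policy value is an assumption that does not exist at this point in the series. Because fair_policy() is now built on top of normal_policy(), widening the latter propagates to the former automatically.

  /* Sketch under the assumption that a SCHED_EXT policy is added later. */
  static inline int normal_policy(int policy)
  {
  #ifdef CONFIG_SCHED_CLASS_EXT
  	if (policy == SCHED_EXT)
  		return 1;
  #endif
  	return policy == SCHED_NORMAL;
  }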
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6cad7d07186b..1d71e74cb0ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7441,7 +7441,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef8da88e677c..0741827e3541 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -182,9 +182,15 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) From patchwork Wed Nov 30 08:22:55 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059548 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 38E03C47088 for ; Wed, 30 Nov 2022 08:25:27 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234871AbiK3IZZ (ORCPT ); Wed, 30 Nov 2022 03:25:25 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56462 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234858AbiK3IYq (ORCPT ); Wed, 30 Nov 2022 03:24:46 -0500 Received: from mail-pg1-x52c.google.com (mail-pg1-x52c.google.com [IPv6:2607:f8b0:4864:20::52c]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id AAC446D482; Wed, 30 Nov 2022 00:24:05 -0800 (PST) Received: by mail-pg1-x52c.google.com with SMTP id r18so15367129pgr.12; Wed, 30 Nov 2022 00:24:05 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=bT49z/mzeTzqh2wPqLdg02CTarWafMLV3eCTvzPKZD4=; b=ebWxoeUBkAcNEBzhdtaYvvAG3x4bUDfhz3wIqPwXkNvD2Be2WofD5+silHYd0TH/04 WheMCbaSJIbXwqQ1kKBVeiCHtMVNEXYBxm+KLCNj4nn188bGJ7fumotihKDrLhO00lbL 1V2f0szVq3K57ot6zdH1gY0FZPk07NgEFgjdh62PTQZydb+gsX7O7MCEqtskZURKxzMm AE7rTI3OyEOH0Mn7E6cXI0s6KFE/JMo1AvdKmaim3EFCpD/R+qg+ZUWa0Pwa9kPaaoPu VZWSxIeGxrw0oEiszUzLVz+ifdOuNqUH/5vhH1IcFW3fjjY5dIMxPHQ/8FPoC5KeU8XA QkAQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=bT49z/mzeTzqh2wPqLdg02CTarWafMLV3eCTvzPKZD4=; b=BJ2iDXToXzrCvJRMTYnFgdZkAX57T+4JRO1oLI7Fe/5osWfUUzMSHUGPwjoSSMXzY+ 21Xy0cX9cpMieF5p8giS0TmJW+JmRrzbYUShqVblosz++VrUQIdH2n0wAPmU+B2gVNzM 
f80Ek98GMnciH5PcfGDLBN7A/BMNzm+gqikv9xGFjJv3lPOoL6LN8E+jLOwGcMRAED3J uaGkLfSwKnPxken9Wi/0VPWxuP0BgERUvXC630XoDnhpoMY7lkMyKznSwmjA/9PS5HNe 0G65v7F4Tn8dwipDg/hKp5r1FGQNg/MVWCqxLd9EIvMaNy2fi1DjKxaGHdTFFkcHafp0 xukg== X-Gm-Message-State: ANoB5pkGMCV9+mqqg71t29+Yp0VF4EOppt3vDcOEd1yK///+c0i61ivR f9mnOb121FtoiGBLcXuuELI= X-Google-Smtp-Source: AA0mqf7PZCYgmrrJ4accabsIH2WLe0OTvp2eB0fMJIPsF3WyPQpOvqfipESGqH+ip0YTNeK4+zW3Jg== X-Received: by 2002:a63:185a:0:b0:476:e84c:ab63 with SMTP id 26-20020a63185a000000b00476e84cab63mr39686668pgy.496.1669796644760; Wed, 30 Nov 2022 00:24:04 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id y17-20020a170903011100b0018099c9618esm739527plc.231.2022.11.30.00.24.03 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:04 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 13/31] sched_ext: Add boilerplate for extensible scheduler class Date: Tue, 29 Nov 2022 22:22:55 -1000 Message-Id: <20221130082313.3241517-14-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org This adds dummy implementations of sched_ext interfaces which interact with the scheduler core and hook them in the correct places. As they're all dummies, this doesn't cause any behavior changes. This is split out to help reviewing. 
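The no-behavior-change property follows from the stubbing pattern used below: when the new class is compiled out, every hook collapses to an empty inline, so call sites in the scheduler core stay unconditional. In miniature, with deliberately generic names (illustration only, not part of the patch):

  #ifdef CONFIG_FOO			/* stands in for the real config option */
  void foo_hook(struct task_struct *p);	/* real implementation elsewhere */
  #else
  static inline void foo_hook(struct task_struct *p) {}	/* compiles away */
  #endif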
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++++ kernel/fork.c | 2 ++ kernel/sched/core.c | 35 +++++++++++++++++++++++++++-------- kernel/sched/ext.h | 34 ++++++++++++++++++++++++++++++++++ kernel/sched/idle.c | 2 ++ kernel/sched/sched.h | 2 ++ 6 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 include/linux/sched/ext.h create mode 100644 kernel/sched/ext.h diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000000..a05dfcf533b0 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT +#error "NOT IMPLEMENTED YET" +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/fork.c b/kernel/fork.c index a90c6a4938c6..606c6b349799 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -833,6 +834,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c5bfeeb30ba..b9e69e009343 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4554,6 +4554,8 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + int ret; + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4590,12 +4592,16 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) - return -EAGAIN; - else if (rt_prio(p->prio)) + scx_pre_fork(p); + + if (dl_prio(p->prio)) { + ret = -EAGAIN; + goto out_cancel; + } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4613,6 +4619,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; + +out_cancel: + scx_cancel_fork(p); + return ret; } int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) @@ -4643,16 +4653,18 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return 0; + return scx_fork(p); } void sched_cancel_fork(struct task_struct *p) { + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -5800,10 +5812,13 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. 
*/ - for_class_range(class, prev->sched_class, &idle_sched_class) { + for_balance_class_range(class, prev->sched_class, &idle_sched_class) { if (class->balance(rq, prev, rf)) break; } +#else + /* SCX needs the balance call even in UP, call it explicitly */ + balance_scx_on_up(rq, prev, rf); #endif put_prev_task(rq, prev); @@ -5818,6 +5833,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -5843,7 +5861,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) restart: put_prev_task_balance(rq, prev, rf); - for_each_class(class) { + for_each_active_class(class) { p = class->pick_next_task(rq); if (p) return p; @@ -5876,7 +5894,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -9810,6 +9828,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 000000000000..f348158ed33a --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CONFIG_SCHED_CLASS_EXT +#error "NOT IMPLEMENTED YET" +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false + +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { return 0; } +static inline void init_sched_ext_class(void) {} + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#ifndef CONFIG_SMP +static inline void balance_scx_on_up(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + balance_scx(rq, prev, rf); +} +#endif + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +#error "NOT IMPLEMENTED YET" +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d..86bc5832bdc4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -428,11 +428,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + scx_update_idle(rq, false); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0741827e3541..c00c27de2a30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3252,4 +3252,6 @@ enum cpu_cftype_id { extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; #endif /* CONFIG_CGROUP_SCHED */ +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ From patchwork Wed Nov 30 08:22:57 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit 
X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059549 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5BD8DC433FE for ; Wed, 30 Nov 2022 08:25:39 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235117AbiK3IZh (ORCPT ); Wed, 30 Nov 2022 03:25:37 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55950 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235123AbiK3IYz (ORCPT ); Wed, 30 Nov 2022 03:24:55 -0500 Received: from mail-pf1-x436.google.com (mail-pf1-x436.google.com [IPv6:2607:f8b0:4864:20::436]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 1153F6DFFB; Wed, 30 Nov 2022 00:24:10 -0800 (PST) Received: by mail-pf1-x436.google.com with SMTP id w129so16152605pfb.5; Wed, 30 Nov 2022 00:24:10 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=JZXs/tHvndxmGX94M+8xZoj6QZaNPUzG00ObylI8ArA=; b=Xd5j4gE7oX5D6D+g9dm5qtSCevEdbriaEgZeism3Wy0ktHo9b9FkKWzenhpjinCxU3 HDUXWRMyMwX1FwULxMO0LB1a623C10YCHFF4aAdZAenvucQNO3vu4NZ2TGatbRNLycC8 V2DRgteQWjJW4y9yMKrkQDClSt8gaZHD6si/K1qvbLAD7lPCfsrQiWVmayAoyUqgcomQ H9mCSvQyZqZpuJOczOVpJdEemD8UmsiKY1ONDNUgPgvuWzVHbdlyXT1UUSx5FtAQqTf1 73U/DLGfLayBXB3pf68BYDPGfIzfQPxQQzWAh5pF246kY+J4y2Y51evYVdRje0XhD750 S42w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=JZXs/tHvndxmGX94M+8xZoj6QZaNPUzG00ObylI8ArA=; b=Lyy3g9qqtvt3xuU+vQJoJqecs6vy8n9rLoDLJIGyxNbksgnOqqcDoqiD94UZMEtHZ5 biQ4ZMtlG3lZqfT6iqD/6t1Jsy2GxFIwmuJnAv15b/XhGPj4Q6XK2j8zLwUyxAcYD42L 0T70Jz57spTw3VtJH8z8WL3pP3BKTAcVZoMx02J5W796zdOAcK5mrCG3fHWm4oBabALf XNJbn8Mzul0fznnQIhk6BWInvclIDcQIqVKeJ4kIcT10pIYGf7p0HYGDT1bOOqeu8kZK nptvdlypKMK2Vwh2BQKhgML/+BZpRLU8ebiD/NfOmGB/UDit2wjkC204jWuD3sVdGdnV nLJw== X-Gm-Message-State: ANoB5pnIDjUd1DxTYbvYYY0HZsS106guGADzteivAjqE90YIBrJxzzLh lZzZ6fErdSiz4Bu4sWyG0T4= X-Google-Smtp-Source: AA0mqf73gUCLJfUooieU9UjobbvjGv4CN28mvSpS8LEdRLMy+QDHVOIJ/bSQmXxdX6Cj6Ss4YWE6rA== X-Received: by 2002:a62:840b:0:b0:575:2199:70e8 with SMTP id k11-20020a62840b000000b00575219970e8mr15236940pfd.14.1669796649470; Wed, 30 Nov 2022 00:24:09 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id z22-20020a63d016000000b004761e544ec6sm515630pgf.89.2022.11.30.00.24.08 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:09 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, 
bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo , Dave Marchevsky Subject: [PATCH 15/31] sched_ext: [TEMPORARY] Add temporary workaround kfunc helpers Date: Tue, 29 Nov 2022 22:22:57 -1000 Message-Id: <20221130082313.3241517-16-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org THIS IS A TEMPORARY WORKAROUND. WILL BE DROPPED. These exist to work around till we have proper generic BPF helpers. NOT_FOR_UPSTREAM Cc: Alexei Starovoitov Cc: Dave Marchevsky --- kernel/sched/ext.c | 84 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f42464d66de4..1428385093dc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2745,6 +2745,86 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { .set = &scx_kfunc_ids_any, }; +/******************************************************************************** + * Temporary BPF helpers to be replaced by generic non-scx-specific BPF helpers + */ + +struct cgroup *scx_bpf_task_cgroup(const struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +struct task_struct *scx_bpf_find_task_by_pid(s32 pid) +{ + return find_task_by_pid_ns(pid, &init_pid_ns); +} + +s32 scx_bpf_pick_idle_cpu_untyped(unsigned long cpus_allowed) +{ + return scx_bpf_pick_idle_cpu((const struct cpumask *)cpus_allowed); +} + +bool scx_bpf_has_idle_cpus_among(const struct cpumask *cpus_allowed) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } +#ifdef CONFIG_SMP + return cpumask_any_and(idle_masks.cpu, cpus_allowed) < nr_cpu_ids; +#else + return false; +#endif +} + +s32 scx_bpf_has_idle_cpus_among_untyped(unsigned long cpus_allowed) +{ + return scx_bpf_has_idle_cpus_among((const struct cpumask *)cpus_allowed); +} + +s32 scx_bpf_cpumask_test_cpu(s32 cpu, const struct cpumask *cpumask) +{ + return cpumask_test_cpu(cpu, cpumask); +} + +s32 scx_bpf_cpumask_first(const struct cpumask *cpus_allowed) +{ + return cpumask_first(cpus_allowed); +} + +s32 scx_bpf_cpumask_first_untyped(unsigned long cpus_allowed) +{ + return cpumask_first((const struct cpumask *)cpus_allowed); +} + +bool scx_bpf_cpumask_intersects(const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return cpumask_intersects(src1p, src2p); +} + +BTF_SET8_START(scx_kfunc_ids_xxx) +BTF_ID_FLAGS(func, scx_bpf_task_cgroup) +BTF_ID_FLAGS(func, scx_bpf_find_task_by_pid, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_untyped) +BTF_ID_FLAGS(func, scx_bpf_has_idle_cpus_among) +BTF_ID_FLAGS(func, scx_bpf_has_idle_cpus_among_untyped) +BTF_ID_FLAGS(func, scx_bpf_cpumask_test_cpu) +BTF_ID_FLAGS(func, scx_bpf_cpumask_first) +BTF_ID_FLAGS(func, scx_bpf_cpumask_first_untyped) +BTF_ID_FLAGS(func, scx_bpf_cpumask_intersects) +BTF_SET8_END(scx_kfunc_ids_xxx) + +static const struct btf_kfunc_id_set scx_kfunc_set_xxx = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_xxx, +}; + __diag_pop(); /* @@ -2770,7 +2850,9 @@ static int __init register_ext_kfuncs(void) (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_online)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_any))) { + 
&scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_xxx))) { pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); return ret; } From patchwork Wed Nov 30 08:22:58 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059551 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 26512C433FE for ; Wed, 30 Nov 2022 08:26:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235158AbiK3I0M (ORCPT ); Wed, 30 Nov 2022 03:26:12 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56326 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235159AbiK3IZW (ORCPT ); Wed, 30 Nov 2022 03:25:22 -0500 Received: from mail-pj1-x1033.google.com (mail-pj1-x1033.google.com [IPv6:2607:f8b0:4864:20::1033]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D46B06F834; Wed, 30 Nov 2022 00:24:16 -0800 (PST) Received: by mail-pj1-x1033.google.com with SMTP id b11so15051067pjp.2; Wed, 30 Nov 2022 00:24:16 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=66r3FbV6ja5J9RWEoQ0luoafR1ZXpTMI7gq+vFqLhwQ=; b=UtP55JpGfH0eUD2B0TTW8KtnRfoX/Tdy6I8C3ZBgapt3FF9J1QHDw/VaSHB5XZeKwZ PIy12c0XTViNnVedEgkHnhhFAg+6fUM4TYXzMtz86dy/z2B/LCfMQD0pdDj1nVr5UJgV k7jJ6BUc4h7ljteEFj1mnjj9ORLKKPpFOdEYoupO5JCc0a1L7ALy/UKzaLjmevtYMP1W WmLNbMDKUGb+WChdJsCXZDs0EqEWNBp3SiRpQ75mOJQmQy8SmxjZGGMQiCGCIKZaw4bT raaxZb1AJMd8k4SLBWrntvxFq/5x+KYoKrSqAZncBh5yT7seCFcOwq7hRPxJuatZNvH4 kCtg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=66r3FbV6ja5J9RWEoQ0luoafR1ZXpTMI7gq+vFqLhwQ=; b=GdmKINWVBrL+v9oJWDc0xh79ff9D0T0hr/IQ0O2qtS0ZHBf/HXpcNYaMb90oa4Xy0c 0JFCqjmEoWdVTYI/RUOtFCi4jApynKEGJzjwptH1xpUsM9VM/EOZO0DaoO64Xc8btmhS 2YdW7VxGFpi76CLgQ3kNSKwUpLosi9tE2weE2kjDnd03+93tG9bmdtiuIsyU+GCuW38A 7uYMFwQKp3day3FNQ3LcBI2JtCq9AHX4gSrsmmDBLVAqz1amUmZNcSplWPQwOrzD4le0 FPuwyNdjqYkPK8LWeb9wCsfzxluoWgTXv7Htw8gOf9T0HrdWCBqcwFv6uXgmnRczEMuO OSOg== X-Gm-Message-State: ANoB5pn5aaNhz0zsIRww7TfzJdG2RIkmxy1IVhlS7Nud483tIVm/7b31 tmozD5leG16TrdHx9VmaW98= X-Google-Smtp-Source: AA0mqf4jdahVC7Kj92kgq6ruInCu+Gd+q3scD841KYbIRceeyRpwJS/WoS+AGO5m67nOq7sw0OD4ug== X-Received: by 2002:a17:902:f80b:b0:188:4e75:7365 with SMTP id ix11-20020a170902f80b00b001884e757365mr43291661plb.102.1669796651883; Wed, 30 Nov 2022 00:24:11 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id o13-20020a17090a0a0d00b0021896fa945asm2589253pjo.15.2022.11.30.00.24.11 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:11 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, 
bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 16/31] sched_ext: Add scx_example_dummy and scx_example_qmap example schedulers Date: Tue, 29 Nov 2022 22:22:58 -1000 Message-Id: <20221130082313.3241517-17-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Add two simple example BPF schedulers - dummy and qmap. * dummy: In terms of scheduling, it behaves identical to not having any operation implemented at all. The two operations it implements are only to improve visibility and exit handling. On certain homogeneous configurations, this actually can perform pretty well. * qmap: A fixed five level priority scheduler to demonstrate queueing PIDs on BPF maps for scheduling. While not very practical, this is useful as a simple example and will be used to demonstrate different features. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- tools/sched_ext/.gitignore | 5 + tools/sched_ext/Makefile | 187 +++++++++++++++++++ tools/sched_ext/gnu/stubs.h | 1 + tools/sched_ext/scx_common.bpf.h | 117 ++++++++++++ tools/sched_ext/scx_example_dummy.bpf.c | 56 ++++++ tools/sched_ext/scx_example_dummy.c | 93 ++++++++++ tools/sched_ext/scx_example_qmap.bpf.c | 228 ++++++++++++++++++++++++ tools/sched_ext/scx_example_qmap.c | 84 +++++++++ tools/sched_ext/user_exit_info.h | 50 ++++++ 9 files changed, 821 insertions(+) create mode 100644 tools/sched_ext/.gitignore create mode 100644 tools/sched_ext/Makefile create mode 100644 tools/sched_ext/gnu/stubs.h create mode 100644 tools/sched_ext/scx_common.bpf.h create mode 100644 tools/sched_ext/scx_example_dummy.bpf.c create mode 100644 tools/sched_ext/scx_example_dummy.c create mode 100644 tools/sched_ext/scx_example_qmap.bpf.c create mode 100644 tools/sched_ext/scx_example_qmap.c create mode 100644 tools/sched_ext/user_exit_info.h diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 000000000000..6734f7fd9324 --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,5 @@ +scx_example_dummy +scx_example_qmap +*.skel.h +*.subskel.h +/tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 000000000000..db99745e566d --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +SCRATCH_DIR := $(CURDIR)/tools +BUILD_DIR := $(SCRATCH_DIR)/build +INCLUDE_DIR := $(SCRATCH_DIR)/include +BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(BUILD_DIR)/host +HOST_SCRATCH_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_SCRATCH_DIR)/include +else +HOST_BUILD_DIR := $(BUILD_DIR) +HOST_SCRATCH_DIR := $(SCRATCH_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ + | $(BPFOBJ) + $(call msg,CLNG-BPF,,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +%.skel.h: %.bpf.o $(BPFTOOL) + $(call msg,GEN-SKEL,,$@) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $@ + $(Q)$(BPFTOOL) gen 
subskeleton $(<:.o=.linked3.o) name $(<:.bpf.o=) > $(@:.skel.h=.subskel.h) + +scx_example_dummy: scx_example_dummy.c scx_example_dummy.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +clean: + rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) + rm -f *.o *.bpf.o *.skel.h *.subskel.h + rm -f scx_example_dummy scx_example_qmap + +.PHONY: all clean + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/gnu/stubs.h b/tools/sched_ext/gnu/stubs.h new file mode 100644 index 000000000000..719225b16626 --- /dev/null +++ b/tools/sched_ext/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h new file mode 100644 index 000000000000..4cfbbee38d9a --- /dev/null +++ b/tools/sched_ext/scx_common.bpf.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_BPF_H +#define __SCHED_EXT_COMMON_BPF_H + +#include "vmlinux.h" +#include +#include +#include +#include "user_exit_info.h" + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +extern void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_error_format_checker(const char *fmt, ...) {} + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Note that __param[] must have at least one + * element to keep the verifier happy. + */ +#define scx_bpf_error(fmt, args...) 
\ +({ \ + static char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + \ + ___scx_bpf_error_format_checker(fmt, ##args); \ +}) + +extern s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +extern bool scx_bpf_consume(u64 dsq_id) __ksym; +extern u32 scx_bpf_dispatch_nr_slots(void) __ksym; +extern void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +extern s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +extern bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +extern s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; +extern void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +extern bool scx_bpf_task_running(const struct task_struct *p) __ksym; +extern s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; + +/* XXX - temporary ones to be replaced by generic BPF helpers */ +extern struct cgroup *scx_bpf_task_cgroup(const struct task_struct *p) __ksym; +extern struct task_struct *scx_bpf_find_task_by_pid(s32 pid) __ksym; +extern s32 scx_bpf_pick_idle_cpu_untyped(unsigned long cpus_allowed) __ksym; +extern bool scx_bpf_has_idle_cpus_among(const struct cpumask *cpus_allowed) __ksym; +extern bool scx_bpf_has_idle_cpus_among_untyped(unsigned long cpus_allowed) __ksym; +extern s32 scx_bpf_cpumask_test_cpu(s32 cpu, const struct cpumask *cpumask) __ksym; +extern s32 scx_bpf_cpumask_first(const struct cpumask *cpus_allowed) __ksym; +extern s32 scx_bpf_cpumask_first_untyped(unsigned long cpus_allowed) __ksym; +extern bool scx_bpf_cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) __ksym; + +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. ->field, [idx0][idx1], ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to NULL instead. The caller + * must check for NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for NULL and dereference + * immediately. 
+ * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + */ +#define MEMBER_VPTR(base, member) (typeof(base member) *)({ \ + u64 __base = (u64)base; \ + u64 __addr = (u64)&(base member) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof(base member))); \ + __addr; \ +}) + +#endif /* __SCHED_EXT_COMMON_BPF_H */ diff --git a/tools/sched_ext/scx_example_dummy.bpf.c b/tools/sched_ext/scx_example_dummy.bpf.c new file mode 100644 index 000000000000..ac7b490b5a39 --- /dev/null +++ b/tools/sched_ext/scx_example_dummy.bpf.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal dummy scheduler. + * + * In terms of scheduling, this behaves the same as not specifying any ops at + * all - a global FIFO. The only things it adds are the following niceties: + * + * - Statistics tracking how many are queued to local and global dsq's. + * - Termination notification for userspace. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +void BPF_STRUCT_OPS(dummy_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (enq_flags & SCX_ENQ_LOCAL) { + stat_inc(0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } +} + +void BPF_STRUCT_OPS(dummy_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops dummy_ops = { + .enqueue = (void *)dummy_enqueue, + .exit = (void *)dummy_exit, + .name = "dummy", +}; diff --git a/tools/sched_ext/scx_example_dummy.c b/tools/sched_ext/scx_example_dummy.c new file mode 100644 index 000000000000..72881c881830 --- /dev/null +++ b/tools/sched_ext/scx_example_dummy.c @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_dummy.skel.h" + +const char help_fmt[] = +"A minimal dummy sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s\n" +"\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +static void read_stats(struct scx_example_dummy *skel, u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + u64 cnts[2][nr_cpus]; + u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_example_dummy *skel; + struct bpf_link *link; + u32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_dummy__open(); + assert(skel); + + while ((opt = getopt(argc, argv, "h")) != -1) { + switch (opt) { + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_dummy__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.dummy_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + u64 stats[2]; + + read_stats(skel, stats); + printf("local=%lu global=%lu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_dummy__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c new file mode 100644 index 000000000000..742c866d2a8e --- /dev/null +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple five-level FIFO queue scheduler. + * + * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets + * assigned to one depending on its compound weight. Each CPU round robins + * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from + * queue0, 2 from queue1, 4 from queue2 and so on. + * + * This scheduler demonstrates: + * + * - BPF-side queueing using PIDs. + * - Sleepable per-task storage allocation using ops.prep_enable(). + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" +#include + +char _license[] SEC("license") = "GPL"; + +const volatile u64 slice_ns = SCX_SLICE_DFL; + +u32 test_error_cnt; + +struct user_exit_info uei; + +struct qmap { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, u32); +} queue0 SEC(".maps"), + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), + queue4 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 5); + __type(key, int); + __array(values, struct qmap); +} queue_arr SEC(".maps") = { + .values = { + [0] = &queue0, + [1] = &queue1, + [2] = &queue2, + [3] = &queue3, + [4] = &queue4, + }, +}; + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local_dsq */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Per-cpu dispatch index and remaining count */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, u32); + __type(value, u64); +} dispatch_idx_cnt SEC(".maps"); + +/* Statistics */ +unsigned long nr_enqueued, nr_dispatched, nr_reenqueued; + +s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) + return cpu; + + return prev_cpu; +} + +void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + u32 pid = p->pid; + int idx; + void *ring; + + if (test_error_cnt && !--test_error_cnt) + scx_bpf_error("test triggering error"); + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* Is select_cpu() is telling us to enqueue locally? */ + if (tctx->force_local) { + tctx->force_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + /* Coarsely map the compount weight to a FIFO. */ + if (p->scx.weight <= 25) + idx = 0; + else if (p->scx.weight <= 50) + idx = 1; + else if (p->scx.weight < 200) + idx = 2; + else if (p->scx.weight < 400) + idx = 3; + else + idx = 4; + + ring = bpf_map_lookup_elem(&queue_arr, &idx); + if (!ring) { + scx_bpf_error("failed to find ring %d", idx); + return; + } + + /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ + if (bpf_map_push_elem(ring, &pid, 0)) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_enqueued, 1); +} + +void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) +{ + u32 zero = 0, one = 1; + u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); + u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); + void *fifo; + s32 pid; + int i; + + if (!idx || !cnt) { + scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); + return; + } + + for (i = 0; i < 5; i++) { + /* Advance the dispatch cursor and pick the fifo. 
*/ + if (!*cnt) { + *idx = (*idx + 1) % 5; + *cnt = 1 << *idx; + } + (*cnt)--; + + fifo = bpf_map_lookup_elem(&queue_arr, idx); + if (!fifo) { + scx_bpf_error("failed to find ring %llu", *idx); + return; + } + + /* Dispatch or advance. */ + if (!bpf_map_pop_elem(fifo, &pid)) { + struct task_struct *p; + + p = scx_bpf_find_task_by_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); + return; + } + } + + *cnt = 0; + } +} + +s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + /* + * @p is new. Let's ensure that its task_ctx is available. We can sleep + * in this function and the following will automatically use GFP_KERNEL. + */ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops qmap_ops = { + .select_cpu = (void *)qmap_select_cpu, + .enqueue = (void *)qmap_enqueue, + /* + * The queue map doesn't support removal and sched_ext can handle + * spurious dispatches. Let's be lazy and not bother with dequeueing. + */ + .dispatch = (void *)qmap_dispatch, + .prep_enable = (void *)qmap_prep_enable, + .exit = (void *)qmap_exit, + .name = "qmap", +}; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c new file mode 100644 index 000000000000..c5a208533404 --- /dev/null +++ b/tools/sched_ext/scx_example_qmap.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_qmap.skel.h" + +const char help_fmt[] = +"A simple five-level FIFO queue sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-e COUNT]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_qmap *skel; + struct bpf_link *link; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_qmap__open(); + assert(skel); + + while ((opt = getopt(argc, argv, "hs:e:tTd:")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'e': + skel->bss->test_error_cnt = strtoull(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_qmap__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + long nr_enqueued = skel->bss->nr_enqueued; + long nr_dispatched = skel->bss->nr_dispatched; + + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu\n", + nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, + skel->bss->nr_reenqueued); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_qmap__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/user_exit_info.h 
b/tools/sched_ext/user_exit_info.h new file mode 100644 index 000000000000..e701ef0e0b86 --- /dev/null +++ b/tools/sched_ext/user_exit_info.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +struct user_exit_info { + int type; + char reason[128]; + char msg[1024]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include + +static inline void uei_record(struct user_exit_info *uei, + const struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); + bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); + /* use __sync to force memory barrier */ + __sync_val_compare_and_swap(&uei->type, uei->type, ei->type); +} + +#else /* !__bpf__ */ + +static inline bool uei_exited(struct user_exit_info *uei) +{ + /* use __sync to force memory barrier */ + return __sync_val_compare_and_swap(&uei->type, -1, -1); +} + +static inline void uei_print(const struct user_exit_info *uei) +{ + fprintf(stderr, "EXIT: %s", uei->reason); + if (uei->msg[0] != '\0') + fprintf(stderr, " (%s)", uei->msg); + fputs("\n", stderr); +} + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ From patchwork Wed Nov 30 08:22:59 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059550 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A0CD2C352A1 for ; Wed, 30 Nov 2022 08:26:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235267AbiK3I0N (ORCPT ); Wed, 30 Nov 2022 03:26:13 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56322 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235172AbiK3IZW (ORCPT ); Wed, 30 Nov 2022 03:25:22 -0500 Received: from mail-pf1-x431.google.com (mail-pf1-x431.google.com [IPv6:2607:f8b0:4864:20::431]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2E55F6F83F; Wed, 30 Nov 2022 00:24:17 -0800 (PST) Received: by mail-pf1-x431.google.com with SMTP id c15so6849988pfb.13; Wed, 30 Nov 2022 00:24:17 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=HvUGhplUugUdbhZ6AcFPSZ2kGK89RILzAfThTT+mh4o=; b=Aqi+ctOZLyxpQeJ9kP9vRp7mV73z26pS3NRvNv1lfheJ6bopv/MA0ovZiuVRnL8oLk LNHV5JKI6du5qZUhib95ZKiXhfCMkYZbrPPI/q49G4kv009reSMXAfF8i5K4k3uaW761 Ro72iZOJE3TPBerxs0KqscMcI9avf5uHb3lofWm5zooDZngDttyV/3/BbK37Q74hK5S4 nd4B8yQZYZS6EgBofp2InS5ng59u6hfTUjv8p/bI9XnXwzYCQ2vgH6S7nYVz0Aje5iUY 6K5ZuRBnywC2ADe6uTHV9lSPJy+Gibxm+xw34ED+B+zQJ6lQxHpiicr1J46kgW6FXA2S rvzw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; 
bh=HvUGhplUugUdbhZ6AcFPSZ2kGK89RILzAfThTT+mh4o=; b=HGzh6Buou/2iJgRenbUAgH5UemvcN0ohaKfo/iogsR/ix9jthtX3f+NO8aqbcQ8yia 3lvPdb56Ud7NisEQGhr7McmJ8AwyMnJiDErIfBW/057vfxnnw+EiF0gwvxrfd/4JA8Lc 9dZx/Kq0t+DYrD4L3FkCZIGf1m2vpQ79aLkci5NbYxT+lz/4mniPTRlJDdFeciF3UIlU CQ01LWgth00AQuyB3jhNe2Om1mSrfzehq1/0JWnTP3KIcVmygSCeMUE43ajb103FvTEE RVZaXr5juHGvmMdCQF90ja6Mf7a7nDS/34TilRPGPvNh79uU5NgvqnwD9OH82dVNuHgY CWfA== X-Gm-Message-State: ANoB5ply8n1uKWEr3JudGXmX63q9kQNlA4al3YRlRg+WzcfEI7JeQW9w fWbGS3+XoyfiW/BivkTCO1CBbanhpvfy2A== X-Google-Smtp-Source: AA0mqf5E5M8TU/Bwg2ulcTDcSUxY4JBCDG/ToY1x+skWe/Km784BlYrJdF5iBWCQDPeRHKrTlbqNww== X-Received: by 2002:a65:4584:0:b0:478:50ca:cdf2 with SMTP id o4-20020a654584000000b0047850cacdf2mr4414909pgq.200.1669796653923; Wed, 30 Nov 2022 00:24:13 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id s4-20020a17090a764400b0020087d7e778sm2558390pjl.37.2022.11.30.00.24.13 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:13 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 17/31] sched_ext: Add sysrq-S which disables the BPF scheduler Date: Tue, 29 Nov 2022 22:22:59 -1000 Message-Id: <20221130082313.3241517-18-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org This enables the admin to abort the BPF scheduler and revert to CFS anytime. 
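For example, on a kernel built with CONFIG_MAGIC_SYSRQ, the revert can be
triggered with the magic SysRq 'S' key combination or by writing the key to
procfs:

  # echo S > /proc/sysrq-trigger

The handler added below then calls scx_ops_disable(SCX_EXIT_SYSRQ), which
kicks off the disable path and returns all tasks to CFS.
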
Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- drivers/tty/sysrq.c | 1 + include/linux/sched/ext.h | 1 + kernel/sched/build_policy.c | 1 + kernel/sched/ext.c | 20 ++++++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index d2b2720db6ca..b3c22bf5f0d1 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { NULL, /* P */ NULL, /* Q */ NULL, /* R */ + /* S: May be registered by sched_ext for resetting */ NULL, /* S */ NULL, /* T */ NULL, /* U */ diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e2e743ccd00d..1ec8be53057f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -55,6 +55,7 @@ enum scx_exit_type { SCX_EXIT_DONE, SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 4c658b21f603..005025f55bea 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1428385093dc..a0057a8447cb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1771,6 +1771,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) case SCX_EXIT_UNREG: reason = "BPF scheduler unregistered"; break; + case SCX_EXIT_SYSRQ: + reason = "disabled by sysrq-S"; + break; case SCX_EXIT_ERROR: reason = "runtime error"; break; @@ -2377,6 +2380,21 @@ struct bpf_struct_ops bpf_sched_ext_ops = { .name = "sched_ext_ops", }; +static void sysrq_handle_sched_ext_reset(int key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + void __init init_sched_ext_class(void) { int cpu; @@ -2400,6 +2418,8 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); rq->scx.nr_running = 0; } + + register_sysrq_key('S', &sysrq_sched_ext_reset_op); } From patchwork Wed Nov 30 08:23:00 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059552 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 60DA4C4321E for ; Wed, 30 Nov 2022 08:26:16 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235055AbiK3I0N (ORCPT ); Wed, 30 Nov 2022 03:26:13 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56388 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234859AbiK3IZX (ORCPT ); Wed, 30 Nov 2022 03:25:23 -0500 Received: from mail-pj1-x1035.google.com (mail-pj1-x1035.google.com [IPv6:2607:f8b0:4864:20::1035]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 
321B560345; Wed, 30 Nov 2022 00:24:18 -0800 (PST) Received: by mail-pj1-x1035.google.com with SMTP id b13-20020a17090a5a0d00b0021906102d05so1216428pjd.5; Wed, 30 Nov 2022 00:24:18 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=TTnDarJAIOFnrZMrKBccMkgLgr3KISG6ySOAHjPJ4NA=; b=FWbg7tlCr9ZJApIPO88BjZAq6OfnOoVGuC+ZwhLyEbwVGTiL2AK4DRp747BxJDuDmm +68hzIcrCM6R8vEWkbO7bojekE/ax59SSKgd1CQeE2HR5m5wTqa5uclc/i+gDaV4Hp3T RM+MySwyueXj1iY4R/VBRUV6P66SOrH68R4PMhrUGjSFzBAayikxThWIoG2P5Wa7KpfD RRhP8tdQUtd4EYoD6cLC9mQKVr5N6Nh/JUI19ScwjPPuUW92UaMOZ7yuZX0+kJ/D0/cx ri18tKVAj86M4I21yPAgyN+J8LY3156Eg0cj6Jvlh3K3Dlr+MaM5W/ZWEQMPS9wJahep lQcA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=TTnDarJAIOFnrZMrKBccMkgLgr3KISG6ySOAHjPJ4NA=; b=kfDUQSaGVHVKbYBMA+hpg35q+0lJ2OaD38VgU7NPpHIn7VrGjxiJmUpP+XPHYk7igg F98+9KUJ01ckp4O0I99QZ+7xUMwZsjQNcpXnO78sAW7GGd1GYrjzjw8Sy8/hET2FhYq8 sCZv+qzPJTugAb4SKWeIvYeKDffbfVpeVyOOvklhJmf5fSGnXnqsyZACe/hCqTxInQkY b5p8O4ItzXHMgFBHGGWzBKLAZ86T5Al2Ud0SCwz6yZFdKoonfSbum5tTfnnN2NTVJSYr M0iNuX2AIAmSy2g6A4uCD6ySUn3hpMq+jyPUzktCZTfVmEcQvR0yU8pJj5jcoJ6AkD6r 4grw== X-Gm-Message-State: ANoB5plyzpOruYTybBdndgmsPLCBFpXkGeQOMHfNaJsTRe7FUHNYtKZC KlLmkApvkhRpsYtJSwWSwFQ= X-Google-Smtp-Source: AA0mqf6afQ4RhFhqFPrc81ZCpFELGPUbLtWalZxPzPLAh1B7J2KYeUvXb0uavuTz6gcwIarlmdgE1w== X-Received: by 2002:a17:90b:3941:b0:215:db2e:bb17 with SMTP id oe1-20020a17090b394100b00215db2ebb17mr63972053pjb.166.1669796656148; Wed, 30 Nov 2022 00:24:16 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id bd12-20020a170902830c00b001894198d0ebsm762648plb.24.2022.11.30.00.24.15 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:15 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 18/31] sched_ext: Implement runnable task stall watchdog Date: Tue, 29 Nov 2022 22:23:00 -1000 Message-Id: <20221130082313.3241517-19-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org From: David Vernet The most common and critical way that a BPF scheduler can misbehave is by failing to run runnable tasks for too long. This patch implements a watchdog. * All tasks record when they become runnable. * A watchdog work periodically scans all runnable tasks. If any task has stayed runnable for too long, the BPF scheduler is aborted. 
* scheduler_tick() monitors whether the watchdog itself is stuck. If so, the BPF scheduler is aborted. Because the watchdog only scans the tasks which are currently runnable and usually very infrequently, the overhead should be negligible. scx_example_qmap is updated so that it can be told to stall user and/or kernel tasks. A detected task stall looks like the following: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s) scx_check_timeout_workfn+0x10e/0x1b0 process_one_work+0x287/0x560 worker_thread+0x234/0x420 kthread+0xe9/0x100 ret_from_fork+0x1f/0x30 A detected watchdog stall: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (watchdog failed to check in for 5.001s) scheduler_tick+0x2eb/0x340 update_process_times+0x7a/0x90 tick_sched_timer+0xd8/0x130 __hrtimer_run_queues+0x178/0x3b0 hrtimer_interrupt+0xfc/0x390 __sysvec_apic_timer_interrupt+0xb7/0x2b0 sysvec_apic_timer_interrupt+0x90/0xb0 asm_sysvec_apic_timer_interrupt+0x1b/0x20 default_idle+0x14/0x20 arch_cpu_idle+0xf/0x20 default_idle_call+0x50/0x90 do_idle+0xe8/0x240 cpu_startup_entry+0x1d/0x20 kernel_init+0x0/0x190 start_kernel+0x0/0x392 start_kernel+0x324/0x392 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x104/0x109 secondary_startup_64_no_verify+0xce/0xdb Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h to inline scx_notify_sched_tick(). Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 14 +++ init/init_task.c | 2 + kernel/sched/core.c | 3 + kernel/sched/ext.c | 133 +++++++++++++++++++++++-- kernel/sched/ext.h | 26 +++++ kernel/sched/sched.h | 1 + tools/sched_ext/scx_example_qmap.bpf.c | 12 +++ tools/sched_ext/scx_example_qmap.c | 14 ++- 8 files changed, 193 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ec8be53057f..1a57945abea0 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -59,6 +59,7 @@ enum scx_exit_type { SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ }; /* @@ -323,6 +324,15 @@ struct sched_ext_ops { */ u64 flags; + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. 
+ */ + u32 timeout_ms; + /** * name - BPF scheduler's name * @@ -357,6 +367,8 @@ enum scx_ent_flags { SCX_TASK_OPS_PREPPED = 1 << 3, /* prepared for BPF scheduler enable */ SCX_TASK_OPS_ENABLED = 1 << 4, /* task has BPF scheduler enabled */ + SCX_TASK_WATCHDOG_RESET = 1 << 5, /* task watchdog counter should be reset */ + SCX_TASK_CURSOR = 1 << 6, /* iteration cursor, not a task */ }; @@ -367,11 +379,13 @@ enum scx_ent_flags { struct sched_ext_entity { struct scx_dispatch_q *dsq; struct list_head dsq_node; + struct list_head watchdog_node; u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; s32 holding_cpu; atomic64_t ops_state; + unsigned long runnable_at; /* BPF scheduler modifiable fields */ diff --git a/init/init_task.c b/init/init_task.c index bdbc663107bf..913194aab623 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -106,9 +106,11 @@ struct task_struct init_task #ifdef CONFIG_SCHED_CLASS_EXT .scx = { .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node), + .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), .sticky_cpu = -1, .holding_cpu = -1, .ops_state = ATOMIC_INIT(0), + .runnable_at = INITIAL_JIFFIES, .slice = SCX_SLICE_DFL, }, #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc499f18573a..39d9ccb64f40 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4366,11 +4366,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHED_CLASS_EXT p->scx.dsq = NULL; INIT_LIST_HEAD(&p->scx.dsq_node); + INIT_LIST_HEAD(&p->scx.watchdog_node); p->scx.flags = 0; p->scx.weight = 0; p->scx.sticky_cpu = -1; p->scx.holding_cpu = -1; atomic64_set(&p->scx.ops_state, 0); + p->scx.runnable_at = INITIAL_JIFFIES; p->scx.slice = SCX_SLICE_DFL; #endif @@ -5514,6 +5516,7 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); + scx_notify_sched_tick(); perf_event_task_tick(); #ifdef CONFIG_SMP diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a0057a8447cb..030175f2b1d6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,7 @@ enum scx_internal_consts { SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), SCX_DSP_DFL_MAX_BATCH = 32, + SCX_MAX_RUNNABLE_TIMEOUT = 30 * MSEC_PER_SEC, }; enum scx_ops_enable_state { @@ -87,6 +88,23 @@ static struct scx_exit_info scx_exit_info; static atomic64_t scx_nr_rejected = ATOMIC64_INIT(0); +/* + * The maximum amount of time that a task may be runnable without being + * scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +unsigned long task_runnable_timeout_ms; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +unsigned long last_timeout_check = INITIAL_JIFFIES; + +static struct delayed_work check_timeout_work; + /* idle tracking */ #ifdef CONFIG_SMP #ifdef CONFIG_CPUMASK_OFFSTACK @@ -148,10 +166,6 @@ static DEFINE_PER_CPU(struct consume_ctx, consume_ctx); void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags); -__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, - const char *fmt, ...); -#define scx_ops_error(fmt, args...) 
\ - scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) struct scx_task_iter { struct sched_ext_entity cursor; @@ -597,6 +611,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, dispatch_enqueue(&scx_dsq_global, p, enq_flags); } +static bool watchdog_task_watched(const struct task_struct *p) +{ + return !list_empty(&p->scx.watchdog_node); +} + +static void watchdog_watch_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET; + list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); +} + +static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) +{ + list_del_init(&p->scx.watchdog_node); + if (reset_timeout) + p->scx.flags |= SCX_TASK_WATCHDOG_RESET; +} + static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { int sticky_cpu = p->scx.sticky_cpu; @@ -613,9 +648,12 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) sticky_cpu = cpu_of(rq); - if (p->scx.flags & SCX_TASK_QUEUED) + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!watchdog_task_watched(p)); return; + } + watchdog_watch_task(rq, p); p->scx.flags |= SCX_TASK_QUEUED; rq->scx.nr_running++; add_nr_running(rq, 1); @@ -628,8 +666,12 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags struct scx_rq *scx_rq = &rq->scx; u64 opss; - if (!(p->scx.flags & SCX_TASK_QUEUED)) + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(watchdog_task_watched(p)); return; + } + + watchdog_unwatch_task(p, false); /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic64_read_acquire(&p->scx.ops_state); @@ -1168,6 +1210,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) } p->se.exec_start = rq_clock_task(rq); + + watchdog_unwatch_task(p, true); } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -1180,11 +1224,14 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) */ if (p->scx.flags & SCX_TASK_BAL_KEEP) { p->scx.flags &= ~SCX_TASK_BAL_KEEP; + watchdog_watch_task(rq, p); dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); return; } if (p->scx.flags & SCX_TASK_QUEUED) { + watchdog_watch_task(rq, p); + /* * If @p has slice left and balance_scx() didn't tag it for * keeping, @p is getting preempted by a higher priority @@ -1405,6 +1452,48 @@ static void reset_idle_masks(void) {} #endif /* CONFIG_SMP */ +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + unsigned long flags; + bool timed_out = false; + unsigned long timeout = msecs_to_jiffies(task_runnable_timeout_ms); + + raw_spin_rq_lock_irqsave(rq, flags); + list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, last_runnable + timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_type(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + raw_spin_rq_unlock_irqrestore(rq, flags); + + return timed_out; +} + +static void scx_check_timeout_workfn(struct work_struct *work) +{ + int cpu; + + last_timeout_check = jiffies; + for_each_online_cpu(cpu) { + if 
(unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + task_runnable_timeout_ms / 2); +} + static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { update_curr_scx(rq); @@ -1430,7 +1519,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) } } - p->scx.flags |= SCX_TASK_OPS_PREPPED; + p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); return 0; } @@ -1767,6 +1856,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) break; } + cancel_delayed_work_sync(&check_timeout_work); + switch (type) { case SCX_EXIT_UNREG: reason = "BPF scheduler unregistered"; @@ -1780,6 +1871,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) case SCX_EXIT_ERROR_BPF: reason = "scx_bpf_error"; break; + case SCX_EXIT_ERROR_STALL: + reason = "runnable task stall"; + break; default: reason = ""; } @@ -1954,8 +2048,8 @@ static void scx_ops_error_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type, - const char *fmt, ...) +__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...) { struct scx_exit_info *ei = &scx_exit_info; int none = SCX_EXIT_NONE; @@ -2059,6 +2153,20 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable; } + task_runnable_timeout_ms = SCX_MAX_RUNNABLE_TIMEOUT; + if (ops->timeout_ms) { + /* + * Excessively large timeouts should have been rejected in + * bpf_scx_init_member(). + */ + WARN_ON_ONCE(ops->timeout_ms > SCX_MAX_RUNNABLE_TIMEOUT); + task_runnable_timeout_ms = ops->timeout_ms; + } + + last_timeout_check = jiffies; + queue_delayed_work(system_unbound_wq, &check_timeout_work, + task_runnable_timeout_ms / 2); + /* * Lock out forks before opening the floodgate so that they don't wander * into the operations prematurely. @@ -2316,6 +2424,11 @@ static int bpf_scx_init_member(const struct btf_type *t, if (ret == 0) return -EINVAL; return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (*(u32 *)(udata + moff) > SCX_MAX_RUNNABLE_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; } return 0; @@ -2416,10 +2529,12 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + INIT_LIST_HEAD(&rq->scx.watchdog_list); rq->scx.nr_running = 0; } register_sysrq_key('S', &sysrq_sched_ext_reset_op); + INIT_DELAYED_WORK(&check_timeout_work, scx_check_timeout_workfn); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6d5669481274..bda1d9c11486 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -67,6 +67,8 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); extern const struct sched_class ext_sched_class; extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; extern const struct file_operations sched_ext_fops; +extern unsigned long task_runnable_timeout_ms; +extern unsigned long last_timeout_check; DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) @@ -79,6 +81,29 @@ void scx_cancel_fork(struct task_struct *p); int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); void init_sched_ext_class(void); +__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) 
\ + scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) + +static inline void scx_notify_sched_tick(void) +{ + unsigned long last_check, timeout; + + if (!scx_enabled()) + return; + + last_check = last_timeout_check; + timeout = msecs_to_jiffies(task_runnable_timeout_ms); + if (unlikely(time_after(jiffies, last_check + timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_type(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } +} + static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; @@ -112,6 +137,7 @@ static inline void scx_cancel_fork(struct task_struct *p) {} static inline int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return 0; } static inline void init_sched_ext_class(void) {} +static inline void scx_notify_sched_tick(void) {} #define for_each_active_class for_each_class #define for_balance_class_range for_class_range diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 18a80d5b542b..40692fa7cb3c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -692,6 +692,7 @@ struct cfs_rq { #ifdef CONFIG_SCHED_CLASS_EXT struct scx_rq { struct scx_dispatch_q local_dsq; + struct list_head watchdog_list; u64 ops_qseq; u32 nr_running; #ifdef CONFIG_SMP diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 742c866d2a8e..9e0b6519c8a4 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -22,6 +22,8 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 stall_user_nth; +const volatile u32 stall_kernel_nth; u32 test_error_cnt; @@ -106,6 +108,15 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) u32 pid = p->pid; int idx; void *ring; + static u32 user_cnt, kernel_cnt; + + if (p->flags & PF_KTHREAD) { + if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) + return; + } else { + if (stall_user_nth && !(++user_cnt % stall_user_nth)) + return; + } if (test_error_cnt && !--test_error_cnt) scx_bpf_error("test triggering error"); @@ -224,5 +235,6 @@ struct sched_ext_ops qmap_ops = { .dispatch = (void *)qmap_dispatch, .prep_enable = (void *)qmap_prep_enable, .exit = (void *)qmap_exit, + .timeout_ms = 5000U, .name = "qmap", }; diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index c5a208533404..34c764c38e19 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,10 +20,12 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" +" -t COUNT Stall every COUNT'th user thread\n" +" -T COUNT Stall every COUNT'th kernel thread\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -47,13 +49,19 @@ int main(int argc, char **argv) skel = scx_example_qmap__open(); assert(skel); - while ((opt = getopt(argc, argv, "hs:e:tTd:")) != -1) { + while ((opt = getopt(argc, argv, "hs:e:t:T:d:")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; case 'e': - skel->bss->test_error_cnt = strtoull(optarg, NULL, 0); + skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); + break; + case 't': + 
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); + break; + case 'T': + skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; default: fprintf(stderr, help_fmt, basename(argv[0])); From patchwork Wed Nov 30 08:23:01 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059553 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A16B6C47088 for ; Wed, 30 Nov 2022 08:26:16 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235275AbiK3I0O (ORCPT ); Wed, 30 Nov 2022 03:26:14 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56344 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235069AbiK3IZX (ORCPT ); Wed, 30 Nov 2022 03:25:23 -0500 Received: from mail-pj1-x102a.google.com (mail-pj1-x102a.google.com [IPv6:2607:f8b0:4864:20::102a]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 024DD6F807; Wed, 30 Nov 2022 00:24:18 -0800 (PST) Received: by mail-pj1-x102a.google.com with SMTP id t11-20020a17090a024b00b0021932afece4so1194547pje.5; Wed, 30 Nov 2022 00:24:18 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=TeRxkUtMHLuUFqMgF18Iov3Mm0w03j8sOh2D0fkG7OA=; b=g9whwIlDXn1gAn57uP4ayQTIZ3MxO29p26WHIkTYhqXN0IbpiDCF+mEdhTWoghYZ8N OkxSP2pE6rvgdR/iELyZXC0pKsC6gJvixv940sey3X8shr915VlvYN1itzp6TzWEVwBY OHoHUn/0bPuL36ql//W3BHrB0hGaeLEQ1hD5qgVUtE8VNu20iu6LXQknEYuyGpZGF64X Qf+EYt5RYOgyePeftfpU9i+OmLjTkxFmq8D47KBfwzg3FQctawGt3Bm9pMD+4Kl4KJ+P rPC2ICIucZHJEDBGDV10EMeZhFyKssPlJ1/u5DRPr4s7h74HuEovmgSLGoBrI2ZVH5Qs 1Trg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=TeRxkUtMHLuUFqMgF18Iov3Mm0w03j8sOh2D0fkG7OA=; b=Lz0i7nMPro+6SrOKGfjgZ9FnrLPrW6koqTip+3LCG5a4qGP6r4ltLGJa2mUwwpZ2Qc HA/tn7SV+2rWaHYF1Tayzs4tlgD3KyCHuECsyloiE4MXo+OneClJPfXXNJOwCJisHcNI HpsV+jO9Z0YvdHWv2fxBEogm7W9rHpGiH/ACSY/UG1iFyo12fwQkjg59zz4gF0JPWqjc DahIlQPR3NFSgVMQ3521CdiOhssjRrAST0obO5jltUwyGq76aSG90JTM783vbO6asob9 60lnDWImfmL4wj59CNnVFkRpbHieZRQqM6cRrMmvu+DJwifCgpINFk2Lt6iiRzIQCgKe XvPg== X-Gm-Message-State: ANoB5pkc53DxMTnGVOKXaDLitgYvdpmxGVIJ6iJ7D8HVPWPba1XTVLoL JV0Cp+RO1zc2Ux+9snSQSHg= X-Google-Smtp-Source: AA0mqf6mefq6GpDFMq05Kn+XBObUaj8OK0h0LXJbfOCd4ac/a8nw3NHFz5iV18RMVmHoXFAtXBm/VA== X-Received: by 2002:a17:90b:784:b0:218:fa11:5f87 with SMTP id l4-20020a17090b078400b00218fa115f87mr32105256pjz.25.1669796658265; Wed, 30 Nov 2022 00:24:18 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id f131-20020a623889000000b00574ebfdc721sm837329pfa.16.2022.11.30.00.24.17 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:17 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, 
bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 19/31] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT Date: Tue, 29 Nov 2022 22:23:01 -1000 Message-Id: <20221130082313.3241517-20-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org BPF schedulers might not want to schedule certain tasks - e.g. kernel threads. This patch adds p->scx.disallow which can be set by BPF schedulers in such cases. The field can be changed anytime and setting it in ops.prep_enable() guarantees that the task can never be scheduled by sched_ext. scx_example_qmap is updated with the -d option to disallow a specific PID: # echo $$ 1092 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 7 ext.enabled : 0 Run "scx_example_qmap -d 1092" in another terminal. # grep rejected /sys/kernel/debug/sched/ext nr_rejected : 1 # egrep '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 setparam failed for 1092 (Permission denied) Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++ kernel/sched/core.c | 4 +++ kernel/sched/ext.c | 38 ++++++++++++++++++++++++++ kernel/sched/ext.h | 3 ++ tools/sched_ext/scx_example_qmap.bpf.c | 4 +++ tools/sched_ext/scx_example_qmap.c | 8 +++++- 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a57945abea0..82dcbecfcfb9 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -397,6 +397,18 @@ struct sched_ext_entity { */ u64 slice; + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.prep_enable() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inhering the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of such + * events are reported through /sys/kernel/debug/sched_ext::nr_rejected. + */ + bool disallow; /* reject switching into SCX */ + /* cold fields */ struct list_head tasks_node; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39d9ccb64f40..3404277fed30 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7552,6 +7552,10 @@ static int __sched_setscheduler(struct task_struct *p, goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. 
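For example, a BPF scheduler that wants to keep all kernel threads on CFS
could set the flag from ops.prep_enable(). A minimal sketch follows; the
callback name and the PF_KTHREAD-based policy are only illustrative, and the
actual scx_example_qmap change below wires disallow to a -d PID option
instead:

  s32 BPF_STRUCT_OPS(example_prep_enable, struct task_struct *p,
                     struct scx_enable_args *args)
  {
          /* illustrative policy: never admit kernel threads into SCX */
          if (p->flags & PF_KTHREAD)
                  p->scx.disallow = true;
          return 0;
  }

Setting disallow at this point guarantees the task never runs under
sched_ext: subsequent sched_setscheduler(2) requests for SCHED_EXT fail with
-EACCES, and a task that is already SCHED_EXT when ops.prep_enable() runs is
reverted to SCHED_NORMAL and accounted in nr_rejected.
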
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 030175f2b1d6..ddd5aa4a8bca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1509,6 +1509,8 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); + p->scx.disallow = false; + if (SCX_HAS_OP(prep_enable)) { struct scx_enable_args args = { }; @@ -1519,6 +1521,27 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) } } + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.prep_enable() sets @p->disallow, @p + * can never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic64_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); return 0; } @@ -1664,6 +1687,18 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} static void switched_to_scx(struct rq *rq, struct task_struct *p) {} +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + /* * Omitted operations: * @@ -2367,6 +2402,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; } if (atype == BPF_READ) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index bda1d9c11486..0743a0536560 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -79,6 +79,7 @@ int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +int scx_check_setscheduler(struct task_struct *p, int policy); void init_sched_ext_class(void); __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, @@ -136,6 +137,8 @@ static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return 0; } +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } static inline void init_sched_ext_class(void) {} static inline void scx_notify_sched_tick(void) {} diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index 9e0b6519c8a4..b6febc5dadbf 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -24,6 +24,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; +const volatile s32 disallow_tgid; u32 test_error_cnt; @@ -208,6 +209,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) 
s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, struct scx_enable_args *args) { + if (p->tgid == disallow_tgid) + p->scx.disallow = true; + /* * @p is new. Let's ensure that its task_ctx is available. We can sleep * in this function and the following will automatically use GFP_KERNEL. diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 34c764c38e19..99cc7169bd90 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,12 +20,13 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n" +"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" " -t COUNT Stall every COUNT'th user thread\n" " -T COUNT Stall every COUNT'th kernel thread\n" +" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -63,6 +64,11 @@ int main(int argc, char **argv) case 'T': skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); break; + case 'd': + skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); + if (skel->rodata->disallow_tgid < 0) + skel->rodata->disallow_tgid = getpid(); + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; From patchwork Wed Nov 30 08:23:02 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059554 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id A0DFDC4321E for ; Wed, 30 Nov 2022 08:26:44 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235343AbiK3I0m (ORCPT ); Wed, 30 Nov 2022 03:26:42 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56462 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234861AbiK3IZY (ORCPT ); Wed, 30 Nov 2022 03:25:24 -0500 Received: from mail-pg1-x534.google.com (mail-pg1-x534.google.com [IPv6:2607:f8b0:4864:20::534]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0DF4B43AEB; Wed, 30 Nov 2022 00:24:21 -0800 (PST) Received: by mail-pg1-x534.google.com with SMTP id q1so15362119pgl.11; Wed, 30 Nov 2022 00:24:21 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=3xNLbIsa5gtviYtGbS0XJgTqtP9STFkn3KhLBxJlSY0=; b=EQR4AeKgF7dwfmXpdNFKVBnWmW+pCUqxeTtRwXRJ0+4LEYOg8snYX1U2aIR53pm2GN p8svxnC6OYcJxckHdlLkVzjt69Waao3KPNuqZhTs7FGt7RWAHbaqj+l8TBzeogJNmsn+ mammGiPEkAwPtVELXSq8Rs88LCdIq5gttHc6QbYS6L3s2aPD0yyeiQBECSRRUwJK2wck FZC776aaw2nQbLtwW1uDPTejQ6kUHsXIMgR7dLQ61QjcSCXx7iIcJMts5ncRBeNWsQWJ /XVXd8CgSWlxOTNRiejilc+YTJkzI9qMLGAu8GWNuaeKVrDx02eXKM7SasmXGZASwvKr XdWw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; 
bh=3xNLbIsa5gtviYtGbS0XJgTqtP9STFkn3KhLBxJlSY0=; b=Thg7bjgkFC7ktYw/33ybMjYoZd4vuAJx6vAGUjJ55DK20WQNG5hRMqGrNN9Lx8VpNR LQo2emkbmuL1MFpqHXi1Du0UGlmZ80k+wti1uLk8sh01soF6Mnsrhs2rbU1C3j4VWXtI 8UuKxPm6f0TwUlLokW1QGlOj74PAFC7TIIOajSM+FKagNX3G3y2k67SSAcr4ceS2BE8y I+BMA5wQk/IPem5xNTM6A3rSiueqoTYpFSW/KTxGBJMOqvIxUOH+x15nDo0uU/uJOJk4 HxfT4HQOVtLWyivIZz/vH5YFodIHjUJ72sTnl30K2xuf0ITkn8HfL8o9ca+kB0wNdcqg 5adw== X-Gm-Message-State: ANoB5pnsP4sV3b07xPlsCA62CR8zp7XfDdsFO78FtIjuffFUgSSrLtVc /jPb2g46qxfT9pA3KtfhL/Y= X-Google-Smtp-Source: AA0mqf52p+ZRo0BmR4ZL/XEv1sKhOHpYfvNqaOh/EtAiX1D/HZyRqbODM6vn6qrOJxKDoS/21Ztu/A== X-Received: by 2002:a63:114b:0:b0:46a:e00b:ada0 with SMTP id 11-20020a63114b000000b0046ae00bada0mr38704577pgr.409.1669796660212; Wed, 30 Nov 2022 00:24:20 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id s3-20020a625e03000000b0056bf29c9ba3sm820774pfb.146.2022.11.30.00.24.19 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:19 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 20/31] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext Date: Tue, 29 Nov 2022 22:23:02 -1000 Message-Id: <20221130082313.3241517-21-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Currently, to use sched_ext, each task has to be put into sched_ext using sched_setscheduler(2). However, some BPF schedulers and use cases might prefer to service all eligible tasks. This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler swaps are transparent to the users and applications. As we know that CFS is not being used when scx_bpf_switch_all() is used, we can also disable hot path entry points with static_branches. Both the dummy and qmap example schedulers are updated with the '-a' option which enables the switch_all behavior. 
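[Editor's note: as a quick illustration of the intended usage, the update to the dummy scheduler in the diff below essentially boils down to the following minimal sketch -- a loader-controlled read-only toggle decides whether ops.init() opts every eligible task into SCX. The "example_*" names here are placeholders, not identifiers from the patch.]

	const volatile bool switch_all;	/* set by the loader, e.g. via -a */

	s32 BPF_STRUCT_OPS(example_init)
	{
		/* runs once at load time; the actual switching is asynchronous */
		if (switch_all)
			scx_bpf_switch_all();
		return 0;
	}

	SEC(".struct_ops")
	struct sched_ext_ops example_ops = {
		.init		= (void *)example_init,
		/* .enqueue, .exit etc. omitted for brevity */
		.name		= "example",
	};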
Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 8 +++-- kernel/sched/ext.c | 42 +++++++++++++++++++++++++ kernel/sched/ext.h | 5 +++ tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_dummy.bpf.c | 11 +++++++ tools/sched_ext/scx_example_dummy.c | 8 +++-- tools/sched_ext/scx_example_qmap.bpf.c | 9 ++++++ tools/sched_ext/scx_example_qmap.c | 7 +++-- 8 files changed, 84 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3404277fed30..20536957840d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1204,7 +1204,7 @@ bool sched_can_stop_tick(struct rq *rq) * if there's more than one we need the tick for involuntary * preemption. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) return false; return true; @@ -5520,8 +5520,10 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); + } #endif } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ddd5aa4a8bca..ba0a7a9ea5f2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -73,6 +73,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static bool scx_switch_all_req; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + static struct sched_ext_ops scx_ops; static bool warned_zero_slice; @@ -1844,6 +1848,8 @@ bool task_on_scx(struct task_struct *p) { if (!scx_enabled() || scx_ops_disabling()) return false; + if (READ_ONCE(scx_switching_all)) + return true; return p->policy == SCHED_EXT; } @@ -1982,6 +1988,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) */ mutex_lock(&scx_ops_enable_mutex); + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + /* avoid racing against fork */ cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); @@ -2159,6 +2168,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) */ cpus_read_lock(); + scx_switch_all_req = false; if (scx_ops.init) { ret = scx_ops.init(); @@ -2281,6 +2291,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. 
*/ + WRITE_ONCE(scx_switching_all, scx_switch_all_req); + scx_task_iter_init(&sti); while ((p = scx_task_iter_next_filtered_locked(&sti))) { if (READ_ONCE(p->__state) != TASK_DEAD) { @@ -2312,6 +2324,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops) goto err_disable_unlock; } + if (scx_switch_all_req) + static_branch_enable_cpuslocked(&__scx_switched_all); + cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); @@ -2346,6 +2361,9 @@ static int scx_debug_show(struct seq_file *m, void *v) mutex_lock(&scx_ops_enable_mutex); seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); + seq_printf(m, "%-30s: %d\n", "switching_all", + READ_ONCE(scx_switching_all)); + seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); seq_printf(m, "%-30s: %s\n", "enable_state", scx_ops_enable_state_str[scx_ops_enable_state()]); seq_printf(m, "%-30s: %llu\n", "nr_rejected", @@ -2586,6 +2604,28 @@ __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); +/** + * scx_bpf_switch_all - Switch all tasks into SCX + * @into_scx: switch direction + * + * If @into_scx is %true, all existing and future non-dl/rt tasks are switched + * to SCX. If %false, only tasks which have %SCHED_EXT explicitly set are put on + * SCX. The actual switching is asynchronous. Can be called from ops.init(). + */ +void scx_bpf_switch_all(void) +{ + scx_switch_all_req = true; +} + +BTF_SET8_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_switch_all) +BTF_SET8_END(scx_kfunc_ids_init) + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, +}; + /** * scx_bpf_create_dsq - Create a dsq * @dsq_id: dsq to attach @@ -3015,6 +3055,8 @@ static int __init register_ext_kfuncs(void) * allow all kfuncs for everybody. 
*/ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_init)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_sleepable)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0743a0536560..142dce30764d 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -71,7 +71,9 @@ extern unsigned long task_runnable_timeout_ms; extern unsigned long last_timeout_check; DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) bool task_on_scx(struct task_struct *p); void scx_pre_fork(struct task_struct *p); @@ -108,6 +110,8 @@ static inline void scx_notify_sched_tick(void) static inline const struct sched_class *next_active_class(const struct sched_class *class) { class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; if (!scx_enabled() && class == &ext_sched_class) class++; return class; @@ -130,6 +134,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #else /* CONFIG_SCHED_CLASS_EXT */ #define scx_enabled() false +#define scx_switched_all() false static inline void scx_pre_fork(struct task_struct *p) {} static inline int scx_fork(struct task_struct *p) { return 0; } diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 4cfbbee38d9a..212cb934db2d 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -48,6 +48,7 @@ void ___scx_bpf_error_format_checker(const char *fmt, ...) {} ___scx_bpf_error_format_checker(fmt, ##args); \ }) +extern void scx_bpf_switch_all(void) __ksym; extern s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; extern bool scx_bpf_consume(u64 dsq_id) __ksym; extern u32 scx_bpf_dispatch_nr_slots(void) __ksym; diff --git a/tools/sched_ext/scx_example_dummy.bpf.c b/tools/sched_ext/scx_example_dummy.bpf.c index ac7b490b5a39..28251373d1c3 100644 --- a/tools/sched_ext/scx_example_dummy.bpf.c +++ b/tools/sched_ext/scx_example_dummy.bpf.c @@ -7,6 +7,7 @@ * * - Statistics tracking how many are queued to local and global dsq's. * - Termination notification for userspace. + * - Support for switch_all. * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
* Copyright (c) 2022 Tejun Heo @@ -16,6 +17,8 @@ char _license[] SEC("license") = "GPL"; +const volatile bool switch_all; + struct user_exit_info uei; struct { @@ -32,6 +35,13 @@ static void stat_inc(u32 idx) (*cnt_p)++; } +s32 BPF_STRUCT_OPS(dummy_init) +{ + if (switch_all) + scx_bpf_switch_all(); + return 0; +} + void BPF_STRUCT_OPS(dummy_enqueue, struct task_struct *p, u64 enq_flags) { if (enq_flags & SCX_ENQ_LOCAL) { @@ -51,6 +61,7 @@ void BPF_STRUCT_OPS(dummy_exit, struct scx_exit_info *ei) SEC(".struct_ops") struct sched_ext_ops dummy_ops = { .enqueue = (void *)dummy_enqueue, + .init = (void *)dummy_init, .exit = (void *)dummy_exit, .name = "dummy", }; diff --git a/tools/sched_ext/scx_example_dummy.c b/tools/sched_ext/scx_example_dummy.c index 72881c881830..9229973e8698 100644 --- a/tools/sched_ext/scx_example_dummy.c +++ b/tools/sched_ext/scx_example_dummy.c @@ -19,8 +19,9 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s\n" +"Usage: %s [-a]\n" "\n" +" -a Switch all tasks\n" " -h Display this help and exit\n"; static volatile int exit_req; @@ -64,8 +65,11 @@ int main(int argc, char **argv) skel = scx_example_dummy__open(); assert(skel); - while ((opt = getopt(argc, argv, "h")) != -1) { + while ((opt = getopt(argc, argv, "ah")) != -1) { switch (opt) { + case 'a': + skel->rodata->switch_all = true; + break; default: fprintf(stderr, help_fmt, basename(argv[0])); return opt != 'h'; diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index b6febc5dadbf..bde8cd339935 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -22,6 +22,7 @@ char _license[] SEC("license") = "GPL"; const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile bool switch_all; const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile s32 disallow_tgid; @@ -223,6 +224,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, return -ENOMEM; } +s32 BPF_STRUCT_OPS(qmap_init) +{ + if (switch_all) + scx_bpf_switch_all(); + return 0; +} + void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) { uei_record(&uei, ei); @@ -238,6 +246,7 @@ struct sched_ext_ops qmap_ops = { */ .dispatch = (void *)qmap_dispatch, .prep_enable = (void *)qmap_prep_enable, + .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, .name = "qmap", diff --git a/tools/sched_ext/scx_example_qmap.c b/tools/sched_ext/scx_example_qmap.c index 99cc7169bd90..52a44269508b 100644 --- a/tools/sched_ext/scx_example_qmap.c +++ b/tools/sched_ext/scx_example_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" -"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n" +"Usage: %s [-a] [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -50,8 +50,11 @@ int main(int argc, char **argv) skel = scx_example_qmap__open(); assert(skel); - while ((opt = getopt(argc, argv, "hs:e:t:T:d:")) != -1) { + while ((opt = getopt(argc, argv, "ahs:e:t:T:d:")) != -1) { switch (opt) { + case 'a': + skel->rodata->switch_all = true; + break; case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; From patchwork Wed Nov 30 08:23:03 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059555 
X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0F2EEC433FE for ; Wed, 30 Nov 2022 08:26:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235186AbiK3I0o (ORCPT ); Wed, 30 Nov 2022 03:26:44 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54952 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235041AbiK3IZZ (ORCPT ); Wed, 30 Nov 2022 03:25:25 -0500 Received: from mail-pj1-x102e.google.com (mail-pj1-x102e.google.com [IPv6:2607:f8b0:4864:20::102e]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2706328707; Wed, 30 Nov 2022 00:24:23 -0800 (PST) Received: by mail-pj1-x102e.google.com with SMTP id k2-20020a17090a4c8200b002187cce2f92so1215908pjh.2; Wed, 30 Nov 2022 00:24:23 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=Kh4NrL+dr0sdu1195cFhZBscorFNrhboBCZ/rmCbEHU=; b=PYx04NfZyFn6FYHHqEuXFBc9XpU78sr1/3nnZrnlgeQVZCNB4WT+Exb4nKDj84FXbV vyssL1SnRfEwaHJUfVl6dWgWrvQOk3bkn807Bzx2L8heKaUeHhY1Qkk6/uyEtqmJ7fmk AdtyPyiQIHQ4Xzp5CPr3eU+NoHp2ErzNVTnfb/d1DAZEL3SiJHkwkRxd4OHTEdOYtZSZ ydaLp3N181Lywg1LigB8vH5vQubpBgbxuFDAn2lTviS40TucHlUwubmXMETZ8g9Er8oU xVu9/7rAc/2X5jBmITlgI3406P8zOn3Lf46sk2AEXuBUmybZ37loiRNg7dSzggjXYZR6 xiRw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=Kh4NrL+dr0sdu1195cFhZBscorFNrhboBCZ/rmCbEHU=; b=cPGWM/Xeqz84q42n3Rpmqes+JRZilECZUSatZYHwFRw0fHRbl0lLWOXYh4TarBRUZD BjBv8tOxEWCPJOiUPGTZo1UWVy7ETQ9/E//aec0sq0FvfnSG1wqRpXZWeNk64T5wp4G8 O15jHm4Od16PXZ4t7aFaxMtJea4sacp72exqTLIJRGqJtlm3R02ebKR+8rAPw1CLZbXq oYB5cfwl7LcCfZb/W2FWSZLXjvxd2Fzjbvm/jHrcQyg7Cm1urlz/0dxLvr/cg6BauoHC A6ifFHbs//+53mT9r3wW9wjUk25GTiDQduYh4f8j7WXdc3aRh/q0mac3XQ/afwu0E8Ad +Cug== X-Gm-Message-State: ANoB5pkXt0XBgv+/XLBX9VZ8yaUR/dhmCXnzGFnGN0BaBpfpdMcTTS4J 6fn7XQ9XtofqCWT0oLWIw9M= X-Google-Smtp-Source: AA0mqf4uOwbcXXp4YdqoIxlc0LtxMFtvW97TLx83nqFRy/KUP9fsHuEUzGOUkSPr4B640TsjdLllaA== X-Received: by 2002:a17:90a:3d41:b0:213:d34:a80b with SMTP id o1-20020a17090a3d4100b002130d34a80bmr64370418pjf.74.1669796662290; Wed, 30 Nov 2022 00:24:22 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id p4-20020a170902e74400b00186b55e3cd6sm769366plf.133.2022.11.30.00.24.21 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:21 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, 
kernel-team@meta.com, Tejun Heo Subject: [PATCH 21/31] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support Date: Tue, 29 Nov 2022 22:23:03 -1000 Message-Id: <20221130082313.3241517-22-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org It's often useful to wake up and/or trigger reschedule on other CPUs. This patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to kick the target CPU into the scheduling path. As a sched_ext task relinquishes its CPU only after its slice is depleted, this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the slice of the target CPU's current task to guarantee that sched_ext's scheduling path runs on the CPU. This patch also adds a new example scheduler, scx_example_central, which demonstrates central scheduling where one CPU is responsible for making all scheduling decisions in the system. The central CPU makes scheduling decisions for all CPUs in the system, queues tasks on the appropriate local dsq's and preempts the worker CPUs. The worker CPUs in turn preempt the central CPU when it needs tasks to run. Currently, every CPU depends on its own tick to expire the current task. A follow-up patch implementing tickless support for sched_ext will allow the worker CPUs to go full tickless so that they can run completely undisturbed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 + kernel/sched/ext.c | 88 ++++++++- kernel/sched/ext.h | 12 ++ kernel/sched/sched.h | 1 + tools/sched_ext/.gitignore | 1 + tools/sched_ext/Makefile | 8 +- tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_central.bpf.c | 229 ++++++++++++++++++++++ tools/sched_ext/scx_example_central.c | 91 +++++++++ 9 files changed, 430 insertions(+), 5 deletions(-) create mode 100644 tools/sched_ext/scx_example_central.bpf.c create mode 100644 tools/sched_ext/scx_example_central.c diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 82dcbecfcfb9..6e25c3431bb4 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -394,6 +394,10 @@ struct sched_ext_entity { * scx_bpf_dispatch() but can also be modified directly by the BPF * scheduler. Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. 
*/ u64 slice; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ba0a7a9ea5f2..4a98047a06bc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -404,7 +404,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & SCX_ENQ_HEAD) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_node, &dsq->fifo); else list_add_tail(&p->scx.dsq_node, &dsq->fifo); @@ -420,8 +420,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (is_local) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; - if (sched_class_above(&ext_sched_class, rq->curr->sched_class)) + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) resched_curr(rq); } else { raw_spin_unlock(&dsq->lock); @@ -1707,7 +1715,9 @@ int scx_check_setscheduler(struct task_struct *p, int policy) * Omitted operations: * * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the - * task isn't tied to the CPU at that point. + * task isn't tied to the CPU at that point. Preemption is implemented by + * resetting the victim task's slice to 0 and triggering reschedule on the + * target CPU. * * - migrate_task_rq: Unncessary as task to cpu mapping is transient. * @@ -2564,6 +2574,34 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { .enable_mask = SYSRQ_ENABLE_RTNICE, }; +#ifdef CONFIG_SMP +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + int this_cpu = cpu_of(this_rq); + int cpu; + + for_each_cpu(cpu, this_rq->scx.cpus_to_kick) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (cpu_online(cpu) || cpu == this_cpu) { + if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && + rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + resched_curr(rq); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + } + + cpumask_clear(this_rq->scx.cpus_to_kick); + cpumask_clear(this_rq->scx.cpus_to_preempt); +} +#endif + void __init init_sched_ext_class(void) { int cpu; @@ -2587,6 +2625,11 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.watchdog_list); rq->scx.nr_running = 0; +#ifdef CONFIG_SMP + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); +#endif } register_sysrq_key('S', &sysrq_sched_ext_reset_op); @@ -2772,6 +2815,44 @@ static const struct btf_kfunc_id_set scx_kfunc_set_consume = { .set = &scx_kfunc_ids_consume, }; +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d", cpu); + return; + } +#ifdef CONFIG_SMP + { + struct rq *rq; + + preempt_disable(); + rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid + * nesting rq locks. 
We can probably be smarter and avoid + * bouncing if called from ops which don't hold a rq lock. + */ + cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); + + irq_work_queue(&rq->scx.kick_cpus_irq_work); + preempt_enable(); + } +#endif +} + /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the dsq @@ -2845,6 +2926,7 @@ s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed) } BTF_SET8_START(scx_kfunc_ids_online) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu) diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 142dce30764d..3597b7b5829e 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -19,6 +19,14 @@ enum scx_enq_flags { /* high 32bits are ext specific flags */ + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -51,6 +59,10 @@ enum scx_deq_flags { SCX_DEQ_SLEEP = DEQUEUE_SLEEP, }; +enum scx_kick_flags { + SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ +}; + #ifdef CONFIG_SCHED_CLASS_EXT struct sched_enq_and_set_ctx { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 40692fa7cb3c..0d8b52c52e2b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -698,6 +698,7 @@ struct scx_rq { #ifdef CONFIG_SMP cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; + struct irq_work kick_cpus_irq_work; #endif }; #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index 6734f7fd9324..389f0e5b0970 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -1,5 +1,6 @@ scx_example_dummy scx_example_qmap +scx_example_central *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index db99745e566d..d406b7586e08 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -114,7 +114,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_dummy scx_example_qmap +all: scx_example_dummy scx_example_qmap scx_example_central # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -173,10 +173,14 @@ scx_example_qmap: scx_example_qmap.c scx_example_qmap.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_example_dummy scx_example_qmap + rm -f scx_example_dummy scx_example_qmap scx_example_central .PHONY: all clean diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index 212cb934db2d..dc4d3f7b461f 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -53,6 +53,7 @@ extern s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; extern bool 
scx_bpf_consume(u64 dsq_id) __ksym; extern u32 scx_bpf_dispatch_nr_slots(void) __ksym; extern void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +extern void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; extern s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; extern bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; extern s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c new file mode 100644 index 000000000000..f53ed4baf92d --- /dev/null +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the followings: + * + * a. Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straight-forward. e.g. It'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various scx mechanisms such as LOCAL_ON dispatching and + * consume_final(). + * + * b. Preemption + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MAX_CPUS = 4096, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, +}; + +const volatile bool switch_all; +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids; + +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_dispatches, nr_mismatches, nr_overflows; + +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +static bool cpu_gimme_task[MAX_CPUS]; + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU. 
+ */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + if (bpf_map_push_elem(¢ral_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static int dispatch_a_task_loopfn(u32 idx, void *data) +{ + s32 cpu = *(s32 *)data; + s32 pid; + struct task_struct *p; + bool *gimme; + + if (bpf_map_pop_elem(¢ral_q, &pid)) + return 1; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = scx_bpf_find_task_by_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + return 0; + } + + /* + * If we can't run the task at the top, do the dumb thing and bounce it + * to the fallback dsq. + */ + if (!scx_bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + return 0; + } + + /* dispatch to the local and mark that @cpu doesn't need more tasks */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, 0); + + gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + if (gimme) + *gimme = false; + + return 1; +} + +static int dispatch_to_one_cpu_loopfn(u32 idx, void *data) +{ + s32 cpu = idx; + + if (cpu >= 0 && cpu < MAX_CPUS) { + bool *gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + if (gimme && !*gimme) + return 0; + } + + bpf_loop(1 << 23, dispatch_a_task_loopfn, &cpu, 0); + return 0; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + /* if out of tasks to run, gimme more */ + if (!scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID)) { + bool *gimme = MEMBER_VPTR(cpu_gimme_task, [cpu]); + if (gimme) + *gimme = true; + } + + if (cpu == central_cpu) { + /* we're the scheduling CPU, dispatch for every CPU */ + __sync_fetch_and_add(&nr_dispatches, 1); + bpf_loop(nr_cpu_ids, dispatch_to_one_cpu_loopfn, NULL, 0); + } else { + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. + */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +void BPF_STRUCT_OPS(central_consume, s32 cpu) +{ + /* + * When preempted, we want the central CPU to always run dispatch() as + * soon as possible so that it can schedule other CPUs. Don't consume + * the fallback dsq if central. + */ + if (cpu != central_cpu) + scx_bpf_consume(FALLBACK_DSQ_ID); +} + +void BPF_STRUCT_OPS(central_consume_final, s32 cpu) +{ + /* + * Now that the central CPU has dispatched, we can let it consume the + * fallback dsq. + */ + if (cpu == central_cpu) + scx_bpf_consume(FALLBACK_DSQ_ID); +} + +int BPF_STRUCT_OPS(central_init) +{ + if (switch_all) + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops central_ops = { + /* + * We are offloading all scheduling decisions to the central CPU and + * thus being the last task on a given CPU doesn't mean anything + * special. Enqueue the last tasks like any other tasks. 
+ */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .consume = (void *)central_consume, + .consume_final = (void *)central_consume_final, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central", +}; diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_example_central.c new file mode 100644 index 000000000000..c85e84459c58 --- /dev/null +++ b/tools/sched_ext/scx_example_central.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_central.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-a] [-c CPU]\n" +"\n" +" -a Switch all tasks\n" +" -c CPU Override the central CPU (default: 0)\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_central *skel; + struct bpf_link *link; + u64 seq = 0; + s32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_central__open(); + assert(skel); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "ahc:")) != -1) { + switch (opt) { + case 'a': + skel->rodata->switch_all = true; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + assert(!scx_example_central__load(skel)); + + link = bpf_map__attach_struct_ops(skel->maps.central_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + printf("[SEQ %lu]\n", seq++); + printf("total:%10lu local:%10lu queued:%10lu lost:%10lu\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf(" dispatch:%10lu mismatch:%10lu overflow:%10lu\n", + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_central__destroy(skel); + return 0; +} From patchwork Wed Nov 30 08:23:04 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059556 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7651CC352A1 for ; Wed, 30 Nov 2022 08:26:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235370AbiK3I0o (ORCPT ); Wed, 30 Nov 2022 03:26:44 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55918 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234873AbiK3IZZ (ORCPT ); Wed, 30 Nov 2022 03:25:25 -0500 Received: from mail-pl1-x633.google.com (mail-pl1-x633.google.com 
[IPv6:2607:f8b0:4864:20::633]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id E722E5C776; Wed, 30 Nov 2022 00:24:24 -0800 (PST) Received: by mail-pl1-x633.google.com with SMTP id d3so10968524plr.10; Wed, 30 Nov 2022 00:24:24 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=sTx6H8f+ndFeS1WR24S49kfIzumfz2ZStsXAYvcIlho=; b=WeAHEayqB1rE8cJMhH0BNAf20UpSIBGx/i4g+C1yIjyzBp9rAcSEniWNobf/5Dav7C CESxiyh0gEtHlZTT3hG7TPOaPAEz8DFjxfhUzbdSF3yAX9CA9VkXKLYMS5nr9fPXj/3l 5m81ZwF1hwCsHzmLC9BqebSdD+uwBb06HVnQsUseYgseGzz+sm2wgOSFgmMrajiOak8j 8SDninprROYJep3hfEgrexvxoJ0NFjWSdiJVuX2HpKGLbPqTJoERJOHMxwnzL5Pd6mvk ddaGJ3PPHSU225HBlRahWCQi38moCjShla+JoAtZ0Wa7wqvDb2nqQ6K4Fdjy4fFQXPng HLlQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=sTx6H8f+ndFeS1WR24S49kfIzumfz2ZStsXAYvcIlho=; b=CpRvGj406fh5PcEKvh4pm0HbOsf1sxrHTze04aSkLS2lycXwip9XoMJzeKUjqovFbG EyJJOF+nNcE1IvnyBSGdwCrCMTJzDQSPh2iHoqBJBiQcKXJYdY+XgKiN/9IiGQJi8wAV PDkyWSSTuwBfGxWyOJLAdy6DrimgkcWiCIQ/HET2U/oKKnelfXn87bhX1eE5nVC2ySrN Gs1CKgU586qWATaiXnfBhEYX8AMut9RU4ppy5ShjoOZ+BIvWIxSgdw0YakhxWgUHIzy+ so0of0eKnT/cZ1t99pcBISDW/uZmDNIuSEYCcDS0deOXVWr/xofKqFtN8uAyX0oqUyH3 NnmQ== X-Gm-Message-State: ANoB5pkkKiYaqE+YOFveiAGx8DpQylRv4StnRbDWfTAeIhe2kJZEA04V Ub5fmBMm2sgHjGUxI1/lY8I= X-Google-Smtp-Source: AA0mqf5TZN5pEmu46xuGXI8smPjweGBma2CdnQPyJR3K7eVBRBdquc+nxuWiBys1CYzH3/JZLn04Yg== X-Received: by 2002:a17:902:e807:b0:188:f6b7:bbf8 with SMTP id u7-20020a170902e80700b00188f6b7bbf8mr41537349plg.112.1669796664242; Wed, 30 Nov 2022 00:24:24 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id i15-20020a17090332cf00b001892af9472esm741249plr.261.2022.11.30.00.24.23 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:23 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 22/31] sched_ext: Add task state tracking operations Date: Tue, 29 Nov 2022 22:23:04 -1000 Message-Id: <20221130082313.3241517-23-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Being able to track the task runnable and running state transitions are useful for a variety of purposes including latency tracking and load factor calculation. Currently, BPF schedulers don't have a good way of tracking these transitions. 
Becoming runnable can be determined from ops.enqueue() but becoming quiescent can only be inferred from the lack of subsequent enqueue. Also, as the local dsq can have multiple tasks and some events are handled in the sched_ext core, it's difficult to determine when a given task starts and stops executing. This patch adds sched_ext_ops.runnable(), .running(), .stopping() and .quiescent() operations to track the task runnable and running state transitions. They're mostly self explanatory; however, we want to ensure that running <-> stopping transitions are always contained within runnable <-> quiescent transitions which is a bit different from how the scheduler core behaves. This adds a bit of complication. See the comment in dequeue_task_scx(). Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 65 +++++++++++++++++++++++++++++++++++++++ kernel/sched/ext.c | 31 +++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 6e25c3431bb4..4f8898556b28 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -209,6 +209,71 @@ struct sched_ext_ops { */ void (*consume_final)(s32 cpu); + /** + * runnable - A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU. Likewise, a task + * may be ->enqueue()'d without being preceded by this operation e.g. + * after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * running - A task is starting to run on its associated CPU + * @p: task starting to run + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * stopping - A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * quiescent - A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. 
+ */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + /** * yield - Yield CPU * @from: yielding task diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4a98047a06bc..2eb382ed0e2f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -670,6 +670,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags rq->scx.nr_running++; add_nr_running(rq, 1); + if (SCX_HAS_OP(runnable)) + scx_ops.runnable(p, enq_flags); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); } @@ -716,6 +719,26 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags break; } + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + scx_ops.stopping(p, false); + } + + if (SCX_HAS_OP(quiescent)) + scx_ops.quiescent(p, deq_flags); + p->scx.flags &= ~SCX_TASK_QUEUED; scx_rq->nr_running--; sub_nr_running(rq, 1); @@ -1223,6 +1246,10 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) p->se.exec_start = rq_clock_task(rq); + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + scx_ops.running(p); + watchdog_unwatch_task(p, true); } @@ -1230,6 +1257,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) { update_curr_scx(rq); + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + scx_ops.stopping(p, true); + /* * If we're being called from put_prev_task_balance(), balance_scx() may * have decided that @p should keep running. 
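[Editor's note: the series itself doesn't ship an example scheduler using these callbacks, so here is a hypothetical sketch of how one might consume them -- accumulating runnable-to-running wait time with task-local storage. It assumes the scx_common.bpf.h helpers used by the example schedulers and that BPF task storage is usable from these operations; the "lat_*" names are placeholders.]

	#include "scx_common.bpf.h"

	char _license[] SEC("license") = "GPL";

	struct task_ctx {
		u64 runnable_at;		/* timestamp of the last ->runnable() */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct task_ctx);
	} task_ctxs SEC(".maps");

	u64 nr_waits, total_wait_ns;		/* read by userspace via the skeleton */

	void BPF_STRUCT_OPS(lat_runnable, struct task_struct *p, u64 enq_flags)
	{
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctxs, p, 0,
					    BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (tctx)
			tctx->runnable_at = bpf_ktime_get_ns();
	}

	void BPF_STRUCT_OPS(lat_running, struct task_struct *p)
	{
		struct task_ctx *tctx = bpf_task_storage_get(&task_ctxs, p, 0, 0);

		if (tctx && tctx->runnable_at) {
			__sync_fetch_and_add(&nr_waits, 1);
			__sync_fetch_and_add(&total_wait_ns,
					     bpf_ktime_get_ns() - tctx->runnable_at);
			tctx->runnable_at = 0;
		}
	}

	SEC(".struct_ops")
	struct sched_ext_ops lat_ops = {
		.runnable	= (void *)lat_runnable,
		.running	= (void *)lat_running,
		/* a real scheduler would also supply .enqueue, .dispatch, etc. */
		.name		= "lat",
	};

A per-task latency histogram or load-factor tracking would follow the same pattern, updating from ->stopping() and ->quiescent() instead.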
From patchwork Wed Nov 30 08:23:05 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059557 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 537DEC4321E for ; Wed, 30 Nov 2022 08:26:48 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S235088AbiK3I0q (ORCPT ); Wed, 30 Nov 2022 03:26:46 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54984 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234858AbiK3IZa (ORCPT ); Wed, 30 Nov 2022 03:25:30 -0500 Received: from mail-pj1-x1033.google.com (mail-pj1-x1033.google.com [IPv6:2607:f8b0:4864:20::1033]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 8A3985CD19; Wed, 30 Nov 2022 00:24:27 -0800 (PST) Received: by mail-pj1-x1033.google.com with SMTP id k2-20020a17090a4c8200b002187cce2f92so1216036pjh.2; Wed, 30 Nov 2022 00:24:27 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=NHTDzRdz5DA6IrrQUBWLTf9dsKz4vZyh/RaT4ZRnGbs=; b=NswQGLw+Yjzh7aN9TicpM7rOiVvvLDWt+r98Mv2sRM2eh7iBFGBVhtsiSgeqLX2qwf BIV5hpYwnKM9fBH5R0b04rVUFEjwLrEi0iF4qZkaX3aK8UmqZOeHsxdHI+PgQoJPPe4X 95ZtdDssB33Z/qck6fk95BKgpd8SHGmKkJZUq8qwJ9fJH5TGsseFFmHrJqkVf7R8i6Ia 7RCnyqMSF1zYLNLCgjtucM7Guq2+Spl4KJMn8NfT0nww9oxnNnGMhSTc6ZiGmSiLG3CU mQQbCFUBtyoBa2lQLBDUUf6T70CxggfjvKbbtNJY2r591N3FJ34/9eMSeRs65wG4LUR7 JTyg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=NHTDzRdz5DA6IrrQUBWLTf9dsKz4vZyh/RaT4ZRnGbs=; b=63K/HowMS8r51C4sFlH4irNbnJ/y4TNQLCPBlXguGBBFvrryNuTq8PBDQpiQp0WsvV QQNxeQvegq4xwSZ845wWQwAKF+OQUkQ3YZUFcdnnzVm2v1AJRzZZGtBmW379Lnj3afZQ G60bNkCeZxKBOfjZJ3KEenpG45lpGWV4b8kMZfGq0XzB6kx34QgzhtnN0s8Zd+I75vF/ UVBZPyfSx3hME1v9E/58t5SKjlv7pR4oJzYj+UVDllmCmRV7LwCRCOe1Kd9M1e1MeaT1 W9Oh+9ybdVf5tQl+roklo4RUeZ59g//sTS8F8jP1o/5mGygs2zyL0GV8bw/eKMwzDsj8 Pccg== X-Gm-Message-State: ANoB5pn/5bTAmZgoyNfCd2969XX76UpJEAzwHUfkiwu7vUFBWIY5pm7y CBNCrUKwYZftk6Zg8o5XNVg= X-Google-Smtp-Source: AA0mqf7pccS0FtBSkVA8kSq6vgg+b67wsbp3cCRBG6xVFu1aihf6EDapVgQxsOKLbvQyxZ4GWwTl5Q== X-Received: by 2002:a17:902:7006:b0:181:b55a:f987 with SMTP id y6-20020a170902700600b00181b55af987mr41609614plk.67.1669796666302; Wed, 30 Nov 2022 00:24:26 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id o13-20020a170902d4cd00b0017f7c4e260fsm765905plg.150.2022.11.30.00.24.25 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:25 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, 
pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 23/31] sched_ext: Implement tickless support Date: Tue, 29 Nov 2022 22:23:05 -1000 Message-Id: <20221130082313.3241517-24-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Allow BPF schedulers to indicate tickless operation by setting p->scx.slice to SCX_SLICE_INF. A CPU whose current task has infinte slice goes into tickless operation. scx_example_central is updated to use tickless operations for all tasks and instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT and task state tracking added by the previous patches. Currently, there is no way to pin the timer on the central CPU, so it may end up on one of the worker CPUs; however, outside of that, the worker CPUs can go tickless both while running sched_ext tasks and idling. With schbench running, scx_example_central shows: root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 142024 656 664 449 Local timer interrupts LOC: 161663 663 665 449 Local timer interrupts Without it: root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 188778 3142 3793 3993 Local timer interrupts LOC: 198993 5314 6323 6438 Local timer interrupts While scx_example_central itself is too barebone to be useful as a production scheduler, a more featureful central scheduler can be built using the same approach. Google's experience shows that such an approach can have significant benefits for certain applications such as VM hosting. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 1 + kernel/sched/core.c | 9 +- kernel/sched/ext.c | 43 +++++- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 6 + tools/sched_ext/scx_example_central.bpf.c | 160 +++++++++++++++++++++- tools/sched_ext/scx_example_central.c | 3 +- 7 files changed, 212 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4f8898556b28..b1c95fb11c8d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -19,6 +19,7 @@ enum scx_consts { SCX_EXIT_MSG_LEN = 1024, SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20536957840d..89d2421809da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1200,13 +1200,16 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. 
*/ if (!scx_switched_all() && rq->nr_running > 1) return false; + if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + return true; } #endif /* CONFIG_NO_HZ_FULL */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2eb382ed0e2f..cf6493f684f3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -383,7 +383,8 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (curr->scx.slice != SCX_SLICE_INF) + curr->scx.slice -= min(curr->scx.slice, delta_exec); } static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, @@ -1251,6 +1252,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) scx_ops.running(p); watchdog_unwatch_task(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See can_stop_tick_scx(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + } } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -1742,6 +1757,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_disabling()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can consume from different dsq's, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). 
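+ * (The flag is refreshed from set_next_task_scx() whenever a task is
+ * picked, so it reflects the slice of the task currently running on @rq.)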
+ */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + /* * Omitted operations: * @@ -1923,7 +1958,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct scx_task_iter sti; struct task_struct *p; const char *reason; - int i, type; + int i, cpu, type; type = atomic_read(&scx_exit_type); while (true) { @@ -2021,6 +2056,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + /* kick all CPUs to restore ticks */ + for_each_possible_cpu(cpu) + resched_cpu(cpu); + forward_progress_guaranteed: /* * Here, every runnable task is guaranteed to make forward progress and diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 3597b7b5829e..e9ec267f13d5 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -94,6 +94,7 @@ void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); void init_sched_ext_class(void); __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, @@ -156,6 +157,7 @@ static inline int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return 0; } static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void init_sched_ext_class(void) {} static inline void scx_notify_sched_tick(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d8b52c52e2b..a95aae3bc69a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -690,11 +690,17 @@ struct cfs_rq { }; #ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_CAN_STOP_TICK = 1 << 0, +}; + struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head watchdog_list; u64 ops_qseq; u32 nr_running; + u32 flags; #ifdef CONFIG_SMP cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index f53ed4baf92d..ce994e9ecc92 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -14,7 +14,26 @@ * utilize and verify various scx mechanisms such as LOCAL_ON dispatching and * consume_final(). * - * b. Preemption + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. * * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the * next tasks. 
@@ -38,8 +57,18 @@ const volatile bool switch_all; const volatile s32 central_cpu; const volatile u32 nr_cpu_ids; +/* + * XXX - kernel should be able to shut down the associated timers. For now, + * implement it manually. They should be bool but the verifier gets confused + * about the value range of bool variables when verifying the return value of + * the loopfns. Also, they can't be static because verification fails with BTF + * error message for some reason. + */ +int timer_running; +int timer_kill; + u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -u64 nr_dispatches, nr_mismatches, nr_overflows; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_overflows; struct user_exit_info uei; @@ -51,6 +80,7 @@ struct { /* can't use percpu map due to bad lookups */ static bool cpu_gimme_task[MAX_CPUS]; +static u64 cpu_started_at[MAX_CPUS]; struct central_timer { struct bpf_timer timer; @@ -86,9 +116,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_total, 1); + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. + */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + if (bpf_map_push_elem(¢ral_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -122,12 +165,12 @@ static int dispatch_a_task_loopfn(u32 idx, void *data) */ if (!scx_bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); return 0; } /* dispatch to the local and mark that @cpu doesn't need more tasks */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, 0); @@ -196,16 +239,119 @@ void BPF_STRUCT_OPS(central_consume_final, s32 cpu) scx_bpf_consume(FALLBACK_DSQ_ID); } +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = 0; +} + +static int kick_cpus_loopfn(u32 idx, void *data) +{ + s32 cpu = (nr_timers + idx) % nr_cpu_ids; + u64 *nr_to_kick = data; + u64 now = bpf_ktime_get_ns(); + u64 *started_at; + s32 pid; + + if (cpu == central_cpu) + goto kick; + + /* kick iff there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (*nr_to_kick) + (*nr_to_kick)--; + else + return 0; + + /* and the current one exhausted its slice */ + started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at && *started_at && + vtime_before(now, *started_at + SCX_SLICE_DFL)) + return 0; +kick: + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + return 0; +} + +static 
int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 nr_to_kick = nr_queued; + + if (timer_kill) { + timer_running = 0; + return 0; + } + + bpf_loop(nr_cpu_ids, kick_cpus_loopfn, &nr_to_kick, 0); + bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + int BPF_STRUCT_OPS(central_init) { + u32 key = 0; + struct bpf_timer *timer; + int ret; + if (switch_all) scx_bpf_switch_all(); - return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(¢ral_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + timer_running = !ret; + return ret; +} + +static int exit_wait_timer_nested_loopfn(u32 idx, void *data) +{ + u64 expiration = *(u64 *)data; + + return !timer_running || vtime_before(expiration, bpf_ktime_get_ns()); +} + +static int exit_wait_timer_loopfn(u32 idx, void *data) +{ + u64 expiration = *(u64 *)data; + + bpf_loop(1 << 23, exit_wait_timer_nested_loopfn, data, 0); + return !timer_running || vtime_before(expiration, bpf_ktime_get_ns()); } void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) { + u64 expiration = bpf_ktime_get_ns() + 1000 * MS_TO_NS; + + /* + * XXX - We just need to make sure that the timer body isn't running on + * exit. If we catch the timer while waiting, great. If not, it's still + * highly likely that the timer body won't run in the future. Once bpf + * can shut down associated timers, this hackery should go away. + */ + timer_kill = 1; + bpf_loop(1 << 23, exit_wait_timer_loopfn, &expiration, 0); + uei_record(&uei, ei); } @@ -223,6 +369,8 @@ struct sched_ext_ops central_ops = { .dispatch = (void *)central_dispatch, .consume = (void *)central_consume, .consume_final = (void *)central_consume_final, + .running = (void *)central_running, + .stopping = (void *)central_stopping, .init = (void *)central_init, .exit = (void *)central_exit, .name = "central", diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_example_central.c index c85e84459c58..83cbd1932958 100644 --- a/tools/sched_ext/scx_example_central.c +++ b/tools/sched_ext/scx_example_central.c @@ -76,7 +76,8 @@ int main(int argc, char **argv) skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf(" dispatch:%10lu mismatch:%10lu overflow:%10lu\n", + printf("timer:%10lu dispatch:%10lu mismatch:%10lu overflow:%10lu\n", + skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_overflows); From patchwork Wed Nov 30 08:23:06 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059562 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CC704C4708B for ; Wed, 30 Nov 2022 08:27:10 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234752AbiK3I1J (ORCPT ); Wed, 30 Nov 2022 03:27:09 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55168 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235137AbiK3I0D (ORCPT ); Wed, 30 Nov 2022 03:26:03 
-0500 Received: from mail-pg1-x52d.google.com (mail-pg1-x52d.google.com [IPv6:2607:f8b0:4864:20::52d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A942C5C748; Wed, 30 Nov 2022 00:24:29 -0800 (PST) Received: by mail-pg1-x52d.google.com with SMTP id h193so15372074pgc.10; Wed, 30 Nov 2022 00:24:29 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=xuV9L30JoCh7E6KY+umYFNLMi3zgwK6bilplbHB6nTg=; b=nNeZ3+CuWBpHcc8GovH9fZ6ZO7yZF7T+Pxc92A4+yD99l6tipQFB+cQ0MT2/GSTi8I oP3XTHdSpjhzUVZ0TMJ+B4FFMUORjuw1i71sjqHeDFnGwulIESX27EHqys9hiwsmegXI QG0Yf/dj4ZgoRQgsY6Qxuf3oohwtz5J8PfhLsdxQ+qCN2Do1jl/pnjxwq4WOL7GJXL9b 4xb53909SmaDK8KL6r8l8Qp0wIpRKHMl+1KFw36AXwzi/H+q6+ITGwjo6bFayGqdYAwG Azx8WCKVmVcSMRGlVFxo39BlejLFfu/IgbizKdKgzXlzHncTeOkRXY54cjV2GBeal5eE FUfA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=xuV9L30JoCh7E6KY+umYFNLMi3zgwK6bilplbHB6nTg=; b=LonLCpnngo9XdJq5vW+8t+wWC7rHqQWz9cOselVm/DGq133tVTW5nyTMjs38nh/f+H tcdZQmdEt+2Y9BDDHyWMANailtxN5A2OxeypnvgfaSPWuDPL82CahrQ/Y0CCgKdA24Yh b4Sxveiqmp648uSjHpyULtwuDYqLFDsn7DgtIs21B6ukY+HpTnAxL1wK+DG+qMBO38wM THRgHBo/K6EoprMdCBppFmQPVjxd8VpuNsRzfNLtzeMkjbw0sF9oI8pWb600My7ZmAD/ NKMwEZ2gvzSRZUmnE7w3lSqlPm/O9cUpTRZELF7tKGr61K8qOA67RUxUUMO1Wp6CdQad F2Pg== X-Gm-Message-State: ANoB5plmG5aceKuQs+zUOAz0/jBLM1lcOOzK1AB4G2tJRoGopD3JTCNr uSN3e/n4ct32i7D5KnF4pGw= X-Google-Smtp-Source: AA0mqf4be4E5yI+mi8uXByXVy6y277w04ISWZrgQ7mHvSwpumST0dLMCwVj1gjJntmzcgFtAwkXdiQ== X-Received: by 2002:a63:d34e:0:b0:477:650a:705c with SMTP id u14-20020a63d34e000000b00477650a705cmr34616401pgi.588.1669796668533; Wed, 30 Nov 2022 00:24:28 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id b7-20020a170902650700b0018869119e37sm756024plk.142.2022.11.30.00.24.27 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:28 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 24/31] sched_ext: Add cgroup support Date: Tue, 29 Nov 2022 22:23:06 -1000 Message-Id: <20221130082313.3241517-25-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. Because different BPF schedulers may implement different subsets of CPU control features, allow BPF schedulers to pick which cgroup interface files to enable using SCX_OPS_CGROUP_KNOB_* flags. 
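As a rough illustration of how a scheduler opts into the weight knob (hypothetical, not part of this series; the mysched_* names are invented), the ops declaration would carry the flag and the cgroup callbacks, which are sketched after the interface documentation further below:

SEC(".struct_ops")
struct sched_ext_ops mysched_ops = {
	/* ... the usual scheduling callbacks (enqueue, dispatch, ...) ... */
	.cgroup_init		= (void *)mysched_cgroup_init,
	.cgroup_exit		= (void *)mysched_cgroup_exit,
	.cgroup_set_weight	= (void *)mysched_cgroup_set_weight,
	/* expose cpu.weight / cpu.weight.nice for this scheduler */
	.flags			= SCX_OPS_CGROUP_KNOB_WEIGHT,
	.name			= "mysched",
};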
For now, only the weight knobs are supported but adding more should be straightforward. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. This patch also adds scx_example_pair which implements a variant of core scheduling where a hyperthread pair only run tasks from the same cgroup. The BPF scheduler achieves this by putting tasks into per-cgroup queues, time-slicing the cgroup to run for each pair first, and then scheduling within the cgroup. See the header comment in scx_example_pair.bpf.c for more details. Note that scx_example_pair's cgroup-boundary guarantee breaks down for tasks running in higher priority scheduler classes. This will be addressed by a followup patch which implements a mechanism to track CPU preemption. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 96 ++++- init/Kconfig | 5 + kernel/sched/core.c | 69 ++- kernel/sched/ext.c | 326 ++++++++++++++- kernel/sched/ext.h | 23 + kernel/sched/sched.h | 12 +- tools/sched_ext/.gitignore | 1 + tools/sched_ext/Makefile | 9 +- tools/sched_ext/scx_example_pair.bpf.c | 554 +++++++++++++++++++++++++ tools/sched_ext/scx_example_pair.c | 143 +++++++ tools/sched_ext/scx_example_pair.h | 10 + 11 files changed, 1226 insertions(+), 22 deletions(-) create mode 100644 tools/sched_ext/scx_example_pair.bpf.c create mode 100644 tools/sched_ext/scx_example_pair.c create mode 100644 tools/sched_ext/scx_example_pair.h diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index b1c95fb11c8d..dc51304b6599 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -12,6 +12,8 @@ #include #include +struct cgroup; + enum scx_consts { SCX_OPS_NAME_LEN = 128, SCX_EXIT_REASON_LEN = 128, @@ -109,14 +111,27 @@ enum scx_ops_flags { */ SCX_OPS_ENQ_EXITING = 1LLU << 2, + /* + * CPU cgroup knob enable flags + */ + SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING, + SCX_OPS_ENQ_EXITING | + SCX_OPS_CGROUP_KNOB_WEIGHT, }; /* argument container for ops.enable() and friends */ struct scx_enable_args { - /* empty for now */ + /* the cgroup the task is joining */ + struct cgroup *cgroup; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; }; /** @@ -341,7 +356,8 @@ struct sched_ext_ops { * @p: task to enable BPF scheduling for * @args: enable arguments, see the struct definition * - * Enable @p for BPF scheduling. @p will start running soon. + * Enable @p for BPF scheduling. @p is now in the cgroup specified for + * the preceding prep_enable() and will start running soon. */ void (*enable)(struct task_struct *p, struct scx_enable_args *args); @@ -365,6 +381,77 @@ struct sched_ext_ops { */ void (*disable)(struct task_struct *p); + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. 
During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @tg's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + /* * All online ops must come before ops.init(). */ @@ -481,6 +568,9 @@ struct sched_ext_entity { /* cold fields */ struct list_head tasks_node; +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif }; void sched_ext_free(struct task_struct *p); diff --git a/init/Kconfig b/init/Kconfig index abf65098f1b6..826624ec8925 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1037,6 +1037,11 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. 
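Continuing the hypothetical mysched sketch from the commit message above, the cgroup callbacks can be as simple as maintaining a cgroup-ID-to-weight hash map (map name and sizing are invented for illustration):

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384);		/* arbitrary cap on tracked cgroups */
	__type(key, u64);			/* cgroup ID */
	__type(value, u32);			/* cpu.weight [1..10000] */
} mysched_cgrp_weight SEC(".maps");

s32 BPF_STRUCT_OPS(mysched_cgroup_init, struct cgroup *cgrp,
		   struct scx_cgroup_init_args *args)
{
	u64 cgid = cgrp->kn->id;
	u32 weight = args->weight;

	/* sleepable context - record the initial weight */
	return bpf_map_update_elem(&mysched_cgrp_weight, &cgid, &weight,
				   BPF_ANY);
}

void BPF_STRUCT_OPS(mysched_cgroup_exit, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;

	bpf_map_delete_elem(&mysched_cgrp_weight, &cgid);
}

void BPF_STRUCT_OPS(mysched_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	u64 cgid = cgrp->kn->id;

	bpf_map_update_elem(&mysched_cgrp_weight, &cgid, &weight, BPF_ANY);
}

Hot paths can then look the weight up by the task's cgroup ID (p->sched_task_group->css.cgroup->kn->id, as scx_example_pair does) when making enqueue or dispatch decisions.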
+config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + default y + endif #CGROUP_SCHED config UCLAMP_TASK_GROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89d2421809da..79560641a61f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9720,6 +9720,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10175,6 +10178,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10286,6 +10290,8 @@ void sched_move_task(struct task_struct *tsk, enum sched_move_task_reason reason put_prev_task(rq, tsk); sched_change_group(tsk); + if (reason == SCHED_MOVE_TASK_CGROUP) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10325,6 +10331,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10341,6 +10352,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10358,9 +10376,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -10368,7 +10387,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +#endif + return scx_cgroup_can_attach(tset); } #endif @@ -10379,7 +10399,16 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, SCHED_MOVE_TASK_CGROUP); + + scx_cgroup_finish_attach(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } +#endif #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css) @@ -10562,9 +10591,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -11028,11 +11063,15 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } 
-#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static unsigned long tg_weight(struct task_group *tg) { +#ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->cgrp_weight); +#endif } static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, @@ -11045,13 +11084,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 cgrp_weight) { unsigned long weight; + int ret; if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, @@ -11076,7 +11119,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11085,7 +11128,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif @@ -11147,7 +11194,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, @@ -11201,13 +11248,17 @@ struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, +#ifdef CONFIG_EXT_GROUP_SCHED + .cancel_attach = cpu_cgroup_cancel_attach, +#endif .legacy_cftypes = cpu_legacy_cftypes, .dfl_cftypes = cpu_cftypes, .early_init = true, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cf6493f684f3..bd03b55fbcf5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1561,6 +1561,19 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) resched_curr(rq); } +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. 
+ */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) { int ret; @@ -1570,7 +1583,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) p->scx.disallow = false; if (SCX_HAS_OP(prep_enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = { .cgroup = tg_cgrp(tg) }; ret = scx_ops.prep_enable(p, &args); if (unlikely(ret)) { @@ -1610,7 +1623,8 @@ static void scx_ops_enable_task(struct task_struct *p) WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); if (SCX_HAS_OP(enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = + { .cgroup = tg_cgrp(p->sched_task_group) }; scx_ops.enable(p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; @@ -1623,7 +1637,8 @@ static void scx_ops_disable_task(struct task_struct *p) if (p->scx.flags & SCX_TASK_OPS_PREPPED) { if (SCX_HAS_OP(cancel_enable)) { - struct scx_enable_args args = { }; + struct scx_enable_args args = + { .cgroup = tg_cgrp(task_group(p)) }; scx_ops.cancel_enable(p, &args); } p->scx.flags &= ~SCX_TASK_OPS_PREPPED; @@ -1777,6 +1792,156 @@ bool scx_can_stop_tick(struct rq *rq) } #endif +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = scx_ops.cgroup_init(tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + scx_ops.cgroup_exit(tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(p->sched_task_group); + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = scx_ops.cgroup_prep_move(p, from, css->cgroup); + if (ret) + goto err; + } + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (!p->scx.cgrp_moving_from) + break; + if (SCX_HAS_OP(cgroup_cancel_move)) + scx_ops.cgroup_cancel_move(p, p->scx.cgrp_moving_from, + css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_cgroup_move_task(struct task_struct *p) +{ + if (!scx_enabled()) + return; + + if (SCX_HAS_OP(cgroup_move)) { + WARN_ON_ONCE(!p->scx.cgrp_moving_from); + scx_ops.cgroup_move(p, p->scx.cgrp_moving_from, + tg_cgrp(p->sched_task_group)); + } + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ 
+ struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move)) { + WARN_ON_ONCE(!p->scx.cgrp_moving_from); + scx_ops.cgroup_cancel_move(p, p->scx.cgrp_moving_from, + css->cgroup); + } + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + scx_ops.cgroup_set_weight(tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + /* * Omitted operations: * @@ -1916,6 +2081,130 @@ static void destroy_dsq(u64 dsq_id) rcu_read_unlock(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + scx_ops.cgroup_exit(css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. + */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = scx_ops.cgroup_init(css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +static void scx_cgroup_config_knobs(void) +{ + static DEFINE_MUTEX(cgintf_mutex); + DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; + u64 knob_flags; + int i; + + /* + * Called from both class switch and ops enable/disable paths, + * synchronize internally. + */ + mutex_lock(&cgintf_mutex); + + /* if fair is in use, all knobs should be shown */ + if (!scx_switched_all()) { + bitmap_fill(mask, CPU_CFTYPE_CNT); + goto apply; + } + + /* + * On ext, only show the supported knobs. Otherwise, show all possible + * knobs so that configuration attempts succeed and the states are + * remembered while ops is not loaded. 
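+ * (e.g. a cpu.weight write done while no BPF scheduler is loaded should
+ * not fail; the value is kept in the task_group and handed to the next
+ * scheduler through its cgroup_init() args.)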
+ */ + if (scx_enabled()) + knob_flags = scx_ops.flags; + else + knob_flags = SCX_OPS_ALL_FLAGS; + + if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { + __set_bit(CPU_CFTYPE_WEIGHT, mask); + __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); + } +apply: + for (i = 0; i < CPU_CFTYPE_CNT; i++) + cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); + + mutex_unlock(&cgintf_mutex); +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_config_knobs(void) {} +#endif + /* * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. @@ -2071,9 +2360,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); - /* avoid racing against fork */ + /* avoid racing against fork and cgroup changes */ cpus_read_lock(); percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); @@ -2108,6 +2398,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); + scx_cgroup_exit(); + + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); cpus_read_unlock(); @@ -2137,6 +2430,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != SCX_OPS_DISABLING); + + scx_cgroup_config_knobs(); } static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); @@ -2293,10 +2588,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops) task_runnable_timeout_ms / 2); /* - * Lock out forks before opening the floodgate so that they don't wander - * into the operations prematurely. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); for (i = 0; i < SCX_NR_ONLINE_OPS; i++) if (((void (**)(void))ops)[i]) @@ -2315,6 +2611,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops) static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); } + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. 
+ */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock; + static_branch_enable_cpuslocked(&__scx_ops_enabled); /* @@ -2397,6 +2701,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) spin_unlock_irq(&scx_tasks_lock); preempt_enable(); + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { @@ -2410,6 +2715,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) cpus_read_unlock(); mutex_unlock(&scx_ops_enable_mutex); + scx_cgroup_config_knobs(); + return 0; err_unlock: @@ -2417,6 +2724,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops) return ret; err_disable_unlock: + scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: cpus_read_unlock(); @@ -2578,6 +2886,9 @@ static int bpf_scx_check_member(const struct btf_type *t, switch (moff) { case offsetof(struct sched_ext_ops, prep_enable): + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): /* @@ -2704,6 +3015,7 @@ void __init init_sched_ext_class(void) register_sysrq_key('S', &sysrq_sched_ext_reset_op); INIT_DELAYED_WORK(&check_timeout_work, scx_check_timeout_workfn); + scx_cgroup_config_knobs(); } @@ -2745,7 +3057,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_init = { * @node: NUMA node to allocate from * * Create a dsq identified by @dsq_id. Can be called from sleepable operations - * including ops.init() and .prep_enable(). + * including ops.init(), .prep_enable() and .cgroup_prep_move(). */ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index e9ec267f13d5..470b2224cdfa 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -59,6 +59,11 @@ enum scx_deq_flags { SCX_DEQ_SLEEP = DEQUEUE_SLEEP, }; +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ }; @@ -185,3 +190,21 @@ static inline void scx_update_idle(struct rq *rq, bool idle) #else static inline void scx_update_idle(struct rq *rq, bool idle) {} #endif + +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_cgroup_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_cgroup_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a95aae3bc69a..a2ffa94ede02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -406,6 +406,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* 
*/ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -535,6 +540,11 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -3260,7 +3270,7 @@ static inline void update_current_exec_runtime(struct task_struct *curr, #ifdef CONFIG_CGROUP_SCHED enum cpu_cftype_id { -#ifdef CONFIG_FAIR_GROUP_SCHED +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) CPU_CFTYPE_WEIGHT, CPU_CFTYPE_WEIGHT_NICE, CPU_CFTYPE_IDLE, diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index 389f0e5b0970..ebc34dcf925b 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -1,6 +1,7 @@ scx_example_dummy scx_example_qmap scx_example_central +scx_example_pair *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index d406b7586e08..45ab39139afc 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -114,7 +114,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_dummy scx_example_qmap scx_example_central +all: scx_example_dummy scx_example_qmap scx_example_central scx_example_pair # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -177,10 +177,15 @@ scx_example_central: scx_example_central.c scx_example_central.skel.h user_exit_ $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_pair: scx_example_pair.c scx_example_pair.skel.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h - rm -f scx_example_dummy scx_example_qmap scx_example_central + rm -f scx_example_dummy scx_example_qmap scx_example_central \ + scx_example_pair .PHONY: all clean diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c new file mode 100644 index 000000000000..7694d2169383 --- /dev/null +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -0,0 +1,554 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. 
*Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. + * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move onto + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because this it hits several BPF shortcomings and has to + * work around in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include "scx_common.bpf.h" +#include "scx_example_pair.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_all; + +const volatile u32 nr_cpu_ids; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; + +/* cpu ID -> pair cpu ID */ +const volatile s32 pair_cpu[MAX_CPUS] = { [0 ... MAX_CPUS - 1] = -1 }; + +/* cpu ID -> pair_id */ +const volatile u32 pair_id[MAX_CPUS]; + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 in_pair_idx[MAX_CPUS]; + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_CPUS / 2); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. + */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. 
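+ * (In short: cgrp_q_idx_busy[] holds the busy flags for allocation,
+ * cgrp_q_idx_hash maps cgroup ID -> allocated index, and cgrp_q_arr[]
+ * holds the actual per-cgroup FIFO queues at that index.)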
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +struct user_exit_info uei; + +static bool time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + u64 cgid = p->sched_task_group->css.cgroup->kn->id; + u32 *q_idx; + struct cgrp_q *cgq; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +/* find the next cgroup to execute and return it in *data */ +static int next_cgid_loopfn(u32 idx, void *data) +{ + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + if (bpf_map_pop_elem(&top_q, &cgid)) + return 1; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) + return 0; + + /* this is the only place where empty cgroups are taken off the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len || !*cgq_len) + return 0; + + /* if it has any tasks, requeue as we may race and not execute it */ + bpf_map_push_elem(&top_q, &cgid, 0); + *(u64 *)data = cgid; + return 1; +} + +struct claim_task_loopctx { + u32 q_idx; + bool claimed; +}; + +/* claim one task from the specified cgq */ +static int claim_task_loopfn(u32 idx, void *data) +{ + struct claim_task_loopctx *claimc = data; + u64 *cgq_len; + u64 len; + + cgq_len = MEMBER_VPTR(cgrp_q_len, [claimc->q_idx]); + if (!cgq_len) + return 1; + + len = *cgq_len; + if (!len) + return 1; + + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) + return 0; + + claimc->claimed = true; + return 1; +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr, in_pair_mask; + int err; + + vptr = (u32 *)MEMBER_VPTR(pair_id, [cpu]); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)MEMBER_VPTR(in_pair_idx, [cpu]); + if (!vptr) + return -EINVAL; + + *mask = 1U << *vptr; + + return 0; +} + +static int dispatch_loopfn(u32 idx, void *data) +{ + s32 cpu = *(s32 *)data; + struct pair_ctx *pairc; + struct bpf_map *cgq_map; + struct claim_task_loopctx claimc; + struct task_struct *p; + u64 now = bpf_ktime_get_ns(); + bool kick_pair = false; + bool expired; + u32 *vptr, in_pair_mask; + s32 pid; + u64 cgid; + int ret; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) { + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", + cpu); + return 1; + } + 
+ bpf_spin_lock(&pairc->lock); + pairc->active_mask &= ~in_pair_mask; + + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); + if (expired || pairc->draining) { + u64 new_cgid = 0; + + __sync_fetch_and_add(&nr_exps, 1); + + /* + * We're done with the current cgid. An obvious optimization + * would be not draining if the next cgroup is the current one. + * For now, be dumb and always expire. + */ + pairc->draining = true; + + if (pairc->active_mask) { + /* + * The other CPU is still active We want to wait until + * this cgroup expires. + * + * If the pair controls its CPU, and the time already + * expired, kick. When the other CPU arrives at + * dispatch and clears its active mask, it'll push the + * pair to the next cgroup and kick this CPU. + */ + __sync_fetch_and_add(&nr_exp_waits, 1); + bpf_spin_unlock(&pairc->lock); + if (expired) + kick_pair = true; + goto out_maybe_kick; + } + + bpf_spin_unlock(&pairc->lock); + + /* + * Pick the next cgroup. It'd be easier / cleaner to not drop + * pairc->lock and use stronger synchronization here especially + * given that we'll be switching cgroups significantly less + * frequently than tasks. Unfortunately, bpf_spin_lock can't + * really protect anything non-trivial. Let's do opportunistic + * operations instead. + */ + bpf_loop(1 << 23, next_cgid_loopfn, &new_cgid, 0); + /* no active cgroup, go idle */ + if (!new_cgid) { + __sync_fetch_and_add(&nr_exp_empty, 1); + return 1; + } + + bpf_spin_lock(&pairc->lock); + + /* + * The other CPU may already have started on a new cgroup while + * we dropped the lock. Make sure that we're still draining and + * start on the new cgroup. + */ + if (pairc->draining && !pairc->active_mask) { + __sync_fetch_and_add(&nr_cgrp_next, 1); + pairc->cgid = new_cgid; + pairc->started_at = now; + pairc->draining = false; + kick_pair = true; + } else { + __sync_fetch_and_add(&nr_cgrp_coll, 1); + } + } + + cgid = pairc->cgid; + pairc->active_mask |= in_pair_mask; + bpf_spin_unlock(&pairc->lock); + + /* again, it'd be better to do all these with the lock held, oh well */ + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!vptr) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return 1; + } + + claimc = (struct claim_task_loopctx){ .q_idx = *vptr }; + bpf_loop(1 << 23, claim_task_loopfn, &claimc, 0); + if (!claimc.claimed) { + /* the cgroup must be empty, expire and repeat */ + __sync_fetch_and_add(&nr_cgrp_empty, 1); + bpf_spin_lock(&pairc->lock); + pairc->draining = true; + pairc->active_mask &= ~in_pair_mask; + bpf_spin_unlock(&pairc->lock); + return 0; + } + + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &claimc.q_idx); + if (!cgq_map) { + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", + cgid, claimc.q_idx); + return 1; + } + + if (bpf_map_pop_elem(cgq_map, &pid)) { + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", + cgid, claimc.q_idx); + return 1; + } + + p = scx_bpf_find_task_by_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + } else { + /* we don't handle dequeues, retry on lost tasks */ + __sync_fetch_and_add(&nr_missing, 1); + return 0; + } + +out_maybe_kick: + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } + return 1; +} + +void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) +{ + s32 cpu_on_stack = cpu; + + bpf_loop(1 << 23, 
dispatch_loopfn, &cpu_on_stack, 0); +} + +static int alloc_cgrp_q_idx_loopfn(u32 idx, void *data) +{ + u32 q_idx; + + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) { + *(s32 *)data = q_idx; + return 1; + } + return 0; +} + +s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 q_idx = -1; + + bpf_loop(MAX_CGRPS, alloc_cgrp_q_idx_loopfn, &q_idx, 0); + if (q_idx < 0) + return -EBUSY; + + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); + if (busy) + *busy = 0; + return -EBUSY; + } + + return 0; +} + +void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 *q_idx; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (q_idx) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); + if (busy) + *busy = 0; + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); + } +} + +s32 BPF_STRUCT_OPS(pair_init) +{ + if (switch_all) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops pair_ops = { + .enqueue = (void *)pair_enqueue, + .dispatch = (void *)pair_dispatch, + .cgroup_init = (void *)pair_cgroup_init, + .cgroup_exit = (void *)pair_cgroup_exit, + .init = (void *)pair_init, + .exit = (void *)pair_exit, + .name = "pair", +}; diff --git a/tools/sched_ext/scx_example_pair.c b/tools/sched_ext/scx_example_pair.c new file mode 100644 index 000000000000..255ea7b1235d --- /dev/null +++ b/tools/sched_ext/scx_example_pair.c @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "user_exit_info.h" +#include "scx_example_pair.h" +#include "scx_example_pair.skel.h" + +const char help_fmt[] = +"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" +"execute from the same CPU cgroup.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-a] [-S STRIDE]\n" +"\n" +" -a Switch all tasks\n" +" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_example_pair *skel; + struct bpf_link *link; + u64 seq = 0; + s32 stride, i, opt, outer_fd; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_example_pair__open(); + assert(skel); + + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + /* pair up the earlier half to the latter by default, override with -s */ + stride = skel->rodata->nr_cpu_ids / 2; + + while ((opt = getopt(argc, argv, "ahS:")) != -1) { + switch (opt) { + case 'a': + skel->rodata->switch_all = true; + break; + case 'S': + stride = strtoul(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { + if (skel->rodata->pair_cpu[i] < 0) { + skel->rodata->pair_cpu[i] = i + stride; + skel->rodata->pair_cpu[i + stride] = i; + skel->rodata->pair_id[i] = i; + skel->rodata->pair_id[i + stride] = i; + skel->rodata->in_pair_idx[i] = 0; + skel->rodata->in_pair_idx[i + stride] = 1; + } + } + + assert(!scx_example_pair__load(skel)); + + /* + * Populate the cgrp_q_arr map which is an array containing per-cgroup + * queues. It'd probably be better to do this from BPF but there are too + * many to initialize statically and there's no way to dynamically + * populate from BPF. + */ + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); + assert(outer_fd >= 0); + + printf("Initializing"); + for (i = 0; i < MAX_CGRPS; i++) { + s32 inner_fd; + + if (exit_req) + break; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, + sizeof(u32), MAX_QUEUED, NULL); + assert(inner_fd >= 0); + assert(!bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY)); + close(inner_fd); + + if (!(i % 10)) + printf("."); + fflush(stdout); + } + printf("\n"); + + /* + * Fully initialized, attach and run. 
+ */ + link = bpf_map__attach_struct_ops(skel->maps.pair_ops); + assert(link); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + printf("[SEQ %lu]\n", seq++); + printf(" total:%10lu dispatch:%10lu missing:%10lu\n", + skel->bss->nr_total, + skel->bss->nr_dispatched, + skel->bss->nr_missing); + printf(" kicks:%10lu preemptions:%7lu\n", + skel->bss->nr_kicks, + skel->bss->nr_preemptions); + printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", + skel->bss->nr_exps, + skel->bss->nr_exp_waits, + skel->bss->nr_exp_empty); + printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", + skel->bss->nr_cgrp_next, + skel->bss->nr_cgrp_coll, + skel->bss->nr_cgrp_empty); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_example_pair__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_pair.h b/tools/sched_ext/scx_example_pair.h new file mode 100644 index 000000000000..f60b824272f7 --- /dev/null +++ b/tools/sched_ext/scx_example_pair.h @@ -0,0 +1,10 @@ +#ifndef __SCX_EXAMPLE_PAIR_H +#define __SCX_EXAMPLE_PAIR_H + +enum { + MAX_CPUS = 4096, + MAX_QUEUED = 4096, + MAX_CGRPS = 4096, +}; + +#endif /* __SCX_EXAMPLE_PAIR_H */ From patchwork Wed Nov 30 08:23:07 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059559 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 95269C352A1 for ; Wed, 30 Nov 2022 08:27:09 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234858AbiK3I1I (ORCPT ); Wed, 30 Nov 2022 03:27:08 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55966 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235226AbiK3IZk (ORCPT ); Wed, 30 Nov 2022 03:25:40 -0500 Received: from mail-pj1-x1030.google.com (mail-pj1-x1030.google.com [IPv6:2607:f8b0:4864:20::1030]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A51E67047A; Wed, 30 Nov 2022 00:24:31 -0800 (PST) Received: by mail-pj1-x1030.google.com with SMTP id o12so7208237pjo.4; Wed, 30 Nov 2022 00:24:31 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=icbEL6Yyh88UwKlWIMibQXf09tYG4jVGd0Sd4NYusRk=; b=cwevkODSfgy/ecZgn1k9Sl3axSX97jP3z2nxUdls+f3M+9XG9jOSmyTeNQioc3Oih/ xnhqxcRW67kGSuIu4oa/20EDH5zCBGY/CoRZLTDXxRZvatLVoqEY99RmTc0CcwmW1Z+o H3ucNRMQanqy8GVJv9CU7u9JohFhNWWYT1jD5UT9gsIfNScLNUw3IZB7X2CoETDsmuHY eL8ju2tQjY4NQmmlfR4/smloVFDKwtC1WswsrbyTN/oBtrJXyhHZyxH819kHtmbIBQGm frocUBnuSV/ZMRlPAJsnA6irxxhKS9FGbpOcRiZreHnMoEqMcWslMZI6kR3TqrSnFOgv /7UA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=icbEL6Yyh88UwKlWIMibQXf09tYG4jVGd0Sd4NYusRk=; b=lu0nj3hvLfj23dqw0xD46o9Lg2QvJXzrr/GNYjHcdZhDrYUI2HRhkqJgKVZzmSuPnP 0IyTXDtwgSb81hDOI8DG7LVc2YjFeZ4ir8fkF7G4m8MxHD4MjKa/VaZuzbOT4Hvwlww/ 6BRe3LaH8lwkjdiyzm0MBCah+jkTW8BT7h91aq3frco5fYo8xIBh1sA0FvNU/BrgcOlx 
zC7+FkB4W/t6587MsQaJZNCoLCXnICTzPbgdGzQuvoDQC6EwcGSLTWLh+Nu11rNO0UWN +1IEWRWNzUF+2r65DrHxrNjtYEZ1hoea0M+wHOGnzpj5Q5IyYQjW/2b2cljvsqGsfKri Ga7A== X-Gm-Message-State: ANoB5pn7VID3NFhMPA2fpyIf/Bc3rs2kOSgu+vGnxYi4npm7gwaRo4y3 QDuDGwos6SPHMbwwwYBoa7A= X-Google-Smtp-Source: AA0mqf4IlcoridLQ++OjRdvy1H/e5LzA5fityT9BR15abVAoR90Rh0qY8hdmHjRCpcdgu7gGeuvKMg== X-Received: by 2002:a17:902:bf01:b0:186:8bca:1d50 with SMTP id bi1-20020a170902bf0100b001868bca1d50mr42720977plb.158.1669796670788; Wed, 30 Nov 2022 00:24:30 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id u6-20020a170902e80600b0017849a2b56asm777944plg.46.2022.11.30.00.24.29 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:30 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 25/31] sched_ext: Implement SCX_KICK_WAIT Date: Tue, 29 Nov 2022 22:23:07 -1000 Message-Id: <20221130082313.3241517-26-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org From: David Vernet If set when calling scx_bpf_kick_cpu(), the invoking CPU will busy wait for the kicked cpu to enter the scheduler. This will be used to improve the exclusion guarantees in scx_example_pair. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- kernel/sched/core.c | 4 +++- kernel/sched/ext.c | 36 ++++++++++++++++++++++++++++++++++-- kernel/sched/ext.h | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79560641a61f..ea4f6edfcf32 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5886,8 +5886,10 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_notify_pick_next_task(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bd03b55fbcf5..aeaad3d8b05a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -109,8 +109,9 @@ unsigned long last_timeout_check = INITIAL_JIFFIES; static struct delayed_work check_timeout_work; -/* idle tracking */ #ifdef CONFIG_SMP + +/* idle tracking */ #ifdef CONFIG_CPUMASK_OFFSTACK #define CL_ALIGNED_IF_ONSTACK #else @@ -123,7 +124,11 @@ static struct { } idle_masks CL_ALIGNED_IF_ONSTACK; static bool __cacheline_aligned_in_smp has_idle_cpus; -#endif + +/* for %SCX_KICK_WAIT */ +static u64 __percpu *kick_cpus_pnt_seqs; + +#endif /* CONFIG_SMP */ /* * Direct dispatch marker. 
@@ -2959,6 +2964,7 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); + u64 *pseqs = this_cpu_ptr(kick_cpus_pnt_seqs); int this_cpu = cpu_of(this_rq); int cpu; @@ -2972,14 +2978,32 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && rq->curr->sched_class == &ext_sched_class) rq->curr->scx.slice = 0; + pseqs[cpu] = rq->scx.pnt_seq; resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait); } raw_spin_rq_unlock_irqrestore(rq, flags); } + for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait, + cpumask_of(this_cpu)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * scx_notify_pick_next_task() on the resched path. + * + * We busy-wait here to guarantee that no other task can be + * scheduled on our core before the target CPU has entered the + * resched path. + */ + while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + cpumask_clear(this_rq->scx.cpus_to_kick); cpumask_clear(this_rq->scx.cpus_to_preempt); + cpumask_clear(this_rq->scx.cpus_to_wait); } #endif @@ -2999,6 +3023,11 @@ void __init init_sched_ext_class(void) #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); + + kick_cpus_pnt_seqs = __alloc_percpu(sizeof(kick_cpus_pnt_seqs[0]) * + num_possible_cpus(), + __alignof__(kick_cpus_pnt_seqs[0])); + BUG_ON(!kick_cpus_pnt_seqs); #endif for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); @@ -3009,6 +3038,7 @@ void __init init_sched_ext_class(void) #ifdef CONFIG_SMP BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); #endif } @@ -3228,6 +3258,8 @@ void scx_bpf_kick_cpu(s32 cpu, u64 flags) cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); if (flags & SCX_KICK_PREEMPT) cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_wait); irq_work_queue(&rq->scx.kick_cpus_irq_work); preempt_enable(); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 470b2224cdfa..8ae717c5e850 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -66,6 +66,7 @@ enum scx_tg_flags { enum scx_kick_flags { SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ + SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ }; #ifdef CONFIG_SCHED_CLASS_EXT @@ -107,6 +108,22 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, #define scx_ops_error(fmt, args...) \ scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) +{ +#ifdef CONFIG_SMP + if (!scx_enabled()) + return; + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. 
+ */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif +} + static inline void scx_notify_sched_tick(void) { unsigned long last_check, timeout; @@ -164,6 +181,9 @@ static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void init_sched_ext_class(void) {} +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) {} static inline void scx_notify_sched_tick(void) {} #define for_each_active_class for_each_class diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2ffa94ede02..5af758cc1e38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -714,6 +714,8 @@ struct scx_rq { #ifdef CONFIG_SMP cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + u64 pnt_seq; struct irq_work kick_cpus_irq_work; #endif }; From patchwork Wed Nov 30 08:23:08 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059566 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CABE7C433FE for ; Wed, 30 Nov 2022 08:27:19 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234963AbiK3I1R (ORCPT ); Wed, 30 Nov 2022 03:27:17 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55836 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235246AbiK3I0G (ORCPT ); Wed, 30 Nov 2022 03:26:06 -0500 Received: from mail-pj1-x1033.google.com (mail-pj1-x1033.google.com [IPv6:2607:f8b0:4864:20::1033]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D978271189; Wed, 30 Nov 2022 00:24:33 -0800 (PST) Received: by mail-pj1-x1033.google.com with SMTP id b11so15051766pjp.2; Wed, 30 Nov 2022 00:24:33 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=qtwptDEgI43zTSLHjhdcWrqPT/rV9PLFes5E76kL9kM=; b=FrAaUtHpslCr9LntxcCVmJDfObb+pKBlnNxAGKqNoWwoDgLYaSlRtHqCxTx/N44iJh p5V89eOPTEFn9qY+davXraFuWr0RXYoocIlwdCHgCwXjGHw+NDmie8UDkKb2nwsOs3fy ncaFvtqWBGEMYFGKgLqEEfypI1iW+SPhxwfjLSKdIxoiauuaW2ETubqZcyzBf3+i4xDj gAloT0sOqEhr5E5g7AHpqHunaKHapOFsBcQFkgMoPZA/aQ/0Z911Me56VYUe/fsMUSpQ l9LIErwfeLsmQmnBhHaNEuxeWh6kRyGxyRIjFZtGaY2n0EDQTSSGi7Oek4vbKT99XX9V eYXA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=qtwptDEgI43zTSLHjhdcWrqPT/rV9PLFes5E76kL9kM=; b=CivR3oDOHvyPP2Zvh5ZmbMgGyNlBg4o6Pz1hGkvp3f/VKpYs3pWwEarqOzKjpJv6s/ xxxu8Kj6lJFc4lpurq0fmKPLkOjitgF+b9gmGKdZe2LQBjmVbVOaUMcz4kadNXGqJ9w6 cmxcGpNm+nUB/KgPX9I4Z09NZjxidE819PIKVfbV6GZlsJV6xSjrERJ2yGN8WszvoHAQ QEAtWv0t97EmJNeEudmiPgrMV+YQKSWPUUGBOOYnboZuy0RDZO7dW+cjj8YL2Ld3quNW M6zP1Lh0uOoF0G3kproSzx2LOZ0vtMA686uLvVgs8OFb0usG90PMDSAk6d/cQ+Q0VJTv LBnA== X-Gm-Message-State: ANoB5pmRv6YN43RQ3HUh5+xIJJ9jcTT6PkDiWqM01iBxYKoWaoedcgI9 0b3Fe/rLWiVDqC29PeMALko= 
X-Google-Smtp-Source: AA0mqf62ram+jDLTNeJC92cObAdZJC/ha02gqV8GD3aVz2U+moDhQsIvla3CEuMqkKRsCaNnaJ0Nmw== X-Received: by 2002:a17:902:ce90:b0:186:b46d:da5e with SMTP id f16-20020a170902ce9000b00186b46dda5emr41588058plg.92.1669796672859; Wed, 30 Nov 2022 00:24:32 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id h15-20020a056a00000f00b00574d38f4d37sm816850pfk.45.2022.11.30.00.24.32 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:32 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 26/31] sched_ext: Implement sched_ext_ops.cpu_acquire/release() Date: Tue, 29 Nov 2022 22:23:08 -1000 Message-Id: <20221130082313.3241517-27-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org From: David Vernet Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events are necessary for use cases includling strict core-scheduling and latency management. This patch adds two operations ops.cpu_acquire() and .cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the opposite for the latter. This patch also implements scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs. scx_example_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class. scx_example_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have a tight control over latency. 
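
To illustrate the intended usage, here is a condensed, hypothetical sketch loosely following the scx_example_qmap changes below; the ops callbacks, kfuncs and flags are from this series, but the wiring and names are illustrative rather than copied from the examples:

void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/*
	 * @cpu was just taken by a higher priority sched_class (see
	 * args->reason). Bounce whatever is sitting on its local dsq back
	 * through ->enqueue() so other CPUs can pick the tasks up.
	 */
	scx_bpf_reenqueue_local();
}

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 idle;

		/*
		 * Re-enqueued from a preempted CPU: put the task on the
		 * global dsq and poke an idle CPU to pull it.
		 */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		idle = scx_bpf_pick_idle_cpu(p->cpus_ptr);
		if (idle >= 0)
			scx_bpf_kick_cpu(idle, 0);
		return;
	}
	/* ... normal enqueueing ... */
}

scx_example_pair goes one step further: its ->cpu_release() kicks the sibling CPU with SCX_KICK_PREEMPT | SCX_KICK_WAIT so that the pair guarantee holds while the CPU is outside sched_ext's control.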
Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 48 ++++++++++ kernel/sched/ext.c | 122 ++++++++++++++++++++++++- kernel/sched/ext.h | 22 ++++- kernel/sched/sched.h | 1 + tools/sched_ext/scx_common.bpf.h | 1 + tools/sched_ext/scx_example_pair.bpf.c | 101 +++++++++++++++++++- tools/sched_ext/scx_example_qmap.bpf.c | 35 +++++++ 7 files changed, 323 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index dc51304b6599..61f45aa03704 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -134,6 +134,32 @@ struct scx_cgroup_init_args { u32 weight; }; +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + const struct task_struct *task; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -336,6 +362,28 @@ struct sched_ext_ops { */ void (*update_idle)(s32 cpu, bool idle); + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** * prep_enable - Prepare to enable BPF scheduling for a task * @p: task to prepare BPF scheduling for diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index aeaad3d8b05a..9a7a4e54e8fa 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -82,6 +82,7 @@ static bool warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = @@ -1172,6 +1173,19 @@ int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) lockdep_assert_rq_held(rq); + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. 
This callback complements ->cpu_release(), which is + * emitted in scx_notify_pick_next_task(). + */ + if (SCX_HAS_OP(cpu_acquire)) + scx_ops.cpu_acquire(cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + if (prev_on_scx) { WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP); update_curr_scx(rq); @@ -1179,7 +1193,9 @@ int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. - * Tell put_prev_task_scx() to put @prev on local_dsq. + * Tell put_prev_task_scx() to put @prev on local_dsq. If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_released(). * * See scx_ops_disable_workfn() for the explanation on the * disabling() test. @@ -1349,6 +1365,57 @@ static struct task_struct *pick_next_task_scx(struct rq *rq) return p; } +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_RT; + else if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + else if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_STOP; + else + return SCX_CPU_PREEMPT_UNKNOWN; +} + +void __scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *task, + const struct sched_class *active) +{ + lockdep_assert_rq_held(rq); + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. 
+ */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = task, + }; + + scx_ops.cpu_release(cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + #ifdef CONFIG_SMP static bool test_and_clear_cpu_idle(int cpu) @@ -2400,6 +2467,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable_cpuslocked(&scx_has_op[i]); static_branch_disable_cpuslocked(&scx_ops_enq_last); static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -2608,6 +2676,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); @@ -3035,6 +3105,7 @@ void __init init_sched_ext_class(void) init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.watchdog_list); rq->scx.nr_running = 0; + rq->scx.cpu_released = false; #ifdef CONFIG_SMP BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); @@ -3227,6 +3298,53 @@ static const struct btf_kfunc_id_set scx_kfunc_set_consume = { .set = &scx_kfunc_ids_consume, }; +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local dsq + * + * Iterate over all of the tasks currently enqueued on the LOCAL_DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. + */ +u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + struct scx_rq *scx_rq; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + scx_rq = &rq->scx; + + /* + * Get the number of tasks on the local dsq before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local dsq, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. 
+ */ + nr_enqueued = scx_rq->local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = pick_task_scx(rq); + WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(scx_rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +BTF_SET8_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_SET8_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + /** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick @@ -3558,6 +3676,8 @@ static int __init register_ext_kfuncs(void) &scx_kfunc_set_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_consume)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_online)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 8ae717c5e850..e1eaaba3d4c7 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -27,6 +27,17 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + /* * The task being enqueued is the only task available for the cpu. By * default, ext core keeps executing such tasks but when @@ -93,6 +104,8 @@ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); #define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + bool task_on_scx(struct task_struct *p); void scx_pre_fork(struct task_struct *p); int scx_fork(struct task_struct *p); @@ -108,13 +121,17 @@ __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, #define scx_ops_error(fmt, args...) 
\ scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args) +void __scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active); + static inline void scx_notify_pick_next_task(struct rq *rq, const struct task_struct *p, const struct sched_class *active) { -#ifdef CONFIG_SMP if (!scx_enabled()) return; +#ifdef CONFIG_SMP /* * Pairs with the smp_load_acquire() issued by a CPU in * kick_cpus_irq_workfn() who is waiting for this CPU to perform a @@ -122,6 +139,9 @@ static inline void scx_notify_pick_next_task(struct rq *rq, */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + __scx_notify_pick_next_task(rq, p, active); } static inline void scx_notify_sched_tick(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5af758cc1e38..0eea6756bc06 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -711,6 +711,7 @@ struct scx_rq { u64 ops_qseq; u32 nr_running; u32 flags; + bool cpu_released; #ifdef CONFIG_SMP cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h index dc4d3f7b461f..a44720e62d21 100644 --- a/tools/sched_ext/scx_common.bpf.h +++ b/tools/sched_ext/scx_common.bpf.h @@ -60,6 +60,7 @@ extern s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; extern void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; extern bool scx_bpf_task_running(const struct task_struct *p) __ksym; extern s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +extern u32 scx_bpf_reenqueue_local(void) __ksym; /* XXX - temporary ones to be replaced by generic BPF helpers */ extern struct cgroup *scx_bpf_task_cgroup(const struct task_struct *p) __ksym; diff --git a/tools/sched_ext/scx_example_pair.bpf.c b/tools/sched_ext/scx_example_pair.bpf.c index 7694d2169383..fefd6a50e5b3 100644 --- a/tools/sched_ext/scx_example_pair.bpf.c +++ b/tools/sched_ext/scx_example_pair.bpf.c @@ -85,6 +85,28 @@ * be resolved in the near future which should allow greatly simplifying this * scheduler. * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and could be preempted by them at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet @@ -124,6 +146,12 @@ struct pair_ctx { /* the CPUs that are currently active on the cgroup */ u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. 
+ */ + u32 preempted_mask; }; struct { @@ -340,7 +368,7 @@ static int dispatch_loopfn(u32 idx, void *data) struct task_struct *p; u64 now = bpf_ktime_get_ns(); bool kick_pair = false; - bool expired; + bool expired, pair_preempted; u32 *vptr, in_pair_mask; s32 pid; u64 cgid; @@ -369,10 +397,14 @@ static int dispatch_loopfn(u32 idx, void *data) */ pairc->draining = true; - if (pairc->active_mask) { + pair_preempted = pairc->preempted_mask; + if (pairc->active_mask || pair_preempted) { /* - * The other CPU is still active We want to wait until - * this cgroup expires. + * The other CPU is still active, or is no longer under + * our control due to e.g. being preempted by a higher + * priority sched_class. We want to wait until this + * cgroup expires, or until control of our pair CPU has + * been returned to us. * * If the pair controls its CPU, and the time already * expired, kick. When the other CPU arrives at @@ -381,7 +413,7 @@ static int dispatch_loopfn(u32 idx, void *data) */ __sync_fetch_and_add(&nr_exp_waits, 1); bpf_spin_unlock(&pairc->lock); - if (expired) + if (expired && !pair_preempted) kick_pair = true; goto out_maybe_kick; } @@ -485,6 +517,63 @@ void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) bpf_loop(1 << 23, dispatch_loopfn, &cpu_on_stack, 0); } +void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask &= ~in_pair_mask; + /* Kick the pair CPU, unless it was also preempted. */ + kick_pair = !pairc->preempted_mask; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } +} + +void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask |= in_pair_mask; + pairc->active_mask &= ~in_pair_mask; + /* Kick the pair CPU if it's still running. */ + kick_pair = pairc->active_mask; + pairc->draining = true; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)MEMBER_VPTR(pair_cpu, [cpu]); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + } + } + __sync_fetch_and_add(&nr_preemptions, 1); +} + static int alloc_cgrp_q_idx_loopfn(u32 idx, void *data) { u32 q_idx; @@ -546,6 +635,8 @@ SEC(".struct_ops") struct sched_ext_ops pair_ops = { .enqueue = (void *)pair_enqueue, .dispatch = (void *)pair_dispatch, + .cpu_acquire = (void *)pair_cpu_acquire, + .cpu_release = (void *)pair_cpu_release, .cgroup_init = (void *)pair_cgroup_init, .cgroup_exit = (void *)pair_cgroup_exit, .init = (void *)pair_init, diff --git a/tools/sched_ext/scx_example_qmap.bpf.c b/tools/sched_ext/scx_example_qmap.bpf.c index bde8cd339935..b41a5d99beb8 100644 --- a/tools/sched_ext/scx_example_qmap.bpf.c +++ b/tools/sched_ext/scx_example_qmap.bpf.c @@ -11,6 +11,8 @@ * * - BPF-side queueing using PIDs. * - Sleepable per-task storage allocation using ops.prep_enable(). + * - Using ops.cpu_release() to handle a higher priority scheduling class taking + * the CPU away. 
* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo @@ -136,6 +138,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* + * If the task was re-enqueued due to the CPU being preempted by a + * higher priority scheduling class, just re-enqueue the task directly + * on the global DSQ. As we want another CPU to pick it up, find and + * kick an idle CPU. + */ + if (enq_flags & SCX_ENQ_REENQ) { + s32 cpu; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + return; + } + /* Coarsely map the compount weight to a FIFO. */ if (p->scx.weight <= 25) idx = 0; @@ -207,6 +225,22 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + u32 cnt; + + /* + * Called when @cpu is taken by a higher priority scheduling class. This + * makes @cpu no longer available for executing sched_ext tasks. As we + * don't want the tasks in @cpu's local dsq to sit there until @cpu + * becomes available again, re-enqueue them into the global dsq. See + * %SCX_ENQ_REENQ handling in qmap_enqueue(). + */ + cnt = scx_bpf_reenqueue_local(); + if (cnt) + __sync_fetch_and_add(&nr_reenqueued, cnt); +} + s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, struct scx_enable_args *args) { @@ -245,6 +279,7 @@ struct sched_ext_ops qmap_ops = { * spurious dispatches. Let's be lazy and not bother with dequeueing. */ .dispatch = (void *)qmap_dispatch, + .cpu_release = (void *)qmap_cpu_release, .prep_enable = (void *)qmap_prep_enable, .init = (void *)qmap_init, .exit = (void *)qmap_exit, From patchwork Wed Nov 30 08:23:09 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059558 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 6F0ABC433FE for ; Wed, 30 Nov 2022 08:27:10 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234744AbiK3I1J (ORCPT ); Wed, 30 Nov 2022 03:27:09 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55366 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234952AbiK3I0G (ORCPT ); Wed, 30 Nov 2022 03:26:06 -0500 Received: from mail-pf1-x42b.google.com (mail-pf1-x42b.google.com [IPv6:2607:f8b0:4864:20::42b]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A55F7654E3; Wed, 30 Nov 2022 00:24:35 -0800 (PST) Received: by mail-pf1-x42b.google.com with SMTP id o1so11608176pfp.12; Wed, 30 Nov 2022 00:24:35 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=5z3GprH1pp84MWfe+7OnmUCElSOiK2zPKNBF1OgTbsQ=; b=GMkv+ohqpS1dMznFVn3RUzHRhXhriAHoM9YlFlSR5agq6W6y+rPOh2W3C/ErRz7eOt 9aBqm8jGEu5MCG9wdrJlsvsH/IKP57iYLGIf3NEmRpWFO2ERE4+iefqpUzujONrjA04l LAVzSgbt2AvXARIKAEz0iUT4ksAlPBn0LnvCFc8OGmr5HcfBkMbTIrtbnePAVFDCovvK m+3x1oReNUojIC3w6h9aqs9HQRT344qCqjnygzp/M5jMca71zjJZ1iP9YR2DbzVXD/6D T8ETeRrhc+95FDmiDgHbEHdQ6EwPb7cWPIPuXiDAOOL7Cc3ehMvR46LOar/ol3KAcUe5 Skhg== 
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=5z3GprH1pp84MWfe+7OnmUCElSOiK2zPKNBF1OgTbsQ=; b=Vh+95ttd4SeR6DC57UvfxpO9KMCLpT8eWiJx3x4zW+vqQsP4JocCzQ3MSfgONdpmkw E0eDn+2qrnzH3HimnSf87hv8WnWIUiEGeLYgnsoHTE7LolGIzzHFSkrtWZHgHuPvSdPA i1/gRhnZOrvLuUbrMbOb3if/8w2LOq6IzVccNmG06WUz1p1LdMwQF2zlWSCfHcwNemLA YL7nVb6mSC2lMpLtXQ3qx8zKsYO20+VYR+DC54dQP4lUZhawucaRDuG/haZLI776Moek f5Lea00tRXhmpFjUKn9ZbkMoeXhnC06NCpzC67IyZqGD84MEUn7ipYI1QzOHvGC6WH8Y QQWQ== X-Gm-Message-State: ANoB5pm3JHZucLvOwjtP7qIU6QvFX88ooraNqAsSU2wKQLhS3wR8t+Rg iKEZ8mwgvgxhLI43yBs4V4g= X-Google-Smtp-Source: AA0mqf5abYIzg94lJG4TmTISQGdiVpSgpIHOaET9p3gg21b01KHPIESCEdxzJRsJI4BqNja20sqkDw== X-Received: by 2002:a63:fc63:0:b0:478:18d6:ad22 with SMTP id r35-20020a63fc63000000b0047818d6ad22mr12578836pgk.492.1669796674939; Wed, 30 Nov 2022 00:24:34 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id o1-20020a170902bcc100b00187197c4999sm748502pls.167.2022.11.30.00.24.34 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:34 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 27/31] sched_ext: Implement sched_ext_ops.cpu_online/offline() Date: Tue, 29 Nov 2022 22:23:09 -1000 Message-Id: <20221130082313.3241517-28-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Add ops.cpu_online/offline() which are invoked when CPUs come online and offline respectively. As the enqueue path already automatically bypasses tasks to the local dsq on a deactivated CPU, BPF schedulers are guaranteed to see tasks only on CPUs which are between online() and offline(). Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 18 ++++++++++++++++++ kernel/sched/ext.c | 15 +++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 61f45aa03704..d9f941e23011 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -384,6 +384,24 @@ struct sched_ext_ops { */ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ->enqueue() or consume tasks + * associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. 
@cpu doesn't call ->enqueue() or consume tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + /** * prep_enable - Prepare to enable BPF scheduling for a task * @p: task to prepare BPF scheduling for diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9a7a4e54e8fa..aab9ae13b88f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1574,6 +1574,18 @@ void __scx_update_idle(struct rq *rq, bool idle) } } +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) + scx_ops.cpu_online(cpu_of(rq)); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) + scx_ops.cpu_offline(cpu_of(rq)); +} + #else /* !CONFIG_SMP */ static bool test_and_clear_cpu_idle(int cpu) { return false; } @@ -2049,6 +2061,9 @@ DEFINE_SCHED_CLASS(ext) = { .pick_task = pick_task_scx, .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, #endif .task_tick = task_tick_scx, From patchwork Wed Nov 30 08:23:10 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059563 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8C6B8C4321E for ; Wed, 30 Nov 2022 08:27:13 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234878AbiK3I1K (ORCPT ); Wed, 30 Nov 2022 03:27:10 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56000 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235254AbiK3I0G (ORCPT ); Wed, 30 Nov 2022 03:26:06 -0500 Received: from mail-pl1-x62a.google.com (mail-pl1-x62a.google.com [IPv6:2607:f8b0:4864:20::62a]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 01CA95EF82; Wed, 30 Nov 2022 00:24:37 -0800 (PST) Received: by mail-pl1-x62a.google.com with SMTP id p24so12210518plw.1; Wed, 30 Nov 2022 00:24:37 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=B1q5Og4gF9TMiIChJG+mXWVfWA2k2pfdZe1vyZxuhx0=; b=CoqZis41No0S8AUk07QtKJMLPLj05nVLUMimfEOfLp74zpQCcnVVqlt1H6nQFk1ve/ eD0V3kPIRlka8f4pfgN2bLigL8t0jy9pzsGeHfP00qpDLt5u0dXNwgZ51qRJPxsPYxAb HazKTJA6Wu8jiq7Np76aGR/YMRSmo7S0HS0pb4Gfs7ytvyAsbWL7+6QEhzTW+aymf2tb xbOM3aS66kYmbDbC8y6RLCxz7BQKOuvNFof/OPRC37DYYyoJor6HVTiyLalukowsDkRV kIgTKxgBLmiOyHSn6T6ihcJHSF8CTR4RCXzAmKVjqYUi79P5LKClyEUVr3kp2Dnq+mgI ewzQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=B1q5Og4gF9TMiIChJG+mXWVfWA2k2pfdZe1vyZxuhx0=; b=FbQVU7R4ETpfPXAiW/WAWxVGR8vvpyPLT/qmAPY1uik4J7rz6Pda/XbI5TOepw1FGS enkviue67Owd1pcO+ziPRXa11OcNsQugbueMrf/SQ2LABFFzeerCoNMCox1cvxM+MS8d M+35U4tyIHhErmIpZ9QGd2s3kLCsBKv0wO/mnfoqK4mBC9x0mSdNDEO+dTWBGH1uK7cQ aJZIfeOZXU8X6mod++ixpCz4Du4Bb12s9H+/4ZyVEZYtamWUQr3aAmitHWHd1T4EfPNr 
XdNne0vlRdzEUs/B9w2AJHlc22tfFFbesKLdwiHyE9LlSWMZsCiiqnCdEkFpzCz5/DpP ndhQ== X-Gm-Message-State: ANoB5plOgimV5BXLLW5Y/yVuKzOTSuNIKQOW/7QvoTS/zLVxrTZukPpc pHDloBJf+kaNfOuR8ndQQCA= X-Google-Smtp-Source: AA0mqf6+Fre1nmSTpWARo+gUGk4V/WdO9j19MlfgjXqZNXLR0ssxUZtEHCKYekvslkZMD0TiLdsTKA== X-Received: by 2002:a17:902:b613:b0:188:f570:7bd6 with SMTP id b19-20020a170902b61300b00188f5707bd6mr41149360pls.97.1669796677134; Wed, 30 Nov 2022 00:24:37 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id mm10-20020a17090b358a00b00218f9bd50c7sm681574pjb.50.2022.11.30.00.24.36 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:36 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 28/31] sched_ext: Add Documentation/scheduler/sched-ext.rst Date: Tue, 29 Nov 2022 22:23:10 -1000 Message-Id: <20221130082313.3241517-29-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-ext.rst | 230 ++++++++++++++++++++++++++ include/linux/sched/ext.h | 2 + kernel/sched/ext.c | 2 + kernel/sched/ext.h | 2 + 5 files changed, 237 insertions(+) create mode 100644 Documentation/scheduler/sched-ext.rst diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index b430d856056a..8a27a9967284 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -18,6 +18,7 @@ Linux Scheduler sched-nice-design sched-rt-group sched-stats + sched-ext sched-debug text_files diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst new file mode 100644 index 000000000000..81f78e05a6c2 --- /dev/null +++ b/Documentation/scheduler/sched-ext.rst @@ -0,0 +1,230 @@ +========================== +Extensible Scheduler Class +========================== + +sched_ext is a scheduler class whose behavior can be defined by a set of BPF +programs - the BPF scheduler. + +* sched_ext exports a full scheduling interface so that any scheduling + algorithm can be implemented on top. + +* The BPF scheduler can group CPUs however it sees fit and schedule them + together, as tasks aren't tied to specific CPUs at the time of wakeup. + +* The BPF scheduler can be turned on and off dynamically anytime. + +* The system integrity is maintained no matter what the BPF scheduler does. + The default scheduling behavior is restored anytime an error is detected, + a runnable task stalls, or on sysrq-S. 
+ +Switching to and from sched_ext +=============================== + +``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and +``tools/sched_ext`` contains the example schedulers. + +sched_ext is used only when the BPF scheduler is loaded and running. + +If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be +treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is +loaded. On load, such tasks will be switched to and scheduled by sched_ext. + +The BPF scheduler can choose to schedule all normal and lower class tasks by +calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this +case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and +``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, +this mode can be selected with the ``-a`` option. + +Terminating the sched_ext scheduler program, triggering sysrq-S, or +detection of any internal error including stalled runnable tasks aborts the +BPF scheduler and reverts all tasks back to CFS. + +.. code-block:: none + + # make -j16 -C tools/sched_ext + # tools/sched_ext/scx_example_dummy -a + local=0 global=3 + local=5 global=24 + local=9 global=44 + local=13 global=56 + local=17 global=72 + ^CEXIT: BPF scheduler unregistered + +If ``CONFIG_SCHED_DEBUG`` is set, the current status of the BPF scheduler +and whether a given task is on sched_ext can be determined as follows: + +.. code-block:: none + + # cat /sys/kernel/debug/sched/ext + ops : dummy + enabled : 1 + switching_all : 1 + switched_all : 1 + enable_state : enabled + + # grep ext /proc/self/sched + ext.enabled : 1 + +The Basics +========== + +Userspace can implement an arbitrary BPF scheduler by loading a set of BPF +programs that implement ``struct sched_ext_ops``. The only mandatory field +is ``.name`` which must be a valid BPF object name. All operations are +optional. The following modified excerpt is from +``tools/sched/scx_example_dummy.bpf.c`` showing a minimal global FIFO +scheduler. + +.. code-block:: c + + s32 BPF_STRUCT_OPS(dummy_init) + { + if (switch_all) + scx_bpf_switch_all(); + return 0; + } + + void BPF_STRUCT_OPS(dummy_enqueue, struct task_struct *p, u64 enq_flags) + { + if (enq_flags & SCX_ENQ_LOCAL) + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, enq_flags); + else + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, enq_flags); + } + + void BPF_STRUCT_OPS(dummy_exit, struct scx_exit_info *ei) + { + exit_type = ei->type; + } + + SEC(".struct_ops") + struct sched_ext_ops dummy_ops = { + .enqueue = (void *)dummy_enqueue, + .init = (void *)dummy_init, + .exit = (void *)dummy_exit, + .name = "dummy", + }; + +Dispatch Queues +--------------- + +To match the impedance between the scheduler core and the BPF scheduler, +sched_ext uses simple FIFOs called dsq's (dispatch queues). By default, +there is one global FIFO (``SCX_DSQ_GLOBAL``), and one local dsq per CPU +(``SCX_DSQ_LOCAL``). The BPF scheduler can manage an arbitrary number of +dsq's using ``scx_bpf_create_dsq()`` and ``scx_bpf_destroy_dsq()``. + +A task is always *dispatch*ed to a dsq for execution. The task starts +execution when a CPU *consume*s the task from the dsq. + +Internally, a CPU only executes tasks which are running on its local dsq, +and the ``.consume()`` operation is in fact a transfer of a task from a +remote dsq to the CPU's local dsq. 
A CPU therefore only consumes from other +dsq's when its local dsq is empty, and dispatching a task to a local dsq +will cause it to be executed before the CPU attempts to consume tasks which +were previously dispatched to other dsq's. + +Scheduling Cycle +---------------- + +The following briefly shows how a waking task is scheduled and executed. + +1. When a task is waking up, ``.select_cpu()`` is the first operation + invoked. This serves two purposes. First, CPU selection optimization + hint. Second, waking up the selected CPU if idle. + + The CPU selected by ``.select_cpu()`` is an optimization hint and not + binding. The actual decision is made at the last step of scheduling. + However, there is a small performance gain if the CPU ``.select_cpu()`` + returns matches the CPU the task eventually runs on. + + A side-effect of selecting a CPU is waking it up from idle. While a BPF + scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, + using ``.select_cpu()`` judiciously can be simpler and more efficient. + + Note that the scheduler core will ignore an invalid CPU selection, for + example, if it's outside the allowed cpumask of the task. + +2. Once the target CPU is selected, ``.enqueue()`` is invoked. It can make + one of the following decisions: + + * Immediately dispatch the task to either the global or local dsq by + calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or + ``SCX_DSQ_LOCAL``, respectively. + + * Immediately dispatch the task to a user-created dsq by calling + ``scx_bpf_dispatch()`` with a dsq ID which is smaller than 2^63. + + * Queue the task on the BPF side. + +3. When a CPU is ready to schedule, it first looks at its local dsq. If + empty, it invokes ``.consume()`` which should make one or more + ``scx_bpf_consume()`` calls to consume tasks from dsq's. If a + ``scx_bpf_consume()`` call succeeds, the CPU has the next task to run and + ``.consume()`` can return. + + If ``.consume()`` is not implemented, the built-in ``SCX_DSQ_GLOBAL`` dsq + is consumed by default. + +4. If there's still no task to run, ``.dispatch()`` is invoked which should + make one or more ``scx_bpf_dispatch()`` calls to dispatch tasks from the + BPF scheduler to one of the dsq's. If more than one task has been + dispatched, go back to the previous consumption step. + +5. If there's still no task to run, ``.consume_final()`` is invoked. + ``.consume_final()`` is equivalent to ``.consume()``, but is invoked + right before the CPU goes idle. This provide schedulers with a hook that + can be used to implement, e.g., more aggressive work stealing from remote + dsq's. + +Note that the BPF scheduler can always choose to dispatch tasks immediately +in ``.enqueue()`` as illustrated in the above dummy example. In such case, +there's no need to implement ``.dispatch()`` as a task is never queued on +the BPF side. + +Where to Look +============= + +* ``include/linux/sched/ext.h`` defines the core data structures, ops table + and constants. + +* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. + The functions prefixed with ``scx_bpf_`` can be called from the BPF + scheduler. + +* ``tools/sched_ext/`` hosts example BPF scheduler implementations. + + * ``scx_example_dummy[.bpf].c``: Minimal global FIFO scheduler example + using a custom dsq. + + * ``scx_example_qmap[.bpf].c``: A multi-level FIFO scheduler supporting + five levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. 
+
+ABI Instability
+===============
+
+The APIs provided by sched_ext to BPF scheduler programs have no stability
+guarantees. This includes the ops table callbacks and constants defined in
+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in
+``kernel/sched/ext.c``.
+
+While we will attempt to provide a relatively stable API surface when
+possible, these interfaces are subject to change without warning between
+kernel versions.
+
+Caveats
+=======
+
+* The current implementation isn't safe in that the BPF scheduler can crash
+  the kernel.
+
+  * Unsafe cpumask helpers should be replaced by proper generic BPF helpers.
+
+  * Currently, all kfunc helpers can be called by any operation as BPF
+    doesn't yet support filtering kfunc calls per struct_ops operation. Some
+    helpers are context sensitive and should be restricted accordingly.
+
+  * Timers used by the BPF scheduler should be shut down when aborting.
+
+* There are a couple of BPF hacks which are still needed even for sched_ext
+  proper. They should be removed in the near future.
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index d9f941e23011..49eda3adeecf 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo
  * Copyright (c) 2022 David Vernet
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index aab9ae13b88f..a28144220501 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo
  * Copyright (c) 2022 David Vernet
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index e1eaaba3d4c7..b97dbb840ac9 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet From patchwork Wed Nov 30 08:23:11 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059565 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 03EC1C4321E for ; Wed, 30 Nov 2022 08:27:19 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234957AbiK3I1O (ORCPT ); Wed, 30 Nov 2022 03:27:14 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56002 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234964AbiK3I0G (ORCPT ); Wed, 30 Nov 2022 03:26:06 -0500 Received: from mail-pf1-x42c.google.com (mail-pf1-x42c.google.com [IPv6:2607:f8b0:4864:20::42c]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 15ED55EFAB; Wed, 30 Nov 2022 00:24:40 -0800 (PST) Received: by mail-pf1-x42c.google.com with SMTP id h28so2570931pfq.9; Wed, 30 Nov 2022 00:24:40 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=4IS0nMCdHVulRyAQQAAlt29WZFhq5Fd/tcL0lNTuYBs=; b=i6i7qAO21L8+XaNrsf8d4VQSyJX/P5bEwDd8mJ3fWT7NvpxYeRnEcmM0Ok3jnjkw4P mDJdITZNohLJGys+2fEKBhzUMLvudWlPhwpwWwCcecLZ1nx/F4WBmvFeuYWP1XeIizwn 5aEjbS4sOzECEcBjplXpUoZWtIibWMOiz+icFcBRIMpOzsYNSaN8duzT+7fHo7ZMUJu2 WarcJYtN2tJ0WU1/woYCDFia2los3Wdp/Lg7yNdUe+0JnlI1Qqp1O/tEDFdmBbdYOMWl q22AUZ0SvWMEKteAkwAqrcC14N6+N8324AhfW9UTg/xTHX61XrWU/tLZmRmBPJHE8Lz0 XsyQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=4IS0nMCdHVulRyAQQAAlt29WZFhq5Fd/tcL0lNTuYBs=; b=RqNdJKA5zpJG0HZjUccAZBcEloKAblfhTGHf7zRAOZfBF2vDL7OXS2r80YGaRRyHTR r0vL8DOg7Rdu4re8OjKoPLpJIVwS1ju6jS2VVsxOpQEz1W1io1og9HJhOGo3zckE0Cjt Zv9kTEOBF7VC4LMk8QNBOLo37XLzcTrBS///MMh1OS88KCFrcZtiiiMjMwdoQpyj+F7P zLkv7kL541ngjUqClbvH+guEmrOYGyLjdvtJu8QoF4nnslykAG8X4Drg+dVR9QVaGpzP +S9Vlq7P9J4WAt/tlihIDFb8U/44Hmm3GADh8Mfc3k0LN4TregP9P2HZ64lL0JYfGERb rH5A== X-Gm-Message-State: ANoB5pnvEmXSoyRJGnFTLZVVn176Y5as4Nlt5qedN3JTMhGn+Me1mCCr HDgvNoph1vMSzu68I2eSEXo= X-Google-Smtp-Source: AA0mqf5EqYuK/Rl+DMmDo9jEujUnd/+ga1YGkZxVcsCoAP14hUxzVwbkreE48X9sJNr0HasC9iI5MQ== X-Received: by 2002:a63:5857:0:b0:46f:9763:a37b with SMTP id i23-20020a635857000000b0046f9763a37bmr39090104pgm.177.1669796679270; Wed, 30 Nov 2022 00:24:39 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id q13-20020a17090311cd00b00189929219acsm755650plh.183.2022.11.30.00.24.38 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:38 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, 
joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 29/31] sched_ext: Add a basic, userland vruntime scheduler Date: Tue, 29 Nov 2022 22:23:11 -1000 Message-Id: <20221130082313.3241517-30-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org From: David Vernet This patch adds a new scx_example_userland BPF scheduler that implements a fairly unsophisticated sorted-list vruntime scheduler in userland to demonstrate how most scheduling decisions can be delegated to userland. The scheduler doesn't implement load balancing, and treats all tasks as part of a single domain. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo --- tools/sched_ext/.gitignore | 1 + tools/sched_ext/Makefile | 10 +- tools/sched_ext/scx_example_userland.bpf.c | 265 ++++++++++++ tools/sched_ext/scx_example_userland.c | 403 ++++++++++++++++++ tools/sched_ext/scx_example_userland_common.h | 19 + 5 files changed, 696 insertions(+), 2 deletions(-) create mode 100644 tools/sched_ext/scx_example_userland.bpf.c create mode 100644 tools/sched_ext/scx_example_userland.c create mode 100644 tools/sched_ext/scx_example_userland_common.h diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore index ebc34dcf925b..75a536dfebfc 100644 --- a/tools/sched_ext/.gitignore +++ b/tools/sched_ext/.gitignore @@ -2,6 +2,7 @@ scx_example_dummy scx_example_qmap scx_example_central scx_example_pair +scx_example_userland *.skel.h *.subskel.h /tools/ diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 45ab39139afc..563b54333ab1 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -114,7 +114,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -Wno-compare-distinct-pointer-types \ -O2 -mcpu=v3 -all: scx_example_dummy scx_example_qmap scx_example_central scx_example_pair +all: scx_example_dummy scx_example_qmap scx_example_central scx_example_pair \ + scx_example_userland # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -181,11 +182,16 @@ scx_example_pair: scx_example_pair.c scx_example_pair.skel.h user_exit_info.h $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +scx_example_userland: scx_example_userland.c scx_example_userland.skel.h \ + scx_example_userland_common.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + clean: rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_example_dummy scx_example_qmap scx_example_central \ - scx_example_pair + scx_example_pair scx_example_userland .PHONY: all clean diff --git a/tools/sched_ext/scx_example_userland.bpf.c b/tools/sched_ext/scx_example_userland.bpf.c new file mode 100644 index 000000000000..b0af532b4db0 --- /dev/null +++ b/tools/sched_ext/scx_example_userland.bpf.c @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. 
+ * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_common.bpf.h" +#include "scx_example_userland_common.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_all; +const volatile u32 num_possible_cpus; +const volatile s32 usersched_pid; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +struct user_exit_info uei; + +/* + * Whether the user space scheduler needs to be scheduled due to a task being + * enqueued in user space. + */ +static bool usersched_needed; + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = scx_bpf_find_task_by_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + /* + * While p should never be NULL, have logic to return current so the + * caller doesn't have to bother with checking for NULL. 
+ */ + return p ?: bpf_get_current_task_btf(); +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + usersched_needed = false; + scx_bpf_dispatch(usersched_task(), SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task; + + memset(&task, 0, sizeof(task)); + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + usersched_needed = true; + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +static int drain_dispatch_q_loopfn(u32 idx, void *data) +{ + s32 cpu = *(s32 *)data; + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + return 1; + + /* + * The task could have exited by the time we get around to dispatching + * it. Treat this as a normal occurrence, and simply move onto the next + * iteration. + */ + p = scx_bpf_find_task_by_pid(pid); + if (!p) + return 0; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + return 0; +} + +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (usersched_needed) + dispatch_user_scheduler(); + + /* XXX: Use an iterator when it's available. 
*/ + bpf_loop(4096, drain_dispatch_q_loopfn, &cpu, 0); +} + +s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + int ret; + + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + if (switch_all) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops userland_ops = { + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .prep_enable = (void *)userland_prep_enable, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .timeout_ms = 3000, + .name = "userland", +}; diff --git a/tools/sched_ext/scx_example_userland.c b/tools/sched_ext/scx_example_userland.c new file mode 100644 index 000000000000..4ddd257b9e42 --- /dev/null +++ b/tools/sched_ext/scx_example_userland.c @@ -0,0 +1,403 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "user_exit_info.h" +#include "scx_example_userland_common.h" +#include "scx_example_userland.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-a]\n" +"\n" +" -a Switch all tasks\n" +" -b The number of tasks to batch when dispatching.\n" +" Defaults to 8\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_example_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. 
The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The statically allocated array of tasks. We use a statically allocated list + * here to avoid having to allocate on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task tasks[USERLAND_MAX_TASKS]; + +static double min_vruntime; + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + fprintf(stderr, "Failed to dispatch task %d\n", pid); + exit_req = 1; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= USERLAND_MAX_TASKS) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. 
+ */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) + return; + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + return; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + fprintf(stderr, "Failed to dispatch task %d in %u\n", + pid, i); + return; + } + } +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("o-----------------------o\n"); + printf("\n\n"); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static int bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + bool switch_all = false; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. 
+ */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + if (err) { + fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(err)); + return err; + } + + while ((opt = getopt(argc, argv, "ahb:")) != -1) { + switch (opt) { + case 'a': + switch_all = true; + break; + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + if (err) { + fprintf(stderr, "Failed to prefault and lock address space: %s\n", + strerror(err)); + return err; + } + + skel = scx_example_userland__open(); + if (!skel) { + fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno)); + return errno; + } + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + skel->rodata->switch_all = switch_all; + + err = scx_example_userland__load(skel); + if (err) { + fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err)); + goto destroy_skel; + } + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + err = spawn_stats_thread(); + if (err) { + fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err)); + goto destroy_skel; + } + + ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); + if (!ops_link) { + fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno)); + err = errno; + goto destroy_skel; + } + + return 0; + +destroy_skel: + scx_example_userland__destroy(skel); + exit_req = 1; + return err; +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + int err; + + err = bootstrap(argc, argv); + if (err) { + fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err)); + return err; + } + + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + uei_print(&skel->bss->uei); + scx_example_userland__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_example_userland_common.h b/tools/sched_ext/scx_example_userland_common.h new file mode 100644 index 000000000000..639c6809c5ff --- /dev/null +++ b/tools/sched_ext/scx_example_userland_common.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +#define USERLAND_MAX_TASKS 8192 + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. 
+ */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H From patchwork Wed Nov 30 08:23:12 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059564 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1C545C433FE for ; Wed, 30 Nov 2022 08:27:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234921AbiK3I1L (ORCPT ); Wed, 30 Nov 2022 03:27:11 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54440 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234798AbiK3I0G (ORCPT ); Wed, 30 Nov 2022 03:26:06 -0500 Received: from mail-pl1-x631.google.com (mail-pl1-x631.google.com [IPv6:2607:f8b0:4864:20::631]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 3DD36711A8; Wed, 30 Nov 2022 00:24:42 -0800 (PST) Received: by mail-pl1-x631.google.com with SMTP id k7so15935302pll.6; Wed, 30 Nov 2022 00:24:42 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=wCwWDY7A5zziFJP94Jv5F0FRo6+xcm6X2sOasxBuB3o=; b=EC4pL5gGWBRqUVIAXv7YYhncHn3GjLY4MOa5M8M9RrHHrbGakKN7ObFNYwwe69u4RO +9mVtohnknVMahSlffehR69/gCG9Rv5qIZakZe/uQMm7pqSRdXy2Q8rDwBOs3wWSryWf 3QbVkTYOciPUHFFu3XlwyfssWIYVFuD6NQeSLv+32PapLcos7GjAzpxtRwn1+dUbzMeH 5PcSOXE7IPrVfGZ3Uap8IW7oh4p+2HO+9DgH/3MaxzCX0jGyS4CpstzSdr+SQb650d/J g4oi7jNXGX6FF3ZBSowF8AoeUiMtrKOWg1wvBFHGlFH1Jlxohw4277rUrXEw2XX3lO/9 +WYA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=wCwWDY7A5zziFJP94Jv5F0FRo6+xcm6X2sOasxBuB3o=; b=I3zC3SPBeYUjRFAEzMVa5Fgg/ii4/yu61UMmbEe+SiyIVQwC5sY4YDOpFke0sj6jkg CdtYJOFX7iQKvgN8LmC//n0zOVyS/Zy2PJdlHIHLmdZeCwnWxk4CIxJu7YeVdDnncPEI cD9yI5diBXPYjy2KTK7jcKHh/VrA1JC9CjHALebrlnz1YmXmSogswyLfirxtgcogBE5s JOOAXfmZXhjlIbVBu6qERk4dXTELZfuJjHs0ti9pX/qfJj0d3ll+MZ5eIOaIh/+XSHVx po+YtOKSx6xTHfWR0ySpkKko1DqKNYpEO9C3hSXjPm/hmWJODigwv3nbQXg4GLhANRIt GgGg== X-Gm-Message-State: ANoB5plXbAR56boL2yzrfZiViqCfcs38t9Ll1LklWhNIgnEImiau1Cre 31WuhCfRNOn1IKiITrK9BDg= X-Google-Smtp-Source: AA0mqf4MaKQiWtHhjv1W3oXmBFdXCEHg7OTn3mFDTQXZtH4K4ikGQNetNRAHaHZxV+1Zft84MtFxwQ== X-Received: by 2002:a17:902:b68d:b0:189:ab77:d07 with SMTP id c13-20020a170902b68d00b00189ab770d07mr1017940pls.53.1669796681405; Wed, 30 Nov 2022 00:24:41 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id mp4-20020a17090b190400b002192f87474fsm683580pjb.37.2022.11.30.00.24.40 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:41 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, 
daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo , Dave Marchevsky Subject: [PATCH 30/31] BPF: [TEMPORARY] Nerf BTF scalar value check Date: Tue, 29 Nov 2022 22:23:12 -1000 Message-Id: <20221130082313.3241517-31-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org X-Patchwork-Delegate: bpf@iogearbox.net THIS IS A TEMPORARY WORKAROUND. WILL BE DROPPED. This is needed for scx_example_atropos for now. This won't be needed once we have generic BPF bitmask / cpumask helpers. NOT_SIGNED_OFF Cc: Alexei Starovoitov Cc: Dave Marchevsky --- kernel/bpf/btf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5579ff3a5b54..1dc5a6e5e530 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6400,11 +6400,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return ret; } } - +#warning "btf argument scalar test nerfed" + /* if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; + */ + continue; } if (!btf_type_is_ptr(t)) { From patchwork Wed Nov 30 08:23:13 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tejun Heo X-Patchwork-Id: 13059567 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0FE9FC352A1 for ; Wed, 30 Nov 2022 08:27:21 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234982AbiK3I1T (ORCPT ); Wed, 30 Nov 2022 03:27:19 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56324 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235039AbiK3I0L (ORCPT ); Wed, 30 Nov 2022 03:26:11 -0500 Received: from mail-pj1-x1034.google.com (mail-pj1-x1034.google.com [IPv6:2607:f8b0:4864:20::1034]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D505F65E4B; Wed, 30 Nov 2022 00:24:44 -0800 (PST) Received: by mail-pj1-x1034.google.com with SMTP id u15-20020a17090a3fcf00b002191825cf02so1225702pjm.2; Wed, 30 Nov 2022 00:24:44 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:sender:from:to:cc:subject:date :message-id:reply-to; bh=7XZP5SOZqgHI8H6KI1Dt2EKjXnbk5rPnL6vKag+G4wU=; b=IhqNJcXnKtzT12gERSN27qmVwobsBKcYPdmz06+aQeGR7BAZfKHPZiPutRAiTBGoSW 7Z26rNdI3e+pCyGzkEmN3vKMisZ7ZiDqg7/9ERbtA7Ksq/8bK4UroHrDzbwqW/qiCZk/ AGdiv/d2Q//rOXkg+R5/LgGtz60rjvYPTC3OpdpyIqNINoJDqazPe9KdiETDs+XDNIc3 dbmRwf+dR/uzlqR1uA+qum/a02nQ8mQ6gsrt7jZeyM+CVeWRULULZPy9Fge9YHOaS/DP R7ESChB+RpAXKrquNOdg0Swyzr2u/JxdeWLyUPuwrcW6OEdZ8R4P5Z0C2qB6Jk+05+BK nCDw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to 
:message-id:date:subject:cc:to:from:sender:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=7XZP5SOZqgHI8H6KI1Dt2EKjXnbk5rPnL6vKag+G4wU=; b=vMJXvRleEW9pf+RjndiMbf3d1ngJlELmPZTorayFaIkS6dNSkNI0OyL7TSKGJphhBR EhPRtcMiS8bBKm7zBRhtTBqcSAVKfwY/g3j0A4jRIybvKiendl6IterSZT+das93b4O1 5R+kmV65iCTROKFIPigTFtFMqJcY2FWhWCQJy0YLZNQf7rmsKu2rH2sP0bC+w42hTRB/ LL6cVq2eN2QoggKJKoF+cNeUbcGdcFthHtj1aPW4tIw0gfJ3AZ8vW2YViC28pl24K+v+ 9v6o74UDRQFe5lbvd0ZoMEt8TJGf9WcBoFu89xVSX7x1QG5PxWeBAIApM2v2BGMpyzXE ZGUQ== X-Gm-Message-State: ANoB5pn4dl59h+rzImFyXT03rCNuo54Woxiorqt1B4MLD2f7I53OHqk0 1iQOnWsmEKEqeMMqHbanz5c= X-Google-Smtp-Source: AA0mqf5pohg13H/jmB+4ffra/bue23Gq9AuxG7ozxXWubvZrmu+uRjRN4LNGhGw7Y0eNfRQR6o/0/g== X-Received: by 2002:a17:902:bcc7:b0:188:f42e:6a90 with SMTP id o7-20020a170902bcc700b00188f42e6a90mr45320254pls.127.1669796683729; Wed, 30 Nov 2022 00:24:43 -0800 (PST) Received: from localhost ([2600:380:4a00:1415:d028:b547:7d35:7b0b]) by smtp.gmail.com with ESMTPSA id k14-20020aa7972e000000b0056bb4bbfb9bsm830113pfg.95.2022.11.30.00.24.42 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 30 Nov 2022 00:24:43 -0800 (PST) Sender: Tejun Heo From: Tejun Heo To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo Subject: [PATCH 31/31] sched_ext: Add a rust userspace hybrid example scheduler Date: Tue, 29 Nov 2022 22:23:13 -1000 Message-Id: <20221130082313.3241517-32-tj@kernel.org> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221130082313.3241517-1-tj@kernel.org> References: <20221130082313.3241517-1-tj@kernel.org> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: bpf@vger.kernel.org From: Dan Schatzberg Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF part does simple round robin in each domain and the userspace part calculates the load factor of each domain and tells the BPF part how to load balance the domains. This scheduler demonstrates dividing scheduling logic between BPF and userspace and using rust to build the userspace part. An earlier variant of this scheduler was used to balance across six domains, each representing a chiplet in a six-chiplet AMD processor, and could match the performance of production setup using CFS. 
Signed-off-by: Dan Schatzberg Signed-off-by: Tejun Heo --- tools/sched_ext/Makefile | 13 +- tools/sched_ext/atropos/.gitignore | 3 + tools/sched_ext/atropos/Cargo.toml | 34 + tools/sched_ext/atropos/build.rs | 70 ++ tools/sched_ext/atropos/rustfmt.toml | 8 + tools/sched_ext/atropos/src/bpf/atropos.bpf.c | 501 ++++++++++++++ tools/sched_ext/atropos/src/bpf/atropos.h | 38 + tools/sched_ext/atropos/src/main.rs | 648 ++++++++++++++++++ .../sched_ext/atropos/src/oss/atropos_sys.rs | 10 + tools/sched_ext/atropos/src/oss/mod.rs | 29 + tools/sched_ext/atropos/src/util.rs | 24 + 11 files changed, 1376 insertions(+), 2 deletions(-) create mode 100644 tools/sched_ext/atropos/.gitignore create mode 100644 tools/sched_ext/atropos/Cargo.toml create mode 100644 tools/sched_ext/atropos/build.rs create mode 100644 tools/sched_ext/atropos/rustfmt.toml create mode 100644 tools/sched_ext/atropos/src/bpf/atropos.bpf.c create mode 100644 tools/sched_ext/atropos/src/bpf/atropos.h create mode 100644 tools/sched_ext/atropos/src/main.rs create mode 100644 tools/sched_ext/atropos/src/oss/atropos_sys.rs create mode 100644 tools/sched_ext/atropos/src/oss/mod.rs create mode 100644 tools/sched_ext/atropos/src/util.rs diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index 563b54333ab1..c8802cefb98a 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -84,6 +84,8 @@ CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) +CARGOFLAGS := --release + # Silence some warnings when compiled with clang ifneq ($(LLVM),) CFLAGS += -Wno-unused-command-line-argument @@ -115,7 +117,7 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ -O2 -mcpu=v3 all: scx_example_dummy scx_example_qmap scx_example_central scx_example_pair \ - scx_example_userland + scx_example_userland atropos # sort removes libbpf duplicates when not cross-building MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ @@ -187,13 +189,20 @@ scx_example_userland: scx_example_userland.c scx_example_userland.skel.h \ $(CC) $(CFLAGS) -c $< -o $@.o $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) +atropos: export RUSTFLAGS = -C link-args=-lzstd +atropos: export ATROPOS_CLANG = $(CLANG) +atropos: export ATROPOS_BPF_CFLAGS = $(BPF_CFLAGS) +atropos: $(INCLUDE_DIR)/vmlinux.h + cargo build --manifest-path=atropos/Cargo.toml $(CARGOFLAGS) + clean: + cargo clean --manifest-path=atropos/Cargo.toml rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) rm -f *.o *.bpf.o *.skel.h *.subskel.h rm -f scx_example_dummy scx_example_qmap scx_example_central \ scx_example_pair scx_example_userland -.PHONY: all clean +.PHONY: all atropos clean # delete failed targets .DELETE_ON_ERROR: diff --git a/tools/sched_ext/atropos/.gitignore b/tools/sched_ext/atropos/.gitignore new file mode 100644 index 000000000000..186dba259ec2 --- /dev/null +++ b/tools/sched_ext/atropos/.gitignore @@ -0,0 +1,3 @@ +src/bpf/.output +Cargo.lock +target diff --git a/tools/sched_ext/atropos/Cargo.toml b/tools/sched_ext/atropos/Cargo.toml new file mode 100644 index 000000000000..67e543c457c4 --- /dev/null +++ b/tools/sched_ext/atropos/Cargo.toml @@ -0,0 +1,34 @@ +# @generated by autocargo + +[package] +name = "atropos-bin" +version = "0.5.0" +authors = ["Dan Schatzberg ", "Meta"] +edition = "2021" +description = "Userspace scheduling with BPF" +readme = "../README.md" +repository = "https://github.com/facebookincubator/atropos" +license = "Apache-2.0" + +[dependencies] +anyhow = "1.0.65" +bitvec = { version = 
"1.0", features = ["serde"] } +clap = { version = "3.2.17", features = ["derive", "env", "regex", "unicode", "wrap_help"] } +ctrlc = { version = "3.1", features = ["termination"] } +fb_procfs = { git = "https://github.com/facebookincubator/below.git", rev = "f305730"} +hex = "0.4.3" +libbpf-rs = "0.19.1" +libc = "0.2.137" +slog = { version = "2.7", features = ["max_level_trace", "nested-values"] } +slog-async = { version = "2.3", features = ["nested-values"] } +slog-term = "2.8" + +[build-dependencies] +bindgen = { version = "0.61.0", features = ["logging", "static"], default-features = false } +libbpf-cargo = "0.13.0" + +[features] +enable_backtrace = [] + +[patch.crates-io] +libbpf-sys = { git = "https://github.com/dschatzberg/libbpf-sys", rev = "0ef4011"} diff --git a/tools/sched_ext/atropos/build.rs b/tools/sched_ext/atropos/build.rs new file mode 100644 index 000000000000..26e792c5e17e --- /dev/null +++ b/tools/sched_ext/atropos/build.rs @@ -0,0 +1,70 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +extern crate bindgen; + +use std::env; +use std::fs::create_dir_all; +use std::path::Path; +use std::path::PathBuf; + +use libbpf_cargo::SkeletonBuilder; + +const HEADER_PATH: &str = "src/bpf/atropos.h"; + +fn bindgen_atropos() { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed={}", HEADER_PATH); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(HEADER_PATH) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. + .expect("Unable to generate bindings"); + + // Write the bindings to the $OUT_DIR/bindings.rs file. + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("atropos-sys.rs")) + .expect("Couldn't write bindings!"); +} + +fn gen_bpf_sched(name: &str) { + let bpf_cflags = env::var("ATROPOS_BPF_CFLAGS").unwrap(); + let clang = env::var("ATROPOS_CLANG").unwrap(); + eprintln!("{}", clang); + let outpath = format!("./src/bpf/.output/{}.skel.rs", name); + let skel = Path::new(&outpath); + let src = format!("./src/bpf/{}.bpf.c", name); + SkeletonBuilder::new() + .source(src.clone()) + .clang(clang) + .clang_args(bpf_cflags) + .build_and_generate(&skel) + .unwrap(); + println!("cargo:rerun-if-changed={}", src); +} + +fn main() { + bindgen_atropos(); + // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. + // Reasons are because the generated skeleton contains compiler attributes + // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` + // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside + // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). + // + // However, there is hope! When the above feature stabilizes we can clean this + // all up. 
+ create_dir_all("./src/bpf/.output").unwrap(); + gen_bpf_sched("atropos"); +} diff --git a/tools/sched_ext/atropos/rustfmt.toml b/tools/sched_ext/atropos/rustfmt.toml new file mode 100644 index 000000000000..b7258ed0a8d8 --- /dev/null +++ b/tools/sched_ext/atropos/rustfmt.toml @@ -0,0 +1,8 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. +edition = "2021" +group_imports = "StdExternalCrate" +imports_granularity = "Item" +merge_derives = false +use_field_init_shorthand = true +version = "Two" diff --git a/tools/sched_ext/atropos/src/bpf/atropos.bpf.c b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c new file mode 100644 index 000000000000..9e5164131de5 --- /dev/null +++ b/tools/sched_ext/atropos/src/bpf/atropos.bpf.c @@ -0,0 +1,501 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +// +// Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF +// part does simple round robin in each domain and the userspace part +// calculates the load factor of each domain and tells the BPF part how to load +// balance the domains. +// +// Every task has an entry in the task_data map which lists which domain the +// task belongs to. When a task first enters the system (atropos_enable), they +// are round-robined to a domain. +// +// atropos_select_cpu is the primary scheduling logic, invoked when a task +// becomes runnable. The lb_data map is populated by userspace to inform the BPF +// scheduler that a task should be migrated to a new domain. Otherwise, the task +// is scheduled in priority order as follows: +// * The current core if the task was woken up synchronously and there are idle +// cpus in the system +// * The previous core, if idle +// * The pinned-to core if the task is pinned to a specific core +// * Any idle cpu in the domain +// +// If none of the above conditions are met, then the task is enqueued to a +// dispatch queue corresponding to the domain (atropos_enqueue). +// +// atropos_consume will attempt to consume a task from its domain's +// corresponding dispatch queue (this occurs after scheduling any tasks directly +// assigned to it due to the logic in atropos_select_cpu). If no task is found, +// then greedy load stealing will attempt to find a task on another dispatch +// queue to run. +// +// Load balancing is almost entirely handled by userspace. BPF populates the +// task weight, dom mask and current dom in the task_data map and executes the +// load balance based on userspace populating the lb_data map. +#include "../../../scx_common.bpf.h" +#include "atropos.h" + +#include +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* + * const volatiles are set during initialization and treated as consts by the + * jit compiler. 
+ */ + +/* + * Domains and cpus + */ +const volatile __u32 nr_doms; +const volatile __u32 nr_cpus; +const volatile __u32 cpu_dom_id_map[MAX_CPUS]; +const volatile __u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; + +const volatile bool switch_all; +const volatile __u64 greedy_threshold = (u64)-1; + +/* base slice duration */ +const volatile __u64 slice_us = 20000; + +/* + * Exit info + */ +int exit_type = SCX_EXIT_NONE; +char exit_msg[SCX_EXIT_MSG_LEN]; + +struct pcpu_ctx { + __u32 dom_rr_cur; /* used when scanning other doms */ + + /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ + __u8 _padding[CACHELINE_SIZE - sizeof(u64)]; +} __attribute__((aligned(CACHELINE_SIZE))); + +struct pcpu_ctx pcpu_ctx[MAX_CPUS]; + +/* + * Statistics + */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, ATROPOS_NR_STATS); +} stats SEC(".maps"); + +static inline void stat_add(enum stat_idx idx, u64 addend) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p) += addend; +} + +// Map pid -> task_ctx +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, struct task_ctx); + __uint(max_entries, 1000000); + __uint(map_flags, 0); +} task_data SEC(".maps"); + +// This is populated from userspace to indicate which pids should be reassigned +// to new doms +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, u32); + __uint(max_entries, 1000); + __uint(map_flags, 0); +} lb_data SEC(".maps"); + +struct refresh_task_cpumask_loop_ctx { + struct task_struct *p; + struct task_ctx *ctx; +}; + +static int refresh_task_cpumask(u32 cpu, void *data) +{ + struct refresh_task_cpumask_loop_ctx *c = data; + struct task_struct *p = c->p; + struct task_ctx *ctx = c->ctx; + u64 mask = 1LLU << (cpu % 64); + const volatile __u64 *dptr; + + dptr = MEMBER_VPTR(dom_cpumasks, [ctx->dom_id][cpu / 64]); + if (!dptr) + return 1; + + if ((*dptr & mask) && + scx_bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + u64 *cptr = MEMBER_VPTR(ctx->cpumask, [cpu / 64]); + if (!cptr) + return 1; + *cptr |= mask; + } else { + u64 *cptr = MEMBER_VPTR(ctx->cpumask, [cpu / 64]); + if (!cptr) + return 1; + *cptr *= ~mask; + } + + return 0; +} + +static void task_set_dq(struct task_ctx *task_ctx, struct task_struct *p, + u32 dom_id) +{ + struct refresh_task_cpumask_loop_ctx lctx = { + .p = p, + .ctx = task_ctx, + }; + + task_ctx->dom_id = dom_id; + bpf_loop(nr_cpus, refresh_task_cpumask, &lctx, 0); +} + +s32 BPF_STRUCT_OPS(atropos_select_cpu, struct task_struct *p, int prev_cpu, + u32 wake_flags) +{ + s32 cpu; + + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + if (!task_ctx) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return prev_cpu; + } + + bool load_balanced = false; + u32 *new_dom = bpf_map_lookup_elem(&lb_data, &pid); + if (new_dom && *new_dom != task_ctx->dom_id) { + task_set_dq(task_ctx, p, *new_dom); + stat_add(ATROPOS_STAT_LOAD_BALANCE, 1); + load_balanced = true; + } + + /* + * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the + * local dq of the waker. 
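+	 * (WAKE_SYNC hints that the waker is about to sleep, so its CPU is
+	 * likely to go idle shortly and is a good place to run the wakee.)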
+ */ + if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { + struct task_struct *current = (void *)bpf_get_current_task(); + + if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && + task_ctx->dom_id < MAX_DOMS && + scx_bpf_has_idle_cpus_among_untyped( + (unsigned long)dom_cpumasks[task_ctx->dom_id])) { + cpu = bpf_get_smp_processor_id(); + if (scx_bpf_cpumask_test_cpu(cpu, + p->cpus_ptr)) { + stat_add(ATROPOS_STAT_WAKE_SYNC, 1); + goto local; + } + } + } + + /* if the previous CPU is idle, dispatch directly to it */ + if (!load_balanced) { + u8 prev_idle = scx_bpf_test_and_clear_cpu_idle(prev_cpu); + if (*(volatile u8 *)&prev_idle) { + stat_add(ATROPOS_STAT_PREV_IDLE, 1); + cpu = prev_cpu; + goto local; + } + } + + /* If only one core is allowed, dispatch */ + if (p->nr_cpus_allowed == 1) { + cpu = scx_bpf_cpumask_first_untyped( + (unsigned long)task_ctx->cpumask); + stat_add(ATROPOS_STAT_PINNED, 1); + goto local; + } + + /* Find an idle cpu and just dispatch */ + cpu = scx_bpf_pick_idle_cpu_untyped((unsigned long)task_ctx->cpumask); + if (cpu >= 0) { + stat_add(ATROPOS_STAT_DIRECT_DISPATCH, 1); + goto local; + } + + return prev_cpu; + +local: + task_ctx->dispatch_local = true; + return cpu; +} + +void BPF_STRUCT_OPS(atropos_enqueue, struct task_struct *p, u32 enq_flags) +{ + p->scx.slice = slice_us * 1000; + + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + if (!task_ctx) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + return; + } + + if (task_ctx->dispatch_local) { + task_ctx->dispatch_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + return; + } + + scx_bpf_dispatch(p, task_ctx->dom_id, SCX_SLICE_DFL, enq_flags); +} + +static u32 cpu_to_dom_id(s32 cpu) +{ + if (nr_doms <= 1) + return 0; + + if (cpu >= 0 && cpu < MAX_CPUS) { + u32 dom_id; + + /* + * XXX - idk why the verifier thinks cpu_dom_id_map[cpu] is not + * safe here. + */ + bpf_probe_read_kernel(&dom_id, sizeof(dom_id), + (const void *)&cpu_dom_id_map[cpu]); + return dom_id; + } else { + return MAX_DOMS; + } +} + +static bool is_cpu_in_dom(u32 cpu, u32 dom_id) +{ + u64 mask = 0; + + /* + * XXX - derefing two dimensional array triggers the verifier, use + * probe_read instead. 
+ */ + bpf_probe_read_kernel(&mask, sizeof(mask), + (const void *)&dom_cpumasks[dom_id][cpu / 64]); + return mask & (1LLU << (cpu % 64)); +} + +struct cpumask_intersects_domain_loop_ctx { + const struct cpumask *cpumask; + u32 dom_id; + bool ret; +}; + +static int cpumask_intersects_domain_loopfn(u32 idx, void *data) +{ + struct cpumask_intersects_domain_loop_ctx *lctx = data; + + if (scx_bpf_cpumask_test_cpu(idx, lctx->cpumask) && + is_cpu_in_dom(idx, lctx->dom_id)) { + lctx->ret = true; + return 1; + } + return 0; +} + +static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) +{ + struct cpumask_intersects_domain_loop_ctx lctx = { + .cpumask = cpumask, + .dom_id = dom_id, + .ret = false, + }; + + bpf_loop(nr_cpus, cpumask_intersects_domain_loopfn, &lctx, 0); + return lctx.ret; +} + +static u32 dom_rr_next(s32 cpu) +{ + if (cpu >= 0 && cpu < MAX_CPUS) { + struct pcpu_ctx *pcpuc = &pcpu_ctx[cpu]; + u32 dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms; + + if (dom_id == cpu_to_dom_id(cpu)) + dom_id = (dom_id + 1) % nr_doms; + + pcpuc->dom_rr_cur = dom_id; + return dom_id; + } + return 0; +} + +static int greedy_loopfn(s32 idx, void *data) +{ + u32 dom_id = dom_rr_next(*(s32 *)data); + + if (scx_bpf_dsq_nr_queued(dom_id) > greedy_threshold && + scx_bpf_consume(dom_id)) { + stat_add(ATROPOS_STAT_GREEDY, 1); + return 1; + } + return 0; +} + +void BPF_STRUCT_OPS(atropos_consume, s32 cpu) +{ + u32 dom = cpu_to_dom_id(cpu); + if (scx_bpf_consume(dom)) { + stat_add(ATROPOS_STAT_DSQ_DISPATCH, 1); + return; + } + + if (greedy_threshold != (u64)-1) + bpf_loop(nr_doms - 1, greedy_loopfn, &cpu, 0); +} + +struct pick_task_domain_loop_ctx { + struct task_struct *p; + const struct cpumask *cpumask; + u64 dom_mask; + u32 dom_rr_base; + u32 dom_id; +}; + +static int pick_task_domain_loopfn(u32 idx, void *data) +{ + struct pick_task_domain_loop_ctx *lctx = data; + u32 dom_id = (lctx->dom_rr_base + idx) % nr_doms; + + if (dom_id >= MAX_DOMS) + return 1; + + if (cpumask_intersects_domain(lctx->cpumask, dom_id)) { + lctx->dom_mask |= 1LLU << dom_id; + if (lctx->dom_id == MAX_DOMS) + lctx->dom_id = dom_id; + } + return 0; +} + +static u32 pick_task_domain(struct task_ctx *task_ctx, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct pick_task_domain_loop_ctx lctx = { + .p = p, + .cpumask = cpumask, + .dom_id = MAX_DOMS, + }; + s32 cpu = bpf_get_smp_processor_id(); + + if (cpu < 0 || cpu >= MAX_CPUS) + return MAX_DOMS; + + lctx.dom_rr_base = ++(pcpu_ctx[cpu].dom_rr_cur); + + bpf_loop(nr_doms, pick_task_domain_loopfn, &lctx, 0); + task_ctx->dom_mask = lctx.dom_mask; + + return lctx.dom_id; +} + +static void task_set_domain(struct task_ctx *task_ctx, struct task_struct *p, + const struct cpumask *cpumask) +{ + u32 dom_id = 0; + + if (nr_doms > 1) + dom_id = pick_task_domain(task_ctx, p, cpumask); + + task_set_dq(task_ctx, p, dom_id); +} + +void BPF_STRUCT_OPS(atropos_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + pid_t pid = p->pid; + struct task_ctx *task_ctx = bpf_map_lookup_elem(&task_data, &pid); + if (!task_ctx) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return; + } + + task_set_domain(task_ctx, p, cpumask); +} + +void BPF_STRUCT_OPS(atropos_enable, struct task_struct *p) +{ + struct task_ctx task_ctx; + memset(&task_ctx, 0, sizeof(task_ctx)); + task_ctx.weight = p->scx.weight; + + task_set_domain(&task_ctx, p, p->cpus_ptr); + pid_t pid = p->pid; + long ret = + bpf_map_update_elem(&task_data, &pid, &task_ctx, BPF_NOEXIST); + if (ret) { + 
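+		/* Recording the new task's context failed (the entry already
+		 * exists or the map is full); bump the error counter so
+		 * userspace can notice. */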
stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return; + } +} + +void BPF_STRUCT_OPS(atropos_disable, struct task_struct *p) +{ + pid_t pid = p->pid; + long ret = bpf_map_delete_elem(&task_data, &pid); + if (ret) { + stat_add(ATROPOS_STAT_TASK_GET_ERR, 1); + return; + } +} + +int BPF_STRUCT_OPS(atropos_init) +{ + int ret; + u32 local_nr_doms = nr_doms; + + bpf_printk("atropos init"); + + if (switch_all) + scx_bpf_switch_all(); + + // BPF verifier gets cranky if we don't bound this like so + if (local_nr_doms > MAX_DOMS) + local_nr_doms = MAX_DOMS; + + for (u32 i = 0; i < local_nr_doms; ++i) { + ret = scx_bpf_create_dsq(i, -1); + if (ret < 0) + return ret; + } + + for (u32 i = 0; i < nr_cpus; ++i) { + pcpu_ctx[i].dom_rr_cur = i; + } + + return 0; +} + +void BPF_STRUCT_OPS(atropos_exit, struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); + exit_type = ei->type; +} + +SEC(".struct_ops") +struct sched_ext_ops atropos = { + .select_cpu = (void *)atropos_select_cpu, + .enqueue = (void *)atropos_enqueue, + .consume = (void *)atropos_consume, + .set_cpumask = (void *)atropos_set_cpumask, + .enable = (void *)atropos_enable, + .disable = (void *)atropos_disable, + .init = (void *)atropos_init, + .exit = (void *)atropos_exit, + .flags = 0, + .name = "atropos", +}; diff --git a/tools/sched_ext/atropos/src/bpf/atropos.h b/tools/sched_ext/atropos/src/bpf/atropos.h new file mode 100644 index 000000000000..9a2bfee8b1d6 --- /dev/null +++ b/tools/sched_ext/atropos/src/bpf/atropos.h @@ -0,0 +1,38 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#ifndef __ATROPOS_H +#define __ATROPOS_H + +#include + +#define MAX_CPUS 512 +#define MAX_DOMS 64 /* limited to avoid complex bitmask ops */ +#define CACHELINE_SIZE 64 + +/* Statistics */ +enum stat_idx { + ATROPOS_STAT_TASK_GET_ERR, + ATROPOS_STAT_TASK_GET_ERR_ENABLE, + ATROPOS_STAT_CPUMASK_ERR, + ATROPOS_STAT_WAKE_SYNC, + ATROPOS_STAT_PREV_IDLE, + ATROPOS_STAT_PINNED, + ATROPOS_STAT_DIRECT_DISPATCH, + ATROPOS_STAT_DSQ_DISPATCH, + ATROPOS_STAT_GREEDY, + ATROPOS_STAT_LOAD_BALANCE, + ATROPOS_STAT_LAST_TASK, + ATROPOS_NR_STATS, +}; + +struct task_ctx { + unsigned long long dom_mask; /* the domains this task can run on */ + unsigned long long cpumask[MAX_CPUS / 64]; + unsigned int dom_id; + unsigned int weight; + bool dispatch_local; +}; + +#endif /* __ATROPOS_H */ diff --git a/tools/sched_ext/atropos/src/main.rs b/tools/sched_ext/atropos/src/main.rs new file mode 100644 index 000000000000..b9ae312a562f --- /dev/null +++ b/tools/sched_ext/atropos/src/main.rs @@ -0,0 +1,648 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#![deny(clippy::all)] +use std::collections::BTreeMap; +use std::ffi::CStr; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use ::fb_procfs as procfs; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use bitvec::prelude::*; +use clap::Parser; + +mod util; + +oss_shim!(); + +/// Atropos is a multi-domain BPF / userspace hybrid scheduler where the BPF +/// part does simple round robin in each domain and the userspace part +/// calculates the load factor of each domain and tells the BPF part how to load +/// balance the domains. 
+
+/// This scheduler demonstrates dividing scheduling logic between BPF and
+/// userspace and using Rust to build the userspace part. An earlier variant of
+/// this scheduler was used to balance across six domains, each representing a
+/// chiplet in a six-chiplet AMD processor, and could match the performance of
+/// the production setup using CFS.
+#[derive(Debug, Parser)]
+struct Opt {
+    /// Set the log level for more or less verbose output. --log_level=debug
+    /// will output libbpf verbose details.
+    #[clap(short, long, default_value = "info")]
+    log_level: String,
+    /// Set the cpumask for a domain, provide multiple --cpumasks, one for each
+    /// domain. E.g. --cpumasks 0xff_00ff --cpumasks 0xff00 will create two
+    /// domains with the corresponding CPUs belonging to each domain. Each CPU
+    /// must belong to precisely one domain.
+    #[clap(short, long, required = true, min_values = 1)]
+    cpumasks: Vec<String>,
+    /// Switch all tasks to sched_ext. If not specified, only tasks which
+    /// have their scheduling policy set to SCHED_EXT using
+    /// sched_setscheduler(2) are switched.
+    #[clap(short, long, default_value = "false")]
+    all: bool,
+    /// Enable load balancing. Periodically userspace will calculate the load
+    /// factor of each domain and instruct BPF which processes to move.
+    #[clap(short, long, default_value = "true")]
+    load_balance: bool,
+    /// Enable greedy task stealing. When a domain is idle, a cpu will attempt
+    /// to steal tasks from a domain with at least greedy_threshold tasks
+    /// enqueued. These tasks aren't permanently stolen from the domain.
+    #[clap(short, long)]
+    greedy_threshold: Option<u64>,
+}
+
+type CpusetDqPair = (Vec<BitVec<u64, Lsb0>>, Vec<i32>);
+
+// Returns a Vec of cpusets, one for each dq, and a vec of dq ids, one for each cpu.
+// A worked decoding example is sketched right after this function.
+fn parse_cpusets(cpumasks: &[String]) -> anyhow::Result<CpusetDqPair> {
+    if cpumasks.len() > atropos_sys::MAX_DOMS as usize {
+        bail!(
+            "Number of requested DSQs ({}) is greater than MAX_DOMS ({})",
+            cpumasks.len(),
+            atropos_sys::MAX_DOMS
+        );
+    }
+    let num_cpus = libbpf_rs::num_possible_cpus()?;
+    if num_cpus > atropos_sys::MAX_CPUS as usize {
+        bail!(
+            "num_cpus ({}) is greater than MAX_CPUS ({})",
+            num_cpus,
+            atropos_sys::MAX_CPUS,
+        );
+    }
+    let mut cpus = vec![-1i32; num_cpus];
+    let mut cpusets = vec![bitvec![u64, Lsb0; 0; atropos_sys::MAX_CPUS as usize]; cpumasks.len()];
+    for (dq, cpumask) in cpumasks.iter().enumerate() {
+        let hex_str = {
+            let mut tmp_str = cpumask
+                .strip_prefix("0x")
+                .unwrap_or(cpumask)
+                .replace('_', "");
+            if tmp_str.len() % 2 != 0 {
+                tmp_str = "0".to_string() + &tmp_str;
+            }
+            tmp_str
+        };
+        let byte_vec = hex::decode(&hex_str)
+            .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?;
+
+        for (index, &val) in byte_vec.iter().rev().enumerate() {
+            let mut v = val;
+            while v != 0 {
+                let lsb = v.trailing_zeros() as usize;
+                v &= !(1 << lsb);
+                let cpu = index * 8 + lsb;
+                if cpu >= num_cpus {
+                    bail!(
+                        concat!(
+                            "Found cpu ({}) in cpumask ({}) which is larger",
+                            " than the number of cpus on the machine ({})"
+                        ),
+                        cpu,
+                        cpumask,
+                        num_cpus
+                    );
+                }
+                if cpus[cpu] != -1 {
+                    bail!(
+                        "Found cpu ({}) with dq ({}) but also in cpumask ({})",
+                        cpu,
+                        cpus[cpu],
+                        cpumask
+                    );
+                }
+                cpus[cpu] = dq as i32;
+                cpusets[dq].set(cpu, true);
+            }
+        }
+        cpusets[dq].set_uninitialized(false);
+    }
+
+    for (cpu, &dq) in cpus.iter().enumerate() {
+        if dq < 0 {
+            bail!(
+                "Cpu {} not assigned to any dq. Make sure it is covered by some --cpumasks argument.",
+                cpu
+            );
+        }
+    }
+
+    Ok((cpusets, cpus))
+}
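For illustration only (not part of the patch), here is a minimal standalone sketch of the same hex walk parse_cpusets() performs on a --cpumasks value; the helper name cpus_in_mask is made up, and it avoids the hex and bitvec crates so it can run on its own:

  // Maps a cpumask string such as "0xff_00ff" to CPU indices, mirroring the
  // byte/bit walk in parse_cpusets(). Hypothetical helper, for illustration.
  fn cpus_in_mask(cpumask: &str) -> Result<Vec<usize>, String> {
      let mut hex_str = cpumask
          .strip_prefix("0x")
          .unwrap_or(cpumask)
          .replace('_', "");
      if hex_str.len() % 2 != 0 {
          hex_str = "0".to_string() + &hex_str; // pad to whole bytes
      }
      // Decode two hex digits at a time instead of pulling in the hex crate.
      let mut bytes = Vec::new();
      let mut i = 0;
      while i < hex_str.len() {
          let byte = u8::from_str_radix(&hex_str[i..i + 2], 16)
              .map_err(|e| format!("bad cpumask {}: {}", cpumask, e))?;
          bytes.push(byte);
          i += 2;
      }
      let mut cpus = Vec::new();
      // The least significant byte is last in the string, so walk in reverse.
      for (index, &val) in bytes.iter().rev().enumerate() {
          let mut v = val;
          while v != 0 {
              let lsb = v.trailing_zeros() as usize;
              v &= !(1 << lsb);
              cpus.push(index * 8 + lsb);
          }
      }
      Ok(cpus)
  }

  fn main() {
      // "0xff_00ff" selects CPUs 0-7 and 16-23.
      println!("{:?}", cpus_in_mask("0xff_00ff").unwrap());
  }
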
+
+struct Sample {
+    total_cpu: procfs::CpuStat,
+}
+
+fn get_cpustats(reader: &mut procfs::ProcReader) -> anyhow::Result<Sample> {
+    let stat = reader.read_stat().context("Failed to read procfs")?;
+    Ok(Sample {
+        total_cpu: stat
+            .total_cpu
+            .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))?,
+    })
+}
+
+fn calculate_cpu_busy(prev: &procfs::CpuStat, next: &procfs::CpuStat) -> anyhow::Result<f64> {
+    match (prev, next) {
+        (
+            procfs::CpuStat {
+                user_usec: Some(prev_user),
+                nice_usec: Some(prev_nice),
+                system_usec: Some(prev_system),
+                idle_usec: Some(prev_idle),
+                iowait_usec: Some(prev_iowait),
+                irq_usec: Some(prev_irq),
+                softirq_usec: Some(prev_softirq),
+                stolen_usec: Some(prev_stolen),
+                guest_usec: _,
+                guest_nice_usec: _,
+            },
+            procfs::CpuStat {
+                user_usec: Some(curr_user),
+                nice_usec: Some(curr_nice),
+                system_usec: Some(curr_system),
+                idle_usec: Some(curr_idle),
+                iowait_usec: Some(curr_iowait),
+                irq_usec: Some(curr_irq),
+                softirq_usec: Some(curr_softirq),
+                stolen_usec: Some(curr_stolen),
+                guest_usec: _,
+                guest_nice_usec: _,
+            },
+        ) => {
+            let idle_usec = curr_idle - prev_idle;
+            let iowait_usec = curr_iowait - prev_iowait;
+            let user_usec = curr_user - prev_user;
+            let system_usec = curr_system - prev_system;
+            let nice_usec = curr_nice - prev_nice;
+            let irq_usec = curr_irq - prev_irq;
+            let softirq_usec = curr_softirq - prev_softirq;
+            let stolen_usec = curr_stolen - prev_stolen;
+
+            let busy_usec =
+                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
+            let total_usec = idle_usec + busy_usec + iowait_usec;
+            Ok(busy_usec as f64 / total_usec as f64)
+        }
+        _ => {
+            bail!("Some procfs stats are not populated!");
+        }
+    }
+}
+
+fn calculate_pid_busy(
+    prev: &procfs::PidStat,
+    next: &procfs::PidStat,
+    dur: std::time::Duration,
+) -> anyhow::Result<f64> {
+    match (
+        (prev.user_usecs, prev.system_usecs),
+        (next.user_usecs, next.system_usecs),
+    ) {
+        ((Some(prev_user), Some(prev_system)), (Some(next_user), Some(next_system))) => {
+            if (next_user >= prev_user) && (next_system >= prev_system) {
+                let busy_usec = next_user + next_system - prev_user - prev_system;
+                Ok(busy_usec as f64 / dur.as_micros() as f64)
+            } else {
+                bail!("Pid usage values look wrong");
+            }
+        }
+        _ => {
+            bail!("Some procfs stats are not populated!");
+        }
+    }
+}
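To make the accounting concrete, here is a toy calculation (illustrative numbers only, not part of the patch) of the busy fraction that calculate_cpu_busy() would report over a one-second window:

  // Toy deltas over a 1s window: 600ms busy (user+system+nice+irq+softirq+stolen),
  // 300ms idle and 100ms iowait.
  fn main() {
      let busy_usec = 400_000u64 + 150_000 + 20_000 + 10_000 + 15_000 + 5_000;
      let idle_usec = 300_000u64;
      let iowait_usec = 100_000u64;
      let total_usec = idle_usec + busy_usec + iowait_usec;
      // Same formula as calculate_cpu_busy(): busy / (idle + busy + iowait) = 0.6
      println!("cpu busy fraction = {}", busy_usec as f64 / total_usec as f64);
  }
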
+
+struct PidInfo {
+    pub pid: i32,
+    pub dom: u32,
+    pub dom_mask: u64,
+}
+
+struct LoadInfo {
+    pids_by_milliload: BTreeMap<u64, PidInfo>,
+    pid_stats: BTreeMap<i32, procfs::PidStat>,
+    global_load_sum: f64,
+    dom_load: Vec<f64>,
+}
+
+// We calculate the load for each task and then for each dom by enumerating all
+// the tasks in task_data and calculating their CPU util from procfs.
+
+// Given a procfs reader, the task_data map, and the pid stats from the previous
+// iteration, return:
+// * a sorted map from milliload -> pid info,
+// * a map from pid -> pidstat,
+// * a vec of per-dom loads.
+fn calculate_load(
+    proc_reader: &procfs::ProcReader,
+    task_data: &libbpf_rs::Map,
+    interval: std::time::Duration,
+    prev_pid_stat: &BTreeMap<i32, procfs::PidStat>,
+    nr_doms: usize,
+) -> anyhow::Result<LoadInfo> {
+    let mut ret = LoadInfo {
+        pids_by_milliload: BTreeMap::new(),
+        pid_stats: BTreeMap::new(),
+        global_load_sum: 0f64,
+        dom_load: vec![0f64; nr_doms],
+    };
+    for key in task_data.keys() {
+        if let Some(task_ctx_vec) = task_data
+            .lookup(&key, libbpf_rs::MapFlags::ANY)
+            .context("Failed to lookup task_data")?
+        {
+            let task_ctx =
+                unsafe { &*(task_ctx_vec.as_slice().as_ptr() as *const atropos_sys::task_ctx) };
+            let pid = i32::from_ne_bytes(
+                key.as_slice()
+                    .try_into()
+                    .context("Invalid key length in task_data map")?,
+            );
+            match proc_reader.read_tid_stat(pid as u32) {
+                Ok(stat) => {
+                    ret.pid_stats.insert(pid, stat);
+                }
+                Err(procfs::Error::IoError(_, ref e))
+                    if e.raw_os_error()
+                        .map_or(false, |ec| ec == 2 || ec == 3 /* ENOENT or ESRCH */) =>
+                {
+                    continue;
+                }
+                Err(e) => {
+                    bail!(e);
+                }
+            }
+            let pid_load = match (prev_pid_stat.get(&pid), ret.pid_stats.get(&pid)) {
+                (Some(prev_pid_stat), Some(next_pid_stat)) => {
+                    calculate_pid_busy(prev_pid_stat, next_pid_stat, interval)?
+                }
+                // If we don't have any utilization #s for the process, just skip it
+                _ => {
+                    continue;
+                }
+            } * task_ctx.weight as f64;
+            if !pid_load.is_finite() || pid_load <= 0.0 {
+                continue;
+            }
+            ret.global_load_sum += pid_load;
+            ret.dom_load[task_ctx.dom_id as usize] += pid_load;
+            // Only record pids that are eligible for load balancing
+            if task_ctx.dom_mask == (1u64 << task_ctx.dom_id) {
+                continue;
+            }
+            ret.pids_by_milliload.insert(
+                (pid_load * 1000.0) as u64,
+                PidInfo {
+                    pid,
+                    dom: task_ctx.dom_id,
+                    dom_mask: task_ctx.dom_mask,
+                },
+            );
+        }
+    }
+    Ok(ret)
+}
+
+#[derive(Copy, Clone, Default)]
+struct DomLoadBalanceInfo {
+    load_to_pull: f64,
+    load_to_give: f64,
+}
+
+#[derive(Default)]
+struct LoadBalanceInfo {
+    doms: Vec<DomLoadBalanceInfo>,
+    doms_with_load_to_pull: BTreeMap<u32, f64>,
+    doms_with_load_to_give: BTreeMap<u32, f64>,
+}
+
+// To balance dom loads we identify doms with lower and higher load than average
+fn calculate_dom_load_balance(global_load_avg: f64, dom_load: &[f64]) -> LoadBalanceInfo {
+    let mut ret = LoadBalanceInfo::default();
+    ret.doms.resize(dom_load.len(), Default::default());
+
+    const LOAD_IMBAL_HIGH_PCT: f64 = 0.10;
+    const LOAD_IMBAL_MAX_ADJ_PCT: f64 = 0.10;
+    let high = global_load_avg * LOAD_IMBAL_HIGH_PCT;
+    let adj_max = global_load_avg * LOAD_IMBAL_MAX_ADJ_PCT;
+
+    for (dom, dom_load) in dom_load.iter().enumerate() {
+        let mut imbal = dom_load - global_load_avg;
+
+        let mut dom_load_to_pull = 0f64;
+        let mut dom_load_to_give = 0f64;
+        if imbal >= 0f64 {
+            dom_load_to_give = imbal;
+        } else {
+            imbal = -imbal;
+            if imbal > high {
+                dom_load_to_pull = f64::min(imbal, adj_max);
+            }
+        }
+        ret.doms[dom].load_to_pull = dom_load_to_pull;
+        ret.doms[dom].load_to_give = dom_load_to_give;
+        if dom_load_to_pull > 0f64 {
+            ret.doms_with_load_to_pull
+                .insert(dom as u32, dom_load_to_pull);
+        }
+        if dom_load_to_give > 0f64 {
+            ret.doms_with_load_to_give
+                .insert(dom as u32, dom_load_to_give);
+        }
+    }
+    ret
+}
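As a quick worked example (illustrative numbers only, not part of the patch) of the 10% thresholds above, four domains with loads 30, 10, 22 and 18 against a global average of 20 come out as follows:

  fn main() {
      let global_load_avg = 20.0_f64;
      let high = global_load_avg * 0.10; // LOAD_IMBAL_HIGH_PCT
      let adj_max = global_load_avg * 0.10; // LOAD_IMBAL_MAX_ADJ_PCT
      for (dom, load) in [30.0_f64, 10.0, 22.0, 18.0].into_iter().enumerate() {
          let imbal = load - global_load_avg;
          let (to_give, to_pull) = if imbal >= 0.0 {
              (imbal, 0.0)
          } else if -imbal > high {
              (0.0, f64::min(-imbal, adj_max))
          } else {
              (0.0, 0.0)
          };
          // dom 0 gives 10.0, dom 1 pulls 2.0 (capped at adj_max),
          // dom 2 gives 2.0, dom 3 is within the 10% band and is left alone.
          println!("dom {}: to_give={:.1} to_pull={:.1}", dom, to_give, to_pull);
      }
  }
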
+
+fn clear_map(map: &mut libbpf_rs::Map) {
+    // XXX: libbpf_rs has a design flaw that makes it impossible to delete
+    // while iterating even though it is safe, so we alias the map here.
+    let deleter: &mut libbpf_rs::Map = unsafe { &mut *(map as *mut _) };
+    for key in map.keys() {
+        let _ = deleter.delete(&key);
+    }
+}
+
+// Actually execute the load balancing. Concretely this writes pid -> dom
+// entries into the lb_data map for the BPF side to consume.
+//
+// The logic here is simple: greedily balance the heaviest-load processes until
+// either we have no doms with load to give or no doms with load to pull.
+fn load_balance(
+    global_load_avg: f64,
+    lb_data: &mut libbpf_rs::Map,
+    pids_by_milliload: &BTreeMap<u64, PidInfo>,
+    mut doms_with_load_to_pull: BTreeMap<u32, f64>,
+    mut doms_with_load_to_give: BTreeMap<u32, f64>,
+) -> anyhow::Result<()> {
+    clear_map(lb_data);
+    const LOAD_IMBAL_MIN_ADJ_PCT: f64 = 0.01;
+    let adj_min = global_load_avg * LOAD_IMBAL_MIN_ADJ_PCT;
+    for (pid_milliload, pidinfo) in pids_by_milliload.iter().rev() {
+        if doms_with_load_to_give.is_empty() || doms_with_load_to_pull.is_empty() {
+            break;
+        }
+
+        let pid_load = *pid_milliload as f64 / 1000f64;
+        let mut remove_to_give = None;
+        let mut remove_to_pull = None;
+        if let Some(dom_imbal) = doms_with_load_to_give.get_mut(&pidinfo.dom) {
+            if *dom_imbal < pid_load {
+                continue;
+            }
+
+            for (new_dom, new_dom_imbal) in doms_with_load_to_pull.iter_mut() {
+                if (pidinfo.dom_mask & (1 << new_dom)) == 0 || *new_dom_imbal < pid_load {
+                    continue;
+                }
+
+                *dom_imbal -= pid_load;
+                if *dom_imbal <= adj_min {
+                    remove_to_give = Some(pidinfo.dom);
+                }
+                *new_dom_imbal -= pid_load;
+                if *new_dom_imbal <= adj_min {
+                    remove_to_pull = Some(*new_dom);
+                }
+
+                lb_data
+                    .update(
+                        &(pidinfo.pid as libc::pid_t).to_ne_bytes(),
+                        &new_dom.to_ne_bytes(),
+                        libbpf_rs::MapFlags::NO_EXIST,
+                    )
+                    .context("Failed to update lb_data")?;
+                break;
+            }
+        }
+
+        remove_to_give.map(|dom| doms_with_load_to_give.remove(&dom));
+        remove_to_pull.map(|dom| doms_with_load_to_pull.remove(&dom));
+    }
+    Ok(())
+}
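The greedy pass is easier to follow on toy data; this standalone sketch (not part of the patch, no BPF maps involved, pids and loads made up) mirrors the pairing loop in load_balance():

  use std::collections::BTreeMap;

  fn main() {
      let mut to_give: BTreeMap<u32, f64> = BTreeMap::from([(0, 10.0)]); // dom 0 is overloaded
      let mut to_pull: BTreeMap<u32, f64> = BTreeMap::from([(1, 2.0)]);  // dom 1 wants load
      // milliload -> (pid, src_dom, dom_mask); heaviest tasks are visited first
      // because the iterator is reversed, like pids_by_milliload above.
      let tasks: BTreeMap<u64, (i32, u32, u64)> =
          BTreeMap::from([(1500, (101, 0, 0b11)), (800, (102, 0, 0b01))]);

      for (milliload, (pid, src_dom, dom_mask)) in tasks.iter().rev() {
          let load = *milliload as f64 / 1000.0;
          if let Some(give) = to_give.get_mut(src_dom) {
              if *give < load {
                  continue;
              }
              for (dst_dom, pull) in to_pull.iter_mut() {
                  // The task must be allowed to run in the destination domain and
                  // the destination must still want at least this much load.
                  if *dom_mask & (1u64 << *dst_dom) == 0 || *pull < load {
                      continue;
                  }
                  *give -= load;
                  *pull -= load;
                  println!("move pid {} from dom {} to dom {}", pid, src_dom, dst_dom);
                  break;
              }
          }
      }
      // Only pid 101 (load 1.5, allowed in doms 0 and 1) is moved to dom 1;
      // pid 102 is pinned to dom 0 by its dom_mask and stays put.
  }
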
+
+fn print_stats(
+    logger: slog::Logger,
+    stats_map: &mut libbpf_rs::Map,
+    nr_doms: usize,
+    nr_cpus: usize,
+    cpu_busy: f64,
+    global_load_avg: f64,
+    dom_load: &[f64],
+    dom_lb_info: &[DomLoadBalanceInfo],
+) -> anyhow::Result<()> {
+    let stats = {
+        let mut stats: Vec<u64> = Vec::new();
+        let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; nr_cpus];
+        for stat in 0..atropos_sys::stat_idx_ATROPOS_NR_STATS {
+            let cpu_stat_vec = stats_map
+                .lookup_percpu(&(stat as u32).to_ne_bytes(), libbpf_rs::MapFlags::ANY)
+                .with_context(|| format!("Failed to lookup stat {}", stat))?
+                .expect("per-cpu stat should exist");
+            let sum = cpu_stat_vec
+                .iter()
+                .map(|val| {
+                    u64::from_ne_bytes(
+                        val.as_slice()
+                            .try_into()
+                            .expect("Invalid value length in stat map"),
+                    )
+                })
+                .sum();
+            stats_map
+                .update_percpu(
+                    &(stat as u32).to_ne_bytes(),
+                    &zero_vec,
+                    libbpf_rs::MapFlags::ANY,
+                )
+                .context("Failed to zero stat")?;
+            stats.push(sum);
+        }
+        stats
+    };
+    let mut total = 0;
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_PINNED as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_GREEDY as usize];
+    total += stats[atropos_sys::stat_idx_ATROPOS_STAT_LAST_TASK as usize];
+    slog::info!(logger, "cpu={:5.1}", cpu_busy * 100.0);
+    slog::info!(
+        logger,
+        "task_get_errs: {}, cpumask_errs: {}",
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_TASK_GET_ERR as usize],
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_CPUMASK_ERR as usize],
+    );
+    slog::info!(
+        logger,
+        "tot={:6} wake_sync={:4.1},prev_idle={:4.1},pinned={:4.1},direct={:4.1},dq={:4.1},greedy={:4.1}",
+        total,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_WAKE_SYNC as usize] as f64 / total as f64 * 100f64,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_PREV_IDLE as usize] as f64 / total as f64 * 100f64,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_PINNED as usize] as f64 / total as f64 * 100f64,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_DIRECT_DISPATCH as usize] as f64 / total as f64
+            * 100f64,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_DSQ_DISPATCH as usize] as f64 / total as f64
+            * 100f64,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_GREEDY as usize] as f64 / total as f64 * 100f64,
+    );
+
+    slog::info!(
+        logger,
+        "load_avg:{:.1}, load_balances={}",
+        global_load_avg,
+        stats[atropos_sys::stat_idx_ATROPOS_STAT_LOAD_BALANCE as usize]
+    );
+    for i in 0..nr_doms {
+        slog::info!(logger, "DOM[{:02}]", i);
+        slog::info!(
+            logger,
+            " load={:.1} to_pull={:.1},to_give={:.1}",
+            dom_load[i],
+            dom_lb_info[i].load_to_pull,
+            dom_lb_info[i].load_to_give,
+        );
+    }
+    Ok(())
+}
+
+pub fn run(
+    logger: slog::Logger,
+    debug: bool,
+    cpumasks: Vec<String>,
+    switch_all: bool,
+    balance_load: bool,
+    greedy_threshold: Option<u64>,
+) -> anyhow::Result<()> {
+    slog::info!(logger, "Atropos Scheduler Initialized");
+    let mut skel_builder = AtroposSkelBuilder::default();
+    skel_builder.obj_builder.debug(debug);
+    let mut skel = skel_builder.open().context("Failed to open BPF program")?;
+
+    let (cpusets, cpus) = parse_cpusets(&cpumasks)?;
+    let nr_doms = cpusets.len();
+    let nr_cpus = libbpf_rs::num_possible_cpus()?;
+    skel.rodata().nr_doms = nr_doms as u32;
+    skel.rodata().nr_cpus = nr_cpus as u32;
+
+    for (cpu, dom) in cpus.iter().enumerate() {
+        skel.rodata().cpu_dom_id_map[cpu] = *dom as u32;
+    }
+
+    for (dom, cpuset) in cpusets.iter().enumerate() {
+        let raw_cpuset_slice = cpuset.as_raw_slice();
+        let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom];
+        let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpuset_slice.len());
+        left.clone_from_slice(cpuset.as_raw_slice());
+        slog::info!(logger, "dom {} cpumask {:X?}", dom, dom_cpumask_slice);
+    }
+
+    skel.rodata().switch_all = switch_all;
+
+    if let Some(greedy) = greedy_threshold {
+        skel.rodata().greedy_threshold = greedy;
+    }
+
+    let mut skel = skel.load().context("Failed to load BPF program")?;
+    skel.attach().context("Failed to attach BPF program")?;
+
+    let _structops = skel
+        .maps_mut()
+        .atropos()
+        .attach_struct_ops()
+        .context("Failed to attach atropos struct ops")?;
+    slog::info!(logger, "Atropos Scheduler Attached");
+    let shutdown = Arc::new(AtomicBool::new(false));
+    let shutdown_clone = shutdown.clone();
+    ctrlc::set_handler(move || {
+        shutdown_clone.store(true, Ordering::Relaxed);
+    })
+    .context("Error setting Ctrl-C handler")?;
+
+    let mut proc_reader = procfs::ProcReader::new();
+    let mut prev_sample = get_cpustats(&mut proc_reader)?;
+    let mut prev_pid_stat: BTreeMap<i32, procfs::PidStat> = BTreeMap::new();
+    while !shutdown.load(Ordering::Relaxed)
+        && unsafe { std::ptr::read_volatile(&skel.bss().exit_type as *const _) } == 0
+    {
+        let interval = std::time::Duration::from_secs(1);
+        std::thread::sleep(interval);
+        let now = std::time::SystemTime::now();
+        let next_sample = get_cpustats(&mut proc_reader)?;
+        let cpu_busy = calculate_cpu_busy(&prev_sample.total_cpu, &next_sample.total_cpu)?;
+        prev_sample = next_sample;
+        let load_info = calculate_load(
+            &proc_reader,
+            skel.maps().task_data(),
+            interval,
+            &prev_pid_stat,
+            nr_doms,
+        )?;
+        prev_pid_stat = load_info.pid_stats;
+
+        let global_load_avg = load_info.global_load_sum / nr_doms as f64;
+        let mut lb_info = calculate_dom_load_balance(global_load_avg, &load_info.dom_load);
+
+        let doms_with_load_to_pull = std::mem::take(&mut lb_info.doms_with_load_to_pull);
+        let doms_with_load_to_give = std::mem::take(&mut lb_info.doms_with_load_to_give);
+        if balance_load {
+            load_balance(
+                global_load_avg,
+                skel.maps_mut().lb_data(),
+                &load_info.pids_by_milliload,
+                doms_with_load_to_pull,
+                doms_with_load_to_give,
+            )?;
+            slog::info!(
+                logger,
+                "Load balancing took {:?}",
+                now.elapsed().context("Getting a duration failed")?
+            );
+        }
+        print_stats(
+            logger.clone(),
+            skel.maps_mut().stats(),
+            nr_doms,
+            nr_cpus,
+            cpu_busy,
+            global_load_avg,
+            &load_info.dom_load,
+            &lb_info.doms,
+        )?;
+    }
+    /* Report msg if EXT_OPS_EXIT_ERROR */
+    if skel.bss().exit_type == 2 {
+        let exit_msg_cstr = unsafe { CStr::from_ptr(skel.bss().exit_msg.as_ptr() as *const _) };
+        let exit_msg = exit_msg_cstr
+            .to_str()
+            .context("Failed to convert exit msg to string")?;
+        eprintln!("exit_type={} msg={}", skel.bss().exit_type, exit_msg);
+    }
+    Ok(())
+}
+
+fn main() -> anyhow::Result<()> {
+    let opts = Opt::parse();
+    let logger = setup_logger(&opts.log_level)?;
+    let debug = opts.log_level == "debug";
+
+    run(
+        logger,
+        debug,
+        opts.cpumasks,
+        opts.all,
+        opts.load_balance,
+        opts.greedy_threshold,
+    )
+}
diff --git a/tools/sched_ext/atropos/src/oss/atropos_sys.rs b/tools/sched_ext/atropos/src/oss/atropos_sys.rs
new file mode 100644
index 000000000000..bbeaf856d40e
--- /dev/null
+++ b/tools/sched_ext/atropos/src/oss/atropos_sys.rs
@@ -0,0 +1,10 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(dead_code)]
+
+include!(concat!(env!("OUT_DIR"), "/atropos-sys.rs"));
diff --git a/tools/sched_ext/atropos/src/oss/mod.rs b/tools/sched_ext/atropos/src/oss/mod.rs
new file mode 100644
index 000000000000..5afcf35f777d
--- /dev/null
+++ b/tools/sched_ext/atropos/src/oss/mod.rs
@@ -0,0 +1,29 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+#[path = "../bpf/.output/atropos.skel.rs"]
+mod atropos;
+use std::str::FromStr;
+
+use anyhow::bail;
+pub use atropos::*;
+use slog::o;
+use slog::Drain;
+use slog::Level;
+
+pub mod atropos_sys;
+
+pub fn setup_logger(level: &str) -> anyhow::Result<slog::Logger> {
+    let log_level = match Level::from_str(level) {
+        Ok(l) => l,
+        Err(()) => bail!("Failed to parse \"{}\" as a log level", level),
+    };
+    let decorator = slog_term::TermDecorator::new().build();
+    let drain = slog_term::FullFormat::new(decorator).build().fuse();
+    let drain = slog_async::Async::new(drain)
+        .build()
+        .filter_level(log_level)
+        .fuse();
+    Ok(slog::Logger::root(drain, o!()))
+}
diff --git a/tools/sched_ext/atropos/src/util.rs b/tools/sched_ext/atropos/src/util.rs
new file mode 100644
index 000000000000..eae414c0919a
--- /dev/null
+++ b/tools/sched_ext/atropos/src/util.rs
@@ -0,0 +1,24 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+
+// Shim between facebook types and open source types.
+//
+// The type interfaces and module hierarchy should be identical on
+// both "branches". And since we glob import, all the submodules in
+// this crate will inherit our name bindings and can use generic paths,
+// eg `crate::logging::setup(..)`.
+#[macro_export]
+macro_rules! oss_shim {
+    () => {
+        #[cfg(fbcode_build)]
+        mod facebook;
+        #[cfg(fbcode_build)]
+        use facebook::*;
+        #[cfg(not(fbcode_build))]
+        mod oss;
+        #[cfg(not(fbcode_build))]
+        use oss::*;
+    };
+}
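For readers who don't want to chase the macro, this is roughly what the oss_shim!() invocation at the top of main.rs expands to in a non-fbcode build (hand-expanded here for illustration; it is not additional patch content):

  // The `oss` module and everything it re-exports (e.g. setup_logger, the
  // atropos skeleton and atropos_sys bindings) are pulled into the crate root
  // under build-agnostic names.
  #[cfg(not(fbcode_build))]
  mod oss;
  #[cfg(not(fbcode_build))]
  use oss::*;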