[094/192] loop: use worker per cgroup instead of kworker

Message ID	20210629023815.9pfROXugj%akpm@linux-foundation.org (mailing list archive)
State	New
Headers	show Return-Path: <SRS0=4KF6=LX=kvack.org=owner-linux-mm@kernel.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 572AF61D06 Date: Mon, 28 Jun 2021 19:38:15 -0700 From: Andrew Morton <akpm@linux-foundation.org> To: akpm@linux-foundation.org, axboe@kernel.dk, chris@chrisdown.name, hannes@cmpxchg.org, linux-mm@kvack.org, mhocko@suse.com, ming.lei@redhat.com, mm-commits@vger.kernel.org, schatzberg.dan@gmail.com, shakeelb@google.com, tj@kernel.org, torvalds@linux-foundation.org Subject: [patch 094/192] loop: use worker per cgroup instead of kworker Message-ID: <20210629023815.9pfROXugj%akpm@linux-foundation.org> In-Reply-To: <20210628193256.008961950a714730751c1423@linux-foundation.org> User-Agent: s-nail v14.8.16 Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	[001/192] mm/gup: fix try_grab_compound_head() race with split_huge_page() \| expand [001/192] mm/gup: fix try_grab_compound_head() race with split_huge_page() [002/192] mm/page_alloc: fix memory map initialization for descending nodes [003/192] mm/page_alloc: correct return value of populated elements if bulk array is populated [004/192] kthread: switch to new kerneldoc syntax for named variable macro argument [005/192] kthread_worker: fix return value when kthread_mod_delayed_work() races with kthread_cance… [006/192] ia64: headers: drop duplicated words [007/192] ia64: mca_drv: fix incorrect array size calculation [008/192] streamline_config.pl: make spacing consistent [009/192] streamline_config.pl: add softtabstop=4 for vim users [010/192] scripts/spelling.txt: add more spellings to spelling.txt [011/192] ntfs: fix validity check for file name attribute [012/192] squashfs: add option to panic on errors [013/192] ocfs2: remove unnecessary INIT_LIST_HEAD() [014/192] ocfs2: fix snprintf() checking [015/192] ocfs2: remove redundant assignment to pointer queue [016/192] ocfs2: remove repeated uptodate check for buffer [017/192] ocfs2: replace simple_strtoull() with kstrtoull() [018/192] ocfs2: remove redundant initialization of variable ret [019/192] kernel: watchdog: modify the explanation related to watchdog thread [020/192] doc: watchdog: modify the explanation related to watchdog thread [021/192] doc: watchdog: modify the doc related to "watchdog/%u" [022/192] slab: use __func__ to trace function name [023/192] kunit: make test->lock irq safe [024/192] mm/slub, kunit: add a KUnit test for SLUB debugging functionality [025/192] slub: remove resiliency_test() function [026/192] mm, slub: change run-time assertion in kmalloc_index() to compile-time [027/192] slub: restore slub_debug=- behavior [028/192] slub: actually use 'message' in restore_bytes() [029/192] slub: indicate slab_fix() uses printf formats [030/192] slub: force on no_hash_pointers when 
slub_debug is enabled [031/192] mm: slub: move sysfs slab alloc/free interfaces to debugfs [032/192] mm/slub: add taint after the errors are printed [033/192] mm/kmemleak: fix possible wrong memory scanning period [034/192] dax: fix ENOMEM handling in grab_mapping_entry() [035/192] tools/vm/page_owner_sort.c: check malloc() return [036/192] mm/debug_vm_pgtable: ensure THP availability via has_transparent_hugepage() [037/192] mm: mmap_lock: use local locks instead of disabling preemption [038/192] mm/page_reporting: fix code style in __page_reporting_request() [039/192] mm/page_reporting: export reporting order as module parameter [040/192] mm/page_reporting: allow driver to specify reporting order [041/192] virtio_balloon: specify page reporting order if needed [042/192] mm: page-writeback: kill get_writeback_state() comments [043/192] mm/page-writeback: Fix performance when BDI's share of ratio is 0. [044/192] mm/page-writeback: update the comment of Dirty position control [045/192] mm/page-writeback: use __this_cpu_inc() in account_page_dirtied() [046/192] writeback, cgroup: do not switch inodes with I_WILL_FREE flag [047/192] writeback, cgroup: add smp_mb() to cgroup_writeback_umount() [048/192] writeback, cgroup: increment isw_nr_in_flight before grabbing an inode [049/192] writeback, cgroup: switch to rcu_work API in inode_switch_wbs() [050/192] writeback, cgroup: keep list of inodes attached to bdi_writeback [051/192] writeback, cgroup: split out the functional part of inode_switch_wbs_work_fn() [052/192] writeback, cgroup: support switching multiple inodes at once [053/192] writeback, cgroup: release dying cgwbs by switching attached inodes [054/192] fs: unexport __set_page_dirty [055/192] fs: move ramfs_aops to libfs [056/192] mm: require ->set_page_dirty to be explicitly wired up [057/192] mm/writeback: move __set_page_dirty() to core mm [058/192] mm/writeback: use __set_page_dirty in __set_page_dirty_nobuffers [059/192] iomap: use 
__set_page_dirty_nobuffers [060/192] fs: remove anon_set_page_dirty() [061/192] fs: remove noop_set_page_dirty() [062/192] mm: move page dirtying prototypes from mm.h [063/192] mm/gup_benchmark: support threading [064/192] mm: gup: allow FOLL_PIN to scale in SMP [065/192] mm: gup: pack has_pinned in MMF_HAS_PINNED [066/192] mm: pagewalk: fix walk for hugepage tables [067/192] mm/swapfile: use percpu_ref to serialize against concurrent swapoff [068/192] swap: fix do_swap_page() race with swapoff [069/192] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() [070/192] mm/shmem: fix shmem_swapin() race with swapoff [071/192] mm/swapfile: move get_swap_page_of_type() under CONFIG_HIBERNATION [072/192] mm/swap: remove unused local variable nr_shadows [073/192] mm/swap_slots.c: delete meaningless forward declarations [074/192] mm, swap: remove unnecessary smp_rmb() in swap_type_to_swap_info() [075/192] mm: free idle swap cache page after COW [076/192] swap: check mapping_empty() for swap cache before being freed [077/192] mm/memcg: move mod_objcg_state() to memcontrol.c [078/192] mm/memcg: cache vmstat data in percpu memcg_stock_pcp [079/192] mm/memcg: improve refill_obj_stock() performance [080/192] mm/memcg: optimize user context object stock access [081/192] mm: memcg/slab: properly set up gfp flags for objcg pointer array [082/192] mm: memcg/slab: create a new set of kmalloc-cg-<n> caches [083/192] mm: memcg/slab: disable cache merging for KMALLOC_NORMAL caches [084/192] mm: memcontrol: fix root_mem_cgroup charging [085/192] mm: memcontrol: fix page charging in page replacement [086/192] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm [087/192] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec [088/192] mm: memcontrol: simplify lruvec_holds_page_lru_lock [089/192] mm: memcontrol: rename lruvec_holds_page_lru_lock to page_matches_lruvec [090/192] mm: memcontrol: simplify the logic of objcg pinning memcg 
[091/192] mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock [092/192] mm: vmscan: remove noinline_for_stack [093/192] memcontrol: use flexible-array member [094/192] loop: use worker per cgroup instead of kworker [095/192] mm: charge active memcg when no mm is set [096/192] loop: charge i/o to mem and blk cg [097/192] mm: memcontrol: remove trailing semicolon in macros [098/192] perf: MAP_EXECUTABLE does not indicate VM_MAYEXEC [099/192] binfmt: remove in-tree usage of MAP_EXECUTABLE [100/192] mm: ignore MAP_EXECUTABLE in ksys_mmap_pgoff() [101/192] mm/mmap.c: logic of find_vma_intersection repeated in __do_munmap [102/192] mm/mmap: introduce unlock_range() for code cleanup [103/192] mm/mmap: use find_vma_intersection() in do_mmap() for overlap [104/192] mm/memory.c: fix comment of finish_mkwrite_fault() [105/192] mm: add vma_lookup(), update find_vma_intersection() comments [106/192] drm/i915/selftests: use vma_lookup() in __igt_mmap() [107/192] arch/arc/kernel/troubleshoot: use vma_lookup() instead of find_vma() [108/192] arch/arm64/kvm: use vma_lookup() instead of find_vma_intersection() [109/192] arch/powerpc/kvm/book3s_hv_uvmem: use vma_lookup() instead of find_vma_intersection() [110/192] arch/powerpc/kvm/book3s: use vma_lookup() in kvmppc_hv_setup_htab_rma() [111/192] arch/mips/kernel/traps: use vma_lookup() instead of find_vma() [112/192] arch/m68k/kernel/sys_m68k: use vma_lookup() in sys_cacheflush() [113/192] x86/sgx: use vma_lookup() in sgx_encl_find() [114/192] virt/kvm: use vma_lookup() instead of find_vma_intersection() [115/192] vfio: use vma_lookup() instead of find_vma_intersection() [116/192] net/ipv4/tcp: use vma_lookup() in tcp_zerocopy_receive() [117/192] drm/amdgpu: use vma_lookup() in amdgpu_ttm_tt_get_user_pages() [118/192] media: videobuf2: use vma_lookup() in get_vaddr_frames() [119/192] misc/sgi-gru/grufault: use vma_lookup() in gru_find_vma() [120/192] kernel/events/uprobes: use vma_lookup() in find_active_uprobe() 
[121/192] lib/test_hmm: use vma_lookup() in dmirror_migrate() [122/192] mm/ksm: use vma_lookup() in find_mergeable_vma() [123/192] mm/migrate: use vma_lookup() in do_pages_stat_array() [124/192] mm/mremap: use vma_lookup() in vma_to_resize() [125/192] mm/memory.c: use vma_lookup() in __access_remote_vm() [126/192] mm/mempolicy: use vma_lookup() in __access_remote_vm() [127/192] mm: update legacy flush_tlb_* to use vma [128/192] mm: improve mprotect(R\|W) efficiency on pages referenced once [129/192] h8300: remove unused variable [130/192] mm/dmapool: use DEVICE_ATTR_RO macro [131/192] mm, tracing: unify PFN format strings [132/192] mm/page_alloc: add an alloc_pages_bulk_array_node() helper [133/192] mm/vmalloc: switch to bulk allocator in __vmalloc_area_node() [134/192] mm/vmalloc: print a warning message first on failure [135/192] mm/vmalloc: remove quoted strings split across lines [136/192] mm/vmalloc: fallback to a single page allocator [137/192] mm: vmalloc: add cond_resched() in __vunmap() [138/192] printk: introduce dump_stack_lvl() [139/192] kasan: use dump_stack_lvl(KERN_ERR) to print stacks [140/192] kasan: test: improve failure message in KUNIT_EXPECT_KASAN_FAIL() [141/192] kasan: allow an architecture to disable inline instrumentation [142/192] kasan: allow architectures to provide an outline readiness check [143/192] mm: define default MAX_PTRS_PER_* in include/pgtable.h [144/192] kasan: use MAX_PTRS_PER_* for early shadow tables [145/192] kasan: rename CONFIG_KASAN_SW_TAGS_IDENTIFY to CONFIG_KASAN_TAGS_IDENTIFY [146/192] kasan: integrate the common part of two KASAN tag-based modes [147/192] kasan: add memory corruption identification support for hardware tag-based mode [148/192] mm: report which part of mem is being freed on initmem case [149/192] mm/mmzone.h: simplify is_highmem_idx() [150/192] mm: make __dump_page static [151/192] mm/page_alloc: bail out on fatal signal during reclaim/compaction retry attempt [152/192] mm/debug: factor PagePoisoned 
out of __dump_page [153/192] mm/page_owner: constify dump_page_owner [154/192] mm: make compound_head const-preserving [155/192] mm: constify get_pfnblock_flags_mask and get_pfnblock_migratetype [156/192] mm: constify page_count and page_ref_count [157/192] mm: optimise nth_page for contiguous memmap [158/192] mm/page_alloc: switch to pr_debug [159/192] kbuild: skip per-CPU BTF generation for pahole v1.18-v1.21 [160/192] mm/page_alloc: split per cpu page lists and zone stats [161/192] mm/page_alloc: convert per-cpu list protection to local_lock [162/192] mm/vmstat: convert NUMA statistics to basic NUMA counters [163/192] mm/vmstat: inline NUMA event counter updates [164/192] mm/page_alloc: batch the accounting updates in the bulk allocator [165/192] mm/page_alloc: reduce duration that IRQs are disabled for VM counters [166/192] mm/page_alloc: explicitly acquire the zone lock in __free_pages_ok [167/192] mm/page_alloc: avoid conflating IRQs disabled with zone->lock [168/192] mm/page_alloc: update PGFREE outside the zone lock in __free_pages_ok [169/192] mm: page_alloc: dump migrate-failed pages only at -EBUSY [170/192] mm/page_alloc: delete vm.percpu_pagelist_fraction [171/192] mm/page_alloc: disassociate the pcp->high from pcp->batch [172/192] mm/page_alloc: adjust pcp->high after CPU hotplug events [173/192] mm/page_alloc: scale the number of pages that are batch freed [174/192] mm/page_alloc: limit the number of pages on PCP lists when reclaim is active [175/192] mm/page_alloc: introduce vm.percpu_pagelist_high_fraction [176/192] mm: drop SECTION_SHIFT in code comments [177/192] mm/page_alloc: improve memmap_pages dbg msg [178/192] mm/page_alloc: fix counting of managed_pages [179/192] mm/page_alloc: move free_the_page [180/192] alpha: remove DISCONTIGMEM and NUMA [181/192] arc: update comment about HIGHMEM implementation [182/192] arc: remove support for DISCONTIGMEM [183/192] m68k: remove support for DISCONTIGMEM [184/192] mm: remove CONFIG_DISCONTIGMEM 
[185/192] arch, mm: remove stale mentions of DISCONIGMEM [186/192] docs: remove description of DISCONTIGMEM [187/192] mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA [188/192] mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM [189/192] mm/page_alloc: allow high-order pages to be stored on the per-cpu lists [190/192] mm/page_alloc: split pcp->high across all online CPUs for cpuless nodes [191/192] mm,hwpoison: send SIGBUS with error virutal address [192/192] mm,hwpoison: make get_hwpoison_page() call get_any_page()

Message ID

20210629023815.9pfROXugj%akpm@linux-foundation.org (mailing list archive)

State

New

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 572AF61D06
Date: Mon, 28 Jun 2021 19:38:15 -0700
From: Andrew Morton <akpm@linux-foundation.org>
To: akpm@linux-foundation.org, axboe@kernel.dk, chris@chrisdown.name,
 hannes@cmpxchg.org, linux-mm@kvack.org, mhocko@suse.com,
 ming.lei@redhat.com, mm-commits@vger.kernel.org,
 schatzberg.dan@gmail.com, shakeelb@google.com, tj@kernel.org,
 torvalds@linux-foundation.org
Subject: [patch 094/192] loop: use worker per cgroup instead of
 kworker
Message-ID: <20210629023815.9pfROXugj%akpm@linux-foundation.org>
In-Reply-To: <20210628193256.008961950a714730751c1423@linux-foundation.org>
User-Agent: s-nail v14.8.16
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

[001/192] mm/gup: fix try_grab_compound_head() race with split_huge_page() | expand

Commit Message

Andrew Morton June 29, 2021, 2:38 a.m. UTC

From: Dan Schatzberg <schatzberg.dan@gmail.com>
Subject: loop: use worker per cgroup instead of kworker

Patch series "Charge loop device i/o to issuing cgroup", v14.

The loop device runs all i/o to the backing file on a separate kworker
thread which results in all i/o being charged to the root cgroup.  This
allows a loop device to be used to trivially bypass resource limits and
other policy.  This patch series fixes this gap in accounting.

A simple script to demonstrate this behavior on a cgroup v2 machine:

'''
#!/bin/bash
set -e

CGROUP=/sys/fs/cgroup/test.slice
LOOP_DEV=/dev/loop0

if [[ ! -d $CGROUP ]]
then
    sudo mkdir $CGROUP
fi

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit to tmpfs -> OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
dd if=/dev/zero of=/tmp/file bs=1M count=256" || true

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit through loopback
# device -> no OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
truncate -s 512m /tmp/backing_file
losetup $LOOP_DEV /tmp/backing_file
dd if=/dev/zero of=$LOOP_DEV bs=1M count=256;
losetup -D $LOOP_DEV" || true

grep oom_kill $CGROUP/memory.events
'''

Naively charging cgroups could result in priority inversions through the
single kworker thread in the case where multiple cgroups are
reading/writing to the same loop device.  This patch series does some
minor modification to the loop driver so that each cgroup can make forward
progress independently to avoid this inversion.

With this patch series applied, the above script triggers OOM kills when
writing through the loop device as expected.

This patch (of 3):

Existing uses of loop device may have multiple cgroups reading/writing to
the same device.  Simply charging resources for I/O to the backing file
could result in priority inversion where one cgroup gets synchronously
blocked, holding up all other I/O to the loop device.

In order to avoid this priority inversion, we use a single workqueue where
each work item is a "struct loop_worker" which contains a queue of struct
loop_cmds to issue.  The loop device maintains a tree mapping blk css_id
-> loop_worker.  This allows each cgroup to independently make forward
progress issuing I/O to the backing file.

There is also a single queue for I/O associated with the rootcg which can
be used in cases of extreme memory shortage where we cannot allocate a
loop_worker.

The locking for the tree and queues is fairly heavy handed - we acquire a
per-loop-device spinlock any time either is accessed.  The existing
implementation serializes all I/O through a single thread anyways, so I
don't believe this is any worse.

[colin.king@canonical.com: fixes]
Link: https://lkml.kernel.org/r/20210610173944.1203706-1-schatzberg.dan@gmail.com
Link: https://lkml.kernel.org/r/20210610173944.1203706-2-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 drivers/block/loop.c |  209 +++++++++++++++++++++++++++++++++++------
 drivers/block/loop.h |   12 +-
 2 files changed, 187 insertions(+), 34 deletions(-)

--- a/drivers/block/loop.c~loop-use-worker-per-cgroup-instead-of-kworker
+++ a/drivers/block/loop.c
@@ -71,7 +71,6 @@ 
 #include <linux/writeback.h>
 #include <linux/completion.h>
 #include <linux/highmem.h>
-#include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
 #include <linux/miscdevice.h>
@@ -84,6 +83,8 @@ 
 
 #include <linux/uaccess.h>
 
+#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
+
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_ctl_mutex);
 
@@ -921,27 +922,95 @@  static void loop_config_discard(struct l
 	q->limits.discard_alignment = 0;
 }
 
-static void loop_unprepare_queue(struct loop_device *lo)
+struct loop_worker {
+	struct rb_node rb_node;
+	struct work_struct work;
+	struct list_head cmd_list;
+	struct list_head idle_list;
+	struct loop_device *lo;
+	struct cgroup_subsys_state *css;
+	unsigned long last_ran_at;
+};
+
+static void loop_workfn(struct work_struct *work);
+static void loop_rootcg_workfn(struct work_struct *work);
+static void loop_free_idle_workers(struct timer_list *timer);
+
+#ifdef CONFIG_BLK_CGROUP
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
-	kthread_flush_worker(&lo->worker);
-	kthread_stop(lo->worker_task);
+	return !css || css == blkcg_root_css;
 }
-
-static int loop_kthread_worker_fn(void *worker_ptr)
+#else
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 {
-	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
-	return kthread_worker_fn(worker_ptr);
+	return !css;
 }
+#endif
 
-static int loop_prepare_queue(struct loop_device *lo)
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
-	kthread_init_worker(&lo->worker);
-	lo->worker_task = kthread_run(loop_kthread_worker_fn,
-			&lo->worker, "loop%d", lo->lo_number);
-	if (IS_ERR(lo->worker_task))
-		return -ENOMEM;
-	set_user_nice(lo->worker_task, MIN_NICE);
-	return 0;
+	struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL;
+	struct loop_worker *cur_worker, *worker = NULL;
+	struct work_struct *work;
+	struct list_head *cmd_list;
+
+	spin_lock_irq(&lo->lo_work_lock);
+
+	if (queue_on_root_worker(cmd->css))
+		goto queue_work;
+
+	node = &lo->worker_tree.rb_node;
+
+	while (*node) {
+		parent = *node;
+		cur_worker = container_of(*node, struct loop_worker, rb_node);
+		if (cur_worker->css == cmd->css) {
+			worker = cur_worker;
+			break;
+		} else if ((long)cur_worker->css < (long)cmd->css) {
+			node = &(*node)->rb_left;
+		} else {
+			node = &(*node)->rb_right;
+		}
+	}
+	if (worker)
+		goto queue_work;
+
+	worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+	/*
+	 * In the event we cannot allocate a worker, just queue on the
+	 * rootcg worker
+	 */
+	if (!worker)
+		goto queue_work;
+
+	worker->css = cmd->css;
+	css_get(worker->css);
+	INIT_WORK(&worker->work, loop_workfn);
+	INIT_LIST_HEAD(&worker->cmd_list);
+	INIT_LIST_HEAD(&worker->idle_list);
+	worker->lo = lo;
+	rb_link_node(&worker->rb_node, parent, node);
+	rb_insert_color(&worker->rb_node, &lo->worker_tree);
+queue_work:
+	if (worker) {
+		/*
+		 * We need to remove from the idle list here while
+		 * holding the lock so that the idle timer doesn't
+		 * free the worker
+		 */
+		if (!list_empty(&worker->idle_list))
+			list_del_init(&worker->idle_list);
+		work = &worker->work;
+		cmd_list = &worker->cmd_list;
+	} else {
+		work = &lo->rootcg_work;
+		cmd_list = &lo->rootcg_cmd_list;
+	}
+	list_add_tail(&cmd->list_entry, cmd_list);
+	queue_work(lo->workqueue, work);
+	spin_unlock_irq(&lo->lo_work_lock);
 }
 
 static void loop_update_rotational(struct loop_device *lo)
@@ -1127,12 +1196,23 @@  static int loop_configure(struct loop_de
 	    !file->f_op->write_iter)
 		lo->lo_flags |= LO_FLAGS_READ_ONLY;
 
-	error = loop_prepare_queue(lo);
-	if (error)
+	lo->workqueue = alloc_workqueue("loop%d",
+					WQ_UNBOUND | WQ_FREEZABLE,
+					0,
+					lo->lo_number);
+	if (!lo->workqueue) {
+		error = -ENOMEM;
 		goto out_unlock;
+	}
 
 	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	lo->worker_tree = RB_ROOT;
+	timer_setup(&lo->timer, loop_free_idle_workers,
+		TIMER_DEFERRABLE);
 	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
 	lo->lo_device = bdev;
 	lo->lo_backing_file = file;
@@ -1200,6 +1280,7 @@  static int __loop_clr_fd(struct loop_dev
 	int err = 0;
 	bool partscan = false;
 	int lo_number;
+	struct loop_worker *pos, *worker;
 
 	mutex_lock(&lo->lo_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
@@ -1219,6 +1300,18 @@  static int __loop_clr_fd(struct loop_dev
 	/* freeze request queue during the transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
+	destroy_workqueue(lo->workqueue);
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->css);
+		kfree(worker);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	del_timer_sync(&lo->timer);
+
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
@@ -1255,7 +1348,6 @@  static int __loop_clr_fd(struct loop_dev
 
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
 	lo_number = lo->lo_number;
-	loop_unprepare_queue(lo);
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 	if (partscan) {
@@ -2015,7 +2107,7 @@  static blk_status_t loop_queue_rq(struct
 	} else
 #endif
 		cmd->css = NULL;
-	kthread_queue_work(&lo->worker, &cmd->work);
+	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;
 }
@@ -2045,26 +2137,82 @@  static void loop_handle_cmd(struct loop_
 	}
 }
 
-static void loop_queue_work(struct kthread_work *work)
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
 {
-	struct loop_cmd *cmd =
-		container_of(work, struct loop_cmd, work);
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;
+
+	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+	spin_lock_irq(&lo->lo_work_lock);
+	while (!list_empty(cmd_list)) {
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_work_lock);
+
+		loop_handle_cmd(cmd);
+		cond_resched();
+
+		spin_lock_irq(&lo->lo_work_lock);
+	}
 
-	loop_handle_cmd(cmd);
+	/*
+	 * We only add to the idle list if there are no pending cmds
+	 * *and* the worker will not run again which ensures that it
+	 * is safe to free any worker on the idle list
+	 */
+	if (worker && !work_pending(&worker->work)) {
+		worker->last_ran_at = jiffies;
+		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
+		loop_set_timer(lo);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	current->flags = orig_flags;
 }
 
-static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
+static void loop_workfn(struct work_struct *work)
 {
-	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}
 
-	kthread_init_work(&cmd->work, loop_queue_work);
-	return 0;
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
+}
+
+static void loop_free_idle_workers(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (time_is_after_jiffies(worker->last_ran_at +
+						LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
 }
 
 static const struct blk_mq_ops loop_mq_ops = {
 	.queue_rq       = loop_queue_rq,
-	.init_request	= loop_init_request,
 	.complete	= lo_complete_rq,
 };
 
@@ -2153,6 +2301,7 @@  static int loop_add(struct loop_device *
 	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
+	spin_lock_init(&lo->lo_work_lock);
 	disk->major		= LOOP_MAJOR;
 	disk->first_minor	= i << part_shift;
 	disk->fops		= &lo_fops;
--- a/drivers/block/loop.h~loop-use-worker-per-cgroup-instead-of-kworker
+++ a/drivers/block/loop.h
@@ -14,7 +14,6 @@ 
 #include <linux/blk-mq.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <uapi/linux/loop.h>
 
 /* Possible states of device */
@@ -55,8 +54,13 @@  struct loop_device {
 
 	spinlock_t		lo_lock;
 	int			lo_state;
-	struct kthread_worker	worker;
-	struct task_struct	*worker_task;
+	spinlock_t              lo_work_lock;
+	struct workqueue_struct *workqueue;
+	struct work_struct      rootcg_work;
+	struct list_head        rootcg_cmd_list;
+	struct list_head        idle_worker_list;
+	struct rb_root          worker_tree;
+	struct timer_list       timer;
 	bool			use_dio;
 	bool			sysfs_inited;
 
@@ -67,7 +71,7 @@  struct loop_device {
 };
 
 struct loop_cmd {
-	struct kthread_work work;
+	struct list_head list_entry;
 	bool use_aio; /* use AIO interface to handle I/O */
 	atomic_t ref; /* only for aio */
 	long ret;