@@ -2189,16 +2189,6 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
atomic_set_release(&bp_desc.refs, 1);
- /*
- * Function tracing can enable thousands of places that need to be
- * updated. This can take quite some time, and with full kernel debugging
- * enabled, this could cause the softlockup watchdog to trigger.
- * This function gets called every 256 entries added to be patched.
- * Call cond_resched() here to make sure that other tasks can get scheduled
- * while processing all the functions being patched.
- */
- cond_resched();
-
/*
* Corresponding read barrier in int3 notifier for making sure the
* nr_entries and handler are correctly ordered wrt. patching.
@@ -549,14 +549,15 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
break;
}
- /* Reschedule on every XA_CHECK_SCHED iteration. */
+ /*
+ * Drop the lock every XA_CHECK_SCHED iteration so the
+ * scheduler can preempt if needed.
+ */
if (!(++count % XA_CHECK_SCHED)) {
xas_pause(&xas);
xas_unlock(&xas);
mutex_unlock(&encl->lock);
- cond_resched();
-
mutex_lock(&encl->lock);
xas_lock(&xas);
}
@@ -723,16 +724,15 @@ void sgx_encl_release(struct kref *ref)
}
kfree(entry);
+
/*
- * Invoke scheduler on every XA_CHECK_SCHED iteration
- * to prevent soft lockups.
+ * Drop the lock every XA_CHECK_SCHED iteration so the
+ * scheduler can preempt if needed.
*/
if (!(++count % XA_CHECK_SCHED)) {
xas_pause(&xas);
xas_unlock(&xas);
- cond_resched();
-
xas_lock(&xas);
}
}
@@ -439,9 +439,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
break;
}
- if (need_resched())
- cond_resched();
-
ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
&secinfo, add_arg.flags);
if (ret)
@@ -98,8 +98,6 @@ static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
list_move_tail(&page->list, &dirty);
left_dirty++;
}
-
- cond_resched();
}
list_splice(&dirty, dirty_page_list);
@@ -413,8 +411,6 @@ static int ksgxd(void *p)
if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
sgx_reclaim_pages();
-
- cond_resched();
}
return 0;
@@ -581,7 +577,6 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
}
sgx_reclaim_pages();
- cond_resched();
}
if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
@@ -175,7 +175,6 @@ static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
return -EBUSY;
}
}
- cond_resched();
}
/*
@@ -204,7 +203,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
continue;
xa_erase(&vepc->page_array, index);
- cond_resched();
}
/*
@@ -223,7 +221,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_add_tail(&epc_page->list, &secs_pages);
xa_erase(&vepc->page_array, index);
- cond_resched();
}
/*
@@ -245,7 +242,6 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
if (sgx_vepc_free_page(epc_page))
list_add_tail(&epc_page->list, &secs_pages);
- cond_resched();
}
if (!list_empty(&secs_pages))
@@ -435,7 +435,11 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
kvfree(new);
new = NULL;
if (r == -E2BIG) {
- cond_resched();
+ /*
+ * A vCPU was just added or enabled its APIC.
+ * Give things time to settle before retrying.
+ */
+ cond_resched_stall();
goto retry;
}
@@ -6431,8 +6431,8 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
}
if (need_topup_split_caches_or_resched(kvm)) {
+ /* The preemption point in write_unlock() reschedules if needed. */
write_unlock(&kvm->mmu_lock);
- cond_resched();
/*
* If the topup succeeds, return -EAGAIN to indicate that the
* rmap iterator should be restarted because the MMU lock was
@@ -476,7 +476,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_local(page_virtual);
- cond_resched();
}
}
@@ -2157,12 +2156,14 @@ void sev_vm_destroy(struct kvm *kvm)
/*
* if userspace was terminated before unregistering the memory regions
* then lets unpin all the registered memory.
+ *
+ * This might take a while, but we are preemptible so the scheduler
+ * can always preempt if needed.
*/
if (!list_empty(head)) {
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
- cond_resched();
}
}
@@ -2819,7 +2819,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
- cond_resched();
}
if (bpf_jit_enable > 1)
@@ -2594,7 +2594,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
}
oldproglen = proglen;
- cond_resched();
}
if (bpf_jit_enable > 1)
@@ -2510,7 +2510,6 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
addr += range;
if (err_ptr)
err_ptr += batch;
- cond_resched();
}
out:
There are broadly three sets of uses of cond_resched():

 1. Calls to cond_resched() out of the goodness of our heart,
    otherwise known as avoiding lockup splats.

 2. Open coded variants of cond_resched_lock() which call
    cond_resched().

 3. Retry or error handling loops, where cond_resched() is used as a
    quick alternative to spinning in a tight-loop.

When running under a full preemption model, the cond_resched()
reduces to a NOP (not even a barrier) so removing it obviously
cannot matter.

But considering only voluntary preemption models (for, say, code that
has been mostly tested under those), for set-1 and set-2 the scheduler
can now preempt kernel tasks running beyond their time quanta anywhere
they are preemptible() [1], which removes any need for these explicitly
placed scheduling points.

The cond_resched() calls in set-3 are a little more difficult. To
start with, given its NOP character under full preemption, it never
actually saved us from a tight loop. With voluntary preemption, it's
not a NOP, but it might as well be -- for most workloads the scheduler
does not have an interminable supply of runnable tasks on the runqueue.

So, cond_resched() is useful for avoiding softlockup splats, but not
terribly good for error handling. Ideally, these should be replaced
with some kind of timed or event wait. For now we use
cond_resched_stall(), which tries to schedule if possible, and
executes a cpu_relax() if not.

Most of the instances of cond_resched() here are from set-1 or set-2.
Remove them. There's one set-3 case, where kvm_recalculate_apic_map()
sees an unexpected APIC-id; there we now use cond_resched_stall() to
delay the retry.

[1] https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@oracle.com/

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Ahern <dsahern@kernel.org>
Cc: x86@kernel.org
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/x86/kernel/alternative.c   | 10 ----------
 arch/x86/kernel/cpu/sgx/encl.c  | 14 +++++++-------
 arch/x86/kernel/cpu/sgx/ioctl.c |  3 ---
 arch/x86/kernel/cpu/sgx/main.c  |  5 -----
 arch/x86/kernel/cpu/sgx/virt.c  |  4 ----
 arch/x86/kvm/lapic.c            |  6 +++++-
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/sev.c          |  5 +++--
 arch/x86/net/bpf_jit_comp.c     |  1 -
 arch/x86/net/bpf_jit_comp32.c   |  1 -
 arch/x86/xen/mmu_pv.c           |  1 -
 11 files changed, 16 insertions(+), 36 deletions(-)
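For illustration, here is a minimal sketch (not part of the patch) of
the set-2 pattern as it looks after this change. It mirrors the
sgx_encl_may_map()/sgx_encl_release() hunks above: an xarray walk that
periodically drops its locks, with the old cond_resched() between the
unlock and relock now gone. The type and function names below
(struct example_encl, example_encl_walk) are hypothetical stand-ins,
not kernel code.

#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/xarray.h>

struct example_encl {
	struct xarray page_array;
	struct mutex lock;
};

static void example_encl_walk(struct example_encl *encl)
{
	unsigned int count = 0;
	void *entry;
	XA_STATE(xas, &encl->page_array, 0);

	mutex_lock(&encl->lock);
	xas_lock(&xas);
	xas_for_each(&xas, entry, ULONG_MAX) {
		/* ... operate on entry ... */

		/*
		 * Drop the locks every XA_CHECK_SCHED iterations. The
		 * scheduler can preempt anywhere the task is preemptible,
		 * including right here, so no explicit cond_resched() is
		 * needed between the unlock and the relock.
		 */
		if (!(++count % XA_CHECK_SCHED)) {
			xas_pause(&xas);
			xas_unlock(&xas);
			mutex_unlock(&encl->lock);

			mutex_lock(&encl->lock);
			xas_lock(&xas);
		}
	}
	xas_unlock(&xas);
	mutex_unlock(&encl->lock);
}

The set-3 case in kvm_recalculate_apic_map() is different: there is no
lock to drop and nothing local to wait on, so the retry is throttled
with cond_resched_stall(), which, as described above, schedules if
possible and falls back to cpu_relax() otherwise.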