@@ -686,7 +686,41 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#ifdef CONFIG_PER_VMA_LOCK
static inline void vma_lock_init(struct vma_lock *vm_lock)
{
- init_rwsem(&vm_lock->lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key lockdep_key;
+
+ lockdep_init_map(&vm_lock->dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+ atomic_set(&vm_lock->count, VMA_LOCK_UNLOCKED);
+}
+
+static inline unsigned int vma_lock_reader_count(unsigned int counter)
+{
+ return counter & VMA_LOCK_RD_COUNT_MASK;
+}
+
+static inline void __vma_end_read(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned int count, prev, new;
+
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ if (unlikely(vma_lock_reader_count(count) == 0)) {
+ /*
+ * Overflow was possible in vma_start_read_locked().
+ * When detected, wrap around preserving writer bits.
+ */
+ new = count | ~(VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT);
+ } else
+ new = count - 1;
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ if (vma_lock_reader_count(new) == 0 && (new & VMA_LOCK_WR_WAIT))
+ wake_up(&mm->vma_writer_wait);
}
/*
@@ -696,6 +730,9 @@ static inline void vma_lock_init(struct vma_lock *vm_lock)
*/
static inline bool vma_start_read(struct vm_area_struct *vma)
{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int count, prev, new;
+
/*
* Check before locking. A race might cause false locked result.
* We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -703,11 +740,35 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
return false;
- if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
- return false;
+ rwsem_acquire_read(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ /* Is VMA is write-locked or writer is waiting? */
+ if (count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ return false;
+ }
+
+ new = count + 1;
+ /* If atomic_t overflows, fail to lock. */
+ if (new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ return false;
+ }
+
+ /*
+ * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+ * vma_start_write, on failure we retry.
+ */
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
/*
* Overflow might produce false locked result.
@@ -720,8 +781,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock.lock);
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ __vma_end_read(mm, vma);
return false;
}
return true;
@@ -733,8 +794,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
*/
static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
- mmap_assert_locked(vma->vm_mm);
- down_read_nested(&vma->vm_lock.lock, subclass);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int count, prev, new;
+
+ mmap_assert_locked(mm);
+
+ rwsem_acquire_read(&vma->vm_lock.dep_map, subclass, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ /* We are holding mmap_lock, no active or waiting writers are possible. */
+ VM_BUG_ON_VMA(count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT), vma);
+ new = count + 1;
+ /* Unlikely but if atomic_t overflows, wrap around to. */
+ if (WARN_ON(new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)))
+ new = 0;
+ /*
+ * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+ * vma_start_write, on failure we retry.
+ */
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
}
/*
@@ -743,14 +826,15 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
*/
static inline void vma_start_read_locked(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
- down_read(&vma->vm_lock.lock);
+ vma_start_read_locked_nested(vma, 0);
}
static inline void vma_end_read(struct vm_area_struct *vma)
{
+ struct mm_struct *mm = vma->vm_mm;
+
rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock.lock);
+ __vma_end_read(mm, vma);
rcu_read_unlock();
}
@@ -774,12 +858,34 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
+ unsigned int count, prev, new;
unsigned int mm_lock_seq;
+ might_sleep();
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
- down_write(&vma->vm_lock.lock);
+ rwsem_acquire(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ if (vma_lock_reader_count(count) > 0)
+ new = count | VMA_LOCK_WR_WAIT;
+ else
+ new = count | VMA_LOCK_WR_LOCKED;
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ if (new & VMA_LOCK_WR_WAIT) {
+ lock_contended(&vma->vm_lock.dep_map, _RET_IP_);
+ wait_event(vma->vm_mm->vma_writer_wait,
+ atomic_cmpxchg(&vma->vm_lock.count, VMA_LOCK_WR_WAIT,
+ VMA_LOCK_WR_LOCKED) == VMA_LOCK_WR_WAIT);
+
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
+
/*
* We should use WRITE_ONCE() here because we can have concurrent reads
* from the early lockless pessimistic check in vma_start_read().
@@ -787,7 +893,10 @@ static inline void vma_start_write(struct vm_area_struct *vma)
* we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
*/
WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock.lock);
+ /* Write barrier to ensure vm_lock_seq change is visible before count */
+ smp_wmb();
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ atomic_set(&vma->vm_lock.count, VMA_LOCK_UNLOCKED);
}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -797,9 +906,14 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
+static inline bool is_vma_read_locked(struct vm_area_struct *vma)
+{
+ return vma_lock_reader_count((unsigned int)atomic_read(&vma->vm_lock.count)) > 0;
+}
+
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- if (!rwsem_is_locked(&vma->vm_lock.lock))
+ if (!is_vma_read_locked(vma))
vma_assert_write_locked(vma);
}
@@ -615,8 +615,23 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
+#define VMA_LOCK_UNLOCKED 0
+#define VMA_LOCK_WR_LOCKED (1 << 31)
+#define VMA_LOCK_WR_WAIT (1 << 30)
+
+#define VMA_LOCK_RD_COUNT_MASK (VMA_LOCK_WR_WAIT - 1)
+
struct vma_lock {
- struct rw_semaphore lock;
+ /*
+ * count & VMA_LOCK_RD_COUNT_MASK > 0 ==> read-locked with 'count' number of readers
+ * count & VMA_LOCK_WR_LOCKED != 0 ==> write-locked
+ * count & VMA_LOCK_WR_WAIT != 0 ==> writer is waiting
+ * count = 0 ==> unlocked
+ */
+ atomic_t count;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
struct vma_numab_state {
@@ -883,6 +898,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct wait_queue_head vma_writer_wait;
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.
@@ -121,6 +121,9 @@ static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ init_waitqueue_head(&mm->vma_writer_wait);
+#endif
}
static inline void mmap_write_lock(struct mm_struct *mm)
@@ -486,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
vm_rcu);
/* The vma should not be locked while being destroyed. */
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
+ VM_BUG_ON_VMA(is_vma_read_locked(vma), vma);
__vm_area_free(vma);
}
#endif
@@ -40,6 +40,8 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
+ .vma_writer_wait =
+ __WAIT_QUEUE_HEAD_INITIALIZER(init_mm.vma_writer_wait),
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore with an atomic variable. When a reader takes read lock, it increments the atomic, unless the top two bits are set indicating a writer is present. When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock and puts itself onto newly introduced mm.vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. atomic_t might overflow if there are many competing readers, therefore vma_start_read() implements an overflow check and if that occurs it exits with a failure to lock. vma_start_read_locked{_nested} may cause an overflow but it is later handled by __vma_end_read(). Signed-off-by: Suren Baghdasaryan <surenb@google.com> --- include/linux/mm.h | 142 ++++++++++++++++++++++++++++++++++---- include/linux/mm_types.h | 18 ++++- include/linux/mmap_lock.h | 3 + kernel/fork.c | 2 +- mm/init-mm.c | 2 + 5 files changed, 151 insertions(+), 16 deletions(-)