Message ID | 20250131090658.3386285-15-senozhatsky@chromium.org
---|---
State | New
Series | zsmalloc/zram: there be preemption
On Fri, Jan 31, 2025 at 06:06:13PM +0900, Sergey Senozhatsky wrote: > Switch over from rwlock_t to a atomic_t variable that takes negative > value when the page is under migration, or positive values when the > page is used by zsmalloc users (object map, etc.) Using a rwsem > per-zspage is a little too memory heavy, a simple atomic_t should > suffice. > > zspage lock is a leaf lock for zs_map_object(), where it's read-acquired. > Since this lock now permits preemption extra care needs to be taken when > it is write-acquired - all writers grab it in atomic context, so they > cannot spin and wait for (potentially preempted) reader to unlock zspage. > There are only two writers at this moment - migration and compaction. In > both cases we use write-try-lock and bail out if zspage is read locked. > Writers, on the other hand, never get preempted, so readers can spin > waiting for the writer to unlock zspage. > > With this we can implement a preemptible object mapping. > > Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org> > Cc: Yosry Ahmed <yosry.ahmed@linux.dev> > --- > mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++------------------- > 1 file changed, 83 insertions(+), 52 deletions(-) > > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c > index 4b4c77bc08f9..f5b5fe732e50 100644 > --- a/mm/zsmalloc.c > +++ b/mm/zsmalloc.c > @@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc) > __free_page(page); > } > > +#define ZS_PAGE_UNLOCKED 0 > +#define ZS_PAGE_WRLOCKED -1 > + > struct zspage { > struct { > unsigned int huge:HUGE_BITS; > @@ -304,7 +307,7 @@ struct zspage { > struct zpdesc *first_zpdesc; > struct list_head list; /* fullness list */ > struct zs_pool *pool; > - rwlock_t lock; > + atomic_t lock; > }; > > struct mapping_area { > @@ -314,6 +317,59 @@ struct mapping_area { > enum zs_mapmode vm_mm; /* mapping mode */ > }; > > +static void zspage_lock_init(struct zspage *zspage) > +{ > + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); > +} > + > +/* > + * zspage lock permits preemption on the reader-side (there can be multiple > + * readers). Writers (exclusive zspage ownership), on the other hand, are > + * always run in atomic context and cannot spin waiting for a (potentially > + * preempted) reader to unlock zspage. This, basically, means that writers > + * can only call write-try-lock and must bail out if it didn't succeed. > + * > + * At the same time, writers cannot reschedule under zspage write-lock, > + * so readers can spin waiting for the writer to unlock zspage. > + */ > +static void zspage_read_lock(struct zspage *zspage) > +{ > + atomic_t *lock = &zspage->lock; > + int old = atomic_read(lock); > + > + do { > + if (old == ZS_PAGE_WRLOCKED) { > + cpu_relax(); > + old = atomic_read(lock); > + continue; > + } > + } while (!atomic_try_cmpxchg(lock, &old, old + 1)); > +} > + > +static void zspage_read_unlock(struct zspage *zspage) > +{ > + atomic_dec(&zspage->lock); > +} > + > +static bool zspage_try_write_lock(struct zspage *zspage) > +{ > + atomic_t *lock = &zspage->lock; > + int old = ZS_PAGE_UNLOCKED; > + > + preempt_disable(); > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) FWIW, I am usually afraid to manually implement locking like this. For example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not atomic_try_cmpxchg(), and I am not quite sure what could happen without ACQUIRE semantics here on some architectures. We also lose some debugging capabilities as Hilf pointed out in another patch. Just my 2c. 
> + return true; > + > + preempt_enable(); > + return false; > +} > + > +static void zspage_write_unlock(struct zspage *zspage) > +{ > + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); > + preempt_enable(); > +} > + > /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ > static void SetZsHugePage(struct zspage *zspage) > { > @@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage) > return zspage->huge; > } > > -static void lock_init(struct zspage *zspage); > -static void migrate_read_lock(struct zspage *zspage); > -static void migrate_read_unlock(struct zspage *zspage); > -static void migrate_write_lock(struct zspage *zspage); > -static void migrate_write_unlock(struct zspage *zspage); > - > #ifdef CONFIG_COMPACTION > static void kick_deferred_free(struct zs_pool *pool); > static void init_deferred_free(struct zs_pool *pool); > @@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, > return NULL; > > zspage->magic = ZSPAGE_MAGIC; > - lock_init(zspage); > + zspage_lock_init(zspage); > > for (i = 0; i < class->pages_per_zspage; i++) { > struct zpdesc *zpdesc; > @@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, > * zs_unmap_object API so delegate the locking from class to zspage > * which is smaller granularity. > */ > - migrate_read_lock(zspage); > + zspage_read_lock(zspage); > pool_read_unlock(pool); > > class = zspage_class(pool, zspage); > @@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) > } > local_unlock(&zs_map_area.lock); > > - migrate_read_unlock(zspage); > + zspage_read_unlock(zspage); > } > EXPORT_SYMBOL_GPL(zs_unmap_object); > > @@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage) > /* > * Pages we haven't locked yet can be migrated off the list while we're > * trying to lock them, so we need to be careful and only attempt to > - * lock each page under migrate_read_lock(). Otherwise, the page we lock > + * lock each page under zspage_read_lock(). Otherwise, the page we lock > * may no longer belong to the zspage. This means that we may wait for > * the wrong page to unlock, so we must take a reference to the page > - * prior to waiting for it to unlock outside migrate_read_lock(). > + * prior to waiting for it to unlock outside zspage_read_lock(). 
> */ > while (1) { > - migrate_read_lock(zspage); > + zspage_read_lock(zspage); > zpdesc = get_first_zpdesc(zspage); > if (zpdesc_trylock(zpdesc)) > break; > zpdesc_get(zpdesc); > - migrate_read_unlock(zspage); > + zspage_read_unlock(zspage); > zpdesc_wait_locked(zpdesc); > zpdesc_put(zpdesc); > } > @@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage) > curr_zpdesc = zpdesc; > } else { > zpdesc_get(zpdesc); > - migrate_read_unlock(zspage); > + zspage_read_unlock(zspage); > zpdesc_wait_locked(zpdesc); > zpdesc_put(zpdesc); > - migrate_read_lock(zspage); > + zspage_read_lock(zspage); > } > } > - migrate_read_unlock(zspage); > + zspage_read_unlock(zspage); > } > #endif /* CONFIG_COMPACTION */ > > -static void lock_init(struct zspage *zspage) > -{ > - rwlock_init(&zspage->lock); > -} > - > -static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) > -{ > - read_lock(&zspage->lock); > -} > - > -static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) > -{ > - read_unlock(&zspage->lock); > -} > - > -static void migrate_write_lock(struct zspage *zspage) > -{ > - write_lock(&zspage->lock); > -} > - > -static void migrate_write_unlock(struct zspage *zspage) > -{ > - write_unlock(&zspage->lock); > -} > - > #ifdef CONFIG_COMPACTION > > static const struct movable_operations zsmalloc_mops; > @@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) > } > > static int zs_page_migrate(struct page *newpage, struct page *page, > - enum migrate_mode mode) > + enum migrate_mode mode) > { > struct zs_pool *pool; > struct size_class *class; > @@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page, > > VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); > > - /* We're committed, tell the world that this is a Zsmalloc page. */ > - __zpdesc_set_zsmalloc(newzpdesc); > - > /* The page is locked, so this pointer must remain valid */ > zspage = get_zspage(zpdesc); > pool = zspage->pool; > > /* > - * The pool lock protects the race between zpage migration > + * The pool->lock protects the race between zpage migration > * and zs_free. > */ > pool_write_lock(pool); > @@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page, > * the class lock protects zpage alloc/free in the zspage. > */ > size_class_lock(class); > - /* the migrate_write_lock protects zpage access via zs_map_object */ > - migrate_write_lock(zspage); > + /* the zspage write_lock protects zpage access via zs_map_object */ > + if (!zspage_try_write_lock(zspage)) { > + size_class_unlock(class); > + pool_write_unlock(pool); > + return -EINVAL; > + } > + > + /* We're committed, tell the world that this is a Zsmalloc page. 
*/ > + __zpdesc_set_zsmalloc(newzpdesc); > > offset = get_first_obj_offset(zpdesc); > s_addr = kmap_local_zpdesc(zpdesc); > @@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, > */ > pool_write_unlock(pool); > size_class_unlock(class); > - migrate_write_unlock(zspage); > + zspage_write_unlock(zspage); > > zpdesc_get(newzpdesc); > if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { > @@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, > if (!src_zspage) > break; > > - migrate_write_lock(src_zspage); > + if (!zspage_try_write_lock(src_zspage)) > + break; > + > migrate_zspage(pool, src_zspage, dst_zspage); > - migrate_write_unlock(src_zspage); > + zspage_write_unlock(src_zspage); > > fg = putback_zspage(class, src_zspage); > if (fg == ZS_INUSE_RATIO_0) { > -- > 2.48.1.362.g079036d154-goog >
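For reference, a minimal editorial sketch of what the acquire/release variant raised in this review could look like - this is not part of the posted patch, and it assumes the generic kernel helpers atomic_try_cmpxchg_acquire(), atomic_dec_return_release() and atomic_set_release():

static void zspage_read_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = atomic_read(lock);

	do {
		if (old == ZS_PAGE_WRLOCKED) {
			cpu_relax();
			old = atomic_read(lock);
			continue;
		}
		/* ACQUIRE pairs with the RELEASE in zspage_write_unlock() */
	} while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
}

static void zspage_read_unlock(struct zspage *zspage)
{
	/* RELEASE orders the reader's accesses before the count drops */
	atomic_dec_return_release(&zspage->lock);
}

static bool zspage_try_write_lock(struct zspage *zspage)
{
	atomic_t *lock = &zspage->lock;
	int old = ZS_PAGE_UNLOCKED;

	preempt_disable();
	/* ACQUIRE pairs with the RELEASE in zspage_read_unlock() */
	if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED))
		return true;

	preempt_enable();
	return false;
}

static void zspage_write_unlock(struct zspage *zspage)
{
	/* RELEASE publishes the writer's updates to the next reader */
	atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
	preempt_enable();
}

The acquire-on-successful-cmpxchg / release-on-unlock pairing roughly mirrors what queued_spin_trylock() and queued_spin_unlock() do, which is the comparison made in the review above.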
On (25/01/31 15:51), Yosry Ahmed wrote: > > +static void zspage_read_lock(struct zspage *zspage) > > +{ > > + atomic_t *lock = &zspage->lock; > > + int old = atomic_read(lock); > > + > > + do { > > + if (old == ZS_PAGE_WRLOCKED) { > > + cpu_relax(); > > + old = atomic_read(lock); > > + continue; > > + } > > + } while (!atomic_try_cmpxchg(lock, &old, old + 1)); > > +} > > + > > +static void zspage_read_unlock(struct zspage *zspage) > > +{ > > + atomic_dec(&zspage->lock); > > +} > > + > > +static bool zspage_try_write_lock(struct zspage *zspage) > > +{ > > + atomic_t *lock = &zspage->lock; > > + int old = ZS_PAGE_UNLOCKED; > > + > > + preempt_disable(); > > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) > > FWIW, I am usually afraid to manually implement locking like this. For > example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not > atomic_try_cmpxchg(), and I am not quite sure what could happen without > ACQUIRE semantics here on some architectures. I looked into it a bit, wasn't sure either. Perhaps we can switch to acquire/release semantics, I'm not an expert on this, would highly appreciate help. > We also lose some debugging capabilities as Hilf pointed out in another > patch. So that zspage lock should have not been a lock, I think, it's a ref-counter and it's being used as one map() { page->users++; } unmap() { page->users--; } migrate() { if (!page->users) migrate_page(); } > Just my 2c. Perhaps we can sprinkle some lockdep on it. For instance: --- mm/zsmalloc.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 956445f4d554..06b1d8ca9e89 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -308,6 +308,10 @@ struct zspage { struct list_head list; /* fullness list */ struct zs_pool *pool; atomic_t lock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map lockdep_map; +#endif }; struct mapping_area { @@ -319,6 +323,12 @@ struct mapping_area { static void zspage_lock_init(struct zspage *zspage) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key key; + + lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0); +#endif + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); } @@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage) continue; } } while (!atomic_try_cmpxchg(lock, &old, old + 1)); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_); +#endif } static void zspage_read_unlock(struct zspage *zspage) { atomic_dec(&zspage->lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_release(&zspage->lockdep_map, _RET_IP_); +#endif } static bool zspage_try_write_lock(struct zspage *zspage) @@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage) int old = ZS_PAGE_UNLOCKED; preempt_disable(); - if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_); +#endif return true; + } preempt_enable(); return false;
On (25/02/03 12:13), Sergey Senozhatsky wrote: > > Just my 2c. > > Perhaps we can sprinkle some lockdep on it. I forgot to rwsem_release() in zspage_write_unlock() and that has triggered lockdep :) --- mm/zsmalloc.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 956445f4d554..1d4700e457d4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -308,6 +308,10 @@ struct zspage { struct list_head list; /* fullness list */ struct zs_pool *pool; atomic_t lock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map lockdep_map; +#endif }; struct mapping_area { @@ -319,6 +323,12 @@ struct mapping_area { static void zspage_lock_init(struct zspage *zspage) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key key; + + lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page", &key, 0); +#endif + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); } @@ -344,11 +354,19 @@ static void zspage_read_lock(struct zspage *zspage) continue; } } while (!atomic_try_cmpxchg(lock, &old, old + 1)); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_); +#endif } static void zspage_read_unlock(struct zspage *zspage) { atomic_dec(&zspage->lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_release(&zspage->lockdep_map, _RET_IP_); +#endif } static bool zspage_try_write_lock(struct zspage *zspage) @@ -357,8 +375,12 @@ static bool zspage_try_write_lock(struct zspage *zspage) int old = ZS_PAGE_UNLOCKED; preempt_disable(); - if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) { +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_); +#endif return true; + } preempt_enable(); return false; @@ -367,6 +389,9 @@ static bool zspage_try_write_lock(struct zspage *zspage) static void zspage_write_unlock(struct zspage *zspage) { atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_release(&zspage->lockdep_map, _RET_IP_); +#endif preempt_enable(); }
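If more annotations like these accumulate, one way to keep the #ifdef noise out of the lock functions themselves is to hide it behind small helpers - an editorial sketch on top of the lockdep patch above, with made-up helper names:

#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void zspage_lockdep_acquire_read(struct zspage *zspage)
{
	rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
}

static inline void zspage_lockdep_acquire(struct zspage *zspage)
{
	rwsem_acquire(&zspage->lockdep_map, 0, 0, _RET_IP_);
}

static inline void zspage_lockdep_release(struct zspage *zspage)
{
	rwsem_release(&zspage->lockdep_map, _RET_IP_);
}
#else
static inline void zspage_lockdep_acquire_read(struct zspage *zspage) { }
static inline void zspage_lockdep_acquire(struct zspage *zspage) { }
static inline void zspage_lockdep_release(struct zspage *zspage) { }
#endif

The lock/unlock functions would then call these unconditionally, which makes omissions like the forgotten rwsem_release() mentioned above harder to repeat.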
On Mon, Feb 03, 2025 at 12:13:49PM +0900, Sergey Senozhatsky wrote: > On (25/01/31 15:51), Yosry Ahmed wrote: > > > +static void zspage_read_lock(struct zspage *zspage) > > > +{ > > > + atomic_t *lock = &zspage->lock; > > > + int old = atomic_read(lock); > > > + > > > + do { > > > + if (old == ZS_PAGE_WRLOCKED) { > > > + cpu_relax(); > > > + old = atomic_read(lock); > > > + continue; > > > + } > > > + } while (!atomic_try_cmpxchg(lock, &old, old + 1)); > > > +} > > > + > > > +static void zspage_read_unlock(struct zspage *zspage) > > > +{ > > > + atomic_dec(&zspage->lock); > > > +} > > > + > > > +static bool zspage_try_write_lock(struct zspage *zspage) > > > +{ > > > + atomic_t *lock = &zspage->lock; > > > + int old = ZS_PAGE_UNLOCKED; > > > + > > > + preempt_disable(); > > > + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) > > > > FWIW, I am usually afraid to manually implement locking like this. For > > example, queued_spin_trylock() uses atomic_try_cmpxchg_acquire() not > > atomic_try_cmpxchg(), and I am not quite sure what could happen without > > ACQUIRE semantics here on some architectures. > > I looked into it a bit, wasn't sure either. Perhaps we can switch > to acquire/release semantics, I'm not an expert on this, would highly > appreciate help. > > > We also lose some debugging capabilities as Hilf pointed out in another > > patch. > > So that zspage lock should have not been a lock, I think, it's a ref-counter > and it's being used as one > > map() > { > page->users++; > } > > unmap() > { > page->users--; > } > > migrate() > { > if (!page->users) > migrate_page(); > } Hmm, but in this case we want migration to block new map/unmap operations. So a vanilla refcount won't work. > > > Just my 2c. > > Perhaps we can sprinkle some lockdep on it. For instance: Honestly this looks like more reason to use existing lock primitives to me. What are the candidates? I assume rw_semaphore, anything else? I guess the main reason you didn't use a rw_semaphore is the extra memory usage. Seems like it uses ~32 bytes more than rwlock_t on x86_64. That's per zspage. Depending on how many compressed pages we have per-zspage this may not be too bad. For example, if a zspage has a chain length of 4, and the average compression ratio of 1/3, that's 12 compressed pages so the extra overhead is <3 bytes per compressed page. Given that the chain length is a function of the class size, I think we can calculate the exact extra memory overhead per-compressed page for each class and get a mean/median over all classes. If the memory overhead is insignificant I'd rather use exisitng lock primitives tbh.
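The per-class estimate suggested above is simple arithmetic; a small stand-alone sketch (editorial illustration, not kernel code - PAGE_SIZE, the 1360-byte class size and the 32-byte delta are assumptions taken from the example in this mail):

#include <stdio.h>

#define PAGE_SIZE	4096

/* extra per-zspage bytes divided by the number of objects the zspage holds */
static double overhead_per_object(int pages_per_zspage, int class_size,
				  int extra_bytes)
{
	int objs_per_zspage = (pages_per_zspage * PAGE_SIZE) / class_size;

	return (double)extra_bytes / objs_per_zspage;
}

int main(void)
{
	/*
	 * Chain length of 4 and ~1/3 compression ratio (class size of
	 * roughly 1360 bytes) gives 12 objects per zspage, so a 32-byte
	 * rw_semaphore delta costs about 2.7 bytes per compressed object.
	 */
	printf("%.1f bytes per object\n", overhead_per_object(4, 1360, 32));
	return 0;
}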
On (25/02/03 21:11), Yosry Ahmed wrote:
> > > We also lose some debugging capabilities as Hilf pointed out in another
> > > patch.
> >
> > So that zspage lock should have not been a lock, I think, it's a ref-counter
> > and it's being used as one
> >
> > map()
> > {
> > 	page->users++;
> > }
> >
> > unmap()
> > {
> > 	page->users--;
> > }
> >
> > migrate()
> > {
> > 	if (!page->users)
> > 		migrate_page();
> > }
>
> Hmm, but in this case we want migration to block new map/unmap
> operations. So a vanilla refcount won't work.

Yeah, correct - migration needs negative values so that map would
wait until it's positive (or zero).

> > > Just my 2c.
> >
> > Perhaps we can sprinkle some lockdep on it. For instance:
>
> Honestly this looks like more reason to use existing lock primitives to
> me. What are the candidates? I assume rw_semaphore, anything else?

Right, rwsem "was" the first choice.

> I guess the main reason you didn't use a rw_semaphore is the extra
> memory usage.

sizeof(struct zs_page) change is one thing. Another thing is that
zspage->lock is taken from atomic sections, pretty much everywhere.
compaction/migration write-lock it under pool rwlock and class spinlock,
but both compaction and migration now EAGAIN if the lock is locked
already, so that is sorted out.

The remaining problem is map(), which takes zspage read-lock under pool
rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
into preemptible ones because of this - zspage->lock is a nested leaf-lock,
so it cannot schedule unless locks it's nested under permit it (needless to
say neither rwlock nor spinlock permit it).

> Seems like it uses ~32 bytes more than rwlock_t on x86_64.
> That's per zspage. Depending on how many compressed pages we have
> per-zspage this may not be too bad.

So on a 16GB laptop our memory pressure test at peak used approx 1M zspages.
That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot,
less than what a single browser tab needs nowadays. I suppose on 4GB/8GB
that will be even smaller (because those devices generate less zspages).
Numbers are not the main issue, however.
On Tue, Feb 04, 2025 at 03:59:42PM +0900, Sergey Senozhatsky wrote: > On (25/02/03 21:11), Yosry Ahmed wrote: > > > > We also lose some debugging capabilities as Hilf pointed out in another > > > > patch. > > > > > > So that zspage lock should have not been a lock, I think, it's a ref-counter > > > and it's being used as one > > > > > > map() > > > { > > > page->users++; > > > } > > > > > > unmap() > > > { > > > page->users--; > > > } > > > > > > migrate() > > > { > > > if (!page->users) > > > migrate_page(); > > > } > > > > Hmm, but in this case we want migration to block new map/unmap > > operations. So a vanilla refcount won't work. > > Yeah, correct - migration needs negative values so that map would > wait until it's positive (or zero). > > > > > Just my 2c. > > > > > > Perhaps we can sprinkle some lockdep on it. For instance: > > > > Honestly this looks like more reason to use existing lock primitives to > > me. What are the candidates? I assume rw_semaphore, anything else? > > Right, rwsem "was" the first choice. > > > I guess the main reason you didn't use a rw_semaphore is the extra > > memory usage. > > sizeof(struct zs_page) change is one thing. Another thing is that > zspage->lock is taken from atomic sections, pretty much everywhere. > compaction/migration write-lock it under pool rwlock and class spinlock, > but both compaction and migration now EAGAIN if the lock is locked > already, so that is sorted out. > > The remaining problem is map(), which takes zspage read-lock under pool > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc > into preemptible ones because of this - zspage->lock is a nested leaf-lock, > so it cannot schedule unless locks it's nested under permit it (needless to > say neither rwlock nor spinlock permit it). Hmm, so we want the lock to be preemtible, but we don't want to use an existing preemtible lock because it may be held it from atomic context. I think one problem here is that the lock you are introducing is a spinning lock but the lock holder can be preempted. This is why spinning locks do not allow preemption. Others waiting for the lock can spin waiting for a process that is scheduled out. For example, the compaction/migration code could be sleeping holding the write lock, and a map() call would spin waiting for that sleeping task. I wonder if there's a way to rework the locking instead to avoid the nesting. It seems like sometimes we lock the zspage with the pool lock held, sometimes with the class lock held, and sometimes with no lock held. What are the rules here for acquiring the zspage lock? Do we need to hold another lock just to make sure the zspage does not go away from under us? Can we use RCU or something similar to do that instead? > > > Seems like it uses ~32 bytes more than rwlock_t on x86_64. > > That's per zspage. Depending on how many compressed pages we have > > per-zspage this may not be too bad. > > So on a 16GB laptop our memory pressure test at peak used approx 1M zspages. > That is 32 bytes * 1M ~ 32MB of extra memory use. Not alarmingly a lot, > less than what a single browser tab needs nowadays. I suppose on 4GB/8GB > that will be even smaller (because those device generate less zspages). > Numbers are not the main issue, however. >
On (25/02/04 17:19), Yosry Ahmed wrote:
> > sizeof(struct zs_page) change is one thing. Another thing is that
> > zspage->lock is taken from atomic sections, pretty much everywhere.
> > compaction/migration write-lock it under pool rwlock and class spinlock,
> > but both compaction and migration now EAGAIN if the lock is locked
> > already, so that is sorted out.
> >
> > The remaining problem is map(), which takes zspage read-lock under pool
> > rwlock. RFC series (which you hated with passion :P) converted all zsmalloc
> > into preemptible ones because of this - zspage->lock is a nested leaf-lock,
> > so it cannot schedule unless locks it's nested under permit it (needless to
> > say neither rwlock nor spinlock permit it).
>
> Hmm, so we want the lock to be preemtible, but we don't want to use an
> existing preemtible lock because it may be held it from atomic context.
>
> I think one problem here is that the lock you are introducing is a
> spinning lock but the lock holder can be preempted. This is why spinning
> locks do not allow preemption. Others waiting for the lock can spin
> waiting for a process that is scheduled out.
>
> For example, the compaction/migration code could be sleeping holding the
> write lock, and a map() call would spin waiting for that sleeping task.

write-lock holders cannot sleep, that's the key part. So the rules are:

1) writer cannot sleep
   - migration/compaction runs in atomic context and grabs write-lock only
     from atomic context
   - write-locking function disables preemption before lock(), just to be
     safe, and enables it after unlock()

2) writer does not spin waiting
   - that's why there is only write_try_lock function
   - compaction and migration bail out when they cannot lock the zspage

3) readers can sleep and can spin waiting for a lock
   - other (even preempted) readers don't block new readers
   - writers don't sleep, they always unlock

> I wonder if there's a way to rework the locking instead to avoid the
> nesting. It seems like sometimes we lock the zspage with the pool lock
> held, sometimes with the class lock held, and sometimes with no lock
> held.
>
> What are the rules here for acquiring the zspage lock?

Most of that code is not written by me, but I think the rule is to
disable "migration" be it via pool lock or class lock.

> Do we need to hold another lock just to make sure the zspage does not go
> away from under us?

Yes, the page cannot go away via the "normal" path:

	zs_free(last object) -> zspage becomes empty -> free zspage

so when we have an active mapping() it's only migration and compaction
that can free the zspage (its content is migrated and so it becomes empty).

> Can we use RCU or something similar to do that instead?

Hmm, I don't know... zsmalloc is not "read-mostly", it's whatever data
patterns the clients have. I suspect we'd need to synchronize RCU every
time a zspage is freed: zs_free() [this one is complicated], or migration,
or compaction? Sounds like an anti-pattern for RCU?
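Condensing the rules above into a rough editorial sketch of the two paths (pseudocode, not a literal copy of the kernel code; the pool/class helper names follow the patch in this series):

	/*
	 * Reader side (zs_map_object()) - preemption stays enabled,
	 * the zspage lock is the leaf lock:
	 */
	pool_read_lock(pool);		/* pins the zspage */
	zspage_read_lock(zspage);	/* migration can no longer write-lock or free it */
	pool_read_unlock(pool);
	/* ... access the object, possibly being preempted ... */
	zspage_read_unlock(zspage);

	/*
	 * Writer side (migration/compaction) - atomic context, so only
	 * a try-lock, never a spin:
	 */
	pool_write_lock(pool);
	size_class_lock(class);
	if (!zspage_try_write_lock(zspage)) {
		size_class_unlock(class);
		pool_write_unlock(pool);
		return -EINVAL;		/* a (possibly preempted) reader holds it */
	}
	/* ... move objects, no sleeping while write-locked ... */
	zspage_write_unlock(zspage);
	size_class_unlock(class);
	pool_write_unlock(pool);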
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4b4c77bc08f9..f5b5fe732e50 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -292,6 +292,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc) __free_page(page); } +#define ZS_PAGE_UNLOCKED 0 +#define ZS_PAGE_WRLOCKED -1 + struct zspage { struct { unsigned int huge:HUGE_BITS; @@ -304,7 +307,7 @@ struct zspage { struct zpdesc *first_zpdesc; struct list_head list; /* fullness list */ struct zs_pool *pool; - rwlock_t lock; + atomic_t lock; }; struct mapping_area { @@ -314,6 +317,59 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +static void zspage_lock_init(struct zspage *zspage) +{ + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); +} + +/* + * zspage lock permits preemption on the reader-side (there can be multiple + * readers). Writers (exclusive zspage ownership), on the other hand, are + * always run in atomic context and cannot spin waiting for a (potentially + * preempted) reader to unlock zspage. This, basically, means that writers + * can only call write-try-lock and must bail out if it didn't succeed. + * + * At the same time, writers cannot reschedule under zspage write-lock, + * so readers can spin waiting for the writer to unlock zspage. + */ +static void zspage_read_lock(struct zspage *zspage) +{ + atomic_t *lock = &zspage->lock; + int old = atomic_read(lock); + + do { + if (old == ZS_PAGE_WRLOCKED) { + cpu_relax(); + old = atomic_read(lock); + continue; + } + } while (!atomic_try_cmpxchg(lock, &old, old + 1)); +} + +static void zspage_read_unlock(struct zspage *zspage) +{ + atomic_dec(&zspage->lock); +} + +static bool zspage_try_write_lock(struct zspage *zspage) +{ + atomic_t *lock = &zspage->lock; + int old = ZS_PAGE_UNLOCKED; + + preempt_disable(); + if (atomic_try_cmpxchg(lock, &old, ZS_PAGE_WRLOCKED)) + return true; + + preempt_enable(); + return false; +} + +static void zspage_write_unlock(struct zspage *zspage) +{ + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED); + preempt_enable(); +} + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ static void SetZsHugePage(struct zspage *zspage) { @@ -325,12 +381,6 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -static void lock_init(struct zspage *zspage); -static void migrate_read_lock(struct zspage *zspage); -static void migrate_read_unlock(struct zspage *zspage); -static void migrate_write_lock(struct zspage *zspage); -static void migrate_write_unlock(struct zspage *zspage); - #ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); @@ -1026,7 +1076,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, return NULL; zspage->magic = ZSPAGE_MAGIC; - lock_init(zspage); + zspage_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { struct zpdesc *zpdesc; @@ -1251,7 +1301,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. 
*/ - migrate_read_lock(zspage); + zspage_read_lock(zspage); pool_read_unlock(pool); class = zspage_class(pool, zspage); @@ -1311,7 +1361,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } local_unlock(&zs_map_area.lock); - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1705,18 +1755,18 @@ static void lock_zspage(struct zspage *zspage) /* * Pages we haven't locked yet can be migrated off the list while we're * trying to lock them, so we need to be careful and only attempt to - * lock each page under migrate_read_lock(). Otherwise, the page we lock + * lock each page under zspage_read_lock(). Otherwise, the page we lock * may no longer belong to the zspage. This means that we may wait for * the wrong page to unlock, so we must take a reference to the page - * prior to waiting for it to unlock outside migrate_read_lock(). + * prior to waiting for it to unlock outside zspage_read_lock(). */ while (1) { - migrate_read_lock(zspage); + zspage_read_lock(zspage); zpdesc = get_first_zpdesc(zspage); if (zpdesc_trylock(zpdesc)) break; zpdesc_get(zpdesc); - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); zpdesc_wait_locked(zpdesc); zpdesc_put(zpdesc); } @@ -1727,41 +1777,16 @@ static void lock_zspage(struct zspage *zspage) curr_zpdesc = zpdesc; } else { zpdesc_get(zpdesc); - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); zpdesc_wait_locked(zpdesc); zpdesc_put(zpdesc); - migrate_read_lock(zspage); + zspage_read_lock(zspage); } } - migrate_read_unlock(zspage); + zspage_read_unlock(zspage); } #endif /* CONFIG_COMPACTION */ -static void lock_init(struct zspage *zspage) -{ - rwlock_init(&zspage->lock); -} - -static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) -{ - read_lock(&zspage->lock); -} - -static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) -{ - read_unlock(&zspage->lock); -} - -static void migrate_write_lock(struct zspage *zspage) -{ - write_lock(&zspage->lock); -} - -static void migrate_write_unlock(struct zspage *zspage) -{ - write_unlock(&zspage->lock); -} - #ifdef CONFIG_COMPACTION static const struct movable_operations zsmalloc_mops; @@ -1803,7 +1828,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) } static int zs_page_migrate(struct page *newpage, struct page *page, - enum migrate_mode mode) + enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; @@ -1819,15 +1844,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); - /* We're committed, tell the world that this is a Zsmalloc page. */ - __zpdesc_set_zsmalloc(newzpdesc); - /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); pool = zspage->pool; /* - * The pool lock protects the race between zpage migration + * The pool->lock protects the race between zpage migration * and zs_free. */ pool_write_lock(pool); @@ -1837,8 +1859,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * the class lock protects zpage alloc/free in the zspage. */ size_class_lock(class); - /* the migrate_write_lock protects zpage access via zs_map_object */ - migrate_write_lock(zspage); + /* the zspage write_lock protects zpage access via zs_map_object */ + if (!zspage_try_write_lock(zspage)) { + size_class_unlock(class); + pool_write_unlock(pool); + return -EINVAL; + } + + /* We're committed, tell the world that this is a Zsmalloc page. 
*/ + __zpdesc_set_zsmalloc(newzpdesc); offset = get_first_obj_offset(zpdesc); s_addr = kmap_local_zpdesc(zpdesc); @@ -1869,7 +1898,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, */ pool_write_unlock(pool); size_class_unlock(class); - migrate_write_unlock(zspage); + zspage_write_unlock(zspage); zpdesc_get(newzpdesc); if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { @@ -2005,9 +2034,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (!src_zspage) break; - migrate_write_lock(src_zspage); + if (!zspage_try_write_lock(src_zspage)) + break; + migrate_zspage(pool, src_zspage, dst_zspage); - migrate_write_unlock(src_zspage); + zspage_write_unlock(src_zspage); fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) {
Switch over from rwlock_t to an atomic_t variable that takes a negative value when the page is under migration, or positive values when the page is used by zsmalloc users (object map, etc.). Using a rwsem per-zspage is a little too memory-heavy; a simple atomic_t should suffice.

The zspage lock is a leaf lock for zs_map_object(), where it is read-acquired. Since this lock now permits preemption, extra care needs to be taken when it is write-acquired: all writers grab it in atomic context, so they cannot spin and wait for a (potentially preempted) reader to unlock the zspage. There are only two writers at the moment - migration and compaction. In both cases we use write-try-lock and bail out if the zspage is read-locked. Writers, on the other hand, never get preempted, so readers can spin waiting for the writer to unlock the zspage.

With this we can implement a preemptible object mapping.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
---
 mm/zsmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 83 insertions(+), 52 deletions(-)