Message ID | 1608894171-54174-2-git-send-email-tiantao6@hisilicon.com (mailing list archive)
---|---
State | New, archived
Series | Fix the compatibility of zsmalloc and zswap
On Fri, Dec 25, 2020 at 07:02:50PM +0800, Tian Tao wrote:
> add a flag to zpool, named is "can_sleep_mapped", and have it set true
> for zbud/z3fold, set false for zsmalloc. Then zswap could go the current
> path if the flag is true; and if it's false, copy data from src to a
> temporary buffer, then unmap the handle, take the mutex, process the
> buffer instead of src to avoid sleeping function called from atomic
> context.
>
> Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
> ---
>  include/linux/zpool.h |  3 +++
>  mm/zpool.c            | 13 +++++++++++++
>  mm/zswap.c            | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 51bf430..e899701 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool);
>   * @malloc: allocate mem from a pool.
>   * @free: free mem from a pool.
>   * @shrink: shrink the pool.
> + * @sleep_mapped: whether zpool driver can sleep during map.

I don't think it's a good idea. It just breaks zpool abstraction
in that it exposes internal implementation to user to avoid issue
zswap recently introduced. It also conflicts zpool_map_handle's
semantic.

Rather than introducing another break in zpool due to the new
zswap feature recently introduced, zswap could introduce
CONFIG_ZSWAP_HW_COMPRESSOR. Once it's configured, zsmalloc could
be disabled. And with disabling CONFIG_ZSWAP_HW_COMPRESSOR, zswap
doesn't need to make any bounce buffer copy so that no existing
zsmalloc user will see performance regression.

>   * @map: map a handle.
>   * @unmap: unmap a handle.
>   * @total_size: get total size of a pool.
> @@ -100,6 +101,7 @@ struct zpool_driver {
>  	int (*shrink)(void *pool, unsigned int pages,
>  			unsigned int *reclaimed);
>
> +	bool sleep_mapped;
>  	void *(*map)(void *pool, unsigned long handle,
>  			enum zpool_mapmode mm);
>  	void (*unmap)(void *pool, unsigned long handle);
> @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver);
>  int zpool_unregister_driver(struct zpool_driver *driver);
>
>  bool zpool_evictable(struct zpool *pool);
> +bool zpool_can_sleep_mapped(struct zpool *pool);
>
>  #endif
> diff --git a/mm/zpool.c b/mm/zpool.c
> index 3744a2d..5ed7120 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -23,6 +23,7 @@ struct zpool {
>  	void *pool;
>  	const struct zpool_ops *ops;
>  	bool evictable;
> +	bool can_sleep_mapped;
>
>  	struct list_head list;
>  };
> @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
>  	zpool->pool = driver->create(name, gfp, ops, zpool);
>  	zpool->ops = ops;
>  	zpool->evictable = driver->shrink && ops && ops->evict;
> +	zpool->can_sleep_mapped = driver->sleep_mapped;
>
>  	if (!zpool->pool) {
>  		pr_err("couldn't create %s pool\n", type);
> @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool)
>  	return zpool->evictable;
>  }
>
> +/**
> + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
> + * @zpool: The zpool to test
> + *
> + * Returns: true if zpool can sleep; false otherwise.
> + */
> +bool zpool_can_sleep_mapped(struct zpool *zpool)
> +{
> +	return zpool->can_sleep_mapped;
> +}
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
>  MODULE_DESCRIPTION("Common API for compressed memory storage");
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 182f6ad..67d4555 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -935,13 +935,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>  	struct scatterlist input, output;
>  	struct crypto_acomp_ctx *acomp_ctx;
>
> -	u8 *src;
> +	u8 *src, *tmp;
>  	unsigned int dlen;
>  	int ret;
>  	struct writeback_control wbc = {
>  		.sync_mode = WB_SYNC_NONE,
>  	};
>
> +	if (!zpool_can_sleep_mapped(pool)) {
> +
> +		tmp = kmalloc(entry->length, GFP_ATOMIC);
> +		if (!tmp)
> +			return -ENOMEM;
> +	}
> +
>  	/* extract swpentry from data */
>  	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
>  	swpentry = zhdr->swpentry; /* here */
> @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>  	dlen = PAGE_SIZE;
>  	src = (u8 *)zhdr + sizeof(struct zswap_header);
>
> +	if (!zpool_can_sleep_mapped(pool)) {
> +
> +		memcpy(tmp, src, entry->length);
> +		src = tmp;
> +
> +		zpool_unmap_handle(pool, handle);
> +	}
> +
>  	mutex_lock(acomp_ctx->mutex);
>  	sg_init_one(&input, src, entry->length);
>  	sg_init_table(&output, 1);
> @@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>  	spin_unlock(&tree->lock);
>
>  end:
> -	zpool_unmap_handle(pool, handle);
> +	if (zpool_can_sleep_mapped(pool))
> +		zpool_unmap_handle(pool, handle);
> +	else
> +		kfree(tmp);
> +
>  	return ret;
>  }
>
> @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  	struct zswap_entry *entry;
>  	struct scatterlist input, output;
>  	struct crypto_acomp_ctx *acomp_ctx;
> -	u8 *src, *dst;
> +	u8 *src, *dst, *tmp;
>  	unsigned int dlen;
>  	int ret;
>
> @@ -1256,12 +1275,29 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  		goto freeentry;
>  	}
>
> +	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +
> +		tmp = kmalloc(entry->length, GFP_ATOMIC);
> +		if (!tmp) {
> +			ret = -ENOMEM;
> +			goto freeentry;
> +		}
> +	}
> +
>  	/* decompress */
>  	dlen = PAGE_SIZE;
>  	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
>  	if (zpool_evictable(entry->pool->zpool))
>  		src += sizeof(struct zswap_header);
>
> +	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +
> +		memcpy(tmp, src, entry->length);
> +		src = tmp;
> +
> +		zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +	}
> +
>  	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>  	mutex_lock(acomp_ctx->mutex);
>  	sg_init_one(&input, src, entry->length);
> @@ -1271,7 +1307,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
>  	mutex_unlock(acomp_ctx->mutex);
>
> -	zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +	if (zpool_can_sleep_mapped(entry->pool->zpool))
> +		zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +	else
> +		kfree(tmp);
> +
>  	BUG_ON(ret);
>
>  freeentry:
> @@ -1279,7 +1319,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  	zswap_entry_put(tree, entry);
>  	spin_unlock(&tree->lock);
>
> -	return 0;
> +	return ret;
>  }
>
>  /* frees an entry in zswap */
> --
> 2.7.4
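For context, the problematic sequence the commit message refers to looks roughly like this in zswap's load path (simplified and annotated from the quoted patch; the comments are annotations, not part of the patch). With zsmalloc, the window between zpool_map_handle() and zpool_unmap_handle() is an atomic section, while the mutex and the acomp request taken in between may sleep:

    /* simplified sketch of the pre-existing zswap_frontswap_load() flow */
    src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
                                              /* zsmalloc: returns in atomic context */
    acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
    mutex_lock(acomp_ctx->mutex);             /* may sleep */
    sg_init_one(&input, src, entry->length);
    sg_init_table(&output, 1);
    ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
                          &acomp_ctx->wait);  /* may sleep waiting for completion */
    mutex_unlock(acomp_ctx->mutex);
    zpool_unmap_handle(entry->pool->zpool, entry->handle);

The patch sidesteps this for backends that cannot sleep while mapped by copying the compressed data into a GFP_ATOMIC bounce buffer and unmapping the handle before the mutex is taken.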
On Thu, Jan 14, 2021 at 7:29 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Dec 25, 2020 at 07:02:50PM +0800, Tian Tao wrote:
> > add a flag to zpool, named is "can_sleep_mapped", and have it set true
> > for zbud/z3fold, set false for zsmalloc. Then zswap could go the current
> > path if the flag is true; and if it's false, copy data from src to a
> > temporary buffer, then unmap the handle, take the mutex, process the
> > buffer instead of src to avoid sleeping function called from atomic
> > context.
> >
> > [...]
> >
> > + * @sleep_mapped: whether zpool driver can sleep during map.
>
> I don't think it's a good idea. It just breaks zpool abstraction
> in that it exposes internal implementation to user to avoid issue
> zswap recently introduced. It also conflicts zpool_map_handle's
> semantic.
>
> Rather than introducing another break in zpool due to the new
> zswap feature recently introduced, zswap could introduce
> CONFIG_ZSWAP_HW_COMPRESSOR. Once it's configured, zsmalloc could
> be disabled. And with disabling CONFIG_ZSWAP_HW_COMPRESSOR, zswap
> doesn't need to make any bounce buffer copy so that no existing
> zsmalloc user will see performance regression.

I believe it won't help that much -- the new compressor API presumes
that the caller may sleep during compression and that will be an
accident waiting to happen as long as we use it and keep the handle
mapped in the zsmalloc case.

Or maybe I interpreted you wrong and you are suggesting re-introducing
calls to the old API under this #ifdef, is that the case?

Best regards,
Vitaly
On Thu, Jan 14, 2021 at 10:29 AM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Dec 25, 2020 at 07:02:50PM +0800, Tian Tao wrote:
> > add a flag to zpool, named is "can_sleep_mapped", and have it set true
> > for zbud/z3fold, set false for zsmalloc. [...]
>
> I don't think it's a good idea. It just breaks zpool abstraction
> in that it exposes internal implementation to user to avoid issue
> zswap recently introduced. It also conflicts zpool_map_handle's
> semantic.
>
> Rather than introducing another break in zpool due to the new
> zswap feature recently introduced, zswap could introduce
> CONFIG_ZSWAP_HW_COMPRESSOR. Once it's configured, zsmalloc could
> be disabled. And with disabling CONFIG_ZSWAP_HW_COMPRESSOR, zswap
> doesn't need to make any bounce buffer copy so that no existing
> zsmalloc user will see performance regression.
>

I agree with Minchan. There is no reason to add extra overhead for
configurations where there is no hardware available.
On Thu, Jan 14, 2021 at 7:43 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Jan 14, 2021 at 10:29 AM Minchan Kim <minchan@kernel.org> wrote:
> >
> > [...]
> >
> > Rather than introducing another break in zpool due to the new
> > zswap feature recently introduced, zswap could introduce
> > CONFIG_ZSWAP_HW_COMPRESSOR. Once it's configured, zsmalloc could
> > be disabled. And with disabling CONFIG_ZSWAP_HW_COMPRESSOR, zswap
> > doesn't need to make any bounce buffer copy so that no existing
> > zsmalloc user will see performance regression.
>
> I agree with Minchan. There is no reason to add extra overhead for
> configurations where there is no hardware available.

I still don't get what the suggested alternative is. How would
disabling CONFIG_ZSWAP_HW_COMPRESSOR help if zswap is using the acomp
API?

Best regards,
Vitaly
On Thu, Jan 14, 2021 at 07:40:50PM +0100, Vitaly Wool wrote:
> On Thu, Jan 14, 2021 at 7:29 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > [...]
> >
> > Rather than introducing another break in zpool due to the new
> > zswap feature recently introduced, zswap could introduce
> > CONFIG_ZSWAP_HW_COMPRESSOR. [...]
>
> I believe it won't help that much -- the new compressor API presumes
> that the caller may sleep during compression and that will be an
> accident waiting to happen as long as we use it and keep the handle
> mapped in the zsmalloc case.
>
> Or maybe I interpreted you wrong and you are suggesting re-introducing
> calls to the old API under this #ifdef, is that the case?

Yub. zswap could abstract that part under #ifdef to keep old behavior.
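For illustration, the #ifdef split being discussed could look roughly like the following in zswap's decompress path. This is only a sketch, not code from the patch or from Minchan: CONFIG_ZSWAP_HW_COMPRESSOR is a proposed option name, and the #else branch assumes zswap keeps the pre-acomp crypto_comp path (and the per-cpu tfm field it used) for the software-only case:

    #ifdef CONFIG_ZSWAP_HW_COMPRESSOR
        /* acomp path: mutex_lock() and crypto_wait_req() may sleep,
         * so a bounce buffer is needed when the backend is zsmalloc */
        acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
        mutex_lock(acomp_ctx->mutex);
        sg_init_one(&input, src, entry->length);
        sg_init_table(&output, 1);
        sg_set_page(&output, page, PAGE_SIZE, 0);
        acomp_request_set_params(acomp_ctx->req, &input, &output,
                                 entry->length, dlen);
        ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
                              &acomp_ctx->wait);
        mutex_unlock(acomp_ctx->mutex);
    #else
        /* legacy software path: synchronous and non-sleeping, so it can
         * run with the zsmalloc handle still mapped */
        dst = kmap_atomic(page);
        tfm = *get_cpu_ptr(entry->pool->tfm);
        ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
        put_cpu_ptr(entry->pool->tfm);
        kunmap_atomic(dst);
    #endif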
On Thu, Jan 14, 2021 at 7:56 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, Jan 14, 2021 at 07:40:50PM +0100, Vitaly Wool wrote:
> >
> > [...]
> >
> > Or maybe I interpreted you wrong and you are suggesting re-introducing
> > calls to the old API under this #ifdef, is that the case?
>
> Yub. zswap could abstract that part under #ifdef to keep old behavior.

We can reconsider this option when zsmalloc implements reclaim
callback. So far it's obviously too much a mess for a reason so weak.
On Thu, Jan 14, 2021 at 11:05 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
>
> On Thu, Jan 14, 2021 at 7:56 PM Minchan Kim <minchan@kernel.org> wrote:
> >
> > [...]
> >
> > Yub. zswap could abstract that part under #ifdef to keep old behavior.
>
> We can reconsider this option when zsmalloc implements reclaim
> callback. So far it's obviously too much a mess for a reason so weak.

Sorry I don't understand the link between zsmalloc implementing shrink
callback and this patch. This patch is adding an overhead for all
zswap+zsmalloc users irrespective of availability of hardware. If we
want to add support for new hardware, please add without impacting the
current users.

Shakeel
On Thu, Jan 14, 2021 at 11:21:19AM -0800, Shakeel Butt wrote:
> On Thu, Jan 14, 2021 at 11:05 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
> >
> > [...]
> >
> > We can reconsider this option when zsmalloc implements reclaim
> > callback. So far it's obviously too much a mess for a reason so weak.
>
> Sorry I don't understand the link between zsmalloc implementing shrink
> callback and this patch. This patch is adding an overhead for all
> zswap+zsmalloc users irrespective of availability of hardware. If we
> want to add support for new hardware, please add without impacting the
> current users.

Furthermore, please don't make a mess of the zpool semantics.
On Thu, Jan 14, 2021 at 8:21 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Jan 14, 2021 at 11:05 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
> >
> > [...]
> >
> > We can reconsider this option when zsmalloc implements reclaim
> > callback. So far it's obviously too much a mess for a reason so weak.
>
> Sorry I don't understand the link between zsmalloc implementing shrink
> callback and this patch.

There is none. There is a link between taking on all the burden of
reviving zsmalloc for zswap, at the cost of extra zswap complexity, and
zsmalloc not being fully zswap compatible.

The ultimate zswap goal is to cache the hottest swapped-out pages in a
compressed format. zsmalloc doesn't implement a reclaim callback, and
therefore zswap can *not* fulfill its goal since old pages are there
to stay, and it can't accept new hotter ones. So, let me make it
clear: the zswap/zsmalloc combo is a legacy corner case.

> This patch is adding an overhead for all
> zswap+zsmalloc users irrespective of availability of hardware. If we
> want to add support for new hardware, please add without impacting the
> current users.

No, it's not like that. The zswap+zsmalloc combination is currently
already broken, and this patch implements a way to work around that.
The users are already impacted and that is of course a problem. The
workaround is not perfect but looks good enough for me.

The suggested messy #ifdef-based alternative will turn zswap into
something really tricky to understand and maintain and therefore is
not going to work.

The best possible thing here would be for zsmalloc to stop taking a
spinlock in the map() callback and releasing it in unmap() (it _is_
legit -- I don't argue that -- but it does look weird and goes against
the recent efforts to make the Linux kernel more realtime and generally
more responsive; moreover, it may just be a way to conceal some race
conditions which could otherwise be easily fixed).

Best regards,
Vitaly
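The map()/unmap() behaviour referred to above follows this pattern (a simplified sketch, not the actual zsmalloc code; the pool structure and lookup helper are invented for illustration):

    /* a backend whose map() returns with a lock held */
    void *backend_map(struct backend_pool *pool, unsigned long handle)
    {
        spin_lock(&pool->lock);           /* taken here ...          */
        return decode_object_address(pool, handle);
    }

    void backend_unmap(struct backend_pool *pool, unsigned long handle)
    {
        spin_unlock(&pool->lock);         /* ... released only here  */
    }

Everything a caller does between backend_map() and backend_unmap() therefore runs with the lock held and must not sleep, which is exactly what zpool_can_sleep_mapped() is meant to report.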
On Thu, Jan 14, 2021 at 11:53 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
>
> On Thu, Jan 14, 2021 at 8:21 PM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > [...]
> >
> > Sorry I don't understand the link between zsmalloc implementing shrink
> > callback and this patch.
>
> There is none. There is a link between taking on all the burden of
> reviving zsmalloc for zswap, at the cost of extra zswap complexity, and
> zsmalloc not being fully zswap compatible.
>
> The ultimate zswap goal is to cache the hottest swapped-out pages in a
> compressed format. zsmalloc doesn't implement a reclaim callback, and
> therefore zswap can *not* fulfill its goal since old pages are there
> to stay, and it can't accept new hotter ones. So, let me make it
> clear: the zswap/zsmalloc combo is a legacy corner case.
>

This is the first time I am hearing that the zswap/zsmalloc combo is a
legacy configuration. We use zswap in production and intentionally
size the zswap pool to never have to go to the backing device. So the
absence of a reclaim callback is not an issue for us. Please let me
know if the zswap/zsmalloc combo is officially on its way to
deprecation.

> > This patch is adding an overhead for all
> > zswap+zsmalloc users irrespective of availability of hardware. If we
> > want to add support for new hardware, please add without impacting the
> > current users.
>
> No, it's not like that. The zswap+zsmalloc combination is currently
> already broken

By broken do you mean the missing reclaim callback?

> and this patch implements a way to work around that.
> The users are already impacted and that is of course a problem.

Are you talking about rt users or was there some other report?

> The workaround is not perfect but looks good enough for me.

I would like to see a page fault perf experiment for the non-hardware case.
On Thu, Jan 14, 2021 at 10:09 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Jan 14, 2021 at 11:53 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote:
> >
> > [...]
> >
> > So, let me make it clear: the zswap/zsmalloc combo is a legacy corner case.
>
> This is the first time I am hearing that the zswap/zsmalloc combo is a
> legacy configuration. We use zswap in production and intentionally
> size the zswap pool to never have to go to the backing device. So the
> absence of a reclaim callback is not an issue for us. Please let me
> know if the zswap/zsmalloc combo is officially on its way to
> deprecation.

No, the zswap/zsmalloc combo is not on the way to deprecation. I
generally would not advise using it, but your particular case does make
sense (although using frontswap/zswap without a backing device *is* a
corner case).

> > No, it's not like that. The zswap+zsmalloc combination is currently
> > already broken
>
> By broken do you mean the missing reclaim callback?

No. I mean deadlocks in the -rt kernel / "scheduling while atomic" bugs
in the mainline. The missing reclaim callback goes into the "not fully
compatible" section in my world.

> > and this patch implements a way to work around that.
> > The users are already impacted and that is of course a problem.
>
> Are you talking about rt users or was there some other report?

I don't think the issue is specific to the -rt series. There may (and
will) still be "scheduling while atomic" bugs occurring, just not as
often.

> > The workaround is not perfect but looks good enough for me.
>
> I would like to see a page fault perf experiment for the non-hardware case.

I second that. @tiantao (H), would it be possible to provide one?

Also, correct me if I'm wrong but from what I recall, the acomp API
does one redundant copy less than the old comp API, so zsmalloc should
be back to square one even with the buffer implemented in this patch.
The other backends should do a little better, though; if so, it's the
upside of not taking too many spinlocks.

Best regards,
Vitaly
在 2021/1/15 6:41, Vitaly Wool 写道: > On Thu, Jan 14, 2021 at 10:09 PM Shakeel Butt <shakeelb@google.com> wrote: >> On Thu, Jan 14, 2021 at 11:53 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote: >>> On Thu, Jan 14, 2021 at 8:21 PM Shakeel Butt <shakeelb@google.com> wrote: >>>> On Thu, Jan 14, 2021 at 11:05 AM Vitaly Wool <vitaly.wool@konsulko.com> wrote: >>>>> On Thu, Jan 14, 2021 at 7:56 PM Minchan Kim <minchan@kernel.org> wrote: >>>>>> On Thu, Jan 14, 2021 at 07:40:50PM +0100, Vitaly Wool wrote: >>>>>>> On Thu, Jan 14, 2021 at 7:29 PM Minchan Kim <minchan@kernel.org> wrote: >>>>>>>> On Fri, Dec 25, 2020 at 07:02:50PM +0800, Tian Tao wrote: >>>>>>>>> add a flag to zpool, named is "can_sleep_mapped", and have it set true >>>>>>>>> for zbud/z3fold, set false for zsmalloc. Then zswap could go the current >>>>>>>>> path if the flag is true; and if it's false, copy data from src to a >>>>>>>>> temporary buffer, then unmap the handle, take the mutex, process the >>>>>>>>> buffer instead of src to avoid sleeping function called from atomic >>>>>>>>> context. >>>>>>>>> >>>>>>>>> Signed-off-by: Tian Tao <tiantao6@hisilicon.com> >>>>>>>>> --- >>>>>>>>> include/linux/zpool.h | 3 +++ >>>>>>>>> mm/zpool.c | 13 +++++++++++++ >>>>>>>>> mm/zswap.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- >>>>>>>>> 3 files changed, 61 insertions(+), 5 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/include/linux/zpool.h b/include/linux/zpool.h >>>>>>>>> index 51bf430..e899701 100644 >>>>>>>>> --- a/include/linux/zpool.h >>>>>>>>> +++ b/include/linux/zpool.h >>>>>>>>> @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); >>>>>>>>> * @malloc: allocate mem from a pool. >>>>>>>>> * @free: free mem from a pool. >>>>>>>>> * @shrink: shrink the pool. >>>>>>>>> + * @sleep_mapped: whether zpool driver can sleep during map. >>>>>>>> I don't think it's a good idea. It just breaks zpool abstraction >>>>>>>> in that it exposes internal implementation to user to avoid issue >>>>>>>> zswap recently introduced. It also conflicts zpool_map_handle's >>>>>>>> semantic. >>>>>>>> >>>>>>>> Rather than introducing another break in zpool due to the new >>>>>>>> zswap feature recenlty introduced, zswap could introduce >>>>>>>> CONFIG_ZSWAP_HW_COMPRESSOR. Once it's configured, zsmalloc could >>>>>>>> be disabled. And with disabling CONFIG_ZSWAP_HW_COMPRESSOR, zswap >>>>>>>> doesn't need to make any bounce buffer copy so that no existing >>>>>>>> zsmalloc user will see performance regression. >>>>>>> I believe it won't help that much -- the new compressor API presumes >>>>>>> that the caller may sleep during compression and that will be an >>>>>>> accident waiting to happen as long as we use it and keep the handle >>>>>>> mapped in zsmalloc case. >>>>>>> >>>>>>> Or maybe I interpreted you wrong and you are suggesting re-introducing >>>>>>> calls to the old API under this #ifdef, is that the case? >>>>>> Yub. zswap could abstract that part under #ifdef to keep old behavior. >>>>> We can reconsider this option when zsmalloc implements reclaim >>>>> callback. So far it's obviously too much a mess for a reason so weak. >>>>> >>>> Sorry I don't understand the link between zsmalloc implementing shrink >>>> callback and this patch. >>> There is none. There is a link between taking all the burden to revive >>> zsmalloc for zswap at the cost of extra zswap complexity and zsmalloc >>> not being fully zswap compatible. >>> >>> The ultimate zswap goal is to cache hottest swapped-out pages in a >>> compressed format. 
zsmalloc doesn't implement reclaim callback, and >>> therefore zswap can *not* fulfill its goal since old pages are there >>> to stay, and it can't accept new hotter ones. So, let me make it >>> clear: zswap/zsmalloc combo is a legacy corner case. >>> >> This is the first time I am hearing that zswap/zsmalloc combo is a >> legacy configuration. We use zswap in production and intentionally >> size the zswap pool to never have to go to the backing device. So >> absence of reclaim callback is not an issue for us. Please let me know >> if the zswap/zsmalloc combo is officially on its way to deprecation. > No, zswap/zsmalloc combo not on the way to deprecation. I generally > would not advise on using it but your particular case does make sense > (although using frontswap/zswap without a backing device *is* a corner > case). > >>>> This patch is adding an overhead for all >>>> zswap+zsmalloc users irrespective of availability of hardware. If we >>>> want to add support for new hardware, please add without impacting the >>>> current users. >>> No, it's not like that. zswap+zsmalloc combination is currently >>> already broken >> By broken do you mean the missing reclaim callback? > No. I mean deadlocks in -rt kernel / scheduling while atomic bugs in > the mainline. Missing reclaim callback goes into "not fully > compatible" section in my world. > >>> and this patch implements a way to work that around. >>> The users are already impacted and that is of course a problem. >> Are you talking about rt users or was there some other report? > I don't think the issue is specific to -rt series. There may (and > will) still be "scheduling while atomic" bugs occurring, just not as > often. > >>> The workaround is not perfect but looks good enough for me. >> I would like a see page fault perf experiment for the non-hardware case. > I second that. @tiantao (H), would it be possible to provide one? No problem, but can you tell how to provide the data you want for page fault? > > Also, correct me if I'm wrong but from what I recall, the acomp API > does one redundant copy less than the old comp API so zsmalloc should > be back to square one even with the buffer implemented in this patch. > The other backends should do a little better though but if so, it's > the upside of not taking too many spinlocks. > > Best regards, > Vitaly
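One possible way to produce the numbers being asked for (a suggestion only; nothing in the thread prescribes a specific method) is to force a large anonymous buffer out to swap with zswap enabled, then time the faults that bring it back in, once per allocator and once per kernel (with and without this patch), running under something like "perf stat -e page-faults,major-faults". A minimal userspace sketch; MADV_PAGEOUT needs Linux >= 5.4 and may have to be defined by hand on older glibc:

	/* swapin-bench.c -- hypothetical microbenchmark, not part of this series */
	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>

	#ifndef MADV_PAGEOUT
	#define MADV_PAGEOUT 21
	#endif

	#define SZ (512UL << 20)	/* 512 MiB */

	int main(void)
	{
		unsigned char *buf;
		struct timespec t0, t1;
		unsigned long i, sum = 0;

		buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* non-uniform but compressible data, so zswap's same-filled
		 * page shortcut does not bypass the compressor entirely */
		for (i = 0; i < SZ; i++)
			buf[i] = (unsigned char)(i ^ (i >> 8));
		if (madvise(buf, SZ, MADV_PAGEOUT))	/* push pages out to (z)swap */
			perror("madvise(MADV_PAGEOUT)");

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < SZ; i += 4096)		/* fault everything back in */
			sum += buf[i];
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("swap-in of %lu MiB: %.3f s (checksum %lu)\n", SZ >> 20,
		       (t1.tv_sec - t0.tv_sec) +
		       (t1.tv_nsec - t0.tv_nsec) / 1e9, sum);
		return 0;
	}

Comparing zbud against zsmalloc on the same kernel, and zsmalloc before/after the patch, should show whether the bounce buffer is measurable in the fault path.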
On Fri, Dec 25, 2020 at 12:02 PM Tian Tao <tiantao6@hisilicon.com> wrote: > > add a flag to zpool, named is "can_sleep_mapped", and have it set true > for zbud/z3fold, set false for zsmalloc. Then zswap could go the current > path if the flag is true; and if it's false, copy data from src to a > temporary buffer, then unmap the handle, take the mutex, process the > buffer instead of src to avoid sleeping function called from atomic > context. > > Signed-off-by: Tian Tao <tiantao6@hisilicon.com> > --- > include/linux/zpool.h | 3 +++ > mm/zpool.c | 13 +++++++++++++ > mm/zswap.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- > 3 files changed, 61 insertions(+), 5 deletions(-) > > diff --git a/include/linux/zpool.h b/include/linux/zpool.h > index 51bf430..e899701 100644 > --- a/include/linux/zpool.h > +++ b/include/linux/zpool.h > @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); > * @malloc: allocate mem from a pool. > * @free: free mem from a pool. > * @shrink: shrink the pool. > + * @sleep_mapped: whether zpool driver can sleep during map. > * @map: map a handle. > * @unmap: unmap a handle. > * @total_size: get total size of a pool. > @@ -100,6 +101,7 @@ struct zpool_driver { > int (*shrink)(void *pool, unsigned int pages, > unsigned int *reclaimed); > > + bool sleep_mapped; > void *(*map)(void *pool, unsigned long handle, > enum zpool_mapmode mm); > void (*unmap)(void *pool, unsigned long handle); > @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); > int zpool_unregister_driver(struct zpool_driver *driver); > > bool zpool_evictable(struct zpool *pool); > +bool zpool_can_sleep_mapped(struct zpool *pool); > > #endif > diff --git a/mm/zpool.c b/mm/zpool.c > index 3744a2d..5ed7120 100644 > --- a/mm/zpool.c > +++ b/mm/zpool.c > @@ -23,6 +23,7 @@ struct zpool { > void *pool; > const struct zpool_ops *ops; > bool evictable; > + bool can_sleep_mapped; > > struct list_head list; > }; > @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, > zpool->pool = driver->create(name, gfp, ops, zpool); > zpool->ops = ops; > zpool->evictable = driver->shrink && ops && ops->evict; > + zpool->can_sleep_mapped = driver->sleep_mapped; > > if (!zpool->pool) { > pr_err("couldn't create %s pool\n", type); > @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) > return zpool->evictable; > } > > +/** > + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. > + * @zpool: The zpool to test > + * > + * Returns: true if zpool can sleep; false otherwise. > + */ > +bool zpool_can_sleep_mapped(struct zpool *zpool) > +{ > + return zpool->can_sleep_mapped; > +} > + > MODULE_LICENSE("GPL"); > MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); > MODULE_DESCRIPTION("Common API for compressed memory storage"); > diff --git a/mm/zswap.c b/mm/zswap.c > index 182f6ad..67d4555 100644 > --- a/mm/zswap.c > +++ b/mm/zswap.c > @@ -935,13 +935,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) > struct scatterlist input, output; > struct crypto_acomp_ctx *acomp_ctx; > > - u8 *src; > + u8 *src, *tmp; > unsigned int dlen; > int ret; > struct writeback_control wbc = { > .sync_mode = WB_SYNC_NONE, > }; > > + if (!zpool_can_sleep_mapped(pool)) { > + > + tmp = kmalloc(entry->length, GFP_ATOMIC); This has escaped my attention, but this is obviously a bug. 'entry' is not initialized at this point. 
You either have to move memory allocation further down when entry->length starts making sense, or allocate PAGE_SIZE. Best regards, Vitaly > + if (!tmp) > + return -ENOMEM; > + } > + > /* extract swpentry from data */ > zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); > swpentry = zhdr->swpentry; /* here */ > @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) > dlen = PAGE_SIZE; > src = (u8 *)zhdr + sizeof(struct zswap_header); > > + if (!zpool_can_sleep_mapped(pool)) { > + > + memcpy(tmp, src, entry->length); > + src = tmp; > + > + zpool_unmap_handle(pool, handle); > + } > + > mutex_lock(acomp_ctx->mutex); > sg_init_one(&input, src, entry->length); > sg_init_table(&output, 1); > @@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) > spin_unlock(&tree->lock); > > end: > - zpool_unmap_handle(pool, handle); > + if (zpool_can_sleep_mapped(pool)) > + zpool_unmap_handle(pool, handle); > + else > + kfree(tmp); > + > return ret; > } > > @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, > struct zswap_entry *entry; > struct scatterlist input, output; > struct crypto_acomp_ctx *acomp_ctx; > - u8 *src, *dst; > + u8 *src, *dst, *tmp; > unsigned int dlen; > int ret; > > @@ -1256,12 +1275,29 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, > goto freeentry; > } > > + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { > + > + tmp = kmalloc(entry->length, GFP_ATOMIC); > + if (!tmp) { > + ret = -ENOMEM; > + goto freeentry; > + } > + } > + > /* decompress */ > dlen = PAGE_SIZE; > src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); > if (zpool_evictable(entry->pool->zpool)) > src += sizeof(struct zswap_header); > > + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { > + > + memcpy(tmp, src, entry->length); > + src = tmp; > + > + zpool_unmap_handle(entry->pool->zpool, entry->handle); > + } > + > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > mutex_lock(acomp_ctx->mutex); > sg_init_one(&input, src, entry->length); > @@ -1271,7 +1307,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, > ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); > mutex_unlock(acomp_ctx->mutex); > > - zpool_unmap_handle(entry->pool->zpool, entry->handle); > + if (zpool_can_sleep_mapped(entry->pool->zpool)) > + zpool_unmap_handle(entry->pool->zpool, entry->handle); > + else > + kfree(tmp); > + > BUG_ON(ret); > > freeentry: > @@ -1279,7 +1319,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, > zswap_entry_put(tree, entry); > spin_unlock(&tree->lock); > > - return 0; > + return ret; > } > > /* frees an entry in zswap */ > -- > 2.7.4 >
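To make the two options concrete, the simpler one is a fixed-size buffer allocated before the entry is known; an untested sketch only meant to illustrate the ordering (GFP_ATOMIC kept as in the patch):

	u8 *src, *tmp = NULL;

	if (!zpool_can_sleep_mapped(pool)) {
		/*
		 * 'entry' is only looked up further down, after the
		 * swp_entry has been read from the mapped header, so
		 * entry->length cannot be used here.  A compressed
		 * object never exceeds PAGE_SIZE, so that is a safe
		 * upper bound for the bounce buffer.
		 */
		tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
		if (!tmp)
			return -ENOMEM;
	}

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	/* ... rest of zswap_writeback_entry() unchanged ... */

The alternative, moving the kmalloc() down to after the tree lookup, avoids the over-allocation but needs care with the error paths that can jump to the common exit before 'tmp' has been allocated.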
> -----Original Message----- > From: Vitaly Wool [mailto:vitaly.wool@konsulko.com] > Sent: Thursday, January 21, 2021 10:17 PM > To: tiantao (H) <tiantao6@hisilicon.com> > Cc: Seth Jennings <sjenning@redhat.com>; Dan Streetman <ddstreet@ieee.org>; > Andrew Morton <akpm@linux-foundation.org>; Song Bao Hua (Barry Song) > <song.bao.hua@hisilicon.com>; Linux-MM <linux-mm@kvack.org> > Subject: Re: [RFC mm/zswap 1/2] mm/zswap: add the flag can_sleep_mapped > > On Fri, Dec 25, 2020 at 12:02 PM Tian Tao <tiantao6@hisilicon.com> wrote: > > > > add a flag to zpool, named is "can_sleep_mapped", and have it set true > > for zbud/z3fold, set false for zsmalloc. Then zswap could go the current > > path if the flag is true; and if it's false, copy data from src to a > > temporary buffer, then unmap the handle, take the mutex, process the > > buffer instead of src to avoid sleeping function called from atomic > > context. > > > > Signed-off-by: Tian Tao <tiantao6@hisilicon.com> > > --- > > include/linux/zpool.h | 3 +++ > > mm/zpool.c | 13 +++++++++++++ > > mm/zswap.c | 50 > +++++++++++++++++++++++++++++++++++++++++++++----- > > 3 files changed, 61 insertions(+), 5 deletions(-) > > > > diff --git a/include/linux/zpool.h b/include/linux/zpool.h > > index 51bf430..e899701 100644 > > --- a/include/linux/zpool.h > > +++ b/include/linux/zpool.h > > @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); > > * @malloc: allocate mem from a pool. > > * @free: free mem from a pool. > > * @shrink: shrink the pool. > > + * @sleep_mapped: whether zpool driver can sleep during map. > > * @map: map a handle. > > * @unmap: unmap a handle. > > * @total_size: get total size of a pool. > > @@ -100,6 +101,7 @@ struct zpool_driver { > > int (*shrink)(void *pool, unsigned int pages, > > unsigned int *reclaimed); > > > > + bool sleep_mapped; > > void *(*map)(void *pool, unsigned long handle, > > enum zpool_mapmode mm); > > void (*unmap)(void *pool, unsigned long handle); > > @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); > > int zpool_unregister_driver(struct zpool_driver *driver); > > > > bool zpool_evictable(struct zpool *pool); > > +bool zpool_can_sleep_mapped(struct zpool *pool); > > > > #endif > > diff --git a/mm/zpool.c b/mm/zpool.c > > index 3744a2d..5ed7120 100644 > > --- a/mm/zpool.c > > +++ b/mm/zpool.c > > @@ -23,6 +23,7 @@ struct zpool { > > void *pool; > > const struct zpool_ops *ops; > > bool evictable; > > + bool can_sleep_mapped; > > > > struct list_head list; > > }; > > @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const > char *name, gfp_t gfp, > > zpool->pool = driver->create(name, gfp, ops, zpool); > > zpool->ops = ops; > > zpool->evictable = driver->shrink && ops && ops->evict; > > + zpool->can_sleep_mapped = driver->sleep_mapped; > > > > if (!zpool->pool) { > > pr_err("couldn't create %s pool\n", type); > > @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) > > return zpool->evictable; > > } > > > > +/** > > + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. > > + * @zpool: The zpool to test > > + * > > + * Returns: true if zpool can sleep; false otherwise. 
> > + */ > > +bool zpool_can_sleep_mapped(struct zpool *zpool) > > +{ > > + return zpool->can_sleep_mapped; > > +} > > + > > MODULE_LICENSE("GPL"); > > MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); > > MODULE_DESCRIPTION("Common API for compressed memory storage"); > > diff --git a/mm/zswap.c b/mm/zswap.c > > index 182f6ad..67d4555 100644 > > --- a/mm/zswap.c > > +++ b/mm/zswap.c > > @@ -935,13 +935,20 @@ static int zswap_writeback_entry(struct zpool *pool, > unsigned long handle) > > struct scatterlist input, output; > > struct crypto_acomp_ctx *acomp_ctx; > > > > - u8 *src; > > + u8 *src, *tmp; > > unsigned int dlen; > > int ret; > > struct writeback_control wbc = { > > .sync_mode = WB_SYNC_NONE, > > }; > > > > + if (!zpool_can_sleep_mapped(pool)) { > > + > > + tmp = kmalloc(entry->length, GFP_ATOMIC); > > This has escaped my attention, but this is obviously a bug. 'entry' is > not initialized at this point. > You either have to move memory allocation further down when > entry->length starts making sense, or allocate PAGE_SIZE. > Since zsmalloc has no evict entry, we have never arrived here. That's why the testing by both Tiantao and Sebastian didn't show the problem. > Best regards, > Vitaly > > > + if (!tmp) > > + return -ENOMEM; > > + } > > + > > /* extract swpentry from data */ > > zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); > > swpentry = zhdr->swpentry; /* here */ > > @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, > unsigned long handle) > > dlen = PAGE_SIZE; > > src = (u8 *)zhdr + sizeof(struct zswap_header); > > > > + if (!zpool_can_sleep_mapped(pool)) { > > + > > + memcpy(tmp, src, entry->length); > > + src = tmp; > > + > > + zpool_unmap_handle(pool, handle); > > + } > > + > > mutex_lock(acomp_ctx->mutex); > > sg_init_one(&input, src, entry->length); > > sg_init_table(&output, 1); > > @@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, > unsigned long handle) > > spin_unlock(&tree->lock); > > > > end: > > - zpool_unmap_handle(pool, handle); > > + if (zpool_can_sleep_mapped(pool)) > > + zpool_unmap_handle(pool, handle); > > + else > > + kfree(tmp); > > + > > return ret; > > } > > > > @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t > offset, > > struct zswap_entry *entry; > > struct scatterlist input, output; > > struct crypto_acomp_ctx *acomp_ctx; > > - u8 *src, *dst; > > + u8 *src, *dst, *tmp; > > unsigned int dlen; > > int ret; > > > > @@ -1256,12 +1275,29 @@ static int zswap_frontswap_load(unsigned type, pgoff_t > offset, > > goto freeentry; > > } > > > > + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { > > + > > + tmp = kmalloc(entry->length, GFP_ATOMIC); > > + if (!tmp) { > > + ret = -ENOMEM; > > + goto freeentry; > > + } > > + } > > + > > /* decompress */ > > dlen = PAGE_SIZE; > > src = zpool_map_handle(entry->pool->zpool, entry->handle, > ZPOOL_MM_RO); > > if (zpool_evictable(entry->pool->zpool)) > > src += sizeof(struct zswap_header); > > > > + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { > > + > > + memcpy(tmp, src, entry->length); > > + src = tmp; > > + > > + zpool_unmap_handle(entry->pool->zpool, entry->handle); > > + } > > + > > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > > mutex_lock(acomp_ctx->mutex); > > sg_init_one(&input, src, entry->length); > > @@ -1271,7 +1307,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t > offset, > > ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), > &acomp_ctx->wait); > > mutex_unlock(acomp_ctx->mutex); 
> > > > - zpool_unmap_handle(entry->pool->zpool, entry->handle); > > + if (zpool_can_sleep_mapped(entry->pool->zpool)) > > + zpool_unmap_handle(entry->pool->zpool, entry->handle); > > + else > > + kfree(tmp); > > + > > BUG_ON(ret); > > > > freeentry: > > @@ -1279,7 +1319,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t > offset, > > zswap_entry_put(tree, entry); > > spin_unlock(&tree->lock); > > > > - return 0; > > + return ret; > > } > > > > /* frees an entry in zswap */ > > -- > > 2.7.4 > > Thanks Barry
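To spell out why nobody hit this in testing: zswap_writeback_entry() is only reachable through the zpool shrink/evict path, and zsmalloc provides no shrink callback, so the !zpool_can_sleep_mapped() branch with the uninitialised 'entry' is currently dead code for the only backend that would take it. Roughly (the assignment is quoted from the mm/zpool.c context above, the call chain is paraphrased):

	/* mm/zpool.c: a pool is only "evictable" when the backend can shrink
	 * and the zswap side passed an evict callback */
	zpool->evictable = driver->shrink && ops && ops->evict;

	/*
	 * Path that would reach the problematic allocation:
	 *   zpool_shrink()
	 *     -> driver->shrink()        (implemented by zbud/z3fold only)
	 *       -> ops->evict()          (zswap's zswap_writeback_entry())
	 *
	 * zsmalloc has no ->shrink, so the chain never starts for it,
	 * which is why neither Tian Tao's nor Sebastian's testing saw
	 * the kmalloc(entry->length, ...) before 'entry' is initialised.
	 */

The allocation still wants fixing, of course, since the branch is compiled in and any future zsmalloc shrink implementation would trip over it.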
diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 51bf430..e899701 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool); * @malloc: allocate mem from a pool. * @free: free mem from a pool. * @shrink: shrink the pool. + * @sleep_mapped: whether zpool driver can sleep during map. * @map: map a handle. * @unmap: unmap a handle. * @total_size: get total size of a pool. @@ -100,6 +101,7 @@ struct zpool_driver { int (*shrink)(void *pool, unsigned int pages, unsigned int *reclaimed); + bool sleep_mapped; void *(*map)(void *pool, unsigned long handle, enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); @@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); bool zpool_evictable(struct zpool *pool); +bool zpool_can_sleep_mapped(struct zpool *pool); #endif diff --git a/mm/zpool.c b/mm/zpool.c index 3744a2d..5ed7120 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -23,6 +23,7 @@ struct zpool { void *pool; const struct zpool_ops *ops; bool evictable; + bool can_sleep_mapped; struct list_head list; }; @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; zpool->evictable = driver->shrink && ops && ops->evict; + zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) return zpool->evictable; } +/** + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. + * @zpool: The zpool to test + * + * Returns: true if zpool can sleep; false otherwise. 
+ */ +bool zpool_can_sleep_mapped(struct zpool *zpool) +{ + return zpool->can_sleep_mapped; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zswap.c b/mm/zswap.c index 182f6ad..67d4555 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -935,13 +935,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + u8 *src, *tmp; unsigned int dlen; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + if (!zpool_can_sleep_mapped(pool)) { + + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + } + /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = PAGE_SIZE; src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) spin_unlock(&tree->lock); end: - zpool_unmap_handle(pool, handle); + if (zpool_can_sleep_mapped(pool)) + zpool_unmap_handle(pool, handle); + else + kfree(tmp); + return ret; } @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst; + u8 *src, *dst, *tmp; unsigned int dlen; int ret; @@ -1256,12 +1275,29 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, goto freeentry; } + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto freeentry; + } + } + /* decompress */ dlen = PAGE_SIZE; src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(entry->pool->zpool, entry->handle); + } + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); @@ -1271,7 +1307,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(entry->pool->zpool)) + zpool_unmap_handle(entry->pool->zpool, entry->handle); + else + kfree(tmp); + BUG_ON(ret); freeentry: @@ -1279,7 +1319,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - return 0; + return ret; } /* frees an entry in zswap */
Add a flag to zpool, named "can_sleep_mapped", and have it set true for zbud/z3fold and false for zsmalloc. zswap then takes the current path if the flag is true; if it is false, it copies data from src to a temporary buffer, unmaps the handle, takes the mutex, and processes the buffer instead of src, to avoid a sleeping function being called from atomic context. Signed-off-by: Tian Tao <tiantao6@hisilicon.com> --- include/linux/zpool.h | 3 +++ mm/zpool.c | 13 +++++++++++++ mm/zswap.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 61 insertions(+), 5 deletions(-)
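For completeness, the backend half of this series (patch 2/2, not shown on this page) would just set the new field in the zbud and z3fold driver definitions; an illustrative sketch for zbud, with the exact placement of the field assumed rather than copied from the series:

	static struct zpool_driver zbud_zpool_driver = {
		.type =		"zbud",
		.owner =	THIS_MODULE,
		.create =	zbud_zpool_create,
		.destroy =	zbud_zpool_destroy,
		.malloc =	zbud_zpool_malloc,
		.free =		zbud_zpool_free,
		.shrink =	zbud_zpool_shrink,
		.sleep_mapped =	true,	/* zbud_map() takes no lock that forbids sleeping */
		.map =		zbud_zpool_map,
		.unmap =	zbud_zpool_unmap,
		.total_size =	zbud_zpool_total_size,
	};

zsmalloc's driver is simply left without the field, so zpool_create_pool() records can_sleep_mapped = false for it and zswap takes the bounce-buffer path.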