diff mbox series

mm/zsmalloc.c: Fix zsmalloc 32-bit PAE support

Message ID 20181210142105.6750-1-rafael.tinoco@linaro.org (mailing list archive)
State New, archived
Headers show
Series mm/zsmalloc.c: Fix zsmalloc 32-bit PAE support | expand

Commit Message

Rafael David Tinoco Dec. 10, 2018, 2:21 p.m. UTC
On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the
physical frame number might be so big that zsmalloc obj encoding (to
location) will break, causing:

BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc
Read of size 4 at addr 00000000 by task mkfs.ext4/623
CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15
Hardware name: Generic DT based system
[<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)
[<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)
[<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)
[<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)
[<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)
[<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])
[<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])
[<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)
[<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)
[<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)
[<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)
[<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)
[<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)
[<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)
[<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)
[<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)
[<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)
[<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)
[<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)
[<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)
[<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)
[<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)
[<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)
[<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)
[<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)

when trying to decode (the pfn) and map the object.

That happens because one architecture might not re-define
MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For
32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will
default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,
_PFN_BITS might be wrong: which may cause obj variable to overflow if
frame number is in HIGHMEM and referencing a page above the 4GB
watermark.

commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if
not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers
and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't
used, like in the example given above.

Systems with potential for PAE exist for a long time and assuming
BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,
however it is NOT a constant anymore for x86.

SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every
architecture using zsmalloc, together with a sanity check for
MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.

Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17
Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>
---
 arch/arm/include/asm/pgtable-2level-types.h |  2 ++
 arch/arm/include/asm/pgtable-3level-types.h |  2 ++
 arch/arm64/include/asm/pgtable-types.h      |  2 ++
 arch/ia64/include/asm/page.h                |  2 ++
 arch/mips/include/asm/page.h                |  2 ++
 arch/powerpc/include/asm/mmu.h              |  2 ++
 arch/s390/include/asm/page.h                |  2 ++
 arch/sh/include/asm/page.h                  |  2 ++
 arch/sparc/include/asm/page_32.h            |  2 ++
 arch/sparc/include/asm/page_64.h            |  2 ++
 arch/x86/include/asm/pgtable-2level_types.h |  2 ++
 arch/x86/include/asm/pgtable-3level_types.h |  3 +-
 arch/x86/include/asm/pgtable_64_types.h     |  4 +--
 mm/zsmalloc.c                               | 35 +++++++++++----------
 14 files changed, 45 insertions(+), 19 deletions(-)

Comments

Robin Murphy Dec. 10, 2018, 2:35 p.m. UTC | #1
On 10/12/2018 14:21, Rafael David Tinoco wrote:
> On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the
> physical frame number might be so big that zsmalloc obj encoding (to
> location) will break, causing:
> 
> BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc
> Read of size 4 at addr 00000000 by task mkfs.ext4/623
> CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15
> Hardware name: Generic DT based system
> [<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)
> [<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)
> [<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)
> [<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)
> [<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)
> [<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])
> [<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])
> [<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)
> [<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)
> [<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)
> [<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)
> [<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)
> [<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)
> [<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)
> [<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)
> [<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)
> [<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)
> [<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)
> [<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)
> [<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)
> [<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)
> [<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)
> [<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)
> [<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)
> [<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)
> 
> when trying to decode (the pfn) and map the object.
> 
> That happens because one architecture might not re-define
> MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For
> 32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will
> default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,
> _PFN_BITS might be wrong: which may cause obj variable to overflow if
> frame number is in HIGHMEM and referencing a page above the 4GB
> watermark.
> 
> commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if
> not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers
> and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't
> used, like in the example given above.
> 
> Systems with potential for PAE exist for a long time and assuming
> BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,
> however it is NOT a constant anymore for x86.
> 
> SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every
> architecture using zsmalloc, together with a sanity check for
> MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.
> 
> Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17
> Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>
> ---
>   arch/arm/include/asm/pgtable-2level-types.h |  2 ++
>   arch/arm/include/asm/pgtable-3level-types.h |  2 ++
>   arch/arm64/include/asm/pgtable-types.h      |  2 ++
>   arch/ia64/include/asm/page.h                |  2 ++
>   arch/mips/include/asm/page.h                |  2 ++
>   arch/powerpc/include/asm/mmu.h              |  2 ++
>   arch/s390/include/asm/page.h                |  2 ++
>   arch/sh/include/asm/page.h                  |  2 ++
>   arch/sparc/include/asm/page_32.h            |  2 ++
>   arch/sparc/include/asm/page_64.h            |  2 ++
>   arch/x86/include/asm/pgtable-2level_types.h |  2 ++
>   arch/x86/include/asm/pgtable-3level_types.h |  3 +-
>   arch/x86/include/asm/pgtable_64_types.h     |  4 +--
>   mm/zsmalloc.c                               | 35 +++++++++++----------
>   14 files changed, 45 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
> index 66cb5b0e89c5..552dba411324 100644
> --- a/arch/arm/include/asm/pgtable-2level-types.h
> +++ b/arch/arm/include/asm/pgtable-2level-types.h
> @@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;
>   
>   #endif /* STRICT_MM_TYPECHECKS */
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 32
> +
>   #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */
> diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h
> index 921aa30259c4..664c39e6517c 100644
> --- a/arch/arm/include/asm/pgtable-3level-types.h
> +++ b/arch/arm/include/asm/pgtable-3level-types.h
> @@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;
>   
>   #endif	/* STRICT_MM_TYPECHECKS */
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 36

Nit: with LPAE, physical addresses go up to 40 bits, not just 36.

Robin.

> +
>   #endif	/* _ASM_PGTABLE_3LEVEL_TYPES_H */
> diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
> index 345a072b5856..45c3834eb4c8 100644
> --- a/arch/arm64/include/asm/pgtable-types.h
> +++ b/arch/arm64/include/asm/pgtable-types.h
> @@ -64,4 +64,6 @@ typedef struct { pteval_t pgprot; } pgprot_t;
>   #include <asm-generic/5level-fixup.h>
>   #endif
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_ARM64_PA_BITS
> +
>   #endif	/* __ASM_PGTABLE_TYPES_H */
> diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
> index 5798bd2b462c..a3e055979e46 100644
> --- a/arch/ia64/include/asm/page.h
> +++ b/arch/ia64/include/asm/page.h
> @@ -235,4 +235,6 @@ get_order (unsigned long size)
>   
>   #define __HAVE_ARCH_GATE_AREA	1
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 50
> +
>   #endif /* _ASM_IA64_PAGE_H */
> diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
> index e8cc328fce2d..f6a5dea1a66c 100644
> --- a/arch/mips/include/asm/page.h
> +++ b/arch/mips/include/asm/page.h
> @@ -263,4 +263,6 @@ extern int __virt_addr_valid(const volatile void *kaddr);
>   #include <asm-generic/memory_model.h>
>   #include <asm-generic/getorder.h>
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 48
> +
>   #endif /* _ASM_PAGE_H */
> diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
> index eb20eb3b8fb0..2ebc1d2d9a5c 100644
> --- a/arch/powerpc/include/asm/mmu.h
> +++ b/arch/powerpc/include/asm/mmu.h
> @@ -324,6 +324,8 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
>   #define MAX_PHYSMEM_BITS        46
>   #endif
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
> +
>   #ifdef CONFIG_PPC_BOOK3S_64
>   #include <asm/book3s/64/mmu.h>
>   #else /* CONFIG_PPC_BOOK3S_64 */
> diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
> index a4d38092530a..8abec1461bf7 100644
> --- a/arch/s390/include/asm/page.h
> +++ b/arch/s390/include/asm/page.h
> @@ -180,4 +180,6 @@ static inline int devmem_is_allowed(unsigned long pfn)
>   #include <asm-generic/memory_model.h>
>   #include <asm-generic/getorder.h>
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS
> +
>   #endif /* _S390_PAGE_H */
> diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
> index 5eef8be3e59f..40c7e12cf09e 100644
> --- a/arch/sh/include/asm/page.h
> +++ b/arch/sh/include/asm/page.h
> @@ -205,4 +205,6 @@ typedef struct page *pgtable_t;
>   #define ARCH_SLAB_MINALIGN	8
>   #endif
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 32
> +
>   #endif /* __ASM_SH_PAGE_H */
> diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
> index b76d59edec8c..14e9ca4659d7 100644
> --- a/arch/sparc/include/asm/page_32.h
> +++ b/arch/sparc/include/asm/page_32.h
> @@ -139,4 +139,6 @@ extern unsigned long pfn_base;
>   #include <asm-generic/memory_model.h>
>   #include <asm-generic/getorder.h>
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 32
> +
>   #endif /* _SPARC_PAGE_H */
> diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
> index e80f2d5bf62f..6d6f3654ead1 100644
> --- a/arch/sparc/include/asm/page_64.h
> +++ b/arch/sparc/include/asm/page_64.h
> @@ -163,4 +163,6 @@ extern unsigned long PAGE_OFFSET;
>   
>   #include <asm-generic/getorder.h>
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS
> +
>   #endif /* _SPARC64_PAGE_H */
> diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
> index 6deb6cd236e3..c2eae59e6505 100644
> --- a/arch/x86/include/asm/pgtable-2level_types.h
> +++ b/arch/x86/include/asm/pgtable-2level_types.h
> @@ -38,4 +38,6 @@ typedef union {
>   /* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
>   #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 32
> +
>   #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
> diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
> index 33845d36897c..5fce514a49a0 100644
> --- a/arch/x86/include/asm/pgtable-3level_types.h
> +++ b/arch/x86/include/asm/pgtable-3level_types.h
> @@ -45,7 +45,8 @@ typedef union {
>    */
>   #define PTRS_PER_PTE	512
>   
> -#define MAX_POSSIBLE_PHYSMEM_BITS	36
>   #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS 36
> +
>   #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 84bd9bdc1987..d808cfde3d19 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;
>   #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
>   #define P4D_MASK		(~(P4D_SIZE - 1))
>   
> -#define MAX_POSSIBLE_PHYSMEM_BITS	52
> -
>   #else /* CONFIG_X86_5LEVEL */
>   
>   /*
> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;
>   
>   #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))
>   
> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
> +
>   #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 0787d33b80d8..132c20b6fd4f 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -80,23 +80,7 @@
>    * as single (unsigned long) handle value.
>    *
>    * Note that object index <obj_idx> starts from 0.
> - *
> - * This is made more complicated by various memory models and PAE.
> - */
> -
> -#ifndef MAX_POSSIBLE_PHYSMEM_BITS
> -#ifdef MAX_PHYSMEM_BITS
> -#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
> -#else
> -/*
> - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
> - * be PAGE_SHIFT
>    */
> -#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
> -#endif
> -#endif
> -
> -#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
>   
>   /*
>    * Memory for allocating for handle keeps object position by
> @@ -116,6 +100,25 @@
>    */
>   #define OBJ_ALLOCATED_TAG 1
>   #define OBJ_TAG_BITS 1
> +
> +/*
> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
> + * only headers, leading to bad object encoding due to object index overflow.
> + */
> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS
> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
> +#else
> + #ifndef CONFIG_64BIT
> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
> +  #endif
> + #endif
> +#endif
> +
> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
>   #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
>   #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>   
>
Russell King (Oracle) Dec. 10, 2018, 3:05 p.m. UTC | #2
On Mon, Dec 10, 2018 at 02:35:55PM +0000, Robin Murphy wrote:
> On 10/12/2018 14:21, Rafael David Tinoco wrote:
> >On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the
> >physical frame number might be so big that zsmalloc obj encoding (to
> >location) will break, causing:
> >
> >BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc
> >Read of size 4 at addr 00000000 by task mkfs.ext4/623
> >CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15
> >Hardware name: Generic DT based system
> >[<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)
> >[<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)
> >[<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)
> >[<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)
> >[<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)
> >[<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])
> >[<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])
> >[<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)
> >[<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)
> >[<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)
> >[<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)
> >[<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)
> >[<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)
> >[<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)
> >[<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)
> >[<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)
> >[<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)
> >[<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)
> >[<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)
> >[<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)
> >[<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)
> >[<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)
> >[<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)
> >[<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)
> >[<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)
> >
> >when trying to decode (the pfn) and map the object.
> >
> >That happens because one architecture might not re-define
> >MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For
> >32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will
> >default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,
> >_PFN_BITS might be wrong: which may cause obj variable to overflow if
> >frame number is in HIGHMEM and referencing a page above the 4GB
> >watermark.
> >
> >commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if
> >not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers
> >and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't
> >used, like in the example given above.
> >
> >Systems with potential for PAE exist for a long time and assuming
> >BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,
> >however it is NOT a constant anymore for x86.
> >
> >SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every
> >architecture using zsmalloc, together with a sanity check for
> >MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.
> >
> >Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17
> >Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>
> >---
> >  arch/arm/include/asm/pgtable-2level-types.h |  2 ++
> >  arch/arm/include/asm/pgtable-3level-types.h |  2 ++
> >  arch/arm64/include/asm/pgtable-types.h      |  2 ++
> >  arch/ia64/include/asm/page.h                |  2 ++
> >  arch/mips/include/asm/page.h                |  2 ++
> >  arch/powerpc/include/asm/mmu.h              |  2 ++
> >  arch/s390/include/asm/page.h                |  2 ++
> >  arch/sh/include/asm/page.h                  |  2 ++
> >  arch/sparc/include/asm/page_32.h            |  2 ++
> >  arch/sparc/include/asm/page_64.h            |  2 ++
> >  arch/x86/include/asm/pgtable-2level_types.h |  2 ++
> >  arch/x86/include/asm/pgtable-3level_types.h |  3 +-
> >  arch/x86/include/asm/pgtable_64_types.h     |  4 +--
> >  mm/zsmalloc.c                               | 35 +++++++++++----------
> >  14 files changed, 45 insertions(+), 19 deletions(-)
> >
> >diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
> >index 66cb5b0e89c5..552dba411324 100644
> >--- a/arch/arm/include/asm/pgtable-2level-types.h
> >+++ b/arch/arm/include/asm/pgtable-2level-types.h
> >@@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;
> >  #endif /* STRICT_MM_TYPECHECKS */
> >+#define MAX_POSSIBLE_PHYSMEM_BITS 32
> >+
> >  #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */
> >diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h
> >index 921aa30259c4..664c39e6517c 100644
> >--- a/arch/arm/include/asm/pgtable-3level-types.h
> >+++ b/arch/arm/include/asm/pgtable-3level-types.h
> >@@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;
> >  #endif	/* STRICT_MM_TYPECHECKS */
> >+#define MAX_POSSIBLE_PHYSMEM_BITS 36
> 
> Nit: with LPAE, physical addresses go up to 40 bits, not just 36.

Right, with that set at 40, we get:

#define _PFN_BITS               (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

== 28

#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS  (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)

== 3

#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

== 4

#define ZS_MIN_ALLOC_SIZE \
        MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))

== 4 << 12 >> 3 = 2048

or half-page allocations.

Given this in Documentation/vm/zsmalloc.rst:

On the other hand, if we just use single
(0-order) pages, it would suffer from very high fragmentation --
any object of size PAGE_SIZE/2 or larger would occupy an entire page.
This was one of the major issues with its predecessor (xvmalloc).

It seems that would not be acceptable, so I have to ask whether
using an unsigned long to store PFN + object ID is really acceptable.
Maybe the real solution to this problem is to stop using an
unsigned long to contain both the PFN and object ID?
Kirill A. Shutemov Dec. 10, 2018, 3:15 p.m. UTC | #3
On Mon, Dec 10, 2018 at 12:21:05PM -0200, Rafael David Tinoco wrote:
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 84bd9bdc1987..d808cfde3d19 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;
>  #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
>  #define P4D_MASK		(~(P4D_SIZE - 1))
>  
> -#define MAX_POSSIBLE_PHYSMEM_BITS	52
> -
>  #else /* CONFIG_X86_5LEVEL */
>  
>  /*
> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;
>  
>  #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))
>  
> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
> +

...

>  #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 0787d33b80d8..132c20b6fd4f 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c

...

> @@ -116,6 +100,25 @@
>   */
>  #define OBJ_ALLOCATED_TAG 1
>  #define OBJ_TAG_BITS 1
> +
> +/*
> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
> + * only headers, leading to bad object encoding due to object index overflow.
> + */
> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS
> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
> +#else
> + #ifndef CONFIG_64BIT
> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
> +  #endif
> + #endif
> +#endif
> +
> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
>  #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
>  #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

Have you tested it with CONFIG_X86_5LEVEL=y?

ASAICS, the patch makes OBJ_INDEX_BITS and what depends from it dynamic --
it depends what paging mode we are booting in. ZS_SIZE_CLASSES depends
indirectly on OBJ_INDEX_BITS and I don't see how struct zs_pool definition
can compile with dynamic ZS_SIZE_CLASSES.

Hm?
Rafael David Tinoco Dec. 10, 2018, 4:48 p.m. UTC | #4
On 12/10/18 1:15 PM, Kirill A. Shutemov wrote:
> On Mon, Dec 10, 2018 at 12:21:05PM -0200, Rafael David Tinoco wrote:
>> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
>> index 84bd9bdc1987..d808cfde3d19 100644
>> --- a/arch/x86/include/asm/pgtable_64_types.h
>> +++ b/arch/x86/include/asm/pgtable_64_types.h
>> @@ -64,8 +64,6 @@ extern unsigned int ptrs_per_p4d;
>>  #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
>>  #define P4D_MASK		(~(P4D_SIZE - 1))
>>  
>> -#define MAX_POSSIBLE_PHYSMEM_BITS	52
>> -
>>  #else /* CONFIG_X86_5LEVEL */
>>  
>>  /*
>> @@ -154,4 +152,6 @@ extern unsigned int ptrs_per_p4d;
>>  
>>  #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))
>>  
>> +#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
>> +
> 
> ...
> 
>>  #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> index 0787d33b80d8..132c20b6fd4f 100644
>> --- a/mm/zsmalloc.c
>> +++ b/mm/zsmalloc.c
> 
> ...
> 
>> @@ -116,6 +100,25 @@
>>   */
>>  #define OBJ_ALLOCATED_TAG 1
>>  #define OBJ_TAG_BITS 1
>> +
>> +/*
>> + * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
>> + * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
>> + * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
>> + * only headers, leading to bad object encoding due to object index overflow.
>> + */
>> +#ifndef MAX_POSSIBLE_PHYSMEM_BITS
>> + #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
>> + #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
>> +#else
>> + #ifndef CONFIG_64BIT
>> +  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
>> +   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
>> +  #endif
>> + #endif
>> +#endif
>> +
>> +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
>>  #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
>>  #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
> 
> Have you tested it with CONFIG_X86_5LEVEL=y?
> 
> ASAICS, the patch makes OBJ_INDEX_BITS and what depends from it dynamic --
> it depends what paging mode we are booting in. ZS_SIZE_CLASSES depends
> indirectly on OBJ_INDEX_BITS and I don't see how struct zs_pool definition
> can compile with dynamic ZS_SIZE_CLASSES.
> 
> Hm?
> 

You're right, terribly sorry. This was a last time change.

mm/zsmalloc.c:256:21: error: variably modified ‘size_class’ at file scope

I'll revisit the patch. Any other comments are welcome.

Thank you
Rafael David Tinoco Dec. 10, 2018, 4:53 p.m. UTC | #5
On 12/10/18 1:05 PM, Russell King - ARM Linux wrote:
> On Mon, Dec 10, 2018 at 02:35:55PM +0000, Robin Murphy wrote:
>> On 10/12/2018 14:21, Rafael David Tinoco wrote:
>>> On 32-bit systems, zsmalloc uses HIGHMEM and, when PAE is enabled, the
>>> physical frame number might be so big that zsmalloc obj encoding (to
>>> location) will break, causing:
>>>
>>> BUG: KASAN: null-ptr-deref in zs_map_object+0xa4/0x2bc
>>> Read of size 4 at addr 00000000 by task mkfs.ext4/623
>>> CPU: 2 PID: 623 Comm: mkfs.ext4 Not tainted 4.19.0-rc8-00017-g8239bc6d3307-dirty #15
>>> Hardware name: Generic DT based system
>>> [<c0418f7c>] (unwind_backtrace) from [<c0410ca4>] (show_stack+0x20/0x24)
>>> [<c0410ca4>] (show_stack) from [<c16bd540>] (dump_stack+0xbc/0xe8)
>>> [<c16bd540>] (dump_stack) from [<c06cab74>] (kasan_report+0x248/0x390)
>>> [<c06cab74>] (kasan_report) from [<c06c94e8>] (__asan_load4+0x78/0xb4)
>>> [<c06c94e8>] (__asan_load4) from [<c06ddc24>] (zs_map_object+0xa4/0x2bc)
>>> [<c06ddc24>] (zs_map_object) from [<bf0bbbd8>] (zram_bvec_rw.constprop.2+0x324/0x8e4 [zram])
>>> [<bf0bbbd8>] (zram_bvec_rw.constprop.2 [zram]) from [<bf0bc3cc>] (zram_make_request+0x234/0x46c [zram])
>>> [<bf0bc3cc>] (zram_make_request [zram]) from [<c09aff9c>] (generic_make_request+0x304/0x63c)
>>> [<c09aff9c>] (generic_make_request) from [<c09b0320>] (submit_bio+0x4c/0x1c8)
>>> [<c09b0320>] (submit_bio) from [<c0743570>] (submit_bh_wbc.constprop.15+0x238/0x26c)
>>> [<c0743570>] (submit_bh_wbc.constprop.15) from [<c0746cf8>] (__block_write_full_page+0x524/0x76c)
>>> [<c0746cf8>] (__block_write_full_page) from [<c07472c4>] (block_write_full_page+0x1bc/0x1d4)
>>> [<c07472c4>] (block_write_full_page) from [<c0748eb4>] (blkdev_writepage+0x24/0x28)
>>> [<c0748eb4>] (blkdev_writepage) from [<c064a780>] (__writepage+0x44/0x78)
>>> [<c064a780>] (__writepage) from [<c064b50c>] (write_cache_pages+0x3b8/0x800)
>>> [<c064b50c>] (write_cache_pages) from [<c064dd78>] (generic_writepages+0x74/0xa0)
>>> [<c064dd78>] (generic_writepages) from [<c0748e64>] (blkdev_writepages+0x18/0x1c)
>>> [<c0748e64>] (blkdev_writepages) from [<c064e890>] (do_writepages+0x68/0x134)
>>> [<c064e890>] (do_writepages) from [<c06368a4>] (__filemap_fdatawrite_range+0xb0/0xf4)
>>> [<c06368a4>] (__filemap_fdatawrite_range) from [<c0636b68>] (file_write_and_wait_range+0x64/0xd0)
>>> [<c0636b68>] (file_write_and_wait_range) from [<c0747af8>] (blkdev_fsync+0x54/0x84)
>>> [<c0747af8>] (blkdev_fsync) from [<c0739dac>] (vfs_fsync_range+0x70/0xd4)
>>> [<c0739dac>] (vfs_fsync_range) from [<c0739e98>] (do_fsync+0x4c/0x80)
>>> [<c0739e98>] (do_fsync) from [<c073a26c>] (sys_fsync+0x1c/0x20)
>>> [<c073a26c>] (sys_fsync) from [<c0401000>] (ret_fast_syscall+0x0/0x2c)
>>>
>>> when trying to decode (the pfn) and map the object.
>>>
>>> That happens because one architecture might not re-define
>>> MAX_PHYSMEM_BITS, like in this ARM 32-bit w/ LPAE enabled example. For
>>> 32-bit systems, if not re-defined, MAX_POSSIBLE_PHYSMEM_BITS will
>>> default to BITS_PER_LONG (32) in most cases, and, with PAE enabled,
>>> _PFN_BITS might be wrong: which may cause obj variable to overflow if
>>> frame number is in HIGHMEM and referencing a page above the 4GB
>>> watermark.
>>>
>>> commit 6e00ec00b1a7 ("staging: zsmalloc: calculate MAX_PHYSMEM_BITS if
>>> not defined") realized MAX_PHYSMEM_BITS depended on SPARSEMEM headers
>>> and "fixed" it by calculating it using BITS_PER_LONG if SPARSEMEM wasn't
>>> used, like in the example given above.
>>>
>>> Systems with potential for PAE exist for a long time and assuming
>>> BITS_PER_LONG seems inadequate. Defining MAX_PHYSMEM_BITS looks better,
>>> however it is NOT a constant anymore for x86.
>>>
>>> SO, instead, MAX_POSSIBLE_PHYSMEM_BITS should be defined by every
>>> architecture using zsmalloc, together with a sanity check for
>>> MAX_POSSIBLE_PHYSMEM_BITS being too big on 32-bit systems.
>>>
>>> Link: https://bugs.linaro.org/show_bug.cgi?id=3765#c17
>>> Signed-off-by: Rafael David Tinoco <rafael.tinoco@linaro.org>
>>> ---
>>>  arch/arm/include/asm/pgtable-2level-types.h |  2 ++
>>>  arch/arm/include/asm/pgtable-3level-types.h |  2 ++
>>>  arch/arm64/include/asm/pgtable-types.h      |  2 ++
>>>  arch/ia64/include/asm/page.h                |  2 ++
>>>  arch/mips/include/asm/page.h                |  2 ++
>>>  arch/powerpc/include/asm/mmu.h              |  2 ++
>>>  arch/s390/include/asm/page.h                |  2 ++
>>>  arch/sh/include/asm/page.h                  |  2 ++
>>>  arch/sparc/include/asm/page_32.h            |  2 ++
>>>  arch/sparc/include/asm/page_64.h            |  2 ++
>>>  arch/x86/include/asm/pgtable-2level_types.h |  2 ++
>>>  arch/x86/include/asm/pgtable-3level_types.h |  3 +-
>>>  arch/x86/include/asm/pgtable_64_types.h     |  4 +--
>>>  mm/zsmalloc.c                               | 35 +++++++++++----------
>>>  14 files changed, 45 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
>>> index 66cb5b0e89c5..552dba411324 100644
>>> --- a/arch/arm/include/asm/pgtable-2level-types.h
>>> +++ b/arch/arm/include/asm/pgtable-2level-types.h
>>> @@ -64,4 +64,6 @@ typedef pteval_t pgprot_t;
>>>  #endif /* STRICT_MM_TYPECHECKS */
>>> +#define MAX_POSSIBLE_PHYSMEM_BITS 32
>>> +
>>>  #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */
>>> diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h
>>> index 921aa30259c4..664c39e6517c 100644
>>> --- a/arch/arm/include/asm/pgtable-3level-types.h
>>> +++ b/arch/arm/include/asm/pgtable-3level-types.h
>>> @@ -67,4 +67,6 @@ typedef pteval_t pgprot_t;
>>>  #endif	/* STRICT_MM_TYPECHECKS */
>>> +#define MAX_POSSIBLE_PHYSMEM_BITS 36
>>
>> Nit: with LPAE, physical addresses go up to 40 bits, not just 36.

Hum, I got it from where it was being defined when having SPARSEMEM
enabled (#define MAX_PHYSMEM_BITS 36 in
arch/arm/include/asm/sparsemem.h), since without SPARSEMEM it would
default to BITS_PER_LONG.

> Right, with that set at 40, we get:
> 
> #define _PFN_BITS               (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
> 
> == 28
> 
> #define OBJ_TAG_BITS 1
> #define OBJ_INDEX_BITS  (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
> 
> == 3
> 
> #define ZS_MAX_ZSPAGE_ORDER 2
> #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
> 
> == 4
> 
> #define ZS_MIN_ALLOC_SIZE \
>         MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
> 
> == 4 << 12 >> 3 = 2048
> 
> or half-page allocations.
> 
> Given this in Documentation/vm/zsmalloc.rst:
> 
> On the other hand, if we just use single
> (0-order) pages, it would suffer from very high fragmentation --
> any object of size PAGE_SIZE/2 or larger would occupy an entire page.
> This was one of the major issues with its predecessor (xvmalloc).
> 
> It seems that would not be acceptable, so I have to ask whether
> using an unsigned long to store PFN + object ID is really acceptable.
> Maybe the real solution to this problem is to stop using an
> unsigned long to contain both the PFN and object ID?
>
kernel test robot Dec. 13, 2018, 6:17 a.m. UTC | #6
Hi Rafael,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.20-rc6 next-20181212]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Rafael-David-Tinoco/mm-zsmalloc-c-Fix-zsmalloc-32-bit-PAE-support/20181211-020704
config: mips-allmodconfig (attached as .config)
compiler: mips-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

>> mm/zsmalloc.c:116:5: error: #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
       #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
        ^~~~~
   In file included from include/linux/cache.h:5:0,
                    from arch/mips/include/asm/cpu-info.h:15,
                    from arch/mips/include/asm/cpu-features.h:13,
                    from arch/mips/include/asm/bitops.h:21,
                    from include/linux/bitops.h:19,
                    from include/linux/kernel.h:11,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
>> mm/zsmalloc.c:151:59: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
    #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
                                                              ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:32: note: in expansion of macro 'ZS_SIZE_CLASSES'
     struct size_class *size_class[ZS_SIZE_CLASSES];
                                   ^~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
>> mm/zsmalloc.c:151:59: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
    #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
                                                              ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:32: note: in expansion of macro 'ZS_SIZE_CLASSES'
     struct size_class *size_class[ZS_SIZE_CLASSES];
                                   ^~~~~~~~~~~~~~~
>> mm/zsmalloc.c:256:21: error: variably modified 'size_class' at file scope
     struct size_class *size_class[ZS_SIZE_CLASSES];
                        ^~~~~~~~~~
   In file included from include/linux/kernel.h:10:0,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
   mm/zsmalloc.c: In function 'get_size_class_index':
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/compiler.h:76:40: note: in definition of macro 'likely'
    # define likely(x) __builtin_expect(!!(x), 1)
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:543:20: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
     if (likely(size > ZS_MIN_ALLOC_SIZE))
                       ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/compiler.h:76:40: note: in definition of macro 'likely'
    # define likely(x) __builtin_expect(!!(x), 1)
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:543:20: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
     if (likely(size > ZS_MIN_ALLOC_SIZE))
                       ^~~~~~~~~~~~~~~~~
   In file included from include/linux/cache.h:5:0,
                    from arch/mips/include/asm/cpu-info.h:15,
                    from arch/mips/include/asm/cpu-features.h:13,
                    from arch/mips/include/asm/bitops.h:21,
                    from include/linux/bitops.h:19,
                    from include/linux/kernel.h:11,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:544:29: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
      idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
                                ^~~~~~~~~~~~~~~~~
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/uapi/linux/kernel.h:13:40: note: in definition of macro '__KERNEL_DIV_ROUND_UP'
    #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
                                           ^
>> mm/zsmalloc.c:133:2: note: in expansion of macro 'MAX'
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
     ^~~
   mm/zsmalloc.c:544:29: note: in expansion of macro 'ZS_MIN_ALLOC_SIZE'
      idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
                                ^~~~~~~~~~~~~~~~~
   In file included from include/linux/list.h:9:0,
                    from include/linux/module.h:9,
                    from mm/zsmalloc.c:33:
>> mm/zsmalloc.c:133:49: warning: right shift count is negative [-Wshift-count-negative]
     MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
                                                    ^
   include/linux/kernel.h:861:27: note: in definition of macro '__cmp'
    #define __cmp(x, y, op) ((x) op (y) ? (x) : (y))
                              ^
   include/linux/kernel.h:937:27: note: in expansion of macro '__careful_cmp'
    #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
                              ^~~~~~~~~~~~~
>> mm/zsmalloc.c:547:9: note: in expansion of macro 'min_t'
     return min_t(int, ZS_SIZE_CLASSES - 1, idx);
            ^~~~~

vim +116 mm/zsmalloc.c

    32	
  > 33	#include <linux/module.h>
    34	#include <linux/kernel.h>
    35	#include <linux/sched.h>
    36	#include <linux/magic.h>
    37	#include <linux/bitops.h>
    38	#include <linux/errno.h>
    39	#include <linux/highmem.h>
    40	#include <linux/string.h>
    41	#include <linux/slab.h>
    42	#include <asm/tlbflush.h>
    43	#include <asm/pgtable.h>
    44	#include <linux/cpumask.h>
    45	#include <linux/cpu.h>
    46	#include <linux/vmalloc.h>
    47	#include <linux/preempt.h>
    48	#include <linux/spinlock.h>
    49	#include <linux/shrinker.h>
    50	#include <linux/types.h>
    51	#include <linux/debugfs.h>
    52	#include <linux/zsmalloc.h>
    53	#include <linux/zpool.h>
    54	#include <linux/mount.h>
    55	#include <linux/migrate.h>
    56	#include <linux/pagemap.h>
    57	#include <linux/fs.h>
    58	
    59	#define ZSPAGE_MAGIC	0x58
    60	
    61	/*
    62	 * This must be power of 2 and greater than of equal to sizeof(link_free).
    63	 * These two conditions ensure that any 'struct link_free' itself doesn't
    64	 * span more than 1 page which avoids complex case of mapping 2 pages simply
    65	 * to restore link_free pointer values.
    66	 */
    67	#define ZS_ALIGN		8
    68	
    69	/*
    70	 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
    71	 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
    72	 */
    73	#define ZS_MAX_ZSPAGE_ORDER 2
    74	#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
    75	
    76	#define ZS_HANDLE_SIZE (sizeof(unsigned long))
    77	
    78	/*
    79	 * Object location (<PFN>, <obj_idx>) is encoded as
    80	 * as single (unsigned long) handle value.
    81	 *
    82	 * Note that object index <obj_idx> starts from 0.
    83	 */
    84	
    85	/*
    86	 * Memory for allocating for handle keeps object position by
    87	 * encoding <page, obj_idx> and the encoded value has a room
    88	 * in least bit(ie, look at obj_to_location).
    89	 * We use the bit to synchronize between object access by
    90	 * user and migration.
    91	 */
    92	#define HANDLE_PIN_BIT	0
    93	
    94	/*
    95	 * Head in allocated object should have OBJ_ALLOCATED_TAG
    96	 * to identify the object was allocated or not.
    97	 * It's okay to add the status bit in the least bit because
    98	 * header keeps handle which is 4byte-aligned address so we
    99	 * have room for two bit at least.
   100	 */
   101	#define OBJ_ALLOCATED_TAG 1
   102	#define OBJ_TAG_BITS 1
   103	
   104	/*
   105	 * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
   106	 * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
   107	 * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
   108	 * only headers, leading to bad object encoding due to object index overflow.
   109	 */
   110	#ifndef MAX_POSSIBLE_PHYSMEM_BITS
   111	 #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
   112	 #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
   113	#else
   114	 #ifndef CONFIG_64BIT
   115	  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
 > 116	   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
   117	  #endif
   118	 #endif
   119	#endif
   120	
   121	#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
   122	#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 > 123	#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
   124	
   125	#define FULLNESS_BITS	2
   126	#define CLASS_BITS	8
   127	#define ISOLATED_BITS	3
   128	#define MAGIC_VAL_BITS	8
   129	
   130	#define MAX(a, b) ((a) >= (b) ? (a) : (b))
   131	/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
   132	#define ZS_MIN_ALLOC_SIZE \
 > 133		MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
   134	/* each chunk includes extra space to keep handle */
   135	#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
   136	
   137	/*
   138	 * On systems with 4K page size, this gives 255 size classes! There is a
   139	 * trader-off here:
   140	 *  - Large number of size classes is potentially wasteful as free page are
   141	 *    spread across these classes
   142	 *  - Small number of size classes causes large internal fragmentation
   143	 *  - Probably its better to use specific size classes (empirically
   144	 *    determined). NOTE: all those class sizes must be set as multiple of
   145	 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
   146	 *
   147	 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
   148	 *  (reason above)
   149	 */
   150	#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
 > 151	#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
   152					      ZS_SIZE_CLASS_DELTA) + 1)
   153	
   154	enum fullness_group {
   155		ZS_EMPTY,
   156		ZS_ALMOST_EMPTY,
   157		ZS_ALMOST_FULL,
   158		ZS_FULL,
   159		NR_ZS_FULLNESS,
   160	};
   161	
   162	enum zs_stat_type {
   163		CLASS_EMPTY,
   164		CLASS_ALMOST_EMPTY,
   165		CLASS_ALMOST_FULL,
   166		CLASS_FULL,
   167		OBJ_ALLOCATED,
   168		OBJ_USED,
   169		NR_ZS_STAT_TYPE,
   170	};
   171	
   172	struct zs_size_stat {
   173		unsigned long objs[NR_ZS_STAT_TYPE];
   174	};
   175	
   176	#ifdef CONFIG_ZSMALLOC_STAT
   177	static struct dentry *zs_stat_root;
   178	#endif
   179	
   180	#ifdef CONFIG_COMPACTION
   181	static struct vfsmount *zsmalloc_mnt;
   182	#endif
   183	
   184	/*
   185	 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
   186	 *	n <= N / f, where
   187	 * n = number of allocated objects
   188	 * N = total number of objects zspage can store
   189	 * f = fullness_threshold_frac
   190	 *
   191	 * Similarly, we assign zspage to:
   192	 *	ZS_ALMOST_FULL	when n > N / f
   193	 *	ZS_EMPTY	when n == 0
   194	 *	ZS_FULL		when n == N
   195	 *
   196	 * (see: fix_fullness_group())
   197	 */
   198	static const int fullness_threshold_frac = 4;
   199	static size_t huge_class_size;
   200	
   201	struct size_class {
   202		spinlock_t lock;
   203		struct list_head fullness_list[NR_ZS_FULLNESS];
   204		/*
   205		 * Size of objects stored in this class. Must be multiple
   206		 * of ZS_ALIGN.
   207		 */
   208		int size;
   209		int objs_per_zspage;
   210		/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
   211		int pages_per_zspage;
   212	
   213		unsigned int index;
   214		struct zs_size_stat stats;
   215	};
   216	
   217	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
   218	static void SetPageHugeObject(struct page *page)
   219	{
   220		SetPageOwnerPriv1(page);
   221	}
   222	
   223	static void ClearPageHugeObject(struct page *page)
   224	{
   225		ClearPageOwnerPriv1(page);
   226	}
   227	
   228	static int PageHugeObject(struct page *page)
   229	{
   230		return PageOwnerPriv1(page);
   231	}
   232	
   233	/*
   234	 * Placed within free objects to form a singly linked list.
   235	 * For every zspage, zspage->freeobj gives head of this list.
   236	 *
   237	 * This must be power of 2 and less than or equal to ZS_ALIGN
   238	 */
   239	struct link_free {
   240		union {
   241			/*
   242			 * Free object index;
   243			 * It's valid for non-allocated object
   244			 */
   245			unsigned long next;
   246			/*
   247			 * Handle of allocated object.
   248			 */
   249			unsigned long handle;
   250		};
   251	};
   252	
   253	struct zs_pool {
   254		const char *name;
   255	
 > 256		struct size_class *size_class[ZS_SIZE_CLASSES];
   257		struct kmem_cache *handle_cachep;
   258		struct kmem_cache *zspage_cachep;
   259	
   260		atomic_long_t pages_allocated;
   261	
   262		struct zs_pool_stats stats;
   263	
   264		/* Compact classes */
   265		struct shrinker shrinker;
   266	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Dec. 13, 2018, 7:27 a.m. UTC | #7
Hi Rafael,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.20-rc6 next-20181212]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Rafael-David-Tinoco/mm-zsmalloc-c-Fix-zsmalloc-32-bit-PAE-support/20181211-020704
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=8.1.0 make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

>> mm/zsmalloc.c:112:3: error: #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
     #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
      ^~~~~

vim +112 mm/zsmalloc.c

   103	
   104	/*
   105	 * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
   106	 * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
   107	 * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
   108	 * only headers, leading to bad object encoding due to object index overflow.
   109	 */
   110	#ifndef MAX_POSSIBLE_PHYSMEM_BITS
   111	 #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
 > 112	 #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
   113	#else
   114	 #ifndef CONFIG_64BIT
   115	  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
   116	   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
   117	  #endif
   118	 #endif
   119	#endif
   120	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox series

Patch

diff --git a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
index 66cb5b0e89c5..552dba411324 100644
--- a/arch/arm/include/asm/pgtable-2level-types.h
+++ b/arch/arm/include/asm/pgtable-2level-types.h
@@ -64,4 +64,6 @@  typedef pteval_t pgprot_t;
 
 #endif /* STRICT_MM_TYPECHECKS */
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif	/* _ASM_PGTABLE_2LEVEL_TYPES_H */
diff --git a/arch/arm/include/asm/pgtable-3level-types.h b/arch/arm/include/asm/pgtable-3level-types.h
index 921aa30259c4..664c39e6517c 100644
--- a/arch/arm/include/asm/pgtable-3level-types.h
+++ b/arch/arm/include/asm/pgtable-3level-types.h
@@ -67,4 +67,6 @@  typedef pteval_t pgprot_t;
 
 #endif	/* STRICT_MM_TYPECHECKS */
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+
 #endif	/* _ASM_PGTABLE_3LEVEL_TYPES_H */
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index 345a072b5856..45c3834eb4c8 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -64,4 +64,6 @@  typedef struct { pteval_t pgprot; } pgprot_t;
 #include <asm-generic/5level-fixup.h>
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_ARM64_PA_BITS
+
 #endif	/* __ASM_PGTABLE_TYPES_H */
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index 5798bd2b462c..a3e055979e46 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -235,4 +235,6 @@  get_order (unsigned long size)
 
 #define __HAVE_ARCH_GATE_AREA	1
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 50
+
 #endif /* _ASM_IA64_PAGE_H */
diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
index e8cc328fce2d..f6a5dea1a66c 100644
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -263,4 +263,6 @@  extern int __virt_addr_valid(const volatile void *kaddr);
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 48
+
 #endif /* _ASM_PAGE_H */
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index eb20eb3b8fb0..2ebc1d2d9a5c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -324,6 +324,8 @@  static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
 #define MAX_PHYSMEM_BITS        46
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
+
 #ifdef CONFIG_PPC_BOOK3S_64
 #include <asm/book3s/64/mmu.h>
 #else /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index a4d38092530a..8abec1461bf7 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -180,4 +180,6 @@  static inline int devmem_is_allowed(unsigned long pfn)
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS
+
 #endif /* _S390_PAGE_H */
diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
index 5eef8be3e59f..40c7e12cf09e 100644
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -205,4 +205,6 @@  typedef struct page *pgtable_t;
 #define ARCH_SLAB_MINALIGN	8
 #endif
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* __ASM_SH_PAGE_H */
diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index b76d59edec8c..14e9ca4659d7 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -139,4 +139,6 @@  extern unsigned long pfn_base;
 #include <asm-generic/memory_model.h>
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* _SPARC_PAGE_H */
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index e80f2d5bf62f..6d6f3654ead1 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -163,4 +163,6 @@  extern unsigned long PAGE_OFFSET;
 
 #include <asm-generic/getorder.h>
 
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYS_ADDRESS_BITS
+
 #endif /* _SPARC64_PAGE_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 6deb6cd236e3..c2eae59e6505 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -38,4 +38,6 @@  typedef union {
 /* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
 #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 32
+
 #endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 33845d36897c..5fce514a49a0 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -45,7 +45,8 @@  typedef union {
  */
 #define PTRS_PER_PTE	512
 
-#define MAX_POSSIBLE_PHYSMEM_BITS	36
 #define PGD_KERNEL_START	(CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
 
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+
 #endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 84bd9bdc1987..d808cfde3d19 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -64,8 +64,6 @@  extern unsigned int ptrs_per_p4d;
 #define P4D_SIZE		(_AC(1, UL) << P4D_SHIFT)
 #define P4D_MASK		(~(P4D_SIZE - 1))
 
-#define MAX_POSSIBLE_PHYSMEM_BITS	52
-
 #else /* CONFIG_X86_5LEVEL */
 
 /*
@@ -154,4 +152,6 @@  extern unsigned int ptrs_per_p4d;
 
 #define PGD_KERNEL_START	((PAGE_SIZE / 2) / sizeof(pgd_t))
 
+#define MAX_POSSIBLE_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0787d33b80d8..132c20b6fd4f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -80,23 +80,7 @@ 
  * as single (unsigned long) handle value.
  *
  * Note that object index <obj_idx> starts from 0.
- *
- * This is made more complicated by various memory models and PAE.
- */
-
-#ifndef MAX_POSSIBLE_PHYSMEM_BITS
-#ifdef MAX_PHYSMEM_BITS
-#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
-#else
-/*
- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
- * be PAGE_SHIFT
  */
-#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
-#endif
-#endif
-
-#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
 
 /*
  * Memory for allocating for handle keeps object position by
@@ -116,6 +100,25 @@ 
  */
 #define OBJ_ALLOCATED_TAG 1
 #define OBJ_TAG_BITS 1
+
+/*
+ * MAX_POSSIBLE_PHYSMEM_BITS should be defined by all archs using zsmalloc:
+ * Trying to guess it from MAX_PHYSMEM_BITS, or considering it BITS_PER_LONG,
+ * proved to be wrong by not considering PAE capabilities, or using SPARSEMEM
+ * only headers, leading to bad object encoding due to object index overflow.
+ */
+#ifndef MAX_POSSIBLE_PHYSMEM_BITS
+ #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
+ #error "MAX_POSSIBLE_PHYSMEM_BITS HAS to be defined by arch using zsmalloc";
+#else
+ #ifndef CONFIG_64BIT
+  #if (MAX_POSSIBLE_PHYSMEM_BITS >= (BITS_PER_LONG + PAGE_SHIFT - OBJ_TAG_BITS))
+   #error "MAX_POSSIBLE_PHYSMEM_BITS is wrong for this arch";
+  #endif
+ #endif
+#endif
+
+#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
 #define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)