@@ -35,6 +35,9 @@ void copy_highpage(struct page *to, struct page *from);
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
+/*
+ * Defining the name as a macro marks clear_user_highpage() as provided by
+ * the architecture; presumably generic code tests this with #ifndef before
+ * supplying its own fallback (not visible in this patch - confirm).
+ * The function itself is defined out of line in arch/arm64/mm/copypage.c.
+ */
+#define clear_user_highpage clear_user_highpage
+void clear_user_highpage(struct page *page, unsigned long vaddr);
+
typedef struct page *pgtable_t;
extern int pfn_valid(unsigned long);
@@ -44,3 +44,11 @@ void copy_user_highpage(struct page *to, struct page *from,
flush_dcache_page(to);
}
EXPORT_SYMBOL_GPL(copy_user_highpage);
+
+/* Clear a user page via its kernel address; no temporary mapping is used. */
+void clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+	/* !HIGHMEM: page_address() suffices, so kmap_atomic() is skipped. */
+	clear_user_page(page_address(page), vaddr, page);
+}
+EXPORT_SYMBOL_GPL(clear_user_highpage);
In !HIGHMEM configurations, especially on 64-bit architectures, pages do not
need a temporary mapping. Hence k(map|unmap)_atomic() act as nothing more than
multiple barrier() calls. For example, for a 2MB hugepage in clear_huge_page()
they are each called 512 times, i.e. once to map and once to unmap every
subpage, which amounts to 2048 barrier() calls in total. This calls for
optimization: simply getting the virtual address from the page does the job
for us.

We profiled clear_huge_page() using ftrace and observed an improvement of 62%.

Setup:-
The data below was collected on Qualcomm's SM7250 SoC with THP enabled (kernel
v4.19.113), with only CPU-0 (Cortex-A55) and CPU-7 (Cortex-A76) switched on and
set to max frequency, and with DDR set to the perf governor.

FTRACE Data:-

Base data:-
Number of iterations: 48
Mean of allocation time: 349.5 us
std deviation: 74.5 us

v1 data:-
Number of iterations: 48
Mean of allocation time: 131 us
std deviation: 32.7 us

The following simple userspace experiment, which allocates 100MB (BUF_SZ) of
pages and writes to each of them, gave us good insight: we observed an
improvement of 42% in the combined allocation and write time.

-------------------------------------------------------------
Test code snippet
-------------------------------------------------------------
clock_start();
buf = malloc(BUF_SZ); /* Allocate 100 MB of memory */

for(i=0; i < BUF_SZ_PAGES; i++)
{
	*((int *)(buf + (i*PAGE_SIZE))) = 1;
}
clock_end();
-------------------------------------------------------------

Malloc test timings for 100MB anon allocation:-

Base data:-
Number of iterations: 100
Mean of allocation time: 31831 us
std deviation: 4286 us

v1 data:-
Number of iterations: 100
Mean of allocation time: 18193 us
std deviation: 4915 us

Reported-by: Chintan Pandya <chintan.pandya@oneplus.com>
Signed-off-by: Prathu Baronia <prathu.baronia@oneplus.com>
---
 arch/arm64/include/asm/page.h | 3 +++
 arch/arm64/mm/copypage.c      | 8 ++++++++
 2 files changed, 11 insertions(+)