@@ -50,6 +50,9 @@
#include <asm/efi.h>
#include <asm/xen/hypervisor.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_memory.h>
+#endif
static int num_standard_resources;
static struct resource *standard_resources;
@@ -243,6 +246,10 @@ static void __init request_standard_resources(void)
crashk_res.end <= res->end)
request_resource(res, &crashk_res);
#endif
+#ifdef CONFIG_PIN_MEMORY
+ if (pin_memory_resource.end)
+ insert_resource(&iomem_resource, &pin_memory_resource);
+#endif
}
}
@@ -41,7 +41,9 @@
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
-
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_memory.h>
+#endif
#define ARM64_ZONE_DMA_BITS 30
/*
@@ -68,6 +70,16 @@
phys_addr_t arm64_dma_phys_limit __ro_after_init;
static phys_addr_t arm64_dma32_phys_limit __ro_after_init;
+#ifdef CONFIG_PIN_MEMORY
+struct resource pin_memory_resource = {
+ .name = "Pin memory maps",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_MEM,
+ .desc = IORES_DESC_PIN_MEM_MAPS
+};
+#endif
+
#ifdef CONFIG_KEXEC_CORE
/*
* reserve_crashkernel() - reserves memory for crash kernel
@@ -129,6 +141,47 @@ static void __init reserve_crashkernel(void)
}
#endif /* CONFIG_KEXEC_CORE */
+#ifdef CONFIG_PIN_MEMORY
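+/*
+ * Reserve the region requested with the "pinmemory=" command line option and
+ * record it in pin_memory_resource so request_standard_resources() can add
+ * it to /proc/iomem.
+ */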
+static void __init reserve_pin_memory_res(void)
+{
+ unsigned long long mem_start, mem_len;
+ int ret;
+
+ ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(),
+ &mem_len, &mem_start);
+ if (ret || !mem_len)
+ return;
+
+ mem_len = PAGE_ALIGN(mem_len);
+
+ if (!memblock_is_region_memory(mem_start, mem_len)) {
+ pr_warn("cannot reserve for pin memory: region is not memory!\n");
+ return;
+ }
+
+ if (memblock_is_region_reserved(mem_start, mem_len)) {
+ pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n");
+ return;
+ }
+
+ if (!IS_ALIGNED(mem_start, SZ_2M)) {
+ pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n");
+ return;
+ }
+
+ memblock_reserve(mem_start, mem_len);
+ pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ mem_start, mem_start + mem_len, mem_len >> 20);
+
+ pin_memory_resource.start = mem_start;
+ pin_memory_resource.end = mem_start + mem_len - 1;
+}
+#else
+static void __init reserve_pin_memory_res(void)
+{
+}
+#endif /* CONFIG_PIN_MEMORY */
+
#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
const char *uname, int depth, void *data)
@@ -452,6 +505,8 @@ void __init arm64_memblock_init(void)
reserve_crashkernel();
+ reserve_pin_memory_res();
+
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -573,6 +628,11 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
memblock_free_all();
+#ifdef CONFIG_PIN_MEMORY
+	/* Reserve the previously pinned pages recorded in the pin memory region. */
+	init_reserve_page_map((unsigned long)pin_memory_resource.start,
+		(unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1));
+#endif
mem_init_print_info(NULL);
/*
@@ -560,3 +560,10 @@ config RANDOM_TRUST_BOOTLOADER
booloader is trustworthy so it will be added to the kernel's entropy
pool. Otherwise, say N here so it will be regarded as device input that
only mixes the entropy pool.
+
+config PIN_MEMORY_DEV
+	bool "/dev/pinmem character device"
+	depends on PIN_MEMORY
+	default n
+	help
+	  Say Y here to provide the /dev/pinmem character device. User space
+	  uses its ioctl interface to pin the memory of a task and to remap
+	  the recorded pages into a restored task.
@@ -52,3 +52,4 @@ js-rtc-y = rtc.o
obj-$(CONFIG_XILLYBUS) += xillybus/
obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
obj-$(CONFIG_ADI) += adi.o
+obj-$(CONFIG_PIN_MEMORY_DEV) += pin_memory.o
new file mode 100644
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
+ * Description: Euler pin memory driver
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+#include <asm/processor.h>
+#include <uapi/asm-generic/ioctl.h>
+#include <uapi/asm-generic/mman-common.h>
+#include <uapi/asm/setup.h>
+#include <linux/pin_memory.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+
+#define MAX_PIN_MEM_AREA_NUM 16
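+/* A single user virtual address range to be pinned. */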
+struct _pin_mem_area {
+ unsigned long virt_start;
+ unsigned long virt_end;
+};
+
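+/* ioctl payload: the target pid and its set of address ranges to pin. */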
+struct pin_mem_area_set {
+ unsigned int pid;
+ unsigned int area_num;
+ struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
+
+#define PIN_MEM_MAGIC 0x59
+#define _SET_PIN_MEM_AREA 1
+#define _CLEAR_PIN_MEM_AREA 2
+#define _REMAP_PIN_MEM_AREA 3
+#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
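+
+/*
+ * Expected user-space usage (illustrative sketch only, error handling
+ * omitted):
+ *
+ *	fd = open("/dev/pinmem", O_RDWR);
+ *	pmas.pid = dump_pid;
+ *	pmas.area_num = 1;
+ *	pmas.mem_area[0].virt_start = start;
+ *	pmas.mem_area[0].virt_end = end;
+ *	ioctl(fd, SET_PIN_MEM_AREA, &pmas);           pin the pages of dump_pid
+ *	ioctl(fd, REMAP_PIN_MEM_AREA, &restore_pid);  remap them after restore
+ *	ioctl(fd, CLEAR_PIN_MEM_AREA, 0);             drop all pin records
+ */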
+
+static int set_pin_mem(struct pin_mem_area_set *pmas)
+{
+ int i;
+ int ret = 0;
+ struct _pin_mem_area *pma;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ struct pid *pid_s;
+
+ pid_s = find_get_pid(pmas->pid);
+ if (!pid_s) {
+ pr_warn("Get pid struct fail:%d.\n", pmas->pid);
+ goto fail;
+ }
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		pr_warn("Get task struct fail:%d.\n", pmas->pid);
+		rcu_read_unlock();
+		goto fail;
+	}
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	mm = get_task_mm(task);
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pmas->pid);
+		put_task_struct(task);
+		goto fail;
+	}
+	for (i = 0; i < pmas->area_num; i++) {
+		pma = &(pmas->mem_area[i]);
+		ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end);
+		if (ret)
+			break;
+	}
+	mmput(mm);
+	put_task_struct(task);
+	put_pid(pid_s);
+	return ret ? -EFAULT : 0;
+
+fail:
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+static int set_pin_mem_area(unsigned long arg)
+{
+ struct pin_mem_area_set pmas;
+ void __user *buf = (void __user *)arg;
+
+ if (!access_ok(buf, sizeof(pmas)))
+ return -EFAULT;
+ if (copy_from_user(&pmas, buf, sizeof(pmas)))
+		return -EFAULT;
+ if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) {
+ pr_warn("Input area_num is too large.\n");
+ return -EINVAL;
+ }
+
+ return set_pin_mem(&pmas);
+}
+
+static int pin_mem_remap(unsigned long arg)
+{
+ int pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ vm_fault_t ret;
+ void __user *buf = (void __user *)arg;
+ struct pid *pid_s;
+
+	if (!access_ok(buf, sizeof(int)))
+		return -EFAULT;
+	if (copy_from_user(&pid, buf, sizeof(int)))
+		return -EFAULT;
+
+ pid_s = find_get_pid(pid);
+ if (!pid_s) {
+ pr_warn("Get pid struct fail:%d.\n", pid);
+ return -EINVAL;
+ }
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		pr_warn("Get task struct fail:%d.\n", pid);
+		rcu_read_unlock();
+		goto fault;
+	}
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	mm = get_task_mm(task);
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pid);
+		put_task_struct(task);
+		goto fault;
+	}
+	ret = do_mem_remap(pid, mm);
+	if (ret) {
+		pr_warn("Handle pin memory remap fail.\n");
+		mmput(mm);
+		put_task_struct(task);
+		goto fault;
+	}
+	mmput(mm);
+	put_task_struct(task);
+	put_pid(pid_s);
+	return 0;
+
+fault:
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long ret = 0;
+
+ if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC)
+ return -EINVAL;
+ if (_IOC_NR(cmd) > _REMAP_PIN_MEM_AREA)
+ return -EINVAL;
+
+ switch (cmd) {
+ case SET_PIN_MEM_AREA:
+ ret = set_pin_mem_area(arg);
+ break;
+ case CLEAR_PIN_MEM_AREA:
+ clear_pin_memory_record();
+ break;
+ case REMAP_PIN_MEM_AREA:
+ ret = pin_mem_remap(arg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return ret;
+}
+
+static const struct file_operations pin_memory_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = pin_memory_ioctl,
+ .compat_ioctl = pin_memory_ioctl,
+};
+
+static struct miscdevice pin_memory_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "pinmem",
+ .fops = &pin_memory_fops,
+};
+
+static int __init pin_memory_init(void)
+{
+	int err = misc_register(&pin_memory_miscdev);
+
+	if (!err)
+		pr_info("pin_memory init\n");
+	else
+		pr_warn("pin_memory init failed!\n");
+ return err;
+}
+
+static void __exit pin_memory_exit(void)
+{
+	misc_deregister(&pin_memory_miscdev);
+	pr_info("pin_memory ko exits\n");
+}
+
+module_init(pin_memory_init);
+module_exit(pin_memory_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Euler");
+MODULE_DESCRIPTION("pin memory");
@@ -75,4 +75,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base);
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline, unsigned long long system_ram,
+ unsigned long long *pin_size, unsigned long long *pin_base);
+#endif
+
#endif /* LINUX_CRASH_CORE_H */
new file mode 100644
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for checkpoint and restore tasks.
+ */
+#ifndef _LINUX_PIN_MEMORY_H
+#define _LINUX_PIN_MEMORY_H
+
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/errno.h>
+#include <linux/kabi.h>
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#ifdef CONFIG_ARM64
+#include <linux/ioport.h>
+#endif
+
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
+
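+/* Return values of the page collecting helpers in mm/pin_mem.c. */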
+#define COLLECT_PAGES_FINISH 1
+#define COLLECT_PAGES_NEED_CONTINUE -1
+#define COLLECT_PAGES_FAIL 0
+
+#define COMPOUND_PAD_MASK 0xffffffff
+#define COMPOUND_PAD_START 0x88
+#define COMPOUND_PAD_DELTA 0x40
+#define LIST_POISON4 0xdead000000000400
+
+#define next_pme(pme) ((unsigned long *)((pme) + 1) + (pme)->nr_pages)
+
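+/*
+ * Variable length record for one pinned virtual range: nr_pages physical
+ * addresses follow the header in phy_addr_array[]. When is_huge_page is set,
+ * each address refers to a PMD-sized huge page. Records are packed back to
+ * back and next_pme() advances to the following one.
+ */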
+struct page_map_entry {
+ unsigned long virt_addr;
+ unsigned int nr_pages;
+ unsigned int is_huge_page;
+ unsigned long phy_addr_array[0];
+};
+
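+/* Per-pid record pointing at the first page map entry collected for that task. */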
+struct page_map_info {
+ int pid;
+ int pid_reserved;
+ unsigned int entry_num;
+ struct page_map_entry *pme;
+};
+
+extern struct page_map_info *get_page_map_info(int pid);
+extern struct page_map_info *create_page_map_info(int pid);
+extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm);
+extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+extern void clear_pin_memory_record(void);
+extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr);
+extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page);
+
+/* Reserved space for pin memory. */
+#ifdef CONFIG_ARM64
+extern struct resource pin_memory_resource;
+#endif
+extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+
+#endif /* CONFIG_PIN_MEMORY */
+#endif /* _LINUX_PIN_MEMORY_H */
@@ -292,6 +292,17 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *pin_size,
+ unsigned long long *pin_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, pin_size, pin_base,
+ "pinmemory=", NULL);
+}
+#endif
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
@@ -739,4 +739,10 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config PIN_MEMORY
+ bool "Support for pin memory"
+ depends on CHECKPOINT_RESTORE
+ help
+	  Say Y here to enable the pin memory feature for checkpoint
+ and restore.
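+	  The metadata region used to record the pinned pages is reserved
+	  at boot time with the "pinmemory=size[@offset]" kernel parameter,
+	  and the number of recorded tasks is bounded by the
+	  "max_pin_pid_num=" early parameter.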
endmenu
@@ -108,3 +108,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
@@ -3083,4 +3083,65 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
mlock_vma_page(new);
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
+
+#ifdef CONFIG_PIN_MEMORY
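+/*
+ * Map an already-allocated page as an anonymous transparent huge page at
+ * @address, following the same steps as the anonymous THP fault path.
+ */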
+vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ gfp_t gfp;
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+ pmd_t entry;
+ vm_fault_t ret = 0;
+ struct mem_cgroup *memcg;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+ return VM_FAULT_OOM;
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ prep_transhuge_page(page);
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+	pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
+ __SetPageUptodate(page);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(!pmd_none(*pmd))) {
+ goto unlock_release;
+ } else {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, address, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
+ lru_cache_add_active_or_unevictable(page, vma);
+ pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable);
+ set_pmd_at(vma->vm_mm, address, pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(vma->vm_mm);
+ spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
+ }
+
+ return 0;
+unlock_release:
+ spin_unlock(ptl);
+release:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ mem_cgroup_cancel_charge(page, memcg, true);
+ put_page(page);
+ return ret;
+}
+#endif
+
#endif
@@ -4799,4 +4799,72 @@ void ptlock_free(struct page *page)
{
kmem_cache_free(page_ptl_cachep, page->ptl);
}
+
+#ifdef CONFIG_PIN_MEMORY
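+/*
+ * Map an already-allocated page as an anonymous page at @address, following
+ * the same steps as the anonymous page fault path.
+ */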
+vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, struct page *page)
+{
+ struct mem_cgroup *memcg;
+ pte_t entry;
+ spinlock_t *ptl;
+ pte_t *pte;
+ vm_fault_t ret = 0;
+
+	if (pte_alloc(vma->vm_mm, pmd))
+ return VM_FAULT_OOM;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmd)))
+ return 0;
+
+ /* Allocate our own private page. */
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+
+ if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+ false))
+ goto oom_free_page;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address,
+ &ptl);
+ if (!pte_none(*pte)) {
+ ret = VM_FAULT_FALLBACK;
+ goto release;
+ }
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(page, vma);
+
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
+unlock:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+release:
+ mem_cgroup_cancel_charge(page, memcg, false);
+ put_page(page);
+ goto unlock;
+oom_free_page:
+ put_page(page);
+oom:
+ return VM_FAULT_OOM;
+}
+#endif
+
#endif
new file mode 100644
@@ -0,0 +1,691 @@
+/*
+ * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for checkpoint and restore tasks.
+ */
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pin_memory.h>
+#include <linux/idr.h>
+#include <linux/page-isolation.h>
+#include <linux/sched/mm.h>
+#include <linux/ctype.h>
+
+#define MAX_PIN_PID_NUM 128
+static DEFINE_SPINLOCK(page_map_entry_lock);
+
+unsigned int pin_pid_num;
+static unsigned int *pin_pid_num_addr;
+static unsigned long __page_map_entry_start;
+static unsigned long page_map_entry_end;
+static struct page_map_info *user_space_reserve_start;
+static struct page_map_entry *page_map_entry_start;
+unsigned int max_pin_pid_num __read_mostly;
+
+/*
+ * Bookkeeping for the kernel-space part of the reserved region. These are
+ * assumed to be initialised by the reservation path, which is not part of
+ * this hunk.
+ */
+static unsigned long kernel_space_reserve_start;
+static unsigned long kernel_space_reserve_end;
+static unsigned long kernel_pin_space_size;
+
+static int __init setup_max_pin_pid_num(char *str)
+{
+ int ret = 1;
+
+ if (!str)
+ goto out;
+
+ ret = kstrtouint(str, 10, &max_pin_pid_num);
+out:
+ if (ret) {
+ pr_warn("Unable to parse max pin pid num.\n");
+ } else {
+ if (max_pin_pid_num > MAX_PIN_PID_NUM) {
+ max_pin_pid_num = 0;
+ pr_warn("Input max_pin_pid_num is too large.\n");
+ }
+ }
+ return ret;
+}
+early_param("max_pin_pid_num", setup_max_pin_pid_num);
+
+struct page_map_info *create_page_map_info(int pid)
+{
+ struct page_map_info *new;
+
+ if (!user_space_reserve_start)
+ return NULL;
+
+ if (pin_pid_num >= max_pin_pid_num) {
+		pr_warn("Pin pid num is larger than max_pin_pid_num, fail to create: %d!\n", pid);
+ return NULL;
+ }
+ new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
+ new->pid = pid;
+ new->pme = NULL;
+ new->entry_num = 0;
+ new->pid_reserved = false;
+ (*pin_pid_num_addr)++;
+ pin_pid_num++;
+ return new;
+}
+EXPORT_SYMBOL_GPL(create_page_map_info);
+
+struct page_map_info *get_page_map_info(int pid)
+{
+ int i;
+
+ if (!user_space_reserve_start)
+ return NULL;
+
+ for (i = 0; i < pin_pid_num; i++) {
+ if (user_space_reserve_start[i].pid == pid) {
+ return &(user_space_reserve_start[i]);
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(get_page_map_info);
+
+static struct page *find_head_page(struct page *page)
+{
+ struct page *p = page;
+
+ while (!PageBuddy(p)) {
+ if (PageLRU(p))
+ return NULL;
+ p--;
+ }
+ return p;
+}
+
+static void split_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
+	unsigned long size, int order)
+{
+	unsigned long cur_size = 1 << order;
+	unsigned long total_size = 0;
+
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(&page[total_size], order);
+ set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE);
+ area->nr_free++;
+ total_size += cur_size;
+ size -= cur_size;
+ }
+ }
+}
+
+static void split_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
+	unsigned long size, int order)
+{
+	unsigned long cur_size = 1 << order;
+	struct page *right_page, *head_page;
+
+ right_page = page + size;
+ while (size && cur_size > size) {
+ cur_size >>= 1;
+ order--;
+ area--;
+ if (cur_size <= size) {
+ head_page = right_page - cur_size;
+ list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]);
+ atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+ set_page_private(head_page, order);
+ set_pageblock_migratetype(head_page, MIGRATE_MOVABLE);
+ area->nr_free++;
+ size -= cur_size;
+ right_page = head_page;
+ }
+ }
+}
+
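+/*
+ * Take nr_pages pages starting at @page out of the buddy allocator, splitting
+ * the buddy block they belong to and putting the unused head and tail parts
+ * back onto the free lists.
+ */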
+void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
+{
+ unsigned int current_order;
+ struct page *page_end;
+ struct free_area *area;
+ struct zone *zone;
+ struct page *head_page;
+
+ head_page = find_head_page(page);
+ if (!head_page) {
+ pr_warn("Find page head fail.");
+ return;
+ }
+ current_order = head_page->private;
+ page_end = head_page + (1 << current_order);
+ zone = page_zone(head_page);
+ area = &(zone->free_area[current_order]);
+ list_del(&head_page->lru);
+ atomic_set(&head_page->_mapcount, -1);
+ set_page_private(head_page, 0);
+ area->nr_free--;
+ if (head_page != page)
+		split_page_area_left(zone, area, head_page,
+ (unsigned long)(page - head_page), current_order);
+ page = page + nr_pages;
+ if (page < page_end) {
+		split_page_area_right(zone, area, page,
+ (unsigned long)(page_end - page), current_order);
+ } else if (page > page_end) {
+ pr_warn("Find page end smaller than page.");
+ }
+}
+
+static inline void reserve_user_normal_pages(struct page *page)
+{
+ if (!atomic_read(&page->_refcount)) {
+ atomic_inc(&page->_refcount);
+ reserve_page_from_buddy(1, page);
+ } else {
+		pr_warn("Page %pK refcount %d is larger than zero, no need to reserve.\n",
+			page, atomic_read(&page->_refcount));
+ }
+}
+
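+/* Rebuild the compound page metadata of a reserved PMD-sized huge page. */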
+static void init_huge_pmd_pages(struct page *head_page)
+{
+ int i = 0;
+ struct page *page = head_page;
+ unsigned long *temp;
+ unsigned long compound_pad = COMPOUND_PAD_START;
+
+ __set_bit(PG_head, &page->flags);
+ __set_bit(PG_active, &page->flags);
+ atomic_set(&page->_refcount, 1);
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ page->_compound_pad_2 = (unsigned long)head_page & COMPOUND_PAD_MASK;
+ temp = (unsigned long *)(&(page->_compound_pad_2));
+ temp[1] = LIST_POISON4;
+ page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
+ page->compound_order = HPAGE_PMD_ORDER;
+ page++;
+ i++;
+ page->compound_head = (unsigned long)head_page + 1;
+ page->_compound_pad_2 = (unsigned long)head_page + compound_pad;
+ i++;
+ INIT_LIST_HEAD(&(page->deferred_list));
+ for (; i < HPAGE_PMD_NR; i++) {
+ page = head_page + i;
+ page->compound_head = (unsigned long)head_page + 1;
+ compound_pad += COMPOUND_PAD_DELTA;
+ page->_compound_pad_2 = (unsigned long)head_page + compound_pad;
+ temp = (unsigned long *)(&(page->_compound_pad_2));
+ temp[1] = LIST_POISON4;
+ }
+}
+
+static void reserve_user_huge_pmd_pages(struct page *page)
+{
+ struct page *head_page;
+
+ if (!atomic_read(&page->_refcount)) {
+ atomic_inc(&page->_refcount);
+ head_page = find_head_page(page);
+ reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page);
+ init_huge_pmd_pages(page);
+ } else {
+		pr_warn("Page %pK refcount %d is larger than zero, no need to reserve.\n",
+			page, atomic_read(&page->_refcount));
+ }
+}
+
+static void reserve_user_space_map_pages(void)
+{
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+ unsigned int i, j, index;
+ struct page *page;
+ unsigned long flags;
+ unsigned long phy_addr;
+
+ if (!user_space_reserve_start)
+ return;
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ for (index = 0; index < pin_pid_num; index++) {
+ pmi = &(user_space_reserve_start[index]);
+ pme = pmi->pme;
+
+ for (i = 0; i < pmi->entry_num; i++) {
+ for (j = 0; j < pme->nr_pages; j++) {
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+ if (atomic_read(&page->_refcount)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ if (!pme->is_huge_page) {
+ reserve_user_normal_pages(page);
+ } else {
+ reserve_user_huge_pmd_pages(page);
+ }
+ }
+ pme = (struct page_map_entry *)next_pme(pme);
+ }
+ }
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+
+/*
+ * The whole page map entry collection process must run sequentially.
+ * user_space_reserve_start points to the first page map info of the
+ * first dumped task, and page_map_entry_start points to the first
+ * page map entry of the first dumped vma.
+ */
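+/*
+ * Layout of the reserved region handed in through map_addr:
+ *
+ *	[ pin_pid_num ][ page_map_info x max_pin_pid_num ][ packed page_map_entry records ... ]
+ */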
+static void init_page_map_info(unsigned int *map_addr, unsigned long map_len)
+{
+ if (user_space_reserve_start || !max_pin_pid_num)
+ return;
+ pin_pid_num = *map_addr;
+ pin_pid_num_addr = map_addr;
+	user_space_reserve_start = (struct page_map_info *)(map_addr + 1);
+ page_map_entry_start =
+ (struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num);
+ page_map_entry_end = (unsigned long)map_addr + map_len;
+ if (pin_pid_num > 0)
+ reserve_user_space_map_pages();
+}
+
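+/*
+ * Record the physical address of every PMD huge page in [start_addr, end_addr).
+ * Returns COLLECT_PAGES_NEED_CONTINUE when a page that is not a huge page head
+ * is met, so the caller can continue with collect_normal_pages().
+ */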
+int collect_pmd_huge_pages(struct task_struct *task,
+ unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+ long res;
+ int index = 0;
+ unsigned long start = start_addr;
+ struct page *temp_page;
+
+ while (start < end_addr) {
+ temp_page = NULL;
+ res = get_user_pages_remote(task, task->mm, start, 1,
+ FOLL_TOUCH|FOLL_GET, &temp_page, NULL, NULL);
+		if (res <= 0) {
+ pr_warn("Get huge page for addr(%lx) fail.", start);
+ return COLLECT_PAGES_FAIL;
+ }
+ if (PageHead(temp_page)) {
+ start += HPAGE_PMD_SIZE;
+ pme->phy_addr_array[index] = page_to_phys(temp_page);
+ index++;
+ } else {
+ pme->nr_pages = index;
+ atomic_dec(&((temp_page)->_refcount));
+ return COLLECT_PAGES_NEED_CONTINUE;
+ }
+ }
+ pme->nr_pages = index;
+ return COLLECT_PAGES_FINISH;
+}
+
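+/*
+ * Record the physical address of every base page in [start_addr, end_addr),
+ * walking one PMD-sized chunk at a time. Returns COLLECT_PAGES_NEED_CONTINUE
+ * when a huge page head is met, so the caller can continue with
+ * collect_pmd_huge_pages().
+ */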
+int collect_normal_pages(struct task_struct *task,
+ unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+ int res;
+ unsigned long next;
+ unsigned long i, nr_pages;
+ struct page *tmp_page;
+ unsigned long *phy_addr_array = pme->phy_addr_array;
+ struct page **page_array = (struct page **)pme->phy_addr_array;
+
+ next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ next = (next > end_addr) ? end_addr : next;
+ pme->nr_pages = 0;
+ while (start_addr < next) {
+ nr_pages = (next - start_addr) / PAGE_SIZE;
+ res = get_user_pages_remote(task, task->mm, start_addr, 1,
+ FOLL_TOUCH|FOLL_GET, &tmp_page, NULL, NULL);
+		if (res <= 0) {
+ pr_warn("Get user pages of %lx fail.\n", start_addr);
+ return COLLECT_PAGES_FAIL;
+ }
+ if (PageHead(tmp_page)) {
+ atomic_dec(&(tmp_page->_refcount));
+ return COLLECT_PAGES_NEED_CONTINUE;
+ }
+ atomic_dec(&(tmp_page->_refcount));
+ if (PageTail(tmp_page)) {
+ start_addr = next;
+ pme->virt_addr = start_addr;
+ next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr : (next + HPAGE_PMD_SIZE);
+ continue;
+ }
+ res = get_user_pages_remote(task, task->mm, start_addr, nr_pages,
+ FOLL_TOUCH|FOLL_GET, page_array, NULL, NULL);
+		if (res <= 0) {
+ pr_warn("Get user pages of %lx fail.\n", start_addr);
+ return COLLECT_PAGES_FAIL;
+ }
+ for (i = 0; i < nr_pages; i++) {
+ phy_addr_array[i] = page_to_phys(page_array[i]);
+ }
+ pme->nr_pages += nr_pages;
+ page_array += nr_pages;
+ phy_addr_array += nr_pages;
+ start_addr = next;
+ next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr : (next + HPAGE_PMD_SIZE);
+ }
+ return COLLECT_PAGES_FINISH;
+}
+
+/* Callers must make sure that the pinned memory belongs to an anonymous vma. */
+int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ int pid, ret;
+ int is_huge_page = false;
+ unsigned int page_size;
+ unsigned long nr_pages, flags;
+ struct page_map_entry *pme;
+ struct page_map_info *pmi;
+ struct vm_area_struct *vma;
+ unsigned long i;
+ struct page *tmp_page;
+
+ if (!page_map_entry_start
+ || !task || !mm
+ || start_addr >= end_addr)
+ return -EFAULT;
+
+ pid = task->pid;
+ spin_lock_irqsave(&page_map_entry_lock, flags);
+ nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
+ if ((unsigned long)page_map_entry_start + nr_pages * sizeof(struct page *)
+ >= page_map_entry_end) {
+ pr_warn("Page map entry use up!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+ vma = find_extend_vma(mm, start_addr);
+ if (!vma) {
+ pr_warn("Find no match vma!\n");
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
+ transparent_hugepage_enabled(vma)) {
+ page_size = HPAGE_PMD_SIZE;
+ is_huge_page = true;
+ } else {
+ page_size = PAGE_SIZE;
+ }
+ pme = page_map_entry_start;
+ pme->virt_addr = start_addr;
+ pme->is_huge_page = is_huge_page;
+ memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
+ down_write(&mm->mmap_sem);
+ if (!is_huge_page) {
+ ret = collect_normal_pages(task, start_addr, end_addr, pme);
+ if (!pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_write(&mm->mmap_sem);
+ goto finish;
+ }
+ pme->is_huge_page = true;
+ page_size = HPAGE_PMD_SIZE;
+ ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ } else {
+ ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme);
+ if (!pme->nr_pages) {
+ if (ret == COLLECT_PAGES_FINISH) {
+ ret = 0;
+ up_write(&mm->mmap_sem);
+ goto finish;
+ }
+ pme->is_huge_page = false;
+ page_size = PAGE_SIZE;
+ ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
+ }
+ }
+ up_write(&mm->mmap_sem);
+ if (ret == COLLECT_PAGES_FAIL) {
+ ret = -EFAULT;
+ goto finish;
+ }
+
+ /* check for zero pages */
+ for (i = 0; i < pme->nr_pages; i++) {
+ tmp_page = phys_to_page(pme->phy_addr_array[i]);
+ if (!pme->is_huge_page) {
+ if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE))
+ pme->phy_addr_array[i] = 0;
+ } else if (is_huge_zero_page(tmp_page))
+ pme->phy_addr_array[i] = 0;
+ }
+
+ page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
+ pmi = get_page_map_info(pid);
+ if (!pmi)
+ pmi = create_page_map_info(pid);
+ if (!pmi) {
+ pr_warn("Create page map info fail for pid: %d!\n", pid);
+ ret = -EFAULT;
+ goto finish;
+ }
+ if (!pmi->pme)
+ pmi->pme = pme;
+ pmi->entry_num++;
+
+	if (ret == COLLECT_PAGES_NEED_CONTINUE) {
+		/* Drop the lock before recursing; the callee takes it again. */
+		spin_unlock_irqrestore(&page_map_entry_lock, flags);
+		return pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr);
+	}
+
+finish:
+ spin_unlock_irqrestore(&page_map_entry_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pin_mem_area);
+
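+/* Re-establish the recorded base pages of @pme in @vma at their saved addresses. */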
+vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ struct page *page;
+ unsigned long address;
+ unsigned long phy_addr;
+
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * PAGE_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+		if (PageReserved(page))
+			ClearPageReserved(page);
+ if (page_to_pfn(page) == my_zero_pfn(address)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ page->mapping = NULL;
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return VM_FAULT_OOM;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return VM_FAULT_OOM;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ return VM_FAULT_OOM;
+ ret = do_anon_page_remap(vma, address, pmd, page);
+ if (ret == VM_FAULT_OOM)
+ return ret;
+ }
+ return 0;
+}
+
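+/* Re-establish the recorded PMD huge pages of @pme in @vma at their saved addresses. */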
+vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page_map_entry *pme)
+{
+ int ret;
+ unsigned int j;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pmd_t *pmd;
+ pud_t *pud;
+ struct page *page;
+ unsigned long address;
+ unsigned long phy_addr;
+
+ for (j = 0; j < pme->nr_pages; j++) {
+ address = pme->virt_addr + j * HPAGE_PMD_SIZE;
+ phy_addr = pme->phy_addr_array[j];
+ if (!phy_addr)
+ continue;
+ page = phys_to_page(phy_addr);
+		if (PageReserved(page))
+			ClearPageReserved(page);
+ if (is_huge_zero_page(page)) {
+ pme->phy_addr_array[j] = 0;
+ continue;
+ }
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return VM_FAULT_OOM;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return VM_FAULT_OOM;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ return VM_FAULT_OOM;
+ ret = do_anon_huge_page_remap(vma, address, pmd, page);
+ if (ret == VM_FAULT_OOM)
+ return ret;
+ }
+ return 0;
+}
+
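+/*
+ * Walk the recorded page map entries of @pid and map the saved pages back
+ * into the matching anonymous vmas of @mm.
+ */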
+vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
+{
+ unsigned int i = 0;
+ vm_fault_t ret = 0;
+ struct vm_area_struct *vma;
+ struct page_map_info *pmi;
+ struct page_map_entry *pme;
+
+ pmi = get_page_map_info(pid);
+ if (!pmi)
+ return -EFAULT;
+ down_write(&mm->mmap_sem);
+ pme = pmi->pme;
+ vma = mm->mmap;
+ while ((i < pmi->entry_num) && (vma != NULL)) {
+ if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) {
+ i++;
+ if (!vma_is_anonymous(vma)) {
+ pme = (struct page_map_entry *)(next_pme(pme));
+ continue;
+ }
+ if (!pme->is_huge_page) {
+ ret = remap_normal_pages(mm, vma, pme);
+ if (ret < 0)
+ goto out;
+ } else {
+ ret = remap_huge_pmd_pages(mm, vma, pme);
+ if (ret < 0)
+ goto out;
+ }
+ pme = (struct page_map_entry *)(next_pme(pme));
+ } else {
+ vma = vma->vm_next;
+ }
+ }
+out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(do_mem_remap);
+
+#if defined(CONFIG_ARM64)
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+ void *addr;
+
+ if (!map_addr || !map_size)
+ return;
+ addr = phys_to_virt(map_addr);
+	init_page_map_info((unsigned int *)addr, map_size);
+}
+#else
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+}
+#endif
+
+/* Clear all pin memory records. */
+void clear_pin_memory_record(void)
+{
+ if (pin_pid_num_addr) {
+ *pin_pid_num_addr = 0;
+ pin_pid_num = 0;
+ page_map_entry_start = (struct page_map_entry *)__page_map_entry_start;
+ }
+ if (kernel_space_reserve_start && kernel_pin_space_size > 0) {
+ *(unsigned long *)kernel_space_reserve_start = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+
+vm_fault_t reserve_kernel_space_mem(unsigned long start_addr, unsigned int pages)
+{
+ unsigned long i;
+ unsigned long entry_num;
+ struct page_map_entry *pme, *pme_start;
+
+ entry_num = *(unsigned long *)kernel_space_reserve_start;
+ pme_start = (struct page_map_entry *)(kernel_space_reserve_start + sizeof(entry_num));
+ pme = pme_start;
+ spin_lock(&page_map_entry_lock);
+ for (i = 0; i < entry_num; i++) {
+ if (start_addr == pme->virt_addr) {
+ spin_unlock(&page_map_entry_lock);
+ return 0;
+ }
+ pme = pme + 1;
+ }
+ if ((unsigned long)(pme_start + entry_num) >= kernel_space_reserve_end) {
+ spin_unlock(&page_map_entry_lock);
+ return VM_FAULT_OOM;
+ }
+ pme = pme_start + entry_num;
+ pme->virt_addr = start_addr;
+ pme->nr_pages = pages;
+ pme->is_huge_page = false;
+ *(unsigned long *)kernel_space_reserve_start = entry_num + 1;
+ spin_unlock(&page_map_entry_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(reserve_kernel_space_mem);
+
+#endif /* CONFIG_PIN_MEMORY */