Message ID | 20220110223201.31024-9-alex.sierra@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add MEMORY_DEVICE_COHERENT for coherent device memory mapping | expand |
On Tuesday, 11 January 2022 9:31:59 AM AEDT Alex Sierra wrote: > Device Coherent type uses device memory that is coherently accesible by > the CPU. This could be shown as SP (special purpose) memory range > at the BIOS-e820 memory enumeration. If no SP memory is supported in > system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. > > Currently, test_hmm only supports two different SP ranges of at least > 256MB size. This could be specified in the kernel parameter variable > efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & > 0x140000000 physical address. Ex. > efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 > > Private and coherent device mirror instances can be created in the same > probed. This is done by passing the module parameters spm_addr_dev0 & > spm_addr_dev1. In this case, it will create four instances of > device_mirror. The first two correspond to private device type, the > last two to coherent type. Then, they can be easily accessed from user > space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1 > are for private, and 2 and 3 for coherent types. If no module > parameters are passed, two instances of private type device_mirror will > be created only. > > Signed-off-by: Alex Sierra <alex.sierra@amd.com> > --- > lib/test_hmm.c | 247 ++++++++++++++++++++++++++++++++------------ > lib/test_hmm_uapi.h | 15 ++- > 2 files changed, 193 insertions(+), 69 deletions(-) > > diff --git a/lib/test_hmm.c b/lib/test_hmm.c > index 9edeff52302e..7c641c5a9cfa 100644 > --- a/lib/test_hmm.c > +++ b/lib/test_hmm.c > @@ -29,11 +29,22 @@ > > #include "test_hmm_uapi.h" > > -#define DMIRROR_NDEVICES 2 > +#define DMIRROR_NDEVICES 4 > #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 > #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) > #define DEVMEM_CHUNKS_RESERVE 16 > > +/* > + * For device_private pages, dpage is just a dummy struct page > + * representing a piece of device memory. dmirror_devmem_alloc_page > + * allocates a real system memory page as backing storage to fake a > + * real device. zone_device_data points to that backing page. But > + * for device_coherent memory, the struct page represents real > + * physical CPU-accessible memory that we can use directly. > + */ > +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \ > + (page)->zone_device_data : (page)) > + > static unsigned long spm_addr_dev0; > module_param(spm_addr_dev0, long, 0644); > MODULE_PARM_DESC(spm_addr_dev0, > @@ -122,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, > return 0; > } > > +static bool dmirror_is_private_zone(struct dmirror_device *mdevice) > +{ > + return (mdevice->zone_device_type == > + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false; > +} > + > +static enum migrate_vma_direction > + dmirror_select_device(struct dmirror *dmirror) > +{ > + return (dmirror->mdevice->zone_device_type == > + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? > + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : > + MIGRATE_VMA_SELECT_DEVICE_COHERENT; > +} > + > static void dmirror_bounce_fini(struct dmirror_bounce *bounce) > { > vfree(bounce->ptr); > @@ -572,16 +598,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, > static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) > { > struct page *dpage = NULL; > - struct page *rpage; > + struct page *rpage = NULL; > > /* > - * This is a fake device so we alloc real system memory to store > - * our device memory. > + * For ZONE_DEVICE private type, this is a fake device so we alloc real > + * system memory to store our device memory. > + * For ZONE_DEVICE coherent type we use the actual dpage to store the data > + * and ignore rpage. > */ > - rpage = alloc_page(GFP_HIGHUSER); > - if (!rpage) > - return NULL; > - > + if (dmirror_is_private_zone(mdevice)) { > + rpage = alloc_page(GFP_HIGHUSER); > + if (!rpage) > + return NULL; > + } > spin_lock(&mdevice->lock); > > if (mdevice->free_pages) { > @@ -601,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) > return dpage; > > error: > - __free_page(rpage); > + if (rpage) > + __free_page(rpage); > return NULL; > } > > @@ -627,12 +657,15 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, > * unallocated pte_none() or read-only zero page. > */ > spage = migrate_pfn_to_page(*src); > + WARN(spage && is_zone_device_page(spage), > + "page already in device spage pfn: 0x%lx\n", > + page_to_pfn(spage)); This should also lead to test failure because we are only supposed to be selecting system pages. Ie. if (WARN_ON_ONCE()) { dpage = NULL; continue; } > dpage = dmirror_devmem_alloc_page(mdevice); > if (!dpage) > continue; > > - rpage = dpage->zone_device_data; > + rpage = BACKING_PAGE(dpage); > if (spage) > copy_highpage(rpage, spage); > else > @@ -646,6 +679,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, > */ > rpage->zone_device_data = dmirror; > > + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", > + page_to_pfn(spage), page_to_pfn(dpage)); > *dst = migrate_pfn(page_to_pfn(dpage)) | > MIGRATE_PFN_LOCKED; > if ((*src & MIGRATE_PFN_WRITE) || > @@ -724,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, > if (!dpage) > continue; > > - /* > - * Store the page that holds the data so the page table > - * doesn't have to deal with ZONE_DEVICE private pages. > - */ > - entry = dpage->zone_device_data; > + entry = BACKING_PAGE(dpage); > if (*dst & MIGRATE_PFN_WRITE) > entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); > entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); > @@ -808,8 +839,101 @@ static int dmirror_exclusive(struct dmirror *dmirror, > return ret; > } > > -static int dmirror_migrate(struct dmirror *dmirror, > - struct hmm_dmirror_cmd *cmd) > +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, > + struct dmirror *dmirror) > +{ > + const unsigned long *src = args->src; > + unsigned long *dst = args->dst; > + unsigned long start = args->start; > + unsigned long end = args->end; > + unsigned long addr; > + > + for (addr = start; addr < end; addr += PAGE_SIZE, > + src++, dst++) { > + struct page *dpage, *spage; > + > + spage = migrate_pfn_to_page(*src); > + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) > + continue; > + > + WARN_ON(!is_device_page(spage)); Same - the test should fail in this case (ie. by not migrating the page). > + spage = BACKING_PAGE(spage); > + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); > + if (!dpage) > + continue; > + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", > + page_to_pfn(spage), page_to_pfn(dpage)); > + > + lock_page(dpage); > + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); > + copy_highpage(dpage, spage); > + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; > + if (*src & MIGRATE_PFN_WRITE) > + *dst |= MIGRATE_PFN_WRITE; > + } > + return 0; > +} > + > +static int dmirror_migrate_to_system(struct dmirror *dmirror, > + struct hmm_dmirror_cmd *cmd) > +{ > + unsigned long start, end, addr; > + unsigned long size = cmd->npages << PAGE_SHIFT; > + struct mm_struct *mm = dmirror->notifier.mm; > + struct vm_area_struct *vma; > + unsigned long src_pfns[64]; > + unsigned long dst_pfns[64]; > + struct migrate_vma args; > + unsigned long next; > + int ret; > + > + start = cmd->addr; > + end = start + size; > + if (end < start) > + return -EINVAL; > + > + /* Since the mm is for the mirrored process, get a reference first. */ > + if (!mmget_not_zero(mm)) > + return -EINVAL; > + > + mmap_read_lock(mm); > + for (addr = start; addr < end; addr = next) { > + vma = vma_lookup(mm, addr); > + if (!vma || !(vma->vm_flags & VM_READ)) { > + ret = -EINVAL; > + goto out; > + } > + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); > + if (next > vma->vm_end) > + next = vma->vm_end; > + > + args.vma = vma; > + args.src = src_pfns; > + args.dst = dst_pfns; > + args.start = addr; > + args.end = next; > + args.pgmap_owner = dmirror->mdevice; > + args.flags = dmirror_select_device(dmirror); > + > + ret = migrate_vma_setup(&args); > + if (ret) > + goto out; > + > + pr_debug("Migrating from device mem to sys mem\n"); > + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); > + > + migrate_vma_pages(&args); > + migrate_vma_finalize(&args); > + } > +out: > + mmap_read_unlock(mm); > + mmput(mm); > + return ret; I think you need to return the number of pages successfully migrated otherwise failures migrating back to system memory will go unnoticed. This isn't a problem for device private pages because in that case pages are migrated back in response to a fault, and if they fail the process will get killed with SIGBUS. > +} > + > +static int dmirror_migrate_to_device(struct dmirror *dmirror, > + struct hmm_dmirror_cmd *cmd) > { > unsigned long start, end, addr; > unsigned long size = cmd->npages << PAGE_SHIFT; > @@ -853,6 +977,7 @@ static int dmirror_migrate(struct dmirror *dmirror, > if (ret) > goto out; > > + pr_debug("Migrating from sys mem to device mem\n"); > dmirror_migrate_alloc_and_copy(&args, dmirror); > migrate_vma_pages(&args); > dmirror_migrate_finalize_and_map(&args, dmirror); > @@ -861,7 +986,7 @@ static int dmirror_migrate(struct dmirror *dmirror, > mmap_read_unlock(mm); > mmput(mm); > > - /* Return the migrated data for verification. */ > + /* Return the migrated data for verification. only for pages in device zone */ > ret = dmirror_bounce_init(&bounce, start, size); > if (ret) > return ret; > @@ -898,12 +1023,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, > } > > page = hmm_pfn_to_page(entry); > - if (is_device_private_page(page)) { > - /* Is the page migrated to this device or some other? */ > - if (dmirror->mdevice == dmirror_page_to_device(page)) > + if (is_device_page(page)) { > + /* Is page ZONE_DEVICE coherent? */ > + if (!is_device_private_page(page)) { This would read more clearly without the negation - ie. `if (is_device_coherent_page(page))`. > + if (dmirror->mdevice == dmirror_page_to_device(page)) > + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL; > + else > + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE; > + /* > + * Is page ZONE_DEVICE private migrated to > + * this device or some other? > + */ > + } else if (dmirror->mdevice == dmirror_page_to_device(page)) { > *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; > - else > + } else { > *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; > + } > } else if (is_zero_pfn(page_to_pfn(page))) > *perm = HMM_DMIRROR_PROT_ZERO; > else > @@ -1100,8 +1235,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, > ret = dmirror_write(dmirror, &cmd); > break; > > - case HMM_DMIRROR_MIGRATE: > - ret = dmirror_migrate(dmirror, &cmd); > + case HMM_DMIRROR_MIGRATE_TO_DEV: > + ret = dmirror_migrate_to_device(dmirror, &cmd); > + break; > + > + case HMM_DMIRROR_MIGRATE_TO_SYS: > + ret = dmirror_migrate_to_system(dmirror, &cmd); > break; > > case HMM_DMIRROR_EXCLUSIVE: > @@ -1142,14 +1281,13 @@ static const struct file_operations dmirror_fops = { > > static void dmirror_devmem_free(struct page *page) > { > - struct page *rpage = page->zone_device_data; > + struct page *rpage = BACKING_PAGE(page); > struct dmirror_device *mdevice; > > - if (rpage) > + if (rpage != page) > __free_page(rpage); > > mdevice = dmirror_page_to_device(page); > - > spin_lock(&mdevice->lock); > mdevice->cfree++; > page->zone_device_data = mdevice->free_pages; > @@ -1157,38 +1295,6 @@ static void dmirror_devmem_free(struct page *page) > spin_unlock(&mdevice->lock); > } > > -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, > - struct dmirror *dmirror) > -{ > - const unsigned long *src = args->src; > - unsigned long *dst = args->dst; > - unsigned long start = args->start; > - unsigned long end = args->end; > - unsigned long addr; > - > - for (addr = start; addr < end; addr += PAGE_SIZE, > - src++, dst++) { > - struct page *dpage, *spage; > - > - spage = migrate_pfn_to_page(*src); > - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) > - continue; > - spage = spage->zone_device_data; > - > - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); > - if (!dpage) > - continue; > - > - lock_page(dpage); > - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); > - copy_highpage(dpage, spage); > - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; > - if (*src & MIGRATE_PFN_WRITE) > - *dst |= MIGRATE_PFN_WRITE; > - } > - return 0; > -} > - > static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) > { > struct migrate_vma args; > @@ -1203,7 +1309,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) > * the mirror but here we use it to hold the page for the simulated > * device memory and that page holds the pointer to the mirror. > */ > - rpage = vmf->page->zone_device_data; > + rpage = BACKING_PAGE(vmf->page); Note that as I understand things this isn't strictly required because this will never be called for device coherent pages. > dmirror = rpage->zone_device_data; > > /* FIXME demonstrate how we can adjust migrate range */ > @@ -1213,7 +1319,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) > args.src = &src_pfns; > args.dst = &dst_pfns; > args.pgmap_owner = dmirror->mdevice; > - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; > + args.flags = dmirror_select_device(dmirror); > > if (migrate_vma_setup(&args)) > return VM_FAULT_SIGBUS; > @@ -1279,14 +1385,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) > static int __init hmm_dmirror_init(void) > { > int ret; > - int id; > + int id = 0; > + int ndevices = 0; > > ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, > "HMM_DMIRROR"); > if (ret) > goto err_unreg; > > - for (id = 0; id < DMIRROR_NDEVICES; id++) { > + memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0])); > + dmirror_devices[ndevices++].zone_device_type = > + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; > + dmirror_devices[ndevices++].zone_device_type = > + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; > + if (spm_addr_dev0 && spm_addr_dev1) { > + dmirror_devices[ndevices++].zone_device_type = > + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; > + dmirror_devices[ndevices++].zone_device_type = > + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; > + } > + for (id = 0; id < ndevices; id++) { > ret = dmirror_device_init(dmirror_devices + id, id); > if (ret) > goto err_chrdev; > @@ -1308,7 +1426,8 @@ static void __exit hmm_dmirror_exit(void) > int id; > > for (id = 0; id < DMIRROR_NDEVICES; id++) > - dmirror_device_remove(dmirror_devices + id); > + if (dmirror_devices[id].zone_device_type) Oh, I understand why you reserved 0 for zone_device_type now. But I still thinks it's unnecessary - hmm_dmirror_exit() getting called implies hmm_dmirror_init() was successful which implies zone_device_type was initialised for all devices. > + dmirror_device_remove(dmirror_devices + id); > unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); > } > > diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h > index 625f3690d086..e190b2ab6f19 100644 > --- a/lib/test_hmm_uapi.h > +++ b/lib/test_hmm_uapi.h > @@ -33,11 +33,12 @@ struct hmm_dmirror_cmd { > /* Expose the address space of the calling process through hmm device file */ > #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) > #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) > -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) > -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) > -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) > -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) > -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) > +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd) > > /* > * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. > @@ -52,6 +53,8 @@ struct hmm_dmirror_cmd { > * device the ioctl() is made > * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some > * other device > + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device > + * the ioctl() is made > */ > enum { > HMM_DMIRROR_PROT_ERROR = 0xFF, > @@ -63,6 +66,8 @@ enum { > HMM_DMIRROR_PROT_ZERO = 0x10, > HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, > HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, > + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40, > + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50, > }; > > enum { >
diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 9edeff52302e..7c641c5a9cfa 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -29,11 +29,22 @@ #include "test_hmm_uapi.h" -#define DMIRROR_NDEVICES 2 +#define DMIRROR_NDEVICES 4 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +/* + * For device_private pages, dpage is just a dummy struct page + * representing a piece of device memory. dmirror_devmem_alloc_page + * allocates a real system memory page as backing storage to fake a + * real device. zone_device_data points to that backing page. But + * for device_coherent memory, the struct page represents real + * physical CPU-accessible memory that we can use directly. + */ +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \ + (page)->zone_device_data : (page)) + static unsigned long spm_addr_dev0; module_param(spm_addr_dev0, long, 0644); MODULE_PARM_DESC(spm_addr_dev0, @@ -122,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, return 0; } +static bool dmirror_is_private_zone(struct dmirror_device *mdevice) +{ + return (mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false; +} + +static enum migrate_vma_direction + dmirror_select_device(struct dmirror *dmirror) +{ + return (dmirror->mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : + MIGRATE_VMA_SELECT_DEVICE_COHERENT; +} + static void dmirror_bounce_fini(struct dmirror_bounce *bounce) { vfree(bounce->ptr); @@ -572,16 +598,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) { struct page *dpage = NULL; - struct page *rpage; + struct page *rpage = NULL; /* - * This is a fake device so we alloc real system memory to store - * our device memory. + * For ZONE_DEVICE private type, this is a fake device so we alloc real + * system memory to store our device memory. + * For ZONE_DEVICE coherent type we use the actual dpage to store the data + * and ignore rpage. */ - rpage = alloc_page(GFP_HIGHUSER); - if (!rpage) - return NULL; - + if (dmirror_is_private_zone(mdevice)) { + rpage = alloc_page(GFP_HIGHUSER); + if (!rpage) + return NULL; + } spin_lock(&mdevice->lock); if (mdevice->free_pages) { @@ -601,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) return dpage; error: - __free_page(rpage); + if (rpage) + __free_page(rpage); return NULL; } @@ -627,12 +657,15 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * unallocated pte_none() or read-only zero page. */ spage = migrate_pfn_to_page(*src); + WARN(spage && is_zone_device_page(spage), + "page already in device spage pfn: 0x%lx\n", + page_to_pfn(spage)); dpage = dmirror_devmem_alloc_page(mdevice); if (!dpage) continue; - rpage = dpage->zone_device_data; + rpage = BACKING_PAGE(dpage); if (spage) copy_highpage(rpage, spage); else @@ -646,6 +679,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ rpage->zone_device_data = dmirror; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; if ((*src & MIGRATE_PFN_WRITE) || @@ -724,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - /* - * Store the page that holds the data so the page table - * doesn't have to deal with ZONE_DEVICE private pages. - */ - entry = dpage->zone_device_data; + entry = BACKING_PAGE(dpage); if (*dst & MIGRATE_PFN_WRITE) entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); @@ -808,8 +839,101 @@ static int dmirror_exclusive(struct dmirror *dmirror, return ret; } -static int dmirror_migrate(struct dmirror *dmirror, - struct hmm_dmirror_cmd *cmd) +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, + struct dmirror *dmirror) +{ + const unsigned long *src = args->src; + unsigned long *dst = args->dst; + unsigned long start = args->start; + unsigned long end = args->end; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE, + src++, dst++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(*src); + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) + continue; + + WARN_ON(!is_device_page(spage)); + spage = BACKING_PAGE(spage); + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) + continue; + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); + + lock_page(dpage); + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dpage, spage); + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + } + return 0; +} + +static int dmirror_migrate_to_system(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + unsigned long start, end, addr; + unsigned long size = cmd->npages << PAGE_SHIFT; + struct mm_struct *mm = dmirror->notifier.mm; + struct vm_area_struct *vma; + unsigned long src_pfns[64]; + unsigned long dst_pfns[64]; + struct migrate_vma args; + unsigned long next; + int ret; + + start = cmd->addr; + end = start + size; + if (end < start) + return -EINVAL; + + /* Since the mm is for the mirrored process, get a reference first. */ + if (!mmget_not_zero(mm)) + return -EINVAL; + + mmap_read_lock(mm); + for (addr = start; addr < end; addr = next) { + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_READ)) { + ret = -EINVAL; + goto out; + } + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + if (next > vma->vm_end) + next = vma->vm_end; + + args.vma = vma; + args.src = src_pfns; + args.dst = dst_pfns; + args.start = addr; + args.end = next; + args.pgmap_owner = dmirror->mdevice; + args.flags = dmirror_select_device(dmirror); + + ret = migrate_vma_setup(&args); + if (ret) + goto out; + + pr_debug("Migrating from device mem to sys mem\n"); + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + + migrate_vma_pages(&args); + migrate_vma_finalize(&args); + } +out: + mmap_read_unlock(mm); + mmput(mm); + + return ret; +} + +static int dmirror_migrate_to_device(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) { unsigned long start, end, addr; unsigned long size = cmd->npages << PAGE_SHIFT; @@ -853,6 +977,7 @@ static int dmirror_migrate(struct dmirror *dmirror, if (ret) goto out; + pr_debug("Migrating from sys mem to device mem\n"); dmirror_migrate_alloc_and_copy(&args, dmirror); migrate_vma_pages(&args); dmirror_migrate_finalize_and_map(&args, dmirror); @@ -861,7 +986,7 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_unlock(mm); mmput(mm); - /* Return the migrated data for verification. */ + /* Return the migrated data for verification. only for pages in device zone */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) return ret; @@ -898,12 +1023,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, } page = hmm_pfn_to_page(entry); - if (is_device_private_page(page)) { - /* Is the page migrated to this device or some other? */ - if (dmirror->mdevice == dmirror_page_to_device(page)) + if (is_device_page(page)) { + /* Is page ZONE_DEVICE coherent? */ + if (!is_device_private_page(page)) { + if (dmirror->mdevice == dmirror_page_to_device(page)) + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL; + else + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE; + /* + * Is page ZONE_DEVICE private migrated to + * this device or some other? + */ + } else if (dmirror->mdevice == dmirror_page_to_device(page)) { *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; - else + } else { *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; + } } else if (is_zero_pfn(page_to_pfn(page))) *perm = HMM_DMIRROR_PROT_ZERO; else @@ -1100,8 +1235,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_write(dmirror, &cmd); break; - case HMM_DMIRROR_MIGRATE: - ret = dmirror_migrate(dmirror, &cmd); + case HMM_DMIRROR_MIGRATE_TO_DEV: + ret = dmirror_migrate_to_device(dmirror, &cmd); + break; + + case HMM_DMIRROR_MIGRATE_TO_SYS: + ret = dmirror_migrate_to_system(dmirror, &cmd); break; case HMM_DMIRROR_EXCLUSIVE: @@ -1142,14 +1281,13 @@ static const struct file_operations dmirror_fops = { static void dmirror_devmem_free(struct page *page) { - struct page *rpage = page->zone_device_data; + struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; - if (rpage) + if (rpage != page) __free_page(rpage); mdevice = dmirror_page_to_device(page); - spin_lock(&mdevice->lock); mdevice->cfree++; page->zone_device_data = mdevice->free_pages; @@ -1157,38 +1295,6 @@ static void dmirror_devmem_free(struct page *page) spin_unlock(&mdevice->lock); } -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, - struct dmirror *dmirror) -{ - const unsigned long *src = args->src; - unsigned long *dst = args->dst; - unsigned long start = args->start; - unsigned long end = args->end; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { - struct page *dpage, *spage; - - spage = migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; - spage = spage->zone_device_data; - - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - } - return 0; -} - static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args; @@ -1203,7 +1309,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) * the mirror but here we use it to hold the page for the simulated * device memory and that page holds the pointer to the mirror. */ - rpage = vmf->page->zone_device_data; + rpage = BACKING_PAGE(vmf->page); dmirror = rpage->zone_device_data; /* FIXME demonstrate how we can adjust migrate range */ @@ -1213,7 +1319,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.src = &src_pfns; args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + args.flags = dmirror_select_device(dmirror); if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; @@ -1279,14 +1385,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) static int __init hmm_dmirror_init(void) { int ret; - int id; + int id = 0; + int ndevices = 0; ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, "HMM_DMIRROR"); if (ret) goto err_unreg; - for (id = 0; id < DMIRROR_NDEVICES; id++) { + memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0])); + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + if (spm_addr_dev0 && spm_addr_dev1) { + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + } + for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) goto err_chrdev; @@ -1308,7 +1426,8 @@ static void __exit hmm_dmirror_exit(void) int id; for (id = 0; id < DMIRROR_NDEVICES; id++) - dmirror_device_remove(dmirror_devices + id); + if (dmirror_devices[id].zone_device_type) + dmirror_device_remove(dmirror_devices + id); unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 625f3690d086..e190b2ab6f19 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -33,11 +33,12 @@ struct hmm_dmirror_cmd { /* Expose the address space of the calling process through hmm device file */ #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. @@ -52,6 +53,8 @@ struct hmm_dmirror_cmd { * device the ioctl() is made * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some * other device + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device + * the ioctl() is made */ enum { HMM_DMIRROR_PROT_ERROR = 0xFF, @@ -63,6 +66,8 @@ enum { HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40, + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50, }; enum {
Device Coherent type uses device memory that is coherently accesible by the CPU. This could be shown as SP (special purpose) memory range at the BIOS-e820 memory enumeration. If no SP memory is supported in system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. Currently, test_hmm only supports two different SP ranges of at least 256MB size. This could be specified in the kernel parameter variable efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & 0x140000000 physical address. Ex. efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 Private and coherent device mirror instances can be created in the same probed. This is done by passing the module parameters spm_addr_dev0 & spm_addr_dev1. In this case, it will create four instances of device_mirror. The first two correspond to private device type, the last two to coherent type. Then, they can be easily accessed from user space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1 are for private, and 2 and 3 for coherent types. If no module parameters are passed, two instances of private type device_mirror will be created only. Signed-off-by: Alex Sierra <alex.sierra@amd.com> --- lib/test_hmm.c | 247 ++++++++++++++++++++++++++++++++------------ lib/test_hmm_uapi.h | 15 ++- 2 files changed, 193 insertions(+), 69 deletions(-)