diff mbox series

[v2,1/4] hmm: Device exclusive memory access

Message ID 20210219020750.16444-2-apopple@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add support for SVM atomics in Nouveau | expand

Commit Message

Alistair Popple Feb. 19, 2021, 2:07 a.m. UTC
Some devices require exclusive write access to shared virtual
memory (SVM) ranges to perform atomic operations on that memory. This
requires CPU page tables to be updated to deny access whilst atomic
operations are occurring.

In order to do this introduce a new swap entry
type (SWP_DEVICE_EXCLUSIVE). When a SVM range needs to be marked for
exclusive access by a device all page table mappings for the particular
range are replaced with device exclusive swap entries. This causes any
CPU access to the page to result in a fault.

Faults are resovled by replacing the faulting entry with the original
mapping. This results in MMU notifiers being called which a driver uses
to update access permissions such as revoking atomic access. After
notifiers have been called the device will no longer have exclusive
access to the region.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 Documentation/vm/hmm.rst |  15 +++
 fs/proc/task_mmu.c       |   7 ++
 include/linux/hmm.h      |   4 +
 include/linux/rmap.h     |   1 +
 include/linux/swap.h     |  10 +-
 include/linux/swapops.h  |  32 ++++++
 mm/hmm.c                 | 206 +++++++++++++++++++++++++++++++++++++++
 mm/memory.c              |  34 ++++++-
 mm/mprotect.c            |   7 ++
 mm/page_vma_mapped.c     |  14 ++-
 mm/rmap.c                |  29 +++++-
 11 files changed, 347 insertions(+), 12 deletions(-)

Comments

kernel test robot Feb. 19, 2021, 4:04 a.m. UTC | #1
Hi Alistair,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kselftest/next]
[also build test ERROR on linus/master v5.11 next-20210218]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alistair-Popple/Add-support-for-SVM-atomics-in-Nouveau/20210219-100858
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: mips-randconfig-r036-20210218 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project c9439ca36342fb6013187d0a69aef92736951476)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install mips cross compiling tool for clang build
        # apt-get install binutils-mips-linux-gnu
        # https://github.com/0day-ci/linux/commit/bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alistair-Popple/Add-support-for-SVM-atomics-in-Nouveau/20210219-100858
        git checkout bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=mips 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

>> fs/proc/task_mmu.c:521:12: error: implicit declaration of function 'is_device_exclusive_entry' [-Werror,-Wimplicit-function-declaration]
                   else if (is_device_exclusive_entry(swpent))
                            ^
   fs/proc/task_mmu.c:521:12: note: did you mean 'is_device_private_entry'?
   include/linux/swapops.h:176:20: note: 'is_device_private_entry' declared here
   static inline bool is_device_private_entry(swp_entry_t entry)
                      ^
>> fs/proc/task_mmu.c:522:11: error: implicit declaration of function 'device_exclusive_entry_to_page' [-Werror,-Wimplicit-function-declaration]
                           page = device_exclusive_entry_to_page(swpent);
                                  ^
   fs/proc/task_mmu.c:522:11: note: did you mean 'device_private_entry_to_page'?
   include/linux/swapops.h:191:28: note: 'device_private_entry_to_page' declared here
   static inline struct page *device_private_entry_to_page(swp_entry_t entry)
                              ^
>> fs/proc/task_mmu.c:522:9: warning: incompatible integer to pointer conversion assigning to 'struct page *' from 'int' [-Wint-conversion]
                           page = device_exclusive_entry_to_page(swpent);
                                ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs/proc/task_mmu.c:1395:7: error: implicit declaration of function 'is_device_exclusive_entry' [-Werror,-Wimplicit-function-declaration]
                   if (is_device_exclusive_entry(entry))
                       ^
   fs/proc/task_mmu.c:1396:11: error: implicit declaration of function 'device_exclusive_entry_to_page' [-Werror,-Wimplicit-function-declaration]
                           page = device_exclusive_entry_to_page(entry);
                                  ^
   fs/proc/task_mmu.c:1396:9: warning: incompatible integer to pointer conversion assigning to 'struct page *' from 'int' [-Wint-conversion]
                           page = device_exclusive_entry_to_page(entry);
                                ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   2 warnings and 4 errors generated.


vim +/is_device_exclusive_entry +521 fs/proc/task_mmu.c

   490	
   491	static void smaps_pte_entry(pte_t *pte, unsigned long addr,
   492			struct mm_walk *walk)
   493	{
   494		struct mem_size_stats *mss = walk->private;
   495		struct vm_area_struct *vma = walk->vma;
   496		bool locked = !!(vma->vm_flags & VM_LOCKED);
   497		struct page *page = NULL;
   498	
   499		if (pte_present(*pte)) {
   500			page = vm_normal_page(vma, addr, *pte);
   501		} else if (is_swap_pte(*pte)) {
   502			swp_entry_t swpent = pte_to_swp_entry(*pte);
   503	
   504			if (!non_swap_entry(swpent)) {
   505				int mapcount;
   506	
   507				mss->swap += PAGE_SIZE;
   508				mapcount = swp_swapcount(swpent);
   509				if (mapcount >= 2) {
   510					u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
   511	
   512					do_div(pss_delta, mapcount);
   513					mss->swap_pss += pss_delta;
   514				} else {
   515					mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
   516				}
   517			} else if (is_migration_entry(swpent))
   518				page = migration_entry_to_page(swpent);
   519			else if (is_device_private_entry(swpent))
   520				page = device_private_entry_to_page(swpent);
 > 521			else if (is_device_exclusive_entry(swpent))
 > 522				page = device_exclusive_entry_to_page(swpent);
   523		} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
   524								&& pte_none(*pte))) {
   525			page = xa_load(&vma->vm_file->f_mapping->i_pages,
   526							linear_page_index(vma, addr));
   527			if (xa_is_value(page))
   528				mss->swap += PAGE_SIZE;
   529			return;
   530		}
   531	
   532		if (!page)
   533			return;
   534	
   535		smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
   536	}
   537	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Alistair Popple Feb. 19, 2021, 4:29 a.m. UTC | #2
Apologies for the noise, looks like I don't have a CONFIG_DEVICE_PRIVATE=n 
build in my tests and missed creating definitions for the new static inline 
functions for that configuration.

I'll wait for some feedback on the overall approach and fix this in a v3.

 - Alistair

On Friday, 19 February 2021 3:04:07 PM AEDT kernel test robot wrote:
> External email: Use caution opening links or attachments
> 
> 
> Hi Alistair,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on kselftest/next]
> [also build test ERROR on linus/master v5.11 next-20210218]
> [cannot apply to hnaz-linux-mm/master]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
> 
> url:    https://github.com/0day-ci/linux/commits/Alistair-Popple/Add-support-for-SVM-atomics-in-Nouveau/20210219-100858
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
> config: mips-randconfig-r036-20210218 (attached as .config)
> compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 
c9439ca36342fb6013187d0a69aef92736951476)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/
make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install mips cross compiling tool for clang build
>         # apt-get install binutils-mips-linux-gnu
>         # https://github.com/0day-ci/linux/commit/
bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Alistair-Popple/Add-support-for-
SVM-atomics-in-Nouveau/20210219-100858
>         git checkout bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=mips
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All error/warnings (new ones prefixed by >>):
> 
> >> fs/proc/task_mmu.c:521:12: error: implicit declaration of function 
'is_device_exclusive_entry' [-Werror,-Wimplicit-function-declaration]
>                    else if (is_device_exclusive_entry(swpent))
>                             ^
>    fs/proc/task_mmu.c:521:12: note: did you mean 'is_device_private_entry'?
>    include/linux/swapops.h:176:20: note: 'is_device_private_entry' declared 
here
>    static inline bool is_device_private_entry(swp_entry_t entry)
>                       ^
> >> fs/proc/task_mmu.c:522:11: error: implicit declaration of function 
'device_exclusive_entry_to_page' [-Werror,-Wimplicit-function-declaration]
>                            page = device_exclusive_entry_to_page(swpent);
>                                   ^
>    fs/proc/task_mmu.c:522:11: note: did you mean 
'device_private_entry_to_page'?
>    include/linux/swapops.h:191:28: note: 'device_private_entry_to_page' 
declared here
>    static inline struct page *device_private_entry_to_page(swp_entry_t 
entry)
>                               ^
> >> fs/proc/task_mmu.c:522:9: warning: incompatible integer to pointer 
conversion assigning to 'struct page *' from 'int' [-Wint-conversion]
>                            page = device_exclusive_entry_to_page(swpent);
>                                 ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>    fs/proc/task_mmu.c:1395:7: error: implicit declaration of function 
'is_device_exclusive_entry' [-Werror,-Wimplicit-function-declaration]
>                    if (is_device_exclusive_entry(entry))
>                        ^
>    fs/proc/task_mmu.c:1396:11: error: implicit declaration of function 
'device_exclusive_entry_to_page' [-Werror,-Wimplicit-function-declaration]
>                            page = device_exclusive_entry_to_page(entry);
>                                   ^
>    fs/proc/task_mmu.c:1396:9: warning: incompatible integer to pointer 
conversion assigning to 'struct page *' from 'int' [-Wint-conversion]
>                            page = device_exclusive_entry_to_page(entry);
>                                 ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>    2 warnings and 4 errors generated.
> 
> 
> vim +/is_device_exclusive_entry +521 fs/proc/task_mmu.c
> 
>    490
>    491  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
>    492                  struct mm_walk *walk)
>    493  {
>    494          struct mem_size_stats *mss = walk->private;
>    495          struct vm_area_struct *vma = walk->vma;
>    496          bool locked = !!(vma->vm_flags & VM_LOCKED);
>    497          struct page *page = NULL;
>    498
>    499          if (pte_present(*pte)) {
>    500                  page = vm_normal_page(vma, addr, *pte);
>    501          } else if (is_swap_pte(*pte)) {
>    502                  swp_entry_t swpent = pte_to_swp_entry(*pte);
>    503
>    504                  if (!non_swap_entry(swpent)) {
>    505                          int mapcount;
>    506
>    507                          mss->swap += PAGE_SIZE;
>    508                          mapcount = swp_swapcount(swpent);
>    509                          if (mapcount >= 2) {
>    510                                  u64 pss_delta = (u64)PAGE_SIZE << 
PSS_SHIFT;
>    511
>    512                                  do_div(pss_delta, mapcount);
>    513                                  mss->swap_pss += pss_delta;
>    514                          } else {
>    515                                  mss->swap_pss += (u64)PAGE_SIZE << 
PSS_SHIFT;
>    516                          }
>    517                  } else if (is_migration_entry(swpent))
>    518                          page = migration_entry_to_page(swpent);
>    519                  else if (is_device_private_entry(swpent))
>    520                          page = device_private_entry_to_page(swpent);
>  > 521                  else if (is_device_exclusive_entry(swpent))
>  > 522                          page = 
device_exclusive_entry_to_page(swpent);
>    523          } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss-
>check_shmem_swap
>    524                                                          && 
pte_none(*pte))) {
>    525                  page = xa_load(&vma->vm_file->f_mapping->i_pages,
>    526                                                  
linear_page_index(vma, addr));
>    527                  if (xa_is_value(page))
>    528                          mss->swap += PAGE_SIZE;
>    529                  return;
>    530          }
>    531
>    532          if (!page)
>    533                  return;
>    534
>    535          smaps_account(mss, page, false, pte_young(*pte), 
pte_dirty(*pte), locked);
>    536  }
>    537
> 
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Feb. 19, 2021, 5:09 a.m. UTC | #3
Hi Alistair,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kselftest/next]
[also build test ERROR on linus/master v5.11 next-20210218]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Alistair-Popple/Add-support-for-SVM-atomics-in-Nouveau/20210219-100858
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: x86_64-randconfig-s021-20210217 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-215-g0fb77bb6-dirty
        # https://github.com/0day-ci/linux/commit/bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Alistair-Popple/Add-support-for-SVM-atomics-in-Nouveau/20210219-100858
        git checkout bb5444811772d30b2e3bbaa44baeb8a4b3f03cec
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: warning: orphan section `.data..decrypted' from `arch/x86/kernel/cpu/vmware.o' being placed in section `.data..decrypted'
   ld: warning: orphan section `.data..decrypted' from `arch/x86/kernel/kvm.o' being placed in section `.data..decrypted'
   ld: mm/memory.o: in function `do_swap_page':
>> mm/memory.c:3300: undefined reference to `hmm_remove_exclusive_entry'


vim +3300 mm/memory.c

  3270	
  3271	/*
  3272	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3273	 * but allow concurrent faults), and pte mapped but not yet locked.
  3274	 * We return with pte unmapped and unlocked.
  3275	 *
  3276	 * We return with the mmap_lock locked or unlocked in the same cases
  3277	 * as does filemap_fault().
  3278	 */
  3279	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3280	{
  3281		struct vm_area_struct *vma = vmf->vma;
  3282		struct page *page = NULL, *swapcache;
  3283		swp_entry_t entry;
  3284		pte_t pte;
  3285		int locked;
  3286		int exclusive = 0;
  3287		vm_fault_t ret = 0;
  3288		void *shadow = NULL;
  3289	
  3290		if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
  3291			goto out;
  3292	
  3293		entry = pte_to_swp_entry(vmf->orig_pte);
  3294		if (unlikely(non_swap_entry(entry))) {
  3295			if (is_migration_entry(entry)) {
  3296				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3297						     vmf->address);
  3298			} else if (is_device_exclusive_entry(entry)) {
  3299				vmf->page = device_exclusive_entry_to_page(entry);
> 3300				ret = hmm_remove_exclusive_entry(vmf);
  3301			} else if (is_device_private_entry(entry)) {
  3302				vmf->page = device_private_entry_to_page(entry);
  3303				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3304			} else if (is_hwpoison_entry(entry)) {
  3305				ret = VM_FAULT_HWPOISON;
  3306			} else {
  3307				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3308				ret = VM_FAULT_SIGBUS;
  3309			}
  3310			goto out;
  3311		}
  3312	
  3313	
  3314		delayacct_set_flag(DELAYACCT_PF_SWAPIN);
  3315		page = lookup_swap_cache(entry, vma, vmf->address);
  3316		swapcache = page;
  3317	
  3318		if (!page) {
  3319			struct swap_info_struct *si = swp_swap_info(entry);
  3320	
  3321			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3322			    __swap_count(entry) == 1) {
  3323				/* skip swapcache */
  3324				page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
  3325								vmf->address);
  3326				if (page) {
  3327					int err;
  3328	
  3329					__SetPageLocked(page);
  3330					__SetPageSwapBacked(page);
  3331					set_page_private(page, entry.val);
  3332	
  3333					/* Tell memcg to use swap ownership records */
  3334					SetPageSwapCache(page);
  3335					err = mem_cgroup_charge(page, vma->vm_mm,
  3336								GFP_KERNEL);
  3337					ClearPageSwapCache(page);
  3338					if (err) {
  3339						ret = VM_FAULT_OOM;
  3340						goto out_page;
  3341					}
  3342	
  3343					shadow = get_shadow_from_swap_cache(entry);
  3344					if (shadow)
  3345						workingset_refault(page, shadow);
  3346	
  3347					lru_cache_add(page);
  3348					swap_readpage(page, true);
  3349				}
  3350			} else {
  3351				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3352							vmf);
  3353				swapcache = page;
  3354			}
  3355	
  3356			if (!page) {
  3357				/*
  3358				 * Back out if somebody else faulted in this pte
  3359				 * while we released the pte lock.
  3360				 */
  3361				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3362						vmf->address, &vmf->ptl);
  3363				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3364					ret = VM_FAULT_OOM;
  3365				delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3366				goto unlock;
  3367			}
  3368	
  3369			/* Had to read the page from swap area: Major fault */
  3370			ret = VM_FAULT_MAJOR;
  3371			count_vm_event(PGMAJFAULT);
  3372			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3373		} else if (PageHWPoison(page)) {
  3374			/*
  3375			 * hwpoisoned dirty swapcache pages are kept for killing
  3376			 * owner processes (which may be unknown at hwpoison time)
  3377			 */
  3378			ret = VM_FAULT_HWPOISON;
  3379			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3380			goto out_release;
  3381		}
  3382	
  3383		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
  3384	
  3385		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  3386		if (!locked) {
  3387			ret |= VM_FAULT_RETRY;
  3388			goto out_release;
  3389		}
  3390	
  3391		/*
  3392		 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
  3393		 * release the swapcache from under us.  The page pin, and pte_same
  3394		 * test below, are not enough to exclude that.  Even if it is still
  3395		 * swapcache, we need to check that the page's swap has not changed.
  3396		 */
  3397		if (unlikely((!PageSwapCache(page) ||
  3398				page_private(page) != entry.val)) && swapcache)
  3399			goto out_page;
  3400	
  3401		page = ksm_might_need_to_copy(page, vma, vmf->address);
  3402		if (unlikely(!page)) {
  3403			ret = VM_FAULT_OOM;
  3404			page = swapcache;
  3405			goto out_page;
  3406		}
  3407	
  3408		cgroup_throttle_swaprate(page, GFP_KERNEL);
  3409	
  3410		/*
  3411		 * Back out if somebody else already faulted in this pte.
  3412		 */
  3413		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3414				&vmf->ptl);
  3415		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3416			goto out_nomap;
  3417	
  3418		if (unlikely(!PageUptodate(page))) {
  3419			ret = VM_FAULT_SIGBUS;
  3420			goto out_nomap;
  3421		}
  3422	
  3423		/*
  3424		 * The page isn't present yet, go ahead with the fault.
  3425		 *
  3426		 * Be careful about the sequence of operations here.
  3427		 * To get its accounting right, reuse_swap_page() must be called
  3428		 * while the page is counted on swap but not yet in mapcount i.e.
  3429		 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
  3430		 * must be called after the swap_free(), or it will never succeed.
  3431		 */
  3432	
  3433		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
  3434		dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
  3435		pte = mk_pte(page, vma->vm_page_prot);
  3436		if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
  3437			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3438			vmf->flags &= ~FAULT_FLAG_WRITE;
  3439			ret |= VM_FAULT_WRITE;
  3440			exclusive = RMAP_EXCLUSIVE;
  3441		}
  3442		flush_icache_page(vma, page);
  3443		if (pte_swp_soft_dirty(vmf->orig_pte))
  3444			pte = pte_mksoft_dirty(pte);
  3445		if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3446			pte = pte_mkuffd_wp(pte);
  3447			pte = pte_wrprotect(pte);
  3448		}
  3449		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3450		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3451		vmf->orig_pte = pte;
  3452	
  3453		/* ksm created a completely new copy */
  3454		if (unlikely(page != swapcache && swapcache)) {
  3455			page_add_new_anon_rmap(page, vma, vmf->address, false);
  3456			lru_cache_add_inactive_or_unevictable(page, vma);
  3457		} else {
  3458			do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
  3459		}
  3460	
  3461		swap_free(entry);
  3462		if (mem_cgroup_swap_full(page) ||
  3463		    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  3464			try_to_free_swap(page);
  3465		unlock_page(page);
  3466		if (page != swapcache && swapcache) {
  3467			/*
  3468			 * Hold the lock to avoid the swap entry to be reused
  3469			 * until we take the PT lock for the pte_same() check
  3470			 * (to avoid false positives from pte_same). For
  3471			 * further safety release the lock after the swap_free
  3472			 * so that the swap count won't change under a
  3473			 * parallel locked swapcache.
  3474			 */
  3475			unlock_page(swapcache);
  3476			put_page(swapcache);
  3477		}
  3478	
  3479		if (vmf->flags & FAULT_FLAG_WRITE) {
  3480			ret |= do_wp_page(vmf);
  3481			if (ret & VM_FAULT_ERROR)
  3482				ret &= VM_FAULT_ERROR;
  3483			goto out;
  3484		}
  3485	
  3486		/* No need to invalidate - it was non-present before */
  3487		update_mmu_cache(vma, vmf->address, vmf->pte);
  3488	unlock:
  3489		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3490	out:
  3491		return ret;
  3492	out_nomap:
  3493		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3494	out_page:
  3495		unlock_page(page);
  3496	out_release:
  3497		put_page(page);
  3498		if (page != swapcache && swapcache) {
  3499			unlock_page(swapcache);
  3500			put_page(swapcache);
  3501		}
  3502		return ret;
  3503	}
  3504	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Christoph Hellwig Feb. 19, 2021, 9:47 a.m. UTC | #4
>  			page = migration_entry_to_page(swpent);
>  		else if (is_device_private_entry(swpent))
>  			page = device_private_entry_to_page(swpent);
> +		else if (is_device_exclusive_entry(swpent))
> +			page = device_exclusive_entry_to_page(swpent);

>  			page = migration_entry_to_page(swpent);
>  		else if (is_device_private_entry(swpent))
>  			page = device_private_entry_to_page(swpent);
> +		else if (is_device_exclusive_entry(swpent))
> +			page = device_exclusive_entry_to_page(swpent);

>  		if (is_device_private_entry(entry))
>  			page = device_private_entry_to_page(entry);
> +
> +		if (is_device_exclusive_entry(entry))
> +			page = device_exclusive_entry_to_page(entry);

Any chance we can come up with a clever scheme to avoid all this
boilerplate code (and maybe also what it gets compiled to)?

> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 866a0fa104c4..5d28ff6d4d80 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -109,6 +109,10 @@ struct hmm_range {
>   */
>  int hmm_range_fault(struct hmm_range *range);
>  
> +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> +			unsigned long end, struct page **pages);
> +vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf);

Can we avoid the hmm naming for new code (we should probably also kill
it off for the existing code)?

> +#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
> +					|| is_device_exclusive_entry(e)); })
> +#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
> +					|| is_device_exclusive_entry(e)); })

Can you turn these into properly formatted inline functions?  As-is this
becomes pretty unreadable.

> +static inline void make_device_exclusive_entry_read(swp_entry_t *entry)
> +{
> +	*entry = swp_entry(SWP_DEVICE_EXCLUSIVE_READ, swp_offset(*entry));
> +}

s/make_device_exclusive_entry_read/mark_device_exclusive_entry_readable/
??

> +
> +static inline swp_entry_t make_device_exclusive_entry(struct page *page, bool write)
> +{
> +	return swp_entry(write ? SWP_DEVICE_EXCLUSIVE_WRITE : SWP_DEVICE_EXCLUSIVE_READ,
> +			 page_to_pfn(page));
> +}

I'd split this into two helpers, which is easier to follow and avoids
the pointlessly overlong lines.

> +static inline bool is_device_exclusive_entry(swp_entry_t entry)
> +{
> +	int type = swp_type(entry);
> +	return type == SWP_DEVICE_EXCLUSIVE_READ || type == SWP_DEVICE_EXCLUSIVE_WRITE;
> +}

Another overly long line.  I also wouldn't bother with the local
variable:

	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
		swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
		

> +static inline bool is_write_device_exclusive_entry(swp_entry_t entry)
> +{
> +	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
> +}

Or reuse these kind of helpers..

> +
> +static inline unsigned long device_exclusive_entry_to_pfn(swp_entry_t entry)
> +{
> +	return swp_offset(entry);
> +}
> +
> +static inline struct page *device_exclusive_entry_to_page(swp_entry_t entry)
> +{
> +	return pfn_to_page(swp_offset(entry));
> +}

I'd rather open code these two, and as a prep patch also kill off the
equivalents for the migration and device private entries, which would
actually clean up a lot of the mess mentioned in my first comment above.

> +static int hmm_exclusive_skip(unsigned long start,
> +		      unsigned long end,
> +		      __always_unused int depth,
> +		      struct mm_walk *walk)
> +{
> +	struct hmm_exclusive_walk *hmm_exclusive_walk = walk->private;
> +	unsigned long addr;
> +
> +	for (addr = start; addr < end; addr += PAGE_SIZE)
> +		hmm_exclusive_walk->pages[hmm_exclusive_walk->npages++] = NULL;
> +
> +	return 0;
> +}

Wouldn't pre-zeroing the array be simpler and more efficient?

> +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> +			unsigned long end, struct page **pages)
> +{
> +	struct hmm_exclusive_walk hmm_exclusive_walk = { .pages = pages, .npages = 0 };
> +	int i;
> +
> +	/* Collect and lock candidate pages */
> +	walk_page_range(mm, start, end, &hmm_exclusive_walk_ops, &hmm_exclusive_walk);

Please avoid the overly long lines.

But more importantly:  Unless I'm missing something obvious this
walk_page_range call just open codes get_user_pages_fast, why can't you
use that?

> +#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_HUGETLB)
> +		if (PageTransHuge(page)) {
> +			VM_BUG_ON_PAGE(1, page);
> +			continue;
> +		}
> +#endif

Doesn't PageTransHuge always return false for that case?  If not
shouldn't we make sure it does?

> +
> +		pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
> +		if (pte_swp_soft_dirty(*pvmw.pte))
> +			pte = pte_mksoft_dirty(pte);
> +
> +		entry = pte_to_swp_entry(*pvmw.pte);
> +		if (pte_swp_uffd_wp(*pvmw.pte))
> +			pte = pte_mkuffd_wp(pte);
> +		else if (is_write_device_exclusive_entry(entry))
> +			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> +
> +		set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
> +
> +		/*
> +		 * No need to take a page reference as one was already
> +		 * created when the swap entry was made.
> +		 */
> +		if (PageAnon(page))
> +			page_add_anon_rmap(page, vma, pvmw.address, false);
> +		else
> +			page_add_file_rmap(page, false);
> +
> +		if (vma->vm_flags & VM_LOCKED)
> +			mlock_vma_page(page);
> +
> +		/*
> +		 * No need to invalidate - it was non-present before. However
> +		 * secondary CPUs may have mappings that need invalidating.
> +		 */
> +		update_mmu_cache(vma, pvmw.address, pvmw.pte);

It would be nice to split out the body of this loop into a helper.

> +				if (!is_device_private_entry(entry) &&
> +					!is_device_exclusive_entry(entry))

The normal style for this would be:

				if (!is_device_private_entry(entry) &&
				    !is_device_exclusive_entry(entry))

> -		if (!is_device_private_entry(entry))
> +		if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry))

Plase split this into two lines.

> @@ -216,6 +219,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	}
>  	if (!map_pte(pvmw))
>  		goto next_pte;
> +
>  	while (1) {
>  		if (check_pte(pvmw))
>  			return true;

Spurious whitespace change.

> -	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
> +	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION | TTU_EXCLUSIVE)) &&

Please split this into two lines.

>  	    is_zone_device_page(page) && !is_device_private_page(page))
>  		return true;
>  
> @@ -1591,6 +1591,33 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>  			/* We have to invalidate as we cleared the pte */
>  			mmu_notifier_invalidate_range(mm, address,
>  						      address + PAGE_SIZE);
> +		} else if (flags & TTU_EXCLUSIVE) {

try_to_unmap_one has turned into a monster.  A little refactoring to
split it into managable pieces would be nice.
Jason Gunthorpe Feb. 19, 2021, 2:01 p.m. UTC | #5
On Fri, Feb 19, 2021 at 09:47:41AM +0000, Christoph Hellwig wrote:

> > diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> > index 866a0fa104c4..5d28ff6d4d80 100644
> > +++ b/include/linux/hmm.h
> > @@ -109,6 +109,10 @@ struct hmm_range {
> >   */
> >  int hmm_range_fault(struct hmm_range *range);
> >  
> > +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> > +			unsigned long end, struct page **pages);
> > +vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf);
> 
> Can we avoid the hmm naming for new code (we should probably also kill
> it off for the existing code)?

Yes please, I'd prefer it if hmm.c was just the special page walker
and not a grab bag of unrelated things

Is there is a more natural place to put this in the mm for this idea?

Jason
Alistair Popple Feb. 22, 2021, 10:46 a.m. UTC | #6
On Friday, 19 February 2021 8:47:41 PM AEDT Christoph Hellwig wrote:
> >  			page = migration_entry_to_page(swpent);
> >  		else if (is_device_private_entry(swpent))
> >  			page = device_private_entry_to_page(swpent);
> > +		else if (is_device_exclusive_entry(swpent))
> > +			page = device_exclusive_entry_to_page(swpent);
> 
> >  			page = migration_entry_to_page(swpent);
> >  		else if (is_device_private_entry(swpent))
> >  			page = device_private_entry_to_page(swpent);
> > +		else if (is_device_exclusive_entry(swpent))
> > +			page = device_exclusive_entry_to_page(swpent);
> 
> >  		if (is_device_private_entry(entry))
> >  			page = device_private_entry_to_page(entry);
> > +
> > +		if (is_device_exclusive_entry(entry))
> > +			page = device_exclusive_entry_to_page(entry);
> 
> Any chance we can come up with a clever scheme to avoid all this
> boilerplate code (and maybe also what it gets compiled to)?

If I open code the entry_to_page() functions as suggested below then these 
simplify down to single if statements like so:

                if (is_migration_entry(entry) ||
                    is_device_private_entry(entry) ||
                    is_device_exclusive_entry(entry))
                        page = pfn_to_page(swp_offset(entry));

I could simplify further by hiding that in a single static inline like so:

static inline bool is_special_entry(swp_entry_t entry)
{
	return is_migration_entry(entry) ||
			is_device_private_entry(entry) ||
          is_device_exclusive_entry(entry);
}

My only concern with doing that is these entries can't *always* be treated the 
same so it might make it too easy to overlook the subtle differences.

> > diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> > index 866a0fa104c4..5d28ff6d4d80 100644
> > --- a/include/linux/hmm.h
> > +++ b/include/linux/hmm.h
> > @@ -109,6 +109,10 @@ struct hmm_range {
> >   */
> >  int hmm_range_fault(struct hmm_range *range);
> >  
> > +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> > +			unsigned long end, struct page **pages);
> > +vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf);
> 
> Can we avoid the hmm naming for new code (we should probably also kill
> it off for the existing code)?

Sure. I ended up stuffing it in there for the moment because I didn't know of 
any other "obvious" spot to put it. How about these for names:

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, struct page **pages);
vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf);

I am open to any alternative naming suggestions though.

> > +#define free_swap_and_cache(e) ({(is_migration_entry(e) || 
is_device_private_entry(e) \
> > +					|| is_device_exclusive_entry(e)); })
> > +#define swapcache_prepare(e) ({(is_migration_entry(e) || 
is_device_private_entry(e) \
> > +					|| is_device_exclusive_entry(e)); })
> Can you turn these into properly formatted inline functions?  As-is this
> becomes pretty unreadable.

Ok, if I add a is_special_entry() function as suggested above these could just 
use that.
 
> > +static inline void make_device_exclusive_entry_read(swp_entry_t *entry)
> > +{
> > +	*entry = swp_entry(SWP_DEVICE_EXCLUSIVE_READ, swp_offset(*entry));
> > +}
> 
> s/make_device_exclusive_entry_read/mark_device_exclusive_entry_readable/
> ??

See my next comment.

> > +
> > +static inline swp_entry_t make_device_exclusive_entry(struct page *page, 
bool write)
> > +{
> > +	return swp_entry(write ? SWP_DEVICE_EXCLUSIVE_WRITE : 
SWP_DEVICE_EXCLUSIVE_READ,
> > +			 page_to_pfn(page));
> > +}
> 
> I'd split this into two helpers, which is easier to follow and avoids
> the pointlessly overlong lines.

I assume you mean separate read and write functions instead of using the write 
flag?

These are based on the existing device private functions which themselves 
looked to be based on the migration entry functions. It would be good to keep 
these consistent so when making the changes above I would also refactor the 
existing make_device_private_entry() and make_migration_entry() functions to 
match as well.

As a test I tried refactoring the migration entry functions into the below and 
it seemed to make things a bit easier to follow if not a little verbose:

static inline int is_writable_migration_entry(swp_entry_t entry);
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset);
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset);
static inline int is_migration_entry(swp_entry_t entry);

So I can do that along with the same refactoring of device private accessor 
functions as a prep patch if that seems reasonable?

> > +static inline bool is_device_exclusive_entry(swp_entry_t entry)
> > +{
> > +	int type = swp_type(entry);
> > +	return type == SWP_DEVICE_EXCLUSIVE_READ || type == 
SWP_DEVICE_EXCLUSIVE_WRITE;
> > +}
> 
> Another overly long line.  I also wouldn't bother with the local
> variable:
> 
> 	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
> 		swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
> 		
> 
> > +static inline bool is_write_device_exclusive_entry(swp_entry_t entry)
> > +{
> > +	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
> > +}
> 
> Or reuse these kind of helpers..
>
> > +
> > +static inline unsigned long device_exclusive_entry_to_pfn(swp_entry_t 
entry)
> > +{
> > +	return swp_offset(entry);
> > +}
> > +
> > +static inline struct page *device_exclusive_entry_to_page(swp_entry_t 
entry)
> > +{
> > +	return pfn_to_page(swp_offset(entry));
> > +}
> 
> I'd rather open code these two, and as a prep patch also kill off the
> equivalents for the migration and device private entries, which would
> actually clean up a lot of the mess mentioned in my first comment above.

Ok, I have just tried doing that and it ends up being more concise and 
accurate so I'll add it to the series. 

> > +static int hmm_exclusive_skip(unsigned long start,
> > +		      unsigned long end,
> > +		      __always_unused int depth,
> > +		      struct mm_walk *walk)
> > +{
> > +	struct hmm_exclusive_walk *hmm_exclusive_walk = walk->private;
> > +	unsigned long addr;
> > +
> > +	for (addr = start; addr < end; addr += PAGE_SIZE)
> > +		hmm_exclusive_walk->pages[hmm_exclusive_walk->npages++] = NULL;
> > +
> > +	return 0;
> > +}
> 
> Wouldn't pre-zeroing the array be simpler and more efficient?

Good point. I had other code here but I didn't need it and should have just 
removed what was left.

> > +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> > +			unsigned long end, struct page **pages)
> > +{
> > +	struct hmm_exclusive_walk hmm_exclusive_walk = { .pages = pages, 
.npages = 0 };
> > +	int i;
> > +
> > +	/* Collect and lock candidate pages */
> > +	walk_page_range(mm, start, end, &hmm_exclusive_walk_ops, 
&hmm_exclusive_walk);
> 
> Please avoid the overly long lines.
> 
> But more importantly:  Unless I'm missing something obvious this
> walk_page_range call just open codes get_user_pages_fast, why can't you
> use that?

Good idea, you aren't missing anything although I don't think 
get_user_pages_fast() will work as the driver needs to be able to operate on a 
remote mm_struct. 

But something like get_user_pages_remote() would work better than open-coding 
it.

> > +#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_HUGETLB)
> > +		if (PageTransHuge(page)) {
> > +			VM_BUG_ON_PAGE(1, page);
> > +			continue;
> > +		}
> > +#endif
> 
> Doesn't PageTransHuge always return false for that case?  If not
> shouldn't we make sure it does?

Right, this is probably an overly defensive check so I'll remove it.

[snip - will address code format/structure suggestions]

> 
> try_to_unmap_one has turned into a monster.  A little refactoring to
> split it into managable pieces would be nice.
> 

Agreed, adding this it did seem a little unwieldy. I will see if I can do a 
bit of refactoring for the next version.

 - Alistair
diff mbox series

Patch

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 09e28507f5b2..ffdc58cb2e7c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -405,6 +405,21 @@  between device driver specific code and shared common code:
 
    The lock can now be released.
 
+Exclusive access memory
+=======================
+
+Not all devices support atomic access to system memory. To support atomic
+operations to a shared virtual memory page such a device needs access to that
+page which is exclusive of any userspace access from the CPU. The
+``hmm_exclusive_range()`` function can be used to make a memory range
+inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page.
+
 Memory cgroup (memcg) and rss accounting
 ========================================
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 602e3a52884d..79aaf8768be3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,8 @@  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			page = migration_entry_to_page(swpent);
 		else if (is_device_private_entry(swpent))
 			page = device_private_entry_to_page(swpent);
+		else if (is_device_exclusive_entry(swpent))
+			page = device_exclusive_entry_to_page(swpent);
 	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 							&& pte_none(*pte))) {
 		page = xa_load(&vma->vm_file->f_mapping->i_pages,
@@ -695,6 +697,8 @@  static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 			page = migration_entry_to_page(swpent);
 		else if (is_device_private_entry(swpent))
 			page = device_private_entry_to_page(swpent);
+		else if (is_device_exclusive_entry(swpent))
+			page = device_exclusive_entry_to_page(swpent);
 	}
 	if (page) {
 		int mapcount = page_mapcount(page);
@@ -1387,6 +1391,9 @@  static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 
 		if (is_device_private_entry(entry))
 			page = device_private_entry_to_page(entry);
+
+		if (is_device_exclusive_entry(entry))
+			page = device_exclusive_entry_to_page(entry);
 	}
 
 	if (page && !PageAnon(page))
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 866a0fa104c4..5d28ff6d4d80 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -109,6 +109,10 @@  struct hmm_range {
  */
 int hmm_range_fault(struct hmm_range *range);
 
+int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end, struct page **pages);
+vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf);
+
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
  *
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 70085ca1a3fc..5503fc4d1138 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -98,6 +98,7 @@  enum ttu_flags {
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
 	TTU_SPLIT_FREEZE	= 0x100,		/* freeze pte under splitting thp */
+	TTU_EXCLUSIVE		= 0x200,		/* replace with exclusive access */
 };
 
 #ifdef CONFIG_MMU
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 596bc2f4d9b0..9803753b0302 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -63,9 +63,11 @@  static inline int current_is_kswapd(void)
  * to a special SWP_DEVICE_* entry.
  */
 #ifdef CONFIG_DEVICE_PRIVATE
-#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_NUM 4
 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
 #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
+#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
 #else
 #define SWP_DEVICE_NUM 0
 #endif
@@ -519,8 +521,10 @@  static inline void show_swap_cache_info(void)
 {
 }
 
-#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
-#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
+					|| is_device_exclusive_entry(e)); })
+#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
+					|| is_device_exclusive_entry(e)); })
 
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d9b7c9132c2f..a307134bfee5 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -122,6 +122,38 @@  static inline bool is_write_device_private_entry(swp_entry_t entry)
 	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
 }
 
+static inline void make_device_exclusive_entry_read(swp_entry_t *entry)
+{
+	*entry = swp_entry(SWP_DEVICE_EXCLUSIVE_READ, swp_offset(*entry));
+}
+
+static inline swp_entry_t make_device_exclusive_entry(struct page *page, bool write)
+{
+	return swp_entry(write ? SWP_DEVICE_EXCLUSIVE_WRITE : SWP_DEVICE_EXCLUSIVE_READ,
+			 page_to_pfn(page));
+}
+
+static inline bool is_device_exclusive_entry(swp_entry_t entry)
+{
+	int type = swp_type(entry);
+	return type == SWP_DEVICE_EXCLUSIVE_READ || type == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+static inline bool is_write_device_exclusive_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
+}
+
+static inline unsigned long device_exclusive_entry_to_pfn(swp_entry_t entry)
+{
+	return swp_offset(entry);
+}
+
+static inline struct page *device_exclusive_entry_to_page(swp_entry_t entry)
+{
+	return pfn_to_page(swp_offset(entry));
+}
+
 static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
 {
 	return swp_offset(entry);
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..90cdf99bee52 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@ 
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
+#include "internal.h"
+
 struct hmm_vma_walk {
 	struct hmm_range	*range;
 	unsigned long		last;
@@ -272,6 +274,9 @@  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		if (!non_swap_entry(entry))
 			goto fault;
 
+		if (is_device_exclusive_entry(entry))
+			goto fault;
+
 		if (is_migration_entry(entry)) {
 			pte_unmap(ptep);
 			hmm_vma_walk->last = addr;
@@ -593,3 +598,204 @@  int hmm_range_fault(struct hmm_range *range)
 	return ret;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+struct hmm_exclusive_walk {
+	struct page **pages;
+	int npages;
+};
+
+static int hmm_exclusive_skip(unsigned long start,
+		      unsigned long end,
+		      __always_unused int depth,
+		      struct mm_walk *walk)
+{
+	struct hmm_exclusive_walk *hmm_exclusive_walk = walk->private;
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE)
+		hmm_exclusive_walk->pages[hmm_exclusive_walk->npages++] = NULL;
+
+	return 0;
+}
+
+static int hmm_exclusive_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
+		  struct mm_walk *walk)
+{
+	struct hmm_exclusive_walk *hmm_exclusive_walk = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	if (pmd_trans_huge(*pmdp)) {
+		int ret;
+		struct page *page;
+
+		ptl = pmd_lock(mm, pmdp);
+		if (unlikely(!pmd_trans_huge(*pmdp))) {
+			spin_unlock(ptl);
+			walk->action = ACTION_AGAIN;
+			return 0;
+		}
+
+		page = pmd_page(*pmdp);
+		if (is_huge_zero_page(page)) {
+			spin_unlock(ptl);
+			return hmm_exclusive_skip(addr, end, -1, walk);
+		}
+
+		get_page(page);
+		spin_unlock(ptl);
+		if (unlikely(!trylock_page(page)))
+			return hmm_exclusive_skip(addr, end, -1, walk);
+
+		ret = split_huge_page(page);
+		unlock_page(page);
+		put_page(page);
+		if (ret || pmd_none(*pmdp))
+			return hmm_exclusive_skip(addr, end, -1, walk);
+	}
+
+	if (unlikely(pmd_bad(*pmdp)))
+		return hmm_exclusive_skip(addr, end, -1, walk);
+
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	for (; addr < end; addr += PAGE_SIZE, ptep++) {
+		struct page *page;
+		pte_t pte = *ptep;
+
+		if (pte_none(pte) || !pte_present(pte) ||
+		    is_zero_pfn(pte_pfn(pte)))
+			continue;
+
+		page = vm_normal_page(vma, addr, pte);
+		if (!page)
+			continue;
+
+		get_page(page);
+		if (!trylock_page(page)) {
+			put_page(page);
+			continue;
+		}
+
+		hmm_exclusive_walk->pages[hmm_exclusive_walk->npages++] = page;
+	}
+
+	pte_unmap_unlock(ptep - 1, ptl);
+	return 0;
+}
+
+static const struct mm_walk_ops hmm_exclusive_walk_ops = {
+	.pmd_entry	= hmm_exclusive_pmd,
+	.pte_hole	= hmm_exclusive_skip,
+};
+
+/**
+ * hmm_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of assoicated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully mark for exclusive acces
+ *
+ * Returns: number of pages successfully marked for exclusive access
+ *
+ * This function finds the ptes mapping page(s) to the given address range and
+ * replaces them with special swap entries preventing userspace CPU access. On
+ * fault these entries are replaced with the original mapping after calling MMU
+ * notifiers.
+ */
+int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end, struct page **pages)
+{
+	struct hmm_exclusive_walk hmm_exclusive_walk = { .pages = pages, .npages = 0 };
+	int i;
+
+	/* Collect and lock candidate pages */
+	walk_page_range(mm, start, end, &hmm_exclusive_walk_ops, &hmm_exclusive_walk);
+
+	for (i = 0; i < hmm_exclusive_walk.npages; i++)
+		if (pages[i])
+			if (!try_to_unmap(pages[i], TTU_EXCLUSIVE | TTU_IGNORE_MLOCK)) {
+				unlock_page(pages[i]);
+				put_page(pages[i]);
+				pages[i] = NULL;
+			}
+
+	return hmm_exclusive_walk.npages;
+}
+EXPORT_SYMBOL_GPL(hmm_exclusive_range);
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct vm_area_struct *vma = vmf->vma;
+	struct page_vma_mapped_walk pvmw = {
+		.page = page,
+		.vma = vma,
+		.address = vmf->address,
+		.flags = PVMW_SYNC,
+	};
+	pte_t pte;
+	vm_fault_t ret = 0;
+	struct mmu_notifier_range range;
+
+	lock_page(page);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	while (page_vma_mapped_walk(&pvmw)) {
+		swp_entry_t entry;
+
+		if (unlikely(!pte_same(*pvmw.pte, vmf->orig_pte))) {
+			page_vma_mapped_walk_done(&pvmw);
+			break;
+		}
+
+#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_HUGETLB)
+		if (PageTransHuge(page)) {
+			VM_BUG_ON_PAGE(1, page);
+			continue;
+		}
+#endif
+
+		pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+		if (pte_swp_soft_dirty(*pvmw.pte))
+			pte = pte_mksoft_dirty(pte);
+
+		entry = pte_to_swp_entry(*pvmw.pte);
+		if (pte_swp_uffd_wp(*pvmw.pte))
+			pte = pte_mkuffd_wp(pte);
+		else if (is_write_device_exclusive_entry(entry))
+			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+		set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+
+		/*
+		 * No need to take a page reference as one was already
+		 * created when the swap entry was made.
+		 */
+		if (PageAnon(page))
+			page_add_anon_rmap(page, vma, pvmw.address, false);
+		else
+			page_add_file_rmap(page, false);
+
+		if (vma->vm_flags & VM_LOCKED)
+			mlock_vma_page(page);
+
+		/*
+		 * No need to invalidate - it was non-present before. However
+		 * secondary CPUs may have mappings that need invalidating.
+		 */
+		update_mmu_cache(vma, pvmw.address, pvmw.pte);
+	}
+
+	unlock_page(page);
+
+	mmu_notifier_invalidate_range_end(&range);
+	return ret;
+}
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..ed03a37f25cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -73,6 +73,7 @@ 
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/hmm.h>
 
 #include <trace/events/kmem.h>
 
@@ -736,9 +737,29 @@  copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
-	} else if (is_device_private_entry(entry)) {
+	} else if (is_device_exclusive_entry(entry)) {
 		page = device_private_entry_to_page(entry);
 
+		get_page(page);
+		rss[mm_counter(page)]++;
+
+		if (is_write_device_exclusive_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
+			/*
+			 * COW mappings require pages in both
+			 * parent and child to be set to read.
+			 */
+			make_device_exclusive_entry_read(&entry);
+			pte = swp_entry_to_pte(entry);
+			if (pte_swp_soft_dirty(*src_pte))
+				pte = pte_swp_mksoft_dirty(pte);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
+			set_pte_at(src_mm, addr, src_pte, pte);
+		}
+	} else if (is_device_private_entry(entry)) {
+		page = device_exclusive_entry_to_page(entry);
+
 		/*
 		 * Update rss count even for unaddressable pages, as
 		 * they should treated just like normal pages in this
@@ -1273,7 +1294,8 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		}
 
 		entry = pte_to_swp_entry(ptent);
-		if (is_device_private_entry(entry)) {
+		if (is_device_private_entry(entry) ||
+		    is_device_exclusive_entry(entry)) {
 			struct page *page = device_private_entry_to_page(entry);
 
 			if (unlikely(details && details->check_mapping)) {
@@ -1289,7 +1311,10 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 
 			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 			rss[mm_counter(page)]--;
-			page_remove_rmap(page, false);
+
+			if (is_device_private_entry(entry))
+				page_remove_rmap(page, false);
+
 			put_page(page);
 			continue;
 		}
@@ -3270,6 +3295,9 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (is_migration_entry(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
+		} else if (is_device_exclusive_entry(entry)) {
+			vmf->page = device_exclusive_entry_to_page(entry);
+			ret = hmm_remove_exclusive_entry(vmf);
 		} else if (is_device_private_entry(entry)) {
 			vmf->page = device_private_entry_to_page(entry);
 			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ab709023e9aa..f93967211e52 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -163,6 +163,13 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_uffd_wp(oldpte))
 					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_exclusive_entry(entry)) {
+				make_device_exclusive_entry_read(&entry);
+				newpte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(oldpte))
+					newpte = pte_swp_mksoft_dirty(newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
 			} else {
 				newpte = oldpte;
 			}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 86e3a3688d59..ff65a2c35996 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -41,7 +41,8 @@  static bool map_pte(struct page_vma_mapped_walk *pvmw)
 
 				/* Handle un-addressable ZONE_DEVICE memory */
 				entry = pte_to_swp_entry(*pvmw->pte);
-				if (!is_device_private_entry(entry))
+				if (!is_device_private_entry(entry) &&
+					!is_device_exclusive_entry(entry))
 					return false;
 			} else if (!pte_present(*pvmw->pte))
 				return false;
@@ -93,16 +94,18 @@  static bool check_pte(struct page_vma_mapped_walk *pvmw)
 			return false;
 		entry = pte_to_swp_entry(*pvmw->pte);
 
-		if (!is_migration_entry(entry))
+		if (is_migration_entry(entry))
+			pfn = migration_entry_to_pfn(entry);
+		else if (is_device_exclusive_entry(entry))
+			pfn = device_exclusive_entry_to_pfn(entry);
+		else
 			return false;
-
-		pfn = migration_entry_to_pfn(entry);
 	} else if (is_swap_pte(*pvmw->pte)) {
 		swp_entry_t entry;
 
 		/* Handle un-addressable ZONE_DEVICE memory */
 		entry = pte_to_swp_entry(*pvmw->pte);
-		if (!is_device_private_entry(entry))
+		if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry))
 			return false;
 
 		pfn = device_private_entry_to_pfn(entry);
@@ -216,6 +219,7 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	}
 	if (!map_pte(pvmw))
 		goto next_pte;
+
 	while (1) {
 		if (check_pte(pvmw))
 			return true;
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..e4d03f7a8c22 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1395,7 +1395,7 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
 		return true;
 
-	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION | TTU_EXCLUSIVE)) &&
 	    is_zone_device_page(page) && !is_device_private_page(page))
 		return true;
 
@@ -1591,6 +1591,33 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			/* We have to invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,
 						      address + PAGE_SIZE);
+		} else if (flags & TTU_EXCLUSIVE) {
+			swp_entry_t entry;
+			pte_t swp_pte;
+
+			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+				set_pte_at(mm, address, pvmw.pte, pteval);
+				ret = false;
+				page_vma_mapped_walk_done(&pvmw);
+				break;
+			}
+
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			entry = make_device_exclusive_entry(subpage,
+					pte_write(pteval));
+			swp_pte = swp_entry_to_pte(entry);
+			if (pte_soft_dirty(pteval))
+				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+			/* Take a reference for the swap entry */
+			get_page(page);
+			set_pte_at(mm, address, pvmw.pte, swp_pte);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
 				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
 			swp_entry_t entry;