diff mbox series

[1/3] KVM: Support sharing gpc locks

Message ID 20230127044500.680329-2-stevensd@google.com (mailing list archive)
State New, archived
Series KVM: x86: replace kvm_vcpu_map usage in vmx

Commit Message

David Stevens Jan. 27, 2023, 4:44 a.m. UTC
From: David Stevens <stevensd@chromium.org>

Support initializing a gfn_to_pfn_cache with an external lock instead of
its embedded lock. This allows groups of gpcs that are accessed together
to share a lock, which can greatly simplify locking.
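
For illustration, a minimal sketch of the intended usage, assuming a caller
that groups two caches behind one rwlock (the struct and function names below
are made up; only kvm_gpc_init_with_lock() and the gpc->lock pointer come
from this series):

        struct shared_region {
                rwlock_t lock;
                struct gfn_to_pfn_cache gpc1;
                struct gfn_to_pfn_cache gpc2;
        };

        static void shared_region_init(struct shared_region *r, struct kvm_vcpu *vcpu)
        {
                rwlock_init(&r->lock);
                /* Both caches point at the same rwlock instead of their
                 * embedded _lock, so a single read_lock_irqsave(r->gpc1.lock,
                 * flags) protects accesses through either cache. */
                kvm_gpc_init_with_lock(&r->gpc1, vcpu->kvm, vcpu,
                                       KVM_GUEST_AND_HOST_USE_PFN, &r->lock);
                kvm_gpc_init_with_lock(&r->gpc2, vcpu->kvm, vcpu,
                                       KVM_GUEST_AND_HOST_USE_PFN, &r->lock);
        }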

Signed-off-by: David Stevens <stevensd@chromium.org>
---
 arch/x86/kvm/x86.c        |  8 +++---
 arch/x86/kvm/xen.c        | 58 +++++++++++++++++++--------------------
 include/linux/kvm_host.h  | 12 ++++++++
 include/linux/kvm_types.h |  3 +-
 virt/kvm/pfncache.c       | 37 +++++++++++++++----------
 5 files changed, 70 insertions(+), 48 deletions(-)

Comments

kernel test robot Jan. 28, 2023, 9:42 p.m. UTC | #1
Hi David,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kvm/queue]
[also build test ERROR on linus/master v6.2-rc5 next-20230127]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20230127044500.680329-2-stevensd%40google.com
patch subject: [PATCH 1/3] KVM: Support sharing gpc locks
config: i386-randconfig-a003 (https://download.01.org/0day-ci/archive/20230129/202301290541.8nmbLtFC-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/14d62b317199184bb71256cc5f652b04fb2f9107
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
        git checkout 14d62b317199184bb71256cc5f652b04fb2f9107
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 olddefconfig
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash arch/x86/ drivers/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/x86/kvm/xen.c: In function 'kvm_xen_update_runstate_guest':
>> arch/x86/kvm/xen.c:319:45: error: 'gpc1->lock' is a pointer; did you mean to use '->'?
     319 |                 lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
         |                                             ^
         |                                             ->


vim +319 arch/x86/kvm/xen.c

   172	
   173	static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
   174	{
   175		struct kvm_vcpu_xen *vx = &v->arch.xen;
   176		struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
   177		struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
   178		size_t user_len, user_len1, user_len2;
   179		struct vcpu_runstate_info rs;
   180		unsigned long flags;
   181		size_t times_ofs;
   182		uint8_t *update_bit = NULL;
   183		uint64_t entry_time;
   184		uint64_t *rs_times;
   185		int *rs_state;
   186	
   187		/*
   188		 * The only difference between 32-bit and 64-bit versions of the
   189		 * runstate struct is the alignment of uint64_t in 32-bit, which
   190		 * means that the 64-bit version has an additional 4 bytes of
   191		 * padding after the first field 'state'. Let's be really really
   192		 * paranoid about that, and matching it with our internal data
   193		 * structures that we memcpy into it...
   194		 */
   195		BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
   196		BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
   197		BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
   198	#ifdef CONFIG_X86_64
   199		/*
   200		 * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
   201		 * so each subsequent field is shifted by 4, and it's 4 bytes longer.
   202		 */
   203		BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
   204			     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
   205		BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
   206			     offsetof(struct compat_vcpu_runstate_info, time) + 4);
   207		BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
   208	#endif
   209		/*
   210		 * The state field is in the same place at the start of both structs,
   211		 * and is the same size (int) as vx->current_runstate.
   212		 */
   213		BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
   214			     offsetof(struct compat_vcpu_runstate_info, state));
   215		BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
   216			     sizeof(vx->current_runstate));
   217		BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
   218			     sizeof(vx->current_runstate));
   219	
   220		/*
   221		 * The state_entry_time field is 64 bits in both versions, and the
   222		 * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
   223		 * is little-endian means that it's in the last *byte* of the word.
   224		 * That detail is important later.
   225		 */
   226		BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
   227			     sizeof(uint64_t));
   228		BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
   229			     sizeof(uint64_t));
   230		BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
   231	
   232		/*
   233		 * The time array is four 64-bit quantities in both versions, matching
   234		 * the vx->runstate_times and immediately following state_entry_time.
   235		 */
   236		BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
   237			     offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
   238		BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
   239			     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
   240		BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
   241			     sizeof_field(struct compat_vcpu_runstate_info, time));
   242		BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
   243			     sizeof(vx->runstate_times));
   244	
   245		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
   246			user_len = sizeof(struct vcpu_runstate_info);
   247			times_ofs = offsetof(struct vcpu_runstate_info,
   248					     state_entry_time);
   249		} else {
   250			user_len = sizeof(struct compat_vcpu_runstate_info);
   251			times_ofs = offsetof(struct compat_vcpu_runstate_info,
   252					     state_entry_time);
   253		}
   254	
   255		/*
   256		 * There are basically no alignment constraints. The guest can set it
   257		 * up so it crosses from one page to the next, and at arbitrary byte
   258		 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
   259		 * anyway, even if the overall struct had been 64-bit aligned).
   260		 */
   261		if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
   262			user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
   263			user_len2 = user_len - user_len1;
   264		} else {
   265			user_len1 = user_len;
   266			user_len2 = 0;
   267		}
   268		BUG_ON(user_len1 + user_len2 != user_len);
   269	
   270	 retry:
   271		/*
   272		 * Attempt to obtain the GPC lock on *both* (if there are two)
   273		 * gfn_to_pfn caches that cover the region.
   274		 */
   275		if (atomic) {
   276			local_irq_save(flags);
   277			if (!read_trylock(gpc1->lock)) {
   278				local_irq_restore(flags);
   279				return;
   280			}
   281		} else {
   282			read_lock_irqsave(gpc1->lock, flags);
   283		}
   284		while (!kvm_gpc_check(gpc1, user_len1)) {
   285			read_unlock_irqrestore(gpc1->lock, flags);
   286	
   287			/* When invoked from kvm_sched_out() we cannot sleep */
   288			if (atomic)
   289				return;
   290	
   291			if (kvm_gpc_refresh(gpc1, user_len1))
   292				return;
   293	
   294			read_lock_irqsave(gpc1->lock, flags);
   295		}
   296	
   297		if (likely(!user_len2)) {
   298			/*
   299			 * Set up three pointers directly to the runstate_info
   300			 * struct in the guest (via the GPC).
   301			 *
   302			 *  • @rs_state   → state field
   303			 *  • @rs_times   → state_entry_time field.
   304			 *  • @update_bit → last byte of state_entry_time, which
   305			 *                  contains the XEN_RUNSTATE_UPDATE bit.
   306			 */
   307			rs_state = gpc1->khva;
   308			rs_times = gpc1->khva + times_ofs;
   309			if (v->kvm->arch.xen.runstate_update_flag)
   310				update_bit = ((void *)(&rs_times[1])) - 1;
   311		} else {
   312			/*
   313			 * The guest's runstate_info is split across two pages and we
   314			 * need to hold and validate both GPCs simultaneously. We can
   315			 * declare a lock ordering GPC1 > GPC2 because nothing else
   316			 * takes them more than one at a time. Set a subclass on the
   317			 * gpc1 lock to make lockdep shut up about it.
   318			 */
 > 319			lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
   320			if (atomic) {
   321				if (!read_trylock(gpc2->lock)) {
   322					read_unlock_irqrestore(gpc1->lock, flags);
   323					return;
   324				}
   325			} else {
   326				read_lock(gpc2->lock);
   327			}
   328	
   329			if (!kvm_gpc_check(gpc2, user_len2)) {
   330				read_unlock(gpc2->lock);
   331				read_unlock_irqrestore(gpc1->lock, flags);
   332	
   333				/* When invoked from kvm_sched_out() we cannot sleep */
   334				if (atomic)
   335					return;
   336	
   337				/*
   338				 * Use kvm_gpc_activate() here because if the runstate
   339				 * area was configured in 32-bit mode and only extends
   340				 * to the second page now because the guest changed to
   341				 * 64-bit mode, the second GPC won't have been set up.
   342				 */
   343				if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
   344						     user_len2))
   345					return;
   346	
   347				/*
   348				 * We dropped the lock on GPC1 so we have to go all the
   349				 * way back and revalidate that too.
   350				 */
   351				goto retry;
   352			}
   353	
   354			/*
   355			 * In this case, the runstate_info struct will be assembled on
   356			 * the kernel stack (compat or not as appropriate) and will
   357			 * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
   358			 * rs pointers accordingly.
   359			 */
   360			rs_times = &rs.state_entry_time;
   361	
   362			/*
   363			 * The rs_state pointer points to the start of what we'll
   364			 * copy to the guest, which in the case of a compat guest
   365			 * is the 32-bit field that the compiler thinks is padding.
   366			 */
   367			rs_state = ((void *)rs_times) - times_ofs;
   368	
   369			/*
   370			 * The update_bit is still directly in the guest memory,
   371			 * via one GPC or the other.
   372			 */
   373			if (v->kvm->arch.xen.runstate_update_flag) {
   374				if (user_len1 >= times_ofs + sizeof(uint64_t))
   375					update_bit = gpc1->khva + times_ofs +
   376						sizeof(uint64_t) - 1;
   377				else
   378					update_bit = gpc2->khva + times_ofs +
   379						sizeof(uint64_t) - 1 - user_len1;
   380			}
   381
kernel test robot Jan. 29, 2023, 12:46 a.m. UTC | #2
Hi David,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kvm/queue]
[also build test ERROR on linus/master v6.2-rc5 next-20230127]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20230127044500.680329-2-stevensd%40google.com
patch subject: [PATCH 1/3] KVM: Support sharing gpc locks
config: x86_64-randconfig-a014 (https://download.01.org/0day-ci/archive/20230129/202301290827.aRAvXuk9-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/14d62b317199184bb71256cc5f652b04fb2f9107
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review David-Stevens/KVM-Support-sharing-gpc-locks/20230128-161425
        git checkout 14d62b317199184bb71256cc5f652b04fb2f9107
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> arch/x86/kvm/xen.c:319:31: error: member reference type 'rwlock_t *' is a pointer; did you mean to use '->'?
                   lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
                                     ~~~~~~~~~~^
                                               ->
>> arch/x86/kvm/xen.c:319:21: error: passing 'struct lockdep_map' to parameter of incompatible type 'struct lockdep_map *'; take the address with &
                   lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
                                     ^~~~~~~~~~~~~~~~~~
                                     &
   include/linux/lockdep.h:296:58: note: passing argument to parameter 'lock' here
   static inline void lock_set_subclass(struct lockdep_map *lock,
                                                            ^
   2 errors generated.


David Woodhouse Jan. 29, 2023, 10:08 a.m. UTC | #3
(Resending as the corporate email system has taken to adding redundant
information in transit as an HTML part — to a message that didn't need
it, which already had it in the Organization: header anyway, and was
previously plain text only.)

On Fri, 2023-01-27 at 13:44 +0900, David Stevens wrote:
> @@ -316,19 +316,19 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
>                  * takes them more than one at a time. Set a subclass on the
>                  * gpc1 lock to make lockdep shut up about it.
>                  */
> -               lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
> +               lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);

The test robot already pointed out this should be gpc1->lock->dep_map,
but if we do what I said below in the first attempt at this email, this
all goes away anyway.
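
For reference, if the subclass call were kept, the corrected line would
presumably follow the compiler's hint, i.e. dereference the lock pointer and
take the address of its dep_map:

        lock_set_subclass(&gpc1->lock->dep_map, 1, _THIS_IP_);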


>                 if (atomic) {
> -                       if (!read_trylock(&gpc2->lock)) {
> -                               read_unlock_irqrestore(&gpc1->lock, flags);
> +                       if (!read_trylock(gpc2->lock)) {
> +                               read_unlock_irqrestore(gpc1->lock, flags);
>                                 return;
>                         }

LGTM without having had time to go through it in detail. As Sean said,
we were waiting for someone (perhaps even ourselves) to do this.
Thanks.

For the runstate one above, this is crying out for gpc1 and gpc2 to
*share* a lock, and ditch the relocking and the whole comment about
being able to declare a lock ordering between the two, which is half
shown in the context cited above.
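
As a rough sketch of that direction (untested; the exact call site and the
reuse of gpc1's embedded lock are assumptions, only kvm_gpc_init_with_lock()
comes from this patch), the vCPU init path could do something like:

        kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
                     KVM_HOST_USES_PFN);
        /* Hypothetical: let the second runstate cache share the first one's
         * lock, so the runstate update path only ever takes one rwlock and
         * the lock_set_subclass() dance goes away. */
        kvm_gpc_init_with_lock(&vcpu->arch.xen.runstate2_cache, vcpu->kvm,
                               NULL, KVM_HOST_USES_PFN,
                               vcpu->arch.xen.runstate_cache.lock);

With both caches behind the same rwlock, the extra read_lock(gpc2->lock) in
the split-page path and the explicit gpc1 > gpc2 ordering comment could then
be dropped as well.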

Probably worth doing that in a separate patch on top of this; I'll take
a look at it when my feet next hit the ground on Tuesday if you haven't
already done so. It's easy to test with the xen_shinfo_test self-test.

Or with this if you want more fun:
https://lore.kernel.org/qemu-devel/20230128081113.1615111-1-dwmw2@infradead.org/

:)

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 508074e47bc0..ec0de9bc2eae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3047,14 +3047,14 @@  static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 	struct pvclock_vcpu_time_info *guest_hv_clock;
 	unsigned long flags;
 
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock_irqsave(gpc->lock, flags);
 	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock_irqrestore(gpc->lock, flags);
 
 		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(gpc->lock, flags);
 	}
 
 	guest_hv_clock = (void *)(gpc->khva + offset);
@@ -3083,7 +3083,7 @@  static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 	guest_hv_clock->version = ++vcpu->hv_clock.version;
 
 	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock_irqrestore(gpc->lock, flags);
 
 	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 }
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2681e2007e39..fa8ab23271d3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -59,12 +59,12 @@  static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 		wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
 
 		/* It could be invalid again already, so we need to check */
-		read_lock_irq(&gpc->lock);
+		read_lock_irq(gpc->lock);
 
 		if (gpc->valid)
 			break;
 
-		read_unlock_irq(&gpc->lock);
+		read_unlock_irq(gpc->lock);
 	} while (1);
 
 	/* Paranoia checks on the 32-bit struct layout */
@@ -101,7 +101,7 @@  static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 	smp_wmb();
 
 	wc->version = wc_version + 1;
-	read_unlock_irq(&gpc->lock);
+	read_unlock_irq(gpc->lock);
 
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
 
@@ -274,15 +274,15 @@  static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	 */
 	if (atomic) {
 		local_irq_save(flags);
-		if (!read_trylock(&gpc1->lock)) {
+		if (!read_trylock(gpc1->lock)) {
 			local_irq_restore(flags);
 			return;
 		}
 	} else {
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock_irqsave(gpc1->lock, flags);
 	}
 	while (!kvm_gpc_check(gpc1, user_len1)) {
-		read_unlock_irqrestore(&gpc1->lock, flags);
+		read_unlock_irqrestore(gpc1->lock, flags);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (atomic)
@@ -291,7 +291,7 @@  static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		if (kvm_gpc_refresh(gpc1, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock_irqsave(gpc1->lock, flags);
 	}
 
 	if (likely(!user_len2)) {
@@ -316,19 +316,19 @@  static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		 * takes them more than one at a time. Set a subclass on the
 		 * gpc1 lock to make lockdep shut up about it.
 		 */
-		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+		lock_set_subclass(gpc1->lock.dep_map, 1, _THIS_IP_);
 		if (atomic) {
-			if (!read_trylock(&gpc2->lock)) {
-				read_unlock_irqrestore(&gpc1->lock, flags);
+			if (!read_trylock(gpc2->lock)) {
+				read_unlock_irqrestore(gpc1->lock, flags);
 				return;
 			}
 		} else {
-			read_lock(&gpc2->lock);
+			read_lock(gpc2->lock);
 		}
 
 		if (!kvm_gpc_check(gpc2, user_len2)) {
-			read_unlock(&gpc2->lock);
-			read_unlock_irqrestore(&gpc1->lock, flags);
+			read_unlock(gpc2->lock);
+			read_unlock_irqrestore(gpc1->lock, flags);
 
 			/* When invoked from kvm_sched_out() we cannot sleep */
 			if (atomic)
@@ -428,9 +428,9 @@  static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	}
 
 	if (user_len2)
-		read_unlock(&gpc2->lock);
+		read_unlock(gpc2->lock);
 
-	read_unlock_irqrestore(&gpc1->lock, flags);
+	read_unlock_irqrestore(gpc1->lock, flags);
 
 	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
 	if (user_len2)
@@ -505,14 +505,14 @@  void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 	 * does anyway. Page it in and retry the instruction. We're just a
 	 * little more honest about it.
 	 */
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock_irqsave(gpc->lock, flags);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock_irqrestore(gpc->lock, flags);
 
 		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(gpc->lock, flags);
 	}
 
 	/* Now gpc->khva is a valid kernel address for the vcpu_info */
@@ -540,7 +540,7 @@  void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 			     : "0" (evtchn_pending_sel32));
 		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
 	}
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock_irqrestore(gpc->lock, flags);
 
 	/* For the per-vCPU lapic vector, deliver it as MSI. */
 	if (v->arch.xen.upcall_vector)
@@ -568,9 +568,9 @@  int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	BUILD_BUG_ON(sizeof(rc) !=
 		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock_irqsave(gpc->lock, flags);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock_irqrestore(gpc->lock, flags);
 
 		/*
 		 * This function gets called from kvm_vcpu_block() after setting the
@@ -590,11 +590,11 @@  int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 			 */
 			return 0;
 		}
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(gpc->lock, flags);
 	}
 
 	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock_irqrestore(gpc->lock, flags);
 	return rc;
 }
 
@@ -1172,7 +1172,7 @@  static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	int idx, i;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock_irqsave(gpc->lock, flags);
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
 		goto out_rcu;
 
@@ -1193,7 +1193,7 @@  static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	}
 
  out_rcu:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock_irqrestore(gpc->lock, flags);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return ret;
@@ -1576,7 +1576,7 @@  int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 
 	idx = srcu_read_lock(&kvm->srcu);
 
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock_irqsave(gpc->lock, flags);
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
 		goto out_rcu;
 
@@ -1607,10 +1607,10 @@  int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 	} else {
 		rc = 1; /* Delivered to the bitmap in shared_info. */
 		/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock_irqrestore(gpc->lock, flags);
 		gpc = &vcpu->arch.xen.vcpu_info_cache;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(gpc->lock, flags);
 		if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
 			/*
 			 * Could not access the vcpu_info. Set the bit in-kernel
@@ -1644,7 +1644,7 @@  int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 	}
 
  out_rcu:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock_irqrestore(gpc->lock, flags);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	if (kick_vcpu) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 109b18e2789c..7d1f9c6561e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1279,6 +1279,18 @@  void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
 		  struct kvm_vcpu *vcpu, enum pfn_cache_usage usage);
 
+/**
+ * kvm_gpc_init_with_lock - initialize gfn_to_pfn_cache with an external lock.
+ *
+ * @lock: an initialized rwlock
+ *
+ * See kvm_gpc_init. Allows multiple gfn_to_pfn_cache structs to share the
+ * same lock.
+ */
+void kvm_gpc_init_with_lock(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+			    struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+			    rwlock_t *lock);
+
 /**
  * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
  *                    physical address.
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 76de36e56cdf..b6432c8cc19c 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -70,7 +70,8 @@  struct gfn_to_pfn_cache {
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	struct list_head list;
-	rwlock_t lock;
+	rwlock_t *lock;
+	rwlock_t _lock;
 	struct mutex refresh_lock;
 	void *khva;
 	kvm_pfn_t pfn;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 2d6aba677830..2c6a2edaca9f 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -31,7 +31,7 @@  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 
 	spin_lock(&kvm->gpc_lock);
 	list_for_each_entry(gpc, &kvm->gpc_list, list) {
-		write_lock_irq(&gpc->lock);
+		write_lock_irq(gpc->lock);
 
 		/* Only a single page so no need to care about length */
 		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
@@ -50,7 +50,7 @@  void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 				__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
 			}
 		}
-		write_unlock_irq(&gpc->lock);
+		write_unlock_irq(gpc->lock);
 	}
 	spin_unlock(&kvm->gpc_lock);
 
@@ -147,7 +147,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
-	lockdep_assert_held_write(&gpc->lock);
+	lockdep_assert_held_write(gpc->lock);
 
 	/*
 	 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
@@ -160,7 +160,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		mmu_seq = gpc->kvm->mmu_invalidate_seq;
 		smp_rmb();
 
-		write_unlock_irq(&gpc->lock);
+		write_unlock_irq(gpc->lock);
 
 		/*
 		 * If the previous iteration "failed" due to an mmu_notifier
@@ -208,7 +208,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			}
 		}
 
-		write_lock_irq(&gpc->lock);
+		write_lock_irq(gpc->lock);
 
 		/*
 		 * Other tasks must wait for _this_ refresh to complete before
@@ -231,7 +231,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	return 0;
 
 out_error:
-	write_lock_irq(&gpc->lock);
+	write_lock_irq(gpc->lock);
 
 	return -EFAULT;
 }
@@ -261,7 +261,7 @@  static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
 	 */
 	mutex_lock(&gpc->refresh_lock);
 
-	write_lock_irq(&gpc->lock);
+	write_lock_irq(gpc->lock);
 
 	if (!gpc->active) {
 		ret = -EINVAL;
@@ -321,7 +321,7 @@  static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
 	unmap_old = (old_pfn != gpc->pfn);
 
 out_unlock:
-	write_unlock_irq(&gpc->lock);
+	write_unlock_irq(gpc->lock);
 
 	mutex_unlock(&gpc->refresh_lock);
 
@@ -339,20 +339,29 @@  EXPORT_SYMBOL_GPL(kvm_gpc_refresh);
 
 void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
 		  struct kvm_vcpu *vcpu, enum pfn_cache_usage usage)
+{
+	rwlock_init(&gpc->_lock);
+	kvm_gpc_init_with_lock(gpc, kvm, vcpu, usage, &gpc->_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_gpc_init);
+
+void kvm_gpc_init_with_lock(struct gfn_to_pfn_cache *gpc, struct kvm *kvm,
+			    struct kvm_vcpu *vcpu, enum pfn_cache_usage usage,
+			    rwlock_t *lock)
 {
 	WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage);
 	WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu);
 
-	rwlock_init(&gpc->lock);
 	mutex_init(&gpc->refresh_lock);
 
 	gpc->kvm = kvm;
 	gpc->vcpu = vcpu;
+	gpc->lock = lock;
 	gpc->usage = usage;
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 	gpc->uhva = KVM_HVA_ERR_BAD;
 }
-EXPORT_SYMBOL_GPL(kvm_gpc_init);
+EXPORT_SYMBOL_GPL(kvm_gpc_init_with_lock);
 
 int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
 {
@@ -371,9 +380,9 @@  int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
 		 * refresh must not establish a mapping until the cache is
 		 * reachable by mmu_notifier events.
 		 */
-		write_lock_irq(&gpc->lock);
+		write_lock_irq(gpc->lock);
 		gpc->active = true;
-		write_unlock_irq(&gpc->lock);
+		write_unlock_irq(gpc->lock);
 	}
 	return __kvm_gpc_refresh(gpc, gpa, len);
 }
@@ -391,7 +400,7 @@  void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 		 * must stall mmu_notifier events until all users go away, i.e.
 		 * until gpc->lock is dropped and refresh is guaranteed to fail.
 		 */
-		write_lock_irq(&gpc->lock);
+		write_lock_irq(gpc->lock);
 		gpc->active = false;
 		gpc->valid = false;
 
@@ -406,7 +415,7 @@  void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 
 		old_pfn = gpc->pfn;
 		gpc->pfn = KVM_PFN_ERR_FAULT;
-		write_unlock_irq(&gpc->lock);
+		write_unlock_irq(gpc->lock);
 
 		spin_lock(&kvm->gpc_lock);
 		list_del(&gpc->list);