
[v3,7/8] KVM: x86/mmu: Protect rmaps independently with SRCU

Message ID 20210506184241.618958-8-bgardon@google.com (mailing list archive)
State New, archived
Series Lazily allocate memslot rmaps

Commit Message

Ben Gardon May 6, 2021, 6:42 p.m. UTC
In preparation for lazily allocating the rmaps when the TDP MMU is in
use, protect the rmaps with SRCU. Unfortunately, this requires
propagating a pointer to struct kvm around to several functions.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 57 +++++++++++++++++++++++++-----------------
 arch/x86/kvm/x86.c     |  6 ++---
 2 files changed, 37 insertions(+), 26 deletions(-)

Comments

kernel test robot May 6, 2021, 11:58 p.m. UTC | #1
Hi Ben,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kvm/queue]
[also build test WARNING on tip/master linus/master next-20210506]
[cannot apply to linux/master v5.12]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting the patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ben-Gardon/Lazily-allocate-memslot-rmaps/20210507-024533
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
config: x86_64-randconfig-s021-20210506 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-341-g8af24329-dirty
        # https://github.com/0day-ci/linux/commit/dd56af97c1d2b22f9acd46d33c83ac5e7bf67acc
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ben-Gardon/Lazily-allocate-memslot-rmaps/20210507-024533
        git checkout dd56af97c1d2b22f9acd46d33c83ac5e7bf67acc
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
>> arch/x86/kvm/mmu/mmu.c:938:16: sparse: sparse: incompatible types in comparison expression (different address spaces):
>> arch/x86/kvm/mmu/mmu.c:938:16: sparse:    struct kvm_rmap_head [noderef] __rcu *
>> arch/x86/kvm/mmu/mmu.c:938:16: sparse:    struct kvm_rmap_head *
   arch/x86/kvm/mmu/mmu.c: note: in included file (through include/linux/kvm_host.h, arch/x86/kvm/irq.h):
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
   arch/x86/include/asm/kvm_host.h:1871:36: sparse: sparse: marked inline, but without a definition
--
   arch/x86/kvm/x86.c:10926:17: sparse: sparse: incompatible types in comparison expression (different address spaces):
>> arch/x86/kvm/x86.c:10926:17: sparse:    struct kvm_rmap_head [noderef] __rcu *
>> arch/x86/kvm/x86.c:10926:17: sparse:    struct kvm_rmap_head *
   arch/x86/kvm/x86.c: note: in included file (through include/linux/notifier.h, include/linux/memory_hotplug.h, include/linux/mmzone.h, ...):
   include/linux/srcu.h:182:9: sparse: sparse: context imbalance in 'vcpu_enter_guest' - unexpected unlock

vim +938 arch/x86/kvm/mmu/mmu.c

   929	
   930	static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
   931						   int level,
   932						   struct kvm_memory_slot *slot)
   933	{
   934		struct kvm_rmap_head *head;
   935		unsigned long idx;
   936	
   937		idx = gfn_to_index(gfn, slot->base_gfn, level);
 > 938		head = srcu_dereference_check(slot->arch.rmap[level - PG_LEVEL_4K],
   939					      &kvm->srcu,
   940					      lockdep_is_held(&kvm->slots_arch_lock));
   941		return &head[idx];
   942	}
   943	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot May 7, 2021, 12:56 a.m. UTC | #2
Hi Ben,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kvm/queue]
[also build test ERROR on tip/master linus/master next-20210506]
[cannot apply to linux/master v5.12]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting the patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ben-Gardon/Lazily-allocate-memslot-rmaps/20210507-024533
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
config: i386-randconfig-r032-20210506 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/dd56af97c1d2b22f9acd46d33c83ac5e7bf67acc
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ben-Gardon/Lazily-allocate-memslot-rmaps/20210507-024533
        git checkout dd56af97c1d2b22f9acd46d33c83ac5e7bf67acc
        # save the attached .config to linux build tree
        make W=1 ARCH=i386

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from arch/x86/kvm/mmu/mmu.c:1821:
   arch/x86/kvm/mmu/mmu_audit.c: In function 'inspect_spte_has_rmap':
>> arch/x86/kvm/mmu/mmu_audit.c:150:28: warning: passing argument 1 of '__gfn_to_rmap' makes pointer from integer without a cast [-Wint-conversion]
     150 |  rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
         |                            ^~~
         |                            |
         |                            gfn_t {aka long long unsigned int}
   arch/x86/kvm/mmu/mmu.c:930:56: note: expected 'struct kvm *' but argument is of type 'gfn_t' {aka 'long long unsigned int'}
     930 | static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
         |                                            ~~~~~~~~~~~~^~~
   In file included from arch/x86/kvm/mmu/mmu.c:1821:
>> arch/x86/kvm/mmu/mmu_audit.c:150:53: warning: passing argument 3 of '__gfn_to_rmap' makes integer from pointer without a cast [-Wint-conversion]
     150 |  rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
         |                                                     ^~~~
         |                                                     |
         |                                                     struct kvm_memory_slot *
   arch/x86/kvm/mmu/mmu.c:931:13: note: expected 'int' but argument is of type 'struct kvm_memory_slot *'
     931 |         int level,
         |         ~~~~^~~~~
   In file included from arch/x86/kvm/mmu/mmu.c:1821:
>> arch/x86/kvm/mmu/mmu_audit.c:150:14: error: too few arguments to function '__gfn_to_rmap'
     150 |  rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
         |              ^~~~~~~~~~~~~
   arch/x86/kvm/mmu/mmu.c:930:30: note: declared here
     930 | static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
         |                              ^~~~~~~~~~~~~
   In file included from arch/x86/kvm/mmu/mmu.c:1821:
   arch/x86/kvm/mmu/mmu_audit.c: In function 'audit_write_protection':
   arch/x86/kvm/mmu/mmu_audit.c:203:30: warning: passing argument 1 of '__gfn_to_rmap' makes pointer from integer without a cast [-Wint-conversion]
     203 |  rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
         |                            ~~^~~~~
         |                              |
         |                              gfn_t {aka long long unsigned int}
   arch/x86/kvm/mmu/mmu.c:930:56: note: expected 'struct kvm *' but argument is of type 'gfn_t' {aka 'long long unsigned int'}
     930 | static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
         |                                            ~~~~~~~~~~~~^~~
   In file included from arch/x86/kvm/mmu/mmu.c:1821:
   arch/x86/kvm/mmu/mmu_audit.c:203:50: warning: passing argument 3 of '__gfn_to_rmap' makes integer from pointer without a cast [-Wint-conversion]
     203 |  rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
         |                                                  ^~~~
         |                                                  |
         |                                                  struct kvm_memory_slot *
   arch/x86/kvm/mmu/mmu.c:931:13: note: expected 'int' but argument is of type 'struct kvm_memory_slot *'
     931 |         int level,
         |         ~~~~^~~~~
   In file included from arch/x86/kvm/mmu/mmu.c:1821:
   arch/x86/kvm/mmu/mmu_audit.c:203:14: error: too few arguments to function '__gfn_to_rmap'
     203 |  rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
         |              ^~~~~~~~~~~~~
   arch/x86/kvm/mmu/mmu.c:930:30: note: declared here
     930 | static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
         |                              ^~~~~~~~~~~~~


vim +/__gfn_to_rmap +150 arch/x86/kvm/mmu/mmu_audit.c

2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  125  
eb2591865a234c arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  126  static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  127  {
bd80158aff71a8 arch/x86/kvm/mmu_audit.c     Jan Kiszka          2011-09-12  128  	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
018aabb56d6109 arch/x86/kvm/mmu_audit.c     Takuya Yoshikawa    2015-11-20  129  	struct kvm_rmap_head *rmap_head;
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  130  	struct kvm_mmu_page *rev_sp;
699023e239658e arch/x86/kvm/mmu_audit.c     Paolo Bonzini       2015-05-18  131  	struct kvm_memslots *slots;
699023e239658e arch/x86/kvm/mmu_audit.c     Paolo Bonzini       2015-05-18  132  	struct kvm_memory_slot *slot;
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  133  	gfn_t gfn;
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  134  
573546820b792e arch/x86/kvm/mmu/mmu_audit.c Sean Christopherson 2020-06-22  135  	rev_sp = sptep_to_sp(sptep);
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  136  	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  137  
699023e239658e arch/x86/kvm/mmu_audit.c     Paolo Bonzini       2015-05-18  138  	slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
699023e239658e arch/x86/kvm/mmu_audit.c     Paolo Bonzini       2015-05-18  139  	slot = __gfn_to_memslot(slots, gfn);
699023e239658e arch/x86/kvm/mmu_audit.c     Paolo Bonzini       2015-05-18  140  	if (!slot) {
bd80158aff71a8 arch/x86/kvm/mmu_audit.c     Jan Kiszka          2011-09-12  141  		if (!__ratelimit(&ratelimit_state))
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  142  			return;
b034cf0105235e arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-12-23  143  		audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
b034cf0105235e arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-12-23  144  		audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
38904e128778c3 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-09-27  145  		       (long int)(sptep - rev_sp->spt), rev_sp->gfn);
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  146  		dump_stack();
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  147  		return;
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  148  	}
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  149  
018aabb56d6109 arch/x86/kvm/mmu_audit.c     Takuya Yoshikawa    2015-11-20 @150  	rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
018aabb56d6109 arch/x86/kvm/mmu_audit.c     Takuya Yoshikawa    2015-11-20  151  	if (!rmap_head->val) {
bd80158aff71a8 arch/x86/kvm/mmu_audit.c     Jan Kiszka          2011-09-12  152  		if (!__ratelimit(&ratelimit_state))
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  153  			return;
b034cf0105235e arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-12-23  154  		audit_printk(kvm, "no rmap for writable spte %llx\n",
b034cf0105235e arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-12-23  155  			     *sptep);
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  156  		dump_stack();
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  157  	}
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  158  }
2f4f337248cd56 arch/x86/kvm/mmu_audit.c     Xiao Guangrong      2010-08-30  159  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Paolo Bonzini May 7, 2021, 8:42 a.m. UTC | #3
On 06/05/21 20:42, Ben Gardon wrote:
> In preparation for lazily allocating the rmaps when the TDP MMU is in
> use, protect the rmaps with SRCU. Unfortunately, this requires
> propagating a pointer to struct kvm around to several functions.

Thinking more about it, this is not needed because all reads of the rmap
array are guarded by the load-acquire of kvm->arch.memslots_have_rmaps.
That is, the pattern is always

	if (!load-acquire(memslots_have_rmaps))
		return;
	... = __gfn_to_rmap(...)

				slots->arch.rmap[x] = ...
				store-release(memslots_have_rmaps, true)

where the load-acquire/store-release have the same role that 
srcu_dereference/rcu_assign_pointer had before this patch.

We also know that any read that misses the check has the potential for a 
NULL pointer dereference, so it *has* to be like that.
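
A minimal sketch of that acquire/release pairing (the flag is the one
added by this series; the helpers here are hypothetical):

/* Reader side: only walk the rmaps once the flag is observed as set. */
static struct kvm_rmap_head *rmap_get(struct kvm *kvm, gfn_t gfn, int level,
				      struct kvm_memory_slot *slot)
{
	/* Pairs with the smp_store_release() in rmap_publish(). */
	if (!smp_load_acquire(&kvm->arch.memslots_have_rmaps))
		return NULL;

	return __gfn_to_rmap(kvm, gfn, level, slot);
}

/* Writer side: publish the flag only after every rmap array exists. */
static void rmap_publish(struct kvm *kvm)
{
	/* ... allocate slot->arch.rmap[i] for all slots and levels ... */

	/* Pairs with the smp_load_acquire() in rmap_get(). */
	smp_store_release(&kvm->arch.memslots_have_rmaps, true);
}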

That said, srcu_dereference has zero cost unless debugging options are 
enabled, and it *is* true that the rmap can disappear if kvm->srcu is 
not held, so I lean towards keeping this change and just changing the 
commit message like this:

---------
Currently, rmaps are always allocated and published together with a new 
memslot, so the srcu_dereference for the memslots array already ensures 
that the memory pointed to by slots->arch.rmap is zero at the time
slots->arch.rmap is dereferenced.  However, they still need to be
accessed in an SRCU
read-side critical section, as the whole memslot can be deleted outside 
SRCU.
--------

Thanks,

Paolo

> 
> Suggested-by: Paolo Bonzini<pbonzini@redhat.com>
> Signed-off-by: Ben Gardon<bgardon@google.com>
> ---
>   arch/x86/kvm/mmu/mmu.c | 57 +++++++++++++++++++++++++-----------------
>   arch/x86/kvm/x86.c     |  6 ++---
>   2 files changed, 37 insertions(+), 26 deletions(-)
Sean Christopherson May 10, 2021, 5:45 p.m. UTC | #4
On Fri, May 07, 2021, Paolo Bonzini wrote:
> On 06/05/21 20:42, Ben Gardon wrote:
> > In preparation for lazily allocating the rmaps when the TDP MMU is in
> > use, protect the rmaps with SRCU. Unfortunately, this requires
> > propagating a pointer to struct kvm around to several functions.
> 
> Thinking more about it, this is not needed because all reads of the rmap
> array are guarded by the load-acquire of kvm->arch.memslots_have_rmaps.
> That is, the pattern is always
> 
> 	if (!load-acquire(memslots_have_rmaps))
> 		return;
> 	... = __gfn_to_rmap(...)
> 
> 				slots->arch.rmap[x] = ...
> 				store-release(memslots_have_rmaps, true)
> 
> where the load-acquire/store-release have the same role that
> srcu_dereference/rcu_assign_pointer had before this patch.
> 
> We also know that any read that misses the check has the potential for a
> NULL pointer dereference, so it *has* to be like that.
> 
> That said, srcu_dereference has zero cost unless debugging options are
> enabled, and it *is* true that the rmap can disappear if kvm->srcu is not
> held, so I lean towards keeping this change and just changing the commit
> message like this:
> 
> ---------
> Currently, rmaps are always allocated and published together with a new
> memslot, so the srcu_dereference for the memslots array already ensures that
> the memory pointed to by slots->arch.rmap is zero at the time
> slots->arch.rmap is dereferenced.  However, they still need to be accessed in an SRCU
> read-side critical section, as the whole memslot can be deleted outside
> SRCU.
> --------

I disagree, sprinkling random and unnecessary __rcu/SRCU annotations does more
harm than good.  Adding the unnecessary tag could be quite misleading as it
would imply the rmap pointers can _change_ independently of the memslots.

Similarly, adding rcu_assign_pointer() in alloc_memslot_rmap() implies that it's
safe to access the rmap after its pointer is assigned, and that's simply not
true since an rmap array can be freed if rmap allocation for a different memslot
fails.  Accessing the rmap is safe if and only if all rmaps are allocated, i.e.
if arch.memslots_have_rmaps is true, as you pointed out.

Furthermore, to actually gain any protection from SRCU, there would have to be
a synchronize_srcu() call after assigning the pointers, and that _does_ have an
associated.  Not to mention that to truly respect the __rcu annotation, deleting
the rmaps would also have to be done "independently" with the correct
rcu_assign_pointer() and synchronize_srcu() logic.
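
Concretely, the deletion half alone would look something like this
sketch (hypothetical helper, not proposed code):

static void rmap_free_independently(struct kvm *kvm,
				    struct kvm_memory_slot *slot, int i)
{
	struct kvm_rmap_head *rmap;

	rmap = rcu_dereference_protected(slot->arch.rmap[i],
				lockdep_is_held(&kvm->slots_arch_lock));
	rcu_assign_pointer(slot->arch.rmap[i], NULL);

	/* Readers may still hold the old pointer until a grace period. */
	synchronize_srcu(&kvm->srcu);
	kvfree(rmap);
}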
Paolo Bonzini May 10, 2021, 5:53 p.m. UTC | #5
On 10/05/21 19:45, Sean Christopherson wrote:
>>
>> ---------
>> Currently, rmaps are always allocated and published together with a new
>> memslot, so the srcu_dereference for the memslots array already ensures that
>> the memory pointed to by slots->arch.rmap is zero at the time
>> slots->arch.rmap is dereferenced.  However, they still need to be accessed in an SRCU
>> read-side critical section, as the whole memslot can be deleted outside
>> SRCU.
>> --------
> I disagree, sprinkling random and unnecessary __rcu/SRCU annotations does more
> harm than good.  Adding the unnecessary tag could be quite misleading as it
> would imply the rmap pointers can _change_ independently of the memslots.
> 
> Similarly, adding rcu_assign_pointer() in alloc_memslot_rmap() implies that it's
> safe to access the rmap after its pointer is assigned, and that's simply not
> true since an rmap array can be freed if rmap allocation for a different memslot
> fails.  Accessing the rmap is safe if and only if all rmaps are allocated, i.e.
> if arch.memslots_have_rmaps is true, as you pointed out.

This point about freeing is a very good one.

> Furthermore, to actually gain any protection from SRCU, there would have to be
> a synchronize_srcu() call after assigning the pointers, and that _does_ have an
> associated.

... but this is incorrect (I was almost going to point out the below in 
my reply to Ben, then decided I was pointing out the obvious; lesson 
learned).

synchronize_srcu() is only needed after *deleting* something, which in 
this case is done as part of deleting the memslots---it's perfectly fine 
to batch multiple synchronize_*() calls given how expensive some of them 
are.
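
As a sketch of that batching (heavily simplified from the real memslot
update path; the free helper is hypothetical):

static void delete_memslot_and_rmaps(struct kvm *kvm,
				     struct kvm_memslots *new_slots,
				     struct kvm_memory_slot *old_slot)
{
	/* Swap in a memslots array that no longer contains the slot. */
	rcu_assign_pointer(kvm->memslots[0], new_slots);

	/* One grace period covers the slot and its rmaps alike. */
	synchronize_srcu(&kvm->srcu);

	/* No reader can reach the slot or its rmaps anymore. */
	kvm_free_memslot_rmaps(old_slot);	/* hypothetical helper */
	kfree(old_slot);
}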

(BTW an associated what?)

So they still count as RCU-protected in my opinion, just because reading 
them outside SRCU is a big no and ought to warn (it's unlikely that it 
happens with rmaps, but then we just had 2-3 bugs like this being 
reported in a short time for memslots so never say never).  However, 
rcu_assign_pointer is not needed because the visibility of the rmaps is 
further protected by the have-rmaps flag (to be accessed with 
load-acquire/store-release) and not just by the pointer being there and 
non-NULL.

Paolo

> Not to mention that to truly respect the __rcu annotation, deleting
> the rmaps would also have to be done "independently" with the correct
> rcu_assign_pointer() and synchronize_srcu() logic.
>
Sean Christopherson May 10, 2021, 6:28 p.m. UTC | #6
On Mon, May 10, 2021, Paolo Bonzini wrote:
> On 10/05/21 19:45, Sean Christopherson wrote:
> > > 
> > > ---------
> > > Currently, rmaps are always allocated and published together with a new
> > > memslot, so the srcu_dereference for the memslots array already ensures that
> > > the memory pointed to by slots->arch.rmap is zero at the time
> > > slots->arch.rmap is dereferenced.  However, they still need to be accessed in an SRCU
> > > read-side critical section, as the whole memslot can be deleted outside
> > > SRCU.
> > > --------
> > I disagree, sprinkling random and unnecessary __rcu/SRCU annotations does more
> > harm than good.  Adding the unnecessary tag could be quite misleading as it
> > would imply the rmap pointers can _change_ independently of the memslots.
> > 
> > Similarly, adding rcu_assign_pointer() in alloc_memslot_rmap() implies that it's
> > safe to access the rmap after its pointer is assigned, and that's simply not
> > true since an rmap array can be freed if rmap allocation for a different memslot
> > fails.  Accessing the rmap is safe if and only if all rmaps are allocated, i.e.
> > if arch.memslots_have_rmaps is true, as you pointed out.
> 
> This point about freeing is a very good one.
> 
> > Furthermore, to actually gain any protection from SRCU, there would have to be
> > a synchronize_srcu() call after assigning the pointers, and that _does_ have an
> > associated.
> 
> ... but this is incorrect (I was almost going to point out the below in my
> reply to Ben, then decided I was pointing out the obvious; lesson learned).
> 
> synchronize_srcu() is only needed after *deleting* something, which in this

No, synchronization is required any time the writer needs to ensure readers have
recognized the change.  E.g. making a memslot RO, moving a memslot's gfn base,
adding an MSR to the filter list.  I suppose you could frame any modification as
"deleting" something, but IMO that's cheating :-)

> case is done as part of deleting the memslots---it's perfectly fine to batch
> multiple synchronize_*() calls given how expensive some of them are.

Yes, but the shortlog says "Protect rmaps _independently_ with SRCU", emphasis
mine.  If the rmaps are truly protected independently, then they need to have
their own synchronization.  Setting all rmaps could be batched under a single
synchronize_srcu(), but IMO batching the rmaps with the memslot itself would be
in direct contradiction with the shortlog.

> (BTW an associated what?)

Doh.  "associated memslot."

> So they still count as RCU-protected in my opinion, just because reading
> them outside SRCU is a big no and ought to warn (it's unlikely that it
> happens with rmaps, but then we just had 2-3 bugs like this being reported
> in a short time for memslots so never say never).

Yes, but that interpretation holds true for literally everything that is hidden
behind an SRCU-protected pointer.  E.g. this would also be wrong, it's just much
more obviously broken:

bool kvm_is_gfn_writable(struct kvm* kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	srcu_read_unlock(&kvm->srcu);

	return slot && !(slot->flags & KVM_MEMSLOT_INVALID) &&
	       !(slot->flags & KVM_MEM_READONLY);
}
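
(For illustration, a corrected sketch: the bug is that "slot" is
dereferenced after srcu_read_unlock(), so the flag checks must move
inside the read-side critical section.)

bool kvm_is_gfn_writable(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	bool writable;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);

	/* The slot can be freed once the SRCU read lock is dropped. */
	writable = slot && !(slot->flags & KVM_MEMSLOT_INVALID) &&
		   !(slot->flags & KVM_MEM_READONLY);
	srcu_read_unlock(&kvm->srcu);

	return writable;
}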


> However, rcu_assign_pointer is not needed because the visibility of the rmaps
> is further protected by the have-rmaps flag (to be accessed with
> load-acquire/store-release) and not just by the pointer being there and
> non-NULL.

Yes, and I'm arguing that annotating the rmaps as __rcu is wrong because they
themselves are not protected by SRCU.  The memslot that contains the rmaps is
protected by SRCU, and because of that asserting SRCU is held for read will hold
true.  But, if the memslot code were changed to use a different protection scheme,
e.g. a rwlock for argument's sake, then the SRCU assertion would fail even though
the rmap logic itself didn't change.
Ben Gardon May 11, 2021, 4:22 p.m. UTC | #7
On Mon, May 10, 2021 at 11:28 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, May 10, 2021, Paolo Bonzini wrote:
> > On 10/05/21 19:45, Sean Christopherson wrote:
> > > >
> > > > ---------
> > > > Currently, rmaps are always allocated and published together with a new
> > > > memslot, so the srcu_dereference for the memslots array already ensures that
> > > > the memory pointed to by slots->arch.rmap is zero at the time
> > > > slots->arch.rmap is dereferenced.  However, they still need to be accessed in an SRCU
> > > > read-side critical section, as the whole memslot can be deleted outside
> > > > SRCU.
> > > > --------
> > > I disagree, sprinkling random and unnecessary __rcu/SRCU annotations does more
> > > harm than good.  Adding the unnecessary tag could be quite misleading as it
> > > would imply the rmap pointers can _change_ independently of the memslots.
> > >
> > > Similarly, adding rcu_assign_pointer() in alloc_memslot_rmap() implies that it's
> > > safe to access the rmap after its pointer is assigned, and that's simply not
> > > true since an rmap array can be freed if rmap allocation for a different memslot
> > > fails.  Accessing the rmap is safe if and only if all rmaps are allocated, i.e.
> > > if arch.memslots_have_rmaps is true, as you pointed out.
> >
> > This point about freeing is a very good one.
> >
> > > Furthermore, to actually gain any protection from SRCU, there would have to be
> > > a synchronize_srcu() call after assigning the pointers, and that _does_ have an
> > > associated.
> >
> > ... but this is incorrect (I was almost going to point out the below in my
> > reply to Ben, then decided I was pointing out the obvious; lesson learned).
> >
> > synchronize_srcu() is only needed after *deleting* something, which in this
>
> No, synchronization is required any time the writer needs to ensure readers have
> recognized the change.  E.g. making a memslot RO, moving a memslot's gfn base,
> adding an MSR to the filter list.  I suppose you could frame any modification as
> "deleting" something, but IMO that's cheating :-)
>
> > case is done as part of deleting the memslots---it's perfectly fine to batch
> > multiple synchronize_*() calls given how expensive some of them are.
>
> Yes, but the shortlog says "Protect rmaps _independently_ with SRCU", emphasis
> mine.  If the rmaps are truly protected independently, then they need to have
> their own synchronization.  Setting all rmaps could be batched under a single
> synchronize_srcu(), but IMO batching the rmaps with the memslot itself would be
> in direct contradiction with the shortlog.
>
> > (BTW an associated what?)
>
> Doh.  "associated memslot."
>
> > So they still count as RCU-protected in my opinion, just because reading
> > them outside SRCU is a big no and ought to warn (it's unlikely that it
> > happens with rmaps, but then we just had 2-3 bugs like this being reported
> > in a short time for memslots so never say never).
>
> Yes, but that interpretation holds true for literally everything that is hidden
> behind an SRCU-protected pointer.  E.g. this would also be wrong, it's just much
> more obviously broken:
>
> bool kvm_is_gfn_writable(struct kvm* kvm, gfn_t gfn)
> {
>         struct kvm_memory_slot *slot;
>         int idx;
>
>         idx = srcu_read_lock(&kvm->srcu);
>         slot = gfn_to_memslot(kvm, gfn);
>         srcu_read_unlock(&kvm->srcu);
>
>         return slot && !(slot->flags & KVM_MEMSLOT_INVALID) &&
>                !(slot->flags & KVM_MEM_READONLY);
> }
>
>
> > However, rcu_assign_pointer is not needed because the visibility of the rmaps
> > is further protected by the have-rmaps flag (to be accessed with
> > load-acquire/store-release) and not just by the pointer being there and
> > non-NULL.
>
> Yes, and I'm arguing that annotating the rmaps as __rcu is wrong because they
> themselves are not protected by SRCU.  The memslot that contains the rmaps is
> protected by SRCU, and because of that asserting SRCU is held for read will hold
> true.  But, if the memslot code were changed to use a different protection scheme,
> e.g. a rwlock for argument's sake, then the SRCU assertion would fail even though
> the rmap logic itself didn't change.

I'm inclined to agree with Sean that the extra RCU annotations are
probably unnecessary since we're already doing the srcu dereference
for all the slots. I'll move all these RCU annotations to their own
patch and put it at the end of the series when I send v4.
Paolo Bonzini May 11, 2021, 4:45 p.m. UTC | #8
On 11/05/21 18:22, Ben Gardon wrote:
>> Yes, and I'm arguing that annotating the rmaps as __rcu is wrong because they
>> themselves are not protected by SRCU.  The memslot that contains the rmaps is
>> protected by SRCU, and because of that asserting SRCU is held for read will hold
>> true.  But, if the memslot code were changed to use a different protection scheme,
>> e.g. a rwlock for argument's sake, then the SRCU assertion would fail even though
>> the rmap logic itself didn't change.
>
> I'm inclined to agree with Sean that the extra RCU annotations are
> probably unnecessary since we're already doing the srcu dereference
> for all the slots. I'll move all these RCU annotations to their own
> patch and put it at the end of the series when I send v4.
> 

Fair enough, you can even remove them then.

Paolo

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 730ea84bf7e7..48067c572c02 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -927,13 +927,18 @@  static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
 	__pte_list_remove(sptep, rmap_head);
 }
 
-static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+static struct kvm_rmap_head *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+					   int level,
 					   struct kvm_memory_slot *slot)
 {
+	struct kvm_rmap_head *head;
 	unsigned long idx;
 
 	idx = gfn_to_index(gfn, slot->base_gfn, level);
-	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+	head = srcu_dereference_check(slot->arch.rmap[level - PG_LEVEL_4K],
+				      &kvm->srcu,
+				      lockdep_is_held(&kvm->slots_arch_lock));
+	return &head[idx];
 }
 
 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
@@ -944,7 +949,7 @@  static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
 
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, gfn);
-	return __gfn_to_rmap(gfn, sp->role.level, slot);
+	return __gfn_to_rmap(kvm, gfn, sp->role.level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1194,7 +1199,8 @@  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 		return;
 
 	while (mask) {
-		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+		rmap_head = __gfn_to_rmap(kvm,
+					  slot->base_gfn + gfn_offset + __ffs(mask),
 					  PG_LEVEL_4K, slot);
 		__rmap_write_protect(kvm, rmap_head, false);
 
@@ -1227,7 +1233,8 @@  static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 		return;
 
 	while (mask) {
-		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+		rmap_head = __gfn_to_rmap(kvm,
+					  slot->base_gfn + gfn_offset + __ffs(mask),
 					  PG_LEVEL_4K, slot);
 		__rmap_clear_dirty(kvm, rmap_head, slot);
 
@@ -1270,7 +1277,7 @@  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 
 	if (kvm_memslots_have_rmaps(kvm)) {
 		for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
-			rmap_head = __gfn_to_rmap(gfn, i, slot);
+			rmap_head = __gfn_to_rmap(kvm, gfn, i, slot);
 			write_protected |= __rmap_write_protect(kvm, rmap_head,
 								true);
 		}
@@ -1373,17 +1380,19 @@  struct slot_rmap_walk_iterator {
 };
 
 static void
-rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+rmap_walk_init_level(struct kvm *kvm, struct slot_rmap_walk_iterator *iterator,
+		     int level)
 {
 	iterator->level = level;
 	iterator->gfn = iterator->start_gfn;
-	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
-	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+	iterator->rmap = __gfn_to_rmap(kvm, iterator->gfn, level,
+				       iterator->slot);
+	iterator->end_rmap = __gfn_to_rmap(kvm, iterator->end_gfn, level,
 					   iterator->slot);
 }
 
 static void
-slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+slot_rmap_walk_init(struct kvm *kvm, struct slot_rmap_walk_iterator *iterator,
 		    struct kvm_memory_slot *slot, int start_level,
 		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
 {
@@ -1393,7 +1402,7 @@  slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
 	iterator->start_gfn = start_gfn;
 	iterator->end_gfn = end_gfn;
 
-	rmap_walk_init_level(iterator, iterator->start_level);
+	rmap_walk_init_level(kvm, iterator, iterator->start_level);
 }
 
 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
@@ -1401,7 +1410,8 @@  static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
 	return !!iterator->rmap;
 }
 
-static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+static void slot_rmap_walk_next(struct kvm *kvm,
+				struct slot_rmap_walk_iterator *iterator)
 {
 	if (++iterator->rmap <= iterator->end_rmap) {
 		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
@@ -1413,15 +1423,15 @@  static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 		return;
 	}
 
-	rmap_walk_init_level(iterator, iterator->level);
+	rmap_walk_init_level(kvm, iterator, iterator->level);
 }
 
-#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
-	   _start_gfn, _end_gfn, _iter_)				\
-	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
-				 _end_level_, _start_gfn, _end_gfn);	\
-	     slot_rmap_walk_okay(_iter_);				\
-	     slot_rmap_walk_next(_iter_))
+#define for_each_slot_rmap_range(_kvm_, _slot_, _start_level_, _end_level_,	\
+				 _start_gfn, _end_gfn, _iter_)			\
+	for (slot_rmap_walk_init(_kvm_, _iter_, _slot_, _start_level_,		\
+				 _end_level_, _start_gfn, _end_gfn);		\
+	     slot_rmap_walk_okay(_iter_);					\
+	     slot_rmap_walk_next(_kvm_, _iter_))
 
 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			       struct kvm_memory_slot *slot, gfn_t gfn,
@@ -1434,8 +1444,9 @@  static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
 	struct slot_rmap_walk_iterator iterator;
 	bool ret = false;
 
-	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
-				 range->start, range->end - 1, &iterator)
+	for_each_slot_rmap_range(kvm, range->slot, PG_LEVEL_4K,
+				 KVM_MAX_HUGEPAGE_LEVEL, range->start,
+				 range->end - 1, &iterator)
 		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
 			       iterator.level, range->pte);
 
@@ -5233,8 +5244,8 @@  slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 {
 	struct slot_rmap_walk_iterator iterator;
 
-	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
-			end_gfn, &iterator) {
+	for_each_slot_rmap_range(kvm, memslot, start_level, end_level,
+				 start_gfn, end_gfn, &iterator) {
 		if (iterator.rmap)
 			flush |= fn(kvm, iterator.rmap, memslot);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7a40ce342cc..1098ab73a704 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10854,9 +10854,9 @@  static int alloc_memslot_rmap(struct kvm_memory_slot *slot,
 		lpages = gfn_to_index(slot->base_gfn + npages - 1,
 				      slot->base_gfn, level) + 1;
 
-		slot->arch.rmap[i] =
-			kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
-				 GFP_KERNEL_ACCOUNT);
+		rcu_assign_pointer(slot->arch.rmap[i],
+				   kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
+					    GFP_KERNEL_ACCOUNT));
 		if (!slot->arch.rmap[i])
 			goto out_free;
 	}