Message ID | 1375189330-24066-10-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State      | New, archived
On Tue, Jul 30, 2013 at 09:02:07PM +0800, Xiao Guangrong wrote:
> The basic idea is from the nulls list, which uses a nulls value to
> indicate whether the desc has been moved to a different pte-list.
>
> Thanks to SLAB_DESTROY_BY_RCU, the desc can be quickly reused.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 36caf6a..f8fc0cc 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1010,6 +1010,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
>  		desc->sptes[0] = (u64 *)*pte_list;
>  		desc->sptes[1] = spte;
>  		desc_mark_nulls(pte_list, desc);
> +
> +		/*
> +		 * Ensure the old spte has been updated into desc, so
> +		 * that the other side cannot get the desc from pte_list
> +		 * but miss the old spte.
> +		 */
> +		smp_wmb();
> +
>  		*pte_list = (unsigned long)desc | 1;
>  		return 1;
>  	}
> @@ -1131,6 +1139,47 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
>  	WARN_ON(desc_get_nulls_value(desc) != pte_list);
>  }
>
> +/* The caller should hold rcu lock. */
> +typedef void (*pte_list_walk_lockless_fn) (u64 *spte, int level);
> +static void pte_list_walk_lockless(unsigned long *pte_list,
> +				   pte_list_walk_lockless_fn fn, int level)
> +{
> +	struct pte_list_desc *desc;
> +	unsigned long pte_list_value;
> +	int i;
> +
> +restart:
> +	pte_list_value = ACCESS_ONCE(*pte_list);
> +	if (!pte_list_value)
> +		return;
> +
> +	if (!(pte_list_value & 1))
> +		return fn((u64 *)pte_list_value, level);
> +
> +	/*
> +	 * fetch pte_list before reading sptes in the desc, see the
> +	 * comments in pte_list_add().
> +	 *
> +	 * There is a data dependence since the desc is obtained from
> +	 * pte_list.
> +	 */
> +	smp_read_barrier_depends();
> +
> +	desc = (struct pte_list_desc *)(pte_list_value & ~1ul);
> +	while (!desc_is_a_nulls(desc)) {
> +		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> +			fn(desc->sptes[i], level);
> +
> +		desc = ACCESS_ONCE(desc->more);
> +
> +		/* It is being initialized. */
> +		if (unlikely(!desc))
> +			goto restart;
> +	}
> +
> +	if (unlikely(desc_get_nulls_value(desc) != pte_list))
> +		goto restart;
So is it really enough to guarantee safety and correctness? What if desc
is moved to another rmap while we are walking it, so fn() is called on
incorrect sptes? Or what if desc is moved to another rmap, but then it
is moved back to the initial rmap (but to another place in the desc
list), so the check here will not catch that we need to restart walking?

> +}
> +
>  static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
>  				    struct kvm_memory_slot *slot)
>  {
> @@ -4557,7 +4606,7 @@ int kvm_mmu_module_init(void)
>  {
>  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
>  						sizeof(struct pte_list_desc),
> -						0, 0, NULL);
> +						0, SLAB_DESTROY_BY_RCU, NULL);
>  	if (!pte_list_desc_cache)
>  		goto nomem;
>
> --
> 1.8.1.4

--
	Gleb.
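[Editor's note: the desc_mark_nulls()/desc_is_a_nulls()/desc_get_nulls_value()
helpers used in the hunk above come from an earlier patch of this series and
are not shown here. A minimal sketch of one plausible encoding, assuming the
last desc's ->more field stores the address of the rmap head with the low bit
set as the "nulls" marker; the real helpers may differ in detail:

	/*
	 * Sketch only.  The "nulls" terminator encodes the identity of
	 * the pte-list it belongs to (the address of the rmap head),
	 * tagged with the low bit, so a walker that ends up on a desc
	 * that migrated to a different pte-list sees a foreign nulls
	 * value and restarts.
	 */
	static void desc_mark_nulls(unsigned long *pte_list,
				    struct pte_list_desc *desc)
	{
		desc->more = (struct pte_list_desc *)
					((unsigned long)pte_list | 1ul);
	}

	static bool desc_is_a_nulls(struct pte_list_desc *desc)
	{
		return (unsigned long)desc & 1;
	}

	static unsigned long *desc_get_nulls_value(struct pte_list_desc *desc)
	{
		return (unsigned long *)((unsigned long)desc & ~1ul);
	}
]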
On 08/28/2013 05:20 PM, Gleb Natapov wrote:
> On Tue, Jul 30, 2013 at 09:02:07PM +0800, Xiao Guangrong wrote:
>> The basic idea is from the nulls list, which uses a nulls value to
>> indicate whether the desc has been moved to a different pte-list.
>>
>> Thanks to SLAB_DESTROY_BY_RCU, the desc can be quickly reused.
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/kvm/mmu.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 50 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 36caf6a..f8fc0cc 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1010,6 +1010,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
>>  		desc->sptes[0] = (u64 *)*pte_list;
>>  		desc->sptes[1] = spte;
>>  		desc_mark_nulls(pte_list, desc);
>> +
>> +		/*
>> +		 * Ensure the old spte has been updated into desc, so
>> +		 * that the other side cannot get the desc from pte_list
>> +		 * but miss the old spte.
>> +		 */
>> +		smp_wmb();
>> +
>>  		*pte_list = (unsigned long)desc | 1;
>>  		return 1;
>>  	}
>> @@ -1131,6 +1139,47 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
>>  	WARN_ON(desc_get_nulls_value(desc) != pte_list);
>>  }
>>
>> +/* The caller should hold rcu lock. */
>> +typedef void (*pte_list_walk_lockless_fn) (u64 *spte, int level);
>> +static void pte_list_walk_lockless(unsigned long *pte_list,
>> +				   pte_list_walk_lockless_fn fn, int level)
>> +{
>> +	struct pte_list_desc *desc;
>> +	unsigned long pte_list_value;
>> +	int i;
>> +
>> +restart:
>> +	pte_list_value = ACCESS_ONCE(*pte_list);
>> +	if (!pte_list_value)
>> +		return;
>> +
>> +	if (!(pte_list_value & 1))
>> +		return fn((u64 *)pte_list_value, level);
>> +
>> +	/*
>> +	 * fetch pte_list before reading sptes in the desc, see the
>> +	 * comments in pte_list_add().
>> +	 *
>> +	 * There is a data dependence since the desc is obtained from
>> +	 * pte_list.
>> +	 */
>> +	smp_read_barrier_depends();
>> +
>> +	desc = (struct pte_list_desc *)(pte_list_value & ~1ul);
>> +	while (!desc_is_a_nulls(desc)) {
>> +		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
>> +			fn(desc->sptes[i], level);
>> +
>> +		desc = ACCESS_ONCE(desc->more);
>> +
>> +		/* It is being initialized. */
>> +		if (unlikely(!desc))
>> +			goto restart;
>> +	}
>> +
>> +	if (unlikely(desc_get_nulls_value(desc) != pte_list))
>> +		goto restart;
> So is it really enough to guarantee safety and correctness? What if
> desc is moved to another rmap while we are walking it, so fn() is
> called on incorrect sptes?

Then fn() will do needless write-protection. It is unnecessary work,
but it is acceptable for this rare case.

There is a bug here, though: we cannot detect the mapping level from
the rmap, since the desc can be moved as you said; that can cause us
to do write-protection on a middle spte. It can be fixed by getting
the mapping level from sp->role.level instead, since the sp cannot be
reused while the rcu lock is held.

> Or what if desc is moved to another rmap, but then it
> is moved back to the initial rmap (but to another place in the desc
> list), so the check here will not catch that we need to restart
> walking?

It is okay. We always add the new desc to the head, then we will walk
all the entries in this case.

Right?
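[Editor's note: a sketch of the fix Xiao describes, assuming the existing
page_header() helper in mmu.c that maps a shadow page table's physical
address to its struct kvm_mmu_page. Because the sp is not freed while the
RCU read lock is held, its role.level is a stable source for the mapping
level even when the desc has migrated to another rmap:

	/*
	 * Sketch: derive the mapping level from the shadow page that
	 * contains the spte instead of trusting the rmap the walk
	 * started from, which may be stale if the desc was moved.
	 */
	static int lockless_spte_level(u64 *sptep)
	{
		struct kvm_mmu_page *sp = page_header(__pa(sptep));

		/*
		 * Safe under rcu_read_lock(): the sp cannot be freed
		 * and reused while the lockless walker runs.
		 */
		return sp->role.level;
	}
]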
On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
> > Or what if desc is moved to another rmap, but then it
> > is moved back to the initial rmap (but to another place in the desc
> > list), so the check here will not catch that we need to restart
> > walking?
>
> It is okay. We always add the new desc to the head, then we will walk
> all the entries in this case.
>
Which raises another question: what if desc is added at the front of
the list, behind the point where the lockless walker currently is?

> Right?
Not sure. While the lockless walker works on a desc, the rmap can be
completely destroyed and recreated again. It can be in any order.

--
	Gleb.
On 08/28/2013 05:46 PM, Gleb Natapov wrote:
> On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
>>> Or what if desc is moved to another rmap, but then it
>>> is moved back to the initial rmap (but to another place in the desc
>>> list), so the check here will not catch that we need to restart
>>> walking?
>>
>> It is okay. We always add the new desc to the head, then we will
>> walk all the entries in this case.
>>
> Which raises another question: what if desc is added at the front of
> the list, behind the point where the lockless walker currently is?

That case is when a new spte is being added into the rmap. We need not
care about the new sptes, since they will set the dirty-bitmap and
then they can be write-protected the next time.

>
>> Right?
> Not sure. While the lockless walker works on a desc, the rmap can be
> completely destroyed and recreated again. It can be in any order.

I think the thing is very similar to include/linux/rculist_nulls.h
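[Editor's note: for comparison, the lookup pattern documented for nulls
lists, paraphrased from Documentation/RCU/rculist_nulls.txt; the struct
names here are illustrative. The restart on an unexpected nulls value is
the same trick the rmap walker above relies on:

	struct my_obj {
		struct hlist_nulls_node node;
		atomic_t refcnt;
		u32 key;
	};

	struct my_obj *lookup(struct hlist_nulls_head *table,
			      unsigned long slot, u32 key)
	{
		struct my_obj *pos, *obj = NULL;
		struct hlist_nulls_node *node;

		rcu_read_lock();
	begin:
		hlist_nulls_for_each_entry_rcu(pos, node, &table[slot], node) {
			if (pos->key == key) {
				/* fails if the object is being freed */
				if (!atomic_inc_not_zero(&pos->refcnt))
					goto begin;
				if (pos->key != key) {
					/* object was reused for another key */
					atomic_dec(&pos->refcnt); /* simplified put */
					goto begin;
				}
				obj = pos;
				break;
			}
		}
		/*
		 * The nulls value at the end of a chain identifies the
		 * bucket; a mismatch means the object was moved to
		 * another chain while we walked, so restart.
		 */
		if (!obj && get_nulls_value(node) != slot)
			goto begin;
		rcu_read_unlock();
		return obj;
	}
]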
On Wed, Aug 28, 2013 at 06:13:43PM +0800, Xiao Guangrong wrote:
> On 08/28/2013 05:46 PM, Gleb Natapov wrote:
>> On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
>>>> Or what if desc is moved to another rmap, but then it
>>>> is moved back to the initial rmap (but to another place in the
>>>> desc list), so the check here will not catch that we need to
>>>> restart walking?
>>>
>>> It is okay. We always add the new desc to the head, then we will
>>> walk all the entries in this case.
>>>
>> Which raises another question: what if desc is added at the front of
>> the list, behind the point where the lockless walker currently is?
>
> That case is when a new spte is being added into the rmap. We need
> not care about the new sptes, since they will set the dirty-bitmap
> and then they can be write-protected the next time.
>
OK.

>>
>>> Right?
>> Not sure. While the lockless walker works on a desc, the rmap can be
>> completely destroyed and recreated again. It can be in any order.
>
> I think the thing is very similar to include/linux/rculist_nulls.h
include/linux/rculist_nulls.h is for implementing hash tables, so they
may not care about add/del/lookup races for instance, but maybe we do
(you are saying above that we do not), so similarity does not prove
correctness for our case. BTW I do not see
rcu_assign_pointer()/rcu_dereference() in your patches, which hints at
incorrect usage of RCU. I think any access to slab pointers will need
to use those.

--
	Gleb.
On 08/28/2013 06:49 PM, Gleb Natapov wrote:
> On Wed, Aug 28, 2013 at 06:13:43PM +0800, Xiao Guangrong wrote:
>> On 08/28/2013 05:46 PM, Gleb Natapov wrote:
>>> On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
>>>>> Or what if desc is moved to another rmap, but then it
>>>>> is moved back to the initial rmap (but to another place in the
>>>>> desc list), so the check here will not catch that we need to
>>>>> restart walking?
>>>>
>>>> It is okay. We always add the new desc to the head, then we will
>>>> walk all the entries in this case.
>>>>
>>> Which raises another question: what if desc is added at the front
>>> of the list, behind the point where the lockless walker currently
>>> is?
>>
>> That case is when a new spte is being added into the rmap. We need
>> not care about the new sptes, since they will set the dirty-bitmap
>> and then they can be write-protected the next time.
>>
> OK.
>
>>>
>>>> Right?
>>> Not sure. While the lockless walker works on a desc, the rmap can
>>> be completely destroyed and recreated again. It can be in any
>>> order.
>>
>> I think the thing is very similar to include/linux/rculist_nulls.h
> include/linux/rculist_nulls.h is for implementing hash tables, so
> they may not care about add/del/lookup races for instance, but maybe
> we do (you are saying above that we do not), so similarity does not
> prove correctness for our case.

We do not care about the "add" and "del" either when looking up the
rmap. In the "add" case it is okay, for the reason I explained above.
In the "del" case, the spte becomes non-present and all tlbs are
flushed immediately, so it is also okay.

I always use a stupid way to check correctness, that is, enumerating
all the cases we may meet; in this patch, we may meet these cases:

1) kvm deletes a desc before the one we are currently on
   that desc has been checked, we do not need to care about it.

2) kvm deletes a desc after the one we are currently on
   Since we always add/del the head desc, we can be sure the current
   desc has been deleted as well, then we will meet case 3).

3) kvm deletes the desc that we are currently on
   3.a): the desc stays in the slab cache (it is not reused).
         all spte entries are empty, so fn() will skip the non-present
         sptes, and desc->more is
         3.a.1) still pointing to the next desc, then we will continue
                the lookup
         3.a.2) or it is the "nulls list", which means we have reached
                the last one, then we finish the walk.

   3.b): the desc has been allocated from the slab cache and is being
         initialized.
         we will see "desc->more == NULL" and restart the walking.
         It's okay.

   3.c): the desc is added to a rmap or pte_list again.
         3.c.1): the desc is added to the current rmap again.
                 the new desc always acts as the head desc, so we will
                 walk all the entries; some entries are double-checked
                 and no entry can be missed. It is okay.

         3.c.2): the desc is added to another rmap or pte_list.
                 since kvm_set_memory_region() and get_dirty are
                 serialized by slots-lock, the "nulls" cannot be
                 reused during the lookup. Then we will meet a
                 different "nulls" at the end of the walk, which will
                 cause a rewalk.

I know checking the algorithm like this is really silly; do you have
another idea?

> BTW I do not see
> rcu_assign_pointer()/rcu_dereference() in your patches, which hints at

IIUC, we cannot directly use rcu_assign_pointer(); that is something
like p = v, assigning a pointer to a pointer. But in our case, we
need:
   *pte_list = (unsigned long)desc | 1;

So I added the smp_wmb() by myself:
	/*
	 * Ensure the old spte has been updated into desc, so
	 * that the other side cannot get the desc from pte_list
	 * but miss the old spte.
	 */
	smp_wmb();

	*pte_list = (unsigned long)desc | 1;

But I missed it when inserting an empty desc; in that case we need the
barrier too, since we should make desc->more visible before assigning
the desc to pte_list, to avoid the lookup side seeing an invalid
"nulls".

I also use my own code instead of rcu_dereference():

pte_list_walk_lockless():
	pte_list_value = ACCESS_ONCE(*pte_list);
	if (!pte_list_value)
		return;

	if (!(pte_list_value & 1))
		return fn((u64 *)pte_list_value);

	/*
	 * fetch pte_list before reading sptes in the desc, see the
	 * comments in pte_list_add().
	 *
	 * There is a data dependence since the desc is obtained from
	 * pte_list.
	 */
	smp_read_barrier_depends();

That part can be replaced by rcu_dereference().

> incorrect usage of RCU. I think any access to slab pointers will need
> to use those.

Removing the desc is not necessary, I think, since we do not mind
seeing the old info. (hlist_nulls_del_rcu() does not use
rcu_dereference() either.)
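[Editor's note: what the walker might look like with the open-coded
ACCESS_ONCE() + smp_read_barrier_depends() replaced by rcu_dereference(),
as Xiao suggests above; a sketch, assuming the tagged head value may be
cast through an RCU-annotated pointer:

	/*
	 * Sketch: rcu_dereference() supplies both the one-time read
	 * and the dependency barrier that the hand-rolled version
	 * open-codes.  The low-bit tagging is unchanged.
	 */
	static void pte_list_walk_lockless(unsigned long *pte_list,
					   pte_list_walk_lockless_fn fn,
					   int level)
	{
		struct pte_list_desc *desc;
		unsigned long pte_list_value;
		int i;

	restart:
		pte_list_value = (unsigned long)rcu_dereference(
						*(void __rcu **)pte_list);
		if (!pte_list_value)
			return;

		if (!(pte_list_value & 1))
			return fn((u64 *)pte_list_value, level);

		desc = (struct pte_list_desc *)(pte_list_value & ~1ul);
		while (!desc_is_a_nulls(desc)) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
				fn(desc->sptes[i], level);

			desc = rcu_dereference(desc->more);

			/* It is being initialized. */
			if (unlikely(!desc))
				goto restart;
		}

		if (unlikely(desc_get_nulls_value(desc) != pte_list))
			goto restart;
	}
]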
On Wed, Aug 28, 2013 at 08:15:36PM +0800, Xiao Guangrong wrote:
> On 08/28/2013 06:49 PM, Gleb Natapov wrote:
>> On Wed, Aug 28, 2013 at 06:13:43PM +0800, Xiao Guangrong wrote:
>>> On 08/28/2013 05:46 PM, Gleb Natapov wrote:
>>>> On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
>>>>>> Or what if desc is moved to another rmap, but then it
>>>>>> is moved back to the initial rmap (but to another place in the
>>>>>> desc list), so the check here will not catch that we need to
>>>>>> restart walking?
>>>>>
>>>>> It is okay. We always add the new desc to the head, then we will
>>>>> walk all the entries in this case.
>>>>>
>>>> Which raises another question: what if desc is added at the front
>>>> of the list, behind the point where the lockless walker currently
>>>> is?
>>>
>>> That case is when a new spte is being added into the rmap. We need
>>> not care about the new sptes, since they will set the dirty-bitmap
>>> and then they can be write-protected the next time.
>>>
>> OK.
>>
>>>>
>>>>> Right?
>>>> Not sure. While the lockless walker works on a desc, the rmap can
>>>> be completely destroyed and recreated again. It can be in any
>>>> order.
>>>
>>> I think the thing is very similar to include/linux/rculist_nulls.h
>> include/linux/rculist_nulls.h is for implementing hash tables, so
>> they may not care about add/del/lookup races for instance, but maybe
>> we do (you are saying above that we do not), so similarity does not
>> prove correctness for our case.
>
> We do not care about the "add" and "del" either when looking up the
> rmap. In the "add" case it is okay, for the reason I explained above.
> In the "del" case, the spte becomes non-present and all tlbs are
> flushed immediately, so it is also okay.
>
> I always use a stupid way to check correctness, that is, enumerating
> all the cases we may meet; in this patch, we may meet these cases:
>
> 1) kvm deletes a desc before the one we are currently on
>    that desc has been checked, we do not need to care about it.
>
> 2) kvm deletes a desc after the one we are currently on
>    Since we always add/del the head desc, we can be sure the current
>    desc has been deleted as well, then we will meet case 3).
>
> 3) kvm deletes the desc that we are currently on
>    3.a): the desc stays in the slab cache (it is not reused).
>          all spte entries are empty, so fn() will skip the
>          non-present sptes, and desc->more is
>          3.a.1) still pointing to the next desc, then we will
>                 continue the lookup
>          3.a.2) or it is the "nulls list", which means we have
>                 reached the last one, then we finish the walk.
>
>    3.b): the desc has been allocated from the slab cache and is being
>          initialized.
>          we will see "desc->more == NULL" and restart the walking.
>          It's okay.
>
>    3.c): the desc is added to a rmap or pte_list again.
>          3.c.1): the desc is added to the current rmap again.
>                  the new desc always acts as the head desc, so we
>                  will walk all the entries; some entries are
>                  double-checked and no entry can be missed. It is
>                  okay.
>
>          3.c.2): the desc is added to another rmap or pte_list.
>                  since kvm_set_memory_region() and get_dirty are
>                  serialized by slots-lock, the "nulls" cannot be
>                  reused during the lookup. Then we will meet a
>                  different "nulls" at the end of the walk, which
>                  will cause a rewalk.
>
> I know checking the algorithm like this is really silly; do you have
> another idea?
>
Not silly, but easy to miss cases. For instance 3.c.3 can be: the desc
is added to another rmap, then we move to another desc on the wrong
rmap, and this other desc is also deleted and reinserted into the
original rmap. Seems like the justification from 3.c.1 applies to that
too, though.

>> BTW I do not see
>> rcu_assign_pointer()/rcu_dereference() in your patches, which hints
>> at
>
> IIUC, we cannot directly use rcu_assign_pointer(); that is something
> like p = v, assigning a pointer to a pointer. But in our case, we
> need:
>    *pte_list = (unsigned long)desc | 1;
From Documentation/RCU/whatisRCU.txt:

	The updater uses this function to assign a new value to an
	RCU-protected pointer.

This is what we do, no? (assuming slot->arch.rmap[] is what rcu
protects here) The fact that the value is not a correct pointer should
not matter.

> So I added the smp_wmb() by myself:
> 	/*
> 	 * Ensure the old spte has been updated into desc, so
> 	 * that the other side cannot get the desc from pte_list
> 	 * but miss the old spte.
> 	 */
> 	smp_wmb();
>
> 	*pte_list = (unsigned long)desc | 1;
>
> But I missed it when inserting an empty desc; in that case we need
> the barrier too, since we should make desc->more visible before
> assigning the desc to pte_list, to avoid the lookup side seeing an
> invalid "nulls".
>
> I also use my own code instead of rcu_dereference():
>
> pte_list_walk_lockless():
> 	pte_list_value = ACCESS_ONCE(*pte_list);
> 	if (!pte_list_value)
> 		return;
>
> 	if (!(pte_list_value & 1))
> 		return fn((u64 *)pte_list_value);
>
> 	/*
> 	 * fetch pte_list before reading sptes in the desc, see the
> 	 * comments in pte_list_add().
> 	 *
> 	 * There is a data dependence since the desc is obtained from
> 	 * pte_list.
> 	 */
> 	smp_read_barrier_depends();
>
> That part can be replaced by rcu_dereference().
>
Yes please, also see commit c87a124a5d5e8cf8e21c4363c3372bcaf53ea190
for the kind of scary bugs we can get here.

>> incorrect usage of RCU. I think any access to slab pointers will
>> need to use those.
>
> Removing the desc is not necessary, I think, since we do not mind
> seeing the old info. (hlist_nulls_del_rcu() does not use
> rcu_dereference() either.)
>
May be a bug. I also noticed that rculist_nulls uses rcu_dereference()
to access ->next, but it does not use rcu_assign_pointer() to assign
it.

--
	Gleb.
On 08/28/2013 09:36 PM, Gleb Natapov wrote:
> On Wed, Aug 28, 2013 at 08:15:36PM +0800, Xiao Guangrong wrote:
>> On 08/28/2013 06:49 PM, Gleb Natapov wrote:
>>> On Wed, Aug 28, 2013 at 06:13:43PM +0800, Xiao Guangrong wrote:
>>>> On 08/28/2013 05:46 PM, Gleb Natapov wrote:
>>>>> On Wed, Aug 28, 2013 at 05:33:49PM +0800, Xiao Guangrong wrote:
>>>>>>> Or what if desc is moved to another rmap, but then it
>>>>>>> is moved back to the initial rmap (but to another place in the
>>>>>>> desc list), so the check here will not catch that we need to
>>>>>>> restart walking?
>>>>>>
>>>>>> It is okay. We always add the new desc to the head, then we will
>>>>>> walk all the entries in this case.
>>>>>>
>>>>> Which raises another question: what if desc is added at the front
>>>>> of the list, behind the point where the lockless walker currently
>>>>> is?
>>>>
>>>> That case is when a new spte is being added into the rmap. We need
>>>> not care about the new sptes, since they will set the dirty-bitmap
>>>> and then they can be write-protected the next time.
>>>>
>>> OK.
>>>
>>>>>
>>>>>> Right?
>>>>> Not sure. While the lockless walker works on a desc, the rmap can
>>>>> be completely destroyed and recreated again. It can be in any
>>>>> order.
>>>>
>>>> I think the thing is very similar to include/linux/rculist_nulls.h
>>> include/linux/rculist_nulls.h is for implementing hash tables, so
>>> they may not care about add/del/lookup races for instance, but
>>> maybe we do (you are saying above that we do not), so similarity
>>> does not prove correctness for our case.
>>
>> We do not care about the "add" and "del" either when looking up the
>> rmap. In the "add" case it is okay, for the reason I explained
>> above. In the "del" case, the spte becomes non-present and all tlbs
>> are flushed immediately, so it is also okay.
>>
>> I always use a stupid way to check correctness, that is, enumerating
>> all the cases we may meet; in this patch, we may meet these cases:
>>
>> 1) kvm deletes a desc before the one we are currently on
>>    that desc has been checked, we do not need to care about it.
>>
>> 2) kvm deletes a desc after the one we are currently on
>>    Since we always add/del the head desc, we can be sure the current
>>    desc has been deleted as well, then we will meet case 3).
>>
>> 3) kvm deletes the desc that we are currently on
>>    3.a): the desc stays in the slab cache (it is not reused).
>>          all spte entries are empty, so fn() will skip the
>>          non-present sptes, and desc->more is
>>          3.a.1) still pointing to the next desc, then we will
>>                 continue the lookup
>>          3.a.2) or it is the "nulls list", which means we have
>>                 reached the last one, then we finish the walk.
>>
>>    3.b): the desc has been allocated from the slab cache and is
>>          being initialized.
>>          we will see "desc->more == NULL" and restart the walking.
>>          It's okay.
>>
>>    3.c): the desc is added to a rmap or pte_list again.
>>          3.c.1): the desc is added to the current rmap again.
>>                  the new desc always acts as the head desc, so we
>>                  will walk all the entries; some entries are
>>                  double-checked and no entry can be missed. It is
>>                  okay.
>>
>>          3.c.2): the desc is added to another rmap or pte_list.
>>                  since kvm_set_memory_region() and get_dirty are
>>                  serialized by slots-lock, the "nulls" cannot be
>>                  reused during the lookup. Then we will meet a
>>                  different "nulls" at the end of the walk, which
>>                  will cause a rewalk.
>>
>> I know checking the algorithm like this is really silly; do you have
>> another idea?
>>
> Not silly, but easy to miss cases. For instance 3.c.3 can be: the
> desc is added to another rmap, then we move to another desc on the
> wrong rmap, and this other desc is also deleted and reinserted into
> the original rmap. Seems like the justification from 3.c.1 applies to
> that too, though.
>
>>> BTW I do not see
>>> rcu_assign_pointer()/rcu_dereference() in your patches, which hints
>>> at
>>
>> IIUC, we cannot directly use rcu_assign_pointer(); that is something
>> like p = v, assigning a pointer to a pointer. But in our case, we
>> need:
>>    *pte_list = (unsigned long)desc | 1;
> From Documentation/RCU/whatisRCU.txt:
>
> 	The updater uses this function to assign a new value to an
> 	RCU-protected pointer.
>
> This is what we do, no? (assuming slot->arch.rmap[] is what rcu
> protects here) The fact that the value is not a correct pointer
> should not matter.
>
Okay. Will change that code to:

+
+#define rcu_assign_head_desc(pte_list_p, value)	\
+	rcu_assign_pointer(*(unsigned long __rcu **)(pte_list_p), (unsigned long *)(value))
+
 /*
  * Pte mapping structures:
  *
@@ -1006,14 +1010,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 		desc->sptes[1] = spte;
 		desc_mark_nulls(pte_list, desc);

-		/*
-		 * Ensure the old spte has been updated into desc, so
-		 * that the other side cannot get the desc from pte_list
-		 * but miss the old spte.
-		 */
-		smp_wmb();
-
-		*pte_list = (unsigned long)desc | 1;
+		rcu_assign_head_desc(pte_list, (unsigned long)desc | 1);

>>
>> So I added the smp_wmb() by myself:
>> 	/*
>> 	 * Ensure the old spte has been updated into desc, so
>> 	 * that the other side cannot get the desc from pte_list
>> 	 * but miss the old spte.
>> 	 */
>> 	smp_wmb();
>>
>> 	*pte_list = (unsigned long)desc | 1;
>>
>> But I missed it when inserting an empty desc; in that case we need
>> the barrier too, since we should make desc->more visible before
>> assigning the desc to pte_list, to avoid the lookup side seeing an
>> invalid "nulls".
>>
>> I also use my own code instead of rcu_dereference():
>> pte_list_walk_lockless():
>> 	pte_list_value = ACCESS_ONCE(*pte_list);
>> 	if (!pte_list_value)
>> 		return;
>>
>> 	if (!(pte_list_value & 1))
>> 		return fn((u64 *)pte_list_value);
>>
>> 	/*
>> 	 * fetch pte_list before reading sptes in the desc, see the
>> 	 * comments in pte_list_add().
>> 	 *
>> 	 * There is a data dependence since the desc is obtained from
>> 	 * pte_list.
>> 	 */
>> 	smp_read_barrier_depends();
>>
>> That part can be replaced by rcu_dereference().
>>
> Yes please, also see commit c87a124a5d5e8cf8e21c4363c3372bcaf53ea190
> for the kind of scary bugs we can get here.

Right, it is likely triggerable in our case, will fix it.

>
>>> incorrect usage of RCU. I think any access to slab pointers will
>>> need to use those.
>>
>> Removing the desc is not necessary, I think, since we do not mind
>> seeing the old info. (hlist_nulls_del_rcu() does not use
>> rcu_dereference() either.)
>>
> May be a bug. I also noticed that rculist_nulls uses rcu_dereference()

But list_del_rcu() does not use rcu_assign_pointer() either.

> to access ->next, but it does not use rcu_assign_pointer() to assign
> it.

You mean rcu_dereference() is used in hlist_nulls_for_each_entry_rcu()?
I think it's because we should validate the prefetched data before
entry->next is accessed; it is paired with the barrier in
rcu_assign_pointer() when a new entry is added into the list.
rcu_assign_pointer() makes the other fields in the entry visible before
linking the entry into the list. Otherwise, the lookup could access
that entry but get invalid fields.

After more thinking, I still think rcu_assign_pointer() is unneeded
when an entry is removed. The remove-API does not care about the order
between unlinking the entry and the changes to its fields. It is the
caller's responsibility:
- in the case of rcuhlist, the caller uses
  call_rcu()/synchronize_rcu(), etc., to enforce that all lookups have
  exited, so the later change on that entry is invisible to the
  lookups.

- in the case of rculist_nulls, it seems a refcounter is used to
  guarantee the order (see the example from
  Documentation/RCU/rculist_nulls.txt).

- in our case, we allow the lookup to see the deleted desc even if it
  is in the slab cache, or it is being initialized, or it is re-added.

Your thoughts?
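[Editor's note: the refcount-ordered insert Xiao refers to, paraphrased
from Documentation/RCU/rculist_nulls.txt; names are illustrative.
hlist_nulls_add_head_rcu() internally uses rcu_assign_pointer(), so the
stores to the object's fields are ordered before the list linkage:

	void insert(struct hlist_nulls_head *table, spinlock_t *bucket_lock,
		    unsigned long slot, struct my_obj *obj, u32 key)
	{
		spin_lock(bucket_lock);
		obj->key = key;
		/*
		 * Setting the refcount before publication is what lets
		 * a lookup that raced with free+reuse detect it:
		 * atomic_inc_not_zero() fails on an object whose
		 * refcount already dropped to zero.
		 */
		atomic_set(&obj->refcnt, 1);
		hlist_nulls_add_head_rcu(&obj->node, &table[slot]);
		spin_unlock(bucket_lock);
	}
]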
On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>>>> BTW I do not see
>>>> rcu_assign_pointer()/rcu_dereference() in your patches, which
>>>> hints at
>>>
>>> IIUC, we cannot directly use rcu_assign_pointer(); that is
>>> something like p = v, assigning a pointer to a pointer. But in our
>>> case, we need:
>>>    *pte_list = (unsigned long)desc | 1;
>> From Documentation/RCU/whatisRCU.txt:
>>
>> 	The updater uses this function to assign a new value to an
>> 	RCU-protected pointer.
>>
>> This is what we do, no? (assuming slot->arch.rmap[] is what rcu
>> protects here) The fact that the value is not a correct pointer
>> should not matter.
>>
> Okay. Will change that code to:
>
> +
> +#define rcu_assign_head_desc(pte_list_p, value)	\
> +	rcu_assign_pointer(*(unsigned long __rcu **)(pte_list_p), (unsigned long *)(value))
> +
>  /*
>   * Pte mapping structures:
>   *
> @@ -1006,14 +1010,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
>  		desc->sptes[1] = spte;
>  		desc_mark_nulls(pte_list, desc);
>
> -		/*
> -		 * Ensure the old spte has been updated into desc, so
> -		 * that the other side cannot get the desc from pte_list
> -		 * but miss the old spte.
> -		 */
> -		smp_wmb();
> -
> -		*pte_list = (unsigned long)desc | 1;
> +		rcu_assign_head_desc(pte_list, (unsigned long)desc | 1);
>
>>>
>>> So I added the smp_wmb() by myself:
>>> 	/*
>>> 	 * Ensure the old spte has been updated into desc, so
>>> 	 * that the other side cannot get the desc from pte_list
>>> 	 * but miss the old spte.
>>> 	 */
>>> 	smp_wmb();
>>>
>>> 	*pte_list = (unsigned long)desc | 1;
>>>
>>> But I missed it when inserting an empty desc; in that case we need
>>> the barrier too, since we should make desc->more visible before
>>> assigning the desc to pte_list, to avoid the lookup side seeing an
>>> invalid "nulls".
>>>
>>> I also use my own code instead of rcu_dereference():
>>> pte_list_walk_lockless():
>>> 	pte_list_value = ACCESS_ONCE(*pte_list);
>>> 	if (!pte_list_value)
>>> 		return;
>>>
>>> 	if (!(pte_list_value & 1))
>>> 		return fn((u64 *)pte_list_value);
>>>
>>> 	/*
>>> 	 * fetch pte_list before reading sptes in the desc, see the
>>> 	 * comments in pte_list_add().
>>> 	 *
>>> 	 * There is a data dependence since the desc is obtained from
>>> 	 * pte_list.
>>> 	 */
>>> 	smp_read_barrier_depends();
>>>
>>> That part can be replaced by rcu_dereference().
>>>
>> Yes please, also see commit c87a124a5d5e8cf8e21c4363c3372bcaf53ea190
>> for the kind of scary bugs we can get here.
>
> Right, it is likely triggerable in our case, will fix it.
>
>>>> incorrect usage of RCU. I think any access to slab pointers will
>>>> need to use those.
>>>
>>> Removing the desc is not necessary, I think, since we do not mind
>>> seeing the old info. (hlist_nulls_del_rcu() does not use
>>> rcu_dereference() either.)
>>>
>> May be a bug. I also noticed that rculist_nulls uses
>> rcu_dereference()
>
> But list_del_rcu() does not use rcu_assign_pointer() either.
>
This is also suspicious.

>> to access ->next, but it does not use rcu_assign_pointer() to assign
>> it.
>
> You mean rcu_dereference() is used in
> hlist_nulls_for_each_entry_rcu()? I think it's because we should
> validate the prefetched data before entry->next is accessed; it is
> paired with the barrier in rcu_assign_pointer() when a new entry is
> added into the list. rcu_assign_pointer() makes the other fields in
> the entry visible before linking the entry into the list. Otherwise,
> the lookup could access that entry but get invalid fields.
>
> After more thinking, I still think rcu_assign_pointer() is unneeded
> when an entry is removed. The remove-API does not care about the
> order between unlinking the entry and the changes to its fields. It
> is the caller's responsibility:
> - in the case of rcuhlist, the caller uses
>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>   have exited, so the later change on that entry is invisible to the
>   lookups.
>
> - in the case of rculist_nulls, it seems a refcounter is used to
>   guarantee the order (see the example from
>   Documentation/RCU/rculist_nulls.txt).
>
> - in our case, we allow the lookup to see the deleted desc even if it
>   is in the slab cache, or it is being initialized, or it is
>   re-added.
>
> Your thoughts?
>
As Documentation/RCU/whatisRCU.txt says:

	As with rcu_assign_pointer(), an important function of
	rcu_dereference() is to document which pointers are protected by
	RCU, in particular, flagging a pointer that is subject to changing
	at any time, including immediately after the rcu_dereference().
	And, again like rcu_assign_pointer(), rcu_dereference() is
	typically used indirectly, via the _rcu list-manipulation
	primitives, such as list_for_each_entry_rcu().

The documentation aspect of rcu_assign_pointer()/rcu_dereference() is
important. The code is complicated, so self-documentation will not
hurt. I want to see what is actually protected by rcu here. Freeing
shadow pages with call_rcu() further complicates matters: does it mean
that shadow pages are also protected by rcu?

--
	Gleb.
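[Editor's note: the call_rcu() freeing Gleb refers to is added by an
earlier patch of this series; it is roughly along these lines, with the
kvm->arch.rcu_free_shadow_page indicator taken from the series and the
other details (the rcu_head field name, the call site) assumed:

	/*
	 * Sketch: when a lockless walker may be running, shadow pages
	 * are handed to RCU instead of being freed immediately, so the
	 * walker can keep dereferencing sp->spt and sp->role.level.
	 */
	static void free_shadow_page_rcu(struct rcu_head *head)
	{
		struct kvm_mmu_page *sp =
			container_of(head, struct kvm_mmu_page, rcu);

		kvm_mmu_free_page(sp);
	}

	/* at commit-zap time: */
		if (kvm->arch.rcu_free_shadow_page)
			call_rcu(&sp->rcu, free_shadow_page_rcu);
		else
			kvm_mmu_free_page(sp);
]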
On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
> After more thinking, I still think rcu_assign_pointer() is unneeded
> when an entry is removed. The remove-API does not care about the
> order between unlinking the entry and the changes to its fields. It
> is the caller's responsibility:
> - in the case of rcuhlist, the caller uses
>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>   have exited, so the later change on that entry is invisible to the
>   lookups.
>
> - in the case of rculist_nulls, it seems a refcounter is used to
>   guarantee the order (see the example from
>   Documentation/RCU/rculist_nulls.txt).
>
> - in our case, we allow the lookup to see the deleted desc even if it
>   is in the slab cache, or it is being initialized, or it is
>   re-added.
>
BTW is it a good idea? We can access a deleted desc while it is
allocated and initialized to zero by kmem_cache_zalloc(); are we sure
we cannot see a partially initialized desc->sptes[] entry? On a
related note, what about 32-bit systems? They do not have atomic
access to desc->sptes[].

--
	Gleb.
On 08/29/2013 05:08 PM, Gleb Natapov wrote:
> On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>>>>> BTW I do not see
>>>>> rcu_assign_pointer()/rcu_dereference() in your patches, which
>>>>> hints at
>>>>
>>>> IIUC, we cannot directly use rcu_assign_pointer(); that is
>>>> something like p = v, assigning a pointer to a pointer. But in our
>>>> case, we need:
>>>>    *pte_list = (unsigned long)desc | 1;
>>> From Documentation/RCU/whatisRCU.txt:
>>>
>>> 	The updater uses this function to assign a new value to an
>>> 	RCU-protected pointer.
>>>
>>> This is what we do, no? (assuming slot->arch.rmap[] is what rcu
>>> protects here) The fact that the value is not a correct pointer
>>> should not matter.
>>>
>> Okay. Will change that code to:
>>
>> +
>> +#define rcu_assign_head_desc(pte_list_p, value)	\
>> +	rcu_assign_pointer(*(unsigned long __rcu **)(pte_list_p), (unsigned long *)(value))
>> +
>>  /*
>>   * Pte mapping structures:
>>   *
>> @@ -1006,14 +1010,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
>>  		desc->sptes[1] = spte;
>>  		desc_mark_nulls(pte_list, desc);
>>
>> -		/*
>> -		 * Ensure the old spte has been updated into desc, so
>> -		 * that the other side cannot get the desc from pte_list
>> -		 * but miss the old spte.
>> -		 */
>> -		smp_wmb();
>> -
>> -		*pte_list = (unsigned long)desc | 1;
>> +		rcu_assign_head_desc(pte_list, (unsigned long)desc | 1);
>>
>>>>
>>>> So I added the smp_wmb() by myself:
>>>> 	/*
>>>> 	 * Ensure the old spte has been updated into desc, so
>>>> 	 * that the other side cannot get the desc from pte_list
>>>> 	 * but miss the old spte.
>>>> 	 */
>>>> 	smp_wmb();
>>>>
>>>> 	*pte_list = (unsigned long)desc | 1;
>>>>
>>>> But I missed it when inserting an empty desc; in that case we need
>>>> the barrier too, since we should make desc->more visible before
>>>> assigning the desc to pte_list, to avoid the lookup side seeing an
>>>> invalid "nulls".
>>>>
>>>> I also use my own code instead of rcu_dereference():
>>>> pte_list_walk_lockless():
>>>> 	pte_list_value = ACCESS_ONCE(*pte_list);
>>>> 	if (!pte_list_value)
>>>> 		return;
>>>>
>>>> 	if (!(pte_list_value & 1))
>>>> 		return fn((u64 *)pte_list_value);
>>>>
>>>> 	/*
>>>> 	 * fetch pte_list before reading sptes in the desc, see the
>>>> 	 * comments in pte_list_add().
>>>> 	 *
>>>> 	 * There is a data dependence since the desc is obtained from
>>>> 	 * pte_list.
>>>> 	 */
>>>> 	smp_read_barrier_depends();
>>>>
>>>> That part can be replaced by rcu_dereference().
>>>>
>>> Yes please, also see commit
>>> c87a124a5d5e8cf8e21c4363c3372bcaf53ea190 for the kind of scary bugs
>>> we can get here.
>>
>> Right, it is likely triggerable in our case, will fix it.
>>
>>>>> incorrect usage of RCU. I think any access to slab pointers will
>>>>> need to use those.
>>>>
>>>> Removing the desc is not necessary, I think, since we do not mind
>>>> seeing the old info. (hlist_nulls_del_rcu() does not use
>>>> rcu_dereference() either.)
>>>>
>>> May be a bug. I also noticed that rculist_nulls uses
>>> rcu_dereference()
>>
>> But list_del_rcu() does not use rcu_assign_pointer() either.
>>
> This is also suspicious.
>
>>> to access ->next, but it does not use rcu_assign_pointer() to
>>> assign it.
>>
>> You mean rcu_dereference() is used in
>> hlist_nulls_for_each_entry_rcu()? I think it's because we should
>> validate the prefetched data before entry->next is accessed; it is
>> paired with the barrier in rcu_assign_pointer() when a new entry is
>> added into the list. rcu_assign_pointer() makes the other fields in
>> the entry visible before linking the entry into the list. Otherwise,
>> the lookup could access that entry but get invalid fields.
>>
>> After more thinking, I still think rcu_assign_pointer() is unneeded
>> when an entry is removed. The remove-API does not care about the
>> order between unlinking the entry and the changes to its fields. It
>> is the caller's responsibility:
>> - in the case of rcuhlist, the caller uses
>>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>>   have exited, so the later change on that entry is invisible to the
>>   lookups.
>>
>> - in the case of rculist_nulls, it seems a refcounter is used to
>>   guarantee the order (see the example from
>>   Documentation/RCU/rculist_nulls.txt).
>>
>> - in our case, we allow the lookup to see the deleted desc even if
>>   it is in the slab cache, or it is being initialized, or it is
>>   re-added.
>>
>> Your thoughts?
>>
> As Documentation/RCU/whatisRCU.txt says:
>
> 	As with rcu_assign_pointer(), an important function of
> 	rcu_dereference() is to document which pointers are protected
> 	by RCU, in particular, flagging a pointer that is subject to
> 	changing at any time, including immediately after the
> 	rcu_dereference().  And, again like rcu_assign_pointer(),
> 	rcu_dereference() is typically used indirectly, via the _rcu
> 	list-manipulation primitives, such as
> 	list_for_each_entry_rcu().
>
> The documentation aspect of rcu_assign_pointer()/rcu_dereference() is
> important. The code is complicated, so self-documentation will not
> hurt. I want to see what is actually protected by rcu here. Freeing
> shadow pages with call_rcu() further complicates matters: does it
> mean that shadow pages are also protected by rcu?

Yes, it stops the shadow page from being freed while we do
write-protection on it.
On Thu, Aug 29, 2013 at 05:31:42PM +0800, Xiao Guangrong wrote:
>> As Documentation/RCU/whatisRCU.txt says:
>>
>> 	As with rcu_assign_pointer(), an important function of
>> 	rcu_dereference() is to document which pointers are protected by
>> 	RCU, in particular, flagging a pointer that is subject to changing
>> 	at any time, including immediately after the rcu_dereference().
>> 	And, again like rcu_assign_pointer(), rcu_dereference() is
>> 	typically used indirectly, via the _rcu list-manipulation
>> 	primitives, such as list_for_each_entry_rcu().
>>
>> The documentation aspect of rcu_assign_pointer()/rcu_dereference()
>> is important. The code is complicated, so self-documentation will
>> not hurt. I want to see what is actually protected by rcu here.
>> Freeing shadow pages with call_rcu() further complicates matters:
>> does it mean that shadow pages are also protected by rcu?
>
> Yes, it stops the shadow page from being freed while we do
> write-protection on it.
>
Yeah, I got the trick; what I am saying is that we have a data
structure here protected by RCU, but we do not use the RCU functions
to access it... BTW why not allocate sp->spt from a
SLAB_DESTROY_BY_RCU cache too? We may occasionally switch write
protection on a random spt if a page is deleted and reused for another
spt, though. For a last-level spt it should not be a problem, and for
a non-last level we have the is_last_spte() check in
__rmap_write_protect_lockless(). Can it work?

--
	Gleb.
On 08/29/2013 05:51 PM, Gleb Natapov wrote:
> On Thu, Aug 29, 2013 at 05:31:42PM +0800, Xiao Guangrong wrote:
>>> As Documentation/RCU/whatisRCU.txt says:
>>>
>>> 	As with rcu_assign_pointer(), an important function of
>>> 	rcu_dereference() is to document which pointers are protected by
>>> 	RCU, in particular, flagging a pointer that is subject to changing
>>> 	at any time, including immediately after the rcu_dereference().
>>> 	And, again like rcu_assign_pointer(), rcu_dereference() is
>>> 	typically used indirectly, via the _rcu list-manipulation
>>> 	primitives, such as list_for_each_entry_rcu().
>>>
>>> The documentation aspect of rcu_assign_pointer()/rcu_dereference()
>>> is important. The code is complicated, so self-documentation will
>>> not hurt. I want to see what is actually protected by rcu here.
>>> Freeing shadow pages with call_rcu() further complicates matters:
>>> does it mean that shadow pages are also protected by rcu?
>>
>> Yes, it stops the shadow page from being freed while we do
>> write-protection on it.
>>
> Yeah, I got the trick; what I am saying is that we have a data
> structure here protected by RCU, but we do not use the RCU functions
> to access it...

Yes, they are not used when inserting a spte into the rmap and getting
the rmap from the entry... but do we need to use these functions to
guarantee the order?

The worst case is that we fetch the spte from the desc but the spte
has not been updated yet; we can happily skip that spte, since it will
set the dirty-bitmap later. This is guaranteed by the barrier between
mmu_spte_update() and mark_page_dirty(); the code is:

set_spte():

	if (mmu_spte_update(sptep, spte))
		kvm_flush_remote_tlbs(vcpu->kvm);

	if (!remap) {
		if (rmap_add(vcpu, sptep, gfn) > RMAP_RECYCLE_THRESHOLD)
			rmap_recycle(vcpu, sptep, gfn);

		if (level > PT_PAGE_TABLE_LEVEL)
			++vcpu->kvm->stat.lpages;
	}

	smp_wmb();

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

So, I guess if we can guarantee the order by ourselves, we do not need
to call the rcu functions explicitly... But the memory barriers in the
rcu functions are really light on x86 (a store cannot be reordered
with another store), so I do not mind using them explicitly if you
think that way is safer. :)

> BTW why not allocate sp->spt from a SLAB_DESTROY_BY_RCU cache too? We
> may occasionally switch write protection on a random spt if a page is
> deleted and reused for another spt, though. For a last-level spt it
> should not be a problem, and for a non-last level we have the
> is_last_spte() check in __rmap_write_protect_lockless(). Can it work?

Yes, I also considered this way. It can work if we handle
is_last_spte() properly. Since the sp->spt can be reused, we cannot
get the mapping level from the sp. We need to encode the mapping level
into the spte, so that cmpxchg can understand whether the page table
has been moved to another mapping level.

Could you allow me to make this optimization separately, after this
patchset is merged?
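[Editor's note: to make this exchange concrete, a sketch of the
write-protection callback under discussion; it assumes the existing
is_last_spte()/is_writable_pte() helpers and runs inside the walker's
rcu read-side section. The cmpxchg64() keeps a racing hardware
dirty/accessed update or a concurrent software spte change from being
overwritten blindly:

	static void __rmap_write_protect_lockless(u64 *sptep, int level)
	{
		u64 spte = ACCESS_ONCE(*sptep);

		/*
		 * The desc (and hence the level hint) may be stale, and
		 * sp->spt pages may have been reused: only last-level,
		 * writable sptes are touched.
		 */
		if (!is_writable_pte(spte) || !is_last_spte(spte, level))
			return;

		/*
		 * A failed cmpxchg means the spte changed under us; the
		 * next GET_DIRTY_LOG round will retry the protection.
		 */
		cmpxchg64(sptep, spte, spte & ~PT_WRITABLE_MASK);
	}
]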
On 08/29/2013 05:31 PM, Gleb Natapov wrote:
> On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>> After more thinking, I still think rcu_assign_pointer() is unneeded
>> when an entry is removed. The remove-API does not care about the
>> order between unlinking the entry and the changes to its fields. It
>> is the caller's responsibility:
>> - in the case of rcuhlist, the caller uses
>>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>>   have exited, so the later change on that entry is invisible to the
>>   lookups.
>>
>> - in the case of rculist_nulls, it seems a refcounter is used to
>>   guarantee the order (see the example from
>>   Documentation/RCU/rculist_nulls.txt).
>>
>> - in our case, we allow the lookup to see the deleted desc even if
>>   it is in the slab cache, or it is being initialized, or it is
>>   re-added.
>>
> BTW is it a good idea? We can access a deleted desc while it is
> allocated and initialized to zero by kmem_cache_zalloc(); are we sure
> we cannot see a partially initialized desc->sptes[] entry? On a
> related note, what about 32-bit systems? They do not have atomic
> access to desc->sptes[].

Good eyes. This is a bug here.

It seems we do not have a good way to fix this. How about disabling
this optimization on 32-bit hosts? A small change:

static inline void kvm_mmu_rcu_free_page_begin(struct kvm *kvm)
{
+#ifdef CONFIG_X86_64
	rcu_read_lock();

	kvm->arch.rcu_free_shadow_page = true;
	/* Set the indicator before accessing shadow pages. */
	smp_mb();
+#else
+	spin_lock(&kvm->mmu_lock);
+#endif
}

static inline void kvm_mmu_rcu_free_page_end(struct kvm *kvm)
{
+#ifdef CONFIG_X86_64
	/* Make sure that accessing shadow pages has finished. */
	smp_mb();
	kvm->arch.rcu_free_shadow_page = false;

	rcu_read_unlock();
+#else
+	spin_unlock(&kvm->mmu_lock);
+#endif
}
On 08/29/2013 07:33 PM, Xiao Guangrong wrote:
> On 08/29/2013 05:31 PM, Gleb Natapov wrote:
>> On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>>> After more thinking, I still think rcu_assign_pointer() is unneeded
>>> when an entry is removed. The remove-API does not care about the
>>> order between unlinking the entry and the changes to its fields. It
>>> is the caller's responsibility:
>>> - in the case of rcuhlist, the caller uses
>>>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>>>   have exited, so the later change on that entry is invisible to
>>>   the lookups.
>>>
>>> - in the case of rculist_nulls, it seems a refcounter is used to
>>>   guarantee the order (see the example from
>>>   Documentation/RCU/rculist_nulls.txt).
>>>
>>> - in our case, we allow the lookup to see the deleted desc even if
>>>   it is in the slab cache, or it is being initialized, or it is
>>>   re-added.
>>>
>> BTW is it a good idea? We can access a deleted desc while it is
>> allocated and initialized to zero by kmem_cache_zalloc(); are we
>> sure we cannot see a partially initialized desc->sptes[] entry? On a
>> related note, what about 32-bit systems? They do not have atomic
>> access to desc->sptes[].

Ah... wait. desc is an array of pointers:

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

Assigning a pointer is always atomic, but we should carefully
initialize it, as you said. I will introduce a constructor for the
desc slab cache which initializes the struct like this:

	for (i = 0; i < PTE_LIST_EXT; i++)
		desc->sptes[i] = NULL;

It is okay.

>
> Good eyes. This is a bug here.
>
> It seems we do not have a good way to fix this. How about disabling
> this optimization on 32-bit hosts? A small change:
>
> static inline void kvm_mmu_rcu_free_page_begin(struct kvm *kvm)
> {
> +#ifdef CONFIG_X86_64
> 	rcu_read_lock();
>
> 	kvm->arch.rcu_free_shadow_page = true;
> 	/* Set the indicator before accessing shadow pages. */
> 	smp_mb();
> +#else
> +	spin_lock(&kvm->mmu_lock);
> +#endif
> }
>
> static inline void kvm_mmu_rcu_free_page_end(struct kvm *kvm)
> {
> +#ifdef CONFIG_X86_64
> 	/* Make sure that accessing shadow pages has finished. */
> 	smp_mb();
> 	kvm->arch.rcu_free_shadow_page = false;
>
> 	rcu_read_unlock();
> +#else
> +	spin_unlock(&kvm->mmu_lock);
> +#endif
> }
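[Editor's note: a sketch of the constructor Xiao describes. With
SLAB_DESTROY_BY_RCU the constructor runs when a slab page is first
populated with objects, not on every kmem_cache_alloc(), so the
allocation site must also stop relying on kmem_cache_zalloc():

	static void pte_list_desc_ctor(void *p)
	{
		struct pte_list_desc *desc = p;
		int i;

		/*
		 * A concurrent lockless walker then never sees a
		 * pointer-sized garbage value in sptes[]: each slot is
		 * either NULL or a valid spte pointer, and pointer
		 * stores are atomic.
		 */
		for (i = 0; i < PTE_LIST_EXT; i++)
			desc->sptes[i] = NULL;
	}

	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
						sizeof(struct pte_list_desc),
						0, SLAB_DESTROY_BY_RCU,
						pte_list_desc_ctor);
]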
On Thu, Aug 29, 2013 at 07:26:40PM +0800, Xiao Guangrong wrote:
> On 08/29/2013 05:51 PM, Gleb Natapov wrote:
>> On Thu, Aug 29, 2013 at 05:31:42PM +0800, Xiao Guangrong wrote:
>>>> As Documentation/RCU/whatisRCU.txt says:
>>>>
>>>> 	As with rcu_assign_pointer(), an important function of
>>>> 	rcu_dereference() is to document which pointers are protected
>>>> 	by RCU, in particular, flagging a pointer that is subject to
>>>> 	changing at any time, including immediately after the
>>>> 	rcu_dereference().  And, again like rcu_assign_pointer(),
>>>> 	rcu_dereference() is typically used indirectly, via the _rcu
>>>> 	list-manipulation primitives, such as
>>>> 	list_for_each_entry_rcu().
>>>>
>>>> The documentation aspect of
>>>> rcu_assign_pointer()/rcu_dereference() is important. The code is
>>>> complicated, so self-documentation will not hurt. I want to see
>>>> what is actually protected by rcu here. Freeing shadow pages with
>>>> call_rcu() further complicates matters: does it mean that shadow
>>>> pages are also protected by rcu?
>>>
>>> Yes, it stops the shadow page from being freed while we do
>>> write-protection on it.
>>>
>> Yeah, I got the trick; what I am saying is that we have a data
>> structure here protected by RCU, but we do not use the RCU functions
>> to access it...
>
> Yes, they are not used when inserting a spte into the rmap and
> getting the rmap from the entry... but do we need to use these
> functions to guarantee the order?
>
> The worst case is that we fetch the spte from the desc but the spte
> has not been updated yet; we can happily skip that spte, since it
> will set the dirty-bitmap later. This is guaranteed by the barrier
> between mmu_spte_update() and mark_page_dirty(); the code is:
>
> set_spte():
>
> 	if (mmu_spte_update(sptep, spte))
> 		kvm_flush_remote_tlbs(vcpu->kvm);
>
> 	if (!remap) {
> 		if (rmap_add(vcpu, sptep, gfn) > RMAP_RECYCLE_THRESHOLD)
> 			rmap_recycle(vcpu, sptep, gfn);
>
> 		if (level > PT_PAGE_TABLE_LEVEL)
> 			++vcpu->kvm->stat.lpages;
> 	}
>
> 	smp_wmb();
>
> 	if (pte_access & ACC_WRITE_MASK)
> 		mark_page_dirty(vcpu->kvm, gfn);
>
> So, I guess if we can guarantee the order by ourselves, we do not
> need to call the rcu functions explicitly... But the memory barriers
> in the rcu functions are really light on x86 (a store cannot be
> reordered with another store), so I do not mind using them explicitly
> if you think that way is safer. :)
>
I think the self-documentation aspect of using the rcu functions is
also important.

>> BTW why not allocate sp->spt from a SLAB_DESTROY_BY_RCU cache too?
>> We may occasionally switch write protection on a random spt if a
>> page is deleted and reused for another spt, though. For a last-level
>> spt it should not be a problem, and for a non-last level we have the
>> is_last_spte() check in __rmap_write_protect_lockless(). Can it
>> work?
>
> Yes, I also considered this way. It can work if we handle
> is_last_spte() properly. Since the sp->spt can be reused, we cannot
> get the mapping level from the sp. We need to encode the mapping
> level into the spte, so that cmpxchg can understand whether the page
> table has been moved to another mapping level.
Isn't one bit that says the spte is the last one enough? IIRC we have
one more ignored bit to spare in the spte.

> Could you allow me to make this optimization separately, after this
> patchset is merged?
>
If you think it will complicate the initial version, I am fine with
postponing it for later.

--
	Gleb.
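[Editor's note: a purely illustrative sketch of the spare-bit idea; the
bit chosen here is an assumption, not the series' actual spte layout:

	/* Hypothetical software-available bit marking a last-level spte. */
	#define SPTE_LAST_LEVEL		(1ull << 53)

	static bool spte_is_last_level_lockless(u64 spte)
	{
		/*
		 * Readable without knowing which shadow page currently
		 * owns the spte, which is exactly what the lockless
		 * walker needs.
		 */
		return is_shadow_present_pte(spte) &&
		       (spte & SPTE_LAST_LEVEL);
	}
]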
On Thu, Aug 29, 2013 at 08:02:30PM +0800, Xiao Guangrong wrote:
> On 08/29/2013 07:33 PM, Xiao Guangrong wrote:
>> On 08/29/2013 05:31 PM, Gleb Natapov wrote:
>>> On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>>>> After more thinking, I still think rcu_assign_pointer() is
>>>> unneeded when an entry is removed. The remove-API does not care
>>>> about the order between unlinking the entry and the changes to its
>>>> fields. It is the caller's responsibility:
>>>> - in the case of rcuhlist, the caller uses
>>>>   call_rcu()/synchronize_rcu(), etc., to enforce that all lookups
>>>>   have exited, so the later change on that entry is invisible to
>>>>   the lookups.
>>>>
>>>> - in the case of rculist_nulls, it seems a refcounter is used to
>>>>   guarantee the order (see the example from
>>>>   Documentation/RCU/rculist_nulls.txt).
>>>>
>>>> - in our case, we allow the lookup to see the deleted desc even if
>>>>   it is in the slab cache, or it is being initialized, or it is
>>>>   re-added.
>>>>
>>> BTW is it a good idea? We can access a deleted desc while it is
>>> allocated and initialized to zero by kmem_cache_zalloc(); are we
>>> sure we cannot see a partially initialized desc->sptes[] entry? On
>>> a related note, what about 32-bit systems? They do not have atomic
>>> access to desc->sptes[].
>
> Ah... wait. desc is an array of pointers:
>
> struct pte_list_desc {
> 	u64 *sptes[PTE_LIST_EXT];
> 	struct pte_list_desc *more;
> };
>
Yep, I misread it to be u64 values and wondered why we use u64 to
store pointers.

> Assigning a pointer is always atomic, but we should carefully
> initialize it, as you said. I will introduce a constructor for the
> desc slab cache which initializes the struct like this:
>
> 	for (i = 0; i < PTE_LIST_EXT; i++)
> 		desc->sptes[i] = NULL;
>
> It is okay.
>
I hope the slab does not write anything into allocated memory
internally if a constructor is present. BTW do you know what happens
when SLAB debugging is enabled and SLAB_DESTROY_BY_RCU is set? Is the
poison value written into a freed object (freed to the slab, but not
yet to the page allocator)?

--
	Gleb.
On 08/30/2013 07:38 PM, Gleb Natapov wrote:
> On Thu, Aug 29, 2013 at 07:26:40PM +0800, Xiao Guangrong wrote:
>> On 08/29/2013 05:51 PM, Gleb Natapov wrote:
>>> On Thu, Aug 29, 2013 at 05:31:42PM +0800, Xiao Guangrong wrote:
>>>>> As Documentation/RCU/whatisRCU.txt says:
>>>>>
>>>>>         As with rcu_assign_pointer(), an important function of
>>>>>         rcu_dereference() is to document which pointers are protected by
>>>>>         RCU, in particular, flagging a pointer that is subject to changing
>>>>>         at any time, including immediately after the rcu_dereference().
>>>>>         And, again like rcu_assign_pointer(), rcu_dereference() is
>>>>>         typically used indirectly, via the _rcu list-manipulation
>>>>>         primitives, such as list_for_each_entry_rcu().
>>>>>
>>>>> The documentation aspect of rcu_assign_pointer()/rcu_dereference() is
>>>>> important. The code is complicated, so self-documentation will not hurt.
>>>>> I want to see what is actually protected by RCU here. Freeing shadow
>>>>> pages with call_rcu() further complicates matters: does it mean that
>>>>> shadow pages are also protected by RCU?
>>>>
>>>> Yes, it stops the shadow page from being freed while we do
>>>> write-protection on it.
>>>>
>>> Yeah, I got the trick. What I am saying is that we have a data structure
>>> here protected by RCU, but we do not use RCU functions to access it...
>>
>> Yes, they are not used when inserting a spte into the rmap or getting the
>> rmap from the entry... but do we need these functions to guarantee the
>> order?
>>
>> The worst case is that we fetch the spte from the desc but the spte is not
>> updated yet; we can happily skip this spte since the dirty-bitmap will be
>> set later. This is guaranteed by the barrier between mmu_spte_update() and
>> mark_page_dirty(); the code is:
>>
>> set_spte():
>>
>>         if (mmu_spte_update(sptep, spte))
>>                 kvm_flush_remote_tlbs(vcpu->kvm);
>>
>>         if (!remap) {
>>                 if (rmap_add(vcpu, sptep, gfn) > RMAP_RECYCLE_THRESHOLD)
>>                         rmap_recycle(vcpu, sptep, gfn);
>>
>>                 if (level > PT_PAGE_TABLE_LEVEL)
>>                         ++vcpu->kvm->stat.lpages;
>>         }
>>
>>         smp_wmb();
>>
>>         if (pte_access & ACC_WRITE_MASK)
>>                 mark_page_dirty(vcpu->kvm, gfn);
>>
>> So, I guess if we can guarantee the order by ourselves, we do not need to
>> call the RCU functions explicitly... But the memory barriers in the RCU
>> functions are really light on x86 (a store cannot be reordered with
>> another store), so I do not mind explicitly using them if you think that
>> way is safer. :)
>>
> I think the self-documentation aspect of using the RCU functions is also
> important.

Okay. I will use these RCU functions and measure them to see whether they
cause a performance issue.

>
>>> BTW why not allocate sp->spt from a SLAB_DESTROY_BY_RCU cache too? We may
>>> switch write protection on a random spt occasionally if a page is deleted
>>> and reused for another spt, though. For a last-level spt it should not be
>>> a problem, and for non-last-level we have the is_last_spte() check in
>>> __rmap_write_protect_lockless(). Can it work?
>>
>> Yes, I also considered this way. It can work if we handle is_last_spte()
>> properly. Since the sp->spt can be reused, we cannot get the mapping level
>> from sp. We need to encode the mapping level into the spte so that cmpxchg
>> can tell whether the page table has been moved to another mapping level.
> Isn't one bit that says that the spte is the last one enough? IIRC we have
> one more ignored bit to spare in the spte.

Right. But I also want to use this approach in fast_page_fault, where the
mapping level is needed.

>> Could you allow me to make this optimization separately, after this
>> patchset is merged?
>>
> If you think it will complicate the initial version I am fine with
> postponing it for later.

Thank you, Gleb!
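For concreteness, the walk loop from the patch might look like this with the
bare ACCESS_ONCE() read of desc->more replaced by rcu_dereference(), as
discussed above (a sketch against the posted patch, not code from the series):

        desc = (struct pte_list_desc *)(pte_list_value & ~1ul);
        while (!desc_is_a_nulls(desc)) {
                for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
                        fn(desc->sptes[i], level);

                /*
                 * rcu_dereference() documents that desc->more is
                 * RCU-protected and already implies the data-dependency
                 * barrier that the open-coded ACCESS_ONCE() +
                 * smp_read_barrier_depends() pair provides.
                 */
                desc = rcu_dereference(desc->more);

                /* It is being initialized. */
                if (unlikely(!desc))
                        goto restart;
        }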
On 08/30/2013 07:44 PM, Gleb Natapov wrote:
> On Thu, Aug 29, 2013 at 08:02:30PM +0800, Xiao Guangrong wrote:
>> On 08/29/2013 07:33 PM, Xiao Guangrong wrote:
>>> On 08/29/2013 05:31 PM, Gleb Natapov wrote:
>>>> On Thu, Aug 29, 2013 at 02:50:51PM +0800, Xiao Guangrong wrote:
>>>>> After more thinking, I still think rcu_assign_pointer() is unneeded when
>>>>> an entry is removed. The remove API does not care about the order between
>>>>> unlinking the entry and the changes to its fields. That is the caller's
>>>>> responsibility:
>>>>> - in the case of rcuhlist, the caller uses call_rcu()/synchronize_rcu(),
>>>>>   etc. to ensure all lookups have exited, so later changes to that entry
>>>>>   are invisible to the lookups.
>>>>>
>>>>> - in the case of rculist_nulls, a refcounter seems to be used to
>>>>>   guarantee the order (see the example in
>>>>>   Documentation/RCU/rculist_nulls.txt).
>>>>>
>>>>> - in our case, we allow the lookup to see the deleted desc even while it
>>>>>   is in the slab cache, being initialized, or being re-added.
>>>>>
>>>> BTW is that a good idea? We can access a deleted desc while it is
>>>> allocated and initialized to zero by kmem_cache_zalloc(); are we sure we
>>>> cannot see a partially initialized desc->sptes[] entry? On a related
>>>> note, what about 32-bit systems? They do not have atomic access to
>>>> desc->sptes[].
>>
>> Ah... wait. desc is an array of pointers:
>>
>> struct pte_list_desc {
>>         u64 *sptes[PTE_LIST_EXT];
>>         struct pte_list_desc *more;
>> };
>>
> Yep, I misread it as an array of u64 values and wondered why we would use
> u64 to store pointers.
>
>> Assigning a pointer is always atomic, but we should carefully initialize
>> it as you said. I will introduce a constructor for the desc slab cache
>> which initializes the struct like this:
>>
>>         for (i = 0; i < PTE_LIST_EXT; i++)
>>                 desc->sptes[i] = NULL;
>>
>> So it is okay.
>>
> I hope slab does not write anything into allocated memory internally if a
> constructor is present.

If only a constructor is present (no SLAB_DESTROY_BY_RCU), it will
temporarily write the "poison" value into the memory and then call the
constructor to initialize it again, e.g. in slab.c:

static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, unsigned long caller)
{
	if (cachep->flags & SLAB_POISON) {
		......
		poison_obj(cachep, objp, POISON_INUSE);
	}
	......
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp);
}

But SLAB_DESTROY_BY_RCU forces the allocator not to touch the memory, and
that is true in our case.

> BTW do you know what happens when SLAB debugging is enabled
> and SLAB_DESTROY_BY_RCU is set?

When SLAB debugging is enabled, these 3 flags may be set:

#define SLAB_DEBUG_FREE	0x00000100UL	/* DEBUG: Perform (expensive) checks on free */
#define SLAB_RED_ZONE	0x00000400UL	/* DEBUG: Red zone objs in a cache */
#define SLAB_POISON	0x00000800UL	/* DEBUG: Poison objects */

Only SLAB_POISON can write something into the memory, and ...

> Is the poison value written into a freed object (one freed to slab, but
> not yet to the page allocator)?

SLAB_POISON is cleared if SLAB_DESTROY_BY_RCU is used.

- In slab, there is this code in __kmem_cache_create():

	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);

- In slub, the code is in calculate_sizes():

	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;

- In slob, it seems SLAB debugging is not supported at all.
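The contract relied on throughout this thread is the usual
SLAB_DESTROY_BY_RCU one, as described in Documentation/RCU/rculist_nulls.txt:
inside an RCU read-side critical section the object's memory stays
type-stable, but the object may be freed and reused for another instance of
the same type, so the reader must revalidate identity after dereferencing.
Schematically (lockless_lookup(), object_matches() and use() are
placeholders, not kernel APIs):

        rcu_read_lock();
again:
        obj = lockless_lookup(key);
        if (obj) {
                /*
                 * The memory cannot be poisoned or handed back to the
                 * page allocator here, but obj may already describe a
                 * different entity: recheck its identity and retry on
                 * a mismatch.  In the rmap walk, this revalidation is
                 * the desc_get_nulls_value() check.
                 */
                if (!object_matches(obj, key))
                        goto again;
                use(obj);
        }
        rcu_read_unlock();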
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 36caf6a..f8fc0cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1010,6 +1010,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	desc->sptes[0] = (u64 *)*pte_list;
 	desc->sptes[1] = spte;
 	desc_mark_nulls(pte_list, desc);
+
+	/*
+	 * Ensure the old spte has been updated into desc, so
+	 * that the other side cannot get the desc from pte_list
+	 * but miss the old spte.
+	 */
+	smp_wmb();
+
 	*pte_list = (unsigned long)desc | 1;
 	return 1;
 }
@@ -1131,6 +1139,47 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	WARN_ON(desc_get_nulls_value(desc) != pte_list);
 }
 
+/* The caller should hold rcu lock. */
+typedef void (*pte_list_walk_lockless_fn) (u64 *spte, int level);
+static void pte_list_walk_lockless(unsigned long *pte_list,
+				   pte_list_walk_lockless_fn fn, int level)
+{
+	struct pte_list_desc *desc;
+	unsigned long pte_list_value;
+	int i;
+
+restart:
+	pte_list_value = ACCESS_ONCE(*pte_list);
+	if (!pte_list_value)
+		return;
+
+	if (!(pte_list_value & 1))
+		return fn((u64 *)pte_list_value, level);
+
+	/*
+	 * Fetch pte_list before reading the sptes in the desc; see the
+	 * comments in pte_list_add().
+	 *
+	 * There is a data dependency since the desc is obtained from pte_list.
+	 */
+	smp_read_barrier_depends();
+
+	desc = (struct pte_list_desc *)(pte_list_value & ~1ul);
+	while (!desc_is_a_nulls(desc)) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+			fn(desc->sptes[i], level);
+
+		desc = ACCESS_ONCE(desc->more);
+
+		/* It is being initialized. */
+		if (unlikely(!desc))
+			goto restart;
+	}
+
+	if (unlikely(desc_get_nulls_value(desc) != pte_list))
+		goto restart;
+}
+
 static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
@@ -4557,7 +4606,7 @@ int kvm_mmu_module_init(void)
 {
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 						sizeof(struct pte_list_desc),
-						0, 0, NULL);
+						0, SLAB_DESTROY_BY_RCU, NULL);
 	if (!pte_list_desc_cache)
 		goto nomem;
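The desc_mark_nulls(), desc_is_a_nulls() and desc_get_nulls_value() helpers
used above are introduced elsewhere in the series; a plausible encoding,
following the rculist_nulls convention of tagging the chain tail with a value
that identifies its head, might look like this (a guess for illustration, not
the series' actual code):

        /* Tag bit 0 so a nulls marker is never mistaken for a desc pointer. */
        static void desc_mark_nulls(unsigned long *pte_list,
                                    struct pte_list_desc *desc)
        {
                desc->more = (struct pte_list_desc *)((unsigned long)pte_list | 1);
        }

        static bool desc_is_a_nulls(struct pte_list_desc *desc)
        {
                return (unsigned long)desc & 1;
        }

        static unsigned long *desc_get_nulls_value(struct pte_list_desc *desc)
        {
                return (unsigned long *)((unsigned long)desc & ~1ul);
        }

With such an encoding, reaching a nulls marker whose value is not the rmap
head being walked means the desc chain was moved to another pte-list
mid-walk, which is exactly what the restart check at the end of
pte_list_walk_lockless() detects.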
The basic idea comes from the nulls list, which uses a nulls value to
indicate whether the desc has been moved to a different pte-list.

Thanks to SLAB_DESTROY_BY_RCU, the desc can be quickly reused.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)
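A lockless write-protect caller built on the new primitive might look
roughly like the following; the thread above refers to
__rmap_write_protect_lockless() from a later patch in the series, and the
bodies below are only a sketch under that assumption, not the posted code:

        static void spte_write_protect_lockless(u64 *sptep, int level)
        {
                u64 spte = ACCESS_ONCE(*sptep);

                /*
                 * The desc may have been moved to another pte-list while
                 * we walked it, so revalidate the spte: only a present,
                 * last-level, writable spte is touched, and only via
                 * cmpxchg so that a racing update is never overwritten.
                 */
                if (!is_writable_pte(spte) || !is_last_spte(spte, level))
                        return;

                cmpxchg64(sptep, spte, spte & ~PT_WRITABLE_MASK);
        }

        static void rmap_write_protect_lockless(unsigned long *rmapp, int level)
        {
                rcu_read_lock();
                pte_list_walk_lockless(rmapp, spte_write_protect_lockless, level);
                rcu_read_unlock();
        }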