| Message ID | 20200326152438.6218-20-alexandru.elisei@arm.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Add reassignable BARs and PCIE 1.1 support |
On 26/03/2020 15:24, Alexandru Elisei wrote:

Hi,

> kvmtool uses brlock for protecting accesses to the ioport and mmio
> red-black trees. brlock allows concurrent reads, but only one writer,
> which is assumed not to be a VCPU thread (for more information see commit
> 0b907ed2eaec ("kvm tools: Add a brlock")). This is done by issuing a
> compiler barrier on read and pausing the entire virtual machine on
> writes. When KVM_BRLOCK_DEBUG is defined, brlock instead uses a pthread
> read/write lock.
>
> When we implement reassignable BARs, the mmio or ioport mapping will be
> done as a result of a VCPU mmio access. When brlock is a pthread
> read/write lock, we will try to acquire the write lock with the read
> lock already held by the same VCPU, and we will deadlock. When it's not,
> a VCPU will have to call kvm__pause, which means the virtual machine
> will stay paused forever.
>
> Let's avoid all this by using a mutex and reference counting the
> red-black tree entries. This way we can guarantee that we won't
> unregister a node that another thread is currently using for emulation.
>
> Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
> ---
>  include/kvm/ioport.h          |  2 +
>  include/kvm/rbtree-interval.h |  4 +-
>  ioport.c                      | 64 +++++++++++++++++-------
>  mmio.c                        | 91 +++++++++++++++++++++++++----------
>  4 files changed, 118 insertions(+), 43 deletions(-)
>
> diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
> index 62a719327e3f..039633f76bdd 100644
> --- a/include/kvm/ioport.h
> +++ b/include/kvm/ioport.h
> @@ -22,6 +22,8 @@ struct ioport {
>          struct ioport_operations        *ops;
>          void                            *priv;
>          struct device_header            dev_hdr;
> +        u32                             refcount;
> +        bool                            remove;

The use of this extra "remove" variable seems somehow odd. I think
normally you would initialise the refcount to 1, and let the unregister
operation do a put as well, with the removal code triggered if the count
reaches zero. At least this is what kref does, can we do the same here?
Or is there anything that would prevent it? I think it's a good idea to
stick to existing design patterns for things like refcounts.

Cheers,
Andre.
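For reference, a minimal sketch of the kref-style scheme being suggested here, written against the mmio helpers from this patch (the exact shape is an assumption; kvmtool has no kref, so the refcount is open-coded under mmio_lock):

```c
/*
 * Sketch only: kref-style lifetime for struct mmio_mapping, assuming
 * kvm__register_mmio() initialises .refcount = 1 (the tree's own
 * reference). No "remove" flag is needed; the node is torn down when
 * the last reference is dropped.
 */
static struct mmio_mapping *mmio_get(struct rb_root *root, u64 phys_addr, u32 len)
{
        struct mmio_mapping *mmio;

        mutex_lock(&mmio_lock);
        mmio = mmio_search(root, phys_addr, len);
        if (mmio)
                mmio->refcount++;       /* one extra reference per in-flight user */
        mutex_unlock(&mmio_lock);

        return mmio;
}

static void mmio_put(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
{
        mutex_lock(&mmio_lock);
        if (--mmio->refcount == 0)      /* last reference frees the node */
                mmio_deregister(kvm, root, mmio);
        mutex_unlock(&mmio_lock);
}

/*
 * kvm__deregister_mmio() would then drop the tree's reference with
 * mmio_put() instead of setting mmio->remove.
 */
```

The follow-up below shows why this particular shape runs into trouble once two threads can race to deactivate the same BAR.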
Hi,

On 3/31/20 12:51 PM, André Przywara wrote:
> On 26/03/2020 15:24, Alexandru Elisei wrote:
>
> Hi,
>
>> [...]
>>
>> diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
>> index 62a719327e3f..039633f76bdd 100644
>> --- a/include/kvm/ioport.h
>> +++ b/include/kvm/ioport.h
>> @@ -22,6 +22,8 @@ struct ioport {
>>          struct ioport_operations        *ops;
>>          void                            *priv;
>>          struct device_header            dev_hdr;
>> +        u32                             refcount;
>> +        bool                            remove;
> The use of this extra "remove" variable seems somehow odd. I think
> normally you would initialise the refcount to 1, and let the unregister
> operation do a put as well, with the removal code triggered if the count
> reaches zero. At least this is what kref does, can we do the same here?
> Or is there anything that would prevent it? I think it's a good idea to
> stick to existing design patterns for things like refcounts.
>
> Cheers,
> Andre.
>
You're totally right; it didn't cross my mind to initialize the refcount
to 1. It's a great idea, I'll do it like that.

Thanks,
Alex
Hi,

On 5/1/20 4:30 PM, Alexandru Elisei wrote:
> Hi,
>
> On 3/31/20 12:51 PM, André Przywara wrote:
>> On 26/03/2020 15:24, Alexandru Elisei wrote:
>>
>> Hi,
>>
>>> [...]
>>> +        u32                             refcount;
>>> +        bool                            remove;
>> The use of this extra "remove" variable seems somehow odd. I think
>> normally you would initialise the refcount to 1, and let the unregister
>> operation do a put as well, with the removal code triggered if the count
>> reaches zero. At least this is what kref does, can we do the same here?
>> Or is there anything that would prevent it? I think it's a good idea to
>> stick to existing design patterns for things like refcounts.
>>
>> Cheers,
>> Andre.
>>
> You're totally right; it didn't cross my mind to initialize the refcount
> to 1. It's a great idea, I'll do it like that.

I take that back; I've tested your proposal and it's not working. Let's
consider a scenario with 3 threads, where an mmio node is created with a
refcount of 1, as you suggested:

| Thread 1             | Thread 2             | Thread 3             |
|----------------------|----------------------|----------------------|
| 1. mmio_get()        |                      |                      |
| 2. refcount++ (== 2) |                      |                      |
|                      | 1. deactivate BAR    |                      |
|                      | 2. BAR active        |                      |
|                      |                      | 1. deactivate BAR    |
|                      |                      | 2. BAR active        |
|                      |                      | 3. refcount-- (== 1) |
|                      | 3. refcount-- (== 0) |                      |
|                      | 4. remove mmio node  |                      |
| 3. mmio->mmio_fn()   |                      |                      |

When Thread 1 calls mmio->mmio_fn(), the mmio node has already been freed.
My original version works because the only place where the refcount is
decremented is after the call to mmio_fn. I really don't want to use
locking when calling pci_deactivate_bar. Do you have any ideas?

Thanks,
Alex
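To make the interleaving above concrete, here is a standalone pthreads repro of the same pattern (hypothetical demo code, not kvmtool: node_get/node_put stand in for mmio_get/mmio_put with the refcount-starts-at-1 scheme, and the unlocked "active" check stands in for the BAR-active check in the two deactivating threads). Triggering is timing-dependent; build with `cc -pthread -fsanitize=address` to catch the use-after-free:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

struct node {
        unsigned int refcount;  /* starts at 1: the "tree's" reference */
        bool active;            /* stands in for "BAR active" */
};

static struct node *the_node;

static struct node *node_get(void)
{
        struct node *n;

        pthread_mutex_lock(&lock);
        n = the_node;
        if (n)
                n->refcount++;
        pthread_mutex_unlock(&lock);
        return n;
}

static void node_put(struct node *n)
{
        pthread_mutex_lock(&lock);
        if (--n->refcount == 0) {
                the_node = NULL;
                free(n);        /* last reference frees the node */
        }
        pthread_mutex_unlock(&lock);
}

static void *vcpu_thread(void *arg)
{
        struct node *n = node_get();    /* refcount: 1 -> 2 */

        (void)arg;
        usleep(1000);                   /* window for the two deactivates */
        if (n) {
                /* like mmio->mmio_fn(): may run on freed memory */
                printf("emulating, refcount %u\n", n->refcount);
                node_put(n);
        }
        return NULL;
}

static void *deactivate_thread(void *arg)
{
        (void)arg;
        /* check-then-act race: both deactivators can see active == true... */
        if (the_node && the_node->active) {
                the_node->active = false;
                node_put(the_node);     /* ...so the tree's reference is put twice */
        }
        return NULL;
}

int main(void)
{
        pthread_t t[3];

        the_node = calloc(1, sizeof(*the_node));
        the_node->refcount = 1;
        the_node->active = true;

        pthread_create(&t[0], NULL, vcpu_thread, NULL);
        pthread_create(&t[1], NULL, deactivate_thread, NULL);
        pthread_create(&t[2], NULL, deactivate_thread, NULL);
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}
```

The double put is exactly steps 3 of Threads 2 and 3 in the table: with the tree's reference counted as 1, two racing deactivations drop the count to zero while the emulating thread still holds a pointer into the node.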
```diff
diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
index 62a719327e3f..039633f76bdd 100644
--- a/include/kvm/ioport.h
+++ b/include/kvm/ioport.h
@@ -22,6 +22,8 @@ struct ioport {
         struct ioport_operations        *ops;
         void                            *priv;
         struct device_header            dev_hdr;
+        u32                             refcount;
+        bool                            remove;
 };
 
 struct ioport_operations {
diff --git a/include/kvm/rbtree-interval.h b/include/kvm/rbtree-interval.h
index 730eb5e8551d..17cd3b5f3199 100644
--- a/include/kvm/rbtree-interval.h
+++ b/include/kvm/rbtree-interval.h
@@ -6,7 +6,9 @@
 
 #define RB_INT_INIT(l, h)       \
         (struct rb_int_node){.low = l, .high = h}
-#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+#define rb_int(n)       rb_entry(n, struct rb_int_node, node)
+#define rb_int_start(n) ((n)->low)
+#define rb_int_end(n)   ((n)->low + (n)->high - 1)
 
 struct rb_int_node {
         struct rb_node  node;
diff --git a/ioport.c b/ioport.c
index d9f2e8ea3c3b..5d7288809528 100644
--- a/ioport.c
+++ b/ioport.c
@@ -2,7 +2,6 @@
 
 #include "kvm/kvm.h"
 #include "kvm/util.h"
-#include "kvm/brlock.h"
 #include "kvm/rbtree-interval.h"
 #include "kvm/mutex.h"
 
@@ -16,6 +15,8 @@
 
 #define ioport_node(n) rb_entry(n, struct ioport, node)
 
+static DEFINE_MUTEX(ioport_lock);
+
 static struct rb_root ioport_tree = RB_ROOT;
 
 static struct ioport *ioport_search(struct rb_root *root, u64 addr)
@@ -39,6 +40,36 @@ static void ioport_remove(struct rb_root *root, struct ioport *data)
         rb_int_erase(root, &data->node);
 }
 
+static struct ioport *ioport_get(struct rb_root *root, u64 addr)
+{
+        struct ioport *ioport;
+
+        mutex_lock(&ioport_lock);
+        ioport = ioport_search(root, addr);
+        if (ioport)
+                ioport->refcount++;
+        mutex_unlock(&ioport_lock);
+
+        return ioport;
+}
+
+/* Called with ioport_lock held. */
+static void ioport_unregister(struct rb_root *root, struct ioport *data)
+{
+        device__unregister(&data->dev_hdr);
+        ioport_remove(root, data);
+        free(data);
+}
+
+static void ioport_put(struct rb_root *root, struct ioport *data)
+{
+        mutex_lock(&ioport_lock);
+        data->refcount--;
+        if (data->remove && data->refcount == 0)
+                ioport_unregister(root, data);
+        mutex_unlock(&ioport_lock);
+}
+
 #ifdef CONFIG_HAS_LIBFDT
 static void generate_ioport_fdt_node(void *fdt,
                                      struct device_header *dev_hdr,
@@ -80,16 +111,18 @@ int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, i
                         .bus_type       = DEVICE_BUS_IOPORT,
                         .data           = generate_ioport_fdt_node,
                 },
+                .refcount       = 0,
+                .remove         = false,
         };
 
-        br_write_lock(kvm);
+        mutex_lock(&ioport_lock);
         r = ioport_insert(&ioport_tree, entry);
         if (r < 0)
                 goto out_free;
         r = device__register(&entry->dev_hdr);
         if (r < 0)
                 goto out_remove;
-        br_write_unlock(kvm);
+        mutex_unlock(&ioport_lock);
 
         return port;
 
@@ -97,7 +130,7 @@ out_remove:
         ioport_remove(&ioport_tree, entry);
 out_free:
         free(entry);
-        br_write_unlock(kvm);
+        mutex_unlock(&ioport_lock);
         return r;
 }
 
@@ -106,22 +139,22 @@ int ioport__unregister(struct kvm *kvm, u16 port)
         struct ioport *entry;
         int r;
 
-        br_write_lock(kvm);
+        mutex_lock(&ioport_lock);
 
         r = -ENOENT;
         entry = ioport_search(&ioport_tree, port);
         if (!entry)
                 goto done;
 
-        device__unregister(&entry->dev_hdr);
-        ioport_remove(&ioport_tree, entry);
-
-        free(entry);
+        if (entry->refcount)
+                entry->remove = true;
+        else
+                ioport_unregister(&ioport_tree, entry);
 
         r = 0;
 
 done:
-        br_write_unlock(kvm);
+        mutex_unlock(&ioport_lock);
 
         return r;
 }
@@ -136,9 +169,7 @@ static void ioport__unregister_all(void)
         while (rb) {
                 rb_node = rb_int(rb);
                 entry = ioport_node(rb_node);
-                device__unregister(&entry->dev_hdr);
-                ioport_remove(&ioport_tree, entry);
-                free(entry);
+                ioport_unregister(&ioport_tree, entry);
                 rb = rb_first(&ioport_tree);
         }
 }
@@ -164,8 +195,7 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction,
         void *ptr = data;
         struct kvm *kvm = vcpu->kvm;
 
-        br_read_lock(kvm);
-        entry = ioport_search(&ioport_tree, port);
+        entry = ioport_get(&ioport_tree, port);
         if (!entry)
                 goto out;
 
@@ -180,9 +210,9 @@ bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction,
                 ptr += size;
         }
 
-out:
-        br_read_unlock(kvm);
+        ioport_put(&ioport_tree, entry);
 
+out:
         if (ret)
                 return true;
 
diff --git a/mmio.c b/mmio.c
index 61e1d47a587d..57a4d5b9d64d 100644
--- a/mmio.c
+++ b/mmio.c
@@ -1,7 +1,7 @@
 #include "kvm/kvm.h"
 #include "kvm/kvm-cpu.h"
 #include "kvm/rbtree-interval.h"
-#include "kvm/brlock.h"
+#include "kvm/mutex.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -15,10 +15,14 @@
 
 #define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
 
+static DEFINE_MUTEX(mmio_lock);
+
 struct mmio_mapping {
         struct rb_int_node      node;
         void                    (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
         void                    *ptr;
+        u32                     refcount;
+        bool                    remove;
 };
 
 static struct rb_root mmio_tree = RB_ROOT;
@@ -51,6 +55,11 @@ static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
         return rb_int_insert(root, &data->node);
 }
 
+static void mmio_remove(struct rb_root *root, struct mmio_mapping *data)
+{
+        rb_int_erase(root, &data->node);
+}
+
 static const char *to_direction(u8 is_write)
 {
         if (is_write)
@@ -59,6 +68,41 @@ static const char *to_direction(u8 is_write)
         return "read";
 }
 
+static struct mmio_mapping *mmio_get(struct rb_root *root, u64 phys_addr, u32 len)
+{
+        struct mmio_mapping *mmio;
+
+        mutex_lock(&mmio_lock);
+        mmio = mmio_search(root, phys_addr, len);
+        if (mmio)
+                mmio->refcount++;
+        mutex_unlock(&mmio_lock);
+
+        return mmio;
+}
+
+/* Called with mmio_lock held. */
+static void mmio_deregister(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+        struct kvm_coalesced_mmio_zone zone = (struct kvm_coalesced_mmio_zone) {
+                .addr   = rb_int_start(&mmio->node),
+                .size   = 1,
+        };
+        ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+        mmio_remove(root, mmio);
+        free(mmio);
+}
+
+static void mmio_put(struct kvm *kvm, struct rb_root *root, struct mmio_mapping *mmio)
+{
+        mutex_lock(&mmio_lock);
+        mmio->refcount--;
+        if (mmio->remove && mmio->refcount == 0)
+                mmio_deregister(kvm, root, mmio);
+        mutex_unlock(&mmio_lock);
+}
+
 int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
                        void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
                        void *ptr)
@@ -72,9 +116,11 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
                 return -ENOMEM;
 
         *mmio = (struct mmio_mapping) {
-                .node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
-                .mmio_fn = mmio_fn,
-                .ptr = ptr,
+                .node           = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+                .mmio_fn        = mmio_fn,
+                .ptr            = ptr,
+                .refcount       = 0,
+                .remove         = false,
         };
 
         if (coalesce) {
@@ -88,9 +134,9 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
                         return -errno;
                 }
         }
-        br_write_lock(kvm);
+        mutex_lock(&mmio_lock);
         ret = mmio_insert(&mmio_tree, mmio);
-        br_write_unlock(kvm);
+        mutex_unlock(&mmio_lock);
 
         return ret;
 }
@@ -98,25 +144,20 @@ int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool c
 bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
 {
         struct mmio_mapping *mmio;
-        struct kvm_coalesced_mmio_zone zone;
 
-        br_write_lock(kvm);
+        mutex_lock(&mmio_lock);
         mmio = mmio_search_single(&mmio_tree, phys_addr);
         if (mmio == NULL) {
-                br_write_unlock(kvm);
+                mutex_unlock(&mmio_lock);
                 return false;
         }
 
-        zone = (struct kvm_coalesced_mmio_zone) {
-                .addr   = phys_addr,
-                .size   = 1,
-        };
-        ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+        if (mmio->refcount)
+                mmio->remove = true;
+        else
+                mmio_deregister(kvm, &mmio_tree, mmio);
+        mutex_unlock(&mmio_lock);
 
-        rb_int_erase(&mmio_tree, &mmio->node);
-        br_write_unlock(kvm);
-
-        free(mmio);
         return true;
 }
 
@@ -124,18 +165,18 @@ bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u
 {
         struct mmio_mapping *mmio;
 
-        br_read_lock(vcpu->kvm);
-        mmio = mmio_search(&mmio_tree, phys_addr, len);
-
-        if (mmio)
-                mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
-        else {
+        mmio = mmio_get(&mmio_tree, phys_addr, len);
+        if (!mmio) {
                 if (vcpu->kvm->cfg.mmio_debug)
                         fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
                                 to_direction(is_write),
                                 (unsigned long long)phys_addr, len);
+                goto out;
         }
-        br_read_unlock(vcpu->kvm);
 
+        mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
+        mmio_put(vcpu->kvm, &mmio_tree, mmio);
+
+out:
         return true;
 }
```
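For context on where this API is headed, here is a sketch of the kind of call site the series is building towards. pci_remap_bar is a hypothetical helper, not part of this patch; kvm__deregister_mmio and kvm__register_mmio are the functions modified above, and the declarations are assumed to come from kvmtool's "kvm/kvm.h":

```c
#include <errno.h>

/*
 * Hypothetical helper: move an emulated BAR to a new guest physical
 * address from a VCPU thread. With the mutex + refcount scheme this is
 * safe even if a VCPU is currently inside kvm__emulate_mmio():
 * kvm__deregister_mmio() only marks the node for removal while a
 * reference is held, and the node is freed on the final mmio_put().
 */
static int pci_remap_bar(struct kvm *kvm, u64 old_addr, u64 new_addr, u64 len,
                         void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data,
                                         u32 len, u8 is_write, void *ptr),
                         void *ptr)
{
        if (!kvm__deregister_mmio(kvm, old_addr))
                return -ENOENT; /* no mapping registered at old_addr */

        return kvm__register_mmio(kvm, new_addr, len, false, mmio_fn, ptr);
}
```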
kvmtool uses brlock for protecting accesses to the ioport and mmio
red-black trees. brlock allows concurrent reads, but only one writer,
which is assumed not to be a VCPU thread (for more information see commit
0b907ed2eaec ("kvm tools: Add a brlock")). This is done by issuing a
compiler barrier on read and pausing the entire virtual machine on
writes. When KVM_BRLOCK_DEBUG is defined, brlock instead uses a pthread
read/write lock.

When we implement reassignable BARs, the mmio or ioport mapping will be
done as a result of a VCPU mmio access. When brlock is a pthread
read/write lock, we will try to acquire the write lock with the read lock
already held by the same VCPU, and we will deadlock. When it's not, a
VCPU will have to call kvm__pause, which means the virtual machine will
stay paused forever.

Let's avoid all this by using a mutex and reference counting the
red-black tree entries. This way we can guarantee that we won't
unregister a node that another thread is currently using for emulation.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
---
 include/kvm/ioport.h          |  2 +
 include/kvm/rbtree-interval.h |  4 +-
 ioport.c                      | 64 +++++++++++++++++-------
 mmio.c                        | 91 +++++++++++++++++++++++++----------
 4 files changed, 118 insertions(+), 43 deletions(-)
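To make the two failure modes concrete, here is a condensed sketch of the call chains the second paragraph describes (abbreviated; kvm__register_mmio stands in for whichever mapping update a BAR reprogramming would trigger):

```c
/*
 * pthread rwlock variant (KVM_BRLOCK_DEBUG defined):
 *
 *   VCPU thread:
 *     kvm__emulate_mmio()
 *       br_read_lock(kvm)          // read lock held for the tree lookup
 *       mmio->mmio_fn()            // guest reprograms a BAR
 *         kvm__register_mmio()
 *           br_write_lock(kvm)     // same thread already holds the
 *                                  // read lock -> deadlock
 *
 * Default variant:
 *
 *   VCPU thread:
 *     kvm__register_mmio()
 *       br_write_lock(kvm)
 *         kvm__pause(kvm)          // waits for every VCPU to stop,
 *                                  // including the one making this
 *                                  // call -> VM paused forever
 */
```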