Message ID | 20160928182457.12433-22-andre.przywara@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote: > The INVALL command instructs an ITS to invalidate the configuration > data for all LPIs associated with a given redistributor (read: VCPU). > To avoid iterating (and mapping!) all guest tables, we instead go through > the host LPI table to find any LPIs targetting this VCPU. We then update > the configuration bits for the connected virtual LPIs. > > Signed-off-by: Andre Przywara <andre.przywara@arm.com> > --- > xen/arch/arm/gic-its.c | 58 +++++++++++++++++++++++++++++++++++++++++++ > xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ > xen/include/asm-arm/gic-its.h | 2 ++ > 3 files changed, 90 insertions(+) > > diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c > index 6f4329f..5129d6e 100644 > --- a/xen/arch/arm/gic-its.c > +++ b/xen/arch/arm/gic-its.c > @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, > return its_send_command(its, cmd); > } > > +static int its_send_cmd_invall(struct host_its *its, int cpu) > +{ > + uint64_t cmd[4]; > + > + cmd[0] = GITS_CMD_INVALL; > + cmd[1] = 0x00; > + cmd[2] = cpu & GENMASK(15, 0); > + cmd[3] = 0x00; > + > + return its_send_command(its, cmd); > +} > + > int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, > int devid, int bits, bool valid) > { > @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id) > return hlpi.virt_lpi; > } > > +/* Iterate over all host LPIs, and updating the "enabled" state for a given > + * guest redistributor (VCPU) given the respective state in the provided > + * proptable. This proptable is indexed by the stored virtual LPI number. > + * This is to implement a guest INVALL command. 
> + */ > +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable) > +{ > + int chunk, i; > + struct host_its *its; > + > + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) > + { > + if ( !lpi_data.host_lpis[chunk] ) > + continue; > + > + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) > + { > + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi; > + uint32_t hlpi_nr; > + > + hlpi.data = hlpip->data; > + if ( !hlpi.virt_lpi ) > + continue; > + > + if ( hlpi.dom_id != v->domain->domain_id ) > + continue; > + > + if ( hlpi.vcpu_id != v->vcpu_id ) > + continue; > + > + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; > + > + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) > + lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED; > + else > + lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED; > + } > + } AFAIK, the initial design is to use tasklet to update property table as it consumes lot of time to update the table. > + > + /* Tell all ITSes that they should update the property table for CPU 0, > + * which is where we map all LPIs to. > + */ > + list_for_each_entry(its, &host_its_list, entry) > + its_send_cmd_invall(its, 0); > +} > + > void gicv3_lpi_set_enable(struct host_its *its, > uint32_t deviceid, uint32_t eventid, > uint32_t host_lpi, bool enabled) > diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c > index 74da8fc..1e429b7 100644 > --- a/xen/arch/arm/vgic-its.c > +++ b/xen/arch/arm/vgic-its.c > @@ -294,6 +294,33 @@ out_unlock: > return ret; > } > > +/* INVALL updates the per-LPI configuration status for every LPI mapped to > + * this redistributor. For the guest side we don't need to update anything, > + * as we always refer to the actual table for the enabled bit and the > + * priority. > + * Enabling or disabling a virtual LPI however needs to be propagated to > + * the respective host LPI. 
Instead of iterating over all mapped LPIs in our > + * emulated GIC (which is expensive due to the required on-demand mapping), > + * we iterate over all mapped _host_ LPIs and filter for those which are > + * forwarded to this virtual redistributor. > + */ > +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr) > +{ > + uint32_t collid = its_cmd_get_collection(cmdptr); > + struct vcpu *vcpu; > + > + spin_lock(&its->its_lock); > + vcpu = get_vcpu_from_collection(its, collid); > + spin_unlock(&its->its_lock); > + > + if ( !vcpu ) > + return -1; > + > + gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable); > + > + return 0; > +} > + > static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr) > { > uint32_t collid = its_cmd_get_collection(cmdptr); > @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its, > case GITS_CMD_INV: > its_handle_inv(its, cmdptr); > break; > + case GITS_CMD_INVALL: > + its_handle_invall(its, cmdptr); > + break; > case GITS_CMD_MAPC: > its_handle_mapc(its, cmdptr); > break; > diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h > index 2cdb3e1..ba6b2d5 100644 > --- a/xen/include/asm-arm/gic-its.h > +++ b/xen/include/asm-arm/gic-its.h > @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its, > uint32_t devid, uint32_t eventid, > uint32_t host_lpi); > > +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable); > + > static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi) > { > return d->arch.vgic.proptable[lpi - 8192] & 0xfc; > -- > 2.9.0 > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > https://lists.xen.org/xen-devel
Hi, On 24/10/16 16:32, Vijay Kilari wrote: > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote: >> The INVALL command instructs an ITS to invalidate the configuration >> data for all LPIs associated with a given redistributor (read: VCPU). >> To avoid iterating (and mapping!) all guest tables, we instead go through >> the host LPI table to find any LPIs targetting this VCPU. We then update >> the configuration bits for the connected virtual LPIs. >> >> Signed-off-by: Andre Przywara <andre.przywara@arm.com> >> --- >> xen/arch/arm/gic-its.c | 58 +++++++++++++++++++++++++++++++++++++++++++ >> xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ >> xen/include/asm-arm/gic-its.h | 2 ++ >> 3 files changed, 90 insertions(+) >> >> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c >> index 6f4329f..5129d6e 100644 >> --- a/xen/arch/arm/gic-its.c >> +++ b/xen/arch/arm/gic-its.c >> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, >> return its_send_command(its, cmd); >> } >> >> +static int its_send_cmd_invall(struct host_its *its, int cpu) >> +{ >> + uint64_t cmd[4]; >> + >> + cmd[0] = GITS_CMD_INVALL; >> + cmd[1] = 0x00; >> + cmd[2] = cpu & GENMASK(15, 0); >> + cmd[3] = 0x00; >> + >> + return its_send_command(its, cmd); >> +} >> + >> int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, >> int devid, int bits, bool valid) >> { >> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id) >> return hlpi.virt_lpi; >> } >> >> +/* Iterate over all host LPIs, and updating the "enabled" state for a given >> + * guest redistributor (VCPU) given the respective state in the provided >> + * proptable. This proptable is indexed by the stored virtual LPI number. >> + * This is to implement a guest INVALL command. 
>> + */ >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable) >> +{ >> + int chunk, i; >> + struct host_its *its; >> + >> + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) >> + { >> + if ( !lpi_data.host_lpis[chunk] ) >> + continue; >> + >> + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) >> + { >> + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi; >> + uint32_t hlpi_nr; >> + >> + hlpi.data = hlpip->data; >> + if ( !hlpi.virt_lpi ) >> + continue; >> + >> + if ( hlpi.dom_id != v->domain->domain_id ) >> + continue; >> + >> + if ( hlpi.vcpu_id != v->vcpu_id ) >> + continue; >> + >> + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; >> + >> + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) >> + lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED; >> + else >> + lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED; >> + } >> + } > AFAIK, the initial design is to use tasklet to update property > table as it consumes > lot of time to update the table. This is a possible, but premature optimization. Linux (at the moment, at least) only calls INVALL _once_, just after initialising the collections. And at this point no LPI is mapped, so the whole routine does basically nothing - and that quite fast. We can later have any kind of fancy algorithm if there is a need for. Cheers, Andre. >> + >> + /* Tell all ITSes that they should update the property table for CPU 0, >> + * which is where we map all LPIs to. 
>> + */ >> + list_for_each_entry(its, &host_its_list, entry) >> + its_send_cmd_invall(its, 0); >> +} >> + >> void gicv3_lpi_set_enable(struct host_its *its, >> uint32_t deviceid, uint32_t eventid, >> uint32_t host_lpi, bool enabled) >> diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c >> index 74da8fc..1e429b7 100644 >> --- a/xen/arch/arm/vgic-its.c >> +++ b/xen/arch/arm/vgic-its.c >> @@ -294,6 +294,33 @@ out_unlock: >> return ret; >> } >> >> +/* INVALL updates the per-LPI configuration status for every LPI mapped to >> + * this redistributor. For the guest side we don't need to update anything, >> + * as we always refer to the actual table for the enabled bit and the >> + * priority. >> + * Enabling or disabling a virtual LPI however needs to be propagated to >> + * the respective host LPI. Instead of iterating over all mapped LPIs in our >> + * emulated GIC (which is expensive due to the required on-demand mapping), >> + * we iterate over all mapped _host_ LPIs and filter for those which are >> + * forwarded to this virtual redistributor. 
>> + */ >> +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr) >> +{ >> + uint32_t collid = its_cmd_get_collection(cmdptr); >> + struct vcpu *vcpu; >> + >> + spin_lock(&its->its_lock); >> + vcpu = get_vcpu_from_collection(its, collid); >> + spin_unlock(&its->its_lock); >> + >> + if ( !vcpu ) >> + return -1; >> + >> + gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable); >> + >> + return 0; >> +} >> + >> static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr) >> { >> uint32_t collid = its_cmd_get_collection(cmdptr); >> @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its, >> case GITS_CMD_INV: >> its_handle_inv(its, cmdptr); >> break; >> + case GITS_CMD_INVALL: >> + its_handle_invall(its, cmdptr); >> + break; >> case GITS_CMD_MAPC: >> its_handle_mapc(its, cmdptr); >> break; >> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h >> index 2cdb3e1..ba6b2d5 100644 >> --- a/xen/include/asm-arm/gic-its.h >> +++ b/xen/include/asm-arm/gic-its.h >> @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its, >> uint32_t devid, uint32_t eventid, >> uint32_t host_lpi); >> >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable); >> + >> static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi) >> { >> return d->arch.vgic.proptable[lpi - 8192] & 0xfc; >> -- >> 2.9.0 >> >> >> _______________________________________________ >> Xen-devel mailing list >> Xen-devel@lists.xen.org >> https://lists.xen.org/xen-devel >
On Fri, 4 Nov 2016, Andre Przywara wrote: > Hi, > > On 24/10/16 16:32, Vijay Kilari wrote: > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote: > >> The INVALL command instructs an ITS to invalidate the configuration > >> data for all LPIs associated with a given redistributor (read: VCPU). > >> To avoid iterating (and mapping!) all guest tables, we instead go through > >> the host LPI table to find any LPIs targetting this VCPU. We then update > >> the configuration bits for the connected virtual LPIs. > >> > >> Signed-off-by: Andre Przywara <andre.przywara@arm.com> > >> --- > >> xen/arch/arm/gic-its.c | 58 +++++++++++++++++++++++++++++++++++++++++++ > >> xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ > >> xen/include/asm-arm/gic-its.h | 2 ++ > >> 3 files changed, 90 insertions(+) > >> > >> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c > >> index 6f4329f..5129d6e 100644 > >> --- a/xen/arch/arm/gic-its.c > >> +++ b/xen/arch/arm/gic-its.c > >> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, > >> return its_send_command(its, cmd); > >> } > >> > >> +static int its_send_cmd_invall(struct host_its *its, int cpu) > >> +{ > >> + uint64_t cmd[4]; > >> + > >> + cmd[0] = GITS_CMD_INVALL; > >> + cmd[1] = 0x00; > >> + cmd[2] = cpu & GENMASK(15, 0); > >> + cmd[3] = 0x00; > >> + > >> + return its_send_command(its, cmd); > >> +} > >> + > >> int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, > >> int devid, int bits, bool valid) > >> { > >> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id) > >> return hlpi.virt_lpi; > >> } > >> > >> +/* Iterate over all host LPIs, and updating the "enabled" state for a given > >> + * guest redistributor (VCPU) given the respective state in the provided > >> + * proptable. This proptable is indexed by the stored virtual LPI number. > >> + * This is to implement a guest INVALL command. 
> >> + */ > >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable) > >> +{ > >> + int chunk, i; > >> + struct host_its *its; > >> + > >> + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) > >> + { > >> + if ( !lpi_data.host_lpis[chunk] ) > >> + continue; > >> + > >> + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) > >> + { > >> + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi; > >> + uint32_t hlpi_nr; > >> + > >> + hlpi.data = hlpip->data; > >> + if ( !hlpi.virt_lpi ) > >> + continue; > >> + > >> + if ( hlpi.dom_id != v->domain->domain_id ) > >> + continue; > >> + > >> + if ( hlpi.vcpu_id != v->vcpu_id ) > >> + continue; > >> + > >> + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; > >> + > >> + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) > >> + lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED; > >> + else > >> + lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED; > >> + } > >> + } > > AFAIK, the initial design is to use tasklet to update property > > table as it consumes > > lot of time to update the table. > > This is a possible, but premature optimization. > Linux (at the moment, at least) only calls INVALL _once_, just after > initialising the collections. And at this point no LPI is mapped, so the > whole routine does basically nothing - and that quite fast. > We can later have any kind of fancy algorithm if there is a need for. I understand, but as-is it's so expensive that could be a DOS vector. Also other OSes could issue INVALL much more often than Linux. Considering that we might support device assigment with ITS soon, I think it might be best to parse per-domain virtual tables rather than the full list of physical LPIs, which theoretically could be much larger. Or alternatively we need to think about adding another field to lpi_data, to link together all lpis assigned to the same domain, but that would cost even more memory. 
Or we could rate-limit the INVALL calls to one every few seconds or something. Or all of the above :-) We need to protect Xen from too frequent and too expensive requests like this. > >> + > >> + /* Tell all ITSes that they should update the property table for CPU 0, > >> + * which is where we map all LPIs to. > >> + */ > >> + list_for_each_entry(its, &host_its_list, entry) > >> + its_send_cmd_invall(its, 0); > >> +} > >> + > >> void gicv3_lpi_set_enable(struct host_its *its, > >> uint32_t deviceid, uint32_t eventid, > >> uint32_t host_lpi, bool enabled) > >> diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c > >> index 74da8fc..1e429b7 100644 > >> --- a/xen/arch/arm/vgic-its.c > >> +++ b/xen/arch/arm/vgic-its.c > >> @@ -294,6 +294,33 @@ out_unlock: > >> return ret; > >> } > >> > >> +/* INVALL updates the per-LPI configuration status for every LPI mapped to > >> + * this redistributor. For the guest side we don't need to update anything, > >> + * as we always refer to the actual table for the enabled bit and the > >> + * priority. > >> + * Enabling or disabling a virtual LPI however needs to be propagated to > >> + * the respective host LPI. Instead of iterating over all mapped LPIs in our > >> + * emulated GIC (which is expensive due to the required on-demand mapping), > >> + * we iterate over all mapped _host_ LPIs and filter for those which are > >> + * forwarded to this virtual redistributor. 
> >> + */ > >> +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr) > >> +{ > >> + uint32_t collid = its_cmd_get_collection(cmdptr); > >> + struct vcpu *vcpu; > >> + > >> + spin_lock(&its->its_lock); > >> + vcpu = get_vcpu_from_collection(its, collid); > >> + spin_unlock(&its->its_lock); > >> + > >> + if ( !vcpu ) > >> + return -1; > >> + > >> + gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable); > >> + > >> + return 0; > >> +} > >> + > >> static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr) > >> { > >> uint32_t collid = its_cmd_get_collection(cmdptr); > >> @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its, > >> case GITS_CMD_INV: > >> its_handle_inv(its, cmdptr); > >> break; > >> + case GITS_CMD_INVALL: > >> + its_handle_invall(its, cmdptr); > >> + break; > >> case GITS_CMD_MAPC: > >> its_handle_mapc(its, cmdptr); > >> break; > >> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h > >> index 2cdb3e1..ba6b2d5 100644 > >> --- a/xen/include/asm-arm/gic-its.h > >> +++ b/xen/include/asm-arm/gic-its.h > >> @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its, > >> uint32_t devid, uint32_t eventid, > >> uint32_t host_lpi); > >> > >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable); > >> + > >> static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi) > >> { > >> return d->arch.vgic.proptable[lpi - 8192] & 0xfc; > >> -- > >> 2.9.0 > >> > >> > >> _______________________________________________ > >> Xen-devel mailing list > >> Xen-devel@lists.xen.org > >> https://lists.xen.org/xen-devel > > >
Hi, On 10/11/16 00:21, Stefano Stabellini wrote: > On Fri, 4 Nov 2016, Andre Przywara wrote: >> On 24/10/16 16:32, Vijay Kilari wrote: >>> On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote: >>>> The INVALL command instructs an ITS to invalidate the configuration >>>> data for all LPIs associated with a given redistributor (read: VCPU). >>>> To avoid iterating (and mapping!) all guest tables, we instead go through >>>> the host LPI table to find any LPIs targetting this VCPU. We then update >>>> the configuration bits for the connected virtual LPIs. >>>> >>>> Signed-off-by: Andre Przywara <andre.przywara@arm.com> >>>> --- >>>> xen/arch/arm/gic-its.c | 58 +++++++++++++++++++++++++++++++++++++++++++ >>>> xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ >>>> xen/include/asm-arm/gic-its.h | 2 ++ >>>> 3 files changed, 90 insertions(+) >>>> >>>> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c >>>> index 6f4329f..5129d6e 100644 >>>> --- a/xen/arch/arm/gic-its.c >>>> +++ b/xen/arch/arm/gic-its.c >>>> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, >>>> return its_send_command(its, cmd); >>>> } >>>> >>>> +static int its_send_cmd_invall(struct host_its *its, int cpu) >>>> +{ >>>> + uint64_t cmd[4]; >>>> + >>>> + cmd[0] = GITS_CMD_INVALL; >>>> + cmd[1] = 0x00; >>>> + cmd[2] = cpu & GENMASK(15, 0); >>>> + cmd[3] = 0x00; >>>> + >>>> + return its_send_command(its, cmd); >>>> +} >>>> + >>>> int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, >>>> int devid, int bits, bool valid) >>>> { >>>> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id) >>>> return hlpi.virt_lpi; >>>> } >>>> >>>> +/* Iterate over all host LPIs, and updating the "enabled" state for a given >>>> + * guest redistributor (VCPU) given the respective state in the provided >>>> + * proptable. This proptable is indexed by the stored virtual LPI number. 
>>>> + * This is to implement a guest INVALL command. >>>> + */ >>>> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable) >>>> +{ >>>> + int chunk, i; >>>> + struct host_its *its; >>>> + >>>> + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) >>>> + { >>>> + if ( !lpi_data.host_lpis[chunk] ) >>>> + continue; >>>> + >>>> + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) >>>> + { >>>> + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi; >>>> + uint32_t hlpi_nr; >>>> + >>>> + hlpi.data = hlpip->data; >>>> + if ( !hlpi.virt_lpi ) >>>> + continue; >>>> + >>>> + if ( hlpi.dom_id != v->domain->domain_id ) >>>> + continue; >>>> + >>>> + if ( hlpi.vcpu_id != v->vcpu_id ) >>>> + continue; >>>> + >>>> + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; >>>> + >>>> + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) >>>> + lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED; >>>> + else >>>> + lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED; >>>> + } >>>> + } >>> AFAIK, the initial design is to use tasklet to update property >>> table as it consumes >>> lot of time to update the table. >> >> This is a possible, but premature optimization. >> Linux (at the moment, at least) only calls INVALL _once_, just after >> initialising the collections. And at this point no LPI is mapped, so the >> whole routine does basically nothing - and that quite fast. >> We can later have any kind of fancy algorithm if there is a need for. > > I understand, but as-is it's so expensive that could be a DOS vector. > Also other OSes could issue INVALL much more often than Linux. > > Considering that we might support device assigment with ITS soon, I > think it might be best to parse per-domain virtual tables rather than > the full list of physical LPIs, which theoretically could be much > larger. 
Or alternatively we need to think about adding another field to > lpi_data, to link together all lpis assigned to the same domain, but > that would cost even more memory. Or we could rate-limit the INVALL > calls to one every few seconds or something. Or all of the above :-) It is not necessary for an ITS implementation to wait until an INVALL/INV command is issued to take into account the change of the LPI configuration tables (aka property table in this thread). So how about trapping the property table? We would still have to go through the property table the first time (i.e when writing into the GICR_PROPBASER), but INVALL would be a nop. The idea would be unmapping the region when GICR_PROPBASER is written. So any read/write access would be trapped. For a write access, Xen will update the LPIs internal data structures and write the value in the guest page unmapped. If we don't want to have an overhead for the read access, we could just write-protect the page in stage-2 page table. So only write access would be trapped. Going further, for the ITS, Xen is using the guest memory to store the ITS information. This means Xen has to validate the information at every access. So how about restricting the access in stage-2 page table? That would remove the overhead of validating data. Any thoughts? Regards,
On Thu, 10 Nov 2016, Julien Grall wrote: > Hi, > > On 10/11/16 00:21, Stefano Stabellini wrote: > > On Fri, 4 Nov 2016, Andre Przywara wrote: > > > On 24/10/16 16:32, Vijay Kilari wrote: > > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara > > > > <andre.przywara@arm.com> wrote: > > > > > The INVALL command instructs an ITS to invalidate the configuration > > > > > data for all LPIs associated with a given redistributor (read: VCPU). > > > > > To avoid iterating (and mapping!) all guest tables, we instead go > > > > > through > > > > > the host LPI table to find any LPIs targetting this VCPU. We then > > > > > update > > > > > the configuration bits for the connected virtual LPIs. > > > > > > > > > > Signed-off-by: Andre Przywara <andre.przywara@arm.com> > > > > > --- > > > > > xen/arch/arm/gic-its.c | 58 > > > > > +++++++++++++++++++++++++++++++++++++++++++ > > > > > xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ > > > > > xen/include/asm-arm/gic-its.h | 2 ++ > > > > > 3 files changed, 90 insertions(+) > > > > > > > > > > diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c > > > > > index 6f4329f..5129d6e 100644 > > > > > --- a/xen/arch/arm/gic-its.c > > > > > +++ b/xen/arch/arm/gic-its.c > > > > > @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, > > > > > return its_send_command(its, cmd); > > > > > } > > > > > > > > > > +static int its_send_cmd_invall(struct host_its *its, int cpu) > > > > > +{ > > > > > + uint64_t cmd[4]; > > > > > + > > > > > + cmd[0] = GITS_CMD_INVALL; > > > > > + cmd[1] = 0x00; > > > > > + cmd[2] = cpu & GENMASK(15, 0); > > > > > + cmd[3] = 0x00; > > > > > + > > > > > + return its_send_command(its, cmd); > > > > > +} > > > > > + > > > > > int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, > > > > > int devid, int bits, bool valid) > > > > > { > > > > > @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, > > > > > uint32_t host_lpi, int *vcpu_id) > > > > > return 
hlpi.virt_lpi; > > > > > } > > > > > > > > > > +/* Iterate over all host LPIs, and updating the "enabled" state for a > > > > > given > > > > > + * guest redistributor (VCPU) given the respective state in the > > > > > provided > > > > > + * proptable. This proptable is indexed by the stored virtual LPI > > > > > number. > > > > > + * This is to implement a guest INVALL command. > > > > > + */ > > > > > +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t > > > > > *proptable) > > > > > +{ > > > > > + int chunk, i; > > > > > + struct host_its *its; > > > > > + > > > > > + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; > > > > > chunk++) > > > > > + { > > > > > + if ( !lpi_data.host_lpis[chunk] ) > > > > > + continue; > > > > > + > > > > > + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) > > > > > + { > > > > > + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], > > > > > hlpi; > > > > > + uint32_t hlpi_nr; > > > > > + > > > > > + hlpi.data = hlpip->data; > > > > > + if ( !hlpi.virt_lpi ) > > > > > + continue; > > > > > + > > > > > + if ( hlpi.dom_id != v->domain->domain_id ) > > > > > + continue; > > > > > + > > > > > + if ( hlpi.vcpu_id != v->vcpu_id ) > > > > > + continue; > > > > > + > > > > > + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; > > > > > + > > > > > + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) > > > > > + lpi_data.lpi_property[hlpi_nr - 8192] |= > > > > > LPI_PROP_ENABLED; > > > > > + else > > > > > + lpi_data.lpi_property[hlpi_nr - 8192] &= > > > > > ~LPI_PROP_ENABLED; > > > > > + } > > > > > + } > > > > AFAIK, the initial design is to use tasklet to update property > > > > table as it consumes > > > > lot of time to update the table. > > > > > > This is a possible, but premature optimization. > > > Linux (at the moment, at least) only calls INVALL _once_, just after > > > initialising the collections. And at this point no LPI is mapped, so the > > > whole routine does basically nothing - and that quite fast. 
> > > We can later have any kind of fancy algorithm if there is a need for. > > > > I understand, but as-is it's so expensive that could be a DOS vector. > > Also other OSes could issue INVALL much more often than Linux. > > > > Considering that we might support device assigment with ITS soon, I > > think it might be best to parse per-domain virtual tables rather than > > the full list of physical LPIs, which theoretically could be much > > larger. Or alternatively we need to think about adding another field to > > lpi_data, to link together all lpis assigned to the same domain, but > > that would cost even more memory. Or we could rate-limit the INVALL > > calls to one every few seconds or something. Or all of the above :-) > > It is not necessary for an ITS implementation to wait until an INVALL/INV > command is issued to take into account the change of the LPI configuration > tables (aka property table in this thread). > > So how about trapping the property table? We would still have to go through > the property table the first time (i.e when writing into the GICR_PROPBASER), > but INVALL would be a nop. > > The idea would be unmapping the region when GICR_PROPBASER is written. So any > read/write access would be trapped. For a write access, Xen will update the > LPIs internal data structures and write the value in the guest page unmapped. > If we don't want to have an overhead for the read access, we could just > write-protect the page in stage-2 page table. So only write access would be > trapped. > > Going further, for the ITS, Xen is using the guest memory to store the ITS > information. This means Xen has to validate the information at every access. > So how about restricting the access in stage-2 page table? That would remove > the overhead of validating data. > > Any thoughts? It is a promising idea. Let me expand on this. 
I agree that on INVALL if we need to do anything, we should go through the virtual property table rather than the full list of host lpis. Once we agree on that, the two options we have are: 1) We let the guest write anything to the table, then we do a full validation of the table on INVALL. We also do a validation of the table entries used as parameters for any other commands. 2) We map the table read-only, then do a validation of every guest write. INVALL becomes a NOP and parameters validation for many commands could be removed or at least reduced. Conceptually the two options should both lead to exactly the same result. Therefore I think the decision should be made purely on performance: which one is faster? If it is true that INVALL is only typically called once I suspect that 1) is faster, but I would like to see some simple benchmarks, such as the time that it takes to configure the ITS from scratch with the two approaches. That said, even if 1) turns out to be faster and the approach of choice, the idea of making the tables read-only in stage-2 could still be useful to simplify parameters validation and protect Xen from concurrent changes of the table entries from another guest vcpu. If the tables are RW, we need to be very careful in Xen and use barriers to avoid re-reading any guest table entry twice, as the guest could be changing it in parallel to exploit the hypervisor.
Hi Stefano, On 10/11/16 20:42, Stefano Stabellini wrote: > On Thu, 10 Nov 2016, Julien Grall wrote: >> On 10/11/16 00:21, Stefano Stabellini wrote: >>> On Fri, 4 Nov 2016, Andre Przywara wrote: >>>> On 24/10/16 16:32, Vijay Kilari wrote: >>>>> On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara >>>>> AFAIK, the initial design is to use tasklet to update property >>>>> table as it consumes >>>>> lot of time to update the table. >>>> >>>> This is a possible, but premature optimization. >>>> Linux (at the moment, at least) only calls INVALL _once_, just after >>>> initialising the collections. And at this point no LPI is mapped, so the >>>> whole routine does basically nothing - and that quite fast. >>>> We can later have any kind of fancy algorithm if there is a need for. >>> >>> I understand, but as-is it's so expensive that could be a DOS vector. >>> Also other OSes could issue INVALL much more often than Linux. >>> >>> Considering that we might support device assigment with ITS soon, I >>> think it might be best to parse per-domain virtual tables rather than >>> the full list of physical LPIs, which theoretically could be much >>> larger. Or alternatively we need to think about adding another field to >>> lpi_data, to link together all lpis assigned to the same domain, but >>> that would cost even more memory. Or we could rate-limit the INVALL >>> calls to one every few seconds or something. Or all of the above :-) >> >> It is not necessary for an ITS implementation to wait until an INVALL/INV >> command is issued to take into account the change of the LPI configuration >> tables (aka property table in this thread). >> >> So how about trapping the property table? We would still have to go through >> the property table the first time (i.e when writing into the GICR_PROPBASER), >> but INVALL would be a nop. >> >> The idea would be unmapping the region when GICR_PROPBASER is written. So any >> read/write access would be trapped. 
For a write access, Xen will update the >> LPIs internal data structures and write the value in the guest page unmapped. >> If we don't want to have an overhead for the read access, we could just >> write-protect the page in stage-2 page table. So only write access would be >> trapped. >> >> Going further, for the ITS, Xen is using the guest memory to store the ITS >> information. This means Xen has to validate the information at every access. >> So how about restricting the access in stage-2 page table? That would remove >> the overhead of validating data. >> >> Any thoughts? > > It is a promising idea. Let me expand on this. > > I agree that on INVALL if we need to do anything, we should go through > the virtual property table rather than the full list of host lpis. I agree on that. > > Once we agree on that, the two options we have are: I believe we had a similar discussion when Vijay worked on the vITS (see [1]). I would have hoped that this new proposal took into account the constraint mentioned back then. > > 1) We let the guest write anything to the table, then we do a full > validation of the table on INVALL. We also do a validation of the table > entries used as parameters for any other commands. > > 2) We map the table read-only, then do a validation of every guest > write. INVALL becomes a NOP and parameters validation for many commands > could be removed or at least reduced. > > Conceptually the two options should both lead to exactly the same > result. Therefore I think the decision should be made purely on > performance: which one is faster? If it is true that INVALL is only > typically called once I suspect that 1) is faster, but I would like to > see some simple benchmarks, such as the time that it takes to configure > the ITS from scratch with the two approaches. The problem is not which one is faster but which one will not take down the hypervisor. 
The guest is allowed to create a command queue as big as 1MB, a command uses 32 bytes, so the command queue can fit up to 32640 commands. Now imagine a malicious guest filling up the command queue with INVALL and then notifying the ITS (via GITS_CWRITER). Based on patch #5, all those commands will be handled in one go. So you have to multiply the time of one command by 32640 times. Given that the hypervisor is not preemptible, it likely means a DOS. A similar problem would happen if a vITS command is translated to an ITS command (see the implementation of INVALL). Multiple malicious guests could slow down the other guests by filling up the host command queue. Worse, a command from a normal guest could be discarded because the host ITS command queue is full (see its_send_command in gic-its.c). That's why the approach we had in the previous series was "host ITS command should be limited when emulating guest ITS command". From my recall, in that series the host and guest LPIs were fully separated (enabling a guest LPI was not enabling host LPIs). That said, a design doc explaining all the constraints and code flow would have been really helpful. It took me a while to digest and understand the interaction between each part of the code. The design document would have also been a good place to discuss problems that span across multiple patches (like the command queue emulation). > > > That said, even if 1) turns out to be faster and the approach of choice, > the idea of making the tables read-only in stage-2 could still be useful > to simplify parameters validation and protect Xen from concurrent > changes of the table entries from another guest vcpu. If the tables as > RW, we need to be very careful in Xen and use barriers to avoid > re-reading any guest table entry twice, as the guest could be changing > it in parallel to exploit the hypervisor. Yes, and this is true for all the tables (PROPBASER, BASER,...) that reside on guest memory.
Most of them should not be touched by the guest. This is the same for the command queue (patch #12): accessing the commands directly from guest memory is not safe. A guest could modify the values behind our back. Regards, [1] https://xenbits.xen.org/people/ianc/vits/draftG.html
On Fri, 11 Nov 2016, Julien Grall wrote: > Hi Stefano, > > On 10/11/16 20:42, Stefano Stabellini wrote: > > On Thu, 10 Nov 2016, Julien Grall wrote: > > > On 10/11/16 00:21, Stefano Stabellini wrote: > > > > On Fri, 4 Nov 2016, Andre Przywara wrote: > > > > > On 24/10/16 16:32, Vijay Kilari wrote: > > > > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara > > > > > > AFAIK, the initial design is to use tasklet to update > > > > > > property > > > > > > table as it consumes > > > > > > lot of time to update the table. > > > > > > > > > > This is a possible, but premature optimization. > > > > > Linux (at the moment, at least) only calls INVALL _once_, just after > > > > > initialising the collections. And at this point no LPI is mapped, so > > > > > the > > > > > whole routine does basically nothing - and that quite fast. > > > > > We can later have any kind of fancy algorithm if there is a need for. > > > > > > > > I understand, but as-is it's so expensive that could be a DOS vector. > > > > Also other OSes could issue INVALL much more often than Linux. > > > > > > > > Considering that we might support device assigment with ITS soon, I > > > > think it might be best to parse per-domain virtual tables rather than > > > > the full list of physical LPIs, which theoretically could be much > > > > larger. Or alternatively we need to think about adding another field to > > > > lpi_data, to link together all lpis assigned to the same domain, but > > > > that would cost even more memory. Or we could rate-limit the INVALL > > > > calls to one every few seconds or something. Or all of the above :-) > > > > > > It is not necessary for an ITS implementation to wait until an INVALL/INV > > > command is issued to take into account the change of the LPI configuration > > > tables (aka property table in this thread). > > > > > > So how about trapping the property table? 
We would still have to go > > > through > > > the property table the first time (i.e when writing into the > > > GICR_PROPBASER), > > > but INVALL would be a nop. > > > > > > The idea would be unmapping the region when GICR_PROPBASER is written. So > > > any > > > read/write access would be trapped. For a write access, Xen will update > > > the > > > LPIs internal data structures and write the value in the guest page > > > unmapped. > > > If we don't want to have an overhead for the read access, we could just > > > write-protect the page in stage-2 page table. So only write access would > > > be > > > trapped. > > > > > > Going further, for the ITS, Xen is using the guest memory to store the ITS > > > information. This means Xen has to validate the information at every > > > access. > > > So how about restricting the access in stage-2 page table? That would > > > remove > > > the overhead of validating data. > > > > > > Any thoughts? > > > > It is a promising idea. Let me expand on this. > > > > I agree that on INVALL if we need to do anything, we should go through > > the virtual property table rather than the full list of host lpis. > > I agree on that. > > > > > Once we agree on that, the two options we have are: > > I believe we had a similar discussion when Vijay worked on the vITS (see [1]). > I would have hoped that this new proposal took into account the constraint > mentioned back then. > > > > > 1) We let the guest write anything to the table, then we do a full > > validation of the table on INVALL. We also do a validation of the table > > entries used as parameters for any other commands. > > > > 2) We map the table read-only, then do a validation of every guest > > write. INVALL becomes a NOP and parameters validation for many commands > > could be removed or at least reduced. > > > > Conceptually the two options should both lead to exactly the same > > result. Therefore I think the decision should be made purely on > > performance: which one is faster? 
If it is true that INVALL is only > > typically called once I suspect that 1) is faster, but I would like to > > see some simple benchmarks, such as the time that it takes to configure > > the ITS from scratch with the two approaches. > > The problem is not which one is faster but which one will not take down the > hypervisor. > > The guest is allowed to create a command queue as big as 1MB, a command use 32 > bytes, so the command queue can fit up 32640 commands. > > Now imagine a malicious guest filling up the command queue with INVALL and > then notify the via (via GITS_CWRITER). Based on patch #5, all those commands > will be handled in one. So you have to multiple the time of one command by > 32640 times. > > Given that the hypervisor is not preemptible, it likely means a DOS. I think it can be made to work safely using a rate-limiting technique. Such as: Xen is only going to emulate an INVALL for a given domain only once every one or two seconds and no more often than that. x86 has something like that under xen/arch/x86/irq.c, see irq_ratelimit. But to be clear, I am not saying that this is necessarily the best way to do it. I would like to see some benchmarks first. > A similar problem would happen if an vITS command is translate to an ITS > command (see the implementation of INVALL). Multiple malicious guest could > slow down the other guest by filling up the host command queue. Worst, a > command from a normal guest could be discarded because the host ITS command > queue is full (see its_send_command in gic-its.c). Looking at the patches, nothing checks for discarded physical ITS commands. Not good. > That's why in the approach we had on the previous series was "host ITS command > should be limited when emulating guest ITS command". From my recall, in that > series the host and guest LPIs was fully separated (enabling a guest LPIs was > not enabling host LPIs). 
I am interested in reading what Ian suggested to do when the physical ITS queue is full, but I cannot find anything specific about it in the doc. Do you have a suggestion for this? The only things that come to mind right now are: 1) check if the ITS queue is full and busy loop until it is not (spin_lock style) 2) check if the ITS queue is full and sleep until it is not (mutex style) > That said, a design doc explaining all the constraints and code flow would > have been really helpful. It took me a while to digest and understand the > interaction between each part of the code. The design document would have also > been a good place to discuss about problems that span across multiple patches > (like the command queue emulation). > > > > > > > That said, even if 1) turns out to be faster and the approach of choice, > > the idea of making the tables read-only in stage-2 could still be useful > > to simplify parameters validation and protect Xen from concurrent > > changes of the table entries from another guest vcpu. If the tables as > > RW, we need to be very careful in Xen and use barriers to avoid > > re-reading any guest table entry twice, as the guest could be changing > > it in parallel to exploit the hypervisor. > > Yes, and this is true for all the tables (PROPBASER, BASER,...) that reside on > guest memory. Most of them should not be touched by the guest. > > This is the same for the command queue (patch #12), accessing the command > directly from the guest memory is not safe. A guest could modify the value > behind our back. > > Regards, > > [1] https://xenbits.xen.org/people/ianc/vits/draftG.html
On Fri, 11 Nov 2016, Stefano Stabellini wrote: > On Fri, 11 Nov 2016, Julien Grall wrote: > > Hi Stefano, > > > > On 10/11/16 20:42, Stefano Stabellini wrote: > > > On Thu, 10 Nov 2016, Julien Grall wrote: > > > > On 10/11/16 00:21, Stefano Stabellini wrote: > > > > > On Fri, 4 Nov 2016, Andre Przywara wrote: > > > > > > On 24/10/16 16:32, Vijay Kilari wrote: > > > > > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara > > > > > > > AFAIK, the initial design is to use tasklet to update > > > > > > > property > > > > > > > table as it consumes > > > > > > > lot of time to update the table. > > > > > > > > > > > > This is a possible, but premature optimization. > > > > > > Linux (at the moment, at least) only calls INVALL _once_, just after > > > > > > initialising the collections. And at this point no LPI is mapped, so > > > > > > the > > > > > > whole routine does basically nothing - and that quite fast. > > > > > > We can later have any kind of fancy algorithm if there is a need for. > > > > > > > > > > I understand, but as-is it's so expensive that could be a DOS vector. > > > > > Also other OSes could issue INVALL much more often than Linux. > > > > > > > > > > Considering that we might support device assigment with ITS soon, I > > > > > think it might be best to parse per-domain virtual tables rather than > > > > > the full list of physical LPIs, which theoretically could be much > > > > > larger. Or alternatively we need to think about adding another field to > > > > > lpi_data, to link together all lpis assigned to the same domain, but > > > > > that would cost even more memory. Or we could rate-limit the INVALL > > > > > calls to one every few seconds or something. Or all of the above :-) > > > > > > > > It is not necessary for an ITS implementation to wait until an INVALL/INV > > > > command is issued to take into account the change of the LPI configuration > > > > tables (aka property table in this thread). 
> > > > > > > > So how about trapping the property table? We would still have to go > > > > through > > > > the property table the first time (i.e when writing into the > > > > GICR_PROPBASER), > > > > but INVALL would be a nop. > > > > > > > > The idea would be unmapping the region when GICR_PROPBASER is written. So > > > > any > > > > read/write access would be trapped. For a write access, Xen will update > > > > the > > > > LPIs internal data structures and write the value in the guest page > > > > unmapped. > > > > If we don't want to have an overhead for the read access, we could just > > > > write-protect the page in stage-2 page table. So only write access would > > > > be > > > > trapped. > > > > > > > > Going further, for the ITS, Xen is using the guest memory to store the ITS > > > > information. This means Xen has to validate the information at every > > > > access. > > > > So how about restricting the access in stage-2 page table? That would > > > > remove > > > > the overhead of validating data. > > > > > > > > Any thoughts? > > > > > > It is a promising idea. Let me expand on this. > > > > > > I agree that on INVALL if we need to do anything, we should go through > > > the virtual property table rather than the full list of host lpis. > > > > I agree on that. > > > > > > > > Once we agree on that, the two options we have are: > > > > I believe we had a similar discussion when Vijay worked on the vITS (see [1]). > > I would have hoped that this new proposal took into account the constraint > > mentioned back then. > > > > > > > > 1) We let the guest write anything to the table, then we do a full > > > validation of the table on INVALL. We also do a validation of the table > > > entries used as parameters for any other commands. > > > > > > 2) We map the table read-only, then do a validation of every guest > > > write. INVALL becomes a NOP and parameters validation for many commands > > > could be removed or at least reduced. 
> > > > > > Conceptually the two options should both lead to exactly the same > > > result. Therefore I think the decision should be made purely on > > > performance: which one is faster? If it is true that INVALL is only > > > typically called once I suspect that 1) is faster, but I would like to > > > see some simple benchmarks, such as the time that it takes to configure > > > the ITS from scratch with the two approaches. > > > > The problem is not which one is faster but which one will not take down the > > hypervisor. > > > > The guest is allowed to create a command queue as big as 1MB, a command use 32 > > bytes, so the command queue can fit up 32640 commands. > > > > Now imagine a malicious guest filling up the command queue with INVALL and > > then notify the via (via GITS_CWRITER). Based on patch #5, all those commands > > will be handled in one. So you have to multiple the time of one command by > > 32640 times. > > > > Given that the hypervisor is not preemptible, it likely means a DOS. > > I think it can be made to work safely using a rate-limiting technique. > Such as: Xen is only going to emulate an INVALL for a given domain only > once every one or two seconds and no more often than that. x86 has > something like that under xen/arch/x86/irq.c, see irq_ratelimit. > > But to be clear, I am not saying that this is necessarily the best way > to do it. I would like to see some benchmarks first. > > > > A similar problem would happen if an vITS command is translate to an ITS > > command (see the implementation of INVALL). Multiple malicious guest could > > slow down the other guest by filling up the host command queue. Worst, a > > command from a normal guest could be discarded because the host ITS command > > queue is full (see its_send_command in gic-its.c). > > Looking at the patches, nothing checks for discarded physical ITS > commands. Not good. 
> > > > That's why in the approach we had on the previous series was "host ITS command > > should be limited when emulating guest ITS command". From my recall, in that > > series the host and guest LPIs was fully separated (enabling a guest LPIs was > > not enabling host LPIs). > > I am interested in reading what Ian suggested to do when the physical > ITS queue is full, but I cannot find anything specific about it in the > doc. > > Do you have a suggestion for this? > > The only things that come to mind right now are: > > 1) check if the ITS queue is full and busy loop until it is not (spin_lock style) > 2) check if the ITS queue is full and sleep until it is not (mutex style) Another, probably better idea, is to map all pLPIs of a device when the device is assigned to a guest (including Dom0). This is what was written in Ian's design doc. The advantage of this approach is that Xen doesn't need to take any actions on the physical ITS command queue when the guest issues virtual ITS commands, therefore completely solving this problem at the root. (Although I am not sure about enable/disable commands: could we avoid issuing enable/disable on pLPIs?) It also helps toward solving the INVALL potential DOS issue, because it significantly reduces the computation needed when an INVALL is issued by the guest. On the other end, this approach has the potential of consuming much more memory to map all the possible pLPIs that a device could use up to the theoretical max. Of course that is not good either. But fortunately for PCI devices we know how many events a device can generate. Also we should be able to get that info on device tree for other devices. So I suggest Xen only maps as many pLPIs as events the device can generate, when the device is assigned to the guest. This way there would be no wasted memory. Does it make sense? Do you think it could work?
Hi, On 18/11/16 18:39, Stefano Stabellini wrote: > On Fri, 11 Nov 2016, Stefano Stabellini wrote: >> On Fri, 11 Nov 2016, Julien Grall wrote: >>> On 10/11/16 20:42, Stefano Stabellini wrote: >>> That's why in the approach we had on the previous series was "host ITS command >>> should be limited when emulating guest ITS command". From my recall, in that >>> series the host and guest LPIs was fully separated (enabling a guest LPIs was >>> not enabling host LPIs). >> >> I am interested in reading what Ian suggested to do when the physical >> ITS queue is full, but I cannot find anything specific about it in the >> doc. >> >> Do you have a suggestion for this? >> >> The only things that come to mind right now are: >> >> 1) check if the ITS queue is full and busy loop until it is not (spin_lock style) >> 2) check if the ITS queue is full and sleep until it is not (mutex style) > > Another, probably better idea, is to map all pLPIs of a device when the > device is assigned to a guest (including Dom0). This is what was written > in Ian's design doc. The advantage of this approach is that Xen doesn't > need to take any actions on the physical ITS command queue when the > guest issues virtual ITS commands, therefore completely solving this > problem at the root. (Although I am not sure about enable/disable > commands: could we avoid issuing enable/disable on pLPIs?) In the previous design document (see [1]), the pLPIs are enabled when the device is assigned to the guest. This means that it is not necessary to send command there. This is also means we may receive a pLPI before the associated vLPI has been configured. That said, given that LPIs are edge-triggered, there is no deactivate state (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could potentially be raised again. This could generate a storm. The priority drop is necessary if we don't want to block the reception of interrupt for the current physical CPU. 
What I am more concerned about is this problem can also happen in normal running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For edge-triggered interrupt, there is no way to prevent them to fire again. Maybe it is time to introduce rate-limit interrupt for ARM. Any opinions? > It also helps > toward solving the INVALL potential DOS issue, because it significantly > reduces the computation needed when an INVALL is issued by the guest. > > On the other end, this approach has the potential of consuming much more > memory to map all the possible pLPIs that a device could use up to the > theoretical max. Of course that is not good either. But fortunately for > PCI devices we know how many events a device can generate. Also we > should be able to get that info on device tree for other devices. So I > suggest Xen only maps as many pLPIs as events the device can generate, > when the device is assigned to the guest. This way there would be no > wasted memory. > > Does it make sense? Do you think it could work? Aside the point I raised above, I think the approach looks sensible. Regards, [1] https://xenbits.xen.org/people/ianc/vits/draftG.html#device-assignment
On Fri, 25 Nov 2016, Julien Grall wrote: > Hi, > > On 18/11/16 18:39, Stefano Stabellini wrote: > > On Fri, 11 Nov 2016, Stefano Stabellini wrote: > > > On Fri, 11 Nov 2016, Julien Grall wrote: > > > > On 10/11/16 20:42, Stefano Stabellini wrote: > > > > That's why in the approach we had on the previous series was "host ITS > > > > command > > > > should be limited when emulating guest ITS command". From my recall, in > > > > that > > > > series the host and guest LPIs was fully separated (enabling a guest > > > > LPIs was > > > > not enabling host LPIs). > > > > > > I am interested in reading what Ian suggested to do when the physical > > > ITS queue is full, but I cannot find anything specific about it in the > > > doc. > > > > > > Do you have a suggestion for this? > > > > > > The only things that come to mind right now are: > > > > > > 1) check if the ITS queue is full and busy loop until it is not (spin_lock > > > style) > > > 2) check if the ITS queue is full and sleep until it is not (mutex style) > > > > Another, probably better idea, is to map all pLPIs of a device when the > > device is assigned to a guest (including Dom0). This is what was written > > in Ian's design doc. The advantage of this approach is that Xen doesn't > > need to take any actions on the physical ITS command queue when the > > guest issues virtual ITS commands, therefore completely solving this > > problem at the root. (Although I am not sure about enable/disable > > commands: could we avoid issuing enable/disable on pLPIs?) > > In the previous design document (see [1]), the pLPIs are enabled when the > device is assigned to the guest. This means that it is not necessary to send > command there. This is also means we may receive a pLPI before the associated > vLPI has been configured. > > That said, given that LPIs are edge-triggered, there is no deactivate state > (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same > LPIs could potentially be raised again. 
This could generate a storm. Thank you for raising this important point. You are correct. > The priority drop is necessary if we don't want to block the reception of > interrupt for the current physical CPU. > > What I am more concerned about is this problem can also happen in normal > running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For > edge-triggered interrupt, there is no way to prevent them to fire again. Maybe > it is time to introduce rate-limit interrupt for ARM. Any opinions? Yes. It could be as simple as disabling the pLPI when Xen receives a second pLPI before the guest EOIs the first corresponding vLPI, which shouldn't happen in normal circumstances. We need a simple per-LPI inflight counter, incremented when a pLPI is received, decremented when the corresponding vLPI is EOIed (the LR is cleared). When the counter > 1, we disable the pLPI and request a maintenance interrupt for the corresponding vLPI. When we receive the maintenance interrupt and we clear the LR of the vLPI, Xen should re-enable the pLPI. Given that the state of the LRs is sync'ed before calling gic_interrupt, we can be sure to know exactly in what state the vLPI is at any given time. But for this to work correctly, it is important to configure the pLPI to be delivered to the same pCPU running the vCPU which handles the vLPI (as it is already the case today for SPIs).
Hi, sorry for chiming in late .... I've been spending some time thinking about this, and I think we can in fact get away without ever propagating command from domains to the host. I made a list of all commands that possible require host ITS command propagation. There are two groups: 1: enabling/disabling LPIs: INV, INVALL 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI. The second group can be handled by mapping all required devices up front, I will elaborate on that in a different email. For the first group, read below ... On 01/12/16 01:19, Stefano Stabellini wrote: > On Fri, 25 Nov 2016, Julien Grall wrote: >> Hi, >> >> On 18/11/16 18:39, Stefano Stabellini wrote: >>> On Fri, 11 Nov 2016, Stefano Stabellini wrote: >>>> On Fri, 11 Nov 2016, Julien Grall wrote: >>>>> On 10/11/16 20:42, Stefano Stabellini wrote: >>>>> That's why in the approach we had on the previous series was "host ITS >>>>> command >>>>> should be limited when emulating guest ITS command". From my recall, in >>>>> that >>>>> series the host and guest LPIs was fully separated (enabling a guest >>>>> LPIs was >>>>> not enabling host LPIs). >>>> >>>> I am interested in reading what Ian suggested to do when the physical >>>> ITS queue is full, but I cannot find anything specific about it in the >>>> doc. >>>> >>>> Do you have a suggestion for this? >>>> >>>> The only things that come to mind right now are: >>>> >>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock >>>> style) >>>> 2) check if the ITS queue is full and sleep until it is not (mutex style) >>> >>> Another, probably better idea, is to map all pLPIs of a device when the >>> device is assigned to a guest (including Dom0). This is what was written >>> in Ian's design doc. The advantage of this approach is that Xen doesn't >>> need to take any actions on the physical ITS command queue when the >>> guest issues virtual ITS commands, therefore completely solving this >>> problem at the root. 
(Although I am not sure about enable/disable >>> commands: could we avoid issuing enable/disable on pLPIs?) >> >> In the previous design document (see [1]), the pLPIs are enabled when the >> device is assigned to the guest. This means that it is not necessary to send >> command there. This is also means we may receive a pLPI before the associated >> vLPI has been configured. >> >> That said, given that LPIs are edge-triggered, there is no deactivate state >> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same >> LPIs could potentially be raised again. This could generate a storm. > > Thank you for raising this important point. You are correct. > >> The priority drop is necessary if we don't want to block the reception of >> interrupt for the current physical CPU. >> >> What I am more concerned about is this problem can also happen in normal >> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For >> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe >> it is time to introduce rate-limit interrupt for ARM. Any opinions? > > Yes. It could be as simple as disabling the pLPI when Xen receives a > second pLPI before the guest EOIs the first corresponding vLPI, which > shouldn't happen in normal circumstances. > > We need a simple per-LPI inflight counter, incremented when a pLPI is > received, decremented when the corresponding vLPI is EOIed (the LR is > cleared). > > When the counter > 1, we disable the pLPI and request a maintenance > interrupt for the corresponding vLPI. So why do we need a _counter_? This is about edge triggered interrupts, I think we can just accumulate all of them into one. So here is what I think: - We use the guest provided pending table to hold a pending bit for each VLPI. We can unmap the memory from the guest, since software is not supposed to access this table as per the spec. - We use the guest provided property table, without trapping it. 
There is nothing to be "validated" in that table, since it's a really tight encoding and every value written in there is legal. We only look at bit 0 for this exercise here anyway. - Upon reception of a physical LPI, we look it up to find the VCPU and virtual LPI number. This is what we need to do anyway and it's a quick two-level table lookup at the moment. - We use the VCPU's domain and the VLPI number to index the guest's property table and read the enabled bit. Again a quick table lookup. - If the VLPI is enabled, we EOI it on the host and inject it. - If the VLPI is disabled, we set the pending bit in the VCPU's pending table and EOI on the host - to allow other IRQs. - On a guest INV command, we check whether that vLPI is now enabled: - If it is disabled now, we don't need to do anything. - If it is enabled now, we check the pending bit for that VLPI: - If it is 0, we don't do anything. - If it is 1, we inject the VLPI and clear the pending bit. - On a guest INVALL command, we just need to iterate over the virtual LPIs. If you look at the conditions above, the only immediate action is when a VLPI gets enabled _and_ its pending bit is set. So we can do 64-bit read accesses over the whole pending table to find non-zero words and thus set bits, which should be rare in practice. We can store the highest mapped VLPI to avoid iterating over the whole of the table. Ideally the guest has no direct control over the pending bits, since this is what the device generates. Also we limit the number of VLPIs in total per guest anyway. If that still sounds like a DOS vector, we could additionally rate-limit INVALLs, and/or track additions to the pending table after the last INVALL: if there haven't been any new pending bits since the last scan, INVALL is a NOP. Does that makes sense so far? So that just leaves us with this IRQ storm issue, which I am thinking about now. But I guess this is not a show stopper given we can disable the physical LPI if we sense this situation. 
> When we receive the maintenance interrupt and we clear the LR of the > vLPI, Xen should re-enable the pLPI. > Given that the state of the LRs is sync'ed before calling gic_interrupt, > we can be sure to know exactly in what state the vLPI is at any given > time. But for this to work correctly, it is important to configure the > pLPI to be delivered to the same pCPU running the vCPU which handles > the vLPI (as it is already the case today for SPIs). Why would that be necessary? Cheers, Andre
On Fri, 2 Dec 2016, Andre Przywara wrote: > Hi, > > sorry for chiming in late .... > > I've been spending some time thinking about this, and I think we can in > fact get away without ever propagating command from domains to the host. > > I made a list of all commands that possible require host ITS command > propagation. There are two groups: > 1: enabling/disabling LPIs: INV, INVALL > 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI. > > The second group can be handled by mapping all required devices up > front, I will elaborate on that in a different email. > > For the first group, read below ... > > On 01/12/16 01:19, Stefano Stabellini wrote: > > On Fri, 25 Nov 2016, Julien Grall wrote: > >> Hi, > >> > >> On 18/11/16 18:39, Stefano Stabellini wrote: > >>> On Fri, 11 Nov 2016, Stefano Stabellini wrote: > >>>> On Fri, 11 Nov 2016, Julien Grall wrote: > >>>>> On 10/11/16 20:42, Stefano Stabellini wrote: > >>>>> That's why in the approach we had on the previous series was "host ITS > >>>>> command > >>>>> should be limited when emulating guest ITS command". From my recall, in > >>>>> that > >>>>> series the host and guest LPIs was fully separated (enabling a guest > >>>>> LPIs was > >>>>> not enabling host LPIs). > >>>> > >>>> I am interested in reading what Ian suggested to do when the physical > >>>> ITS queue is full, but I cannot find anything specific about it in the > >>>> doc. > >>>> > >>>> Do you have a suggestion for this? > >>>> > >>>> The only things that come to mind right now are: > >>>> > >>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock > >>>> style) > >>>> 2) check if the ITS queue is full and sleep until it is not (mutex style) > >>> > >>> Another, probably better idea, is to map all pLPIs of a device when the > >>> device is assigned to a guest (including Dom0). This is what was written > >>> in Ian's design doc. 
The advantage of this approach is that Xen doesn't > >>> need to take any actions on the physical ITS command queue when the > >>> guest issues virtual ITS commands, therefore completely solving this > >>> problem at the root. (Although I am not sure about enable/disable > >>> commands: could we avoid issuing enable/disable on pLPIs?) > >> > >> In the previous design document (see [1]), the pLPIs are enabled when the > >> device is assigned to the guest. This means that it is not necessary to send > >> command there. This is also means we may receive a pLPI before the associated > >> vLPI has been configured. > >> > >> That said, given that LPIs are edge-triggered, there is no deactivate state > >> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same > >> LPIs could potentially be raised again. This could generate a storm. > > > > Thank you for raising this important point. You are correct. > > > >> The priority drop is necessary if we don't want to block the reception of > >> interrupt for the current physical CPU. > >> > >> What I am more concerned about is this problem can also happen in normal > >> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For > >> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe > >> it is time to introduce rate-limit interrupt for ARM. Any opinions? > > > > Yes. It could be as simple as disabling the pLPI when Xen receives a > > second pLPI before the guest EOIs the first corresponding vLPI, which > > shouldn't happen in normal circumstances. > > > > We need a simple per-LPI inflight counter, incremented when a pLPI is > > received, decremented when the corresponding vLPI is EOIed (the LR is > > cleared). > > > > When the counter > 1, we disable the pLPI and request a maintenance > > interrupt for the corresponding vLPI. > > So why do we need a _counter_? This is about edge triggered interrupts, > I think we can just accumulate all of them into one. 
The counter is not to re-inject the same amount of interrupts into the guest, but to detect interrupt storms. > So here is what I think: > - We use the guest provided pending table to hold a pending bit for each > VLPI. We can unmap the memory from the guest, since software is not > supposed to access this table as per the spec. > - We use the guest provided property table, without trapping it. There > is nothing to be "validated" in that table, since it's a really tight > encoding and every value written in there is legal. We only look at bit > 0 for this exercise here anyway. I am following... > - Upon reception of a physical LPI, we look it up to find the VCPU and > virtual LPI number. This is what we need to do anyway and it's a quick > two-level table lookup at the moment. > - We use the VCPU's domain and the VLPI number to index the guest's > property table and read the enabled bit. Again a quick table lookup. They should be both O(2), correct? > - If the VLPI is enabled, we EOI it on the host and inject it. > - If the VLPI is disabled, we set the pending bit in the VCPU's > pending table and EOI on the host - to allow other IRQs. > - On a guest INV command, we check whether that vLPI is now enabled: > - If it is disabled now, we don't need to do anything. > - If it is enabled now, we check the pending bit for that VLPI: > - If it is 0, we don't do anything. > - If it is 1, we inject the VLPI and clear the pending bit. > - On a guest INVALL command, we just need to iterate over the virtual > LPIs. Right, much better. > If you look at the conditions above, the only immediate action is > when a VLPI gets enabled _and_ its pending bit is set. So we can do > 64-bit read accesses over the whole pending table to find non-zero words > and thus set bits, which should be rare in practice. We can store the > highest mapped VLPI to avoid iterating over the whole of the table. 
> Ideally the guest has no direct control over the pending bits, since > this is what the device generates. Also we limit the number of VLPIs in > total per guest anyway. I wonder if we could even use a fully packed bitmask with only the pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would be a nice improvement. > If that still sounds like a DOS vector, we could additionally rate-limit > INVALLs, and/or track additions to the pending table after the last > INVALL: if there haven't been any new pending bits since the last scan, > INVALL is a NOP. > > Does that makes sense so far? It makes sense. It should be OK. > So that just leaves us with this IRQ storm issue, which I am thinking > about now. But I guess this is not a show stopper given we can disable > the physical LPI if we sense this situation. That is true and it's exactly what we should do. > > When we receive the maintenance interrupt and we clear the LR of the > > vLPI, Xen should re-enable the pLPI. > > Given that the state of the LRs is sync'ed before calling gic_interrupt, > > we can be sure to know exactly in what state the vLPI is at any given > > time. But for this to work correctly, it is important to configure the > > pLPI to be delivered to the same pCPU running the vCPU which handles > > the vLPI (as it is already the case today for SPIs). > > Why would that be necessary? Because the state of the LRs of other pCPUs won't be up to date: we wouldn't know for sure whether the guest EOI'ed the vLPI or not.
Hi Stefano, On 03/12/16 00:46, Stefano Stabellini wrote: > On Fri, 2 Dec 2016, Andre Przywara wrote: >>> When we receive the maintenance interrupt and we clear the LR of the >>> vLPI, Xen should re-enable the pLPI. >>> Given that the state of the LRs is sync'ed before calling gic_interrupt, >>> we can be sure to know exactly in what state the vLPI is at any given >>> time. But for this to work correctly, it is important to configure the >>> pLPI to be delivered to the same pCPU running the vCPU which handles >>> the vLPI (as it is already the case today for SPIs). >> >> Why would that be necessary? > > Because the state of the LRs of other pCPUs won't be up to date: we > wouldn't know for sure whether the guest EOI'ed the vLPI or not. Well, there is still a small window when the interrupt may be received on the previous pCPU. So we have to take this case into account. This window may be bigger with LPIs, because a single vCPU may have thousands of interrupts routed. This would take a long time to move all of them when the vCPU is migrating. So we may want to take a lazy approach and move them when they are received on the "wrong" pCPU. Cheers,
On Mon, 5 Dec 2016, Julien Grall wrote: > Hi Stefano, > > On 03/12/16 00:46, Stefano Stabellini wrote: > > On Fri, 2 Dec 2016, Andre Przywara wrote: > > > > When we receive the maintenance interrupt and we clear the LR of the > > > > vLPI, Xen should re-enable the pLPI. > > > > Given that the state of the LRs is sync'ed before calling gic_interrupt, > > > > we can be sure to know exactly in what state the vLPI is at any given > > > > time. But for this to work correctly, it is important to configure the > > > > pLPI to be delivered to the same pCPU running the vCPU which handles > > > > the vLPI (as it is already the case today for SPIs). > > > > > > Why would that be necessary? > > > > Because the state of the LRs of other pCPUs won't be up to date: we > > wouldn't know for sure whether the guest EOI'ed the vLPI or not. > > Well, there is still a small window when the interrupt may be received on the > previous pCPU. So we have to take into account this case. That's right. We already have a mechanism to deal with that, based on the GIC_IRQ_GUEST_MIGRATING flag. It should work with LPIs too. > This window may be bigger with LPIs, because a single vCPU may have thousand > interrupts routed. This would take a long time to move all of them when the > vCPU is migrating. So we may want to take a lazy approach and moving them when > they are received on the "wrong" pCPU. That's possible. The only downside is that modifying the irq migration workflow is difficult and we might want to avoid it if possible. Another approach is to let the scheduler know that migration is slower. In fact this is not a new problem: it can be slow to migrate interrupts, even few non-LPIs interrupts, even on x86. I wonder if the Xen scheduler has any knowledge of that (CC'ing George and Dario). I guess that's the reason why most people run with dom0_vcpus_pin.
Hi Stefano, On 05/12/16 19:51, Stefano Stabellini wrote: > On Mon, 5 Dec 2016, Julien Grall wrote: >> Hi Stefano, >> >> On 03/12/16 00:46, Stefano Stabellini wrote: >>> On Fri, 2 Dec 2016, Andre Przywara wrote: >>>>> When we receive the maintenance interrupt and we clear the LR of the >>>>> vLPI, Xen should re-enable the pLPI. >>>>> Given that the state of the LRs is sync'ed before calling gic_interrupt, >>>>> we can be sure to know exactly in what state the vLPI is at any given >>>>> time. But for this to work correctly, it is important to configure the >>>>> pLPI to be delivered to the same pCPU running the vCPU which handles >>>>> the vLPI (as it is already the case today for SPIs). >>>> >>>> Why would that be necessary? >>> >>> Because the state of the LRs of other pCPUs won't be up to date: we >>> wouldn't know for sure whether the guest EOI'ed the vLPI or not. >> >> Well, there is still a small window when the interrupt may be received on the >> previous pCPU. So we have to take into account this case. > > That's right. We already have a mechanism to deal with that, based on > the GIC_IRQ_GUEST_MIGRATING flag. It should work with LPIs too. Right. >> This window may be bigger with LPIs, because a single vCPU may have thousand >> interrupts routed. This would take a long time to move all of them when the >> vCPU is migrating. So we may want to take a lazy approach and moving them when >> they are received on the "wrong" pCPU. > > That's possible. The only downside is that modifying the irq migration > workflow is difficult and we might want to avoid it if possible. I don't think this would modify the irq migration work flow. If you look at the implementation of arch_move_irqs, it will just go over the vIRQ and call irq_set_affinity. irq_set_affinity will directly modify the hardware and that's all. > > Another approach is to let the scheduler know that migration is slower. 
> In fact this is not a new problem: it can be slow to migrate interrupts, > even few non-LPIs interrupts, even on x86. I wonder if the Xen scheduler > has any knowledge of that (CC'ing George and Dario). I guess that's the > reason why most people run with dom0_vcpus_pin. I gave a quick look at x86, arch_move_irqs is not implemented. Only PIRQs are migrated when a vCPU is moving to another pCPU. The function pirq_set_affinity will change the affinity of a PIRQ but only in software (see irq_set_affinity). This does not yet replicate the configuration into the hardware. In the case of ARM, we directly modify the configuration of the hardware. This adds much more overhead because you have to do a hardware access for every single IRQ. Regards,
On Tue, 6 Dec 2016, Julien Grall wrote: > > > This window may be bigger with LPIs, because a single vCPU may have > > > thousand > > > interrupts routed. This would take a long time to move all of them when > > > the > > > vCPU is migrating. So we may want to take a lazy approach and moving them > > > when > > > they are received on the "wrong" pCPU. > > > > That's possible. The only downside is that modifying the irq migration > > workflow is difficult and we might want to avoid it if possible. > > I don't think this would modify the irq migration work flow. If you look at > the implementation of arch_move_irqs, it will just go over the vIRQ and call > irq_set_affinity. > > irq_set_affinity will directly modify the hardware and that's all. > > > > > Another approach is to let the scheduler know that migration is slower. > > In fact this is not a new problem: it can be slow to migrate interrupts, > > even few non-LPIs interrupts, even on x86. I wonder if the Xen scheduler > > has any knowledge of that (CC'ing George and Dario). I guess that's the > > reason why most people run with dom0_vcpus_pin. > > I gave a quick look at x86, arch_move_irqs is not implemented. Only PIRQ are > migrated when a vCPU is moving to another pCPU. > > The function pirq_set_affinity, will change the affinity of a PIRQ but only in > software (see irq_set_affinity). This is not yet replicated the configuration > into the hardware. > > In the case of ARM, we directly modify the configuration of the hardware. This > adds much more overhead because you have to do an hardware access for every > single IRQ. George, Dario, any comments on whether this would make sense and how to do it?
On Tue, 2016-12-06 at 11:36 -0800, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Julien Grall wrote: > > > > > Another approach is to let the scheduler know that migration is > > > slower. > > > In fact this is not a new problem: it can be slow to migrate > > > interrupts, > > > even few non-LPIs interrupts, even on x86. I wonder if the Xen > > > scheduler > > > has any knowledge of that (CC'ing George and Dario). I guess > > > that's the > > > reason why most people run with dom0_vcpus_pin. > > > > I gave a quick look at x86, arch_move_irqs is not implemented. Only > > PIRQ are > > migrated when a vCPU is moving to another pCPU. > > > > In the case of ARM, we directly modify the configuration of the > > hardware. This > > adds much more overhead because you have to do an hardware access > > for every > > single IRQ. > > George, Dario, any comments on whether this would make sense and how > to > do it? > I was actually looking into this, but I think I don't know enough of ARM in general, and about this issue in particular to be useful. That being said, perhaps you could clarify a bit what you mean with "let the scheduler know that migration is slower". What you'd expect the scheduler to do? Checking the code, as Julien says, on x86 all we do when we move vCPUs around is calling evtchn_move_pirqs(). In fact, it was right that function that was called multiple times in schedule.c, and it was you that (as Julien pointed out already): 1) in 5bd62a757b9 ("xen/arm: physical irq follow virtual irq"), created arch_move_irqs() as something that does something on ARM, and as an empty stub in x86. 2) in 14f7e3b8a70 ("xen: introduce sched_move_irqs"), generalized schedule.c code by implementing sched_move_irqs(). So, if I understood correctly what Julien said here "I don't think this would modify the irq migration work flow. 
etc.", it looks to me that the suggested lazy approach could be a good solution (but I'm saying that lacking the knowledge of what it would actually mean to implement that). If you want something inside the scheduler that sort of delays the wakeup of a domain on the new pCPU until some condition in IRQ handling code is verified (but, please, confirm whether or not it was this that you were thinking of), my thoughts, out of the top of my head about this are: - in general, I think it should be possible; - it has to be arch-specific, I think? - It's easy to avoid the vCPU being woken as a consequence of vcpu_wake() being called, e.g., at the end of vcpu_migrate(); - we must be careful about not forgetting/failing to (re)wakeup the vCPU when the condition verifies Sorry if I can't be more useful than this for now. :-/ Regards, Dario
On Mon, 2016-12-05 at 11:51 -0800, Stefano Stabellini wrote: > Another approach is to let the scheduler know that migration is > slower. > In fact this is not a new problem: it can be slow to migrate > interrupts, > even few non-LPIs interrupts, even on x86. I wonder if the Xen > scheduler > has any knowledge of that (CC'ing George and Dario). I guess that's > the > reason why most people run with dom0_vcpus_pin. > Oh, and about this last sentence. I may indeed be lacking knowledge/understanding, but if you think this is a valid use case for dom0_vcpus_pin, I'd indeed be interested in knowing why. In fact, that configuration has always looked rather awkward to me, and I think we should start thinking about stopping providing the option at all (or changing/extending its behavior). So, if you think you need it, please spell that out, and let's see if there are better ways to achieve the same. :-) Thanks and Regards, Dario
On Tue, 6 Dec 2016, Dario Faggioli wrote: > On Tue, 2016-12-06 at 11:36 -0800, Stefano Stabellini wrote: > > On Tue, 6 Dec 2016, Julien Grall wrote: > > > > > > > Another approach is to let the scheduler know that migration is > > > > slower. > > > > In fact this is not a new problem: it can be slow to migrate > > > > interrupts, > > > > even few non-LPIs interrupts, even on x86. I wonder if the Xen > > > > scheduler > > > > has any knowledge of that (CC'ing George and Dario). I guess > > > > that's the > > > > reason why most people run with dom0_vcpus_pin. > > > > > > I gave a quick look at x86, arch_move_irqs is not implemented. Only > > > PIRQ are > > > migrated when a vCPU is moving to another pCPU. > > > > > > In the case of ARM, we directly modify the configuration of the > > > hardware. This > > > adds much more overhead because you have to do an hardware access > > > for every > > > single IRQ. > > > > George, Dario, any comments on whether this would make sense and how > > to > > do it? > > > I was actually looking into this, but I think I don't know enough of > ARM in general, and about this issue in particular to be useful. > > That being said, perhaps you could clarify a bit what you mean with > "let the scheduler know that migration is slower". What you'd expect > the scheduler to do? > > Checking the code, as Julien says, on x86 all we do when we move vCPUs > around is calling evtchn_move_pirqs(). In fact, it was right that > function that was called multiple times in schedule.c, and it was you > that (as Julien pointed out already): > 1) in 5bd62a757b9 ("xen/arm: physical irq follow virtual irq"), > created arch_move_irqs() as something that does something on ARM, > and as an empty stub in x86. > 2) in 14f7e3b8a70 ("xen: introduce sched_move_irqs"), generalized > schedule.c code by implementing sched_move_irqs(). > > So, if I understood correctly what Julien said here "I don't think this > would modify the irq migration work flow. 
etc.", it looks to me that > the suggested lazy approach could be a good solution (but I'm saying > that lacking the knowledge of what it would actually mean to implement > that). > > If you want something inside the scheduler that sort of delays the > wakeup of a domain on the new pCPU until some condition in IRQ handling > code is verified (but, please, confirm whether or not it was this that > you were thinking of), my thoughts, out of the top of my head about > this are: > - in general, I think it should be possible; > - it has to be arch-specific, I think? > - It's easy to avoid the vCPU being woken as a consequence of > vcpu_wake() being called, e.g., at the end of vcpu_migrate(); > - we must be careful about not forgetting/failing to (re)wakeup the > vCPU when the condition verifies > > Sorry if I can't be more useful than this for now. :-/ We don't need scheduler support to implement interrupt migration. The question was much simpler than that: moving a vCPU with interrupts assigned to it is slower than moving a vCPU without interrupts assigned to it. You could say that the slowness is directly proportional to the number of interrupts assigned to the vCPU. Does the scheduler know that? Or blindly moves vCPUs around? Also see below. > On Mon, 2016-12-05 at 11:51 -0800, Stefano Stabellini wrote: > > Another approach is to let the scheduler know that migration is > > slower. > > In fact this is not a new problem: it can be slow to migrate > > interrupts, > > even few non-LPIs interrupts, even on x86. I wonder if the Xen > > scheduler > > has any knowledge of that (CC'ing George and Dario). I guess that's > > the > > reason why most people run with dom0_vcpus_pin. > > > Oh, and about this last sentence. > > I may indeed be lacking knowledge/understanding, but if you think this > is a valid use case for dom0_vcpus_pin, I'd indeed be interested in > knowing why.
> > In fact, that configuration has always looked rather awkward to me, and > I think we should start thinking stopping providing the option at all > (or changing/extending its behavior). > > So, if you think you need it, please spell that out, and let's see if > there are better ways to achieve the same. :-) That's right, I think dom0_vcpus_pin is a good work-around for the lack of scheduler knowledge about interrupts. If the scheduler knew that moving vCPU0 from pCPU0 to pCPU1 is far more expensive than moving vCPU3 from pCPU3 to pCPU1 then it would make better decisions and we wouldn't need dom0_vcpus_pin.
On Tue, 6 Dec 2016, Stefano Stabellini wrote: > moving a vCPU with interrupts assigned to it is slower than moving a > vCPU without interrupts assigned to it. You could say that the > slowness is directly proportional do the number of interrupts assigned > to the vCPU. To be pedantic, by "assigned" I mean that a physical interrupt is routed to a given pCPU and is set to be forwarded to a guest vCPU running on it by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of these physical interrupts, a corresponding virtual interrupt (could be a different irq) will be injected into the guest vCPU. When the vCPU is migrated to a new pCPU, the physical interrupts that are configured to be injected as virtual interrupts into the vCPU are migrated with it. The physical interrupt migration has a cost. However, receiving physical interrupts on the wrong pCPU has a higher cost.
On Tue, 2016-12-06 at 14:01 -0800, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Stefano Stabellini wrote: > > > > moving a vCPU with interrupts assigned to it is slower than moving > > a > > vCPU without interrupts assigned to it. You could say that the > > slowness is directly proportional do the number of interrupts > > assigned > > to the vCPU. > > To be pedantic, by "assigned" I mean that a physical interrupt is > routed > to a given pCPU and is set to be forwarded to a guest vCPU running on > it > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one > of > these physical interrupts, a corresponding virtual interrupt (could > be a > different irq) will be injected into the guest vCPU. > > When the vCPU is migrated to a new pCPU, the physical interrupts that > are configured to be injected as virtual interrupts into the vCPU, > are > migrated with it. The physical interrupt migration has a cost. > However, > receiving physical interrupts on the wrong pCPU has an higher cost. > Yeah, I got in what sense you said "assigned", but thanks anyway for this clarification. It indeed makes the picture more clear (even just FTR) :-) Dario
On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Dario Faggioli wrote: > > Sorry if I can't be more useful than this for now. :-/ > > We don't need scheduler support to implement interrupt migration. The > question was much simpler than that: moving a vCPU with interrupts > assigned to it is slower than moving a vCPU without interrupts > assigned > to it. You could say that the slowness is directly proportional do > the > number of interrupts assigned to the vCPU. Does the scheduler know > that? > Or blindly moves vCPUs around? Also see below. > Ah, ok, it is indeed a simpler question than I thought! :-) Answer: no, the scheduler does not use the information of how many or what interrupts are being routed to a vCPU in any way. Just for the sake of correctness and precision, it does not "blindly moves vCPUs around", as in, it follows some criteria for deciding whether or not to move a vCPU, and if yes, where to, but among those criteria, there is no trace of anything related to routed interrupts. Let me also add that the criteria are scheduler specific, so they're different, e.g., between Credit and Credit2. Starting considering routed interrupt as a migration criteria in Credit would be rather difficult. Credit use a 'best effort' approach for migrating vCPUs, which is hard to augment. Starting considering routed interrupt as a migration criteria in Credit2 would be much easier. Credit2's load balancer is specifically designed for being extendible with things like that. It would require some thinking, though, in order to figure out how important this particular aspect would be, wrt others that are considered. E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A has a lot of routed interrupts, and moving it gives me perfect load balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1) should I move it or not? 
Well, it depends on whether or not we think that the overhead we save by not migrating outweighs the benefit of a perfectly balanced system. Something like that... Regards, Dario
Hi Stefano, On 06/12/2016 22:01, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Stefano Stabellini wrote: >> moving a vCPU with interrupts assigned to it is slower than moving a >> vCPU without interrupts assigned to it. You could say that the >> slowness is directly proportional do the number of interrupts assigned >> to the vCPU. > > To be pedantic, by "assigned" I mean that a physical interrupt is routed > to a given pCPU and is set to be forwarded to a guest vCPU running on it > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of > these physical interrupts, a corresponding virtual interrupt (could be a > different irq) will be injected into the guest vCPU. > > When the vCPU is migrated to a new pCPU, the physical interrupts that > are configured to be injected as virtual interrupts into the vCPU, are > migrated with it. The physical interrupt migration has a cost. However, > receiving physical interrupts on the wrong pCPU has an higher cost. I don't understand why it is a problem for you to receive the first interrupt on the wrong pCPU and move it if necessary. While this may have a higher cost (I don't believe so) on the first received interrupt, migrating thousands of interrupts at the same time is very expensive and will likely get Xen stuck for a while (think about ITS with a single command queue). Furthermore, the current approach will move every single interrupt routed to the vCPU, even those disabled. That's pointless and a waste of resources. You may argue that we can skip the ones disabled, but in that case what would be the benefit of migrating the IRQs while migrating the vCPUs? So I would suggest spreading it over time. This also means less headache for the scheduler developers. Regards,
Hi Dario, On 06/12/2016 22:39, Dario Faggioli wrote: > On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote: >> On Tue, 6 Dec 2016, Dario Faggioli wrote: >>> Sorry if I can't be more useful than this for now. :-/ >> >> We don't need scheduler support to implement interrupt migration. The >> question was much simpler than that: moving a vCPU with interrupts >> assigned to it is slower than moving a vCPU without interrupts >> assigned >> to it. You could say that the slowness is directly proportional do >> the >> number of interrupts assigned to the vCPU. Does the scheduler know >> that? >> Or blindly moves vCPUs around? Also see below. >> > Ah, ok, it is indeed a simpler question than I thought! :-) > > Answer: no, the scheduler does not use the information of how many or > what interrupts are being routed to a vCPU in any way. > > Just for the sake of correctness and precision, it does not "blindly > moves vCPUs around", as in, it follows some criteria for deciding > whether or not to move a vCPU, and if yes, where to, but among those > criteria, there is no trace of anything related to routed interrupts. > > Let me also add that the criteria are scheduler specific, so they're > different, e.g., between Credit and Credit2. > > Starting considering routed interrupt as a migration criteria in Credit > would be rather difficult. Credit use a 'best effort' approach for > migrating vCPUs, which is hard to augment. > > Starting considering routed interrupt as a migration criteria in > Credit2 would be much easier. Credit2's load balancer is specifically > designed for being extendible with things like that. It would require > some thinking, though, in order to figure out how important this > particular aspect would be, wrt others that are considered. 
> > E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A > has a lot of routed interrupts, and moving it gives me perfect load > balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1) > should I move it or not? > Well, it depends if whether or not we think that the overhead we save > by not migrating outweights the benefit of a perfectly balanced system. > > Something like that... This idea of migrating IRQs while migrating a vCPU already seems wrong to me as this is really expensive; I am talking in terms of milliseconds easily, as in some cases Xen will have to interact with a shared command queue. Spreading the load is much better and avoids directly migrating IRQs that are disabled when the vCPU is being migrated. I cannot see any reason not to do that, as changing the interrupt affinity may not be taken into account directly. By that, I mean it might be possible to receive another interrupt on the wrong pCPU. I really think we should make the vCPU migration much simpler (e.g. avoid this big loop over interrupts). In the end, if we really expect the scheduler to migrate the vCPU to a different pCPU, we should also expect that receiving the interrupt on the wrong pCPU may not happen often. Regards,
On Tue, 2016-12-06 at 23:24 +0000, Julien Grall wrote: > I really think we should make the vCPU migration much simpler (e.g > avoid > this big loop over interrupt). In fine, if we really expect the > scheduler to migrate the vCPU on a different pCPU. We should also > expect > receiving the interrupt on the wrong pCPU may not happen often. > This makes sense to me, but as I said, I don't really know. I mean, I understand what you're explaining but I didn't consider this before, and I don't have any performance figures. I hope I managed to explain that, if we want to take this into account during migration, as Stefano was asking about, there's a way to do that in Credit2. That's it. I'll be happy to help dealing with whatever you within yourselves decide is best for ARM, if it has scheduling implications. :-) And while we're here, if considering this specific aspect is not a good idea, but you (anyone!) have in mind other things that it could be interesting to take into account when evaluating whether or not to migrate a vCPU, I'd be interested to know that. After all, the advantage of having our own scheduler (e.g., wrt KVM that has to use the Linux one), is exactly this, i.e., that we can focus a lot more on virtualization specific aspects. So, really, I'm all ears. :-D Regards, Dario
On Tue, 6 Dec 2016, Julien Grall wrote: > On 06/12/2016 22:01, Stefano Stabellini wrote: > > On Tue, 6 Dec 2016, Stefano Stabellini wrote: > > > moving a vCPU with interrupts assigned to it is slower than moving a > > > vCPU without interrupts assigned to it. You could say that the > > > slowness is directly proportional do the number of interrupts assigned > > > to the vCPU. > > > > To be pedantic, by "assigned" I mean that a physical interrupt is routed > > to a given pCPU and is set to be forwarded to a guest vCPU running on it > > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of > > these physical interrupts, a corresponding virtual interrupt (could be a > > different irq) will be injected into the guest vCPU. > > > > When the vCPU is migrated to a new pCPU, the physical interrupts that > > are configured to be injected as virtual interrupts into the vCPU, are > > migrated with it. The physical interrupt migration has a cost. However, > > receiving physical interrupts on the wrong pCPU has an higher cost. > > I don't understand why it is a problem for you to receive the first interrupt > to the wrong pCPU and moving it if necessary. > > While this may have an higher cost (I don't believe so) on the first received > interrupt, migrating thousands of interrupts at the same time is very > expensive and will likely get Xen stuck for a while (think about ITS with a > single command queue). > > Furthermore, the current approach will move every single interrupt routed a > the vCPU, even those disabled. That's pointless and a waste of resource. You > may argue that we can skip the ones disabled, but in that case what would be > the benefits to migrate the IRQs while migrate the vCPUs? > > So I would suggest to spread it over the time. This also means less headache > for the scheduler developers. 
The most important aspect of interrupts handling in Xen is latency, measured as the time between Xen receiving a physical interrupt and the guest receiving it. This latency should be both small and deterministic. We all agree so far, right? The issue with spreading interrupts migrations over time is that it makes interrupt latency less deterministic. It is OK, in the uncommon case of vCPU migration with interrupts, to take a hit for a short time. This "hit" can be measured. It can be known. If your workload cannot tolerate it, vCPUs can be pinned. It should be a rare event anyway. On the other hand, by spreading interrupts migrations, we make it harder to predict latency. Aside from determinism, another problem with this approach is that it ensures that every interrupt assigned to a vCPU will first hit the wrong pCPU, then it will be moved. It guarantees the worst-case scenario for interrupt latency for the vCPU that has been moved. If we migrated all interrupts as soon as possible, we would minimize the amount of interrupts delivered to the wrong pCPU. Most interrupts would be delivered to the new pCPU right away, reducing interrupt latency. Regardless of how we implement interrupts migrations on ARM, I think it still makes sense for the scheduler to know about it. I realize that this is a separate point. Even if we spread interrupts migrations over time, it still has a cost, in terms of latency as I wrote above, but also in terms of interactions with interrupt controllers and ITSes. A vCPU with no interrupts assigned to it poses no such problems. The scheduler should be aware of the difference. If the scheduler knew, I bet that vCPU migration would be a rare event for vCPUs that have many interrupts assigned to them. For example, Dom0 vCPU0 would never be moved, and dom0_pin_vcpus would be superfluous.
On Tue, 6 Dec 2016, Dario Faggioli wrote: > On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote: > > On Tue, 6 Dec 2016, Dario Faggioli wrote: > > > Sorry if I can't be more useful than this for now. :-/ > > > > We don't need scheduler support to implement interrupt migration. The > > question was much simpler than that: moving a vCPU with interrupts > > assigned to it is slower than moving a vCPU without interrupts > > assigned > > to it. You could say that the slowness is directly proportional do > > the > > number of interrupts assigned to the vCPU. Does the scheduler know > > that? > > Or blindly moves vCPUs around? Also see below. > > > Ah, ok, it is indeed a simpler question than I thought! :-) > > Answer: no, the scheduler does not use the information of how many or > what interrupts are being routed to a vCPU in any way. > > Just for the sake of correctness and precision, it does not "blindly > moves vCPUs around", as in, it follows some criteria for deciding > whether or not to move a vCPU, and if yes, where to, but among those > criteria, there is no trace of anything related to routed interrupts. > > Let me also add that the criteria are scheduler specific, so they're > different, e.g., between Credit and Credit2. > > Starting considering routed interrupt as a migration criteria in Credit > would be rather difficult. Credit use a 'best effort' approach for > migrating vCPUs, which is hard to augment. > > Starting considering routed interrupt as a migration criteria in > Credit2 would be much easier. Credit2's load balancer is specifically > designed for being extendible with things like that. It would require > some thinking, though, in order to figure out how important this > particular aspect would be, wrt others that are considered. 
> > E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A > has a lot of routed interrupts, and moving it gives me perfect load > balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1) > should I move it or not? > Well, it depends if whether or not we think that the overhead we save > by not migrating outweights the benefit of a perfectly balanced system. Right. I don't know where to draw the line. I don't how much weight it should have, but certainly it shouldn't be considered the same thing as moving any other vCPU.
On Wed, 2016-12-07 at 12:21 -0800, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Dario Faggioli wrote: > > E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU > > A > > has a lot of routed interrupts, and moving it gives me perfect load > > balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1) > > should I move it or not? > > Well, it depends if whether or not we think that the overhead we > > save > > by not migrating outweights the benefit of a perfectly balanced > > system. > > Right. I don't know where to draw the line. I don't how much weight > it > should have, but certainly it shouldn't be considered the same thing > as > moving any other vCPU. > Right. As I said, Credit2 load balancer is nice and easy to extend already --and needs to become nicer and easier to extend in order to deal with soft-affinity, so I'll work on that soon (there's patches out for soft-affinity which does sort of something like that, but I'm not entirely satisfied of them, so I'll probably rework that part). At that point, I'll be more than happy to consider this, and try to reason about how much it should be weighted. After all, the only thing we need to take this information into account when making load balancing decisions is a mechanism for knowing how many of these routed interrupt a vCPU has, and of course this needs to be: - easy to use, - super quick (load balancing is an hot path), - architecture independent, is this the case already? :-) Thanks and Regards, Dario
Hi Stefano, On 07/12/16 20:20, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Julien Grall wrote: >> On 06/12/2016 22:01, Stefano Stabellini wrote: >>> On Tue, 6 Dec 2016, Stefano Stabellini wrote: >>>> moving a vCPU with interrupts assigned to it is slower than moving a >>>> vCPU without interrupts assigned to it. You could say that the >>>> slowness is directly proportional do the number of interrupts assigned >>>> to the vCPU. >>> >>> To be pedantic, by "assigned" I mean that a physical interrupt is routed >>> to a given pCPU and is set to be forwarded to a guest vCPU running on it >>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of >>> these physical interrupts, a corresponding virtual interrupt (could be a >>> different irq) will be injected into the guest vCPU. >>> >>> When the vCPU is migrated to a new pCPU, the physical interrupts that >>> are configured to be injected as virtual interrupts into the vCPU, are >>> migrated with it. The physical interrupt migration has a cost. However, >>> receiving physical interrupts on the wrong pCPU has an higher cost. >> >> I don't understand why it is a problem for you to receive the first interrupt >> to the wrong pCPU and moving it if necessary. >> >> While this may have an higher cost (I don't believe so) on the first received >> interrupt, migrating thousands of interrupts at the same time is very >> expensive and will likely get Xen stuck for a while (think about ITS with a >> single command queue). >> >> Furthermore, the current approach will move every single interrupt routed a >> the vCPU, even those disabled. That's pointless and a waste of resource. You >> may argue that we can skip the ones disabled, but in that case what would be >> the benefits to migrate the IRQs while migrate the vCPUs? >> >> So I would suggest to spread it over the time. This also means less headache >> for the scheduler developers. 
> > The most important aspect of interrupts handling in Xen is latency, > measured as the time between Xen receiving a physical interrupt and the > guest receiving it. This latency should be both small and deterministic. > > We all agree so far, right? > > > The issue with spreading interrupts migrations over time is that it makes > interrupt latency less deterministic. It is OK, in the uncommon case of > vCPU migration with interrupts, to take a hit for a short time. This > "hit" can be measured. It can be known. If your workload cannot tolerate > it, vCPUs can be pinned. It should be a rare event anyway. On the other > hand, by spreading interrupts migrations, we make it harder to predict > latency. Aside from determinism, another problem with this approach is > that it ensures that every interrupt assigned to a vCPU will first hit > the wrong pCPU, then it will be moved. It guarantees the worst-case > scenario for interrupt latency for the vCPU that has been moved. If we > migrated all interrupts as soon as possible, we would minimize the > amount of interrupts delivered to the wrong pCPU. Most interrupts would > be delivered to the new pCPU right away, reducing interrupt latency. Migrating all the interrupts can be really expensive because in the current state we have to go through every single interrupt and check whether the interrupt has been routed to this vCPU. We will also route disabled interrupts. And this seems really pointless. This may need some optimization here. With ITS, we may have thousands of interrupts routed to a vCPU. This means that for every interrupt we have to issue a command in the host ITS queue. You will likely fill up the command queue and add much more latency. Even if you consider the vCPU migration to be a rare case, you could still get the pCPU stuck for tens of milliseconds, the time to migrate everything. And I don't think this is acceptable. 
Anyway, I would like to see measurements in both situations before deciding when LPIs will be migrated. > Regardless of how we implement interrupts migrations on ARM, I think it > still makes sense for the scheduler to know about it. I realize that > this is a separate point. Even if we spread interrupts migrations over > time, it still has a cost, in terms of latency as I wrote above, but also > in terms of interactions with interrupt controllers and ITSes. A vCPU > with no interrupts assigned to it poses no such problems. The scheduler > should be aware of the difference. If the scheduler knew, I bet that > vCPU migration would be a rare event for vCPUs that have many interrupts > assigned to them. For example, Dom0 vCPU0 would never be moved, and > dom0_pin_vcpus would be superfluous. The number of interrupts routed to a vCPU will vary over time, depending on what the guest decides to do, so you need the scheduler to adapt. And in the end, you give the guest a chance to "control" the scheduler depending on how the interrupts are spread between vCPUs. If the number increases, you may end up having the scheduler decide not to migrate the vCPU because it would be too expensive. But you may have a situation where migrating a vCPU with many interrupts is the only possible choice and you will slow down the platform. Cheers,
Hi, On 07/12/16 20:20, Stefano Stabellini wrote: > On Tue, 6 Dec 2016, Julien Grall wrote: >> On 06/12/2016 22:01, Stefano Stabellini wrote: >>> On Tue, 6 Dec 2016, Stefano Stabellini wrote: >>>> moving a vCPU with interrupts assigned to it is slower than moving a >>>> vCPU without interrupts assigned to it. You could say that the >>>> slowness is directly proportional do the number of interrupts assigned >>>> to the vCPU. >>> >>> To be pedantic, by "assigned" I mean that a physical interrupt is routed >>> to a given pCPU and is set to be forwarded to a guest vCPU running on it >>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of >>> these physical interrupts, a corresponding virtual interrupt (could be a >>> different irq) will be injected into the guest vCPU. >>> >>> When the vCPU is migrated to a new pCPU, the physical interrupts that >>> are configured to be injected as virtual interrupts into the vCPU, are >>> migrated with it. The physical interrupt migration has a cost. However, >>> receiving physical interrupts on the wrong pCPU has an higher cost. >> >> I don't understand why it is a problem for you to receive the first interrupt >> to the wrong pCPU and moving it if necessary. >> >> While this may have an higher cost (I don't believe so) on the first received >> interrupt, migrating thousands of interrupts at the same time is very >> expensive and will likely get Xen stuck for a while (think about ITS with a >> single command queue). >> >> Furthermore, the current approach will move every single interrupt routed a >> the vCPU, even those disabled. That's pointless and a waste of resource. You >> may argue that we can skip the ones disabled, but in that case what would be >> the benefits to migrate the IRQs while migrate the vCPUs? >> >> So I would suggest to spread it over the time. This also means less headache >> for the scheduler developers. 
> > The most important aspect of interrupts handling in Xen is latency, > measured as the time between Xen receiving a physical interrupt and the > guest receiving it. This latency should be both small and deterministic. > > We all agree so far, right? > > > The issue with spreading interrupts migrations over time is that it makes > interrupt latency less deterministic. It is OK, in the uncommon case of > vCPU migration with interrupts, to take a hit for a short time. This > "hit" can be measured. It can be known. If your workload cannot tolerate > it, vCPUs can be pinned. It should be a rare event anyway. On the other > hand, by spreading interrupts migrations, we make it harder to predict > latency. Aside from determinism, another problem with this approach is > that it ensures that every interrupt assigned to a vCPU will first hit > the wrong pCPU, then it will be moved. It guarantees the worst-case > scenario for interrupt latency for the vCPU that has been moved. If we > migrated all interrupts as soon as possible, we would minimize the > amount of interrupts delivered to the wrong pCPU. Most interrupts would > be delivered to the new pCPU right away, reducing interrupt latency. So if this is such a crucial issue, why don't we use the ITS for good this time? The ITS hardware probably supports 16 bits worth of collection IDs, so what about we assign each VCPU (in every guest) a unique collection ID on the host and do a MAPC & MOVALL on a VCPU migration to let it point to the right physical redistributor. I see that this does not cover all use cases (> 65536 VCPUs, for instance), also depends much of many implementation details: - How costly is a MOVALL? It needs to scan the pending table and transfer set bits to the other redistributor, which may take a while. - Is there an impact if we exceed the number of hardware backed collections (GITS_TYPE[HCC])? If the ITS is forced to access system memory for every table lookup, this may slow down everyday operations. 
- How likely are those misdirected interrupts in the first place? How often do we migrate VCPUs compared to the interrupt frequency? There are more subtle parameters to consider, so I guess we just need to try and measure. > Regardless of how we implement interrupts migrations on ARM, I think it > still makes sense for the scheduler to know about it. I realize that > this is a separate point. Even if we spread interrupts migrations over > time, it still has a cost, in terms of latency as I wrote above, but also > in terms of interactions with interrupt controllers and ITSes. A vCPU > with no interrupts assigned to it poses no such problems. The scheduler > should be aware of the difference. If the scheduler knew, I bet that > vCPU migration would be a rare event for vCPUs that have many interrupts > assigned to them. For example, Dom0 vCPU0 would never be moved, and > dom0_pin_vcpus would be superfluous. That's a good point, so indeed the "interrupt load" should be a scheduler parameter. But as you said: that's a different story. Cheers, Andre.
On 03/12/16 00:46, Stefano Stabellini wrote: > On Fri, 2 Dec 2016, Andre Przywara wrote: >> Hi, Hi Stefano, I started to answer this email some days ago, but then spend some time on actually implementing what I suggested, hence the delay ... >> >> sorry for chiming in late .... >> >> I've been spending some time thinking about this, and I think we can in >> fact get away without ever propagating command from domains to the host. >> >> I made a list of all commands that possible require host ITS command >> propagation. There are two groups: >> 1: enabling/disabling LPIs: INV, INVALL >> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI. >> >> The second group can be handled by mapping all required devices up >> front, I will elaborate on that in a different email. >> >> For the first group, read below ... >> >> On 01/12/16 01:19, Stefano Stabellini wrote: >>> On Fri, 25 Nov 2016, Julien Grall wrote: >>>> Hi, >>>> >>>> On 18/11/16 18:39, Stefano Stabellini wrote: >>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote: >>>>>> On Fri, 11 Nov 2016, Julien Grall wrote: >>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote: >>>>>>> That's why in the approach we had on the previous series was "host ITS >>>>>>> command >>>>>>> should be limited when emulating guest ITS command". From my recall, in >>>>>>> that >>>>>>> series the host and guest LPIs was fully separated (enabling a guest >>>>>>> LPIs was >>>>>>> not enabling host LPIs). >>>>>> >>>>>> I am interested in reading what Ian suggested to do when the physical >>>>>> ITS queue is full, but I cannot find anything specific about it in the >>>>>> doc. >>>>>> >>>>>> Do you have a suggestion for this? 
>>>>>> >>>>>> The only things that come to mind right now are: >>>>>> >>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock >>>>>> style) >>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style) >>>>> >>>>> Another, probably better idea, is to map all pLPIs of a device when the >>>>> device is assigned to a guest (including Dom0). This is what was written >>>>> in Ian's design doc. The advantage of this approach is that Xen doesn't >>>>> need to take any actions on the physical ITS command queue when the >>>>> guest issues virtual ITS commands, therefore completely solving this >>>>> problem at the root. (Although I am not sure about enable/disable >>>>> commands: could we avoid issuing enable/disable on pLPIs?) >>>> >>>> In the previous design document (see [1]), the pLPIs are enabled when the >>>> device is assigned to the guest. This means that it is not necessary to send >>>> command there. This is also means we may receive a pLPI before the associated >>>> vLPI has been configured. >>>> >>>> That said, given that LPIs are edge-triggered, there is no deactivate state >>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same >>>> LPIs could potentially be raised again. This could generate a storm. >>> >>> Thank you for raising this important point. You are correct. >>> >>>> The priority drop is necessary if we don't want to block the reception of >>>> interrupt for the current physical CPU. >>>> >>>> What I am more concerned about is this problem can also happen in normal >>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For >>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe >>>> it is time to introduce rate-limit interrupt for ARM. Any opinions? >>> >>> Yes. 
It could be as simple as disabling the pLPI when Xen receives a >>> second pLPI before the guest EOIs the first corresponding vLPI, which >>> shouldn't happen in normal circumstances. >>> >>> We need a simple per-LPI inflight counter, incremented when a pLPI is >>> received, decremented when the corresponding vLPI is EOIed (the LR is >>> cleared). >>> >>> When the counter > 1, we disable the pLPI and request a maintenance >>> interrupt for the corresponding vLPI. >> >> So why do we need a _counter_? This is about edge triggered interrupts, >> I think we can just accumulate all of them into one. > > The counter is not to re-inject the same amount of interrupts into the > guest, but to detect interrupt storms. I was wondering if an interrupt "storm" could already be defined by "receiving an LPI while there is already one pending (in the guest's virtual pending table) and it being disabled by the guest". I admit that declaring two interrupts as a storm is a bit of a stretch, but in fact the guest had probably a reason for disabling it even though it fires, so Xen should just follow suit. The only difference is that we don't do it _immediately_ when the guest tells us (via INV), but only if needed (LPI actually fires). >> So here is what I think: >> - We use the guest provided pending table to hold a pending bit for each >> VLPI. We can unmap the memory from the guest, since software is not >> supposed to access this table as per the spec. >> - We use the guest provided property table, without trapping it. There >> is nothing to be "validated" in that table, since it's a really tight >> encoding and every value written in there is legal. We only look at bit >> 0 for this exercise here anyway. > > I am following... > > >> - Upon reception of a physical LPI, we look it up to find the VCPU and >> virtual LPI number. This is what we need to do anyway and it's a quick >> two-level table lookup at the moment. 
>> - We use the VCPU's domain and the VLPI number to index the guest's >> property table and read the enabled bit. Again a quick table lookup. > > They should be both O(2), correct? The second is even O(1). And even the first one could be a single table, if desperately needed. >> - If the VLPI is enabled, we EOI it on the host and inject it. >> - If the VLPI is disabled, we set the pending bit in the VCPU's >> pending table and EOI on the host - to allow other IRQs. >> - On a guest INV command, we check whether that vLPI is now enabled: >> - If it is disabled now, we don't need to do anything. >> - If it is enabled now, we check the pending bit for that VLPI: >> - If it is 0, we don't do anything. >> - If it is 1, we inject the VLPI and clear the pending bit. >> - On a guest INVALL command, we just need to iterate over the virtual >> LPIs. > > Right, much better. > > >> If you look at the conditions above, the only immediate action is >> when a VLPI gets enabled _and_ its pending bit is set. So we can do >> 64-bit read accesses over the whole pending table to find non-zero words >> and thus set bits, which should be rare in practice. We can store the >> highest mapped VLPI to avoid iterating over the whole of the table. >> Ideally the guest has no direct control over the pending bits, since >> this is what the device generates. Also we limit the number of VLPIs in >> total per guest anyway. > > I wonder if we could even use a fully packed bitmask with only the > pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would > be a nice improvement. The _pending_ table is exactly that: one bit per VLPI. So by doing a 64-bit read we cover 64 VLPIs. And normally if an LPI fires it will probably be enabled (otherwise the guest would have disabled it in the device), so we inject it and don't need this table. It's really just for storing the pending status should an LPI arrive while the guest had _disabled_ it. 
I assume this is rather rare, so the table will mostly be empty: that's why I expect most reads to be 0 and the iteration of the table to be very quick. As an additional optimization we could store the highest and lowest virtually pending LPI, to avoid scanning the whole table. We can't do so much about the property table, though, because its layout is described in the spec - in contrast to the ITS tables, which are IMPDEF. But as we only need to do something if the LPI is _both_ enabled _and_ pending, scanning the pending table gives us a quite good filter already. For the few LPIs that hit here, we can just access the right byte in the property table. >> If that still sounds like a DOS vector, we could additionally rate-limit >> INVALLs, and/or track additions to the pending table after the last >> INVALL: if there haven't been any new pending bits since the last scan, >> INVALL is a NOP. >> >> Does that makes sense so far? > It makes sense. It should be OK. > > >> So that just leaves us with this IRQ storm issue, which I am thinking >> about now. But I guess this is not a show stopper given we can disable >> the physical LPI if we sense this situation. > That is true and it's exactly what we should do. > > >>> When we receive the maintenance interrupt and we clear the LR of the >>> vLPI, Xen should re-enable the pLPI. So I was thinking why you would need to wait for the guest to actually EOI it? Can't we end the interrupt storm condition at the moment the guest enables the interrupt? LPIs are edge triggered and so a storm in the past is easily merged into a single guest LPI once the guest enables it again. From there on we inject every triggered LPI into the guest. This special handling for the interrupt storm just stems from the fact that we have to keep LPIs enabled on the h/w interrupt controller level, despite the guest having disabled it on its own _virtual_ GIC. So once the guest enables it again, we are in line with the current GICv2/GICv3, aren't we? 
Do we have interrupt storm detection/prevention at the moment? And please keep in mind that LPIs are _always_ edge triggered: So once we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC point of view, the next incoming interrupt must have been triggered by a new interrupt condition in the device (new network packet, for instance). In contrast to GICv2 this applies to _every_ LPI. So I am not sure we should really care _too_ much about this (apart from the "guest has disabled it" part): Once we assign a device to a guest, we lose some control over the machine anyway and at least trust the device to not completely block the system. I don't see how the ITS differs in that respect from the GICv3/GICv2. A quick update on my side: I implemented the scheme I described in my earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS: - On receiving the PHYSDEVOP_manage_pci_add hypercall in xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of interrupts and enable them. We keep them unassigned in our host pLPI->VLPI table, so we discard them should they fire. This hypercall is issued by Dom0 Linux before bringing up any PCI devices, so it works even for Dom0 without any Linux changes. For DomUs with PCI passthrough Dom0 is expected to issue this hypercall on behalf of the to-be-created domain. - When a guest (be it Dom0 or DomU) actually maps an LPI (MAPTI), we just enter the virtual LPI number and the target VCPU in our pLPI-vLPI table and be done. Should it fire now, we know where to inject it, but refer to the enabled bit in the guest's property table before doing so. - When a guest (be it Dom0 or DomU) enables or disables an interrupt, we don't do much, as we refer to the enable bit every time we want to inject already. The only thing I actually do is to inject an LPI if there is a virtual LPI pending and the LPI is now enabled. 
I will spend some time next week on updating the design document, describing the new approach. I hope it becomes a bit clearer then. Cheers, Andre. >>> Given that the state of the LRs is sync'ed before calling gic_interrupt, >>> we can be sure to know exactly in what state the vLPI is at any given >>> time. But for this to work correctly, it is important to configure the >>> pLPI to be delivered to the same pCPU running the vCPU which handles >>> the vLPI (as it is already the case today for SPIs). >> >> Why would that be necessary? > > Because the state of the LRs of other pCPUs won't be up to date: we > wouldn't know for sure whether the guest EOI'ed the vLPI or not.
On Fri, 9 Dec 2016, Julien Grall wrote: > Hi Stefano, > > On 07/12/16 20:20, Stefano Stabellini wrote: > > On Tue, 6 Dec 2016, Julien Grall wrote: > > > On 06/12/2016 22:01, Stefano Stabellini wrote: > > > > On Tue, 6 Dec 2016, Stefano Stabellini wrote: > > > > > moving a vCPU with interrupts assigned to it is slower than moving a > > > > > vCPU without interrupts assigned to it. You could say that the > > > > > slowness is directly proportional do the number of interrupts assigned > > > > > to the vCPU. > > > > > > > > To be pedantic, by "assigned" I mean that a physical interrupt is routed > > > > to a given pCPU and is set to be forwarded to a guest vCPU running on it > > > > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of > > > > these physical interrupts, a corresponding virtual interrupt (could be a > > > > different irq) will be injected into the guest vCPU. > > > > > > > > When the vCPU is migrated to a new pCPU, the physical interrupts that > > > > are configured to be injected as virtual interrupts into the vCPU, are > > > > migrated with it. The physical interrupt migration has a cost. However, > > > > receiving physical interrupts on the wrong pCPU has an higher cost. > > > > > > I don't understand why it is a problem for you to receive the first > > > interrupt > > > to the wrong pCPU and moving it if necessary. > > > > > > While this may have an higher cost (I don't believe so) on the first > > > received > > > interrupt, migrating thousands of interrupts at the same time is very > > > expensive and will likely get Xen stuck for a while (think about ITS with > > > a > > > single command queue). > > > > > > Furthermore, the current approach will move every single interrupt routed > > > a > > > the vCPU, even those disabled. That's pointless and a waste of resource. > > > You > > > may argue that we can skip the ones disabled, but in that case what would > > > be > > > the benefits to migrate the IRQs while migrate the vCPUs? 
> > > > > > So I would suggest to spread it over the time. This also means less > > > headache > > > for the scheduler developers. > > > > The most important aspect of interrupts handling in Xen is latency, > > measured as the time between Xen receiving a physical interrupt and the > > guest receiving it. This latency should be both small and deterministic. > > > > We all agree so far, right? > > > > > > The issue with spreading interrupts migrations over time is that it makes > > interrupt latency less deterministic. It is OK, in the uncommon case of > > vCPU migration with interrupts, to take a hit for a short time. This > > "hit" can be measured. It can be known. If your workload cannot tolerate > > it, vCPUs can be pinned. It should be a rare event anyway. On the other > > hand, by spreading interrupts migrations, we make it harder to predict > > latency. Aside from determinism, another problem with this approach is > > that it ensures that every interrupt assigned to a vCPU will first hit > > the wrong pCPU, then it will be moved. It guarantees the worst-case > > scenario for interrupt latency for the vCPU that has been moved. If we > > migrated all interrupts as soon as possible, we would minimize the > > amount of interrupts delivered to the wrong pCPU. Most interrupts would > > be delivered to the new pCPU right away, reducing interrupt latency. > > Migrating all the interrupts can be really expensive because in the current > state we have to go through every single interrupt and check whether the > interrupt has been routed to this vCPU. We will also route disabled interrupt. > And this seems really pointless. This may need some optimization here. Indeed, that should be fixed. > With ITS, we may have thousand of interrupts routed to a vCPU. This means that > for every interrupt we have to issue a command in the host ITS queue. You will > likely fill up the command queue and add much more latency. 
> > Even if you consider the vCPU migration to be a rare case. You could still get > the pCPU stuck for tens of milliseconds, the time to migrate everything. And I > don't think this is acceptable. [...] > If the number increases, you may end up having the scheduler decide not to > migrate the vCPU because it will be too expensive. But you may have a > situation where migrating a vCPU with many interrupts is the only possible > choice and you will slow down the platform. A vCPU with thousands of interrupts routed to it is the case where I would push back to the scheduler. It should know that moving the vCPU would be very costly. Regardless, we need to figure out a way to move the interrupts without "blocking" the platform for long. In practice, we might find a threshold: a number of active interrupts above which we cannot move them all at once anymore. Something like: we move the first 500 active interrupts immediately, we delay the rest. We can find this threshold only with practical measurements. > Anyway, I would like to see measurements in both situations before deciding when > LPIs will be migrated. Yes, let's be scientific about this.
On Fri, 9 Dec 2016, Andre Przywara wrote: > On 07/12/16 20:20, Stefano Stabellini wrote: > > On Tue, 6 Dec 2016, Julien Grall wrote: > >> On 06/12/2016 22:01, Stefano Stabellini wrote: > >>> On Tue, 6 Dec 2016, Stefano Stabellini wrote: > >>>> moving a vCPU with interrupts assigned to it is slower than moving a > >>>> vCPU without interrupts assigned to it. You could say that the > >>>> slowness is directly proportional do the number of interrupts assigned > >>>> to the vCPU. > >>> > >>> To be pedantic, by "assigned" I mean that a physical interrupt is routed > >>> to a given pCPU and is set to be forwarded to a guest vCPU running on it > >>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of > >>> these physical interrupts, a corresponding virtual interrupt (could be a > >>> different irq) will be injected into the guest vCPU. > >>> > >>> When the vCPU is migrated to a new pCPU, the physical interrupts that > >>> are configured to be injected as virtual interrupts into the vCPU, are > >>> migrated with it. The physical interrupt migration has a cost. However, > >>> receiving physical interrupts on the wrong pCPU has an higher cost. > >> > >> I don't understand why it is a problem for you to receive the first interrupt > >> to the wrong pCPU and moving it if necessary. > >> > >> While this may have an higher cost (I don't believe so) on the first received > >> interrupt, migrating thousands of interrupts at the same time is very > >> expensive and will likely get Xen stuck for a while (think about ITS with a > >> single command queue). > >> > >> Furthermore, the current approach will move every single interrupt routed a > >> the vCPU, even those disabled. That's pointless and a waste of resource. You > >> may argue that we can skip the ones disabled, but in that case what would be > >> the benefits to migrate the IRQs while migrate the vCPUs? > >> > >> So I would suggest to spread it over the time. 
This also means less headache > >> for the scheduler developers. > > > > The most important aspect of interrupts handling in Xen is latency, > > measured as the time between Xen receiving a physical interrupt and the > > guest receiving it. This latency should be both small and deterministic. > > > > We all agree so far, right? > > > > > > The issue with spreading interrupts migrations over time is that it makes > > interrupt latency less deterministic. It is OK, in the uncommon case of > > vCPU migration with interrupts, to take a hit for a short time. This > > "hit" can be measured. It can be known. If your workload cannot tolerate > > it, vCPUs can be pinned. It should be a rare event anyway. On the other > > hand, by spreading interrupts migrations, we make it harder to predict > > latency. Aside from determinism, another problem with this approach is > > that it ensures that every interrupt assigned to a vCPU will first hit > > the wrong pCPU, then it will be moved. It guarantees the worst-case > > scenario for interrupt latency for the vCPU that has been moved. If we > > migrated all interrupts as soon as possible, we would minimize the > > amount of interrupts delivered to the wrong pCPU. Most interrupts would > > be delivered to the new pCPU right away, reducing interrupt latency. > > So if this is such a crucial issue, why don't we use the ITS for good > this time? The ITS hardware probably supports 16 bits worth of > collection IDs, so what about we assign each VCPU (in every guest) a > unique collection ID on the host and do a MAPC & MOVALL on a VCPU > migration to let it point to the right physical redistributor. > I see that this does not cover all use cases (> 65536 VCPUs, for > instance), also depends much of many implementation details: This is certainly an idea worth exploring. We don't need to assign a collection ID to every vCPU, just the ones that have LPIs assigned to them, which should be considerably fewer. > - How costly is a MOVALL? 
It needs to scan the pending table and > transfer set bits to the other redistributor, which may take a while. This is an hardware operation, even if it is not fast, I'd prefer to rely on that, rather than implementing something complex in software. Usually hardware gets better over time at this sort of things. > - Is there an impact if we exceed the number of hardware backed > collections (GITS_TYPE[HCC])? If the ITS is forced to access system > memory for every table lookup, this may slow down everyday operations. We'll have to fall back to manually moving them one by one. > - How likely are those misdirected interrupts in the first place? How > often do we migrate VCPU compared to the the interrupt frequency? This is where is scheduler work comes in. > There are more, subtle parameters to consider, so I guess we just need > to try and measure. That's right. This is why I have been saying that we need numbers. This is difficult hardware, difficult code and difficult scenarios. Intuition only gets us so far. We need to be scientific and measure the approach we decide to take, and maybe even one that we decided not to take, to figure out whether it is actually acceptable.
On Fri, 9 Dec 2016, Andre Przywara wrote: > >> I've been spending some time thinking about this, and I think we can in > >> fact get away without ever propagating command from domains to the host. > >> > >> I made a list of all commands that possible require host ITS command > >> propagation. There are two groups: > >> 1: enabling/disabling LPIs: INV, INVALL > >> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI. > >> > >> The second group can be handled by mapping all required devices up > >> front, I will elaborate on that in a different email. > >> > >> For the first group, read below ... > >> > >> On 01/12/16 01:19, Stefano Stabellini wrote: > >>> On Fri, 25 Nov 2016, Julien Grall wrote: > >>>> Hi, > >>>> > >>>> On 18/11/16 18:39, Stefano Stabellini wrote: > >>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote: > >>>>>> On Fri, 11 Nov 2016, Julien Grall wrote: > >>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote: > >>>>>>> That's why in the approach we had on the previous series was "host ITS > >>>>>>> command > >>>>>>> should be limited when emulating guest ITS command". From my recall, in > >>>>>>> that > >>>>>>> series the host and guest LPIs was fully separated (enabling a guest > >>>>>>> LPIs was > >>>>>>> not enabling host LPIs). > >>>>>> > >>>>>> I am interested in reading what Ian suggested to do when the physical > >>>>>> ITS queue is full, but I cannot find anything specific about it in the > >>>>>> doc. > >>>>>> > >>>>>> Do you have a suggestion for this? > >>>>>> > >>>>>> The only things that come to mind right now are: > >>>>>> > >>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock > >>>>>> style) > >>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style) > >>>>> > >>>>> Another, probably better idea, is to map all pLPIs of a device when the > >>>>> device is assigned to a guest (including Dom0). This is what was written > >>>>> in Ian's design doc. 
The advantage of this approach is that Xen doesn't > >>>>> need to take any actions on the physical ITS command queue when the > >>>>> guest issues virtual ITS commands, therefore completely solving this > >>>>> problem at the root. (Although I am not sure about enable/disable > >>>>> commands: could we avoid issuing enable/disable on pLPIs?) > >>>> > >>>> In the previous design document (see [1]), the pLPIs are enabled when the > >>>> device is assigned to the guest. This means that it is not necessary to send > >>>> command there. This is also means we may receive a pLPI before the associated > >>>> vLPI has been configured. > >>>> > >>>> That said, given that LPIs are edge-triggered, there is no deactivate state > >>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same > >>>> LPIs could potentially be raised again. This could generate a storm. > >>> > >>> Thank you for raising this important point. You are correct. > >>> > >>>> The priority drop is necessary if we don't want to block the reception of > >>>> interrupt for the current physical CPU. > >>>> > >>>> What I am more concerned about is this problem can also happen in normal > >>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For > >>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe > >>>> it is time to introduce rate-limit interrupt for ARM. Any opinions? > >>> > >>> Yes. It could be as simple as disabling the pLPI when Xen receives a > >>> second pLPI before the guest EOIs the first corresponding vLPI, which > >>> shouldn't happen in normal circumstances. > >>> > >>> We need a simple per-LPI inflight counter, incremented when a pLPI is > >>> received, decremented when the corresponding vLPI is EOIed (the LR is > >>> cleared). > >>> > >>> When the counter > 1, we disable the pLPI and request a maintenance > >>> interrupt for the corresponding vLPI. > >> > >> So why do we need a _counter_? 
This is about edge triggered interrupts, > >> I think we can just accumulate all of them into one. > > > > The counter is not to re-inject the same amount of interrupts into the > > guest, but to detect interrupt storms. > > I was wondering if an interrupt "storm" could already be defined by > "receiving an LPI while there is already one pending (in the guest's > virtual pending table) and it being disabled by the guest". I admit that > declaring two interrupts as a storm is a bit of a stretch, but in fact > the guest had probably a reason for disabling it even though it > fires, so Xen should just follow suit. > The only difference is that we don't do it _immediately_ when the guest > tells us (via INV), but only if needed (LPI actually fires). Either way should work OK, I think. > >> - If the VLPI is enabled, we EOI it on the host and inject it. > >> - If the VLPI is disabled, we set the pending bit in the VCPU's > >> pending table and EOI on the host - to allow other IRQs. > >> - On a guest INV command, we check whether that vLPI is now enabled: > >> - If it is disabled now, we don't need to do anything. > >> - If it is enabled now, we check the pending bit for that VLPI: > >> - If it is 0, we don't do anything. > >> - If it is 1, we inject the VLPI and clear the pending bit. > >> - On a guest INVALL command, we just need to iterate over the virtual > >> LPIs. > > > > Right, much better. > > > > > >> If you look at the conditions above, the only immediate action is > >> when a VLPI gets enabled _and_ its pending bit is set. So we can do > >> 64-bit read accesses over the whole pending table to find non-zero words > >> and thus set bits, which should be rare in practice. We can store the > >> highest mapped VLPI to avoid iterating over the whole of the table. > >> Ideally the guest has no direct control over the pending bits, since > >> this is what the device generates. Also we limit the number of VLPIs in > >> total per guest anyway. 
> > > > I wonder if we could even use a fully packed bitmask with only the > > pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would > > be a nice improvement. > > The _pending_ table is exactly that: one bit per VLPI. Actually the spec says about the pending table, ch 6.1.2: "Each Redistributor maintains entries in a separate LPI Pending table that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs == 1 in the Redistributor: 0 The LPI is not pending. 1 The LPI is pending. For a given LPI: • The corresponding byte in the LPI Pending table is (base address + (N / 8)). • The bit position in the byte is (N MOD 8)." It seems to me that each LPI is supposed to have a byte, not a bit. Am I looking at the wrong table? In any case you suggested to trap the pending table, so we can actually write anything we want to those guest provided pages. > So by doing a > 64-bit read we cover 64 VLPIs. And normally if an LPI fires it will > probably be enabled (otherwise the guest would have disabled it in the > device), so we inject it and don't need this table. It's > really just for storing the pending status should an LPI arrive while > the guest had _disabled_ it. I assume this is rather rare, so the table > will mostly be empty: that's why I expect most reads to be 0 and the > iteration of the table to be very quick. As an additional optimization > we could store the highest and lowest virtually pending LPI, to avoid > scanning the whole table. > > We can't do so much about the property table, though, because its layout > is described in the spec - in contrast to the ITS tables, which are IMPDEF. > But as we only need to do something if the LPI is _both_ enabled _and_ > pending, scanning the pending table gives us a quite good filter > already. For the few LPIs that hit here, we can just access the right > byte in the property table. 
OK > >> If that still sounds like a DOS vector, we could additionally rate-limit > >> INVALLs, and/or track additions to the pending table after the last > >> INVALL: if there haven't been any new pending bits since the last scan, > >> INVALL is a NOP. > >> > >> Does that makes sense so far? > > > > It makes sense. It should be OK. > > > > > >> So that just leaves us with this IRQ storm issue, which I am thinking > >> about now. But I guess this is not a show stopper given we can disable > >> the physical LPI if we sense this situation. > > > > That is true and it's exactly what we should do. > > > > > >>> When we receive the maintenance interrupt and we clear the LR of the > >>> vLPI, Xen should re-enable the pLPI. > > So I was thinking why you would need to wait for the guest to actually > EOI it? > Can't we end the interrupt storm condition at the moment the guest > enables the interrupt? LPIs are edge triggered and so a storm in the > past is easily merged into a single guest LPI once the guest enables it > again. From there on we inject every triggered LPI into the guest. What you describe works if the guest disables the interrupt. But what if it doesn't? Xen should also be able to cope with non-cooperative guests which might even have triggered the interrupt storm on purpose. I think that you are asking this question because we are actually talking about two different issues. See below. > This special handling for the interrupt storm just stems from the fact > that have to keep LPIs enabled on the h/w interrupt controller level, > despite the guest having disabled it on it's own _virtual_ GIC. So once > the guest enables it again, we are in line with the current GICv2/GICv3, > aren't we? Do we have interrupt storm detection/prevention in the moment? No, that's not the cause of the storm. Julien described well before: given that LPIs are edge-triggered, there is no deactivate state (see 4.1 in ARM IHI 0069C). 
So as soon as the priority drop is done, the same LPIs could potentially be raised again. This could generate a storm. The problem is that Xen has to do priority drop upon receiving an pLPI, but, given that LPIs don't have a deactivate state, the priority drop is enough to let the hardware inject a second LPI, even if the guest didn't EOI the first one yet. In the case of SPIs, the hardware cannot inject a second interrupt after Xen does priority drop, it has to wait for the guest to EOI it. > And please keep in mind that LPIs are _always_ edge triggered: So once > we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC > point of view, the next incoming interrupt must have been triggered by a > new interrupt condition in the device (new network packet, for > instance). This is actually the problem: what if the guest configured the device on purpose to keep generating LPIs without pause? Nice and simple way to take down the host. > In contrast to GICv2 this applies to _every_ LPI. > So I am not sure we should really care _too_ much about this (apart from > the "guest has disabled it" part): Once we assign a device to a guest, > we lose some control over the machine anyway and at least trust the > device to not completely block the system. No we don't! Hardware engineers make mistakes too! We have to protect Xen from devices which purposely or mistakenly generate interrupt storms. This is actually a pretty common problem. > I don't see how the ITS differs in that respect from the GICv3/GICv2. It differs because in the case of GICv2, every single interrupt has to be EOI'd by the guest. Therefore the Xen scheduler can still decide to schedule it out. In the case of the ITS, Xen could be stuck in an interrupt handling loop. > A quick update on my side: I implemented the scheme I described in my > earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS: Nice! 
> - On receiving the PHYSDEVOP_manage_pci_add hypercall in > xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of > interrupts and enable them. We keep them unassigned in our host > pLPI->VLPI table, so we discard them should they fire. > This hypercall is issued by Dom0 Linux before bringing up any PCI > devices, so it works even for Dom0 without any Linux changes. For DomUs > with PCI passthrough Dom0 is expected to issue this hypercall on behalf > of the to-be-created domain. > - When a guest (be it Dom0 or DomU) actually maps an LPI (MAPTI), we > just enter the virtual LPI number and the target VCPU in our pLPI-vLPI > table and be done. Should it fire now, we know where to inject it, but > refer to the enabled bit in the guest's property table before doing so. > - When a guest (be it Dom0 or DomU) enables or disables an interrupt, we > don't do much, as we refer to the enable bit every time we want to > inject already. The only thing I actually do is to inject an LPI if > there is a virtual LPI pending and the LPI is now enabled. Sounds good, well done!
Hi Stefano, thanks for the prompt and helpful answer! On 10/12/16 00:30, Stefano Stabellini wrote: > On Fri, 9 Dec 2016, Andre Przywara wrote: >>>> I've been spending some time thinking about this, and I think we can in >>>> fact get away without ever propagating command from domains to the host. >>>> >>>> I made a list of all commands that possible require host ITS command >>>> propagation. There are two groups: >>>> 1: enabling/disabling LPIs: INV, INVALL >>>> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI. >>>> >>>> The second group can be handled by mapping all required devices up >>>> front, I will elaborate on that in a different email. >>>> >>>> For the first group, read below ... >>>> >>>> On 01/12/16 01:19, Stefano Stabellini wrote: >>>>> On Fri, 25 Nov 2016, Julien Grall wrote: >>>>>> Hi, >>>>>> >>>>>> On 18/11/16 18:39, Stefano Stabellini wrote: >>>>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote: >>>>>>>> On Fri, 11 Nov 2016, Julien Grall wrote: >>>>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote: >>>>>>>>> That's why in the approach we had on the previous series was "host ITS >>>>>>>>> command >>>>>>>>> should be limited when emulating guest ITS command". From my recall, in >>>>>>>>> that >>>>>>>>> series the host and guest LPIs was fully separated (enabling a guest >>>>>>>>> LPIs was >>>>>>>>> not enabling host LPIs). >>>>>>>> >>>>>>>> I am interested in reading what Ian suggested to do when the physical >>>>>>>> ITS queue is full, but I cannot find anything specific about it in the >>>>>>>> doc. >>>>>>>> >>>>>>>> Do you have a suggestion for this? 
>>>>>>>> >>>>>>>> The only things that come to mind right now are: >>>>>>>> >>>>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock >>>>>>>> style) >>>>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style) >>>>>>> >>>>>>> Another, probably better idea, is to map all pLPIs of a device when the >>>>>>> device is assigned to a guest (including Dom0). This is what was written >>>>>>> in Ian's design doc. The advantage of this approach is that Xen doesn't >>>>>>> need to take any actions on the physical ITS command queue when the >>>>>>> guest issues virtual ITS commands, therefore completely solving this >>>>>>> problem at the root. (Although I am not sure about enable/disable >>>>>>> commands: could we avoid issuing enable/disable on pLPIs?) >>>>>> >>>>>> In the previous design document (see [1]), the pLPIs are enabled when the >>>>>> device is assigned to the guest. This means that it is not necessary to send >>>>>> command there. This is also means we may receive a pLPI before the associated >>>>>> vLPI has been configured. >>>>>> >>>>>> That said, given that LPIs are edge-triggered, there is no deactivate state >>>>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same >>>>>> LPIs could potentially be raised again. This could generate a storm. >>>>> >>>>> Thank you for raising this important point. You are correct. >>>>> >>>>>> The priority drop is necessary if we don't want to block the reception of >>>>>> interrupt for the current physical CPU. >>>>>> >>>>>> What I am more concerned about is this problem can also happen in normal >>>>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For >>>>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe >>>>>> it is time to introduce rate-limit interrupt for ARM. Any opinions? >>>>> >>>>> Yes. 
It could be as simple as disabling the pLPI when Xen receives a >>>>> second pLPI before the guest EOIs the first corresponding vLPI, which >>>>> shouldn't happen in normal circumstances. >>>>> >>>>> We need a simple per-LPI inflight counter, incremented when a pLPI is >>>>> received, decremented when the corresponding vLPI is EOIed (the LR is >>>>> cleared). >>>>> >>>>> When the counter > 1, we disable the pLPI and request a maintenance >>>>> interrupt for the corresponding vLPI. >>>> >>>> So why do we need a _counter_? This is about edge triggered interrupts, >>>> I think we can just accumulate all of them into one. >>> >>> The counter is not to re-inject the same amount of interrupts into the >>> guest, but to detect interrupt storms. >> >> I was wondering if an interrupt "storm" could already be defined by >> "receiving an LPI while there is already one pending (in the guest's >> virtual pending table) and it being disabled by the guest". I admit that >> declaring two interrupts as a storm is a bit of a stretch, but in fact >> the guest had probably a reason for disabling it even though it >> fires, so Xen should just follow suit. >> The only difference is that we don't do it _immediately_ when the guest >> tells us (via INV), but only if needed (LPI actually fires). > > Either way should work OK, I think. > > >>>> - If the VLPI is enabled, we EOI it on the host and inject it. >>>> - If the VLPI is disabled, we set the pending bit in the VCPU's >>>> pending table and EOI on the host - to allow other IRQs. >>>> - On a guest INV command, we check whether that vLPI is now enabled: >>>> - If it is disabled now, we don't need to do anything. >>>> - If it is enabled now, we check the pending bit for that VLPI: >>>> - If it is 0, we don't do anything. >>>> - If it is 1, we inject the VLPI and clear the pending bit. >>>> - On a guest INVALL command, we just need to iterate over the virtual >>>> LPIs. >>> >>> Right, much better. 
>>> >>> >>>> If you look at the conditions above, the only immediate action is >>>> when a VLPI gets enabled _and_ its pending bit is set. So we can do >>>> 64-bit read accesses over the whole pending table to find non-zero words >>>> and thus set bits, which should be rare in practice. We can store the >>>> highest mapped VLPI to avoid iterating over the whole of the table. >>>> Ideally the guest has no direct control over the pending bits, since >>>> this is what the device generates. Also we limit the number of VLPIs in >>>> total per guest anyway. >>> >>> I wonder if we could even use a fully packed bitmask with only the >>> pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would >>> be a nice improvement. >> >> The _pending_ table is exactly that: one bit per VLPI. > > Actually the spec says about the pending table, ch 6.1.2: > > "Each Redistributor maintains entries in a separate LPI Pending table > that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs > == 1 in the Redistributor: > 0 The LPI is not pending. > 1 The LPI is pending. > > For a given LPI: > • The corresponding byte in the LPI Pending table is (base address + (N / 8)). > • The bit position in the byte is (N MOD 8)." ^^^ > It seems to me that each LPI is supposed to have a byte, not a bit. Am I > looking at the wrong table? Well, the explanation could indeed be a bit more explicit, but it's really meant to be a bit: 1) The two lines above describe how to address a single bit in a byte-addressed array. 2) The following paragraphs talk about "the first 1KB" when it comes to non-LPI interrupts. This matches 8192 bits. 3) In section 6.1 the spec states: "Memory-backed storage for LPI pending _bits_ in an LPI Pending table." 
4) The actual instance, Marc's Linux driver, also speaks of a bit: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/irqchip/irq-gic-v3-its.c#n785 > In any case you suggested to trap the pending table, so we can actually > write anything we want to those guest provided pages. Indeed: "During normal operation, the LPI Pending table is maintained solely by the Redistributor." >> So by doing a >> 64-bit read we cover 64 VLPIs. And normally if an LPI fires it will >> probably be enabled (otherwise the guest would have disabled it in the >> device), so we inject it and don't need this table. It's >> really just for storing the pending status should an LPI arrive while >> the guest had _disabled_ it. I assume this is rather rare, so the table >> will mostly be empty: that's why I expect most reads to be 0 and the >> iteration of the table to be very quick. As an additional optimization >> we could store the highest and lowest virtually pending LPI, to avoid >> scanning the whole table. >> >> We can't do so much about the property table, though, because its layout >> is described in the spec - in contrast to the ITS tables, which are IMPDEF. >> But as we only need to do something if the LPI is _both_ enabled _and_ >> pending, scanning the pending table gives us a quite good filter >> already. For the few LPIs that hit here, we can just access the right >> byte in the property table. > > OK > > >>>> If that still sounds like a DOS vector, we could additionally rate-limit >>>> INVALLs, and/or track additions to the pending table after the last >>>> INVALL: if there haven't been any new pending bits since the last scan, >>>> INVALL is a NOP. >>>> >>>> Does that makes sense so far? >>> >>> It makes sense. It should be OK. >>> >>> >>>> So that just leaves us with this IRQ storm issue, which I am thinking >>>> about now. But I guess this is not a show stopper given we can disable >>>> the physical LPI if we sense this situation. 
>>> >>> That is true and it's exactly what we should do. >>> >>> >>>>> When we receive the maintenance interrupt and we clear the LR of the >>>>> vLPI, Xen should re-enable the pLPI. >> >> So I was thinking why you would need to wait for the guest to actually >> EOI it? >> Can't we end the interrupt storm condition at the moment the guest >> enables the interrupt? LPIs are edge triggered and so a storm in the >> past is easily merged into a single guest LPI once the guest enables it >> again. From there on we inject every triggered LPI into the guest. > > What you describe works if the guest disables the interrupt. But what if > it doesn't? Xen should also be able to cope with non-cooperative guests > which might even have triggered the interrupt storm on purpose. > > I think that you are asking this question because we are actually > talking about two different issues. See below. Indeed I was wondering about that ... > >> This special handling for the interrupt storm just stems from the fact >> that have to keep LPIs enabled on the h/w interrupt controller level, >> despite the guest having disabled it on it's own _virtual_ GIC. So once >> the guest enables it again, we are in line with the current GICv2/GICv3, >> aren't we? Do we have interrupt storm detection/prevention in the moment? > > No, that's not the cause of the storm. Julien described well before: > > given that LPIs are edge-triggered, there is no deactivate state (see 4.1 > in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could > potentially be raised again. This could generate a storm. > > The problem is that Xen has to do priority drop upon receiving an pLPI, > but, given that LPIs don't have a deactivate state, the priority drop is > enough to let the hardware inject a second LPI, even if the guest didn't > EOI the first one yet. > > In the case of SPIs, the hardware cannot inject a second interrupt after > Xen does priority drop, it has to wait for the guest to EOI it. 
I understand that ... >> And please keep in mind that LPIs are _always_ edge triggered: So once >> we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC >> point of view, the next incoming interrupt must have been triggered by a >> new interrupt condition in the device (new network packet, for >> instance). > > This is actually the problem: what if the guest configured the device on > purpose to keep generating LPIs without pause? Nice and simple way to > take down the host. I see, I just wasn't sure we were talking about the same thing: actual interrupt storm triggered by the device vs. "virtual" interrupt storm due to an interrupt line not being lowered by the IRQ handler. And I was hoping for the latter, but well ... So thanks for the clarification. >> In contrast to GICv2 this applies to _every_ LPI. >> So I am not sure we should really care _too_ much about this (apart from >> the "guest has disabled it" part): Once we assign a device to a guest, >> we lose some control over the machine anyway and at least trust the >> device to not completely block the system. > > No we don't! Hardware engineers make mistakes too! You tell me ... ;-) > We have to protect > Xen from devices which purposely or mistakenly generate interrupt > storms. This is actually a pretty common problem. I see, though this doesn't make this whole problem easier ;-) >> I don't see how the ITS differs in that respect from the GICv3/GICv2. > > It differs because in the case of GICv2, every single interrupt has to > be EOI'd by the guest. Sure, though I think technically it's "deactivated" here that matters (we EOI LPIs as well). And since LPIs have no active state, this makes the difference. > Therefore the Xen scheduler can still decide to > schedule it out. In the case of the ITS, Xen could be stuck in an > interrupt handling loop. 
So I was wondering whether we might need to relax our new strict "No (unprivileged) guest ever causes a host ITS command to be queued" rule a bit, because: - Disabling an LPI is a separate issue, as we can trigger this in Xen interrupt context once we decide that this is an interrupt storm. - But enabling it again has to both happen in a timely manner (as the guest expects interrupts to come in) and to be triggered by a guest action, which causes the INV command to be sent when handling a guest fault. Now this INV command (and possibly a follow-up SYNC) for enabling an LPI would be the only critical ones, so I was wondering if we could ensure that these commands can always be queued immediately, by making sure we have at least two ITS command queue slots available all of the time. Other ITS commands (triggered by device pass-throughs, for instance), would then have to potentially wait if we foresee that they could fill up the host command queue. Something like QoS for ITS commands. And I think we should map the maximum command queue size on the host (1MB => 32768 commands) to make this scenario less likely. I will need to think about this a bit further, maybe implement something as a proof of concept. Cheers, Andre. > >> A quick update on my side: I implemented the scheme I described in my >> earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS: > > Nice! > > >> - On receiving the PHYSDEVOP_manage_pci_add hypercall in >> xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of >> interrupts and enable them. We keep them unassigned in our host >> pLPI->VLPI table, so we discard them should they fire. >> This hypercall is issued by Dom0 Linux before bringing up any PCI >> devices, so it works even for Dom0 without any Linux changes. For DomUs >> with PCI passthrough Dom0 is expected to issue this hypercall on behalf >> of the to-be-created domain. 
>> - When a guest (be it Dom0 or DomU) actually maps an LPI (MAPTI), we >> just enter the virtual LPI number and the target VCPU in our pLPI-vLPI >> table and be done. Should it fire now, we know where to inject it, but >> refer to the enabled bit in the guest's property table before doing so. >> - When a guest (be it Dom0 or DomU) enables or disables an interrupt, we >> don't do much, as we refer to the enable bit every time we want to >> inject already. The only thing I actually do is to inject an LPI if >> there is a virtual LPI pending and the LPI is now enabled. > > Sounds good, well done! >
On Mon, 12 Dec 2016, Andre Przywara wrote: > >> The _pending_ table is exactly that: one bit per VLPI. > > > > Actually the spec says about the pending table, ch 6.1.2: > > > > "Each Redistributor maintains entries in a separate LPI Pending table > > that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs > > == 1 in the Redistributor: > > 0 The LPI is not pending. > > 1 The LPI is pending. > > > > For a given LPI: > > • The corresponding byte in the LPI Pending table is (base address + (N / 8)). > > • The bit position in the byte is (N MOD 8)." > > ^^^ > > > It seems to me that each LPI is supposed to have a byte, not a bit. Am I > > looking at the wrong table? > > Well, the explanation could indeed be a bit more explicit, but it's > really meant to be a bit: > 1) The two lines above describe how to address a single bit in a > byte-addressed array. > 2) The following paragraphs talk about "the first 1KB" when it comes to > non-LPI interrupts. This matches 8192 bits. > 3) In section 6.1 the spec states: "Memory-backed storage for LPI > pending _bits_ in an LPI Pending table." > 4) The actual instance, Marc's Linux driver, also speaks of a bit: > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/irqchip/irq-gic-v3-its.c#n785 I was mislead by the doc. Better this way. > >> This special handling for the interrupt storm just stems from the fact > >> that have to keep LPIs enabled on the h/w interrupt controller level, > >> despite the guest having disabled it on it's own _virtual_ GIC. So once > >> the guest enables it again, we are in line with the current GICv2/GICv3, > >> aren't we? Do we have interrupt storm detection/prevention in the moment? > > > > No, that's not the cause of the storm. Julien described well before: > > > > given that LPIs are edge-triggered, there is no deactivate state (see 4.1 > > in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could > > potentially be raised again. 
This could generate a storm. > > > > The problem is that Xen has to do priority drop upon receiving an pLPI, > > but, given that LPIs don't have a deactivate state, the priority drop is > > enough to let the hardware inject a second LPI, even if the guest didn't > > EOI the first one yet. > > > > In the case of SPIs, the hardware cannot inject a second interrupt after > > Xen does priority drop, it has to wait for the guest to EOI it. > > I understand that ... > > >> And please keep in mind that LPIs are _always_ edge triggered: So once > >> we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC > >> point of view, the next incoming interrupt must have been triggered by a > >> new interrupt condition in the device (new network packet, for > >> instance). > > > > This is actually the problem: what if the guest configured the device on > > purpose to keep generating LPIs without pause? Nice and simple way to > > take down the host. > > I see, I just wasn't sure we were talking about the same thing: actual > interrupt storm triggered by the device vs. "virtual" interrupt storm > due to an interrupt line not being lowered by the IRQ handler. > And I was hoping for the latter, but well ... > So thanks for the clarification. > > >> In contrast to GICv2 this applies to _every_ LPI. > >> So I am not sure we should really care _too_ much about this (apart from > >> the "guest has disabled it" part): Once we assign a device to a guest, > >> we lose some control over the machine anyway and at least trust the > >> device to not completely block the system. > > > > No we don't! Hardware engineers make mistakes too! > > You tell me ... ;-) > > > We have to protect > > Xen from devices which purposely or mistakenly generate interrupt > > storms. This is actually a pretty common problem. > > I see, though this doesn't make this whole problem easier ;-) > > >> I don't see how the ITS differs in that respect from the GICv3/GICv2. 
> > > > It differs because in the case of GICv2, every single interrupt has to > > be EOI'd by the guest. > > Sure, though I think technically it's "deactivated" here that matters > (we EOI LPIs as well). And since LPIs have no active state, this makes > the difference. > > > Therefore the Xen scheduler can still decide to > > schedule it out. In the case of the ITS, Xen could be stuck in an > > interrupt handling loop. > > So I was wondering as we might need to relax our new strict "No > (unprivileged) guest ever causes a host ITS command to be queued" rule a > bit, because: > - Disabling an LPI is a separate issue, as we can trigger this in Xen > interrupt context once we decide that this is an interrupt storm. > - But enabling it again has to both happen in timely manner (as the > guest expects interrupts to come in) and to be triggered by a guest > action, which causes the INV command to be send when handling a guest fault. > > Now this INV command (and possibly a follow-up SYNC) for enabling an LPI > would be the only critical ones, so I was wondering if we could ensure > that these commands can always be queued immediately, by making sure we > have at least two ITS command queue slots available all of the time. > Other ITS commands (triggered by device pass-throughs, for instance), > would then have to potentially wait if we foresee that they could fill > up the host command queue. > Something like QoS for ITS commands. > And I think we should map the maximum command queue size on the host > (1MB => 32768 commands) to make this scenario less likely. > > I will need to think about this a bit further, maybe implement something > as a proof of concept. On one hand, I'd say that it should be rare to re-enable an interrupt, after it was disabled due to a storm. Extremely rare. It should be OK to issue a physical INV and SYNC in that event. 
However, if that happens, it could very well be because the guest is trying to take down the host, and issuing physical ITS commands in response to a malicious guest action, could be a good way to help the attacker. We need to be extra careful. Given that a storm is supposed to be an exceptional circumstance, it is OK to enforce very strict limits on the amount of times we are willing to issue physical ITS commands as a consequence of a guest action. For example, we could decide to do it just once, or twice, then label the guest as "untrustworthy" and destroy it. After all if a storm keeps happening, it must be due to a malicious guest or faulty hardware - in both cases it is best to terminate the VM.
> On Dec 10, 2016, at 4:18 AM, Stefano Stabellini <sstabellini@kernel.org> wrote: > > On Fri, 9 Dec 2016, Andre Przywara wrote: >> On 07/12/16 20:20, Stefano Stabellini wrote: >>> On Tue, 6 Dec 2016, Julien Grall wrote: >>>> On 06/12/2016 22:01, Stefano Stabellini wrote: >>>>> On Tue, 6 Dec 2016, Stefano Stabellini wrote: >>>>>> moving a vCPU with interrupts assigned to it is slower than moving a >>>>>> vCPU without interrupts assigned to it. You could say that the >>>>>> slowness is directly proportional do the number of interrupts assigned >>>>>> to the vCPU. >>>>> >>>>> To be pedantic, by "assigned" I mean that a physical interrupt is routed >>>>> to a given pCPU and is set to be forwarded to a guest vCPU running on it >>>>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of >>>>> these physical interrupts, a corresponding virtual interrupt (could be a >>>>> different irq) will be injected into the guest vCPU. >>>>> >>>>> When the vCPU is migrated to a new pCPU, the physical interrupts that >>>>> are configured to be injected as virtual interrupts into the vCPU, are >>>>> migrated with it. The physical interrupt migration has a cost. However, >>>>> receiving physical interrupts on the wrong pCPU has an higher cost. >>>> >>>> I don't understand why it is a problem for you to receive the first interrupt >>>> to the wrong pCPU and moving it if necessary. >>>> >>>> While this may have an higher cost (I don't believe so) on the first received >>>> interrupt, migrating thousands of interrupts at the same time is very >>>> expensive and will likely get Xen stuck for a while (think about ITS with a >>>> single command queue). >>>> >>>> Furthermore, the current approach will move every single interrupt routed a >>>> the vCPU, even those disabled. That's pointless and a waste of resource. You >>>> may argue that we can skip the ones disabled, but in that case what would be >>>> the benefits to migrate the IRQs while migrate the vCPUs? 
>>>> >>>> So I would suggest to spread it over the time. This also means less headache >>>> for the scheduler developers. >>> >>> The most important aspect of interrupts handling in Xen is latency, >>> measured as the time between Xen receiving a physical interrupt and the >>> guest receiving it. This latency should be both small and deterministic. >>> >>> We all agree so far, right? >>> >>> >>> The issue with spreading interrupts migrations over time is that it makes >>> interrupt latency less deterministic. It is OK, in the uncommon case of >>> vCPU migration with interrupts, to take a hit for a short time. This >>> "hit" can be measured. It can be known. If your workload cannot tolerate >>> it, vCPUs can be pinned. It should be a rare event anyway. On the other >>> hand, by spreading interrupts migrations, we make it harder to predict >>> latency. Aside from determinism, another problem with this approach is >>> that it ensures that every interrupt assigned to a vCPU will first hit >>> the wrong pCPU, then it will be moved. It guarantees the worst-case >>> scenario for interrupt latency for the vCPU that has been moved. If we >>> migrated all interrupts as soon as possible, we would minimize the >>> amount of interrupts delivered to the wrong pCPU. Most interrupts would >>> be delivered to the new pCPU right away, reducing interrupt latency. OK, so ultimately for each interrupt we can take an “eager” approach and move it as soon as the vcpu moves, or a “lazy” approach and move it after it fires. The two options which have been discussed are: 1. Always take an eager approach, and try to tell the scheduler to limit the migration frequency for these vcpus more than others 2. Always take a lazy approach, and leave the scheduler the way it is. Another approach which one might take: 3. Eagerly migrate a subset of the interrupts and lazily migrate the others. For instance, we could eagerly migrate all the interrupts which have fired since the last vcpu migration. 
In a system where migrations happen frequently, this should only be a handful; in a system that migrates infrequently, this will be more, but it won’t matter, because it will happen less often. Workloads which need predictable IRQ latencies should probably be pinning their vcpus anyway. So at the moment, the scheduler already tries to avoid migrating things *a little bit* if it can (see migrate_resist). It’s not clear to me at the moment whether this is enough or not. Or to put it a different way — how long should the scheduler try to wait before moving one of these vcpus? At the moment I haven’t seen a good way of calculating this. #3 to me has the feeling of being somewhat more satisfying, but also potentially fairly complicated. Since the scheduler already does migration resistance somewhat, #1 would be simpler to implement in the short run. If it turns out that #1 has other drawbacks, we can implement #3 as and when needed. Thoughts? -George >
On Wed, 2016-12-14 at 03:39 +0100, George Dunlap wrote: > > On Dec 10, 2016, at 4:18 AM, Stefano Stabellini <sstabellini@kernel > > .org> wrote: > > > > The issue with spreading interrupts migrations over time is > > > > that it makes > > > > interrupt latency less deterministic. It is OK, in the uncommon > > > > case of > > > > vCPU migration with interrupts, to take a hit for a short time. > > > > This > > > > "hit" can be measured. It can be known. If your workload cannot > > > > tolerate > > > > it, vCPUs can be pinned. It should be a rare event anyway. On > > > > the other > > > > hand, by spreading interrupts migrations, we make it harder to > > > > predict > > > > latency. Aside from determinism, another problem with this > > > > approach is > > > > that it ensures that every interrupt assigned to a vCPU will > > > > first hit > > > > the wrong pCPU, then it will be moved. It guarantees the worst- > > > > case > > > > scenario for interrupt latency for the vCPU that has been > > > > moved. If we > > > > migrated all interrupts as soon as possible, we would minimize > > > > the > > > > amount of interrupts delivered to the wrong pCPU. Most > > > > interrupts would > > > > be delivered to the new pCPU right away, reducing interrupt > > > > latency. > > Another approach which one might take: > 3. Eagerly migrate a subset of the interrupts and lazily migrate the > others. For instance, we could eagerly migrate all the interrupts > which have fired since the last vcpu migration. In a system where > migrations happen frequently, this should only be a handful; in a > system that migrates infrequently, this will be more, but it won’t > matter, because it will happen less often. > Yes, if doable (e.g., I don't know how easy and practical is to know and keep track of fired interrupts) this looks a good solution to me too. > So at the moment, the scheduler already tries to avoid migrating > things *a little bit* if it can (see migrate_resist). 
It’s not clear > to me at the moment whether this is enough or not. > Well, true, but migration resistance, in Credit2, is just a fixed value which: 1. is set at boot time; 2. is always the same for all vcpus; 3. is always the same, no matter what a vcpu is doing. And even if we make it tunable and changeable at runtime (which I intend to do), it's still something pretty "static" because of 2 and 3. And even if we make it tunable per-vcpu (which is doable), it would be rather hard to decide to what value to set it, for each vcpu. And, of course, 3 would still apply (i.e., it would not change according to the vcpu workload or characteristics). So, it's guessing. More or less fine grained, but always guessing. On the other hand, using something proportional to nr. of routed interrupts as the migration resistance threshold would overcome all 1, 2 and 3. It would give us a migrate_resist value which is adaptive, and is determined according to the actual workload or properties of a specific vcpu. Feeding routed interrupt info to the load balancer comes from similar reasoning (and we actually may want to do both). FTR, Credit1 has a similar mechanism, i.e., it *even wildly guesses* whether a vcpu could still have some of its data in cache, and tries not to migrate it if it's likely (see __csched_vcpu_is_cache_hot()). We can improve that too, although it is a lot more complex and less predictable, as usual with Credit1. > Or to put it a different way — how long should the scheduler try to > wait before moving one of these vcpus? > Yep, it's similar to the "anticipation" problem in I/O schedulers (where "excessive seeks" ~= "too frequent migrations"). https://en.wikipedia.org/wiki/Anticipatory_scheduling > At the moment I haven’t seen a good way of calculating this. > Exactly, and basing the calculation on the number of routed interrupts --and, if possible, other metrics too-- could be that "good way" we're looking for. It would need experimenting, of course, but I like the idea. 
> #3 to me has the feeling of being somewhat more satisfying, but also > potentially fairly complicated. Since the scheduler already does > migration resistance somewhat, #1 would be a simpler to implement in > the sort run. If it turns out that #1 has other drawbacks, we can > implement #3 as and when needed. > > Thoughts? > Yes, we can do things incrementally, which is always good. I like your #1 proposal because it has the really positive side effect of bringing us in the camp of adaptive migration resistance, which is something pretty advanced and pretty cool, if we manage to do it right. :-) Regards, Dario
diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c index 6f4329f..5129d6e 100644 --- a/xen/arch/arm/gic-its.c +++ b/xen/arch/arm/gic-its.c @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its, return its_send_command(its, cmd); } +static int its_send_cmd_invall(struct host_its *its, int cpu) +{ + uint64_t cmd[4]; + + cmd[0] = GITS_CMD_INVALL; + cmd[1] = 0x00; + cmd[2] = cpu & GENMASK(15, 0); + cmd[3] = 0x00; + + return its_send_command(its, cmd); +} + int gicv3_its_map_device(struct host_its *hw_its, struct domain *d, int devid, int bits, bool valid) { @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id) return hlpi.virt_lpi; } +/* Iterate over all host LPIs, and updating the "enabled" state for a given + * guest redistributor (VCPU) given the respective state in the provided + * proptable. This proptable is indexed by the stored virtual LPI number. + * This is to implement a guest INVALL command. + */ +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable) +{ + int chunk, i; + struct host_its *its; + + for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++) + { + if ( !lpi_data.host_lpis[chunk] ) + continue; + + for (i = 0; i < HOST_LPIS_PER_PAGE; i++) + { + union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi; + uint32_t hlpi_nr; + + hlpi.data = hlpip->data; + if ( !hlpi.virt_lpi ) + continue; + + if ( hlpi.dom_id != v->domain->domain_id ) + continue; + + if ( hlpi.vcpu_id != v->vcpu_id ) + continue; + + hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i; + + if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED ) + lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED; + else + lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED; + } + } + + /* Tell all ITSes that they should update the property table for CPU 0, + * which is where we map all LPIs to. 
+ */ + list_for_each_entry(its, &host_its_list, entry) + its_send_cmd_invall(its, 0); +} + void gicv3_lpi_set_enable(struct host_its *its, uint32_t deviceid, uint32_t eventid, uint32_t host_lpi, bool enabled) diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c index 74da8fc..1e429b7 100644 --- a/xen/arch/arm/vgic-its.c +++ b/xen/arch/arm/vgic-its.c @@ -294,6 +294,33 @@ out_unlock: return ret; } +/* INVALL updates the per-LPI configuration status for every LPI mapped to + * this redistributor. For the guest side we don't need to update anything, + * as we always refer to the actual table for the enabled bit and the + * priority. + * Enabling or disabling a virtual LPI however needs to be propagated to + * the respective host LPI. Instead of iterating over all mapped LPIs in our + * emulated GIC (which is expensive due to the required on-demand mapping), + * we iterate over all mapped _host_ LPIs and filter for those which are + * forwarded to this virtual redistributor. + */ +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr) +{ + uint32_t collid = its_cmd_get_collection(cmdptr); + struct vcpu *vcpu; + + spin_lock(&its->its_lock); + vcpu = get_vcpu_from_collection(its, collid); + spin_unlock(&its->its_lock); + + if ( !vcpu ) + return -1; + + gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable); + + return 0; +} + static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr) { uint32_t collid = its_cmd_get_collection(cmdptr); @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its, case GITS_CMD_INV: its_handle_inv(its, cmdptr); break; + case GITS_CMD_INVALL: + its_handle_invall(its, cmdptr); + break; case GITS_CMD_MAPC: its_handle_mapc(its, cmdptr); break; diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h index 2cdb3e1..ba6b2d5 100644 --- a/xen/include/asm-arm/gic-its.h +++ b/xen/include/asm-arm/gic-its.h @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct 
host_its *its, uint32_t devid, uint32_t eventid, uint32_t host_lpi); +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable); + static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi) { return d->arch.vgic.proptable[lpi - 8192] & 0xfc;
The INVALL command instructs an ITS to invalidate the configuration data for all LPIs associated with a given redistributor (read: VCPU). To avoid iterating (and mapping!) all guest tables, we instead go through the host LPI table to find any LPIs targeting this VCPU. We then update the configuration bits for the connected virtual LPIs. Signed-off-by: Andre Przywara <andre.przywara@arm.com> --- xen/arch/arm/gic-its.c | 58 +++++++++++++++++++++++++++++++++++++++++++ xen/arch/arm/vgic-its.c | 30 ++++++++++++++++++++++ xen/include/asm-arm/gic-its.h | 2 ++ 3 files changed, 90 insertions(+)