[RFC,21/24] ARM: vITS: handle INVALL command

Message ID 20160928182457.12433-22-andre.przywara@arm.com (mailing list archive)
State New, archived

Commit Message

Andre Przywara Sept. 28, 2016, 6:24 p.m. UTC
The INVALL command instructs an ITS to invalidate the configuration
data for all LPIs associated with a given redistributor (read: VCPU).
To avoid iterating (and mapping!) all guest tables, we instead go through
the host LPI table to find any LPIs targeting this VCPU. We then update
the configuration bits for the connected virtual LPIs.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/gic-its.c        | 58 +++++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
 xen/include/asm-arm/gic-its.h |  2 ++
 3 files changed, 90 insertions(+)

Comments

Vijay Kilari Oct. 24, 2016, 3:32 p.m. UTC | #1
On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote:
> The INVALL command instructs an ITS to invalidate the configuration
> data for all LPIs associated with a given redistributor (read: VCPU).
> To avoid iterating (and mapping!) all guest tables, we instead go through
> the host LPI table to find any LPIs targetting this VCPU. We then update
> the configuration bits for the connected virtual LPIs.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  xen/arch/arm/gic-its.c        | 58 +++++++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
>  xen/include/asm-arm/gic-its.h |  2 ++
>  3 files changed, 90 insertions(+)
>
> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
> index 6f4329f..5129d6e 100644
> --- a/xen/arch/arm/gic-its.c
> +++ b/xen/arch/arm/gic-its.c
> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its,
>      return its_send_command(its, cmd);
>  }
>
> +static int its_send_cmd_invall(struct host_its *its, int cpu)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_INVALL;
> +    cmd[1] = 0x00;
> +    cmd[2] = cpu & GENMASK(15, 0);
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
>                           int devid, int bits, bool valid)
>  {
> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id)
>      return hlpi.virt_lpi;
>  }
>
> +/* Iterate over all host LPIs, and updating the "enabled" state for a given
> + * guest redistributor (VCPU) given the respective state in the provided
> + * proptable. This proptable is indexed by the stored virtual LPI number.
> + * This is to implement a guest INVALL command.
> + */
> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable)
> +{
> +    int chunk, i;
> +    struct host_its *its;
> +
> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
> +    {
> +        if ( !lpi_data.host_lpis[chunk] )
> +            continue;
> +
> +        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
> +        {
> +            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi;
> +            uint32_t hlpi_nr;
> +
> +            hlpi.data = hlpip->data;
> +            if ( !hlpi.virt_lpi )
> +                continue;
> +
> +            if ( hlpi.dom_id != v->domain->domain_id )
> +                continue;
> +
> +            if ( hlpi.vcpu_id != v->vcpu_id )
> +                continue;
> +
> +            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
> +
> +            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
> +                lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
> +            else
> +                lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
> +        }
> +    }
        AFAIK, the initial design is to use a tasklet to update the property
table, as it consumes a lot of time to update the table.

> +
> +    /* Tell all ITSes that they should update the property table for CPU 0,
> +     * which is where we map all LPIs to.
> +     */
> +    list_for_each_entry(its, &host_its_list, entry)
> +        its_send_cmd_invall(its, 0);
> +}
> +
>  void gicv3_lpi_set_enable(struct host_its *its,
>                            uint32_t deviceid, uint32_t eventid,
>                            uint32_t host_lpi, bool enabled)
> diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c
> index 74da8fc..1e429b7 100644
> --- a/xen/arch/arm/vgic-its.c
> +++ b/xen/arch/arm/vgic-its.c
> @@ -294,6 +294,33 @@ out_unlock:
>      return ret;
>  }
>
> +/* INVALL updates the per-LPI configuration status for every LPI mapped to
> + * this redistributor. For the guest side we don't need to update anything,
> + * as we always refer to the actual table for the enabled bit and the
> + * priority.
> + * Enabling or disabling a virtual LPI however needs to be propagated to
> + * the respective host LPI. Instead of iterating over all mapped LPIs in our
> + * emulated GIC (which is expensive due to the required on-demand mapping),
> + * we iterate over all mapped _host_ LPIs and filter for those which are
> + * forwarded to this virtual redistributor.
> + */
> +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr)
> +{
> +    uint32_t collid = its_cmd_get_collection(cmdptr);
> +    struct vcpu *vcpu;
> +
> +    spin_lock(&its->its_lock);
> +    vcpu = get_vcpu_from_collection(its, collid);
> +    spin_unlock(&its->its_lock);
> +
> +    if ( !vcpu )
> +        return -1;
> +
> +    gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable);
> +
> +    return 0;
> +}
> +
>  static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr)
>  {
>      uint32_t collid = its_cmd_get_collection(cmdptr);
> @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its,
>          case GITS_CMD_INV:
>              its_handle_inv(its, cmdptr);
>             break;
> +        case GITS_CMD_INVALL:
> +            its_handle_invall(its, cmdptr);
> +           break;
>          case GITS_CMD_MAPC:
>              its_handle_mapc(its, cmdptr);
>              break;
> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
> index 2cdb3e1..ba6b2d5 100644
> --- a/xen/include/asm-arm/gic-its.h
> +++ b/xen/include/asm-arm/gic-its.h
> @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its,
>                              uint32_t devid, uint32_t eventid,
>                              uint32_t host_lpi);
>
> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable);
> +
>  static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi)
>  {
>      return d->arch.vgic.proptable[lpi - 8192] & 0xfc;
> --
> 2.9.0
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
Andre Przywara Nov. 4, 2016, 9:22 a.m. UTC | #2
Hi,

On 24/10/16 16:32, Vijay Kilari wrote:
> On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote:
>> The INVALL command instructs an ITS to invalidate the configuration
>> data for all LPIs associated with a given redistributor (read: VCPU).
>> To avoid iterating (and mapping!) all guest tables, we instead go through
>> the host LPI table to find any LPIs targetting this VCPU. We then update
>> the configuration bits for the connected virtual LPIs.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> ---
>>  xen/arch/arm/gic-its.c        | 58 +++++++++++++++++++++++++++++++++++++++++++
>>  xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
>>  xen/include/asm-arm/gic-its.h |  2 ++
>>  3 files changed, 90 insertions(+)
>>
>> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
>> index 6f4329f..5129d6e 100644
>> --- a/xen/arch/arm/gic-its.c
>> +++ b/xen/arch/arm/gic-its.c
>> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its,
>>      return its_send_command(its, cmd);
>>  }
>>
>> +static int its_send_cmd_invall(struct host_its *its, int cpu)
>> +{
>> +    uint64_t cmd[4];
>> +
>> +    cmd[0] = GITS_CMD_INVALL;
>> +    cmd[1] = 0x00;
>> +    cmd[2] = cpu & GENMASK(15, 0);
>> +    cmd[3] = 0x00;
>> +
>> +    return its_send_command(its, cmd);
>> +}
>> +
>>  int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
>>                           int devid, int bits, bool valid)
>>  {
>> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id)
>>      return hlpi.virt_lpi;
>>  }
>>
>> +/* Iterate over all host LPIs, and updating the "enabled" state for a given
>> + * guest redistributor (VCPU) given the respective state in the provided
>> + * proptable. This proptable is indexed by the stored virtual LPI number.
>> + * This is to implement a guest INVALL command.
>> + */
>> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable)
>> +{
>> +    int chunk, i;
>> +    struct host_its *its;
>> +
>> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
>> +    {
>> +        if ( !lpi_data.host_lpis[chunk] )
>> +            continue;
>> +
>> +        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
>> +        {
>> +            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi;
>> +            uint32_t hlpi_nr;
>> +
>> +            hlpi.data = hlpip->data;
>> +            if ( !hlpi.virt_lpi )
>> +                continue;
>> +
>> +            if ( hlpi.dom_id != v->domain->domain_id )
>> +                continue;
>> +
>> +            if ( hlpi.vcpu_id != v->vcpu_id )
>> +                continue;
>> +
>> +            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
>> +
>> +            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
>> +                lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
>> +            else
>> +                lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
>> +        }
>> +    }
>         AFAIK, the initial design is to use tasklet to update property
> table as it consumes
> lot of time to update the table.

This is a possible, but premature, optimization.
Linux (at the moment, at least) only calls INVALL _once_, just after
initialising the collections. And at this point no LPI is mapped, so the
whole routine does basically nothing - and does that quite fast.
We can later have any kind of fancy algorithm if there is a need for one.

Cheers,
Andre.


>> +
>> +    /* Tell all ITSes that they should update the property table for CPU 0,
>> +     * which is where we map all LPIs to.
>> +     */
>> +    list_for_each_entry(its, &host_its_list, entry)
>> +        its_send_cmd_invall(its, 0);
>> +}
>> +
>>  void gicv3_lpi_set_enable(struct host_its *its,
>>                            uint32_t deviceid, uint32_t eventid,
>>                            uint32_t host_lpi, bool enabled)
>> diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c
>> index 74da8fc..1e429b7 100644
>> --- a/xen/arch/arm/vgic-its.c
>> +++ b/xen/arch/arm/vgic-its.c
>> @@ -294,6 +294,33 @@ out_unlock:
>>      return ret;
>>  }
>>
>> +/* INVALL updates the per-LPI configuration status for every LPI mapped to
>> + * this redistributor. For the guest side we don't need to update anything,
>> + * as we always refer to the actual table for the enabled bit and the
>> + * priority.
>> + * Enabling or disabling a virtual LPI however needs to be propagated to
>> + * the respective host LPI. Instead of iterating over all mapped LPIs in our
>> + * emulated GIC (which is expensive due to the required on-demand mapping),
>> + * we iterate over all mapped _host_ LPIs and filter for those which are
>> + * forwarded to this virtual redistributor.
>> + */
>> +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr)
>> +{
>> +    uint32_t collid = its_cmd_get_collection(cmdptr);
>> +    struct vcpu *vcpu;
>> +
>> +    spin_lock(&its->its_lock);
>> +    vcpu = get_vcpu_from_collection(its, collid);
>> +    spin_unlock(&its->its_lock);
>> +
>> +    if ( !vcpu )
>> +        return -1;
>> +
>> +    gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable);
>> +
>> +    return 0;
>> +}
>> +
>>  static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr)
>>  {
>>      uint32_t collid = its_cmd_get_collection(cmdptr);
>> @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its,
>>          case GITS_CMD_INV:
>>              its_handle_inv(its, cmdptr);
>>             break;
>> +        case GITS_CMD_INVALL:
>> +            its_handle_invall(its, cmdptr);
>> +           break;
>>          case GITS_CMD_MAPC:
>>              its_handle_mapc(its, cmdptr);
>>              break;
>> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
>> index 2cdb3e1..ba6b2d5 100644
>> --- a/xen/include/asm-arm/gic-its.h
>> +++ b/xen/include/asm-arm/gic-its.h
>> @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its,
>>                              uint32_t devid, uint32_t eventid,
>>                              uint32_t host_lpi);
>>
>> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable);
>> +
>>  static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi)
>>  {
>>      return d->arch.vgic.proptable[lpi - 8192] & 0xfc;
>> --
>> 2.9.0
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> https://lists.xen.org/xen-devel
>
Stefano Stabellini Nov. 10, 2016, 12:21 a.m. UTC | #3
On Fri, 4 Nov 2016, Andre Przywara wrote:
> Hi,
> 
> On 24/10/16 16:32, Vijay Kilari wrote:
> > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote:
> >> The INVALL command instructs an ITS to invalidate the configuration
> >> data for all LPIs associated with a given redistributor (read: VCPU).
> >> To avoid iterating (and mapping!) all guest tables, we instead go through
> >> the host LPI table to find any LPIs targetting this VCPU. We then update
> >> the configuration bits for the connected virtual LPIs.
> >>
> >> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> >> ---
> >>  xen/arch/arm/gic-its.c        | 58 +++++++++++++++++++++++++++++++++++++++++++
> >>  xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
> >>  xen/include/asm-arm/gic-its.h |  2 ++
> >>  3 files changed, 90 insertions(+)
> >>
> >> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
> >> index 6f4329f..5129d6e 100644
> >> --- a/xen/arch/arm/gic-its.c
> >> +++ b/xen/arch/arm/gic-its.c
> >> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its,
> >>      return its_send_command(its, cmd);
> >>  }
> >>
> >> +static int its_send_cmd_invall(struct host_its *its, int cpu)
> >> +{
> >> +    uint64_t cmd[4];
> >> +
> >> +    cmd[0] = GITS_CMD_INVALL;
> >> +    cmd[1] = 0x00;
> >> +    cmd[2] = cpu & GENMASK(15, 0);
> >> +    cmd[3] = 0x00;
> >> +
> >> +    return its_send_command(its, cmd);
> >> +}
> >> +
> >>  int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
> >>                           int devid, int bits, bool valid)
> >>  {
> >> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id)
> >>      return hlpi.virt_lpi;
> >>  }
> >>
> >> +/* Iterate over all host LPIs, and updating the "enabled" state for a given
> >> + * guest redistributor (VCPU) given the respective state in the provided
> >> + * proptable. This proptable is indexed by the stored virtual LPI number.
> >> + * This is to implement a guest INVALL command.
> >> + */
> >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable)
> >> +{
> >> +    int chunk, i;
> >> +    struct host_its *its;
> >> +
> >> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
> >> +    {
> >> +        if ( !lpi_data.host_lpis[chunk] )
> >> +            continue;
> >> +
> >> +        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
> >> +        {
> >> +            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi;
> >> +            uint32_t hlpi_nr;
> >> +
> >> +            hlpi.data = hlpip->data;
> >> +            if ( !hlpi.virt_lpi )
> >> +                continue;
> >> +
> >> +            if ( hlpi.dom_id != v->domain->domain_id )
> >> +                continue;
> >> +
> >> +            if ( hlpi.vcpu_id != v->vcpu_id )
> >> +                continue;
> >> +
> >> +            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
> >> +
> >> +            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
> >> +                lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
> >> +            else
> >> +                lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
> >> +        }
> >> +    }
> >         AFAIK, the initial design is to use tasklet to update property
> > table as it consumes
> > lot of time to update the table.
> 
> This is a possible, but premature optimization.
> Linux (at the moment, at least) only calls INVALL _once_, just after
> initialising the collections. And at this point no LPI is mapped, so the
> whole routine does basically nothing - and that quite fast.
> We can later have any kind of fancy algorithm if there is a need for.

I understand, but as-is it's so expensive that it could be a DoS vector.
Also other OSes could issue INVALL much more often than Linux.

Considering that we might support device assignment with ITS soon, I
think it might be best to parse per-domain virtual tables rather than
the full list of physical LPIs, which theoretically could be much
larger. Or alternatively we need to think about adding another field to
lpi_data, to link together all lpis assigned to the same domain, but
that would cost even more memory. Or we could rate-limit the INVALL
calls to one every few seconds or something. Or all of the above :-)

We need to protect Xen from too frequent and too expensive requests like
this.
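
To illustrate the "extra field" alternative above, here is a sketch only:
the field name, its width, and the host_lpi layout (guessed from the
accessors used in the patch) are all illustrative, not part of the series.

#include <stdint.h>

/* Sketch: chain together all host LPIs owned by one domain, so that
 * INVALL emulation can walk just that domain's LPIs instead of the
 * whole host LPI table. next_in_domain is the hypothetical extra field
 * (4 bytes per LPI, which is the memory cost mentioned above).
 */
union host_lpi {
    uint64_t data;
    struct {
        uint32_t virt_lpi;
        uint16_t dom_id;
        uint16_t vcpu_id;
    };
};

struct host_lpi_entry {
    union host_lpi hlpi;
    uint32_t next_in_domain;    /* index of the next LPI of the same domain */
};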


> >> +
> >> +    /* Tell all ITSes that they should update the property table for CPU 0,
> >> +     * which is where we map all LPIs to.
> >> +     */
> >> +    list_for_each_entry(its, &host_its_list, entry)
> >> +        its_send_cmd_invall(its, 0);
> >> +}
> >> +
> >>  void gicv3_lpi_set_enable(struct host_its *its,
> >>                            uint32_t deviceid, uint32_t eventid,
> >>                            uint32_t host_lpi, bool enabled)
> >> diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c
> >> index 74da8fc..1e429b7 100644
> >> --- a/xen/arch/arm/vgic-its.c
> >> +++ b/xen/arch/arm/vgic-its.c
> >> @@ -294,6 +294,33 @@ out_unlock:
> >>      return ret;
> >>  }
> >>
> >> +/* INVALL updates the per-LPI configuration status for every LPI mapped to
> >> + * this redistributor. For the guest side we don't need to update anything,
> >> + * as we always refer to the actual table for the enabled bit and the
> >> + * priority.
> >> + * Enabling or disabling a virtual LPI however needs to be propagated to
> >> + * the respective host LPI. Instead of iterating over all mapped LPIs in our
> >> + * emulated GIC (which is expensive due to the required on-demand mapping),
> >> + * we iterate over all mapped _host_ LPIs and filter for those which are
> >> + * forwarded to this virtual redistributor.
> >> + */
> >> +static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr)
> >> +{
> >> +    uint32_t collid = its_cmd_get_collection(cmdptr);
> >> +    struct vcpu *vcpu;
> >> +
> >> +    spin_lock(&its->its_lock);
> >> +    vcpu = get_vcpu_from_collection(its, collid);
> >> +    spin_unlock(&its->its_lock);
> >> +
> >> +    if ( !vcpu )
> >> +        return -1;
> >> +
> >> +    gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr)
> >>  {
> >>      uint32_t collid = its_cmd_get_collection(cmdptr);
> >> @@ -515,6 +542,9 @@ static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its,
> >>          case GITS_CMD_INV:
> >>              its_handle_inv(its, cmdptr);
> >>             break;
> >> +        case GITS_CMD_INVALL:
> >> +            its_handle_invall(its, cmdptr);
> >> +           break;
> >>          case GITS_CMD_MAPC:
> >>              its_handle_mapc(its, cmdptr);
> >>              break;
> >> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
> >> index 2cdb3e1..ba6b2d5 100644
> >> --- a/xen/include/asm-arm/gic-its.h
> >> +++ b/xen/include/asm-arm/gic-its.h
> >> @@ -146,6 +146,8 @@ int gicv3_lpi_drop_host_lpi(struct host_its *its,
> >>                              uint32_t devid, uint32_t eventid,
> >>                              uint32_t host_lpi);
> >>
> >> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable);
> >> +
> >>  static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi)
> >>  {
> >>      return d->arch.vgic.proptable[lpi - 8192] & 0xfc;
> >> --
> >> 2.9.0
> >>
> >>
> >> _______________________________________________
> >> Xen-devel mailing list
> >> Xen-devel@lists.xen.org
> >> https://lists.xen.org/xen-devel
> > 
>
Julien Grall Nov. 10, 2016, 11:57 a.m. UTC | #4
Hi,

On 10/11/16 00:21, Stefano Stabellini wrote:
> On Fri, 4 Nov 2016, Andre Przywara wrote:
>> On 24/10/16 16:32, Vijay Kilari wrote:
>>> On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara <andre.przywara@arm.com> wrote:
>>>> The INVALL command instructs an ITS to invalidate the configuration
>>>> data for all LPIs associated with a given redistributor (read: VCPU).
>>>> To avoid iterating (and mapping!) all guest tables, we instead go through
>>>> the host LPI table to find any LPIs targetting this VCPU. We then update
>>>> the configuration bits for the connected virtual LPIs.
>>>>
>>>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>>>> ---
>>>>  xen/arch/arm/gic-its.c        | 58 +++++++++++++++++++++++++++++++++++++++++++
>>>>  xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
>>>>  xen/include/asm-arm/gic-its.h |  2 ++
>>>>  3 files changed, 90 insertions(+)
>>>>
>>>> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
>>>> index 6f4329f..5129d6e 100644
>>>> --- a/xen/arch/arm/gic-its.c
>>>> +++ b/xen/arch/arm/gic-its.c
>>>> @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its,
>>>>      return its_send_command(its, cmd);
>>>>  }
>>>>
>>>> +static int its_send_cmd_invall(struct host_its *its, int cpu)
>>>> +{
>>>> +    uint64_t cmd[4];
>>>> +
>>>> +    cmd[0] = GITS_CMD_INVALL;
>>>> +    cmd[1] = 0x00;
>>>> +    cmd[2] = cpu & GENMASK(15, 0);
>>>> +    cmd[3] = 0x00;
>>>> +
>>>> +    return its_send_command(its, cmd);
>>>> +}
>>>> +
>>>>  int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
>>>>                           int devid, int bits, bool valid)
>>>>  {
>>>> @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id)
>>>>      return hlpi.virt_lpi;
>>>>  }
>>>>
>>>> +/* Iterate over all host LPIs, and updating the "enabled" state for a given
>>>> + * guest redistributor (VCPU) given the respective state in the provided
>>>> + * proptable. This proptable is indexed by the stored virtual LPI number.
>>>> + * This is to implement a guest INVALL command.
>>>> + */
>>>> +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable)
>>>> +{
>>>> +    int chunk, i;
>>>> +    struct host_its *its;
>>>> +
>>>> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
>>>> +    {
>>>> +        if ( !lpi_data.host_lpis[chunk] )
>>>> +            continue;
>>>> +
>>>> +        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
>>>> +        {
>>>> +            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi;
>>>> +            uint32_t hlpi_nr;
>>>> +
>>>> +            hlpi.data = hlpip->data;
>>>> +            if ( !hlpi.virt_lpi )
>>>> +                continue;
>>>> +
>>>> +            if ( hlpi.dom_id != v->domain->domain_id )
>>>> +                continue;
>>>> +
>>>> +            if ( hlpi.vcpu_id != v->vcpu_id )
>>>> +                continue;
>>>> +
>>>> +            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
>>>> +
>>>> +            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
>>>> +                lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
>>>> +            else
>>>> +                lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
>>>> +        }
>>>> +    }
>>>         AFAIK, the initial design is to use tasklet to update property
>>> table as it consumes
>>> lot of time to update the table.
>>
>> This is a possible, but premature optimization.
>> Linux (at the moment, at least) only calls INVALL _once_, just after
>> initialising the collections. And at this point no LPI is mapped, so the
>> whole routine does basically nothing - and that quite fast.
>> We can later have any kind of fancy algorithm if there is a need for.
>
> I understand, but as-is it's so expensive that could be a DOS vector.
> Also other OSes could issue INVALL much more often than Linux.
>
> Considering that we might support device assigment with ITS soon, I
> think it might be best to parse per-domain virtual tables rather than
> the full list of physical LPIs, which theoretically could be much
> larger. Or alternatively we need to think about adding another field to
> lpi_data, to link together all lpis assigned to the same domain, but
> that would cost even more memory. Or we could rate-limit the INVALL
> calls to one every few seconds or something. Or all of the above :-)

It is not necessary for an ITS implementation to wait until an 
INVALL/INV command is issued to take into account the change of the LPI 
configuration tables (aka property table in this thread).

So how about trapping the property table? We would still have to go 
through the property table the first time (i.e when writing into the 
GICR_PROPBASER), but INVALL would be a nop.

The idea would be to unmap the region when GICR_PROPBASER is written, so 
any read/write access would be trapped. For a write access, Xen would 
update its internal LPI data structures and then perform the write to the 
(now unmapped) guest page on the guest's behalf. If we don't want any 
overhead on read accesses, we could instead just write-protect the page in 
the stage-2 page table, so that only write accesses would be trapped.

Going further, for the ITS, Xen is using the guest memory to store the 
ITS information. This means Xen has to validate the information at every 
access. So how about restricting the access in stage-2 page table? That 
would remove the overhead of validating data.

Any thoughts?

Regards,
Stefano Stabellini Nov. 10, 2016, 8:42 p.m. UTC | #5
On Thu, 10 Nov 2016, Julien Grall wrote:
> Hi,
> 
> On 10/11/16 00:21, Stefano Stabellini wrote:
> > On Fri, 4 Nov 2016, Andre Przywara wrote:
> > > On 24/10/16 16:32, Vijay Kilari wrote:
> > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara
> > > > <andre.przywara@arm.com> wrote:
> > > > > The INVALL command instructs an ITS to invalidate the configuration
> > > > > data for all LPIs associated with a given redistributor (read: VCPU).
> > > > > To avoid iterating (and mapping!) all guest tables, we instead go
> > > > > through
> > > > > the host LPI table to find any LPIs targetting this VCPU. We then
> > > > > update
> > > > > the configuration bits for the connected virtual LPIs.
> > > > > 
> > > > > Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> > > > > ---
> > > > >  xen/arch/arm/gic-its.c        | 58
> > > > > +++++++++++++++++++++++++++++++++++++++++++
> > > > >  xen/arch/arm/vgic-its.c       | 30 ++++++++++++++++++++++
> > > > >  xen/include/asm-arm/gic-its.h |  2 ++
> > > > >  3 files changed, 90 insertions(+)
> > > > > 
> > > > > diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
> > > > > index 6f4329f..5129d6e 100644
> > > > > --- a/xen/arch/arm/gic-its.c
> > > > > +++ b/xen/arch/arm/gic-its.c
> > > > > @@ -228,6 +228,18 @@ static int its_send_cmd_inv(struct host_its *its,
> > > > >      return its_send_command(its, cmd);
> > > > >  }
> > > > > 
> > > > > +static int its_send_cmd_invall(struct host_its *its, int cpu)
> > > > > +{
> > > > > +    uint64_t cmd[4];
> > > > > +
> > > > > +    cmd[0] = GITS_CMD_INVALL;
> > > > > +    cmd[1] = 0x00;
> > > > > +    cmd[2] = cpu & GENMASK(15, 0);
> > > > > +    cmd[3] = 0x00;
> > > > > +
> > > > > +    return its_send_command(its, cmd);
> > > > > +}
> > > > > +
> > > > >  int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
> > > > >                           int devid, int bits, bool valid)
> > > > >  {
> > > > > @@ -668,6 +680,52 @@ uint32_t gicv3_lpi_lookup_lpi(struct domain *d,
> > > > > uint32_t host_lpi, int *vcpu_id)
> > > > >      return hlpi.virt_lpi;
> > > > >  }
> > > > > 
> > > > > +/* Iterate over all host LPIs, and updating the "enabled" state for a
> > > > > given
> > > > > + * guest redistributor (VCPU) given the respective state in the
> > > > > provided
> > > > > + * proptable. This proptable is indexed by the stored virtual LPI
> > > > > number.
> > > > > + * This is to implement a guest INVALL command.
> > > > > + */
> > > > > +void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t
> > > > > *proptable)
> > > > > +{
> > > > > +    int chunk, i;
> > > > > +    struct host_its *its;
> > > > > +
> > > > > +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE;
> > > > > chunk++)
> > > > > +    {
> > > > > +        if ( !lpi_data.host_lpis[chunk] )
> > > > > +            continue;
> > > > > +
> > > > > +        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
> > > > > +        {
> > > > > +            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i],
> > > > > hlpi;
> > > > > +            uint32_t hlpi_nr;
> > > > > +
> > > > > +            hlpi.data = hlpip->data;
> > > > > +            if ( !hlpi.virt_lpi )
> > > > > +                continue;
> > > > > +
> > > > > +            if ( hlpi.dom_id != v->domain->domain_id )
> > > > > +                continue;
> > > > > +
> > > > > +            if ( hlpi.vcpu_id != v->vcpu_id )
> > > > > +                continue;
> > > > > +
> > > > > +            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
> > > > > +
> > > > > +            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
> > > > > +                lpi_data.lpi_property[hlpi_nr - 8192] |=
> > > > > LPI_PROP_ENABLED;
> > > > > +            else
> > > > > +                lpi_data.lpi_property[hlpi_nr - 8192] &=
> > > > > ~LPI_PROP_ENABLED;
> > > > > +        }
> > > > > +    }
> > > >         AFAIK, the initial design is to use tasklet to update property
> > > > table as it consumes
> > > > lot of time to update the table.
> > > 
> > > This is a possible, but premature optimization.
> > > Linux (at the moment, at least) only calls INVALL _once_, just after
> > > initialising the collections. And at this point no LPI is mapped, so the
> > > whole routine does basically nothing - and that quite fast.
> > > We can later have any kind of fancy algorithm if there is a need for.
> > 
> > I understand, but as-is it's so expensive that could be a DOS vector.
> > Also other OSes could issue INVALL much more often than Linux.
> > 
> > Considering that we might support device assigment with ITS soon, I
> > think it might be best to parse per-domain virtual tables rather than
> > the full list of physical LPIs, which theoretically could be much
> > larger. Or alternatively we need to think about adding another field to
> > lpi_data, to link together all lpis assigned to the same domain, but
> > that would cost even more memory. Or we could rate-limit the INVALL
> > calls to one every few seconds or something. Or all of the above :-)
> 
> It is not necessary for an ITS implementation to wait until an INVALL/INV
> command is issued to take into account the change of the LPI configuration
> tables (aka property table in this thread).
> 
> So how about trapping the property table? We would still have to go through
> the property table the first time (i.e when writing into the GICR_PROPBASER),
> but INVALL would be a nop.
> 
> The idea would be unmapping the region when GICR_PROPBASER is written. So any
> read/write access would be trapped. For a write access, Xen will update the
> LPIs internal data structures and write the value in the guest page unmapped.
> If we don't want to have an overhead for the read access, we could just
> write-protect the page in stage-2 page table. So only write access would be
> trapped.
> 
> Going further, for the ITS, Xen is using the guest memory to store the ITS
> information. This means Xen has to validate the information at every access.
> So how about restricting the access in stage-2 page table? That would remove
> the overhead of validating data.
> 
> Any thoughts?

It is a promising idea. Let me expand on this.

I agree that on INVALL if we need to do anything, we should go through
the virtual property table rather than the full list of host lpis.

Once we agree on that, the two options we have are:

1) We let the guest write anything to the table, then we do a full
validation of the table on INVALL. We also do a validation of the table
entries used as parameters for any other commands.

2) We map the table read-only, then do a validation of every guest
write. INVALL becomes a NOP and parameters validation for many commands
could be removed or at least reduced.

Conceptually the two options should both lead to exactly the same
result. Therefore I think the decision should be made purely on
performance: which one is faster?  If it is true that INVALL is only
typically called once I suspect that 1) is faster, but I would like to
see some simple benchmarks, such as the time that it takes to configure
the ITS from scratch with the two approaches.


That said, even if 1) turns out to be faster and the approach of choice,
the idea of making the tables read-only in stage-2 could still be useful
to simplify parameter validation and protect Xen from concurrent
changes of the table entries from another guest vcpu. If the tables are
RW, we need to be very careful in Xen and use barriers to avoid
re-reading any guest table entry twice, as the guest could be changing
it in parallel to exploit the hypervisor.
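
For instance (a sketch only, assuming ACCESS_ONCE or an equivalent
single-copy read is available), a guest-writable entry would be
snapshotted once before being tested and used, mirroring the loop body
of the patch:

/* Sketch: read a guest-writable property table entry exactly once into
 * a local copy, so the test and any later use cannot observe two
 * different values written by the guest in between.
 */
static void update_one_host_lpi(uint8_t *proptable, uint32_t vlpi, uint32_t hlpi_nr)
{
    uint8_t prop = ACCESS_ONCE(proptable[vlpi]);   /* single read of guest memory */

    if ( prop & LPI_PROP_ENABLED )
        lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
    else
        lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
}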
Julien Grall Nov. 11, 2016, 3:53 p.m. UTC | #6
Hi Stefano,

On 10/11/16 20:42, Stefano Stabellini wrote:
> On Thu, 10 Nov 2016, Julien Grall wrote:
>> On 10/11/16 00:21, Stefano Stabellini wrote:
>>> On Fri, 4 Nov 2016, Andre Przywara wrote:
>>>> On 24/10/16 16:32, Vijay Kilari wrote:
>>>>> On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara
>>>>>         AFAIK, the initial design is to use tasklet to update property
>>>>> table as it consumes
>>>>> lot of time to update the table.
>>>>
>>>> This is a possible, but premature optimization.
>>>> Linux (at the moment, at least) only calls INVALL _once_, just after
>>>> initialising the collections. And at this point no LPI is mapped, so the
>>>> whole routine does basically nothing - and that quite fast.
>>>> We can later have any kind of fancy algorithm if there is a need for.
>>>
>>> I understand, but as-is it's so expensive that could be a DOS vector.
>>> Also other OSes could issue INVALL much more often than Linux.
>>>
>>> Considering that we might support device assigment with ITS soon, I
>>> think it might be best to parse per-domain virtual tables rather than
>>> the full list of physical LPIs, which theoretically could be much
>>> larger. Or alternatively we need to think about adding another field to
>>> lpi_data, to link together all lpis assigned to the same domain, but
>>> that would cost even more memory. Or we could rate-limit the INVALL
>>> calls to one every few seconds or something. Or all of the above :-)
>>
>> It is not necessary for an ITS implementation to wait until an INVALL/INV
>> command is issued to take into account the change of the LPI configuration
>> tables (aka property table in this thread).
>>
>> So how about trapping the property table? We would still have to go through
>> the property table the first time (i.e when writing into the GICR_PROPBASER),
>> but INVALL would be a nop.
>>
>> The idea would be unmapping the region when GICR_PROPBASER is written. So any
>> read/write access would be trapped. For a write access, Xen will update the
>> LPIs internal data structures and write the value in the guest page unmapped.
>> If we don't want to have an overhead for the read access, we could just
>> write-protect the page in stage-2 page table. So only write access would be
>> trapped.
>>
>> Going further, for the ITS, Xen is using the guest memory to store the ITS
>> information. This means Xen has to validate the information at every access.
>> So how about restricting the access in stage-2 page table? That would remove
>> the overhead of validating data.
>>
>> Any thoughts?
>
> It is a promising idea. Let me expand on this.
>
> I agree that on INVALL if we need to do anything, we should go through
> the virtual property table rather than the full list of host lpis.

I agree on that.

>
> Once we agree on that, the two options we have are:

I believe we had a similar discussion when Vijay worked on the vITS (see 
[1]). I would have hoped that this new proposal took into account the 
constraint mentioned back then.

>
> 1) We let the guest write anything to the table, then we do a full
> validation of the table on INVALL. We also do a validation of the table
> entries used as parameters for any other commands.
>
> 2) We map the table read-only, then do a validation of every guest
> write. INVALL becomes a NOP and parameters validation for many commands
> could be removed or at least reduced.
>
> Conceptually the two options should both lead to exactly the same
> result. Therefore I think the decision should be made purely on
> performance: which one is faster?  If it is true that INVALL is only
> typically called once I suspect that 1) is faster, but I would like to
> see some simple benchmarks, such as the time that it takes to configure
> the ITS from scratch with the two approaches.

The problem is not which one is faster but which one will not take down 
the hypervisor.

The guest is allowed to create a command queue as big as 1MB, and a command 
uses 32 bytes, so the command queue can fit up to 32640 commands.

Now imagine a malicious guest filling up the command queue with INVALL 
and then notifying the vITS (via GITS_CWRITER). Based on patch #5, all those 
commands will be handled in one go. So you have to multiply the time of one 
command by 32640.

Given that the hypervisor is not preemptible, it likely means a DoS.
A similar problem would happen if a vITS command is translated to an ITS 
command (see the implementation of INVALL). Multiple malicious guests 
could slow down the other guests by filling up the host command queue. 
Worse, a command from a normal guest could be discarded because the host 
ITS command queue is full (see its_send_command in gic-its.c).

That's why the approach we had in the previous series was that "host ITS 
commands should be limited when emulating guest ITS commands". From my 
recollection, in that series the host and guest LPIs were fully separated 
(enabling a guest LPI did not enable the host LPI).

That said, a design doc explaining all the constraints and code flow 
would have been really helpful. It took me a while to digest and 
understand the interaction between each part of the code. The design 
document would have also been a good place to discuss problems 
that span across multiple patches (like the command queue emulation).

>
>
> That said, even if 1) turns out to be faster and the approach of choice,
> the idea of making the tables read-only in stage-2 could still be useful
> to simplify parameters validation and protect Xen from concurrent
> changes of the table entries from another guest vcpu. If the tables as
> RW, we need to be very careful in Xen and use barriers to avoid
> re-reading any guest table entry twice, as the guest could be changing
> it in parallel to exploit the hypervisor.

Yes, and this is true for all the tables (PROPBASER, BASER, ...) that 
reside in guest memory. Most of them should not be touched by the guest.

This is the same for the command queue (patch #12): accessing the 
command directly from guest memory is not safe, as the guest could modify 
the value behind our back.
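
As a minimal illustration of that last point (a sketch; the helper name is
made up and error handling is omitted): the 32-byte command would be copied
out of guest memory once, and only the private copy decoded afterwards.

#include <stdint.h>
#include <string.h>

#define ITS_CMD_SIZE 32   /* one ITS command is 4 x 64-bit words */

/* Sketch: snapshot one command from the guest's command queue into
 * Xen-private memory before decoding it, so the guest cannot change
 * fields between validation and use.
 */
static void snapshot_its_command(const uint64_t *guest_cmdptr, uint64_t cmd[4])
{
    memcpy(cmd, guest_cmdptr, ITS_CMD_SIZE);
    /* From here on, only cmd[] is ever read, never guest_cmdptr. */
}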

Regards,

[1] https://xenbits.xen.org/people/ianc/vits/draftG.html
Stefano Stabellini Nov. 11, 2016, 8:31 p.m. UTC | #7
On Fri, 11 Nov 2016, Julien Grall wrote:
> Hi Stefano,
> 
> On 10/11/16 20:42, Stefano Stabellini wrote:
> > On Thu, 10 Nov 2016, Julien Grall wrote:
> > > On 10/11/16 00:21, Stefano Stabellini wrote:
> > > > On Fri, 4 Nov 2016, Andre Przywara wrote:
> > > > > On 24/10/16 16:32, Vijay Kilari wrote:
> > > > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara
> > > > > >         AFAIK, the initial design is to use tasklet to update
> > > > > > property
> > > > > > table as it consumes
> > > > > > lot of time to update the table.
> > > > > 
> > > > > This is a possible, but premature optimization.
> > > > > Linux (at the moment, at least) only calls INVALL _once_, just after
> > > > > initialising the collections. And at this point no LPI is mapped, so
> > > > > the
> > > > > whole routine does basically nothing - and that quite fast.
> > > > > We can later have any kind of fancy algorithm if there is a need for.
> > > > 
> > > > I understand, but as-is it's so expensive that could be a DOS vector.
> > > > Also other OSes could issue INVALL much more often than Linux.
> > > > 
> > > > Considering that we might support device assigment with ITS soon, I
> > > > think it might be best to parse per-domain virtual tables rather than
> > > > the full list of physical LPIs, which theoretically could be much
> > > > larger. Or alternatively we need to think about adding another field to
> > > > lpi_data, to link together all lpis assigned to the same domain, but
> > > > that would cost even more memory. Or we could rate-limit the INVALL
> > > > calls to one every few seconds or something. Or all of the above :-)
> > > 
> > > It is not necessary for an ITS implementation to wait until an INVALL/INV
> > > command is issued to take into account the change of the LPI configuration
> > > tables (aka property table in this thread).
> > > 
> > > So how about trapping the property table? We would still have to go
> > > through
> > > the property table the first time (i.e when writing into the
> > > GICR_PROPBASER),
> > > but INVALL would be a nop.
> > > 
> > > The idea would be unmapping the region when GICR_PROPBASER is written. So
> > > any
> > > read/write access would be trapped. For a write access, Xen will update
> > > the
> > > LPIs internal data structures and write the value in the guest page
> > > unmapped.
> > > If we don't want to have an overhead for the read access, we could just
> > > write-protect the page in stage-2 page table. So only write access would
> > > be
> > > trapped.
> > > 
> > > Going further, for the ITS, Xen is using the guest memory to store the ITS
> > > information. This means Xen has to validate the information at every
> > > access.
> > > So how about restricting the access in stage-2 page table? That would
> > > remove
> > > the overhead of validating data.
> > > 
> > > Any thoughts?
> > 
> > It is a promising idea. Let me expand on this.
> > 
> > I agree that on INVALL if we need to do anything, we should go through
> > the virtual property table rather than the full list of host lpis.
> 
> I agree on that.
> 
> > 
> > Once we agree on that, the two options we have are:
> 
> I believe we had a similar discussion when Vijay worked on the vITS (see [1]).
> I would have hoped that this new proposal took into account the constraint
> mentioned back then.
> 
> > 
> > 1) We let the guest write anything to the table, then we do a full
> > validation of the table on INVALL. We also do a validation of the table
> > entries used as parameters for any other commands.
> > 
> > 2) We map the table read-only, then do a validation of every guest
> > write. INVALL becomes a NOP and parameters validation for many commands
> > could be removed or at least reduced.
> > 
> > Conceptually the two options should both lead to exactly the same
> > result. Therefore I think the decision should be made purely on
> > performance: which one is faster?  If it is true that INVALL is only
> > typically called once I suspect that 1) is faster, but I would like to
> > see some simple benchmarks, such as the time that it takes to configure
> > the ITS from scratch with the two approaches.
> 
> The problem is not which one is faster but which one will not take down the
> hypervisor.
> 
> The guest is allowed to create a command queue as big as 1MB, a command use 32
> bytes, so the command queue can fit up 32640 commands.
> 
> Now imagine a malicious guest filling up the command queue with INVALL and
> then notify the via (via GITS_CWRITER). Based on patch #5, all those commands
> will be handled in one. So you have to multiple the time of one command by
> 32640 times.
> 
> Given that the hypervisor is not preemptible, it likely means a DOS.

I think it can be made to work safely using a rate-limiting technique.
For example: Xen only emulates an INVALL for a given domain once every
one or two seconds and no more often than that. x86 has something like
that under xen/arch/x86/irq.c, see irq_ratelimit.

But to be clear, I am not saying that this is necessarily the best way
to do it. I would like to see some benchmarks first.
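
A sketch of what such rate-limiting could look like (loosely modelled on
the x86 irq_ratelimit idea mentioned above; the invall_last field and the
one-second window are made up, while NOW()/SECONDS() are the usual Xen
time helpers):

/* Sketch: emulate at most one INVALL per domain per second; anything
 * more frequent is treated as a no-op (it could also be deferred).
 * d->arch.vgic.invall_last is a hypothetical field.
 */
static bool invall_rate_limited(struct domain *d)
{
    s_time_t now = NOW();

    if ( now - d->arch.vgic.invall_last < SECONDS(1) )
        return true;    /* too soon: skip the expensive table scan */

    d->arch.vgic.invall_last = now;
    return false;
}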


> A similar problem would happen if an vITS command is translate to an ITS
> command (see the implementation of INVALL). Multiple malicious guest could
> slow down the other guest by filling up the host command queue. Worst, a
> command from a normal guest could be discarded because the host ITS command
> queue is full (see its_send_command in gic-its.c).
 
Looking at the patches, nothing checks for discarded physical ITS
commands. Not good.


> That's why in the approach we had on the previous series was "host ITS command
> should be limited when emulating guest ITS command". From my recall, in that
> series the host and guest LPIs was fully separated (enabling a guest LPIs was
> not enabling host LPIs).

I am interested in reading what Ian suggested to do when the physical
ITS queue is full, but I cannot find anything specific about it in the
doc.

Do you have a suggestion for this? 

The only things that come to mind right now are:

1) check if the ITS queue is full and busy loop until it is not (spin_lock style)
2) check if the ITS queue is full and sleep until it is not (mutex style)
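
For reference, option 1) could look roughly like this (a sketch only:
its->cmd_writer, its->base, ITS_CMD_SIZE and ITS_CMD_QUEUE_SZ are assumed
names; GITS_CREADR and its offset field in bits [19:5] are per the GICv3
spec; a real version would also need a timeout/error path):

/* Sketch of option 1: spin until the physical ITS command queue has a
 * free slot, i.e. until advancing the write offset by one command would
 * no longer collide with the read offset reported in GITS_CREADR.
 */
static void wait_for_its_queue_slot(struct host_its *its)
{
    uint64_t next = (its->cmd_writer + ITS_CMD_SIZE) % ITS_CMD_QUEUE_SZ;

    while ( (readq_relaxed(its->base + GITS_CREADR) & GENMASK(19, 5)) == next )
        cpu_relax();
}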


> That said, a design doc explaining all the constraints and code flow would
> have been really helpful. It took me a while to digest and understand the
> interaction between each part of the code. The design document would have also
> been a good place to discuss about problems that span across multiple patches
> (like the command queue emulation).
> 
> > 
> > 
> > That said, even if 1) turns out to be faster and the approach of choice,
> > the idea of making the tables read-only in stage-2 could still be useful
> > to simplify parameters validation and protect Xen from concurrent
> > changes of the table entries from another guest vcpu. If the tables as
> > RW, we need to be very careful in Xen and use barriers to avoid
> > re-reading any guest table entry twice, as the guest could be changing
> > it in parallel to exploit the hypervisor.
> 
> Yes, and this is true for all the tables (PROPBASER, BASER,...) that reside on
> guest memory. Most of them should not be touched by the guest.
> 
> This is the same for the command queue (patch #12), accessing the command
> directly from the guest memory is not safe. A guest could modify the value
> behind our back.
> 
> Regards,
> 
> [1] https://xenbits.xen.org/people/ianc/vits/draftG.html
Stefano Stabellini Nov. 18, 2016, 6:39 p.m. UTC | #8
On Fri, 11 Nov 2016, Stefano Stabellini wrote:
> On Fri, 11 Nov 2016, Julien Grall wrote:
> > Hi Stefano,
> > 
> > On 10/11/16 20:42, Stefano Stabellini wrote:
> > > On Thu, 10 Nov 2016, Julien Grall wrote:
> > > > On 10/11/16 00:21, Stefano Stabellini wrote:
> > > > > On Fri, 4 Nov 2016, Andre Przywara wrote:
> > > > > > On 24/10/16 16:32, Vijay Kilari wrote:
> > > > > > > On Wed, Sep 28, 2016 at 11:54 PM, Andre Przywara
> > > > > > >         AFAIK, the initial design is to use tasklet to update
> > > > > > > property
> > > > > > > table as it consumes
> > > > > > > lot of time to update the table.
> > > > > > 
> > > > > > This is a possible, but premature optimization.
> > > > > > Linux (at the moment, at least) only calls INVALL _once_, just after
> > > > > > initialising the collections. And at this point no LPI is mapped, so
> > > > > > the
> > > > > > whole routine does basically nothing - and that quite fast.
> > > > > > We can later have any kind of fancy algorithm if there is a need for.
> > > > > 
> > > > > I understand, but as-is it's so expensive that could be a DOS vector.
> > > > > Also other OSes could issue INVALL much more often than Linux.
> > > > > 
> > > > > Considering that we might support device assigment with ITS soon, I
> > > > > think it might be best to parse per-domain virtual tables rather than
> > > > > the full list of physical LPIs, which theoretically could be much
> > > > > larger. Or alternatively we need to think about adding another field to
> > > > > lpi_data, to link together all lpis assigned to the same domain, but
> > > > > that would cost even more memory. Or we could rate-limit the INVALL
> > > > > calls to one every few seconds or something. Or all of the above :-)
> > > > 
> > > > It is not necessary for an ITS implementation to wait until an INVALL/INV
> > > > command is issued to take into account the change of the LPI configuration
> > > > tables (aka property table in this thread).
> > > > 
> > > > So how about trapping the property table? We would still have to go
> > > > through
> > > > the property table the first time (i.e when writing into the
> > > > GICR_PROPBASER),
> > > > but INVALL would be a nop.
> > > > 
> > > > The idea would be unmapping the region when GICR_PROPBASER is written. So
> > > > any
> > > > read/write access would be trapped. For a write access, Xen will update
> > > > the
> > > > LPIs internal data structures and write the value in the guest page
> > > > unmapped.
> > > > If we don't want to have an overhead for the read access, we could just
> > > > write-protect the page in stage-2 page table. So only write access would
> > > > be
> > > > trapped.
> > > > 
> > > > Going further, for the ITS, Xen is using the guest memory to store the ITS
> > > > information. This means Xen has to validate the information at every
> > > > access.
> > > > So how about restricting the access in stage-2 page table? That would
> > > > remove
> > > > the overhead of validating data.
> > > > 
> > > > Any thoughts?
> > > 
> > > It is a promising idea. Let me expand on this.
> > > 
> > > I agree that on INVALL if we need to do anything, we should go through
> > > the virtual property table rather than the full list of host lpis.
> > 
> > I agree on that.
> > 
> > > 
> > > Once we agree on that, the two options we have are:
> > 
> > I believe we had a similar discussion when Vijay worked on the vITS (see [1]).
> > I would have hoped that this new proposal took into account the constraint
> > mentioned back then.
> > 
> > > 
> > > 1) We let the guest write anything to the table, then we do a full
> > > validation of the table on INVALL. We also do a validation of the table
> > > entries used as parameters for any other commands.
> > > 
> > > 2) We map the table read-only, then do a validation of every guest
> > > write. INVALL becomes a NOP and parameters validation for many commands
> > > could be removed or at least reduced.
> > > 
> > > Conceptually the two options should both lead to exactly the same
> > > result. Therefore I think the decision should be made purely on
> > > performance: which one is faster?  If it is true that INVALL is only
> > > typically called once I suspect that 1) is faster, but I would like to
> > > see some simple benchmarks, such as the time that it takes to configure
> > > the ITS from scratch with the two approaches.
> > 
> > The problem is not which one is faster but which one will not take down the
> > hypervisor.
> > 
> > The guest is allowed to create a command queue as big as 1MB, a command use 32
> > bytes, so the command queue can fit up 32640 commands.
> > 
> > Now imagine a malicious guest filling up the command queue with INVALL and
> > then notify the via (via GITS_CWRITER). Based on patch #5, all those commands
> > will be handled in one. So you have to multiple the time of one command by
> > 32640 times.
> > 
> > Given that the hypervisor is not preemptible, it likely means a DOS.
> 
> I think it can be made to work safely using a rate-limiting technique.
> Such as: Xen is only going to emulate an INVALL for a given domain only
> once every one or two seconds and no more often than that. x86 has
> something like that under xen/arch/x86/irq.c, see irq_ratelimit.
> 
> But to be clear, I am not saying that this is necessarily the best way
> to do it. I would like to see some benchmarks first.
> 
> 
> > A similar problem would happen if an vITS command is translate to an ITS
> > command (see the implementation of INVALL). Multiple malicious guest could
> > slow down the other guest by filling up the host command queue. Worst, a
> > command from a normal guest could be discarded because the host ITS command
> > queue is full (see its_send_command in gic-its.c).
>  
> Looking at the patches, nothing checks for discarded physical ITS
> commands. Not good.
> 
> 
> > That's why in the approach we had on the previous series was "host ITS command
> > should be limited when emulating guest ITS command". From my recall, in that
> > series the host and guest LPIs was fully separated (enabling a guest LPIs was
> > not enabling host LPIs).
> 
> I am interested in reading what Ian suggested to do when the physical
> ITS queue is full, but I cannot find anything specific about it in the
> doc.
> 
> Do you have a suggestion for this? 
> 
> The only things that come to mind right now are:
> 
> 1) check if the ITS queue is full and busy loop until it is not (spin_lock style)
> 2) check if the ITS queue is full and sleep until it is not (mutex style)

Another, probably better, idea is to map all pLPIs of a device when the
device is assigned to a guest (including Dom0). This is what was written
in Ian's design doc. The advantage of this approach is that Xen doesn't
need to take any actions on the physical ITS command queue when the
guest issues virtual ITS commands, therefore completely solving this
problem at the root. (Although I am not sure about enable/disable
commands: could we avoid issuing enable/disable on pLPIs?) It also helps
toward solving the INVALL potential DOS issue, because it significantly
reduces the computation needed when an INVALL is issued by the guest.

On the other hand, this approach has the potential of consuming much more
memory to map all the possible pLPIs that a device could use up to the
theoretical max. Of course that is not good either. But fortunately for
PCI devices we know how many events a device can generate. Also we
should be able to get that info from the device tree for other devices. So I
suggest Xen only maps as many pLPIs as events the device can generate,
when the device is assigned to the guest. This way there would be no
wasted memory.
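
A sketch of the assignment-time flow this suggests (everything here is
invented for illustration: the helper names, INVALID_LPI, and where
nr_events comes from, e.g. the MSI-X capability or a device tree property):

/* Sketch: at device assignment time, allocate and map one host LPI per
 * event the device can generate, so that no physical ITS commands are
 * needed later when the guest programs its virtual ITS.
 */
static int assign_device_lpis(struct host_its *its, struct domain *d,
                              uint32_t devid, unsigned int nr_events)
{
    unsigned int event;

    for ( event = 0; event < nr_events; event++ )
    {
        uint32_t plpi = allocate_host_lpi(d, devid, event);    /* hypothetical */

        if ( plpi == INVALID_LPI )
            return -ENOMEM;

        /* MAPTI devid/event -> plpi, issued only once, here. */
        its_send_cmd_mapti(its, devid, event, plpi, 0);        /* hypothetical */
    }

    return 0;
}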

Does it make sense? Do you think it could work?
Julien Grall Nov. 25, 2016, 4:10 p.m. UTC | #9
Hi,

On 18/11/16 18:39, Stefano Stabellini wrote:
> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
>> On Fri, 11 Nov 2016, Julien Grall wrote:
>>> On 10/11/16 20:42, Stefano Stabellini wrote:
>>> That's why in the approach we had on the previous series was "host ITS command
>>> should be limited when emulating guest ITS command". From my recall, in that
>>> series the host and guest LPIs was fully separated (enabling a guest LPIs was
>>> not enabling host LPIs).
>>
>> I am interested in reading what Ian suggested to do when the physical
>> ITS queue is full, but I cannot find anything specific about it in the
>> doc.
>>
>> Do you have a suggestion for this?
>>
>> The only things that come to mind right now are:
>>
>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock style)
>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
>
> Another, probably better idea, is to map all pLPIs of a device when the
> device is assigned to a guest (including Dom0). This is what was written
> in Ian's design doc. The advantage of this approach is that Xen doesn't
> need to take any actions on the physical ITS command queue when the
> guest issues virtual ITS commands, therefore completely solving this
> problem at the root. (Although I am not sure about enable/disable
> commands: could we avoid issuing enable/disable on pLPIs?)

In the previous design document (see [1]), the pLPIs are enabled when 
the device is assigned to the guest. This means that it is not necessary 
to send any command there. This also means we may receive a pLPI before 
the associated vLPI has been configured.

That said, given that LPIs are edge-triggered, there is no deactivate 
state (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is 
done, the same LPIs could potentially be raised again. This could 
generate a storm.

The priority drop is necessary if we don't want to block the reception 
of interrupts on the current physical CPU.

What I am more concerned about is that this problem can also happen during 
normal operation (i.e. the pLPI is bound to a vLPI and the vLPI is 
enabled). For edge-triggered interrupts, there is no way to prevent them 
from firing again. Maybe it is time to introduce interrupt rate-limiting 
for ARM. Any opinions?

 > It also helps
> toward solving the INVALL potential DOS issue, because it significantly
> reduces the computation needed when an INVALL is issued by the guest.
>
> On the other end, this approach has the potential of consuming much more
> memory to map all the possible pLPIs that a device could use up to the
> theoretical max. Of course that is not good either. But fortunately for
> PCI devices we know how many events a device can generate. Also we
> should be able to get that info on device tree for other devices. So I
> suggest Xen only maps as many pLPIs as events the device can generate,
> when the device is assigned to the guest. This way there would be no
> wasted memory.
>
> Does it make sense? Do you think it could work?

Aside the point I raised above, I think the approach looks sensible.

Regards,

[1] https://xenbits.xen.org/people/ianc/vits/draftG.html#device-assignment
Stefano Stabellini Dec. 1, 2016, 1:19 a.m. UTC | #10
On Fri, 25 Nov 2016, Julien Grall wrote:
> Hi,
> 
> On 18/11/16 18:39, Stefano Stabellini wrote:
> > On Fri, 11 Nov 2016, Stefano Stabellini wrote:
> > > On Fri, 11 Nov 2016, Julien Grall wrote:
> > > > On 10/11/16 20:42, Stefano Stabellini wrote:
> > > > That's why in the approach we had on the previous series was "host ITS
> > > > command
> > > > should be limited when emulating guest ITS command". From my recall, in
> > > > that
> > > > series the host and guest LPIs was fully separated (enabling a guest
> > > > LPIs was
> > > > not enabling host LPIs).
> > > 
> > > I am interested in reading what Ian suggested to do when the physical
> > > ITS queue is full, but I cannot find anything specific about it in the
> > > doc.
> > > 
> > > Do you have a suggestion for this?
> > > 
> > > The only things that come to mind right now are:
> > > 
> > > 1) check if the ITS queue is full and busy loop until it is not (spin_lock
> > > style)
> > > 2) check if the ITS queue is full and sleep until it is not (mutex style)
> > 
> > Another, probably better idea, is to map all pLPIs of a device when the
> > device is assigned to a guest (including Dom0). This is what was written
> > in Ian's design doc. The advantage of this approach is that Xen doesn't
> > need to take any actions on the physical ITS command queue when the
> > guest issues virtual ITS commands, therefore completely solving this
> > problem at the root. (Although I am not sure about enable/disable
> > commands: could we avoid issuing enable/disable on pLPIs?)
> 
> In the previous design document (see [1]), the pLPIs are enabled when the
> device is assigned to the guest. This means that it is not necessary to send
> command there. This is also means we may receive a pLPI before the associated
> vLPI has been configured.
> 
> That said, given that LPIs are edge-triggered, there is no deactivate state
> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
> LPIs could potentially be raised again. This could generate a storm.

Thank you for raising this important point. You are correct.


> The priority drop is necessary if we don't want to block the reception of
> interrupt for the current physical CPU.
> 
> What I am more concerned about is this problem can also happen in normal
> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
> it is time to introduce rate-limit interrupt for ARM. Any opinions?

Yes. It could be as simple as disabling the pLPI when Xen receives a
second pLPI before the guest EOIs the first corresponding vLPI, which
shouldn't happen in normal circumstances.

We need a simple per-LPI inflight counter, incremented when a pLPI is
received, decremented when the corresponding vLPI is EOIed (the LR is
cleared).

When the counter > 1, we disable the pLPI and request a maintenance
interrupt for the corresponding vLPI.

When we receive the maintenance interrupt and we clear the LR of the
vLPI, Xen should re-enable the pLPI.

Given that the state of the LRs is sync'ed before calling gic_interrupt,
we can be sure to know exactly in what state the vLPI is at any given
time. But for this to work correctly, it is important to configure the
pLPI to be delivered to the same pCPU running the vCPU which handles
the vLPI (as it is already the case today for SPIs).
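
Roughly, something like this is what I have in mind. It is only a sketch
of the state machine, with stub functions standing in for the real
GIC/ITS accessors; all the names are made up.

#include <stdint.h>
#include <stdbool.h>

/* Stubs standing in for the real host-side operations. */
static void disable_plpi(uint32_t plpi) { (void)plpi; }
static void enable_plpi(uint32_t plpi)  { (void)plpi; }
static void request_maintenance_irq(uint32_t vlpi) { (void)vlpi; }

struct lpi_state {
    uint8_t inflight;   /* pLPIs received but not yet EOIed as a vLPI */
    bool    throttled;  /* pLPI disabled because a storm was detected */
};

/* Called when the pLPI fires (after the priority drop). */
static void plpi_received(struct lpi_state *s, uint32_t plpi, uint32_t vlpi)
{
    if ( ++s->inflight > 1 )
    {
        /* Second pLPI before the guest EOIed the vLPI: looks like a storm. */
        disable_plpi(plpi);
        s->throttled = true;
        request_maintenance_irq(vlpi);
    }
}

/* Called when the LR of the vLPI is found cleared (the guest EOIed it). */
static void vlpi_eoied(struct lpi_state *s, uint32_t plpi)
{
    if ( s->inflight )
        s->inflight--;

    if ( s->throttled && !s->inflight )
    {
        enable_plpi(plpi);
        s->throttled = false;
    }
}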
Andre Przywara Dec. 2, 2016, 4:18 p.m. UTC | #11
Hi,

sorry for chiming in late ....

I've been spending some time thinking about this, and I think we can in
fact get away without ever propagating commands from domains to the host.

I made a list of all commands that possibly require host ITS command
propagation. There are two groups:
1: enabling/disabling LPIs: INV, INVALL
2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI.

The second group can be handled by mapping all required devices up
front, I will elaborate on that in a different email.

For the first group, read below ...

On 01/12/16 01:19, Stefano Stabellini wrote:
> On Fri, 25 Nov 2016, Julien Grall wrote:
>> Hi,
>>
>> On 18/11/16 18:39, Stefano Stabellini wrote:
>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
>>>> On Fri, 11 Nov 2016, Julien Grall wrote:
>>>>> On 10/11/16 20:42, Stefano Stabellini wrote:
>>>>> That's why in the approach we had on the previous series was "host ITS
>>>>> command
>>>>> should be limited when emulating guest ITS command". From my recall, in
>>>>> that
>>>>> series the host and guest LPIs was fully separated (enabling a guest
>>>>> LPIs was
>>>>> not enabling host LPIs).
>>>>
>>>> I am interested in reading what Ian suggested to do when the physical
>>>> ITS queue is full, but I cannot find anything specific about it in the
>>>> doc.
>>>>
>>>> Do you have a suggestion for this?
>>>>
>>>> The only things that come to mind right now are:
>>>>
>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock
>>>> style)
>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
>>>
>>> Another, probably better idea, is to map all pLPIs of a device when the
>>> device is assigned to a guest (including Dom0). This is what was written
>>> in Ian's design doc. The advantage of this approach is that Xen doesn't
>>> need to take any actions on the physical ITS command queue when the
>>> guest issues virtual ITS commands, therefore completely solving this
>>> problem at the root. (Although I am not sure about enable/disable
>>> commands: could we avoid issuing enable/disable on pLPIs?)
>>
>> In the previous design document (see [1]), the pLPIs are enabled when the
>> device is assigned to the guest. This means that it is not necessary to send
>> command there. This is also means we may receive a pLPI before the associated
>> vLPI has been configured.
>>
>> That said, given that LPIs are edge-triggered, there is no deactivate state
>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
>> LPIs could potentially be raised again. This could generate a storm.
> 
> Thank you for raising this important point. You are correct.
>
>> The priority drop is necessary if we don't want to block the reception of
>> interrupt for the current physical CPU.
>>
>> What I am more concerned about is this problem can also happen in normal
>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
>> it is time to introduce rate-limit interrupt for ARM. Any opinions?
> 
> Yes. It could be as simple as disabling the pLPI when Xen receives a
> second pLPI before the guest EOIs the first corresponding vLPI, which
> shouldn't happen in normal circumstances.
> 
> We need a simple per-LPI inflight counter, incremented when a pLPI is
> received, decremented when the corresponding vLPI is EOIed (the LR is
> cleared).
> 
> When the counter > 1, we disable the pLPI and request a maintenance
> interrupt for the corresponding vLPI.

So why do we need a _counter_? This is about edge-triggered interrupts;
I think we can just accumulate all of them into one.

So here is what I think:
- We use the guest provided pending table to hold a pending bit for each
VLPI. We can unmap the memory from the guest, since software is not
supposed to access this table as per the spec.
- We use the guest provided property table, without trapping it. There
is nothing to be "validated" in that table, since it's a really tight
encoding and every value written in there is legal. We only look at bit
0 for this exercise here anyway.
- Upon reception of a physical LPI, we look it up to find the VCPU and
virtual LPI number. This is what we need to do anyway and it's a quick
two-level table lookup at the moment.
- We use the VCPU's domain and the VLPI number to index the guest's
property table and read the enabled bit. Again a quick table lookup.
 - If the VLPI is enabled, we EOI it on the host and inject it.
 - If the VLPI is disabled, we set the pending bit in the VCPU's
   pending table and EOI on the host - to allow other IRQs.
- On a guest INV command, we check whether that vLPI is now enabled:
 - If it is disabled now, we don't need to do anything.
 - If it is enabled now, we check the pending bit for that VLPI:
  - If it is 0, we don't do anything.
  - If it is 1, we inject the VLPI and clear the pending bit.
- On a guest INVALL command, we just need to iterate over the virtual
LPIs. If you look at the conditions above, the only immediate action is
when a VLPI gets enabled _and_ its pending bit is set. So we can do
64-bit read accesses over the whole pending table to find non-zero words
and thus set bits, which should be rare in practice (a rough sketch
follows below). We can store the highest mapped VLPI to avoid iterating
over the whole of the table. Ideally the guest has no direct control over
the pending bits, since this is what the device generates. Also we limit
the number of VLPIs in total per guest anyway.
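
To sketch what I mean (standalone code with stub functions; the table
layouts and names are simplified assumptions, not the actual structures
from this series):

#include <stdint.h>

#define LPI_PROP_ENABLED  (1 << 0)

/* Stubs standing in for the real host EOI and vGIC injection paths. */
static void host_eoi_plpi(uint32_t plpi) { (void)plpi; }
static void inject_vlpi(uint32_t vlpi)   { (void)vlpi; }

/*
 * Receive path: "prop" is the guest-provided property table (one byte per
 * VLPI, bit 0 = enabled), "pending" a packed pending bitmap for the VCPU
 * (one bit per VLPI).
 */
static void plpi_to_vlpi(uint32_t plpi, uint32_t vlpi,
                         const uint8_t *prop, uint64_t *pending)
{
    if ( prop[vlpi] & LPI_PROP_ENABLED )
        inject_vlpi(vlpi);
    else
        pending[vlpi / 64] |= 1ULL << (vlpi % 64);

    host_eoi_plpi(plpi);    /* always EOI on the host to unblock other IRQs */
}

/*
 * INVALL: scan the pending bitmap 64 bits at a time; the only immediate
 * action is for VLPIs that are pending _and_ enabled by now.
 */
static void handle_guest_invall(const uint8_t *prop, uint64_t *pending,
                                uint32_t max_vlpi)
{
    for ( uint32_t word = 0; word <= max_vlpi / 64; word++ )
    {
        uint64_t bits = pending[word];

        while ( bits )
        {
            unsigned int bit = __builtin_ctzll(bits);
            uint32_t vlpi = word * 64 + bit;

            bits &= bits - 1;               /* drop the lowest set bit */

            if ( prop[vlpi] & LPI_PROP_ENABLED )
            {
                pending[word] &= ~(1ULL << bit);
                inject_vlpi(vlpi);
            }
        }
    }
}

A guest INV would just be the same check applied to a single VLPI instead
of scanning the whole table.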

If that still sounds like a DOS vector, we could additionally rate-limit
INVALLs, and/or track additions to the pending table after the last
INVALL: if there haven't been any new pending bits since the last scan,
INVALL is a NOP.

Does that make sense so far?

So that just leaves us with this IRQ storm issue, which I am thinking
about now. But I guess this is not a show stopper given we can disable
the physical LPI if we sense this situation.

> When we receive the maintenance interrupt and we clear the LR of the
> vLPI, Xen should re-enable the pLPI.
> Given that the state of the LRs is sync'ed before calling gic_interrupt,
> we can be sure to know exactly in what state the vLPI is at any given
> time. But for this to work correctly, it is important to configure the
> pLPI to be delivered to the same pCPU running the vCPU which handles
> the vLPI (as it is already the case today for SPIs).

Why would that be necessary?

Cheers,
Andre
Stefano Stabellini Dec. 3, 2016, 12:46 a.m. UTC | #12
On Fri, 2 Dec 2016, Andre Przywara wrote:
> Hi,
> 
> sorry for chiming in late ....
> 
> I've been spending some time thinking about this, and I think we can in
> fact get away without ever propagating command from domains to the host.
> 
> I made a list of all commands that possible require host ITS command
> propagation. There are two groups:
> 1: enabling/disabling LPIs: INV, INVALL
> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI.
> 
> The second group can be handled by mapping all required devices up
> front, I will elaborate on that in a different email.
> 
> For the first group, read below ...
> 
> On 01/12/16 01:19, Stefano Stabellini wrote:
> > On Fri, 25 Nov 2016, Julien Grall wrote:
> >> Hi,
> >>
> >> On 18/11/16 18:39, Stefano Stabellini wrote:
> >>> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
> >>>> On Fri, 11 Nov 2016, Julien Grall wrote:
> >>>>> On 10/11/16 20:42, Stefano Stabellini wrote:
> >>>>> That's why in the approach we had on the previous series was "host ITS
> >>>>> command
> >>>>> should be limited when emulating guest ITS command". From my recall, in
> >>>>> that
> >>>>> series the host and guest LPIs was fully separated (enabling a guest
> >>>>> LPIs was
> >>>>> not enabling host LPIs).
> >>>>
> >>>> I am interested in reading what Ian suggested to do when the physical
> >>>> ITS queue is full, but I cannot find anything specific about it in the
> >>>> doc.
> >>>>
> >>>> Do you have a suggestion for this?
> >>>>
> >>>> The only things that come to mind right now are:
> >>>>
> >>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock
> >>>> style)
> >>>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
> >>>
> >>> Another, probably better idea, is to map all pLPIs of a device when the
> >>> device is assigned to a guest (including Dom0). This is what was written
> >>> in Ian's design doc. The advantage of this approach is that Xen doesn't
> >>> need to take any actions on the physical ITS command queue when the
> >>> guest issues virtual ITS commands, therefore completely solving this
> >>> problem at the root. (Although I am not sure about enable/disable
> >>> commands: could we avoid issuing enable/disable on pLPIs?)
> >>
> >> In the previous design document (see [1]), the pLPIs are enabled when the
> >> device is assigned to the guest. This means that it is not necessary to send
> >> command there. This is also means we may receive a pLPI before the associated
> >> vLPI has been configured.
> >>
> >> That said, given that LPIs are edge-triggered, there is no deactivate state
> >> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
> >> LPIs could potentially be raised again. This could generate a storm.
> > 
> > Thank you for raising this important point. You are correct.
> >
> >> The priority drop is necessary if we don't want to block the reception of
> >> interrupt for the current physical CPU.
> >>
> >> What I am more concerned about is this problem can also happen in normal
> >> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
> >> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
> >> it is time to introduce rate-limit interrupt for ARM. Any opinions?
> > 
> > Yes. It could be as simple as disabling the pLPI when Xen receives a
> > second pLPI before the guest EOIs the first corresponding vLPI, which
> > shouldn't happen in normal circumstances.
> > 
> > We need a simple per-LPI inflight counter, incremented when a pLPI is
> > received, decremented when the corresponding vLPI is EOIed (the LR is
> > cleared).
> > 
> > When the counter > 1, we disable the pLPI and request a maintenance
> > interrupt for the corresponding vLPI.
> 
> So why do we need a _counter_? This is about edge triggered interrupts,
> I think we can just accumulate all of them into one.

The counter is not there to re-inject the same number of interrupts into the
guest, but to detect interrupt storms.


> So here is what I think:
> - We use the guest provided pending table to hold a pending bit for each
> VLPI. We can unmap the memory from the guest, since software is not
> supposed to access this table as per the spec.
> - We use the guest provided property table, without trapping it. There
> is nothing to be "validated" in that table, since it's a really tight
> encoding and every value written in there is legal. We only look at bit
> 0 for this exercise here anyway.

I am following...


> - Upon reception of a physical LPI, we look it up to find the VCPU and
> virtual LPI number. This is what we need to do anyway and it's a quick
> two-level table lookup at the moment.
> - We use the VCPU's domain and the VLPI number to index the guest's
> property table and read the enabled bit. Again a quick table lookup.

They should be both O(2), correct?


>  - If the VLPI is enabled, we EOI it on the host and inject it.
>  - If the VLPI is disabled, we set the pending bit in the VCPU's
>    pending table and EOI on the host - to allow other IRQs.
> - On a guest INV command, we check whether that vLPI is now enabled:
>  - If it is disabled now, we don't need to do anything.
>  - If it is enabled now, we check the pending bit for that VLPI:
>   - If it is 0, we don't do anything.
>   - If it is 1, we inject the VLPI and clear the pending bit.
> - On a guest INVALL command, we just need to iterate over the virtual
> LPIs.

Right, much better.


> If you look at the conditions above, the only immediate action is
> when a VLPI gets enabled _and_ its pending bit is set. So we can do
> 64-bit read accesses over the whole pending table to find non-zero words
> and thus set bits, which should be rare in practice. We can store the
> highest mapped VLPI to avoid iterating over the whole of the table.
> Ideally the guest has no direct control over the pending bits, since
> this is what the device generates. Also we limit the number of VLPIs in
> total per guest anyway.

I wonder if we could even use a fully packed bitmask with only the
pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would
be a nice improvement.


> If that still sounds like a DOS vector, we could additionally rate-limit
> INVALLs, and/or track additions to the pending table after the last
> INVALL: if there haven't been any new pending bits since the last scan,
> INVALL is a NOP.
> 
> Does that makes sense so far?

It makes sense. It should be OK.


> So that just leaves us with this IRQ storm issue, which I am thinking
> about now. But I guess this is not a show stopper given we can disable
> the physical LPI if we sense this situation.

That is true and it's exactly what we should do.


> > When we receive the maintenance interrupt and we clear the LR of the
> > vLPI, Xen should re-enable the pLPI.
> > Given that the state of the LRs is sync'ed before calling gic_interrupt,
> > we can be sure to know exactly in what state the vLPI is at any given
> > time. But for this to work correctly, it is important to configure the
> > pLPI to be delivered to the same pCPU running the vCPU which handles
> > the vLPI (as it is already the case today for SPIs).
> 
> Why would that be necessary?

Because the state of the LRs of other pCPUs won't be up to date: we
wouldn't know for sure whether the guest EOI'ed the vLPI or not.
Julien Grall Dec. 5, 2016, 1:36 p.m. UTC | #13
Hi Stefano,

On 03/12/16 00:46, Stefano Stabellini wrote:
> On Fri, 2 Dec 2016, Andre Przywara wrote:
>>> When we receive the maintenance interrupt and we clear the LR of the
>>> vLPI, Xen should re-enable the pLPI.
>>> Given that the state of the LRs is sync'ed before calling gic_interrupt,
>>> we can be sure to know exactly in what state the vLPI is at any given
>>> time. But for this to work correctly, it is important to configure the
>>> pLPI to be delivered to the same pCPU running the vCPU which handles
>>> the vLPI (as it is already the case today for SPIs).
>>
>> Why would that be necessary?
>
> Because the state of the LRs of other pCPUs won't be up to date: we
> wouldn't know for sure whether the guest EOI'ed the vLPI or not.

Well, there is still a small window where the interrupt may be received 
on the previous pCPU. So we have to take this case into account.

This window may be bigger with LPIs, because a single vCPU may have 
thousands of interrupts routed to it. This would take a long time to move 
all of them when the vCPU is migrating. So we may want to take a lazy 
approach and move them when they are received on the "wrong" pCPU.

Cheers,
Stefano Stabellini Dec. 5, 2016, 7:51 p.m. UTC | #14
On Mon, 5 Dec 2016, Julien Grall wrote:
> Hi Stefano,
> 
> On 03/12/16 00:46, Stefano Stabellini wrote:
> > On Fri, 2 Dec 2016, Andre Przywara wrote:
> > > > When we receive the maintenance interrupt and we clear the LR of the
> > > > vLPI, Xen should re-enable the pLPI.
> > > > Given that the state of the LRs is sync'ed before calling gic_interrupt,
> > > > we can be sure to know exactly in what state the vLPI is at any given
> > > > time. But for this to work correctly, it is important to configure the
> > > > pLPI to be delivered to the same pCPU running the vCPU which handles
> > > > the vLPI (as it is already the case today for SPIs).
> > > 
> > > Why would that be necessary?
> > 
> > Because the state of the LRs of other pCPUs won't be up to date: we
> > wouldn't know for sure whether the guest EOI'ed the vLPI or not.
> 
> Well, there is still a small window when the interrupt may be received on the
> previous pCPU. So we have to take into account this case.

That's right. We already have a mechanism to deal with that, based on
the GIC_IRQ_GUEST_MIGRATING flag. It should work with LPIs too.
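
Just as a reminder of the idea, a toy model (stub helpers, not the code
we have in the tree): defer the physical affinity change while the vIRQ
is still in flight on the old pCPU, and apply it once the guest EOIs.

#include <stdbool.h>

/* Stub standing in for the real affinity-setting helper. */
static void set_hw_affinity(unsigned int irq, unsigned int pcpu)
{
    (void)irq; (void)pcpu;
}

struct virq_state {
    bool in_flight;         /* currently presented to the guest in an LR */
    bool migrating;         /* models the GIC_IRQ_GUEST_MIGRATING flag */
    unsigned int target_pcpu;
};

/* vCPU is moving: remember the new target, but do not touch the hardware
 * while the vIRQ is still in flight on the old pCPU. */
static void virq_migrate(struct virq_state *v, unsigned int irq,
                         unsigned int new_pcpu)
{
    v->target_pcpu = new_pcpu;

    if ( v->in_flight )
        v->migrating = true;            /* apply it once the guest EOIs */
    else
        set_hw_affinity(irq, new_pcpu);
}

/* Called when the guest EOIs the vIRQ and the LR is cleared. */
static void virq_eoied(struct virq_state *v, unsigned int irq)
{
    v->in_flight = false;

    if ( v->migrating )
    {
        v->migrating = false;
        set_hw_affinity(irq, v->target_pcpu);
    }
}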


> This window may be bigger with LPIs, because a single vCPU may have thousand
> interrupts routed. This would take a long time to move all of them when the
> vCPU is migrating. So we may want to take a lazy approach and moving them when
> they are received on the "wrong" pCPU.

That's possible. The only downside is that modifying the irq migration
workflow is difficult and we might want to avoid it if possible.

Another approach is to let the scheduler know that migration is slower.
In fact this is not a new problem: it can be slow to migrate interrupts,
even a few non-LPI interrupts, even on x86. I wonder if the Xen scheduler
has any knowledge of that (CC'ing George and Dario). I guess that's the
reason why most people run with dom0_vcpus_pin.
Julien Grall Dec. 6, 2016, 3:56 p.m. UTC | #15
Hi Stefano,

On 05/12/16 19:51, Stefano Stabellini wrote:
> On Mon, 5 Dec 2016, Julien Grall wrote:
>> Hi Stefano,
>>
>> On 03/12/16 00:46, Stefano Stabellini wrote:
>>> On Fri, 2 Dec 2016, Andre Przywara wrote:
>>>>> When we receive the maintenance interrupt and we clear the LR of the
>>>>> vLPI, Xen should re-enable the pLPI.
>>>>> Given that the state of the LRs is sync'ed before calling gic_interrupt,
>>>>> we can be sure to know exactly in what state the vLPI is at any given
>>>>> time. But for this to work correctly, it is important to configure the
>>>>> pLPI to be delivered to the same pCPU running the vCPU which handles
>>>>> the vLPI (as it is already the case today for SPIs).
>>>>
>>>> Why would that be necessary?
>>>
>>> Because the state of the LRs of other pCPUs won't be up to date: we
>>> wouldn't know for sure whether the guest EOI'ed the vLPI or not.
>>
>> Well, there is still a small window when the interrupt may be received on the
>> previous pCPU. So we have to take into account this case.
>
> That's right. We already have a mechanism to deal with that, based on
> the GIC_IRQ_GUEST_MIGRATING flag. It should work with LPIs too.

Right.

>> This window may be bigger with LPIs, because a single vCPU may have thousand
>> interrupts routed. This would take a long time to move all of them when the
>> vCPU is migrating. So we may want to take a lazy approach and moving them when
>> they are received on the "wrong" pCPU.
>
> That's possible. The only downside is that modifying the irq migration
> workflow is difficult and we might want to avoid it if possible.

I don't think this would modify the irq migration workflow. If you look 
at the implementation of arch_move_irqs, it just goes over the vIRQs 
and calls irq_set_affinity.

irq_set_affinity will directly modify the hardware, and that's all.
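
In shape, it is basically a loop like the one below. This is a simplified
model with stub helpers, not the actual function; the point is the one
hardware access per routed IRQ.

#include <stddef.h>

struct irq_desc;

/* Stubs: look up the descriptor of a vIRQ routed to this vCPU, and poke
 * the hardware to change its physical affinity. */
static struct irq_desc *routed_virq_to_desc(unsigned int virq)
{
    (void)virq;
    return NULL;
}

static void set_affinity(struct irq_desc *desc, unsigned int pcpu)
{
    (void)desc; (void)pcpu;
}

/* Walk every possible vIRQ of the domain and re-point the physical IRQ,
 * if any, at the pCPU the vCPU is moving to. */
static void move_routed_irqs(unsigned int nr_virqs, unsigned int new_pcpu)
{
    for ( unsigned int virq = 32; virq < nr_virqs; virq++ )
    {
        struct irq_desc *desc = routed_virq_to_desc(virq);

        if ( desc )
            set_affinity(desc, new_pcpu);   /* one hardware access per IRQ */
    }
}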

>
> Another approach is to let the scheduler know that migration is slower.
> In fact this is not a new problem: it can be slow to migrate interrupts,
> even few non-LPIs interrupts, even on x86. I wonder if the Xen scheduler
> has any knowledge of that (CC'ing George and Dario). I guess that's the
> reason why most people run with dom0_vcpus_pin.

I took a quick look at x86: arch_move_irqs is not implemented there. Only 
PIRQs are migrated when a vCPU is moving to another pCPU.

The function pirq_set_affinity will change the affinity of a PIRQ, but 
only in software (see irq_set_affinity); the configuration is not yet 
replicated to the hardware.

In the case of ARM, we directly modify the configuration of the 
hardware. This adds much more overhead because you have to do a 
hardware access for every single IRQ.

Regards,
Stefano Stabellini Dec. 6, 2016, 7:36 p.m. UTC | #16
On Tue, 6 Dec 2016, Julien Grall wrote:
> > > This window may be bigger with LPIs, because a single vCPU may have
> > > thousand
> > > interrupts routed. This would take a long time to move all of them when
> > > the
> > > vCPU is migrating. So we may want to take a lazy approach and moving them
> > > when
> > > they are received on the "wrong" pCPU.
> > 
> > That's possible. The only downside is that modifying the irq migration
> > workflow is difficult and we might want to avoid it if possible.
> 
> I don't think this would modify the irq migration work flow. If you look at
> the implementation of arch_move_irqs, it will just go over the vIRQ and call
> irq_set_affinity.
> 
> irq_set_affinity will directly modify the hardware and that's all.
> 
> > 
> > Another approach is to let the scheduler know that migration is slower.
> > In fact this is not a new problem: it can be slow to migrate interrupts,
> > even few non-LPIs interrupts, even on x86. I wonder if the Xen scheduler
> > has any knowledge of that (CC'ing George and Dario). I guess that's the
> > reason why most people run with dom0_vcpus_pin.
> 
> I gave a quick look at x86, arch_move_irqs is not implemented. Only PIRQ are
> migrated when a vCPU is moving to another pCPU.
> 
> The function pirq_set_affinity, will change the affinity of a PIRQ but only in
> software (see irq_set_affinity). This is not yet replicated the configuration
> into the hardware.
> 
> In the case of ARM, we directly modify the configuration of the hardware. This
> adds much more overhead because you have to do an hardware access for every
> single IRQ.

George, Dario, any comments on whether this would make sense and how to
do it?
Dario Faggioli Dec. 6, 2016, 9:32 p.m. UTC | #17
On Tue, 2016-12-06 at 11:36 -0800, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Julien Grall wrote:
> > 
> > > Another approach is to let the scheduler know that migration is
> > > slower.
> > > In fact this is not a new problem: it can be slow to migrate
> > > interrupts,
> > > even few non-LPIs interrupts, even on x86. I wonder if the Xen
> > > scheduler
> > > has any knowledge of that (CC'ing George and Dario). I guess
> > > that's the
> > > reason why most people run with dom0_vcpus_pin.
> > 
> > I gave a quick look at x86, arch_move_irqs is not implemented. Only
> > PIRQ are
> > migrated when a vCPU is moving to another pCPU.
> > 
> > In the case of ARM, we directly modify the configuration of the
> > hardware. This
> > adds much more overhead because you have to do an hardware access
> > for every
> > single IRQ.
> 
> George, Dario, any comments on whether this would make sense and how
> to
> do it?
>
I was actually looking into this, but I think I don't know enough of
ARM in general, and about this issue in particular to be useful.

That being said, perhaps you could clarify a bit what you mean by
"let the scheduler know that migration is slower". What would you expect
the scheduler to do?

Checking the code, as Julien says, on x86 all we do when we move vCPUs
around is calling evtchn_move_pirqs(). In fact, it was exactly that
function that was called multiple times in schedule.c, and it was you
who (as Julien pointed out already):
1) in 5bd62a757b9 ("xen/arm: physical irq follow virtual irq"), 
   created arch_move_irqs() as something that does something on ARM,
   and as an empty stub in x86.
2) in 14f7e3b8a70 ("xen: introduce sched_move_irqs"), generalized 
   schedule.c code by implementing sched_move_irqs().

So, if I understood correctly what Julien said here ("I don't think this
would modify the irq migration work flow", etc.), it looks to me that
the suggested lazy approach could be a good solution (but I'm saying
that while lacking the knowledge of what it would actually mean to
implement it).

If you want something inside the scheduler that sort of delays the
wakeup of a domain on the new pCPU until some condition in IRQ handling
code is met (but, please, confirm whether or not this is what you were
thinking of), my thoughts, off the top of my head, are the following:
- in general, I think it should be possible;
- it has to be arch-specific, I think?
- It's easy to avoid the vCPU being woken as a consequence of
  vcpu_wake() being called, e.g., at the end of vcpu_migrate();
- we must be careful about not forgetting/failing to (re)wake up the 
  vCPU when the condition is met

Sorry if I can't be more useful than this for now. :-/

Regards,
Dario
Dario Faggioli Dec. 6, 2016, 9:36 p.m. UTC | #18
On Mon, 2016-12-05 at 11:51 -0800, Stefano Stabellini wrote:
> Another approach is to let the scheduler know that migration is
> slower.
> In fact this is not a new problem: it can be slow to migrate
> interrupts,
> even few non-LPIs interrupts, even on x86. I wonder if the Xen
> scheduler
> has any knowledge of that (CC'ing George and Dario). I guess that's
> the
> reason why most people run with dom0_vcpus_pin.
>
Oh, and about this last sentence.

I may indeed be lacking knowledge/understanding, but if you think this
is a valid use case for dom0_vcpus_pin, I'd indeed be interested in
knowing why.

In fact, that configuration has always looked rather awkward to me, and
I think we should start thinking about no longer providing the option at
all (or changing/extending its behavior).

So, if you think you need it, please spell that out, and let's see if
there are better ways to achieve the same. :-)

Thanks and Regards,
Dario
Stefano Stabellini Dec. 6, 2016, 9:53 p.m. UTC | #19
On Tue, 6 Dec 2016, Dario Faggioli wrote:
> On Tue, 2016-12-06 at 11:36 -0800, Stefano Stabellini wrote:
> > On Tue, 6 Dec 2016, Julien Grall wrote:
> > > 
> > > > Another approach is to let the scheduler know that migration is
> > > > slower.
> > > > In fact this is not a new problem: it can be slow to migrate
> > > > interrupts,
> > > > even few non-LPIs interrupts, even on x86. I wonder if the Xen
> > > > scheduler
> > > > has any knowledge of that (CC'ing George and Dario). I guess
> > > > that's the
> > > > reason why most people run with dom0_vcpus_pin.
> > > 
> > > I gave a quick look at x86, arch_move_irqs is not implemented. Only
> > > PIRQ are
> > > migrated when a vCPU is moving to another pCPU.
> > > 
> > > In the case of ARM, we directly modify the configuration of the
> > > hardware. This
> > > adds much more overhead because you have to do an hardware access
> > > for every
> > > single IRQ.
> > 
> > George, Dario, any comments on whether this would make sense and how
> > to
> > do it?
> >
> I was actually looking into this, but I think I don't know enough of
> ARM in general, and about this issue in particular to be useful.
> 
> That being said, perhaps you could clarify a bit what you mean with
> "let the scheduler know that migration is slower". What you'd expect
> the scheduler to do?
> 
> Checking the code, as Julien says, on x86 all we do when we move vCPUs
> around is calling evtchn_move_pirqs(). In fact, it was right that
> function that was called multiple times in schedule.c, and it was you
> that (as Julien pointed out already):
> 1) in 5bd62a757b9 ("xen/arm: physical irq follow virtual irq"), 
>    created arch_move_irqs() as something that does something on ARM,
>    and as an empty stub in x86.
> 2) in 14f7e3b8a70 ("xen: introduce sched_move_irqs"), generalized 
>    schedule.c code by implementing sched_move_irqs().
> 
> So, if I understood correctly what Julien said here "I don't think this
> would modify the irq migration work flow. etc.", it looks to me that
> the suggested lazy approach could be a good solution (but I'm saying
> that lacking the knowledge of what it would actually mean to implement
> that).
> 
> If you want something inside the scheduler that sort of delays the
> wakeup of a domain on the new pCPU until some condition in IRQ handling
> code is verified (but, please, confirm whether or not it was this that
> you were thinking of), my thoughts, out of the top of my head about
> this are:
> - in general, I think it should be possible;
> - it has to be arch-specific, I think?
> - It's easy to avoid the vCPU being woken as a consequence of
>   vcpu_wake() being called, e.g., at the end of vcpu_migrate();
> - we must be careful about not forgetting/failing to (re)wakeup the 
>   vCPU when the condition verifies
> 
> Sorry if I can't be more useful than this for now. :-/

We don't need scheduler support to implement interrupt migration. The
question was much simpler than that: moving a vCPU with interrupts
assigned to it is slower than moving a vCPU without interrupts assigned
to it. You could say that the slowness is directly proportional to the
number of interrupts assigned to the vCPU. Does the scheduler know that?
Or does it blindly move vCPUs around? Also see below.



> On Mon, 2016-12-05 at 11:51 -0800, Stefano Stabellini wrote:
> > Another approach is to let the scheduler know that migration is
> > slower.
> > In fact this is not a new problem: it can be slow to migrate
> > interrupts,
> > even few non-LPIs interrupts, even on x86. I wonder if the Xen
> > scheduler
> > has any knowledge of that (CC'ing George and Dario). I guess that's
> > the
> > reason why most people run with dom0_vcpus_pin.
> >
> Oh, and about this last sentence.
> 
> I may indeed be lacking knowledge/understanding, but if you think this
> is a valid use case for dom0_vcpus_pin, I'd indeed be interested in
> knowing why.
> 
> In fact, that configuration has always looked rather awkward to me, and
> I think we should start thinking stopping providing the option at all
> (or changing/extending its behavior).
> 
> So, if you think you need it, please spell that out, and let's see if
> there are better ways to achieve the same. :-)

That's right, I think dom0_vcpus_pin is a good work-around for the lack
of scheduler knowledge about interrupts. If the scheduler knew that
moving vCPU0 from pCPU0 to pCPU1 is far more expensive than moving vCPU3
from pCPU3 to pCPU1 then it would make better decisions and we wouldn't
need dom0_vcpus_pin.
Stefano Stabellini Dec. 6, 2016, 10:01 p.m. UTC | #20
On Tue, 6 Dec 2016, Stefano Stabellini wrote:
> moving a vCPU with interrupts assigned to it is slower than moving a
> vCPU without interrupts assigned to it. You could say that the
> slowness is directly proportional do the number of interrupts assigned
> to the vCPU.

To be pedantic, by "assigned" I mean that a physical interrupt is routed
to a given pCPU and is set to be forwarded to a guest vCPU running on it
by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
these physical interrupts, a corresponding virtual interrupt (could be a
different irq) will be injected into the guest vCPU.

When the vCPU is migrated to a new pCPU, the physical interrupts that
are configured to be injected as virtual interrupts into the vCPU, are
migrated with it. The physical interrupt migration has a cost. However,
receiving physical interrupts on the wrong pCPU has a higher cost.
Dario Faggioli Dec. 6, 2016, 10:12 p.m. UTC | #21
On Tue, 2016-12-06 at 14:01 -0800, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
> > 
> > moving a vCPU with interrupts assigned to it is slower than moving
> > a
> > vCPU without interrupts assigned to it. You could say that the
> > slowness is directly proportional do the number of interrupts
> > assigned
> > to the vCPU.
> 
> To be pedantic, by "assigned" I mean that a physical interrupt is
> routed
> to a given pCPU and is set to be forwarded to a guest vCPU running on
> it
> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one
> of
> these physical interrupts, a corresponding virtual interrupt (could
> be a
> different irq) will be injected into the guest vCPU.
> 
> When the vCPU is migrated to a new pCPU, the physical interrupts that
> are configured to be injected as virtual interrupts into the vCPU,
> are
> migrated with it. The physical interrupt migration has a cost.
> However,
> receiving physical interrupts on the wrong pCPU has an higher cost.
>
Yeah, I got in what sense you meant "assigned", but thanks anyway for
this clarification. It indeed makes the picture clearer (even if just
FTR) :-)

Dario
Dario Faggioli Dec. 6, 2016, 10:39 p.m. UTC | #22
On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Dario Faggioli wrote:
> > Sorry if I can't be more useful than this for now. :-/
> 
> We don't need scheduler support to implement interrupt migration. The
> question was much simpler than that: moving a vCPU with interrupts
> assigned to it is slower than moving a vCPU without interrupts
> assigned
> to it. You could say that the slowness is directly proportional do
> the
> number of interrupts assigned to the vCPU. Does the scheduler know
> that?
> Or blindly moves vCPUs around? Also see below.
> 
Ah, ok, it is indeed a simpler question than I thought! :-)

Answer: no, the scheduler does not use the information of how many or
what interrupts are being routed to a vCPU in any way.

Just for the sake of correctness and precision, it does not "blindly
move vCPUs around": it follows some criteria for deciding
whether or not to move a vCPU, and if yes, where to, but among those
criteria, there is no trace of anything related to routed interrupts.

Let me also add that the criteria are scheduler specific, so they're
different, e.g., between Credit and Credit2.

Starting to consider routed interrupts as a migration criterion in Credit
would be rather difficult. Credit uses a 'best effort' approach for
migrating vCPUs, which is hard to augment.

Starting to consider routed interrupts as a migration criterion in
Credit2 would be much easier. Credit2's load balancer is specifically
designed to be extensible with things like that. It would require
some thinking, though, in order to figure out how important this
particular aspect would be, wrt others that are considered.

E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A
has a lot of routed interrupts, and moving it gives me perfect load
balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1)
should I move it or not?
Well, it depends on whether or not we think that the overhead we save
by not migrating outweighs the benefit of a perfectly balanced system.

Something like that...
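
If it helps, here is a toy model of that trade-off. It has nothing to do
with the actual Credit2 code, and the weight is a completely made-up
tunable, just to show the shape of the decision.

#include <stdbool.h>

#define IRQ_MIGRATION_WEIGHT  2   /* "load points" per routed interrupt */

/*
 * Compare the load imbalance removed by moving a vCPU against a penalty
 * proportional to the number of interrupts routed to it. Loads are in
 * percent of a pCPU.
 */
static bool worth_migrating(int load_src, int load_dst, int vcpu_load,
                            int routed_irqs)
{
    int before = load_src > load_dst ? load_src - load_dst
                                     : load_dst - load_src;
    int new_src = load_src - vcpu_load;
    int new_dst = load_dst + vcpu_load;
    int after = new_src > new_dst ? new_src - new_dst : new_dst - new_src;
    int benefit = before > after ? before - after : 0;

    return benefit > routed_irqs * IRQ_MIGRATION_WEIGHT;
}

With the numbers above (75%/25%, and vCPU A accounting for 25% of load),
the balancing benefit is 50, so the move would only happen if the
interrupt penalty stays below that.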

Regards,
Dario
Julien Grall Dec. 6, 2016, 11:13 p.m. UTC | #23
Hi Stefano,

On 06/12/2016 22:01, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
>> moving a vCPU with interrupts assigned to it is slower than moving a
>> vCPU without interrupts assigned to it. You could say that the
>> slowness is directly proportional do the number of interrupts assigned
>> to the vCPU.
>
> To be pedantic, by "assigned" I mean that a physical interrupt is routed
> to a given pCPU and is set to be forwarded to a guest vCPU running on it
> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
> these physical interrupts, a corresponding virtual interrupt (could be a
> different irq) will be injected into the guest vCPU.
>
> When the vCPU is migrated to a new pCPU, the physical interrupts that
> are configured to be injected as virtual interrupts into the vCPU, are
> migrated with it. The physical interrupt migration has a cost. However,
> receiving physical interrupts on the wrong pCPU has an higher cost.

I don't understand why it is a problem for you to receive the first 
interrupt on the wrong pCPU and move it if necessary.

While this may have a higher cost (I don't believe so) for the first 
received interrupt, migrating thousands of interrupts at the same time 
is very expensive and will likely get Xen stuck for a while (think about 
an ITS with a single command queue).

Furthermore, the current approach will move every single interrupt 
routed to the vCPU, even those that are disabled. That's pointless and a 
waste of resources. You may argue that we could skip the disabled ones, 
but in that case what would be the benefit of migrating the IRQs while 
migrating the vCPU?

So I would suggest spreading it over time. This also means fewer 
headaches for the scheduler developers.

Regards,
Julien Grall Dec. 6, 2016, 11:24 p.m. UTC | #24
Hi Dario,

On 06/12/2016 22:39, Dario Faggioli wrote:
> On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote:
>> On Tue, 6 Dec 2016, Dario Faggioli wrote:
>>> Sorry if I can't be more useful than this for now. :-/
>>
>> We don't need scheduler support to implement interrupt migration. The
>> question was much simpler than that: moving a vCPU with interrupts
>> assigned to it is slower than moving a vCPU without interrupts
>> assigned
>> to it. You could say that the slowness is directly proportional do
>> the
>> number of interrupts assigned to the vCPU. Does the scheduler know
>> that?
>> Or blindly moves vCPUs around? Also see below.
>>
> Ah, ok, it is indeed a simpler question than I thought! :-)
>
> Answer: no, the scheduler does not use the information of how many or
> what interrupts are being routed to a vCPU in any way.
>
> Just for the sake of correctness and precision, it does not "blindly
> moves vCPUs around", as in, it follows some criteria for deciding
> whether or not to move a vCPU, and if yes, where to, but among those
> criteria, there is no trace of anything related to routed interrupts.
>
> Let me also add that the criteria are scheduler specific, so they're
> different, e.g., between Credit and Credit2.
>
> Starting considering routed interrupt as a migration criteria in Credit
> would be rather difficult. Credit use a 'best effort' approach for
> migrating vCPUs, which is hard to augment.
>
> Starting considering routed interrupt as a migration criteria in
> Credit2 would be much easier. Credit2's load balancer is specifically
> designed for being extendible with things like that. It would require
> some thinking, though, in order to figure out how important this
> particular aspect would be, wrt others that are considered.
>
> E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A
> has a lot of routed interrupts, and moving it gives me perfect load
> balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1)
> should I move it or not?
> Well, it depends if whether or not we think that the overhead we save
> by not migrating outweights the benefit of a perfectly balanced system.
>
> Something like that...

This idea of migrating IRQs while migrating a vCPU already looks wrong to 
me, as it is really expensive; I am talking easily in terms of 
milliseconds, as in some cases Xen will have to interact with a shared 
command queue.

Spreading the load is much better and avoids directly migrating IRQs that 
are disabled when the vCPU is being migrated.

I cannot see any reason not to do that, as a change of interrupt 
affinity may not take effect immediately. By that, I mean it might still 
be possible to receive another interrupt on the wrong pCPU.

I really think we should make vCPU migration much simpler (e.g. avoid 
this big loop over the interrupts). In the end, if we really expect the 
scheduler to migrate the vCPU to a different pCPU, we should also expect 
that receiving the interrupt on the wrong pCPU does not happen often.

Regards,
Dario Faggioli Dec. 7, 2016, 12:17 a.m. UTC | #25
On Tue, 2016-12-06 at 23:24 +0000, Julien Grall wrote:
> I really think we should make the vCPU migration much simpler (e.g
> avoid 
> this big loop over interrupt). In fine, if we really expect the 
> scheduler to migrate the vCPU on a different pCPU. We should also
> expect 
> receiving the interrupt on the wrong pCPU may not happen often.
> 
This makes sense to me, but as I said, I don't really know. I mean, I
understand what you're explaining, but I hadn't considered this before,
and I don't have any performance figures.

I hope I managed to explain that, if we want to take this into account
during migration, as Stefano was asking, there's a way to do that in
Credit2. That's it. I'll be happy to help deal with whatever you decide
among yourselves is best for ARM, if it has scheduling
implications. :-)

And while we're here, if considering this specific aspect is not a good
idea, but you (anyone!) have other things in mind that could be
interesting to take into account when evaluating whether or not to
migrate a vCPU, I'd be interested to know.

After all, the advantage of having our own scheduler (e.g., wrt KVM
that has to use the Linux one), is exactly this, i.e., that we can
focus a lot more on virtualization specific aspects. So, really, I'm
all ears. :-D

Regards,
Dario
Stefano Stabellini Dec. 7, 2016, 8:20 p.m. UTC | #26
On Tue, 6 Dec 2016, Julien Grall wrote:
> On 06/12/2016 22:01, Stefano Stabellini wrote:
> > On Tue, 6 Dec 2016, Stefano Stabellini wrote:
> > > moving a vCPU with interrupts assigned to it is slower than moving a
> > > vCPU without interrupts assigned to it. You could say that the
> > > slowness is directly proportional do the number of interrupts assigned
> > > to the vCPU.
> > 
> > To be pedantic, by "assigned" I mean that a physical interrupt is routed
> > to a given pCPU and is set to be forwarded to a guest vCPU running on it
> > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
> > these physical interrupts, a corresponding virtual interrupt (could be a
> > different irq) will be injected into the guest vCPU.
> > 
> > When the vCPU is migrated to a new pCPU, the physical interrupts that
> > are configured to be injected as virtual interrupts into the vCPU, are
> > migrated with it. The physical interrupt migration has a cost. However,
> > receiving physical interrupts on the wrong pCPU has an higher cost.
> 
> I don't understand why it is a problem for you to receive the first interrupt
> to the wrong pCPU and moving it if necessary.
> 
> While this may have an higher cost (I don't believe so) on the first received
> interrupt, migrating thousands of interrupts at the same time is very
> expensive and will likely get Xen stuck for a while (think about ITS with a
> single command queue).
> 
> Furthermore, the current approach will move every single interrupt routed a
> the vCPU, even those disabled. That's pointless and a waste of resource. You
> may argue that we can skip the ones disabled, but in that case what would be
> the benefits to migrate the IRQs while migrate the vCPUs?
>
> So I would suggest to spread it over the time. This also means less headache
> for the scheduler developers.

The most important aspect of interrupt handling in Xen is latency,
measured as the time between Xen receiving a physical interrupt and the
guest receiving it. This latency should be both small and deterministic.

We all agree so far, right?


The issue with spreading interrupt migrations over time is that it makes
interrupt latency less deterministic. It is OK, in the uncommon case of
vCPU migration with interrupts, to take a hit for a short time. This
"hit" can be measured. It can be known. If your workload cannot tolerate
it, vCPUs can be pinned. It should be a rare event anyway. On the other
hand, by spreading interrupt migrations, we make it harder to predict
latency. Aside from determinism, another problem with this approach is
that it ensures that every interrupt assigned to a vCPU will first hit
the wrong pCPU, then it will be moved. It guarantees the worst-case
scenario for interrupt latency for the vCPU that has been moved. If we
migrated all interrupts as soon as possible, we would minimize the
number of interrupts delivered to the wrong pCPU. Most interrupts would
be delivered to the new pCPU right away, reducing interrupt latency.

Regardless of how we implement interrupt migration on ARM, I think it
still makes sense for the scheduler to know about it. I realize that
this is a separate point. Even if we spread interrupt migrations over
time, it still has a cost, in terms of latency as I wrote above, but also
in terms of interactions with interrupt controllers and ITSes. A vCPU
with no interrupts assigned to it poses no such problems. The scheduler
should be aware of the difference. If the scheduler knew, I bet that
vCPU migration would be a rare event for vCPUs that have many interrupts
assigned to them. For example, Dom0 vCPU0 would never be moved, and
dom0_vcpus_pin would be superfluous.
Stefano Stabellini Dec. 7, 2016, 8:21 p.m. UTC | #27
On Tue, 6 Dec 2016, Dario Faggioli wrote:
> On Tue, 2016-12-06 at 13:53 -0800, Stefano Stabellini wrote:
> > On Tue, 6 Dec 2016, Dario Faggioli wrote:
> > > Sorry if I can't be more useful than this for now. :-/
> > 
> > We don't need scheduler support to implement interrupt migration. The
> > question was much simpler than that: moving a vCPU with interrupts
> > assigned to it is slower than moving a vCPU without interrupts
> > assigned
> > to it. You could say that the slowness is directly proportional do
> > the
> > number of interrupts assigned to the vCPU. Does the scheduler know
> > that?
> > Or blindly moves vCPUs around? Also see below.
> > 
> Ah, ok, it is indeed a simpler question than I thought! :-)
> 
> Answer: no, the scheduler does not use the information of how many or
> what interrupts are being routed to a vCPU in any way.
> 
> Just for the sake of correctness and precision, it does not "blindly
> moves vCPUs around", as in, it follows some criteria for deciding
> whether or not to move a vCPU, and if yes, where to, but among those
> criteria, there is no trace of anything related to routed interrupts.
> 
> Let me also add that the criteria are scheduler specific, so they're
> different, e.g., between Credit and Credit2.
> 
> Starting considering routed interrupt as a migration criteria in Credit
> would be rather difficult. Credit use a 'best effort' approach for
> migrating vCPUs, which is hard to augment.
> 
> Starting considering routed interrupt as a migration criteria in
> Credit2 would be much easier. Credit2's load balancer is specifically
> designed for being extendible with things like that. It would require
> some thinking, though, in order to figure out how important this
> particular aspect would be, wrt others that are considered.
> 
> E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU A
> has a lot of routed interrupts, and moving it gives me perfect load
> balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1)
> should I move it or not?
> Well, it depends if whether or not we think that the overhead we save
> by not migrating outweights the benefit of a perfectly balanced system.

Right. I don't know where to draw the line. I don't know how much weight it
should have, but certainly it shouldn't be considered the same thing as
moving any other vCPU.
Dario Faggioli Dec. 9, 2016, 10:14 a.m. UTC | #28
On Wed, 2016-12-07 at 12:21 -0800, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Dario Faggioli wrote:
> > E.g., if I have pCPU 0 loaded at 75% and pCPU 1 loaded at 25%, vCPU
> > A
> > has a lot of routed interrupts, and moving it gives me perfect load
> > balancing (i.e., load will become 50% on pCPU 0 and 50% on pCPU 1)
> > should I move it or not?
> > Well, it depends if whether or not we think that the overhead we
> > save
> > by not migrating outweights the benefit of a perfectly balanced
> > system.
> 
> Right. I don't know where to draw the line. I don't how much weight
> it
> should have, but certainly it shouldn't be considered the same thing
> as
> moving any other vCPU.
>
Right. As I said, Credit2's load balancer is nice and easy to extend
already --and needs to become nicer and easier to extend in order to
deal with soft-affinity, so I'll work on that soon (there are patches out
for soft-affinity which do sort of something like that, but I'm not
entirely satisfied with them, so I'll probably rework that part).

At that point, I'll be more than happy to consider this, and try to
reason about how much it should be weighted. After all, the only thing
we need in order to take this information into account when making load
balancing decisions is a mechanism for knowing how many of these routed
interrupts a vCPU has, and of course this needs to be:
 - easy to use,
 - super quick (load balancing is a hot path),
 - architecture independent,

is this the case already? :-)

Thanks and Regards,
Dario
Julien Grall Dec. 9, 2016, 6:01 p.m. UTC | #29
Hi Stefano,

On 07/12/16 20:20, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Julien Grall wrote:
>> On 06/12/2016 22:01, Stefano Stabellini wrote:
>>> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
>>>> moving a vCPU with interrupts assigned to it is slower than moving a
>>>> vCPU without interrupts assigned to it. You could say that the
>>>> slowness is directly proportional do the number of interrupts assigned
>>>> to the vCPU.
>>>
>>> To be pedantic, by "assigned" I mean that a physical interrupt is routed
>>> to a given pCPU and is set to be forwarded to a guest vCPU running on it
>>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
>>> these physical interrupts, a corresponding virtual interrupt (could be a
>>> different irq) will be injected into the guest vCPU.
>>>
>>> When the vCPU is migrated to a new pCPU, the physical interrupts that
>>> are configured to be injected as virtual interrupts into the vCPU, are
>>> migrated with it. The physical interrupt migration has a cost. However,
>>> receiving physical interrupts on the wrong pCPU has an higher cost.
>>
>> I don't understand why it is a problem for you to receive the first interrupt
>> to the wrong pCPU and moving it if necessary.
>>
>> While this may have an higher cost (I don't believe so) on the first received
>> interrupt, migrating thousands of interrupts at the same time is very
>> expensive and will likely get Xen stuck for a while (think about ITS with a
>> single command queue).
>>
>> Furthermore, the current approach will move every single interrupt routed a
>> the vCPU, even those disabled. That's pointless and a waste of resource. You
>> may argue that we can skip the ones disabled, but in that case what would be
>> the benefits to migrate the IRQs while migrate the vCPUs?
>>
>> So I would suggest to spread it over the time. This also means less headache
>> for the scheduler developers.
>
> The most important aspect of interrupts handling in Xen is latency,
> measured as the time between Xen receiving a physical interrupt and the
> guest receiving it. This latency should be both small and deterministic.
>
> We all agree so far, right?
>
>
> The issue with spreading interrupts migrations over time is that it makes
> interrupt latency less deterministic. It is OK, in the uncommon case of
> vCPU migration with interrupts, to take a hit for a short time.  This
> "hit" can be measured. It can be known. If your workload cannot tolerate
> it, vCPUs can be pinned. It should be a rare event anyway.  On the other
> hand, by spreading interrupts migrations, we make it harder to predict
> latency. Aside from determinism, another problem with this approach is
> that it ensures that every interrupt assigned to a vCPU will first hit
> the wrong pCPU, then it will be moved.  It guarantees the worst-case
> scenario for interrupt latency for the vCPU that has been moved. If we
> migrated all interrupts as soon as possible, we would minimize the
> amount of interrupts delivered to the wrong pCPU. Most interrupts would
> be delivered to the new pCPU right away, reducing interrupt latency.

Migrating all the interrupts can be really expensive because in the
current state we have to go through every single interrupt and check
whether the interrupt has been routed to this vCPU. We will also route
disabled interrupts, which seems really pointless. This may need some
optimization.

With ITS, we may have thousands of interrupts routed to a vCPU. This 
means that for every interrupt we have to issue a command in the host 
ITS queue. You will likely fill up the command queue and add much more 
latency.

Even if you consider vCPU migration to be a rare case, you could still
get the pCPU stuck for tens of milliseconds, the time it takes to migrate
everything. And I don't think this is acceptable.

Anyway, I would like to see measurements in both situations before 
deciding when LPIs will be migrated.

> Regardless of how we implement interrupts migrations on ARM, I think it
> still makes sense for the scheduler to know about it. I realize that
> this is a separate point. Even if we spread interrupts migrations over
> time, it still has a cost, in terms of latency as I wrote above, but also
> in terms of interactions with interrupt controllers and ITSes. A vCPU
> with no interrupts assigned to it poses no such problems. The scheduler
> should be aware of the difference. If the scheduler knew, I bet that
> vCPU migration would be a rare event for vCPUs that have many interrupts
> assigned to them. For example, Dom0 vCPU0 would never be moved, and
> dom0_pin_vcpus would be superfluous.

The number of interrupts routed to a vCPU will vary over time, depending
on what the guest decides to do, so you need the scheduler to adapt. And
in the end, you give the guest a chance to "control" the scheduler
depending on how the interrupts are spread between vCPUs.

If the number increases, you may end up with the scheduler deciding not
to migrate the vCPU because it would be too expensive. But you may have a
situation where migrating a vCPU with many interrupts is the only
possible choice, and then you will slow down the platform.

Cheers,
Andre Przywara Dec. 9, 2016, 6:07 p.m. UTC | #30
Hi,

On 07/12/16 20:20, Stefano Stabellini wrote:
> On Tue, 6 Dec 2016, Julien Grall wrote:
>> On 06/12/2016 22:01, Stefano Stabellini wrote:
>>> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
>>>> moving a vCPU with interrupts assigned to it is slower than moving a
>>>> vCPU without interrupts assigned to it. You could say that the
>>>> slowness is directly proportional do the number of interrupts assigned
>>>> to the vCPU.
>>>
>>> To be pedantic, by "assigned" I mean that a physical interrupt is routed
>>> to a given pCPU and is set to be forwarded to a guest vCPU running on it
>>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
>>> these physical interrupts, a corresponding virtual interrupt (could be a
>>> different irq) will be injected into the guest vCPU.
>>>
>>> When the vCPU is migrated to a new pCPU, the physical interrupts that
>>> are configured to be injected as virtual interrupts into the vCPU, are
>>> migrated with it. The physical interrupt migration has a cost. However,
>>> receiving physical interrupts on the wrong pCPU has an higher cost.
>>
>> I don't understand why it is a problem for you to receive the first interrupt
>> to the wrong pCPU and moving it if necessary.
>>
>> While this may have an higher cost (I don't believe so) on the first received
>> interrupt, migrating thousands of interrupts at the same time is very
>> expensive and will likely get Xen stuck for a while (think about ITS with a
>> single command queue).
>>
>> Furthermore, the current approach will move every single interrupt routed a
>> the vCPU, even those disabled. That's pointless and a waste of resource. You
>> may argue that we can skip the ones disabled, but in that case what would be
>> the benefits to migrate the IRQs while migrate the vCPUs?
>>
>> So I would suggest to spread it over the time. This also means less headache
>> for the scheduler developers.
> 
> The most important aspect of interrupts handling in Xen is latency,
> measured as the time between Xen receiving a physical interrupt and the
> guest receiving it. This latency should be both small and deterministic.
> 
> We all agree so far, right?
> 
> 
> The issue with spreading interrupts migrations over time is that it makes
> interrupt latency less deterministic. It is OK, in the uncommon case of
> vCPU migration with interrupts, to take a hit for a short time. This
> "hit" can be measured. It can be known. If your workload cannot tolerate
> it, vCPUs can be pinned. It should be a rare event anyway. On the other
> hand, by spreading interrupts migrations, we make it harder to predict
> latency. Aside from determinism, another problem with this approach is
> that it ensures that every interrupt assigned to a vCPU will first hit
> the wrong pCPU, then it will be moved. It guarantees the worst-case
> scenario for interrupt latency for the vCPU that has been moved. If we
> migrated all interrupts as soon as possible, we would minimize the
> amount of interrupts delivered to the wrong pCPU. Most interrupts would
> be delivered to the new pCPU right away, reducing interrupt latency.

So if this is such a crucial issue, why don't we use the ITS for good
this time? The ITS hardware probably supports 16 bits worth of
collection IDs, so what about assigning each VCPU (in every guest) a
unique collection ID on the host and doing a MAPC & MOVALL on a VCPU
migration to let it point to the right physical redistributor?
I see that this does not cover all use cases (> 65536 VCPUs, for
instance), and it also depends on many implementation details:
- How costly is a MOVALL? It needs to scan the pending table and
transfer set bits to the other redistributor, which may take a while.
- Is there an impact if we exceed the number of hardware-backed
collections (GITS_TYPER.HCC)? If the ITS is forced to access system
memory for every table lookup, this may slow down everyday operations.
- How likely are those misdirected interrupts in the first place? How
often do we migrate VCPUs compared to the interrupt frequency?

There are more subtle parameters to consider, so I guess we just need
to try and measure.
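
Just to sketch the idea in code (everything here is invented for
illustration; its_send_cmd_mapc()/its_send_cmd_movall() and
vcpu_collection_id() are assumed helpers, not code from this series):

/*
 * On vCPU migration: remap the vCPU's dedicated collection to the new
 * redistributor, then let the ITS move any pending LPIs over itself.
 */
static int vcpu_move_collection(struct host_its *its, struct vcpu *v,
                                uint64_t old_rdbase, uint64_t new_rdbase)
{
    uint16_t collection = vcpu_collection_id(v);  /* reserved at VCPU creation */
    int ret;

    ret = its_send_cmd_mapc(its, collection, new_rdbase, true);
    if ( ret )
        return ret;

    return its_send_cmd_movall(its, old_rdbase, new_rdbase);
}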

> Regardless of how we implement interrupts migrations on ARM, I think it
> still makes sense for the scheduler to know about it. I realize that
> this is a separate point. Even if we spread interrupts migrations over
> time, it still has a cost, in terms of latency as I wrote above, but also
> in terms of interactions with interrupt controllers and ITSes. A vCPU
> with no interrupts assigned to it poses no such problems. The scheduler
> should be aware of the difference. If the scheduler knew, I bet that
> vCPU migration would be a rare event for vCPUs that have many interrupts
> assigned to them. For example, Dom0 vCPU0 would never be moved, and
> dom0_pin_vcpus would be superfluous.

That's a good point, so indeed the "interrupt load" should be a
scheduler parameter. But as you said: that's a different story.

Cheers,
Andre.
Andre Przywara Dec. 9, 2016, 7 p.m. UTC | #31
On 03/12/16 00:46, Stefano Stabellini wrote:
> On Fri, 2 Dec 2016, Andre Przywara wrote:
>> Hi,

Hi Stefano,

I started to answer this email some days ago, but then spent some time
on actually implementing what I suggested, hence the delay ...

>>
>> sorry for chiming in late ....
>>
>> I've been spending some time thinking about this, and I think we can in
>> fact get away without ever propagating command from domains to the host.
>>
>> I made a list of all commands that possible require host ITS command
>> propagation. There are two groups:
>> 1: enabling/disabling LPIs: INV, INVALL
>> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI.
>>
>> The second group can be handled by mapping all required devices up
>> front, I will elaborate on that in a different email.
>>
>> For the first group, read below ...
>>
>> On 01/12/16 01:19, Stefano Stabellini wrote:
>>> On Fri, 25 Nov 2016, Julien Grall wrote:
>>>> Hi,
>>>>
>>>> On 18/11/16 18:39, Stefano Stabellini wrote:
>>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
>>>>>> On Fri, 11 Nov 2016, Julien Grall wrote:
>>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote:
>>>>>>> That's why in the approach we had on the previous series was "host ITS
>>>>>>> command
>>>>>>> should be limited when emulating guest ITS command". From my recall, in
>>>>>>> that
>>>>>>> series the host and guest LPIs was fully separated (enabling a guest
>>>>>>> LPIs was
>>>>>>> not enabling host LPIs).
>>>>>>
>>>>>> I am interested in reading what Ian suggested to do when the physical
>>>>>> ITS queue is full, but I cannot find anything specific about it in the
>>>>>> doc.
>>>>>>
>>>>>> Do you have a suggestion for this?
>>>>>>
>>>>>> The only things that come to mind right now are:
>>>>>>
>>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock
>>>>>> style)
>>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
>>>>>
>>>>> Another, probably better idea, is to map all pLPIs of a device when the
>>>>> device is assigned to a guest (including Dom0). This is what was written
>>>>> in Ian's design doc. The advantage of this approach is that Xen doesn't
>>>>> need to take any actions on the physical ITS command queue when the
>>>>> guest issues virtual ITS commands, therefore completely solving this
>>>>> problem at the root. (Although I am not sure about enable/disable
>>>>> commands: could we avoid issuing enable/disable on pLPIs?)
>>>>
>>>> In the previous design document (see [1]), the pLPIs are enabled when the
>>>> device is assigned to the guest. This means that it is not necessary to send
>>>> command there. This is also means we may receive a pLPI before the associated
>>>> vLPI has been configured.
>>>>
>>>> That said, given that LPIs are edge-triggered, there is no deactivate state
>>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
>>>> LPIs could potentially be raised again. This could generate a storm.
>>>
>>> Thank you for raising this important point. You are correct.
>>>
>>>> The priority drop is necessary if we don't want to block the reception of
>>>> interrupt for the current physical CPU.
>>>>
>>>> What I am more concerned about is this problem can also happen in normal
>>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
>>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
>>>> it is time to introduce rate-limit interrupt for ARM. Any opinions?
>>>
>>> Yes. It could be as simple as disabling the pLPI when Xen receives a
>>> second pLPI before the guest EOIs the first corresponding vLPI, which
>>> shouldn't happen in normal circumstances.
>>>
>>> We need a simple per-LPI inflight counter, incremented when a pLPI is
>>> received, decremented when the corresponding vLPI is EOIed (the LR is
>>> cleared).
>>>
>>> When the counter > 1, we disable the pLPI and request a maintenance
>>> interrupt for the corresponding vLPI.
>>
>> So why do we need a _counter_? This is about edge triggered interrupts,
>> I think we can just accumulate all of them into one.
> 
> The counter is not to re-inject the same amount of interrupts into the
> guest, but to detect interrupt storms.

I was wondering if an interrupt "storm" could already be defined by
"receiving an LPI while there is already one pending (in the guest's
virtual pending table) and it being disabled by the guest". I admit that
declaring two interrupts as a storm is a bit of a stretch, but in fact
the guest probably had a reason for disabling it even though it
fires, so Xen should just follow suit.
The only difference is that we don't do it _immediately_ when the guest
tells us (via INV), but only if needed (LPI actually fires).
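
To make that storm test concrete, a rough sketch of the pLPI path (all
helper names here are made up for illustration, not the actual code):

/* proptable: guest LPI property table, vpending: per-vCPU pending bitmap */
static void handle_plpi(uint32_t plpi, struct vcpu *v, uint32_t vlpi,
                        const uint8_t *proptable, unsigned long *vpending)
{
    if ( proptable[vlpi] & 0x1 )          /* bit 0: enabled by the guest */
    {
        vgic_vcpu_inject_lpi(v, vlpi);    /* normal case: just forward it */
        return;
    }

    /* Disabled by the guest: remember that it fired ... */
    if ( test_and_set_bit(vlpi, vpending) )
        /* ... and if it was already pending, treat this as a storm. */
        disable_host_lpi(plpi);
}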

>> So here is what I think:
>> - We use the guest provided pending table to hold a pending bit for each
>> VLPI. We can unmap the memory from the guest, since software is not
>> supposed to access this table as per the spec.
>> - We use the guest provided property table, without trapping it. There
>> is nothing to be "validated" in that table, since it's a really tight
>> encoding and every value written in there is legal. We only look at bit
>> 0 for this exercise here anyway.
> 
> I am following...
> 
> 
>> - Upon reception of a physical LPI, we look it up to find the VCPU and
>> virtual LPI number. This is what we need to do anyway and it's a quick
>> two-level table lookup at the moment.
>> - We use the VCPU's domain and the VLPI number to index the guest's
>> property table and read the enabled bit. Again a quick table lookup.
> 
> They should be both O(2), correct?

The second is even O(1). And even the first one could be a single table,
if desperately needed.
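
Roughly, in code (the data layout and every name below are made up just
to illustrate the two lookups; they are not taken from the patches):

#define LPIS_PER_CHUNK 1024
#define MAX_NR_CHUNKS  64

/* One entry per host LPI: which domain/vCPU/virtual LPI it maps to. */
struct vlpi_map {
    uint16_t dom_id;
    uint16_t vcpu_id;
    uint32_t virt_lpi;                        /* 0 == not mapped */
};

static struct vlpi_map *hlpi_chunks[MAX_NR_CHUNKS];

/* First lookup: host LPI -> <domain, vCPU, vLPI>; two array indexings. */
static bool hlpi_lookup(uint32_t host_lpi, struct vlpi_map *out)
{
    struct vlpi_map *chunk = hlpi_chunks[host_lpi / LPIS_PER_CHUNK];

    if ( !chunk || !chunk[host_lpi % LPIS_PER_CHUNK].virt_lpi )
        return false;

    *out = chunk[host_lpi % LPIS_PER_CHUNK];
    return true;
}

/* Second lookup: a single byte read in the guest's property table. */
static bool vlpi_enabled(const uint8_t *proptable, uint32_t vlpi)
{
    return proptable[vlpi] & 0x1;             /* bit 0: enable */
}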

>>  - If the VLPI is enabled, we EOI it on the host and inject it.
>>  - If the VLPI is disabled, we set the pending bit in the VCPU's
>>    pending table and EOI on the host - to allow other IRQs.
>> - On a guest INV command, we check whether that vLPI is now enabled:
>>  - If it is disabled now, we don't need to do anything.
>>  - If it is enabled now, we check the pending bit for that VLPI:
>>   - If it is 0, we don't do anything.
>>   - If it is 1, we inject the VLPI and clear the pending bit.
>> - On a guest INVALL command, we just need to iterate over the virtual
>> LPIs.
> 
> Right, much better.
> 
> 
>> If you look at the conditions above, the only immediate action is
>> when a VLPI gets enabled _and_ its pending bit is set. So we can do
>> 64-bit read accesses over the whole pending table to find non-zero words
>> and thus set bits, which should be rare in practice. We can store the
>> highest mapped VLPI to avoid iterating over the whole of the table.
>> Ideally the guest has no direct control over the pending bits, since
>> this is what the device generates. Also we limit the number of VLPIs in
>> total per guest anyway.
> 
> I wonder if we could even use a fully packed bitmask with only the
> pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would
> be a nice improvement.

The _pending_ table is exactly that: one bit per VLPI. So by doing a
64-bit read we cover 64 VLPIs. And normally if an LPI fires it will
probably be enabled (otherwise the guest would have disabled it in the
device), so we inject it and don't need this table. It's
really just for storing the pending status should an LPI arrive while
the guest had _disabled_ it. I assume this is rather rare, so the table
will mostly be empty: that's why I expect most reads to be 0 and the
iteration of the table to be very quick. As an additional optimization
we could store the highest and lowest virtually pending LPI, to avoid
scanning the whole table.
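
As a sketch of that scan (highest_vlpi and inject_and_clear_pending()
are placeholders for whatever the real implementation provides):

/* Walk the per-vCPU pending table (one bit per vLPI) in 64-bit steps. */
static void scan_vpending(struct vcpu *v, uint64_t *vpending,
                          const uint8_t *proptable, uint32_t highest_vlpi)
{
    uint32_t base;
    unsigned int bit;

    for ( base = 0; base <= highest_vlpi; base += 64 )
    {
        uint64_t word = vpending[base / 64];

        if ( !word )                          /* the expected common case */
            continue;

        for ( bit = 0; bit < 64; bit++ )
        {
            if ( !(word & ((uint64_t)1 << bit)) )
                continue;

            if ( proptable[base + bit] & 0x1 )        /* enabled now? */
                inject_and_clear_pending(v, base + bit);
        }
    }
}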

We can't do so much about the property table, though, because its layout
is described in the spec - in contrast to the ITS tables, which are IMPDEF.
But as we only need to do something if the LPI is _both_ enabled _and_
pending, scanning the pending table gives us a quite good filter
already. For the few LPIs that hit here, we can just access the right
byte in the property table.

>> If that still sounds like a DOS vector, we could additionally rate-limit
>> INVALLs, and/or track additions to the pending table after the last
>> INVALL: if there haven't been any new pending bits since the last scan,
>> INVALL is a NOP.
>>
>> Does that makes sense so far?
> 
> It makes sense. It should be OK.
> 
> 
>> So that just leaves us with this IRQ storm issue, which I am thinking
>> about now. But I guess this is not a show stopper given we can disable
>> the physical LPI if we sense this situation.
> 
> That is true and it's exactly what we should do.
> 
> 
>>> When we receive the maintenance interrupt and we clear the LR of the
>>> vLPI, Xen should re-enable the pLPI.

So I was thinking why you would need to wait for the guest to actually
EOI it?
Can't we end the interrupt storm condition at the moment the guest
enables the interrupt? LPIs are edge triggered and so a storm in the
past is easily merged into a single guest LPI once the guest enables it
again. From there on we inject every triggered LPI into the guest.

This special handling for the interrupt storm just stems from the fact
that we have to keep LPIs enabled on the h/w interrupt controller level,
despite the guest having disabled it on its own _virtual_ GIC. So once
the guest enables it again, we are in line with the current GICv2/GICv3,
aren't we? Do we have interrupt storm detection/prevention at the moment?
And please keep in mind that LPIs are _always_ edge triggered: So once
we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC
point of view, the next incoming interrupt must have been triggered by a
new interrupt condition in the device (new network packet, for
instance). In contrast to GICv2 this applies to _every_ LPI.

So I am not sure we should really care _too_ much about this (apart from
the "guest has disabled it" part): Once we assign a device to a guest,
we lose some control over the machine anyway and at least trust the
device to not completely block the system. I don't see how the ITS
differs in that respect from the GICv3/GICv2.


A quick update on my side: I implemented the scheme I described in my
earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS:
- On receiving the PHYSDEVOP_manage_pci_add hypercall in
xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of
interrupts and enable them. We keep them unassigned in our host
pLPI->VLPI table, so we discard them should they fire.
This hypercall is issued by Dom0 Linux before bringing up any PCI
devices, so it works even for Dom0 without any Linux changes. For DomUs
with PCI passthrough Dom0 is expected to issue this hypercall on behalf
of the to-be-created domain.
- When a guest (be it Dom0 or DomU) actually maps an LPI (MAPTI), we
just enter the virtual LPI number and the target VCPU in our pLPI-vLPI
table and be done. Should it fire now, we know where to inject it, but
refer to the enabled bit in the guest's property table before doing so.
- When a guest (be it Dom0 or DomU) enables or disables an interrupt, we
don't do much, as we refer to the enable bit every time we want to
inject already. The only thing I actually do is to inject an LPI if
there is a virtual LPI pending and the LPI is now enabled.
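
In code, that last point boils down to something like this (sketch only,
with invented helper names):

/* Called when a guest INV/INVALL turns a vLPI's enable bit on. */
static void vlpi_sync_enable(struct vcpu *v, uint32_t vlpi,
                             const uint8_t *proptable, unsigned long *vpending)
{
    if ( !(proptable[vlpi] & 0x1) )            /* still disabled: nothing to do */
        return;

    if ( test_and_clear_bit(vlpi, vpending) )  /* did it fire while disabled? */
        vgic_vcpu_inject_lpi(v, vlpi);
}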

I will spend some time next week on updating the design document,
describing the new approach. I hope it becomes a bit clearer then.

Cheers,
Andre.

>>> Given that the state of the LRs is sync'ed before calling gic_interrupt,
>>> we can be sure to know exactly in what state the vLPI is at any given
>>> time. But for this to work correctly, it is important to configure the
>>> pLPI to be delivered to the same pCPU running the vCPU which handles
>>> the vLPI (as it is already the case today for SPIs).
>>
>> Why would that be necessary?
> 
> Because the state of the LRs of other pCPUs won't be up to date: we
> wouldn't know for sure whether the guest EOI'ed the vLPI or not.
Stefano Stabellini Dec. 9, 2016, 8:13 p.m. UTC | #32
On Fri, 9 Dec 2016, Julien Grall wrote:
> Hi Stefano,
> 
> On 07/12/16 20:20, Stefano Stabellini wrote:
> > On Tue, 6 Dec 2016, Julien Grall wrote:
> > > On 06/12/2016 22:01, Stefano Stabellini wrote:
> > > > On Tue, 6 Dec 2016, Stefano Stabellini wrote:
> > > > > moving a vCPU with interrupts assigned to it is slower than moving a
> > > > > vCPU without interrupts assigned to it. You could say that the
> > > > > slowness is directly proportional do the number of interrupts assigned
> > > > > to the vCPU.
> > > > 
> > > > To be pedantic, by "assigned" I mean that a physical interrupt is routed
> > > > to a given pCPU and is set to be forwarded to a guest vCPU running on it
> > > > by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
> > > > these physical interrupts, a corresponding virtual interrupt (could be a
> > > > different irq) will be injected into the guest vCPU.
> > > > 
> > > > When the vCPU is migrated to a new pCPU, the physical interrupts that
> > > > are configured to be injected as virtual interrupts into the vCPU, are
> > > > migrated with it. The physical interrupt migration has a cost. However,
> > > > receiving physical interrupts on the wrong pCPU has an higher cost.
> > > 
> > > I don't understand why it is a problem for you to receive the first
> > > interrupt
> > > to the wrong pCPU and moving it if necessary.
> > > 
> > > While this may have an higher cost (I don't believe so) on the first
> > > received
> > > interrupt, migrating thousands of interrupts at the same time is very
> > > expensive and will likely get Xen stuck for a while (think about ITS with
> > > a
> > > single command queue).
> > > 
> > > Furthermore, the current approach will move every single interrupt routed
> > > a
> > > the vCPU, even those disabled. That's pointless and a waste of resource.
> > > You
> > > may argue that we can skip the ones disabled, but in that case what would
> > > be
> > > the benefits to migrate the IRQs while migrate the vCPUs?
> > > 
> > > So I would suggest to spread it over the time. This also means less
> > > headache
> > > for the scheduler developers.
> > 
> > The most important aspect of interrupts handling in Xen is latency,
> > measured as the time between Xen receiving a physical interrupt and the
> > guest receiving it. This latency should be both small and deterministic.
> > 
> > We all agree so far, right?
> > 
> > 
> > The issue with spreading interrupts migrations over time is that it makes
> > interrupt latency less deterministic. It is OK, in the uncommon case of
> > vCPU migration with interrupts, to take a hit for a short time.  This
> > "hit" can be measured. It can be known. If your workload cannot tolerate
> > it, vCPUs can be pinned. It should be a rare event anyway.  On the other
> > hand, by spreading interrupts migrations, we make it harder to predict
> > latency. Aside from determinism, another problem with this approach is
> > that it ensures that every interrupt assigned to a vCPU will first hit
> > the wrong pCPU, then it will be moved.  It guarantees the worst-case
> > scenario for interrupt latency for the vCPU that has been moved. If we
> > migrated all interrupts as soon as possible, we would minimize the
> > amount of interrupts delivered to the wrong pCPU. Most interrupts would
> > be delivered to the new pCPU right away, reducing interrupt latency.
> 
> Migrating all the interrupts can be really expensive because in the current
> state we have to go through every single interrupt and check whether the
> interrupt has been routed to this vCPU. We will also route disabled interrupt.
> And this seems really pointless. This may need some optimization here.

Indeed, that should be fixed.


> With ITS, we may have thousand of interrupts routed to a vCPU. This means that
> for every interrupt we have to issue a command in the host ITS queue. You will
> likely fill up the command queue and add much more latency.
> 
> Even if you consider the vCPU migration to be a rare case. You could still get
> the pCPU stuck for tens of milliseconds, the time to migrate everything. And I
> don't think this is not acceptable.
[...]
> If the number increases, you may end up to have the scheduler to decide to not
> migrate the vCPU because it will be too expensive. But you may have a
> situation where migrating a vCPU with many interrupts is the only possible
> choice and you will slow down the platform.

A vCPU with thousands of interrupts routed to it is the case where I
would push back to the scheduler. It should know that moving the vCPU
would be very costly.

Regardless, we need to figure out a way to move the interrupts without
"blocking" the platform for long. In practice, we might find a
threshold: a number of active interrupts above which we cannot move them
all at once anymore. Something like: we move the first 500 active
interrupts immediately and delay the rest. We can find this threshold
only with practical measurements.
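
Just to sketch what such a threshold could look like (the value 500, the
iterator and all helpers below are placeholders, not a concrete proposal):

#define MIGRATE_BATCH 500

static void migrate_vcpu_irqs(struct vcpu *v, unsigned int new_cpu)
{
    unsigned int moved = 0;
    struct irq_desc *desc;

    for_each_irq_routed_to_vcpu ( v, desc )   /* hypothetical iterator */
    {
        if ( !irq_enabled(desc) )
            continue;                         /* don't bother moving these */

        if ( moved++ < MIGRATE_BATCH )
            irq_move_now(desc, new_cpu);      /* synchronous migration */
        else
            irq_defer_move(desc, new_cpu);    /* e.g. move when it next fires */
    }
}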


> Anyway, I would like to see measurement in both situation before deciding when
> LPIs will be migrated.

Yes, let's be scientific about this.
Stefano Stabellini Dec. 9, 2016, 8:18 p.m. UTC | #33
On Fri, 9 Dec 2016, Andre Przywara wrote:
> On 07/12/16 20:20, Stefano Stabellini wrote:
> > On Tue, 6 Dec 2016, Julien Grall wrote:
> >> On 06/12/2016 22:01, Stefano Stabellini wrote:
> >>> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
> >>>> moving a vCPU with interrupts assigned to it is slower than moving a
> >>>> vCPU without interrupts assigned to it. You could say that the
> >>>> slowness is directly proportional do the number of interrupts assigned
> >>>> to the vCPU.
> >>>
> >>> To be pedantic, by "assigned" I mean that a physical interrupt is routed
> >>> to a given pCPU and is set to be forwarded to a guest vCPU running on it
> >>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
> >>> these physical interrupts, a corresponding virtual interrupt (could be a
> >>> different irq) will be injected into the guest vCPU.
> >>>
> >>> When the vCPU is migrated to a new pCPU, the physical interrupts that
> >>> are configured to be injected as virtual interrupts into the vCPU, are
> >>> migrated with it. The physical interrupt migration has a cost. However,
> >>> receiving physical interrupts on the wrong pCPU has an higher cost.
> >>
> >> I don't understand why it is a problem for you to receive the first interrupt
> >> to the wrong pCPU and moving it if necessary.
> >>
> >> While this may have an higher cost (I don't believe so) on the first received
> >> interrupt, migrating thousands of interrupts at the same time is very
> >> expensive and will likely get Xen stuck for a while (think about ITS with a
> >> single command queue).
> >>
> >> Furthermore, the current approach will move every single interrupt routed a
> >> the vCPU, even those disabled. That's pointless and a waste of resource. You
> >> may argue that we can skip the ones disabled, but in that case what would be
> >> the benefits to migrate the IRQs while migrate the vCPUs?
> >>
> >> So I would suggest to spread it over the time. This also means less headache
> >> for the scheduler developers.
> > 
> > The most important aspect of interrupts handling in Xen is latency,
> > measured as the time between Xen receiving a physical interrupt and the
> > guest receiving it. This latency should be both small and deterministic.
> > 
> > We all agree so far, right?
> > 
> > 
> > The issue with spreading interrupts migrations over time is that it makes
> > interrupt latency less deterministic. It is OK, in the uncommon case of
> > vCPU migration with interrupts, to take a hit for a short time. This
> > "hit" can be measured. It can be known. If your workload cannot tolerate
> > it, vCPUs can be pinned. It should be a rare event anyway. On the other
> > hand, by spreading interrupts migrations, we make it harder to predict
> > latency. Aside from determinism, another problem with this approach is
> > that it ensures that every interrupt assigned to a vCPU will first hit
> > the wrong pCPU, then it will be moved. It guarantees the worst-case
> > scenario for interrupt latency for the vCPU that has been moved. If we
> > migrated all interrupts as soon as possible, we would minimize the
> > amount of interrupts delivered to the wrong pCPU. Most interrupts would
> > be delivered to the new pCPU right away, reducing interrupt latency.
> 
> So if this is such a crucial issue, why don't we use the ITS for good
> this time? The ITS hardware probably supports 16 bits worth of
> collection IDs, so what about we assign each VCPU (in every guest) a
> unique collection ID on the host and do a MAPC & MOVALL on a VCPU
> migration to let it point to the right physical redistributor.
> I see that this does not cover all use cases (> 65536 VCPUs, for
> instance), also depends much of many implementation details:

This is certainly an idea worth exploring. We don't need to assign a
collection ID to every vCPU, just the ones that have LPIs assigned to
them, which should be considerably fewer.


> - How costly is a MOVALL? It needs to scan the pending table and
> transfer set bits to the other redistributor, which may take a while.

This is a hardware operation; even if it is not fast, I'd prefer to
rely on it rather than implementing something complex in software.
Usually hardware gets better over time at this sort of thing.


> - Is there an impact if we exceed the number of hardware backed
> collections (GITS_TYPE[HCC])? If the ITS is forced to access system
> memory for every table lookup, this may slow down everyday operations.

We'll have to fall back to manually moving them one by one.


> - How likely are those misdirected interrupts in the first place? How
> often do we migrate VCPU compared to the the interrupt frequency?

This is where the scheduler work comes in.


> There are more, subtle parameters to consider, so I guess we just need
> to try and measure.

That's right. This is why I have been saying that we need numbers. This
is difficult hardware, difficult code and difficult scenarios. Intuition
only gets us so far. We need to be scientific and measure the approach
we decide to take, and maybe even one that we decided not to take, to
figure out whether it is actually acceptable.
Stefano Stabellini Dec. 10, 2016, 12:30 a.m. UTC | #34
On Fri, 9 Dec 2016, Andre Przywara wrote:
> >> I've been spending some time thinking about this, and I think we can in
> >> fact get away without ever propagating command from domains to the host.
> >>
> >> I made a list of all commands that possible require host ITS command
> >> propagation. There are two groups:
> >> 1: enabling/disabling LPIs: INV, INVALL
> >> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI.
> >>
> >> The second group can be handled by mapping all required devices up
> >> front, I will elaborate on that in a different email.
> >>
> >> For the first group, read below ...
> >>
> >> On 01/12/16 01:19, Stefano Stabellini wrote:
> >>> On Fri, 25 Nov 2016, Julien Grall wrote:
> >>>> Hi,
> >>>>
> >>>> On 18/11/16 18:39, Stefano Stabellini wrote:
> >>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
> >>>>>> On Fri, 11 Nov 2016, Julien Grall wrote:
> >>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote:
> >>>>>>> That's why in the approach we had on the previous series was "host ITS
> >>>>>>> command
> >>>>>>> should be limited when emulating guest ITS command". From my recall, in
> >>>>>>> that
> >>>>>>> series the host and guest LPIs was fully separated (enabling a guest
> >>>>>>> LPIs was
> >>>>>>> not enabling host LPIs).
> >>>>>>
> >>>>>> I am interested in reading what Ian suggested to do when the physical
> >>>>>> ITS queue is full, but I cannot find anything specific about it in the
> >>>>>> doc.
> >>>>>>
> >>>>>> Do you have a suggestion for this?
> >>>>>>
> >>>>>> The only things that come to mind right now are:
> >>>>>>
> >>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock
> >>>>>> style)
> >>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
> >>>>>
> >>>>> Another, probably better idea, is to map all pLPIs of a device when the
> >>>>> device is assigned to a guest (including Dom0). This is what was written
> >>>>> in Ian's design doc. The advantage of this approach is that Xen doesn't
> >>>>> need to take any actions on the physical ITS command queue when the
> >>>>> guest issues virtual ITS commands, therefore completely solving this
> >>>>> problem at the root. (Although I am not sure about enable/disable
> >>>>> commands: could we avoid issuing enable/disable on pLPIs?)
> >>>>
> >>>> In the previous design document (see [1]), the pLPIs are enabled when the
> >>>> device is assigned to the guest. This means that it is not necessary to send
> >>>> command there. This is also means we may receive a pLPI before the associated
> >>>> vLPI has been configured.
> >>>>
> >>>> That said, given that LPIs are edge-triggered, there is no deactivate state
> >>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
> >>>> LPIs could potentially be raised again. This could generate a storm.
> >>>
> >>> Thank you for raising this important point. You are correct.
> >>>
> >>>> The priority drop is necessary if we don't want to block the reception of
> >>>> interrupt for the current physical CPU.
> >>>>
> >>>> What I am more concerned about is this problem can also happen in normal
> >>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
> >>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
> >>>> it is time to introduce rate-limit interrupt for ARM. Any opinions?
> >>>
> >>> Yes. It could be as simple as disabling the pLPI when Xen receives a
> >>> second pLPI before the guest EOIs the first corresponding vLPI, which
> >>> shouldn't happen in normal circumstances.
> >>>
> >>> We need a simple per-LPI inflight counter, incremented when a pLPI is
> >>> received, decremented when the corresponding vLPI is EOIed (the LR is
> >>> cleared).
> >>>
> >>> When the counter > 1, we disable the pLPI and request a maintenance
> >>> interrupt for the corresponding vLPI.
> >>
> >> So why do we need a _counter_? This is about edge triggered interrupts,
> >> I think we can just accumulate all of them into one.
> > 
> > The counter is not to re-inject the same amount of interrupts into the
> > guest, but to detect interrupt storms.
> 
> I was wondering if an interrupt "storm" could already be defined by
> "receiving an LPI while there is already one pending (in the guest's
> virtual pending table) and it being disabled by the guest". I admit that
> declaring two interrupts as a storm is a bit of a stretch, but in fact
> the guest had probably a reason for disabling it even though it
> fires, so Xen should just follow suit.
> The only difference is that we don't do it _immediately_ when the guest
> tells us (via INV), but only if needed (LPI actually fires).

Either way should work OK, I think.


> >>  - If the VLPI is enabled, we EOI it on the host and inject it.
> >>  - If the VLPI is disabled, we set the pending bit in the VCPU's
> >>    pending table and EOI on the host - to allow other IRQs.
> >> - On a guest INV command, we check whether that vLPI is now enabled:
> >>  - If it is disabled now, we don't need to do anything.
> >>  - If it is enabled now, we check the pending bit for that VLPI:
> >>   - If it is 0, we don't do anything.
> >>   - If it is 1, we inject the VLPI and clear the pending bit.
> >> - On a guest INVALL command, we just need to iterate over the virtual
> >> LPIs.
> > 
> > Right, much better.
> > 
> > 
> >> If you look at the conditions above, the only immediate action is
> >> when a VLPI gets enabled _and_ its pending bit is set. So we can do
> >> 64-bit read accesses over the whole pending table to find non-zero words
> >> and thus set bits, which should be rare in practice. We can store the
> >> highest mapped VLPI to avoid iterating over the whole of the table.
> >> Ideally the guest has no direct control over the pending bits, since
> >> this is what the device generates. Also we limit the number of VLPIs in
> >> total per guest anyway.
> > 
> > I wonder if we could even use a fully packed bitmask with only the
> > pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would
> > be a nice improvement.
> 
> The _pending_ table is exactly that: one bit per VLPI.

Actually the spec says about the pending table, ch 6.1.2:

"Each Redistributor maintains entries in a separate LPI Pending table
that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs
== 1 in the Redistributor:
  0 The LPI is not pending.
  1 The LPI is pending.

For a given LPI:
• The corresponding byte in the LPI Pending table is (base address + (N / 8)).
• The bit position in the byte is (N MOD 8)."

It seems to me that each LPI is supposed to have a byte, not a bit. Am I
looking at the wrong table?

In any case you suggested trapping the pending table, so we can actually
write anything we want to those guest-provided pages.


> So by doing a
> 64-bit read we cover 64 VLPIs. And normally if an LPI fires it will
> probably be enabled (otherwise the guest would have disabled it in the
> device), so we inject it and don't need this table. It's
> really just for storing the pending status should an LPI arrive while
> the guest had _disabled_ it. I assume this is rather rare, so the table
> will mostly be empty: that's why I expect most reads to be 0 and the
> iteration of the table to be very quick. As an additional optimization
> we could store the highest and lowest virtually pending LPI, to avoid
> scanning the whole table.
>
> We can't do so much about the property table, though, because its layout
> is described in the spec - in contrast to the ITS tables, which are IMPDEF.
> But as we only need to do something if the LPI is _both_ enabled _and_
> pending, scanning the pending table gives us a quite good filter
> already. For the few LPIs that hit here, we can just access the right
> byte in the property table.

OK


> >> If that still sounds like a DOS vector, we could additionally rate-limit
> >> INVALLs, and/or track additions to the pending table after the last
> >> INVALL: if there haven't been any new pending bits since the last scan,
> >> INVALL is a NOP.
> >>
> >> Does that makes sense so far?
> > 
> > It makes sense. It should be OK.
> > 
> > 
> >> So that just leaves us with this IRQ storm issue, which I am thinking
> >> about now. But I guess this is not a show stopper given we can disable
> >> the physical LPI if we sense this situation.
> > 
> > That is true and it's exactly what we should do.
> > 
> > 
> >>> When we receive the maintenance interrupt and we clear the LR of the
> >>> vLPI, Xen should re-enable the pLPI.
> 
> So I was thinking why you would need to wait for the guest to actually
> EOI it?
> Can't we end the interrupt storm condition at the moment the guest
> enables the interrupt? LPIs are edge triggered and so a storm in the
> past is easily merged into a single guest LPI once the guest enables it
> again. From there on we inject every triggered LPI into the guest.

What you describe works if the guest disables the interrupt. But what if
it doesn't? Xen should also be able to cope with non-cooperative guests
which might even have triggered the interrupt storm on purpose.

I think that you are asking this question because we are actually
talking about two different issues. See below.


> This special handling for the interrupt storm just stems from the fact
> that have to keep LPIs enabled on the h/w interrupt controller level,
> despite the guest having disabled it on it's own _virtual_ GIC. So once
> the guest enables it again, we are in line with the current GICv2/GICv3,
> aren't we? Do we have interrupt storm detection/prevention in the moment?

No, that's not the cause of the storm. Julien described it well before:

  given that LPIs are edge-triggered, there is no deactivate state (see 4.1
  in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could
  potentially be raised again. This could generate a storm.

The problem is that Xen has to do priority drop upon receiving a pLPI,
but, given that LPIs don't have a deactivate state, the priority drop is
enough to let the hardware inject a second LPI, even if the guest didn't
EOI the first one yet.

In the case of SPIs, the hardware cannot inject a second interrupt after
Xen does priority drop, it has to wait for the guest to EOI it.



> And please keep in mind that LPIs are _always_ edge triggered: So once
> we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC
> point of view, the next incoming interrupt must have been triggered by a
> new interrupt condition in the device (new network packet, for
> instance).

This is actually the problem: what if the guest configured the device on
purpose to keep generating LPIs without pause? Nice and simple way to
take down the host.


> In contrast to GICv2 this applies to _every_ LPI.
> So I am not sure we should really care _too_ much about this (apart from
> the "guest has disabled it" part): Once we assign a device to a guest,
> we lose some control over the machine anyway and at least trust the
> device to not completely block the system.

No we don't! Hardware engineers make mistakes too! We have to protect
Xen from devices which purposely or mistakenly generate interrupt
storms. This is actually a pretty common problem.


> I don't see how the ITS differs in that respect from the GICv3/GICv2.

It differs because in the case of GICv2, every single interrupt has to
be EOI'd by the guest. Therefore the Xen scheduler can still decide to
schedule it out. In the case of the ITS, Xen could be stuck in an
interrupt handling loop.


> A quick update on my side: I implemented the scheme I described in my
> earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS:

Nice!


> - On receiving the PHYSDEVOP_manage_pci_add hypercall in
> xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of
> interrupts and enable them. We keep them unassigned in our host
> pLPI->VLPI table, so we discard them should they fire.
> This hypercall is issued by Dom0 Linux before bringing up any PCI
> devices, so it works even for Dom0 without any Linux changes. For DomUs
> with PCI passthrough Dom0 is expected to issue this hypercall on behalf
> of the to-be-created domain.
> - When a guest (be it Dom0 or Domu) actually maps an LPI (MAPTI), we
> just enter the virtual LPI number and the target VCPU in our pLPI-vLPI
> table and be done. Should it fire now, we know where to inject it, but
> refer to the enabled bit in the guest's property table before doing so.
> - When a guest (be it Dom0 or DomU) enables or disabled an interrupt, we
> don't do much, as we refer to the enable bit every time we want to
> inject already. The only thing I actually do is to inject an LPI if
> there is a virtual LPI pending and the LPI is now enabled.

Sounds good, well done!
Andre Przywara Dec. 12, 2016, 10:38 a.m. UTC | #35
Hi Stefano,

thanks for the prompt and helpful answer!

On 10/12/16 00:30, Stefano Stabellini wrote:
> On Fri, 9 Dec 2016, Andre Przywara wrote:
>>>> I've been spending some time thinking about this, and I think we can in
>>>> fact get away without ever propagating command from domains to the host.
>>>>
>>>> I made a list of all commands that possible require host ITS command
>>>> propagation. There are two groups:
>>>> 1: enabling/disabling LPIs: INV, INVALL
>>>> 2: mapping/unmapping devices/events: DISCARD, MAPD, MAPTI.
>>>>
>>>> The second group can be handled by mapping all required devices up
>>>> front, I will elaborate on that in a different email.
>>>>
>>>> For the first group, read below ...
>>>>
>>>> On 01/12/16 01:19, Stefano Stabellini wrote:
>>>>> On Fri, 25 Nov 2016, Julien Grall wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On 18/11/16 18:39, Stefano Stabellini wrote:
>>>>>>> On Fri, 11 Nov 2016, Stefano Stabellini wrote:
>>>>>>>> On Fri, 11 Nov 2016, Julien Grall wrote:
>>>>>>>>> On 10/11/16 20:42, Stefano Stabellini wrote:
>>>>>>>>> That's why in the approach we had on the previous series was "host ITS
>>>>>>>>> command
>>>>>>>>> should be limited when emulating guest ITS command". From my recall, in
>>>>>>>>> that
>>>>>>>>> series the host and guest LPIs was fully separated (enabling a guest
>>>>>>>>> LPIs was
>>>>>>>>> not enabling host LPIs).
>>>>>>>>
>>>>>>>> I am interested in reading what Ian suggested to do when the physical
>>>>>>>> ITS queue is full, but I cannot find anything specific about it in the
>>>>>>>> doc.
>>>>>>>>
>>>>>>>> Do you have a suggestion for this?
>>>>>>>>
>>>>>>>> The only things that come to mind right now are:
>>>>>>>>
>>>>>>>> 1) check if the ITS queue is full and busy loop until it is not (spin_lock
>>>>>>>> style)
>>>>>>>> 2) check if the ITS queue is full and sleep until it is not (mutex style)
>>>>>>>
>>>>>>> Another, probably better idea, is to map all pLPIs of a device when the
>>>>>>> device is assigned to a guest (including Dom0). This is what was written
>>>>>>> in Ian's design doc. The advantage of this approach is that Xen doesn't
>>>>>>> need to take any actions on the physical ITS command queue when the
>>>>>>> guest issues virtual ITS commands, therefore completely solving this
>>>>>>> problem at the root. (Although I am not sure about enable/disable
>>>>>>> commands: could we avoid issuing enable/disable on pLPIs?)
>>>>>>
>>>>>> In the previous design document (see [1]), the pLPIs are enabled when the
>>>>>> device is assigned to the guest. This means that it is not necessary to send
>>>>>> command there. This is also means we may receive a pLPI before the associated
>>>>>> vLPI has been configured.
>>>>>>
>>>>>> That said, given that LPIs are edge-triggered, there is no deactivate state
>>>>>> (see 4.1 in ARM IHI 0069C). So as soon as the priority drop is done, the same
>>>>>> LPIs could potentially be raised again. This could generate a storm.
>>>>>
>>>>> Thank you for raising this important point. You are correct.
>>>>>
>>>>>> The priority drop is necessary if we don't want to block the reception of
>>>>>> interrupt for the current physical CPU.
>>>>>>
>>>>>> What I am more concerned about is this problem can also happen in normal
>>>>>> running (i.e the pLPI is bound to an vLPI and the vLPI is enabled). For
>>>>>> edge-triggered interrupt, there is no way to prevent them to fire again. Maybe
>>>>>> it is time to introduce rate-limit interrupt for ARM. Any opinions?
>>>>>
>>>>> Yes. It could be as simple as disabling the pLPI when Xen receives a
>>>>> second pLPI before the guest EOIs the first corresponding vLPI, which
>>>>> shouldn't happen in normal circumstances.
>>>>>
>>>>> We need a simple per-LPI inflight counter, incremented when a pLPI is
>>>>> received, decremented when the corresponding vLPI is EOIed (the LR is
>>>>> cleared).
>>>>>
>>>>> When the counter > 1, we disable the pLPI and request a maintenance
>>>>> interrupt for the corresponding vLPI.
>>>>
>>>> So why do we need a _counter_? This is about edge triggered interrupts,
>>>> I think we can just accumulate all of them into one.
>>>
>>> The counter is not to re-inject the same amount of interrupts into the
>>> guest, but to detect interrupt storms.
>>
>> I was wondering if an interrupt "storm" could already be defined by
>> "receiving an LPI while there is already one pending (in the guest's
>> virtual pending table) and it being disabled by the guest". I admit that
>> declaring two interrupts as a storm is a bit of a stretch, but in fact
>> the guest had probably a reason for disabling it even though it
>> fires, so Xen should just follow suit.
>> The only difference is that we don't do it _immediately_ when the guest
>> tells us (via INV), but only if needed (LPI actually fires).
> 
> Either way should work OK, I think.
> 
> 
>>>>  - If the VLPI is enabled, we EOI it on the host and inject it.
>>>>  - If the VLPI is disabled, we set the pending bit in the VCPU's
>>>>    pending table and EOI on the host - to allow other IRQs.
>>>> - On a guest INV command, we check whether that vLPI is now enabled:
>>>>  - If it is disabled now, we don't need to do anything.
>>>>  - If it is enabled now, we check the pending bit for that VLPI:
>>>>   - If it is 0, we don't do anything.
>>>>   - If it is 1, we inject the VLPI and clear the pending bit.
>>>> - On a guest INVALL command, we just need to iterate over the virtual
>>>> LPIs.
>>>
>>> Right, much better.
>>>
>>>
>>>> If you look at the conditions above, the only immediate action is
>>>> when a VLPI gets enabled _and_ its pending bit is set. So we can do
>>>> 64-bit read accesses over the whole pending table to find non-zero words
>>>> and thus set bits, which should be rare in practice. We can store the
>>>> highest mapped VLPI to avoid iterating over the whole of the table.
>>>> Ideally the guest has no direct control over the pending bits, since
>>>> this is what the device generates. Also we limit the number of VLPIs in
>>>> total per guest anyway.
>>>
>>> I wonder if we could even use a fully packed bitmask with only the
>>> pending bits, so 1 bit per vLPI, rather than 1 byte per vLPI. That would
>>> be a nice improvement.
>>
>> The _pending_ table is exactly that: one bit per VLPI.
> 
> Actually the spec says about the pending table, ch 6.1.2:
> 
> "Each Redistributor maintains entries in a separate LPI Pending table
> that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs
> == 1 in the Redistributor:
>   0 The LPI is not pending.
>   1 The LPI is pending.
> 
> For a given LPI:
> • The corresponding byte in the LPI Pending table is (base address + (N / 8)).
> • The bit position in the byte is (N MOD 8)."

        ^^^

> It seems to me that each LPI is supposed to have a byte, not a bit. Am I
> looking at the wrong table?

Well, the explanation could indeed be a bit more explicit, but it's
really meant to be a bit:
1) The two lines above describe how to address a single bit in a
byte-addressed array.
2) The following paragraphs talk about "the first 1KB" when it comes to
non-LPI interrupts. This matches 8192 bits.
3) In section 6.1 the spec states: "Memory-backed storage for LPI
pending _bits_ in an LPI Pending table."
4) The actual instance, Marc's Linux driver, also speaks of a bit:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/irqchip/irq-gic-v3-its.c#n785
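
In code, the addressing rule quoted above boils down to one bit per LPI:

/* Pending state of LPI number n: bit (n % 8) of byte (n / 8). */
static bool lpi_pending(const uint8_t *pending_table, uint32_t n)
{
    return pending_table[n / 8] & (1u << (n % 8));
}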

> In any case you suggested to trap the pending table, so we can actually
> write anything we want to those guest provided pages.

Indeed:
"During normal operation, the LPI Pending table is maintained solely by
the Redistributor."

>> So by doing a
>> 64-bit read we cover 64 VLPIs. And normally if an LPI fires it will
>> probably be enabled (otherwise the guest would have disabled it in the
>> device), so we inject it and don't need this table. It's
>> really just for storing the pending status should an LPI arrive while
>> the guest had _disabled_ it. I assume this is rather rare, so the table
>> will mostly be empty: that's why I expect most reads to be 0 and the
>> iteration of the table to be very quick. As an additional optimization
>> we could store the highest and lowest virtually pending LPI, to avoid
>> scanning the whole table.
>>
>> We can't do so much about the property table, though, because its layout
>> is described in the spec - in contrast to the ITS tables, which are IMPDEF.
>> But as we only need to do something if the LPI is _both_ enabled _and_
>> pending, scanning the pending table gives us a quite good filter
>> already. For the few LPIs that hit here, we can just access the right
>> byte in the property table.
> 
> OK
> 
> 
>>>> If that still sounds like a DOS vector, we could additionally rate-limit
>>>> INVALLs, and/or track additions to the pending table after the last
>>>> INVALL: if there haven't been any new pending bits since the last scan,
>>>> INVALL is a NOP.
>>>>
>>>> Does that makes sense so far?
>>>
>>> It makes sense. It should be OK.
>>>
>>>
>>>> So that just leaves us with this IRQ storm issue, which I am thinking
>>>> about now. But I guess this is not a show stopper given we can disable
>>>> the physical LPI if we sense this situation.
>>>
>>> That is true and it's exactly what we should do.
>>>
>>>
>>>>> When we receive the maintenance interrupt and we clear the LR of the
>>>>> vLPI, Xen should re-enable the pLPI.
>>
>> So I was thinking why you would need to wait for the guest to actually
>> EOI it?
>> Can't we end the interrupt storm condition at the moment the guest
>> enables the interrupt? LPIs are edge triggered and so a storm in the
>> past is easily merged into a single guest LPI once the guest enables it
>> again. From there on we inject every triggered LPI into the guest.
> 
> What you describe works if the guest disables the interrupt. But what if
> it doesn't? Xen should also be able to cope with non-cooperative guests
> which might even have triggered the interrupt storm on purpose.
> 
> I think that you are asking this question because we are actually
> talking about two different issues. See below.

Indeed I was wondering about that ...

> 
>> This special handling for the interrupt storm just stems from the fact
>> that have to keep LPIs enabled on the h/w interrupt controller level,
>> despite the guest having disabled it on it's own _virtual_ GIC. So once
>> the guest enables it again, we are in line with the current GICv2/GICv3,
>> aren't we? Do we have interrupt storm detection/prevention in the moment?
> 
> No, that's not the cause of the storm. Julien described well before:
> 
>   given that LPIs are edge-triggered, there is no deactivate state (see 4.1
>   in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could
>   potentially be raised again. This could generate a storm.
> 
> The problem is that Xen has to do priority drop upon receiving an pLPI,
> but, given that LPIs don't have a deactivate state, the priority drop is
> enough to let the hardware inject a second LPI, even if the guest didn't
> EOI the first one yet.
> 
> In the case of SPIs, the hardware cannot inject a second interrupt after
> Xen does priority drop, it has to wait for the guest to EOI it.

I understand that ...

>> And please keep in mind that LPIs are _always_ edge triggered: So once
>> we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC
>> point of view, the next incoming interrupt must have been triggered by a
>> new interrupt condition in the device (new network packet, for
>> instance).
> 
> This is actually the problem: what if the guest configured the device on
> purpose to keep generating LPIs without pause? Nice and simple way to
> take down the host.

I see, I just wasn't sure we were talking about the same thing: actual
interrupt storm triggered by the device vs. "virtual" interrupt storm
due to an interrupt line not being lowered by the IRQ handler.
And I was hoping for the latter, but well ...
So thanks for the clarification.

>> In contrast to GICv2 this applies to _every_ LPI.
>> So I am not sure we should really care _too_ much about this (apart from
>> the "guest has disabled it" part): Once we assign a device to a guest,
>> we lose some control over the machine anyway and at least trust the
>> device to not completely block the system.
> 
> No we don't! Hardware engineers make mistakes too!

You tell me ... ;-)

> We have to protect
> Xen from devices which purposely or mistakenly generate interrupt
> storms. This is actually a pretty common problem.

I see, though this doesn't make this whole problem easier ;-)

>> I don't see how the ITS differs in that respect from the GICv3/GICv2.
> 
> It differs because in the case of GICv2, every single interrupt has to
> be EOI'd by the guest.

Sure, though I think technically it's "deactivated" here that matters
(we EOI LPIs as well). And since LPIs have no active state, this makes
the difference.

> Therefore the Xen scheduler can still decide to
> schedule it out. In the case of the ITS, Xen could be stuck in an
> interrupt handling loop.

So I was wondering whether we might need to relax our new strict "No
(unprivileged) guest ever causes a host ITS command to be queued" rule a
bit, because:
- Disabling an LPI is a separate issue, as we can trigger this in Xen
interrupt context once we decide that this is an interrupt storm.
- But enabling it again has to both happen in a timely manner (as the
guest expects interrupts to come in) and to be triggered by a guest
action, which causes the INV command to be sent when handling a guest fault.

Now the INV (and possibly a follow-up SYNC) for enabling an LPI would be
the only critical commands, so I was wondering if we could ensure
that these commands can always be queued immediately, by making sure we
have at least two ITS command queue slots available all of the time.
Other ITS commands (triggered by device pass-through, for instance)
would then potentially have to wait if we foresee that they could fill
up the host command queue.
Something like QoS for ITS commands.
And I think we should map the maximum command queue size on the host
(1MB => 32768 commands) to make this scenario less likely.

I will need to think about this a bit further, maybe implement something
as a proof of concept.
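
As a first stab, the reservation could be as simple as this (the queue
accessors are invented names, just to illustrate the idea):

#define ITS_RESERVED_SLOTS 2      /* always keep room for an INV + SYNC */

/* Device mapping and other non-critical commands must leave slots free. */
static int its_queue_noncritical(struct host_its *its, const uint64_t cmd[4])
{
    if ( its_queue_free_slots(its) <= ITS_RESERVED_SLOTS )
        return -EBUSY;            /* caller retries or defers the operation */

    return its_queue_cmd(its, cmd);
}

/* The INV/SYNC issued from the guest fault path may use the reserved slots. */
static int its_queue_critical(struct host_its *its, const uint64_t cmd[4])
{
    return its_queue_cmd(its, cmd);
}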

Cheers,
Andre.

> 
>> A quick update on my side: I implemented the scheme I described in my
>> earlier mail now and it boots to the Dom0 prompt on a fastmodel with an ITS:
> 
> Nice!
> 
> 
>> - On receiving the PHYSDEVOP_manage_pci_add hypercall in
>> xen/arch/arm/physdev.c, we MAPD the device on the host, MAPTI a bunch of
>> interrupts and enable them. We keep them unassigned in our host
>> pLPI->VLPI table, so we discard them should they fire.
>> This hypercall is issued by Dom0 Linux before bringing up any PCI
>> devices, so it works even for Dom0 without any Linux changes. For DomUs
>> with PCI passthrough Dom0 is expected to issue this hypercall on behalf
>> of the to-be-created domain.
>> - When a guest (be it Dom0 or Domu) actually maps an LPI (MAPTI), we
>> just enter the virtual LPI number and the target VCPU in our pLPI-vLPI
>> table and be done. Should it fire now, we know where to inject it, but
>> refer to the enabled bit in the guest's property table before doing so.
>> - When a guest (be it Dom0 or DomU) enables or disables an interrupt, we
>> don't do much, as we refer to the enable bit every time we want to
>> inject already. The only thing I actually do is to inject an LPI if
>> there is a virtual LPI pending and the LPI is now enabled.
> 
> Sounds good, well done!
>
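
For illustration, the injection path summarised in that last quoted mail
boils down to roughly the following (a stand-alone sketch with stand-in
types; only the field names and the 8192 offset are borrowed from the
patch):

#include <stdbool.h>
#include <stdint.h>

/* Simplified sketch of the described pLPI -> vLPI injection path: the host
 * LPI entry records which virtual LPI it is forwarded to, and the guest's
 * property table is consulted for the enable bit at injection time.
 */
#define LPI_PROP_ENABLED  0x1

struct host_lpi_entry {
    uint32_t virt_lpi;      /* 0 means "no MAPTI from the guest yet" */
    uint16_t dom_id;        /* the real lookup also matches these two; */
    uint16_t vcpu_id;       /* omitted here for brevity */
};

/* Returns true if a virtual LPI should be injected for this host LPI. */
static bool should_inject(const struct host_lpi_entry *hlpi,
                          const uint8_t *guest_proptable)
{
    if ( !hlpi->virt_lpi )
        return false;   /* unmapped: discard the interrupt */

    /* The enable bit is read from the guest's property table every time,
     * so a guest ENABLE/DISABLE needs no host ITS command on this path.
     */
    return guest_proptable[hlpi->virt_lpi - 8192] & LPI_PROP_ENABLED;
}

int main(void)
{
    uint8_t proptable[16] = { [0] = LPI_PROP_ENABLED };
    struct host_lpi_entry hlpi = { .virt_lpi = 8192, .dom_id = 1, .vcpu_id = 0 };

    return should_inject(&hlpi, proptable) ? 0 : 1;
}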
Stefano Stabellini Dec. 14, 2016, 12:38 a.m. UTC | #36
On Mon, 12 Dec 2016, Andre Przywara wrote:
> >> The _pending_ table is exactly that: one bit per VLPI.
> > 
> > Actually the spec says about the pending table, ch 6.1.2:
> > 
> > "Each Redistributor maintains entries in a separate LPI Pending table
> > that indicates the pending state of each LPI when GICR_CTLR.EnableLPIs
> > == 1 in the Redistributor:
> >   0 The LPI is not pending.
> >   1 The LPI is pending.
> > 
> > For a given LPI:
> > • The corresponding byte in the LPI Pending table is (base address + (N / 8)).
> > • The bit position in the byte is (N MOD 8)."
> 
>         ^^^
> 
> > It seems to me that each LPI is supposed to have a byte, not a bit. Am I
> > looking at the wrong table?
> 
> Well, the explanation could indeed be a bit more explicit, but it's
> really meant to be a bit:
> 1) The two lines above describe how to address a single bit in a
> byte-addressed array.
> 2) The following paragraphs talk about "the first 1KB" when it comes to
> non-LPI interrupts. This matches 8192 bits.
> 3) In section 6.1 the spec states: "Memory-backed storage for LPI
> pending _bits_ in an LPI Pending table."
> 4) The actual instance, Marc's Linux driver, also speaks of a bit:
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/irqchip/irq-gic-v3-its.c#n785

I was misled by the doc. Better this way.
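
For reference, the byte/bit addressing quoted above works out as in this
small stand-alone example (illustrative only, not code from the series):

#include <stdint.h>
#include <stdio.h>

/* Compute where the pending bit for INTID N lives, following the
 * "(base + (N / 8))" / "(N MOD 8)" wording quoted above. LPIs start at
 * INTID 8192, so the first 1KB of the table (8192 bits) covers the
 * non-LPI INTID range.
 */
static void locate_pending_bit(uint32_t intid)
{
    uint32_t byte_offset = intid / 8;   /* offset from the table base */
    uint32_t bit_in_byte = intid % 8;

    printf("INTID %u -> byte offset %u, bit %u\n",
           intid, byte_offset, bit_in_byte);
}

int main(void)
{
    locate_pending_bit(8192);   /* first LPI: byte 1024, bit 0 */
    locate_pending_bit(8195);   /* byte 1024, bit 3 */
    return 0;
}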


> >> This special handling for the interrupt storm just stems from the fact
> >> that we have to keep LPIs enabled on the h/w interrupt controller level,
> >> despite the guest having disabled it on its own _virtual_ GIC. So once
> >> the guest enables it again, we are in line with the current GICv2/GICv3,
> >> aren't we? Do we have interrupt storm detection/prevention at the moment?
> > 
> > No, that's not the cause of the storm. Julien described well before:
> > 
> >   given that LPIs are edge-triggered, there is no deactivate state (see 4.1
> >   in ARM IHI 0069C). So as soon as the priority drop is done, the same LPIs could
> >   potentially be raised again. This could generate a storm.
> > 
> > The problem is that Xen has to do priority drop upon receiving a pLPI,
> > but, given that LPIs don't have a deactivate state, the priority drop is
> > enough to let the hardware inject a second LPI, even if the guest didn't
> > EOI the first one yet.
> > 
> > In the case of SPIs, the hardware cannot inject a second interrupt after
> > Xen does priority drop, it has to wait for the guest to EOI it.
> 
> I understand that ...
> 
> >> And please keep in mind that LPIs are _always_ edge triggered: So once
> >> we EOI an LPI on the host, this "very same" LPI is gone from a h/w GIC
> >> point of view, the next incoming interrupt must have been triggered by a
> >> new interrupt condition in the device (new network packet, for
> >> instance).
> > 
> > This is actually the problem: what if the guest configured the device on
> > purpose to keep generating LPIs without pause? Nice and simple way to
> > take down the host.
> 
> I see, I just wasn't sure we were talking about the same thing: actual
> interrupt storm triggered by the device vs. "virtual" interrupt storm
> due to an interrupt line not being lowered by the IRQ handler.
> And I was hoping for the latter, but well ...
> So thanks for the clarification.
> 
> >> In contrast to GICv2 this applies to _every_ LPI.
> >> So I am not sure we should really care _too_ much about this (apart from
> >> the "guest has disabled it" part): Once we assign a device to a guest,
> >> we lose some control over the machine anyway and at least trust the
> >> device to not completely block the system.
> > 
> > No we don't! Hardware engineers make mistakes too!
> 
> You tell me ... ;-)
> 
> > We have to protect
> > Xen from devices which purposely or mistakenly generate interrupt
> > storms. This is actually a pretty common problem.
> 
> I see, though this doesn't make this whole problem easier ;-)
> 
> >> I don't see how the ITS differs in that respect from the GICv3/GICv2.
> > 
> > It differs because in the case of GICv2, every single interrupt has to
> > be EOI'd by the guest.
> 
> Sure, though I think technically it's "deactivated" here that matters
> (we EOI LPIs as well). And since LPIs have no active state, this makes
> the difference.
> 
> > Therefore the Xen scheduler can still decide to
> > schedule it out. In the case of the ITS, Xen could be stuck in an
> > interrupt handling loop.
> 
> So I was wondering whether we might need to relax our new strict "No
> (unprivileged) guest ever causes a host ITS command to be queued" rule a
> bit, because:
> - Disabling an LPI is a separate issue, as we can trigger this in Xen
> interrupt context once we decide that this is an interrupt storm.
> - But enabling it again has to both happen in a timely manner (as the
> guest expects interrupts to come in) and be triggered by a guest
> action, which causes the INV command to be sent when handling a guest fault.
> 
> Now this INV command (and possibly a follow-up SYNC) for enabling an LPI
> would be the only critical ones, so I was wondering if we could ensure
> that these commands can always be queued immediately, by making sure we
> have at least two ITS command queue slots available all of the time.
> Other ITS commands (triggered by device pass-throughs, for instance),
> would then have to potentially wait if we foresee that they could fill
> up the host command queue.
> Something like QoS for ITS commands.
> And I think we should map the maximum command queue size on the host
> (1MB => 32768 commands) to make this scenario less likely.
> 
> I will need to think about this a bit further, maybe implement something
> as a proof of concept.

On one hand, I'd say that it should be rare to re-enable an interrupt,
after it was disabled due to a storm. Extremely rare. It should be OK
to issue a physical INV and SYNC in that event.

However, if that happens, it could very well be because the guest is
trying to take down the host, and issuing physical ITS commands in
response to a malicious guest action could be a good way to help the
attacker. We need to be extra careful.

Given that a storm is supposed to be an exceptional circumstance, it is
OK to enforce very strict limits on the amount of times we are willing
to issue physical ITS commands as a consequence of a guest action. For
example, we could decide to do it just once, or twice, then label the
guest as "untrustworthy" and destroy it. After all if a storm keeps
happening, it must be due to a malicious guest or faulty hardware - in
both cases it is best to terminate the VM.
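
Such a policy could be as simple as the following sketch (all names and the
limit of two are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative policy: tolerate only a small number of storm-triggered
 * disable/re-enable cycles per domain before giving up on it. "domain"
 * here is a minimal stand-in, not Xen's struct domain.
 */
#define LPI_STORM_REENABLE_LIMIT  2

struct domain_stub {
    unsigned int domain_id;
    unsigned int lpi_storm_reenables;
};

/* Called when the guest re-enables an LPI that Xen disabled after
 * detecting a storm. Returns false once the domain should be destroyed.
 */
static bool lpi_storm_reenable_allowed(struct domain_stub *d)
{
    if ( d->lpi_storm_reenables++ < LPI_STORM_REENABLE_LIMIT )
        return true;    /* issue the physical INV/SYNC and carry on */

    printf("d%u: repeated LPI storms, marking domain untrustworthy\n",
           d->domain_id);
    return false;       /* the caller would destroy the VM at this point */
}

int main(void)
{
    struct domain_stub d = { .domain_id = 1, .lpi_storm_reenables = 0 };

    while ( lpi_storm_reenable_allowed(&d) )
        ;   /* the first two re-enables are tolerated, the third is not */

    return 0;
}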
George Dunlap Dec. 14, 2016, 2:39 a.m. UTC | #37
> On Dec 10, 2016, at 4:18 AM, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Fri, 9 Dec 2016, Andre Przywara wrote:
>> On 07/12/16 20:20, Stefano Stabellini wrote:
>>> On Tue, 6 Dec 2016, Julien Grall wrote:
>>>> On 06/12/2016 22:01, Stefano Stabellini wrote:
>>>>> On Tue, 6 Dec 2016, Stefano Stabellini wrote:
>>>>>> moving a vCPU with interrupts assigned to it is slower than moving a
>>>>>> vCPU without interrupts assigned to it. You could say that the
>>>>>> slowness is directly proportional do the number of interrupts assigned
>>>>>> to the vCPU.
>>>>> 
>>>>> To be pedantic, by "assigned" I mean that a physical interrupt is routed
>>>>> to a given pCPU and is set to be forwarded to a guest vCPU running on it
>>>>> by the _IRQ_GUEST flag. The guest could be dom0. Upon receiving one of
>>>>> these physical interrupts, a corresponding virtual interrupt (could be a
>>>>> different irq) will be injected into the guest vCPU.
>>>>> 
>>>>> When the vCPU is migrated to a new pCPU, the physical interrupts that
>>>>> are configured to be injected as virtual interrupts into the vCPU, are
>>>>> migrated with it. The physical interrupt migration has a cost. However,
>>>>> receiving physical interrupts on the wrong pCPU has an higher cost.
>>>> 
>>>> I don't understand why it is a problem for you to receive the first interrupt
>>>> to the wrong pCPU and moving it if necessary.
>>>> 
>>>> While this may have an higher cost (I don't believe so) on the first received
>>>> interrupt, migrating thousands of interrupts at the same time is very
>>>> expensive and will likely get Xen stuck for a while (think about ITS with a
>>>> single command queue).
>>>> 
>>>> Furthermore, the current approach will move every single interrupt routed a
>>>> the vCPU, even those disabled. That's pointless and a waste of resource. You
>>>> may argue that we can skip the ones disabled, but in that case what would be
>>>> the benefits to migrate the IRQs while migrate the vCPUs?
>>>> 
>>>> So I would suggest to spread it over the time. This also means less headache
>>>> for the scheduler developers.
>>> 
>>> The most important aspect of interrupts handling in Xen is latency,
>>> measured as the time between Xen receiving a physical interrupt and the
>>> guest receiving it. This latency should be both small and deterministic.
>>> 
>>> We all agree so far, right?
>>> 
>>> 
>>> The issue with spreading interrupts migrations over time is that it makes
>>> interrupt latency less deterministic. It is OK, in the uncommon case of
>>> vCPU migration with interrupts, to take a hit for a short time. This
>>> "hit" can be measured. It can be known. If your workload cannot tolerate
>>> it, vCPUs can be pinned. It should be a rare event anyway. On the other
>>> hand, by spreading interrupts migrations, we make it harder to predict
>>> latency. Aside from determinism, another problem with this approach is
>>> that it ensures that every interrupt assigned to a vCPU will first hit
>>> the wrong pCPU, then it will be moved. It guarantees the worst-case
>>> scenario for interrupt latency for the vCPU that has been moved. If we
>>> migrated all interrupts as soon as possible, we would minimize the
>>> amount of interrupts delivered to the wrong pCPU. Most interrupts would
>>> be delivered to the new pCPU right away, reducing interrupt latency.

OK, so ultimately for each interrupt we can take an “eager” approach and move it as soon as the vcpu moves, or a “lazy” approach and move it after it fires.

The two options which have been discussed are:
1. Always take an eager approach, and try to tell the scheduler to limit the migration frequency for these vcpus more than others
2. Always take a lazy approach, and leave the scheduler the way it is.

Another approach which one might take:
3. Eagerly migrate a subset of the interrupts and lazily migrate the others.  For instance, we could eagerly migrate all the interrupts which have fired since the last vcpu migration.  In a system where migrations happen frequently, this should only be a handful; in a system that migrates infrequently, this will be more, but it won’t matter, because it will happen less often.
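
A sketch of option 3, assuming each routed IRQ carried a hypothetical
"fired since last migration" flag (none of these structures exist in Xen as
written):

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical sketch of option 3: on vCPU migration, eagerly retarget
 * only the IRQs that fired since the last migration and leave the rest to
 * be moved lazily when they next fire on the old pCPU.
 */
struct routed_irq {
    unsigned int irq;
    unsigned int target_pcpu;
    bool fired_since_last_migration;
};

static void retarget_irq(struct routed_irq *r, unsigned int new_pcpu)
{
    /* Stand-in for the real (and possibly expensive) retargeting work. */
    r->target_pcpu = new_pcpu;
}

static void vcpu_migrate_irqs(struct routed_irq *irqs, size_t n,
                              unsigned int new_pcpu)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( irqs[i].fired_since_last_migration )
        {
            retarget_irq(&irqs[i], new_pcpu);           /* eager */
            irqs[i].fired_since_last_migration = false;
        }
        /* Others stay put; the IRQ handler would retarget them lazily the
         * first time they hit the old pCPU.
         */
    }
}

int main(void)
{
    struct routed_irq irqs[] = {
        { .irq = 8192, .target_pcpu = 0, .fired_since_last_migration = true },
        { .irq = 8193, .target_pcpu = 0, .fired_since_last_migration = false },
    };

    vcpu_migrate_irqs(irqs, 2, 1);
    return irqs[0].target_pcpu == 1 && irqs[1].target_pcpu == 0 ? 0 : 1;
}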

Workloads which need predictable IRQ latencies should probably be pinning their vcpus anyway.

So at the moment, the scheduler already tries to avoid migrating things *a little bit* if it can (see migrate_resist).  It’s not clear to me at the moment whether this is enough or not.  Or to put it a different way — how long should the scheduler try to wait before moving one of these vcpus?  At the moment I haven’t seen a good way of calculating this.

#3 to me has the feeling of being somewhat more satisfying, but also potentially fairly complicated.  Since the scheduler already does migration resistance somewhat, #1 would be simpler to implement in the short run.  If it turns out that #1 has other drawbacks, we can implement #3 as and when needed.

Thoughts?

 -George
>
Dario Faggioli Dec. 16, 2016, 1:30 a.m. UTC | #38
On Wed, 2016-12-14 at 03:39 +0100, George Dunlap wrote:
> > On Dec 10, 2016, at 4:18 AM, Stefano Stabellini <sstabellini@kernel
> > .org> wrote:
> > > > The issue with spreading interrupts migrations over time is
> > > > that it makes
> > > > interrupt latency less deterministic. It is OK, in the uncommon
> > > > case of
> > > > vCPU migration with interrupts, to take a hit for a short time.
> > > > This
> > > > "hit" can be measured. It can be known. If your workload cannot
> > > > tolerate
> > > > it, vCPUs can be pinned. It should be a rare event anyway. On
> > > > the other
> > > > hand, by spreading interrupts migrations, we make it harder to
> > > > predict
> > > > latency. Aside from determinism, another problem with this
> > > > approach is
> > > > that it ensures that every interrupt assigned to a vCPU will
> > > > first hit
> > > > the wrong pCPU, then it will be moved. It guarantees the worst-
> > > > case
> > > > scenario for interrupt latency for the vCPU that has been
> > > > moved. If we
> > > > migrated all interrupts as soon as possible, we would minimize
> > > > the
> > > > amount of interrupts delivered to the wrong pCPU. Most
> > > > interrupts would
> > > > be delivered to the new pCPU right away, reducing interrupt
> > > > latency.

> Another approach which one might take:
> 3. Eagerly migrate a subset of the interrupts and lazily migrate the
> others.  For instance, we could eagerly migrate all the interrupts
> which have fired since the last vcpu migration.  In a system where
> migrations happen frequently, this should only be a handful; in a
> system that migrates infrequently, this will be more, but it won’t
> matter, because it will happen less often.
> 
Yes, if doable (e.g., I don't know how easy and practical it is to know
and keep track of fired interrupts) this looks like a good solution to me
too.

> So at the moment, the scheduler already tries to avoid migrating
> things *a little bit* if it can (see migrate_resist).  It’s not clear
> to me at the moment whether this is enough or not.  
>
Well, true, but migration resistance, in Credit2, is just a fixed value
which:
 1. is set at boot time;
 2. is always the same for all vcpus;
 3. is always the same, no matter what a vcpu is doing.

And even if we make it tunable and changeable at runtime (which I
intend to do), it's still something pretty "static" because of 2 and 3.

And even if we make it tunable per-vcpu (which is doable), it would be
rather hard to decide what value to set it to for each vcpu. And, of
course, 3 would still apply (i.e., it would not change according to the
vcpu's workload or characteristics).

So, it's guessing. More or less fine grained, but always guessing.

On the other hand, using something proportional to the nr. of routed
interrupts as the migration resistance threshold would overcome all 1, 2
and 3. It would give us a migrate_resist value which is adaptive, and
is determined according to the actual workload or properties of a specific
vcpu.
Feeding routed interrupt info to the load balancer comes from similar
reasoning (and we actually may want to do both).
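
For example, a per-vcpu resistance value could be derived roughly like this
(a sketch only; the helper name and every constant are invented):

#include <stdio.h>

/* Illustrative only: derive a per-vcpu migration resistance from the
 * number of interrupts routed to it, instead of a single fixed value.
 */
#define MIGRATE_RESIST_BASE_US     500    /* placeholder baseline */
#define MIGRATE_RESIST_PER_IRQ_US   50    /* extra reluctance per routed IRQ */
#define MIGRATE_RESIST_MAX_US    10000    /* cap, so one vcpu cannot pin itself */

static unsigned int vcpu_migrate_resist_us(unsigned int nr_routed_irqs)
{
    unsigned long resist = MIGRATE_RESIST_BASE_US +
                           (unsigned long)MIGRATE_RESIST_PER_IRQ_US * nr_routed_irqs;

    return resist > MIGRATE_RESIST_MAX_US ? MIGRATE_RESIST_MAX_US
                                          : (unsigned int)resist;
}

int main(void)
{
    printf("0 routed IRQs  -> %u us\n", vcpu_migrate_resist_us(0));
    printf("32 routed IRQs -> %u us\n", vcpu_migrate_resist_us(32));
    return 0;
}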

FTR, Credit1 has a similar mechanism, i.e., it *even wildly guesses*
whether a vcpu could still have some of its data in cache, and tries
not to migrate it if it's likely (see __csched_vcpu_is_cache_hot()).
We can improve that too, although it is a lot more complex and less
predictable, as usual with Credit1.

> Or to put it a different way — how long should the scheduler try to
> wait before moving one of these vcpus?  
>
Yep, it's similar to the "anticipation" problem in I/O schedulers
(where "excessive seeks" ~= "too frequent migrations").

 https://en.wikipedia.org/wiki/Anticipatory_scheduling

> At the moment I haven’t seen a good way of calculating this.
> 
Exactly, and basing the calculation on the number of routed interrupts
--and, if possible, other metrics too-- could be that "good way" we're
looking for.

It would need experimenting, of course, but I like the idea.

> #3 to me has the feeling of being somewhat more satisfying, but also
> potentially fairly complicated.  Since the scheduler already does
> migration resistance somewhat, #1 would be a simpler to implement in
> the sort run.  If it turns out that #1 has other drawbacks, we can
> implement #3 as and when needed.
> 
> Thoughts?
> 
Yes, we can do things incrementally, which is always good. I like your
#1 proposal because it has the really positive side effect of bringing
us in the camp of adaptive migration resistance, which is something
pretty advanced and pretty cool, if we manage to do it right. :-)

Regards,
Dario
diff mbox

Patch

diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
index 6f4329f..5129d6e 100644
--- a/xen/arch/arm/gic-its.c
+++ b/xen/arch/arm/gic-its.c
@@ -228,6 +228,18 @@  static int its_send_cmd_inv(struct host_its *its,
     return its_send_command(its, cmd);
 }
 
+static int its_send_cmd_invall(struct host_its *its, int cpu)
+{
+    uint64_t cmd[4];
+
+    cmd[0] = GITS_CMD_INVALL;
+    cmd[1] = 0x00;
+    cmd[2] = cpu & GENMASK(15, 0);
+    cmd[3] = 0x00;
+
+    return its_send_command(its, cmd);
+}
+
 int gicv3_its_map_device(struct host_its *hw_its, struct domain *d,
                          int devid, int bits, bool valid)
 {
@@ -668,6 +680,52 @@  uint32_t gicv3_lpi_lookup_lpi(struct domain *d, uint32_t host_lpi, int *vcpu_id)
     return hlpi.virt_lpi;
 }
 
+/* Iterate over all host LPIs, and updating the "enabled" state for a given
+ * guest redistributor (VCPU) given the respective state in the provided
+ * proptable. This proptable is indexed by the stored virtual LPI number.
+ * This is to implement a guest INVALL command.
+ */
+void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable)
+{
+    int chunk, i;
+    struct host_its *its;
+
+    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
+    {
+        if ( !lpi_data.host_lpis[chunk] )
+            continue;
+
+        for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
+        {
+            union host_lpi *hlpip = &lpi_data.host_lpis[chunk][i], hlpi;
+            uint32_t hlpi_nr;
+
+            hlpi.data = hlpip->data;
+            if ( !hlpi.virt_lpi )
+                continue;
+
+            if ( hlpi.dom_id != v->domain->domain_id )
+                continue;
+
+            if ( hlpi.vcpu_id != v->vcpu_id )
+                continue;
+
+            hlpi_nr = chunk * HOST_LPIS_PER_PAGE + i;
+
+            if ( proptable[hlpi.virt_lpi] & LPI_PROP_ENABLED )
+                lpi_data.lpi_property[hlpi_nr - 8192] |= LPI_PROP_ENABLED;
+            else
+                lpi_data.lpi_property[hlpi_nr - 8192] &= ~LPI_PROP_ENABLED;
+        }
+    }
+
+    /* Tell all ITSes that they should update the property table for CPU 0,
+     * which is where we map all LPIs to.
+     */
+    list_for_each_entry(its, &host_its_list, entry)
+        its_send_cmd_invall(its, 0);
+}
+
 void gicv3_lpi_set_enable(struct host_its *its,
                           uint32_t deviceid, uint32_t eventid,
                           uint32_t host_lpi, bool enabled)
diff --git a/xen/arch/arm/vgic-its.c b/xen/arch/arm/vgic-its.c
index 74da8fc..1e429b7 100644
--- a/xen/arch/arm/vgic-its.c
+++ b/xen/arch/arm/vgic-its.c
@@ -294,6 +294,33 @@  out_unlock:
     return ret;
 }
 
+/* INVALL updates the per-LPI configuration status for every LPI mapped to
+ * this redistributor. For the guest side we don't need to update anything,
+ * as we always refer to the actual table for the enabled bit and the
+ * priority.
+ * Enabling or disabling a virtual LPI however needs to be propagated to
+ * the respective host LPI. Instead of iterating over all mapped LPIs in our
+ * emulated GIC (which is expensive due to the required on-demand mapping),
+ * we iterate over all mapped _host_ LPIs and filter for those which are
+ * forwarded to this virtual redistributor.
+ */
+static int its_handle_invall(struct virt_its *its, uint64_t *cmdptr)
+{
+    uint32_t collid = its_cmd_get_collection(cmdptr);
+    struct vcpu *vcpu;
+
+    spin_lock(&its->its_lock);
+    vcpu = get_vcpu_from_collection(its, collid);
+    spin_unlock(&its->its_lock);
+
+    if ( !vcpu )
+        return -1;
+
+    gicv3_lpi_update_configurations(vcpu, its->d->arch.vgic.proptable);
+
+    return 0;
+}
+
 static int its_handle_mapc(struct virt_its *its, uint64_t *cmdptr)
 {
     uint32_t collid = its_cmd_get_collection(cmdptr);
@@ -515,6 +542,9 @@  static int vgic_its_handle_cmds(struct domain *d, struct virt_its *its,
         case GITS_CMD_INV:
             its_handle_inv(its, cmdptr);
 	    break;
+        case GITS_CMD_INVALL:
+            its_handle_invall(its, cmdptr);
+	    break;
         case GITS_CMD_MAPC:
             its_handle_mapc(its, cmdptr);
             break;
diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
index 2cdb3e1..ba6b2d5 100644
--- a/xen/include/asm-arm/gic-its.h
+++ b/xen/include/asm-arm/gic-its.h
@@ -146,6 +146,8 @@  int gicv3_lpi_drop_host_lpi(struct host_its *its,
                             uint32_t devid, uint32_t eventid,
                             uint32_t host_lpi);
 
+void gicv3_lpi_update_configurations(struct vcpu *v, uint8_t *proptable);
+
 static inline int gicv3_lpi_get_priority(struct domain *d, uint32_t lpi)
 {
     return d->arch.vgic.proptable[lpi - 8192] & 0xfc;