[RFC,06/24] ARM: GICv3 ITS: introduce host LPI array

Message ID	20160928182457.12433-7-andre.przywara@arm.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <xen-devel-bounces@lists.xen.org> From: Andre Przywara <andre.przywara@arm.com> To: Julien Grall <julien.grall@arm.com>, Stefano Stabellini <sstabellini@kernel.org> Date: Wed, 28 Sep 2016 19:24:39 +0100 Message-Id: <20160928182457.12433-7-andre.przywara@arm.com> In-Reply-To: <20160928182457.12433-1-andre.przywara@arm.com> References: <20160928182457.12433-1-andre.przywara@arm.com> Cc: xen-devel@lists.xenproject.org Subject: [Xen-devel] [RFC PATCH 06/24] ARM: GICv3 ITS: introduce host LPI array Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" <xen-devel-bounces@lists.xen.org>

Message ID

20160928182457.12433-7-andre.przywara@arm.com (mailing list archive)

State

New, archived

Headers

From: Andre Przywara <andre.przywara@arm.com>
To: Julien Grall <julien.grall@arm.com>,
	Stefano Stabellini <sstabellini@kernel.org>
Date: Wed, 28 Sep 2016 19:24:39 +0100
Message-Id: <20160928182457.12433-7-andre.przywara@arm.com>
In-Reply-To: <20160928182457.12433-1-andre.przywara@arm.com>
References: <20160928182457.12433-1-andre.przywara@arm.com>
Cc: xen-devel@lists.xenproject.org
Subject: [Xen-devel] [RFC PATCH 06/24] ARM: GICv3 ITS: introduce host LPI
	array
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: xen-devel-bounces@lists.xen.org
Sender: "Xen-devel" <xen-devel-bounces@lists.xen.org>

Commit Message

Andre Przywara Sept. 28, 2016, 6:24 p.m. UTC

The number of LPIs on a host can be potentially huge (millions),
although in practise will be mostly reasonable. So prematurely allocating
an array of struct irq_desc's for each LPI is not an option.
However Xen itself does not care about LPIs, as every LPI will be injected
into a guest (Dom0 for now).
Create a dense data structure (8 Bytes) for each LPI which holds just
enough information to determine the virtual IRQ number and the VCPU into
which the LPI needs to be injected.
Also to not artificially limit the number of LPIs, we create a 2-level
table for holding those structures.
This patch introduces functions to initialize these tables and to
create, lookup and destroy entries for a given LPI.
We allocate and access LPI information in a way that does not require
a lock.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 xen/arch/arm/gic-its.c        | 154 ++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/gic-its.h |  18 +++++
 2 files changed, 172 insertions(+)

Comments

Stefano Stabellini Oct. 27, 2016, 10:59 p.m. UTC | #1

On Wed, 28 Sep 2016, Andre Przywara wrote:
> The number of LPIs on a host can be potentially huge (millions),
> although in practise will be mostly reasonable. So prematurely allocating
> an array of struct irq_desc's for each LPI is not an option.
> However Xen itself does not care about LPIs, as every LPI will be injected
> into a guest (Dom0 for now).
> Create a dense data structure (8 Bytes) for each LPI which holds just
> enough information to determine the virtual IRQ number and the VCPU into
> which the LPI needs to be injected.
> Also to not artificially limit the number of LPIs, we create a 2-level
> table for holding those structures.
> This patch introduces functions to initialize these tables and to
> create, lookup and destroy entries for a given LPI.
> We allocate and access LPI information in a way that does not require
> a lock.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> ---
>  xen/arch/arm/gic-its.c        | 154 ++++++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-arm/gic-its.h |  18 +++++
>  2 files changed, 172 insertions(+)
> 
> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
> index 88397bc..2140e4a 100644
> --- a/xen/arch/arm/gic-its.c
> +++ b/xen/arch/arm/gic-its.c
> @@ -18,18 +18,31 @@
>  
>  #include <xen/config.h>
>  #include <xen/lib.h>
> +#include <xen/sched.h>
>  #include <xen/err.h>
>  #include <xen/device_tree.h>
>  #include <xen/libfdt/libfdt.h>
>  #include <asm/p2m.h>
> +#include <asm/domain.h>
>  #include <asm/io.h>
>  #include <asm/gic.h>
>  #include <asm/gic_v3_defs.h>
>  #include <asm/gic-its.h>
>  
> +/* LPIs on the host always go to a guest, so no struct irq_desc for them. */
> +union host_lpi {
> +    uint64_t data;
> +    struct {
> +        uint64_t virt_lpi:32;
> +        uint64_t dom_id:16;
> +        uint64_t vcpu_id:16;
> +    };
> +};

Why not the following?

  union host_lpi {
      uint64_t data;
      struct {
          uint32_t virt_lpi;
          uint16_t dom_id;
          uint16_t vcpu_id;
      };
  };


>  /* Global state */
>  static struct {
>      uint8_t *lpi_property;
> +    union host_lpi **host_lpis;
>      int host_lpi_bits;
>  } lpi_data;
>  
> @@ -43,6 +56,26 @@ static DEFINE_PER_CPU(void *, pending_table);
>  #define MAX_HOST_LPI_BITS                                                \
>          min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS)
>  #define MAX_HOST_LPIS   (BIT(MAX_HOST_LPI_BITS) - 8192)
> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
> +
> +static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d)

I take "lpi" is the physical lpi here. Maybe we would rename it to "plpi"
for clarity.


> +{
> +    union host_lpi *hlpi;
> +
> +    if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 )
> +        return NULL;
> +
> +    lpi -= 8192;
> +    if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] )
> +        return NULL;
> +
> +    hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % HOST_LPIS_PER_PAGE];

I realize I am sometimes obsessive about this, but division operations
are expensive and this is on the hot path, so I would do:

#define HOST_LPIS_PER_PAGE      (PAGE_SIZE >> 3)

unsigned int table = lpi / HOST_LPIS_PER_PAGE;

then use table throughout this function.


> +    if ( d && hlpi->dom_id != d->domain_id )
> +        return NULL;

I think this function is very useful so I would avoid making any domain
checks here: one day we might want to retrieve hlpi even if hlpi->dom_id
!= d->domain_id. I would move the domain check outside.


> +    return hlpi;
> +}
>  
>  #define ITS_COMMAND_SIZE        32
>  
> @@ -96,6 +129,33 @@ static int its_send_cmd_sync(struct host_its *its, int cpu)
>      return its_send_command(its, cmd);
>  }
>  
> +static int its_send_cmd_discard(struct host_its *its,
> +                                uint32_t deviceid, uint32_t eventid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_DISCARD | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid;
> +    cmd[2] = 0x00;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
> +static int its_send_cmd_mapti(struct host_its *its,
> +                              uint32_t deviceid, uint32_t eventid,
> +                              uint32_t pintid, uint16_t icid)
> +{
> +    uint64_t cmd[4];
> +
> +    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
> +    cmd[1] = eventid | ((uint64_t)pintid << 32);
> +    cmd[2] = icid;
> +    cmd[3] = 0x00;
> +
> +    return its_send_command(its, cmd);
> +}
> +
>  static int its_send_cmd_mapc(struct host_its *its, int collection_id, int cpu)
>  {
>      uint64_t cmd[4];
> @@ -330,15 +390,109 @@ uint64_t gicv3_lpi_get_proptable()
>      return reg;
>  }
>  
> +/* Allocate the 2nd level array for host LPIs. This one holds pointers
> + * to the page with the actual "union host_lpi" entries. Our LPI limit
> + * avoids excessive memory usage.
> + */
>  int gicv3_lpi_init_host_lpis(int lpi_bits)
>  {
> +    int nr_lpi_ptrs;
> +
>      lpi_data.host_lpi_bits = lpi_bits;
>  
> +    nr_lpi_ptrs = MAX_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
> +
> +    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
> +    if ( !lpi_data.host_lpis )
> +        return -ENOMEM;

Why are we not allocating the 2nd level right away? To save memory? If
so, I would like some numbers in a real use case scenario written either
here on in the commit message.


>      printk("GICv3: using at most %ld LPIs on the host.\n", MAX_HOST_LPIS);
>  
>      return 0;
>  }
>  
> +/* Allocates a new host LPI to be injected as "virt_lpi" into the specified
> + * VCPU. Returns the host LPI ID or a negative error value.
> + */
> +int gicv3_lpi_allocate_host_lpi(struct host_its *its,
> +                                uint32_t devid, uint32_t eventid,
> +                                struct vcpu *v, int virt_lpi)
> +{
> +    int chunk, i;
> +    union host_lpi hlpi, *new_chunk;
> +
> +    /* TODO: handle some kind of preassigned LPI mapping for DomUs */
> +    if ( !its )
> +        return -EPERM;
> +
> +    /* TODO: This could be optimized by storing some "next available" hint and
> +     * only iterate if this one doesn't work. But this function should be
> +     * called rarely.
> +     */

Yes please. Even a trivial pointer to last would be far better than this.
It would be nice to run some numbers and prove that in realistic
scenarios finding an empty plpi doesn't take more than 5-10 ops, which
should be the case unless we have to loop over and the initial chucks
are still fully populated, causing Xen to scan for 512 units at a time.
We defenitely want to avoid that, if not in rare worse case scenarios.


> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
> +    {
> +        /* If we hit an unallocated chunk, we initialize it and use entry 0. */
> +        if ( !lpi_data.host_lpis[chunk] )
> +        {
> +            new_chunk = alloc_xenheap_pages(0, 0);
> +            if ( !new_chunk )
> +                return -ENOMEM;
> +
> +            memset(new_chunk, 0, PAGE_SIZE);
> +            lpi_data.host_lpis[chunk] = new_chunk;
> +            i = 0;
> +        }
> +        else
> +        {
> +            /* Find an unallocted entry in this chunk. */
> +            for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
> +                if ( !lpi_data.host_lpis[chunk][i].virt_lpi )
> +                    break;
> +
> +            /* If this chunk is fully allocted, advance to the next one. */
                                           ^ allocated


> +            if ( i == HOST_LPIS_PER_PAGE)
> +                continue;
> +        }
> +
> +        hlpi.virt_lpi = virt_lpi;
> +        hlpi.dom_id = v->domain->domain_id;
> +        hlpi.vcpu_id = v->vcpu_id;
> +        lpi_data.host_lpis[chunk][i].data = hlpi.data;
> +
> +        if (its)

code style


> +        {
> +            its_send_cmd_mapti(its, devid, eventid,
> +                               chunk * HOST_LPIS_PER_PAGE + i + 8192, 0);
> +            its_send_cmd_sync(its, 0);

Why hardcode the physical cpu to 0? Should we get the pcpu the vcpu is
currently running on?


> +        }
> +
> +        return chunk * HOST_LPIS_PER_PAGE + i + 8192;
> +    }
> +
> +    return -ENOSPC;
> +}
> +
> +/* Drops the connection of the given host LPI to a virtual LPI.
> + */
> +int gicv3_lpi_drop_host_lpi(struct host_its *its,
> +                            uint32_t devid, uint32_t eventid, uint32_t host_lpi)
> +{
> +    union host_lpi *hlpip;
> +
> +    if ( !its )
> +        return -EPERM;
> +
> +    hlpip = gic_find_host_lpi(host_lpi, NULL);
> +    if ( !hlpip )
> +        return -1;
> +
> +    hlpip->data = 0;
> +
> +    its_send_cmd_discard(its, devid, eventid);
> +
> +    return 0;
> +}
> +
>  void gicv3_its_dt_init(const struct dt_device_node *node)
>  {
>      const struct dt_device_node *its = NULL;
> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
> index b49d274..512a388 100644
> --- a/xen/include/asm-arm/gic-its.h
> +++ b/xen/include/asm-arm/gic-its.h
> @@ -114,6 +114,12 @@ void gicv3_set_redist_addr(paddr_t address, int redist_id);
>  /* Map a collection for this host CPU to each host ITS. */
>  void gicv3_its_setup_collection(int cpu);
>  
> +int gicv3_lpi_allocate_host_lpi(struct host_its *its,
> +                                uint32_t devid, uint32_t eventid,
> +                                struct vcpu *v, int virt_lpi);
> +int gicv3_lpi_drop_host_lpi(struct host_its *its,
> +                            uint32_t devid, uint32_t eventid,
> +                            uint32_t host_lpi);
>  #else
>  
>  static inline void gicv3_its_dt_init(const struct dt_device_node *node)
> @@ -141,6 +147,18 @@ static inline void gicv3_set_redist_addr(paddr_t address, int redist_id)
>  static inline void gicv3_its_setup_collection(int cpu)
>  {
>  }
> +static inline int gicv3_lpi_allocate_host_lpi(struct host_its *its,
> +                                              uint32_t devid, uint32_t eventid,
> +                                              struct vcpu *v, int virt_lpi)
> +{
> +    return 0;
> +}
> +static inline int gicv3_lpi_drop_host_lpi(struct host_its *its,
> +                                          uint32_t devid, uint32_t eventid,
> +                                          uint32_t host_lpi)
> +{
> +    return 0;
> +}
>  
>  #endif /* CONFIG_HAS_ITS */
>  
> -- 
> 2.9.0
>

Julien Grall Nov. 2, 2016, 3:14 p.m. UTC | #2

Hi,

On 27/10/16 23:59, Stefano Stabellini wrote:
> On Wed, 28 Sep 2016, Andre Przywara wrote:
>> The number of LPIs on a host can be potentially huge (millions),
>> although in practise will be mostly reasonable. So prematurely allocating
>> an array of struct irq_desc's for each LPI is not an option.
>> However Xen itself does not care about LPIs, as every LPI will be injected
>> into a guest (Dom0 for now).
>> Create a dense data structure (8 Bytes) for each LPI which holds just
>> enough information to determine the virtual IRQ number and the VCPU into
>> which the LPI needs to be injected.
>> Also to not artificially limit the number of LPIs, we create a 2-level
>> table for holding those structures.
>> This patch introduces functions to initialize these tables and to
>> create, lookup and destroy entries for a given LPI.
>> We allocate and access LPI information in a way that does not require
>> a lock.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> ---
>>  xen/arch/arm/gic-its.c        | 154 ++++++++++++++++++++++++++++++++++++++++++
>>  xen/include/asm-arm/gic-its.h |  18 +++++
>>  2 files changed, 172 insertions(+)
>>
>> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
>> index 88397bc..2140e4a 100644
>> --- a/xen/arch/arm/gic-its.c
>> +++ b/xen/arch/arm/gic-its.c
>> @@ -18,18 +18,31 @@
>>
>>  #include <xen/config.h>
>>  #include <xen/lib.h>
>> +#include <xen/sched.h>
>>  #include <xen/err.h>
>>  #include <xen/device_tree.h>
>>  #include <xen/libfdt/libfdt.h>
>>  #include <asm/p2m.h>
>> +#include <asm/domain.h>
>>  #include <asm/io.h>
>>  #include <asm/gic.h>
>>  #include <asm/gic_v3_defs.h>
>>  #include <asm/gic-its.h>
>>
>> +/* LPIs on the host always go to a guest, so no struct irq_desc for them. */
>> +union host_lpi {
>> +    uint64_t data;
>> +    struct {
>> +        uint64_t virt_lpi:32;
>> +        uint64_t dom_id:16;
>> +        uint64_t vcpu_id:16;
>> +    };
>> +};
>
> Why not the following?
>
>   union host_lpi {
>       uint64_t data;
>       struct {
>           uint32_t virt_lpi;
>           uint16_t dom_id;
>           uint16_t vcpu_id;
>       };
>   };
>
>
>>  /* Global state */
>>  static struct {
>>      uint8_t *lpi_property;
>> +    union host_lpi **host_lpis;
>>      int host_lpi_bits;
>>  } lpi_data;
>>
>> @@ -43,6 +56,26 @@ static DEFINE_PER_CPU(void *, pending_table);
>>  #define MAX_HOST_LPI_BITS                                                \
>>          min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS)
>>  #define MAX_HOST_LPIS   (BIT(MAX_HOST_LPI_BITS) - 8192)
>> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
>> +
>> +static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d)
>
> I take "lpi" is the physical lpi here. Maybe we would rename it to "plpi"
> for clarity.

+1 here. We tend to use the prefix 'p' for physical and 'v' for virtual 
(e.g virq/pirq, vcpu/pcpu). I'd like to see the same for the LPIs.

While we are here, I think the function should be named gic_find_plpi.

>
>
>> +{
>> +    union host_lpi *hlpi;
>> +
>> +    if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 )
>> +        return NULL;
>> +
>> +    lpi -= 8192;
>> +    if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] )
>> +        return NULL;
>> +
>> +    hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % HOST_LPIS_PER_PAGE];
>
> I realize I am sometimes obsessive about this, but division operations
> are expensive and this is on the hot path, so I would do:
>
> #define HOST_LPIS_PER_PAGE      (PAGE_SIZE >> 3)
>
> unsigned int table = lpi / HOST_LPIS_PER_PAGE;
>
> then use table throughout this function.
>
>
>> +    if ( d && hlpi->dom_id != d->domain_id )
>> +        return NULL;
>
> I think this function is very useful so I would avoid making any domain
> checks here: one day we might want to retrieve hlpi even if hlpi->dom_id
> != d->domain_id. I would move the domain check outside.

+1.

Regards,

Andre Przywara Nov. 10, 2016, 5:22 p.m. UTC | #3

Hi,

On 27/10/16 23:59, Stefano Stabellini wrote:
> On Wed, 28 Sep 2016, Andre Przywara wrote:
>> The number of LPIs on a host can be potentially huge (millions),
>> although in practise will be mostly reasonable. So prematurely allocating
>> an array of struct irq_desc's for each LPI is not an option.
>> However Xen itself does not care about LPIs, as every LPI will be injected
>> into a guest (Dom0 for now).
>> Create a dense data structure (8 Bytes) for each LPI which holds just
>> enough information to determine the virtual IRQ number and the VCPU into
>> which the LPI needs to be injected.
>> Also to not artificially limit the number of LPIs, we create a 2-level
>> table for holding those structures.
>> This patch introduces functions to initialize these tables and to
>> create, lookup and destroy entries for a given LPI.
>> We allocate and access LPI information in a way that does not require
>> a lock.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> ---
>>  xen/arch/arm/gic-its.c        | 154 ++++++++++++++++++++++++++++++++++++++++++
>>  xen/include/asm-arm/gic-its.h |  18 +++++
>>  2 files changed, 172 insertions(+)
>>
>> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
>> index 88397bc..2140e4a 100644
>> --- a/xen/arch/arm/gic-its.c
>> +++ b/xen/arch/arm/gic-its.c
>> @@ -18,18 +18,31 @@
>>  
>>  #include <xen/config.h>
>>  #include <xen/lib.h>
>> +#include <xen/sched.h>
>>  #include <xen/err.h>
>>  #include <xen/device_tree.h>
>>  #include <xen/libfdt/libfdt.h>
>>  #include <asm/p2m.h>
>> +#include <asm/domain.h>
>>  #include <asm/io.h>
>>  #include <asm/gic.h>
>>  #include <asm/gic_v3_defs.h>
>>  #include <asm/gic-its.h>
>>  
>> +/* LPIs on the host always go to a guest, so no struct irq_desc for them. */
>> +union host_lpi {
>> +    uint64_t data;
>> +    struct {
>> +        uint64_t virt_lpi:32;
>> +        uint64_t dom_id:16;
>> +        uint64_t vcpu_id:16;
>> +    };
>> +};
> 
> Why not the following?
> 
>   union host_lpi {
>       uint64_t data;
>       struct {
>           uint32_t virt_lpi;
>           uint16_t dom_id;
>           uint16_t vcpu_id;
>       };
>   };

I am not sure that gives me a guarantee of stuffing everything into a
u64 (as per the C standard). It probably will on arm64 with gcc, but I
thought better safe than sorry.

>>  /* Global state */
>>  static struct {
>>      uint8_t *lpi_property;
>> +    union host_lpi **host_lpis;
>>      int host_lpi_bits;
>>  } lpi_data;
>>  
>> @@ -43,6 +56,26 @@ static DEFINE_PER_CPU(void *, pending_table);
>>  #define MAX_HOST_LPI_BITS                                                \
>>          min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS)
>>  #define MAX_HOST_LPIS   (BIT(MAX_HOST_LPI_BITS) - 8192)
>> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
>> +
>> +static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d)
> 
> I take "lpi" is the physical lpi here. Maybe we would rename it to "plpi"
> for clarity.

Indeed.

> 
>> +{
>> +    union host_lpi *hlpi;
>> +
>> +    if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 )
>> +        return NULL;
>> +
>> +    lpi -= 8192;
>> +    if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] )
>> +        return NULL;
>> +
>> +    hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % HOST_LPIS_PER_PAGE];
> 
> I realize I am sometimes obsessive about this, but division operations
> are expensive and this is on the hot path, so I would do:
> 
> #define HOST_LPIS_PER_PAGE      (PAGE_SIZE >> 3)

to replace
#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))?

This should be computed by the compiler, as it's constant.

> unsigned int table = lpi / HOST_LPIS_PER_PAGE;

So I'd rather replace this by ">> (PAGE_SIZE - 3)".
But again the compiler would do this for us, as replacing "constant
divisions by power-of-two" with "right shifts" are a text book example
of easy optimization, if I remember this compiler class at uni correctly ;-)

> then use table throughout this function.

I see your point (though this is ARMv8, which always has udiv).
But to prove your paranoia wrong: I don't see any divisions in the
disassembly, but a lsr #3 and a lsr #9 and various other clever and
cheap ARMv8 instructions ;-)
Compilers have really come a long way in 2016 ...

> 
>> +    if ( d && hlpi->dom_id != d->domain_id )
>> +        return NULL;
> 
> I think this function is very useful so I would avoid making any domain
> checks here: one day we might want to retrieve hlpi even if hlpi->dom_id
> != d->domain_id. I would move the domain check outside.

That's why I have "d && ..." in front. If you pass in NULL for the
domain, it will skip this check. That saves us from coding the check in
every caller.
Is that not good enough?

> 
>> +    return hlpi;
>> +}
>>  
>>  #define ITS_COMMAND_SIZE        32
>>  
>> @@ -96,6 +129,33 @@ static int its_send_cmd_sync(struct host_its *its, int cpu)
>>      return its_send_command(its, cmd);
>>  }
>>  
>> +static int its_send_cmd_discard(struct host_its *its,
>> +                                uint32_t deviceid, uint32_t eventid)
>> +{
>> +    uint64_t cmd[4];
>> +
>> +    cmd[0] = GITS_CMD_DISCARD | ((uint64_t)deviceid << 32);
>> +    cmd[1] = eventid;
>> +    cmd[2] = 0x00;
>> +    cmd[3] = 0x00;
>> +
>> +    return its_send_command(its, cmd);
>> +}
>> +
>> +static int its_send_cmd_mapti(struct host_its *its,
>> +                              uint32_t deviceid, uint32_t eventid,
>> +                              uint32_t pintid, uint16_t icid)
>> +{
>> +    uint64_t cmd[4];
>> +
>> +    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
>> +    cmd[1] = eventid | ((uint64_t)pintid << 32);
>> +    cmd[2] = icid;
>> +    cmd[3] = 0x00;
>> +
>> +    return its_send_command(its, cmd);
>> +}
>> +
>>  static int its_send_cmd_mapc(struct host_its *its, int collection_id, int cpu)
>>  {
>>      uint64_t cmd[4];
>> @@ -330,15 +390,109 @@ uint64_t gicv3_lpi_get_proptable()
>>      return reg;
>>  }
>>  
>> +/* Allocate the 2nd level array for host LPIs. This one holds pointers
>> + * to the page with the actual "union host_lpi" entries. Our LPI limit
>> + * avoids excessive memory usage.
>> + */
>>  int gicv3_lpi_init_host_lpis(int lpi_bits)
>>  {
>> +    int nr_lpi_ptrs;
>> +
>>      lpi_data.host_lpi_bits = lpi_bits;
>>  
>> +    nr_lpi_ptrs = MAX_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
>> +
>> +    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
>> +    if ( !lpi_data.host_lpis )
>> +        return -ENOMEM;
> 
> Why are we not allocating the 2nd level right away? To save memory? If
> so, I would like some numbers in a real use case scenario written either
> here on in the commit message.

LPIs can be allocated sparsely. Each LPI uses 8 Bytes, chances are we
never use more than a few dozen on a real system, so we just use two
pages with this scheme.

Allocating memory for all 2 << 20 (the default) takes 8 MB (probably for
nothing), extending this to 24 bits uses 128 MB already.
The problem is that Xen cannot know how many LPIs Dom0 will use, so I'd
rather make this number generous here - hence the allocation scheme.

Not sure if this is actually overkill or paranoid and we would get away
with a much smaller single level allocation, driven by a config option
or runtime parameter, though.

>>      printk("GICv3: using at most %ld LPIs on the host.\n", MAX_HOST_LPIS);
>>  
>>      return 0;
>>  }
>>  
>> +/* Allocates a new host LPI to be injected as "virt_lpi" into the specified
>> + * VCPU. Returns the host LPI ID or a negative error value.
>> + */
>> +int gicv3_lpi_allocate_host_lpi(struct host_its *its,
>> +                                uint32_t devid, uint32_t eventid,
>> +                                struct vcpu *v, int virt_lpi)
>> +{
>> +    int chunk, i;
>> +    union host_lpi hlpi, *new_chunk;
>> +
>> +    /* TODO: handle some kind of preassigned LPI mapping for DomUs */
>> +    if ( !its )
>> +        return -EPERM;
>> +
>> +    /* TODO: This could be optimized by storing some "next available" hint and
>> +     * only iterate if this one doesn't work. But this function should be
>> +     * called rarely.
>> +     */
> 
> Yes please. Even a trivial pointer to last would be far better than this.
> It would be nice to run some numbers and prove that in realistic
> scenarios finding an empty plpi doesn't take more than 5-10 ops, which
> should be the case unless we have to loop over and the initial chucks
> are still fully populated, causing Xen to scan for 512 units at a time.
> We defenitely want to avoid that, if not in rare worse case scenarios.

I can try, though keep in mind that this code is really only called on
allocating a host LPI, which would only happen when an LPI gets mapped.
And this is done only upon a Dom0 driver initializing a device. Normally
you wouldn't expect this during the actual guest runtime.

> 
>> +    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
>> +    {
>> +        /* If we hit an unallocated chunk, we initialize it and use entry 0. */
>> +        if ( !lpi_data.host_lpis[chunk] )
>> +        {
>> +            new_chunk = alloc_xenheap_pages(0, 0);
>> +            if ( !new_chunk )
>> +                return -ENOMEM;
>> +
>> +            memset(new_chunk, 0, PAGE_SIZE);
>> +            lpi_data.host_lpis[chunk] = new_chunk;
>> +            i = 0;
>> +        }
>> +        else
>> +        {
>> +            /* Find an unallocted entry in this chunk. */
>> +            for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
>> +                if ( !lpi_data.host_lpis[chunk][i].virt_lpi )
>> +                    break;
>> +
>> +            /* If this chunk is fully allocted, advance to the next one. */
>                                            ^ allocated
> 
> 
>> +            if ( i == HOST_LPIS_PER_PAGE)
>> +                continue;
>> +        }
>> +
>> +        hlpi.virt_lpi = virt_lpi;
>> +        hlpi.dom_id = v->domain->domain_id;
>> +        hlpi.vcpu_id = v->vcpu_id;
>> +        lpi_data.host_lpis[chunk][i].data = hlpi.data;
>> +
>> +        if (its)
> 
> code style
> 
> 
>> +        {
>> +            its_send_cmd_mapti(its, devid, eventid,
>> +                               chunk * HOST_LPIS_PER_PAGE + i + 8192, 0);
>> +            its_send_cmd_sync(its, 0);
> 
> Why hardcode the physical cpu to 0? Should we get the pcpu the vcpu is
> currently running on?

Yes, admittedly I was papering over this for the RFC (as I am afraid
there's more than that).
Will look at this.

Cheers,
Andre.

>> +        }
>> +
>> +        return chunk * HOST_LPIS_PER_PAGE + i + 8192;
>> +    }
>> +
>> +    return -ENOSPC;
>> +}
>> +
>> +/* Drops the connection of the given host LPI to a virtual LPI.
>> + */
>> +int gicv3_lpi_drop_host_lpi(struct host_its *its,
>> +                            uint32_t devid, uint32_t eventid, uint32_t host_lpi)
>> +{
>> +    union host_lpi *hlpip;
>> +
>> +    if ( !its )
>> +        return -EPERM;
>> +
>> +    hlpip = gic_find_host_lpi(host_lpi, NULL);
>> +    if ( !hlpip )
>> +        return -1;
>> +
>> +    hlpip->data = 0;
>> +
>> +    its_send_cmd_discard(its, devid, eventid);
>> +
>> +    return 0;
>> +}
>> +
>>  void gicv3_its_dt_init(const struct dt_device_node *node)
>>  {
>>      const struct dt_device_node *its = NULL;
>> diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
>> index b49d274..512a388 100644
>> --- a/xen/include/asm-arm/gic-its.h
>> +++ b/xen/include/asm-arm/gic-its.h
>> @@ -114,6 +114,12 @@ void gicv3_set_redist_addr(paddr_t address, int redist_id);
>>  /* Map a collection for this host CPU to each host ITS. */
>>  void gicv3_its_setup_collection(int cpu);
>>  
>> +int gicv3_lpi_allocate_host_lpi(struct host_its *its,
>> +                                uint32_t devid, uint32_t eventid,
>> +                                struct vcpu *v, int virt_lpi);
>> +int gicv3_lpi_drop_host_lpi(struct host_its *its,
>> +                            uint32_t devid, uint32_t eventid,
>> +                            uint32_t host_lpi);
>>  #else
>>  
>>  static inline void gicv3_its_dt_init(const struct dt_device_node *node)
>> @@ -141,6 +147,18 @@ static inline void gicv3_set_redist_addr(paddr_t address, int redist_id)
>>  static inline void gicv3_its_setup_collection(int cpu)
>>  {
>>  }
>> +static inline int gicv3_lpi_allocate_host_lpi(struct host_its *its,
>> +                                              uint32_t devid, uint32_t eventid,
>> +                                              struct vcpu *v, int virt_lpi)
>> +{
>> +    return 0;
>> +}
>> +static inline int gicv3_lpi_drop_host_lpi(struct host_its *its,
>> +                                          uint32_t devid, uint32_t eventid,
>> +                                          uint32_t host_lpi)
>> +{
>> +    return 0;
>> +}
>>  
>>  #endif /* CONFIG_HAS_ITS */
>>  
>> -- 
>> 2.9.0
>>
>

Stefano Stabellini Nov. 10, 2016, 9:48 p.m. UTC | #4

On Thu, 10 Nov 2016, Andre Przywara wrote:
> Hi,
> 
> On 27/10/16 23:59, Stefano Stabellini wrote:
> > On Wed, 28 Sep 2016, Andre Przywara wrote:
> >> The number of LPIs on a host can be potentially huge (millions),
> >> although in practise will be mostly reasonable. So prematurely allocating
> >> an array of struct irq_desc's for each LPI is not an option.
> >> However Xen itself does not care about LPIs, as every LPI will be injected
> >> into a guest (Dom0 for now).
> >> Create a dense data structure (8 Bytes) for each LPI which holds just
> >> enough information to determine the virtual IRQ number and the VCPU into
> >> which the LPI needs to be injected.
> >> Also to not artificially limit the number of LPIs, we create a 2-level
> >> table for holding those structures.
> >> This patch introduces functions to initialize these tables and to
> >> create, lookup and destroy entries for a given LPI.
> >> We allocate and access LPI information in a way that does not require
> >> a lock.
> >>
> >> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> >> ---
> >>  xen/arch/arm/gic-its.c        | 154 ++++++++++++++++++++++++++++++++++++++++++
> >>  xen/include/asm-arm/gic-its.h |  18 +++++
> >>  2 files changed, 172 insertions(+)
> >>
> >> diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
> >> index 88397bc..2140e4a 100644
> >> --- a/xen/arch/arm/gic-its.c
> >> +++ b/xen/arch/arm/gic-its.c
> >> @@ -18,18 +18,31 @@
> >>  
> >>  #include <xen/config.h>
> >>  #include <xen/lib.h>
> >> +#include <xen/sched.h>
> >>  #include <xen/err.h>
> >>  #include <xen/device_tree.h>
> >>  #include <xen/libfdt/libfdt.h>
> >>  #include <asm/p2m.h>
> >> +#include <asm/domain.h>
> >>  #include <asm/io.h>
> >>  #include <asm/gic.h>
> >>  #include <asm/gic_v3_defs.h>
> >>  #include <asm/gic-its.h>
> >>  
> >> +/* LPIs on the host always go to a guest, so no struct irq_desc for them. */
> >> +union host_lpi {
> >> +    uint64_t data;
> >> +    struct {
> >> +        uint64_t virt_lpi:32;
> >> +        uint64_t dom_id:16;
> >> +        uint64_t vcpu_id:16;
> >> +    };
> >> +};
> > 
> > Why not the following?
> > 
> >   union host_lpi {
> >       uint64_t data;
> >       struct {
> >           uint32_t virt_lpi;
> >           uint16_t dom_id;
> >           uint16_t vcpu_id;
> >       };
> >   };
> 
> I am not sure that gives me a guarantee of stuffing everything into a
> u64 (as per the C standard). It probably will on arm64 with gcc, but I
> thought better safe than sorry.

I am pretty sure that it is covered by the standard, also see
IHI0055A_aapcs64. Additionally I don't think the union with "data" is
actually required either.


> >>  /* Global state */
> >>  static struct {
> >>      uint8_t *lpi_property;
> >> +    union host_lpi **host_lpis;
> >>      int host_lpi_bits;
> >>  } lpi_data;
> >>  
> >> @@ -43,6 +56,26 @@ static DEFINE_PER_CPU(void *, pending_table);
> >>  #define MAX_HOST_LPI_BITS                                                \
> >>          min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS)
> >>  #define MAX_HOST_LPIS   (BIT(MAX_HOST_LPI_BITS) - 8192)
> >> +#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
> >> +
> >> +static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d)
> > 
> > I take "lpi" is the physical lpi here. Maybe we would rename it to "plpi"
> > for clarity.
> 
> Indeed.
> 
> > 
> >> +{
> >> +    union host_lpi *hlpi;
> >> +
> >> +    if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 )
> >> +        return NULL;
> >> +
> >> +    lpi -= 8192;
> >> +    if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] )
> >> +        return NULL;
> >> +
> >> +    hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % HOST_LPIS_PER_PAGE];
> > 
> > I realize I am sometimes obsessive about this, but division operations
> > are expensive and this is on the hot path, so I would do:
> > 
> > #define HOST_LPIS_PER_PAGE      (PAGE_SIZE >> 3)
> 
> to replace
> #define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))?
> 
> This should be computed by the compiler, as it's constant.
> 
> > unsigned int table = lpi / HOST_LPIS_PER_PAGE;
> 
> So I'd rather replace this by ">> (PAGE_SIZE - 3)".

This is actually what I meant, thanks.


> But again the compiler would do this for us, as replacing "constant
> divisions by power-of-two" with "right shifts" are a text book example
> of easy optimization, if I remember this compiler class at uni correctly ;-)

Yet, we found instanced where this didn't happen in the common Xen
scheduler code on x86.


> > then use table throughout this function.
> 
> I see your point (though this is ARMv8, which always has udiv).
> But to prove your paranoia wrong: I don't see any divisions in the
> disassembly, but a lsr #3 and a lsr #9 and various other clever and
> cheap ARMv8 instructions ;-)
> Compilers have really come a long way in 2016 ...

Fair enough, thanks for checking. That is enough for me.


> >> +    if ( d && hlpi->dom_id != d->domain_id )
> >> +        return NULL;
> > 
> > I think this function is very useful so I would avoid making any domain
> > checks here: one day we might want to retrieve hlpi even if hlpi->dom_id
> > != d->domain_id. I would move the domain check outside.
> 
> That's why I have "d && ..." in front. If you pass in NULL for the
> domain, it will skip this check. That saves us from coding the check in
> every caller.
> Is that not good enough?

There is a simple solution to this: write two functions, one without the
check and a wrapper to it with the check.


> > 
> >> +    return hlpi;
> >> +}
> >>  
> >>  #define ITS_COMMAND_SIZE        32
> >>  
> >> @@ -96,6 +129,33 @@ static int its_send_cmd_sync(struct host_its *its, int cpu)
> >>      return its_send_command(its, cmd);
> >>  }
> >>  
> >> +static int its_send_cmd_discard(struct host_its *its,
> >> +                                uint32_t deviceid, uint32_t eventid)
> >> +{
> >> +    uint64_t cmd[4];
> >> +
> >> +    cmd[0] = GITS_CMD_DISCARD | ((uint64_t)deviceid << 32);
> >> +    cmd[1] = eventid;
> >> +    cmd[2] = 0x00;
> >> +    cmd[3] = 0x00;
> >> +
> >> +    return its_send_command(its, cmd);
> >> +}
> >> +
> >> +static int its_send_cmd_mapti(struct host_its *its,
> >> +                              uint32_t deviceid, uint32_t eventid,
> >> +                              uint32_t pintid, uint16_t icid)
> >> +{
> >> +    uint64_t cmd[4];
> >> +
> >> +    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
> >> +    cmd[1] = eventid | ((uint64_t)pintid << 32);
> >> +    cmd[2] = icid;
> >> +    cmd[3] = 0x00;
> >> +
> >> +    return its_send_command(its, cmd);
> >> +}
> >> +
> >>  static int its_send_cmd_mapc(struct host_its *its, int collection_id, int cpu)
> >>  {
> >>      uint64_t cmd[4];
> >> @@ -330,15 +390,109 @@ uint64_t gicv3_lpi_get_proptable()
> >>      return reg;
> >>  }
> >>  
> >> +/* Allocate the 2nd level array for host LPIs. This one holds pointers
> >> + * to the page with the actual "union host_lpi" entries. Our LPI limit
> >> + * avoids excessive memory usage.
> >> + */
> >>  int gicv3_lpi_init_host_lpis(int lpi_bits)
> >>  {
> >> +    int nr_lpi_ptrs;
> >> +
> >>      lpi_data.host_lpi_bits = lpi_bits;
> >>  
> >> +    nr_lpi_ptrs = MAX_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
> >> +
> >> +    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
> >> +    if ( !lpi_data.host_lpis )
> >> +        return -ENOMEM;
> > 
> > Why are we not allocating the 2nd level right away? To save memory? If
> > so, I would like some numbers in a real use case scenario written either
> > here on in the commit message.
> 
> LPIs can be allocated sparsely. Each LPI uses 8 Bytes, chances are we
> never use more than a few dozen on a real system, so we just use two
> pages with this scheme.
> 
> Allocating memory for all 2 << 20 (the default) takes 8 MB (probably for
> nothing), extending this to 24 bits uses 128 MB already.
> The problem is that Xen cannot know how many LPIs Dom0 will use, so I'd
> rather make this number generous here - hence the allocation scheme.
> 
> Not sure if this is actually overkill or paranoid and we would get away
> with a much smaller single level allocation, driven by a config option
> or runtime parameter, though.

All right. Please write an in-code comment explaining this reasoning
with a sample number of LPIs used by Dom0 on a real case scenario.


> >>      printk("GICv3: using at most %ld LPIs on the host.\n", MAX_HOST_LPIS);
> >>  
> >>      return 0;
> >>  }
> >>  
> >> +/* Allocates a new host LPI to be injected as "virt_lpi" into the specified
> >> + * VCPU. Returns the host LPI ID or a negative error value.
> >> + */
> >> +int gicv3_lpi_allocate_host_lpi(struct host_its *its,
> >> +                                uint32_t devid, uint32_t eventid,
> >> +                                struct vcpu *v, int virt_lpi)
> >> +{
> >> +    int chunk, i;
> >> +    union host_lpi hlpi, *new_chunk;
> >> +
> >> +    /* TODO: handle some kind of preassigned LPI mapping for DomUs */
> >> +    if ( !its )
> >> +        return -EPERM;
> >> +
> >> +    /* TODO: This could be optimized by storing some "next available" hint and
> >> +     * only iterate if this one doesn't work. But this function should be
> >> +     * called rarely.
> >> +     */
> > 
> > Yes please. Even a trivial pointer to last would be far better than this.
> > It would be nice to run some numbers and prove that in realistic
> > scenarios finding an empty plpi doesn't take more than 5-10 ops, which
> > should be the case unless we have to loop over and the initial chucks
> > are still fully populated, causing Xen to scan for 512 units at a time.
> > We defenitely want to avoid that, if not in rare worse case scenarios.
> 
> I can try, though keep in mind that this code is really only called on
> allocating a host LPI, which would only happen when an LPI gets mapped.
> And this is done only upon a Dom0 driver initializing a device. Normally
> you wouldn't expect this during the actual guest runtime.

It would happen during actual guest runtime with device assignment,
wouldn't?

A simple pointer to last would be a good start, and in-code comment about
how many times we loop to find an empty plpi on a normal case (no device
assignment).

diff --git a/xen/arch/arm/gic-its.c b/xen/arch/arm/gic-its.c
index 88397bc..2140e4a 100644
--- a/xen/arch/arm/gic-its.c
+++ b/xen/arch/arm/gic-its.c
@@ -18,18 +18,31 @@ 
 
 #include <xen/config.h>
 #include <xen/lib.h>
+#include <xen/sched.h>
 #include <xen/err.h>
 #include <xen/device_tree.h>
 #include <xen/libfdt/libfdt.h>
 #include <asm/p2m.h>
+#include <asm/domain.h>
 #include <asm/io.h>
 #include <asm/gic.h>
 #include <asm/gic_v3_defs.h>
 #include <asm/gic-its.h>
 
+/* LPIs on the host always go to a guest, so no struct irq_desc for them. */
+union host_lpi {
+    uint64_t data;
+    struct {
+        uint64_t virt_lpi:32;
+        uint64_t dom_id:16;
+        uint64_t vcpu_id:16;
+    };
+};
+
 /* Global state */
 static struct {
     uint8_t *lpi_property;
+    union host_lpi **host_lpis;
     int host_lpi_bits;
 } lpi_data;
 
@@ -43,6 +56,26 @@  static DEFINE_PER_CPU(void *, pending_table);
 #define MAX_HOST_LPI_BITS                                                \
         min_t(unsigned int, lpi_data.host_lpi_bits, CONFIG_HOST_LPI_BITS)
 #define MAX_HOST_LPIS   (BIT(MAX_HOST_LPI_BITS) - 8192)
+#define HOST_LPIS_PER_PAGE      (PAGE_SIZE / sizeof(union host_lpi))
+
+static union host_lpi *gic_find_host_lpi(uint32_t lpi, struct domain *d)
+{
+    union host_lpi *hlpi;
+
+    if ( lpi < 8192 || lpi >= MAX_HOST_LPIS + 8192 )
+        return NULL;
+
+    lpi -= 8192;
+    if ( !lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE] )
+        return NULL;
+
+    hlpi = &lpi_data.host_lpis[lpi / HOST_LPIS_PER_PAGE][lpi % HOST_LPIS_PER_PAGE];
+
+    if ( d && hlpi->dom_id != d->domain_id )
+        return NULL;
+
+    return hlpi;
+}
 
 #define ITS_COMMAND_SIZE        32
 
@@ -96,6 +129,33 @@  static int its_send_cmd_sync(struct host_its *its, int cpu)
     return its_send_command(its, cmd);
 }
 
+static int its_send_cmd_discard(struct host_its *its,
+                                uint32_t deviceid, uint32_t eventid)
+{
+    uint64_t cmd[4];
+
+    cmd[0] = GITS_CMD_DISCARD | ((uint64_t)deviceid << 32);
+    cmd[1] = eventid;
+    cmd[2] = 0x00;
+    cmd[3] = 0x00;
+
+    return its_send_command(its, cmd);
+}
+
+static int its_send_cmd_mapti(struct host_its *its,
+                              uint32_t deviceid, uint32_t eventid,
+                              uint32_t pintid, uint16_t icid)
+{
+    uint64_t cmd[4];
+
+    cmd[0] = GITS_CMD_MAPTI | ((uint64_t)deviceid << 32);
+    cmd[1] = eventid | ((uint64_t)pintid << 32);
+    cmd[2] = icid;
+    cmd[3] = 0x00;
+
+    return its_send_command(its, cmd);
+}
+
 static int its_send_cmd_mapc(struct host_its *its, int collection_id, int cpu)
 {
     uint64_t cmd[4];
@@ -330,15 +390,109 @@  uint64_t gicv3_lpi_get_proptable()
     return reg;
 }
 
+/* Allocate the 2nd level array for host LPIs. This one holds pointers
+ * to the page with the actual "union host_lpi" entries. Our LPI limit
+ * avoids excessive memory usage.
+ */
 int gicv3_lpi_init_host_lpis(int lpi_bits)
 {
+    int nr_lpi_ptrs;
+
     lpi_data.host_lpi_bits = lpi_bits;
 
+    nr_lpi_ptrs = MAX_HOST_LPIS / (PAGE_SIZE / sizeof(union host_lpi));
+
+    lpi_data.host_lpis = xzalloc_array(union host_lpi *, nr_lpi_ptrs);
+    if ( !lpi_data.host_lpis )
+        return -ENOMEM;
+
     printk("GICv3: using at most %ld LPIs on the host.\n", MAX_HOST_LPIS);
 
     return 0;
 }
 
+/* Allocates a new host LPI to be injected as "virt_lpi" into the specified
+ * VCPU. Returns the host LPI ID or a negative error value.
+ */
+int gicv3_lpi_allocate_host_lpi(struct host_its *its,
+                                uint32_t devid, uint32_t eventid,
+                                struct vcpu *v, int virt_lpi)
+{
+    int chunk, i;
+    union host_lpi hlpi, *new_chunk;
+
+    /* TODO: handle some kind of preassigned LPI mapping for DomUs */
+    if ( !its )
+        return -EPERM;
+
+    /* TODO: This could be optimized by storing some "next available" hint and
+     * only iterate if this one doesn't work. But this function should be
+     * called rarely.
+     */
+    for (chunk = 0; chunk < MAX_HOST_LPIS / HOST_LPIS_PER_PAGE; chunk++)
+    {
+        /* If we hit an unallocated chunk, we initialize it and use entry 0. */
+        if ( !lpi_data.host_lpis[chunk] )
+        {
+            new_chunk = alloc_xenheap_pages(0, 0);
+            if ( !new_chunk )
+                return -ENOMEM;
+
+            memset(new_chunk, 0, PAGE_SIZE);
+            lpi_data.host_lpis[chunk] = new_chunk;
+            i = 0;
+        }
+        else
+        {
+            /* Find an unallocted entry in this chunk. */
+            for (i = 0; i < HOST_LPIS_PER_PAGE; i++)
+                if ( !lpi_data.host_lpis[chunk][i].virt_lpi )
+                    break;
+
+            /* If this chunk is fully allocted, advance to the next one. */
+            if ( i == HOST_LPIS_PER_PAGE)
+                continue;
+        }
+
+        hlpi.virt_lpi = virt_lpi;
+        hlpi.dom_id = v->domain->domain_id;
+        hlpi.vcpu_id = v->vcpu_id;
+        lpi_data.host_lpis[chunk][i].data = hlpi.data;
+
+        if (its)
+        {
+            its_send_cmd_mapti(its, devid, eventid,
+                               chunk * HOST_LPIS_PER_PAGE + i + 8192, 0);
+            its_send_cmd_sync(its, 0);
+        }
+
+        return chunk * HOST_LPIS_PER_PAGE + i + 8192;
+    }
+
+    return -ENOSPC;
+}
+
+/* Drops the connection of the given host LPI to a virtual LPI.
+ */
+int gicv3_lpi_drop_host_lpi(struct host_its *its,
+                            uint32_t devid, uint32_t eventid, uint32_t host_lpi)
+{
+    union host_lpi *hlpip;
+
+    if ( !its )
+        return -EPERM;
+
+    hlpip = gic_find_host_lpi(host_lpi, NULL);
+    if ( !hlpip )
+        return -1;
+
+    hlpip->data = 0;
+
+    its_send_cmd_discard(its, devid, eventid);
+
+    return 0;
+}
+
 void gicv3_its_dt_init(const struct dt_device_node *node)
 {
     const struct dt_device_node *its = NULL;
diff --git a/xen/include/asm-arm/gic-its.h b/xen/include/asm-arm/gic-its.h
index b49d274..512a388 100644
--- a/xen/include/asm-arm/gic-its.h
+++ b/xen/include/asm-arm/gic-its.h
@@ -114,6 +114,12 @@  void gicv3_set_redist_addr(paddr_t address, int redist_id);
 /* Map a collection for this host CPU to each host ITS. */
 void gicv3_its_setup_collection(int cpu);
 
+int gicv3_lpi_allocate_host_lpi(struct host_its *its,
+                                uint32_t devid, uint32_t eventid,
+                                struct vcpu *v, int virt_lpi);
+int gicv3_lpi_drop_host_lpi(struct host_its *its,
+                            uint32_t devid, uint32_t eventid,
+                            uint32_t host_lpi);
 #else
 
 static inline void gicv3_its_dt_init(const struct dt_device_node *node)
@@ -141,6 +147,18 @@  static inline void gicv3_set_redist_addr(paddr_t address, int redist_id)
 static inline void gicv3_its_setup_collection(int cpu)
 {
 }
+static inline int gicv3_lpi_allocate_host_lpi(struct host_its *its,
+                                              uint32_t devid, uint32_t eventid,
+                                              struct vcpu *v, int virt_lpi)
+{
+    return 0;
+}
+static inline int gicv3_lpi_drop_host_lpi(struct host_its *its,
+                                          uint32_t devid, uint32_t eventid,
+                                          uint32_t host_lpi)
+{
+    return 0;
+}
 
 #endif /* CONFIG_HAS_ITS */

[RFC,06/24] ARM: GICv3 ITS: introduce host LPI array

Commit Message

Comments

Patch