Message ID | 1651947548-4055-3-git-send-email-olekstysh@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | virtio: Solution to restrict memory access under Xen using xen-grant DMA-mapping layer | expand |
On 07.05.22 21:19, Oleksandr Tyshchenko wrote: Hello Boris, Stefano > From: Juergen Gross <jgross@suse.com> > > For support of virtio via grant mappings in rare cases larger mappings > using consecutive grants are needed. Support those by adding a bitmap > of free grants. > > As consecutive grants will be needed only in very rare cases (e.g. when > configuring a virtio device with a multi-page ring), optimize for the > normal case of non-consecutive allocations. > > Signed-off-by: Juergen Gross <jgross@suse.com> > --- > Changes RFC -> V1: > - no changes > > Changes V1 -> V2: > - no changes May I please ask for the review here? This patch has been tested in various scenarios: 1. Guest with Xen PV drivers only (gnttab_alloc(free)_grant_reference() usage only) 2. Guest with Virtio drivers only (gnttab_alloc(free)_grant_reference_seq() usage only) 3. Guest with Virtio and Xen PV drivers (combined gnttab_alloc(free)_grant_reference() and gnttab_alloc(free)_grant_reference_seq() usage) > --- > drivers/xen/grant-table.c | 238 +++++++++++++++++++++++++++++++++++++++------- > include/xen/grant_table.h | 4 + > 2 files changed, 210 insertions(+), 32 deletions(-) > > diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c > index 8ccccac..1b458c0 100644 > --- a/drivers/xen/grant-table.c > +++ b/drivers/xen/grant-table.c > @@ -33,6 +33,7 @@ > > #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt > > +#include <linux/bitmap.h> > #include <linux/memblock.h> > #include <linux/sched.h> > #include <linux/mm.h> > @@ -72,9 +73,32 @@ > > static grant_ref_t **gnttab_list; > static unsigned int nr_grant_frames; > + > +/* > + * Handling of free grants: > + * > + * Free grants are in a simple list anchored in gnttab_free_head. They are > + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number > + * of free entries is stored in gnttab_free_count. > + * Additionally there is a bitmap of free entries anchored in > + * gnttab_free_bitmap. This is being used for simplifying allocation of > + * multiple consecutive grants, which is needed e.g. for support of virtio. > + * gnttab_last_free is used to add free entries of new frames at the end of > + * the free list. > + * gnttab_free_tail_ptr specifies the variable which references the start > + * of consecutive free grants ending with gnttab_last_free. This pointer is > + * updated in a rather defensive way, in order to avoid performance hits in > + * hot paths. > + * All those variables are protected by gnttab_list_lock. > + */ > static int gnttab_free_count; > -static grant_ref_t gnttab_free_head; > +static unsigned int gnttab_size; > +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; > +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; > +static grant_ref_t *gnttab_free_tail_ptr; > +static unsigned long *gnttab_free_bitmap; > static DEFINE_SPINLOCK(gnttab_list_lock); > + > struct grant_frames xen_auto_xlat_grant_frames; > static unsigned int xen_gnttab_version; > module_param_named(version, xen_gnttab_version, uint, 0); > @@ -170,16 +194,111 @@ static int get_free_entries(unsigned count) > > ref = head = gnttab_free_head; > gnttab_free_count -= count; > - while (count-- > 1) > - head = gnttab_entry(head); > + while (count--) { > + bitmap_clear(gnttab_free_bitmap, head, 1); > + if (gnttab_free_tail_ptr == __gnttab_entry(head)) > + gnttab_free_tail_ptr = &gnttab_free_head; > + if (count) > + head = gnttab_entry(head); > + } > gnttab_free_head = gnttab_entry(head); > gnttab_entry(head) = GNTTAB_LIST_END; > > + if (!gnttab_free_count) { > + gnttab_last_free = GNTTAB_LIST_END; > + gnttab_free_tail_ptr = NULL; > + } > + > spin_unlock_irqrestore(&gnttab_list_lock, flags); > > return ref; > } > > +static int get_seq_entry_count(void) > +{ > + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || > + *gnttab_free_tail_ptr == GNTTAB_LIST_END) > + return 0; > + > + return gnttab_last_free - *gnttab_free_tail_ptr + 1; > +} > + > +/* Rebuilds the free grant list and tries to find count consecutive entries. */ > +static int get_free_seq(unsigned int count) > +{ > + int ret = -ENOSPC; > + unsigned int from, to; > + grant_ref_t *last; > + > + gnttab_free_tail_ptr = &gnttab_free_head; > + last = &gnttab_free_head; > + > + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); > + from < gnttab_size; > + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { > + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, > + from + 1); > + if (ret < 0 && to - from >= count) { > + ret = from; > + bitmap_clear(gnttab_free_bitmap, ret, count); > + from += count; > + gnttab_free_count -= count; > + if (from == to) > + continue; > + } > + > + while (from < to) { > + *last = from; > + last = __gnttab_entry(from); > + gnttab_last_free = from; > + from++; > + } > + if (to < gnttab_size) > + gnttab_free_tail_ptr = __gnttab_entry(to - 1); > + } > + > + *last = GNTTAB_LIST_END; > + if (gnttab_last_free != gnttab_size - 1) > + gnttab_free_tail_ptr = NULL; > + > + return ret; > +} > + > +static int get_free_entries_seq(unsigned int count) > +{ > + unsigned long flags; > + int ret = 0; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + > + if (gnttab_free_count < count) { > + ret = gnttab_expand(count - gnttab_free_count); > + if (ret < 0) > + goto out; > + } > + > + if (get_seq_entry_count() < count) { > + ret = get_free_seq(count); > + if (ret >= 0) > + goto out; > + ret = gnttab_expand(count - get_seq_entry_count()); > + if (ret < 0) > + goto out; > + } > + > + ret = *gnttab_free_tail_ptr; > + *gnttab_free_tail_ptr = gnttab_entry(ret + count - 1); > + gnttab_free_count -= count; > + if (!gnttab_free_count) > + gnttab_free_tail_ptr = NULL; > + bitmap_clear(gnttab_free_bitmap, ret, count); > + > + out: > + spin_unlock_irqrestore(&gnttab_list_lock, flags); > + > + return ret; > +} > + > static void do_free_callbacks(void) > { > struct gnttab_free_callback *callback, *next; > @@ -206,17 +325,48 @@ static inline void check_free_callbacks(void) > do_free_callbacks(); > } > > -static void put_free_entry(grant_ref_t ref) > +static void put_free_entry_locked(grant_ref_t ref) > { > - unsigned long flags; > - spin_lock_irqsave(&gnttab_list_lock, flags); > gnttab_entry(ref) = gnttab_free_head; > gnttab_free_head = ref; > + if (!gnttab_free_count) > + gnttab_last_free = ref; > + if (gnttab_free_tail_ptr == &gnttab_free_head) > + gnttab_free_tail_ptr = __gnttab_entry(ref); > gnttab_free_count++; > + bitmap_set(gnttab_free_bitmap, ref, 1); > +} > + > +static void put_free_entry(grant_ref_t ref) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + put_free_entry_locked(ref); > check_free_callbacks(); > spin_unlock_irqrestore(&gnttab_list_lock, flags); > } > > +static void gnttab_set_free(unsigned int start, unsigned int n) > +{ > + unsigned int i; > + > + for (i = start; i < start + n - 1; i++) > + gnttab_entry(i) = i + 1; > + > + gnttab_entry(i) = GNTTAB_LIST_END; > + if (!gnttab_free_count) { > + gnttab_free_head = start; > + gnttab_free_tail_ptr = &gnttab_free_head; > + } else { > + gnttab_entry(gnttab_last_free) = start; > + } > + gnttab_free_count += n; > + gnttab_last_free = i; > + > + bitmap_set(gnttab_free_bitmap, start, n); > +} > + > /* > * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. > * Introducing a valid entry into the grant table: > @@ -448,23 +598,31 @@ void gnttab_free_grant_references(grant_ref_t head) > { > grant_ref_t ref; > unsigned long flags; > - int count = 1; > - if (head == GNTTAB_LIST_END) > - return; > + > spin_lock_irqsave(&gnttab_list_lock, flags); > - ref = head; > - while (gnttab_entry(ref) != GNTTAB_LIST_END) { > - ref = gnttab_entry(ref); > - count++; > + while (head != GNTTAB_LIST_END) { > + ref = gnttab_entry(head); > + put_free_entry_locked(head); > + head = ref; > } > - gnttab_entry(ref) = gnttab_free_head; > - gnttab_free_head = head; > - gnttab_free_count += count; > check_free_callbacks(); > spin_unlock_irqrestore(&gnttab_list_lock, flags); > } > EXPORT_SYMBOL_GPL(gnttab_free_grant_references); > > +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count) > +{ > + unsigned long flags; > + unsigned int i; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + for (i = count; i > 0; i--) > + put_free_entry_locked(head + i - 1); > + check_free_callbacks(); > + spin_unlock_irqrestore(&gnttab_list_lock, flags); > +} > +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference_seq); > + > int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) > { > int h = get_free_entries(count); > @@ -478,6 +636,24 @@ int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) > } > EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); > > +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first) > +{ > + int h; > + > + if (count == 1) > + h = get_free_entries(1); > + else > + h = get_free_entries_seq(count); > + > + if (h < 0) > + return -ENOSPC; > + > + *first = h; > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_reference_seq); > + > int gnttab_empty_grant_references(const grant_ref_t *private_head) > { > return (*private_head == GNTTAB_LIST_END); > @@ -570,16 +746,13 @@ static int grow_gnttab_list(unsigned int more_frames) > goto grow_nomem; > } > > + gnttab_set_free(gnttab_size, extra_entries); > > - for (i = grefs_per_frame * nr_grant_frames; > - i < grefs_per_frame * new_nr_grant_frames - 1; i++) > - gnttab_entry(i) = i + 1; > - > - gnttab_entry(i) = gnttab_free_head; > - gnttab_free_head = grefs_per_frame * nr_grant_frames; > - gnttab_free_count += extra_entries; > + if (!gnttab_free_tail_ptr) > + gnttab_free_tail_ptr = __gnttab_entry(gnttab_size); > > nr_grant_frames = new_nr_grant_frames; > + gnttab_size += extra_entries; > > check_free_callbacks(); > > @@ -1424,7 +1597,6 @@ int gnttab_init(void) > int i; > unsigned long max_nr_grant_frames; > unsigned int max_nr_glist_frames, nr_glist_frames; > - unsigned int nr_init_grefs; > int ret; > > gnttab_request_version(); > @@ -1452,6 +1624,13 @@ int gnttab_init(void) > } > } > > + i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; > + gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); > + if (!gnttab_free_bitmap) { > + ret = -ENOMEM; > + goto ini_nomem; > + } > + > ret = arch_gnttab_init(max_nr_grant_frames, > nr_status_frames(max_nr_grant_frames)); > if (ret < 0) > @@ -1462,15 +1641,9 @@ int gnttab_init(void) > goto ini_nomem; > } > > - nr_init_grefs = nr_grant_frames * > - gnttab_interface->grefs_per_grant_frame; > - > - for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) > - gnttab_entry(i) = i + 1; > + gnttab_size = nr_grant_frames * gnttab_interface->grefs_per_grant_frame; > > - gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; > - gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; > - gnttab_free_head = NR_RESERVED_ENTRIES; > + gnttab_set_free(NR_RESERVED_ENTRIES, gnttab_size - NR_RESERVED_ENTRIES); > > printk("Grant table initialized\n"); > return 0; > @@ -1479,6 +1652,7 @@ int gnttab_init(void) > for (i--; i >= 0; i--) > free_page((unsigned long)gnttab_list[i]); > kfree(gnttab_list); > + bitmap_free(gnttab_free_bitmap); > return ret; > } > EXPORT_SYMBOL_GPL(gnttab_init); > diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h > index dfd5bf3..d815e1d 100644 > --- a/include/xen/grant_table.h > +++ b/include/xen/grant_table.h > @@ -129,10 +129,14 @@ int gnttab_try_end_foreign_access(grant_ref_t ref); > */ > int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); > > +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first); > + > void gnttab_free_grant_reference(grant_ref_t ref); > > void gnttab_free_grant_references(grant_ref_t head); > > +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count); > + > int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); > > int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
On 5/11/22 2:00 PM, Oleksandr wrote: > > On 07.05.22 21:19, Oleksandr Tyshchenko wrote: > > Hello Boris, Stefano > > >> From: Juergen Gross <jgross@suse.com> >> >> For support of virtio via grant mappings in rare cases larger mappings >> using consecutive grants are needed. Support those by adding a bitmap >> of free grants. >> >> As consecutive grants will be needed only in very rare cases (e.g. when >> configuring a virtio device with a multi-page ring), optimize for the >> normal case of non-consecutive allocations. >> >> Signed-off-by: Juergen Gross <jgross@suse.com> >> --- >> Changes RFC -> V1: >> - no changes >> Changes V1 -> V2: >> - no changes > > > May I please ask for the review here? I had a quick look but I am stuck on get_free_seq(), I need to stare at it some more. Unless someone else reviews this, I will try to get to this in the next couple of days. One thing I did notice is > >> @@ -1452,6 +1624,13 @@ int gnttab_init(void) >> } >> } >> + i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; >> + gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); >> + if (!gnttab_free_bitmap) { >> + ret = -ENOMEM; >> + goto ini_nomem; >> + } This overwrites 'i' and will break error handling at ini_nomem. -boris
On 12.05.22 00:09, Boris Ostrovsky wrote: Hello Boris > > On 5/11/22 2:00 PM, Oleksandr wrote: >> >> On 07.05.22 21:19, Oleksandr Tyshchenko wrote: >> >> Hello Boris, Stefano >> >> >>> From: Juergen Gross <jgross@suse.com> >>> >>> For support of virtio via grant mappings in rare cases larger mappings >>> using consecutive grants are needed. Support those by adding a bitmap >>> of free grants. >>> >>> As consecutive grants will be needed only in very rare cases (e.g. when >>> configuring a virtio device with a multi-page ring), optimize for the >>> normal case of non-consecutive allocations. >>> >>> Signed-off-by: Juergen Gross <jgross@suse.com> >>> --- >>> Changes RFC -> V1: >>> - no changes >>> Changes V1 -> V2: >>> - no changes >> >> >> May I please ask for the review here? > > > > I had a quick look but I am stuck on get_free_seq(), I need to stare > at it some more. Unless someone else reviews this, I will try to get > to this in the next couple of days. Thank you! > > > > One thing I did notice is > > >> >>> @@ -1452,6 +1624,13 @@ int gnttab_init(void) >>> } >>> } >>> + i = gnttab_interface->grefs_per_grant_frame * >>> max_nr_grant_frames; >>> + gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); >>> + if (!gnttab_free_bitmap) { >>> + ret = -ENOMEM; >>> + goto ini_nomem; >>> + } > > > This overwrites 'i' and will break error handling at ini_nomem. Indeed, will fix. Thank you for pointing this out. > > > -boris > >
On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: > + > +/* > + * Handling of free grants: > + * > + * Free grants are in a simple list anchored in gnttab_free_head. They are > + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number > + * of free entries is stored in gnttab_free_count. > + * Additionally there is a bitmap of free entries anchored in > + * gnttab_free_bitmap. This is being used for simplifying allocation of > + * multiple consecutive grants, which is needed e.g. for support of virtio. > + * gnttab_last_free is used to add free entries of new frames at the end of > + * the free list. > + * gnttab_free_tail_ptr specifies the variable which references the start If this references the start of the free interval, why is it called gnttab_free_*tail*_ptr? > + * of consecutive free grants ending with gnttab_last_free. This pointer is > + * updated in a rather defensive way, in order to avoid performance hits in > + * hot paths. > + * All those variables are protected by gnttab_list_lock. > + */ > static int gnttab_free_count; > -static grant_ref_t gnttab_free_head; > +static unsigned int gnttab_size; > +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; > +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; > +static grant_ref_t *gnttab_free_tail_ptr; > +static unsigned long *gnttab_free_bitmap; > static DEFINE_SPINLOCK(gnttab_list_lock); > + > struct grant_frames xen_auto_xlat_grant_frames; > static unsigned int xen_gnttab_version; > module_param_named(version, xen_gnttab_version, uint, 0); > @@ -170,16 +194,111 @@ static int get_free_entries(unsigned count) > > ref = head = gnttab_free_head; > gnttab_free_count -= count; > - while (count-- > 1) > - head = gnttab_entry(head); > + while (count--) { > + bitmap_clear(gnttab_free_bitmap, head, 1); > + if (gnttab_free_tail_ptr == __gnttab_entry(head)) > + gnttab_free_tail_ptr = &gnttab_free_head; > + if (count) > + head = gnttab_entry(head); > + } > gnttab_free_head = gnttab_entry(head); > gnttab_entry(head) = GNTTAB_LIST_END; > > + if (!gnttab_free_count) { > + gnttab_last_free = GNTTAB_LIST_END; > + gnttab_free_tail_ptr = NULL; > + } > + > spin_unlock_irqrestore(&gnttab_list_lock, flags); > > return ref; > } > > +static int get_seq_entry_count(void) > +{ > + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || > + *gnttab_free_tail_ptr == GNTTAB_LIST_END) > + return 0; > + > + return gnttab_last_free - *gnttab_free_tail_ptr + 1; > +} > + > +/* Rebuilds the free grant list and tries to find count consecutive entries. */ > +static int get_free_seq(unsigned int count) > +{ > + int ret = -ENOSPC; > + unsigned int from, to; > + grant_ref_t *last; > + > + gnttab_free_tail_ptr = &gnttab_free_head; > + last = &gnttab_free_head; > + > + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); > + from < gnttab_size; > + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { > + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, > + from + 1); > + if (ret < 0 && to - from >= count) { > + ret = from; > + bitmap_clear(gnttab_free_bitmap, ret, count); > + from += count; > + gnttab_free_count -= count; IIUIC we can have multiple passes over this, meaning that the gnttab_free_count may be decremented more than once. Is that intentional? > + if (from == to) > + continue; > + } > + > + while (from < to) { > + *last = from; > + last = __gnttab_entry(from); > + gnttab_last_free = from; > + from++; > + } I have been looking at this loop and I can't understand what it is doing ;-( Can you enlighten me? -boris > + if (to < gnttab_size) > + gnttab_free_tail_ptr = __gnttab_entry(to - 1); > + } > + > + *last = GNTTAB_LIST_END; > + if (gnttab_last_free != gnttab_size - 1) > + gnttab_free_tail_ptr = NULL; > + > + return ret; > +} > + > +static int get_free_entries_seq(unsigned int count) > +{ > + unsigned long flags; > + int ret = 0; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + > + if (gnttab_free_count < count) { > + ret = gnttab_expand(count - gnttab_free_count); > + if (ret < 0) > + goto out; > + } > + > + if (get_seq_entry_count() < count) { > + ret = get_free_seq(count); > + if (ret >= 0) > + goto out; > + ret = gnttab_expand(count - get_seq_entry_count()); > + if (ret < 0) > + goto out; > + } > + > + ret = *gnttab_free_tail_ptr; > + *gnttab_free_tail_ptr = gnttab_entry(ret + count - 1); > + gnttab_free_count -= count; > + if (!gnttab_free_count) > + gnttab_free_tail_ptr = NULL; > + bitmap_clear(gnttab_free_bitmap, ret, count); > + > + out: > + spin_unlock_irqrestore(&gnttab_list_lock, flags); > + > + return ret; > +} > + > static void do_free_callbacks(void) > { > struct gnttab_free_callback *callback, *next; > @@ -206,17 +325,48 @@ static inline void check_free_callbacks(void) > do_free_callbacks(); > } > > -static void put_free_entry(grant_ref_t ref) > +static void put_free_entry_locked(grant_ref_t ref) > { > - unsigned long flags; > - spin_lock_irqsave(&gnttab_list_lock, flags); > gnttab_entry(ref) = gnttab_free_head; > gnttab_free_head = ref; > + if (!gnttab_free_count) > + gnttab_last_free = ref; > + if (gnttab_free_tail_ptr == &gnttab_free_head) > + gnttab_free_tail_ptr = __gnttab_entry(ref); > gnttab_free_count++; > + bitmap_set(gnttab_free_bitmap, ref, 1); > +} > + > +static void put_free_entry(grant_ref_t ref) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + put_free_entry_locked(ref); > check_free_callbacks(); > spin_unlock_irqrestore(&gnttab_list_lock, flags); > } > > +static void gnttab_set_free(unsigned int start, unsigned int n) > +{ > + unsigned int i; > + > + for (i = start; i < start + n - 1; i++) > + gnttab_entry(i) = i + 1; > + > + gnttab_entry(i) = GNTTAB_LIST_END; > + if (!gnttab_free_count) { > + gnttab_free_head = start; > + gnttab_free_tail_ptr = &gnttab_free_head; > + } else { > + gnttab_entry(gnttab_last_free) = start; > + } > + gnttab_free_count += n; > + gnttab_last_free = i; > + > + bitmap_set(gnttab_free_bitmap, start, n); > +} > + > /* > * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. > * Introducing a valid entry into the grant table: > @@ -448,23 +598,31 @@ void gnttab_free_grant_references(grant_ref_t head) > { > grant_ref_t ref; > unsigned long flags; > - int count = 1; > - if (head == GNTTAB_LIST_END) > - return; > + > spin_lock_irqsave(&gnttab_list_lock, flags); > - ref = head; > - while (gnttab_entry(ref) != GNTTAB_LIST_END) { > - ref = gnttab_entry(ref); > - count++; > + while (head != GNTTAB_LIST_END) { > + ref = gnttab_entry(head); > + put_free_entry_locked(head); > + head = ref; > } > - gnttab_entry(ref) = gnttab_free_head; > - gnttab_free_head = head; > - gnttab_free_count += count; > check_free_callbacks(); > spin_unlock_irqrestore(&gnttab_list_lock, flags); > } > EXPORT_SYMBOL_GPL(gnttab_free_grant_references); > > +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count) > +{ > + unsigned long flags; > + unsigned int i; > + > + spin_lock_irqsave(&gnttab_list_lock, flags); > + for (i = count; i > 0; i--) > + put_free_entry_locked(head + i - 1); > + check_free_callbacks(); > + spin_unlock_irqrestore(&gnttab_list_lock, flags); > +} > +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference_seq); > + > int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) > { > int h = get_free_entries(count); > @@ -478,6 +636,24 @@ int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) > } > EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); > > +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first) > +{ > + int h; > + > + if (count == 1) > + h = get_free_entries(1); > + else > + h = get_free_entries_seq(count); > + > + if (h < 0) > + return -ENOSPC; > + > + *first = h; > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_reference_seq); > + > int gnttab_empty_grant_references(const grant_ref_t *private_head) > { > return (*private_head == GNTTAB_LIST_END); > @@ -570,16 +746,13 @@ static int grow_gnttab_list(unsigned int more_frames) > goto grow_nomem; > } > > + gnttab_set_free(gnttab_size, extra_entries); > > - for (i = grefs_per_frame * nr_grant_frames; > - i < grefs_per_frame * new_nr_grant_frames - 1; i++) > - gnttab_entry(i) = i + 1; > - > - gnttab_entry(i) = gnttab_free_head; > - gnttab_free_head = grefs_per_frame * nr_grant_frames; > - gnttab_free_count += extra_entries; > + if (!gnttab_free_tail_ptr) > + gnttab_free_tail_ptr = __gnttab_entry(gnttab_size); > > nr_grant_frames = new_nr_grant_frames; > + gnttab_size += extra_entries; > > check_free_callbacks(); > > @@ -1424,7 +1597,6 @@ int gnttab_init(void) > int i; > unsigned long max_nr_grant_frames; > unsigned int max_nr_glist_frames, nr_glist_frames; > - unsigned int nr_init_grefs; > int ret; > > gnttab_request_version(); > @@ -1452,6 +1624,13 @@ int gnttab_init(void) > } > } > > + i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; > + gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); > + if (!gnttab_free_bitmap) { > + ret = -ENOMEM; > + goto ini_nomem; > + } > + > ret = arch_gnttab_init(max_nr_grant_frames, > nr_status_frames(max_nr_grant_frames)); > if (ret < 0) > @@ -1462,15 +1641,9 @@ int gnttab_init(void) > goto ini_nomem; > } > > - nr_init_grefs = nr_grant_frames * > - gnttab_interface->grefs_per_grant_frame; > - > - for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) > - gnttab_entry(i) = i + 1; > + gnttab_size = nr_grant_frames * gnttab_interface->grefs_per_grant_frame; > > - gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; > - gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; > - gnttab_free_head = NR_RESERVED_ENTRIES; > + gnttab_set_free(NR_RESERVED_ENTRIES, gnttab_size - NR_RESERVED_ENTRIES); > > printk("Grant table initialized\n"); > return 0; > @@ -1479,6 +1652,7 @@ int gnttab_init(void) > for (i--; i >= 0; i--) > free_page((unsigned long)gnttab_list[i]); > kfree(gnttab_list); > + bitmap_free(gnttab_free_bitmap); > return ret; > } > EXPORT_SYMBOL_GPL(gnttab_init); > diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h > index dfd5bf3..d815e1d 100644 > --- a/include/xen/grant_table.h > +++ b/include/xen/grant_table.h > @@ -129,10 +129,14 @@ int gnttab_try_end_foreign_access(grant_ref_t ref); > */ > int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); > > +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first); > + > void gnttab_free_grant_reference(grant_ref_t ref); > > void gnttab_free_grant_references(grant_ref_t head); > > +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count); > + > int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); > > int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
On 12.05.22 22:01, Boris Ostrovsky wrote: > > On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: >> + >> +/* >> + * Handling of free grants: >> + * >> + * Free grants are in a simple list anchored in gnttab_free_head. They are >> + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number >> + * of free entries is stored in gnttab_free_count. >> + * Additionally there is a bitmap of free entries anchored in >> + * gnttab_free_bitmap. This is being used for simplifying allocation of >> + * multiple consecutive grants, which is needed e.g. for support of virtio. >> + * gnttab_last_free is used to add free entries of new frames at the end of >> + * the free list. >> + * gnttab_free_tail_ptr specifies the variable which references the start > > > If this references the start of the free interval, why is it called > gnttab_free_*tail*_ptr? Because this is the tail of the whole area which is free. It could be renamed to gnttab_free_tail_start_ptr. :-) > > >> + * of consecutive free grants ending with gnttab_last_free. This pointer is >> + * updated in a rather defensive way, in order to avoid performance hits in >> + * hot paths. >> + * All those variables are protected by gnttab_list_lock. >> + */ >> static int gnttab_free_count; >> -static grant_ref_t gnttab_free_head; >> +static unsigned int gnttab_size; >> +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; >> +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; >> +static grant_ref_t *gnttab_free_tail_ptr; >> +static unsigned long *gnttab_free_bitmap; >> static DEFINE_SPINLOCK(gnttab_list_lock); >> + >> struct grant_frames xen_auto_xlat_grant_frames; >> static unsigned int xen_gnttab_version; >> module_param_named(version, xen_gnttab_version, uint, 0); >> @@ -170,16 +194,111 @@ static int get_free_entries(unsigned count) >> ref = head = gnttab_free_head; >> gnttab_free_count -= count; >> - while (count-- > 1) >> - head = gnttab_entry(head); >> + while (count--) { >> + bitmap_clear(gnttab_free_bitmap, head, 1); >> + if (gnttab_free_tail_ptr == __gnttab_entry(head)) >> + gnttab_free_tail_ptr = &gnttab_free_head; >> + if (count) >> + head = gnttab_entry(head); >> + } >> gnttab_free_head = gnttab_entry(head); >> gnttab_entry(head) = GNTTAB_LIST_END; >> + if (!gnttab_free_count) { >> + gnttab_last_free = GNTTAB_LIST_END; >> + gnttab_free_tail_ptr = NULL; >> + } >> + >> spin_unlock_irqrestore(&gnttab_list_lock, flags); >> return ref; >> } >> +static int get_seq_entry_count(void) >> +{ >> + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || >> + *gnttab_free_tail_ptr == GNTTAB_LIST_END) >> + return 0; >> + >> + return gnttab_last_free - *gnttab_free_tail_ptr + 1; >> +} >> + >> +/* Rebuilds the free grant list and tries to find count consecutive entries. */ >> +static int get_free_seq(unsigned int count) >> +{ >> + int ret = -ENOSPC; >> + unsigned int from, to; >> + grant_ref_t *last; >> + >> + gnttab_free_tail_ptr = &gnttab_free_head; >> + last = &gnttab_free_head; >> + >> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >> + from < gnttab_size; >> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { >> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >> + from + 1); >> + if (ret < 0 && to - from >= count) { >> + ret = from; >> + bitmap_clear(gnttab_free_bitmap, ret, count); >> + from += count; >> + gnttab_free_count -= count; > > > IIUIC we can have multiple passes over this, meaning that the gnttab_free_count > may be decremented more than once. Is that intentional? After the first pass decrementing gnttab_free_cnt, ret will no longer be less than zero, so this can be hit only once. > > >> + if (from == to) >> + continue; >> + } >> + >> + while (from < to) { >> + *last = from; >> + last = __gnttab_entry(from); >> + gnttab_last_free = from; >> + from++; >> + } > > > I have been looking at this loop and I can't understand what it is doing ;-( Can > you enlighten me? It is recreating the free list in order to have it properly sorted. This is needed to make sure that the free tail has the maximum possible size (you can take the tail off the list without having to worry about breaking the linked list because of references into the tail). Juergen
On 13.05.22 08:33, Juergen Gross wrote: > On 12.05.22 22:01, Boris Ostrovsky wrote: Hello Juergen, Boris [Juergen, thank you for your explanation] > >> >> On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: >>> + >>> +/* >>> + * Handling of free grants: >>> + * >>> + * Free grants are in a simple list anchored in gnttab_free_head. >>> They are >>> + * linked by grant ref, the last element contains GNTTAB_LIST_END. >>> The number >>> + * of free entries is stored in gnttab_free_count. >>> + * Additionally there is a bitmap of free entries anchored in >>> + * gnttab_free_bitmap. This is being used for simplifying >>> allocation of >>> + * multiple consecutive grants, which is needed e.g. for support of >>> virtio. >>> + * gnttab_last_free is used to add free entries of new frames at >>> the end of >>> + * the free list. >>> + * gnttab_free_tail_ptr specifies the variable which references the >>> start >> >> >> If this references the start of the free interval, why is it called >> gnttab_free_*tail*_ptr? > > Because this is the tail of the whole area which is free. > > It could be renamed to gnttab_free_tail_start_ptr. :-) > >> >> >>> + * of consecutive free grants ending with gnttab_last_free. This >>> pointer is >>> + * updated in a rather defensive way, in order to avoid performance >>> hits in >>> + * hot paths. >>> + * All those variables are protected by gnttab_list_lock. >>> + */ >>> static int gnttab_free_count; >>> -static grant_ref_t gnttab_free_head; >>> +static unsigned int gnttab_size; >>> +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; >>> +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; >>> +static grant_ref_t *gnttab_free_tail_ptr; >>> +static unsigned long *gnttab_free_bitmap; >>> static DEFINE_SPINLOCK(gnttab_list_lock); >>> + >>> struct grant_frames xen_auto_xlat_grant_frames; >>> static unsigned int xen_gnttab_version; >>> module_param_named(version, xen_gnttab_version, uint, 0); >>> @@ -170,16 +194,111 @@ static int get_free_entries(unsigned count) >>> ref = head = gnttab_free_head; >>> gnttab_free_count -= count; >>> - while (count-- > 1) >>> - head = gnttab_entry(head); >>> + while (count--) { >>> + bitmap_clear(gnttab_free_bitmap, head, 1); >>> + if (gnttab_free_tail_ptr == __gnttab_entry(head)) >>> + gnttab_free_tail_ptr = &gnttab_free_head; >>> + if (count) >>> + head = gnttab_entry(head); >>> + } >>> gnttab_free_head = gnttab_entry(head); >>> gnttab_entry(head) = GNTTAB_LIST_END; >>> + if (!gnttab_free_count) { >>> + gnttab_last_free = GNTTAB_LIST_END; >>> + gnttab_free_tail_ptr = NULL; >>> + } >>> + >>> spin_unlock_irqrestore(&gnttab_list_lock, flags); >>> return ref; >>> } >>> +static int get_seq_entry_count(void) >>> +{ >>> + if (gnttab_last_free == GNTTAB_LIST_END || >>> !gnttab_free_tail_ptr || >>> + *gnttab_free_tail_ptr == GNTTAB_LIST_END) >>> + return 0; >>> + >>> + return gnttab_last_free - *gnttab_free_tail_ptr + 1; >>> +} >>> + >>> +/* Rebuilds the free grant list and tries to find count consecutive >>> entries. */ >>> +static int get_free_seq(unsigned int count) >>> +{ >>> + int ret = -ENOSPC; >>> + unsigned int from, to; >>> + grant_ref_t *last; >>> + >>> + gnttab_free_tail_ptr = &gnttab_free_head; >>> + last = &gnttab_free_head; >>> + >>> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >>> + from < gnttab_size; >>> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + >>> 1)) { >>> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >>> + from + 1); >>> + if (ret < 0 && to - from >= count) { >>> + ret = from; >>> + bitmap_clear(gnttab_free_bitmap, ret, count); >>> + from += count; >>> + gnttab_free_count -= count; >> >> >> IIUIC we can have multiple passes over this, meaning that the >> gnttab_free_count may be decremented more than once. Is that >> intentional? > > After the first pass decrementing gnttab_free_cnt, ret will no > longer be less than zero, so this can be hit only once. > >> >> >>> + if (from == to) >>> + continue; >>> + } >>> + >>> + while (from < to) { >>> + *last = from; >>> + last = __gnttab_entry(from); >>> + gnttab_last_free = from; >>> + from++; >>> + } >> >> >> I have been looking at this loop and I can't understand what it is >> doing ;-( Can you enlighten me? > > It is recreating the free list in order to have it properly sorted. > This is needed to make sure that the free tail has the maximum > possible size (you can take the tail off the list without having > to worry about breaking the linked list because of references into > the tail). I think the loop deserves a comment on top, I will add it. > > > > Juergen
On 5/13/22 1:33 AM, Juergen Gross wrote: > On 12.05.22 22:01, Boris Ostrovsky wrote: >> >> On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: >>> +/* Rebuilds the free grant list and tries to find count consecutive entries. */ >>> +static int get_free_seq(unsigned int count) >>> +{ >>> + int ret = -ENOSPC; >>> + unsigned int from, to; >>> + grant_ref_t *last; >>> + >>> + gnttab_free_tail_ptr = &gnttab_free_head; >>> + last = &gnttab_free_head; >>> + >>> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >>> + from < gnttab_size; >>> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { >>> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >>> + from + 1); >>> + if (ret < 0 && to - from >= count) { >>> + ret = from; >>> + bitmap_clear(gnttab_free_bitmap, ret, count); >>> + from += count; >>> + gnttab_free_count -= count; >> >> >> IIUIC we can have multiple passes over this, meaning that the gnttab_free_count may be decremented more than once. Is that intentional? > > After the first pass decrementing gnttab_free_cnt, ret will no > longer be less than zero, so this can be hit only once. Oh, yes, of course. > >> >> >>> + if (from == to) >>> + continue; >>> + } >>> + >>> + while (from < to) { >>> + *last = from; >>> + last = __gnttab_entry(from); >>> + gnttab_last_free = from; >>> + from++; >>> + } >> >> >> I have been looking at this loop and I can't understand what it is doing ;-( Can you enlighten me? > > It is recreating the free list in order to have it properly sorted. > This is needed to make sure that the free tail has the maximum > possible size (you can take the tail off the list without having > to worry about breaking the linked list because of references into > the tail). So let's say we have the (one-dimensional) table of length 13 idx .. 2 3 ... 10 11 12 grant 12 11 2 -1 3 and gnttab_free_head is 10. I.e. the free list is 2, 12, 3, 11. What will this look like after the 2 iterations of the outer loop? (I am really having a mental block on this). -boris
On 14.05.22 04:34, Boris Ostrovsky wrote: > > > On 5/13/22 1:33 AM, Juergen Gross wrote: >> On 12.05.22 22:01, Boris Ostrovsky wrote: >>> >>> On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: > >>>> +/* Rebuilds the free grant list and tries to find count consecutive >>>> entries. */ >>>> +static int get_free_seq(unsigned int count) >>>> +{ >>>> + int ret = -ENOSPC; >>>> + unsigned int from, to; >>>> + grant_ref_t *last; >>>> + >>>> + gnttab_free_tail_ptr = &gnttab_free_head; >>>> + last = &gnttab_free_head; >>>> + >>>> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >>>> + from < gnttab_size; >>>> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { >>>> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >>>> + from + 1); >>>> + if (ret < 0 && to - from >= count) { >>>> + ret = from; >>>> + bitmap_clear(gnttab_free_bitmap, ret, count); >>>> + from += count; >>>> + gnttab_free_count -= count; >>> >>> >>> IIUIC we can have multiple passes over this, meaning that the >>> gnttab_free_count may be decremented more than once. Is that intentional? >> >> After the first pass decrementing gnttab_free_cnt, ret will no >> longer be less than zero, so this can be hit only once. > > Oh, yes, of course. > >> >>> >>> >>>> + if (from == to) >>>> + continue; >>>> + } >>>> + >>>> + while (from < to) { >>>> + *last = from; >>>> + last = __gnttab_entry(from); >>>> + gnttab_last_free = from; >>>> + from++; >>>> + } >>> >>> >>> I have been looking at this loop and I can't understand what it is doing ;-( >>> Can you enlighten me? >> >> It is recreating the free list in order to have it properly sorted. >> This is needed to make sure that the free tail has the maximum >> possible size (you can take the tail off the list without having >> to worry about breaking the linked list because of references into >> the tail). > > > So let's say we have the (one-dimensional) table of length 13 > > idx .. 2 3 ... 10 11 12 > > grant 12 11 2 -1 3 > > > and gnttab_free_head is 10. I.e. the free list is 2, 12, 3, 11. You meant 10, 2, 12, 3, 11, I guess? > > What will this look like after the 2 iterations of the outer loop? idx .. 2 3 ... 10 11 12 grant 3 10 11 12 -1 with gnttab_free_head being 2, i.e the free list is now 2, 3, 10, 11, 12. Juergen
On 5/16/22 1:59 AM, Juergen Gross wrote: > On 14.05.22 04:34, Boris Ostrovsky wrote: >> >> >> On 5/13/22 1:33 AM, Juergen Gross wrote: >>> On 12.05.22 22:01, Boris Ostrovsky wrote: >>>> >>>> On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: >> >>>>> +/* Rebuilds the free grant list and tries to find count consecutive entries. */ >>>>> +static int get_free_seq(unsigned int count) >>>>> +{ >>>>> + int ret = -ENOSPC; >>>>> + unsigned int from, to; >>>>> + grant_ref_t *last; >>>>> + >>>>> + gnttab_free_tail_ptr = &gnttab_free_head; >>>>> + last = &gnttab_free_head; >>>>> + >>>>> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >>>>> + from < gnttab_size; >>>>> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { >>>>> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >>>>> + from + 1); >>>>> + if (ret < 0 && to - from >= count) { >>>>> + ret = from; >>>>> + bitmap_clear(gnttab_free_bitmap, ret, count); >>>>> + from += count; >>>>> + gnttab_free_count -= count; >>>> >>>> >>>> IIUIC we can have multiple passes over this, meaning that the gnttab_free_count may be decremented more than once. Is that intentional? >>> >>> After the first pass decrementing gnttab_free_cnt, ret will no >>> longer be less than zero, so this can be hit only once. >> >> Oh, yes, of course. >> >>> >>>> >>>> >>>>> + if (from == to) >>>>> + continue; >>>>> + } >>>>> + >>>>> + while (from < to) { >>>>> + *last = from; >>>>> + last = __gnttab_entry(from); >>>>> + gnttab_last_free = from; >>>>> + from++; >>>>> + } >>>> >>>> >>>> I have been looking at this loop and I can't understand what it is doing ;-( Can you enlighten me? >>> >>> It is recreating the free list in order to have it properly sorted. >>> This is needed to make sure that the free tail has the maximum >>> possible size (you can take the tail off the list without having >>> to worry about breaking the linked list because of references into >>> the tail). >> >> >> So let's say we have the (one-dimensional) table of length 13 >> >> idx .. 2 3 ... 10 11 12 >> >> grant 12 11 2 -1 3 >> >> >> and gnttab_free_head is 10. I.e. the free list is 2, 12, 3, 11. > > You meant 10, 2, 12, 3, 11, I guess? > >> >> What will this look like after the 2 iterations of the outer loop? > > idx .. 2 3 ... 10 11 12 > > grant 3 10 11 12 -1 > > with gnttab_free_head being 2, i.e the free list is now 2, 3, 10, 11, 12. OK, thanks, that helped. I couldn't link the free chunks in my head With the error handling in gnttab_init() fixed Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> -boris
On 16.05.22 19:00, Boris Ostrovsky wrote: Hello Boris > > > On 5/16/22 1:59 AM, Juergen Gross wrote: >> On 14.05.22 04:34, Boris Ostrovsky wrote: >>> >>> >>> On 5/13/22 1:33 AM, Juergen Gross wrote: >>>> On 12.05.22 22:01, Boris Ostrovsky wrote: >>>>> >>>>> On 5/7/22 2:19 PM, Oleksandr Tyshchenko wrote: >>> >>>>>> +/* Rebuilds the free grant list and tries to find count >>>>>> consecutive entries. */ >>>>>> +static int get_free_seq(unsigned int count) >>>>>> +{ >>>>>> + int ret = -ENOSPC; >>>>>> + unsigned int from, to; >>>>>> + grant_ref_t *last; >>>>>> + >>>>>> + gnttab_free_tail_ptr = &gnttab_free_head; >>>>>> + last = &gnttab_free_head; >>>>>> + >>>>>> + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); >>>>>> + from < gnttab_size; >>>>>> + from = find_next_bit(gnttab_free_bitmap, gnttab_size, >>>>>> to + 1)) { >>>>>> + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, >>>>>> + from + 1); >>>>>> + if (ret < 0 && to - from >= count) { >>>>>> + ret = from; >>>>>> + bitmap_clear(gnttab_free_bitmap, ret, count); >>>>>> + from += count; >>>>>> + gnttab_free_count -= count; >>>>> >>>>> >>>>> IIUIC we can have multiple passes over this, meaning that the >>>>> gnttab_free_count may be decremented more than once. Is that >>>>> intentional? >>>> >>>> After the first pass decrementing gnttab_free_cnt, ret will no >>>> longer be less than zero, so this can be hit only once. >>> >>> Oh, yes, of course. >>> >>>> >>>>> >>>>> >>>>>> + if (from == to) >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + while (from < to) { >>>>>> + *last = from; >>>>>> + last = __gnttab_entry(from); >>>>>> + gnttab_last_free = from; >>>>>> + from++; >>>>>> + } >>>>> >>>>> >>>>> I have been looking at this loop and I can't understand what it is >>>>> doing ;-( Can you enlighten me? >>>> >>>> It is recreating the free list in order to have it properly sorted. >>>> This is needed to make sure that the free tail has the maximum >>>> possible size (you can take the tail off the list without having >>>> to worry about breaking the linked list because of references into >>>> the tail). >>> >>> >>> So let's say we have the (one-dimensional) table of length 13 >>> >>> idx .. 2 3 ... 10 11 12 >>> >>> grant 12 11 2 -1 3 >>> >>> >>> and gnttab_free_head is 10. I.e. the free list is 2, 12, 3, 11. >> >> You meant 10, 2, 12, 3, 11, I guess? >> >>> >>> What will this look like after the 2 iterations of the outer loop? >> >> idx .. 2 3 ... 10 11 12 >> >> grant 3 10 11 12 -1 >> >> with gnttab_free_head being 2, i.e the free list is now 2, 3, 10, 11, >> 12. > > > > OK, thanks, that helped. I couldn't link the free chunks in my head > > > With the error handling in gnttab_init() fixed yes, this is a diff that I am going to apply for the next version: [snip] @@ -1596,19 +1601,20 @@ static int gnttab_expand(unsigned int req_entries) int gnttab_init(void) { int i; - unsigned long max_nr_grant_frames; + unsigned long max_nr_grant_frames, max_nr_grefs; unsigned int max_nr_glist_frames, nr_glist_frames; int ret; gnttab_request_version(); max_nr_grant_frames = gnttab_max_grant_frames(); + max_nr_grefs = max_nr_grant_frames * + gnttab_interface->grefs_per_grant_frame; nr_grant_frames = 1; /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ - max_nr_glist_frames = (max_nr_grant_frames * - gnttab_interface->grefs_per_grant_frame / RPP); + max_nr_glist_frames = max_nr_grefs / RPP; gnttab_list = kmalloc_array(max_nr_glist_frames, sizeof(grant_ref_t *), @@ -1625,8 +1631,7 @@ int gnttab_init(void) } } - i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; - gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); + gnttab_free_bitmap = bitmap_zalloc(max_nr_grefs, GFP_KERNEL); if (!gnttab_free_bitmap) { ret = -ENOMEM; goto ini_nomem; > > > Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> Thanks! > > > > -boris
On 5/16/22 2:30 PM, Oleksandr wrote: > > On 16.05.22 19:00, Boris Ostrovsky wrote: > > >> >> >> With the error handling in gnttab_init() fixed > > yes, this is a diff that I am going to apply for the next version: > > > [snip] > > @@ -1596,19 +1601,20 @@ static int gnttab_expand(unsigned int req_entries) > int gnttab_init(void) > { > int i; > - unsigned long max_nr_grant_frames; > + unsigned long max_nr_grant_frames, max_nr_grefs; > unsigned int max_nr_glist_frames, nr_glist_frames; > int ret; > > gnttab_request_version(); > max_nr_grant_frames = gnttab_max_grant_frames(); > + max_nr_grefs = max_nr_grant_frames * > + gnttab_interface->grefs_per_grant_frame; > nr_grant_frames = 1; > > /* Determine the maximum number of frames required for the > * grant reference free list on the current hypervisor. > */ > - max_nr_glist_frames = (max_nr_grant_frames * > - gnttab_interface->grefs_per_grant_frame / RPP); > + max_nr_glist_frames = max_nr_grefs / RPP; > > gnttab_list = kmalloc_array(max_nr_glist_frames, > sizeof(grant_ref_t *), > @@ -1625,8 +1631,7 @@ int gnttab_init(void) > } > } > > - i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; > - gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); > + gnttab_free_bitmap = bitmap_zalloc(max_nr_grefs, GFP_KERNEL); > if (!gnttab_free_bitmap) { > ret = -ENOMEM; > goto ini_nomem; > Looks good, thanks. -boris
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 8ccccac..1b458c0 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,6 +33,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/bitmap.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> @@ -72,9 +73,32 @@ static grant_ref_t **gnttab_list; static unsigned int nr_grant_frames; + +/* + * Handling of free grants: + * + * Free grants are in a simple list anchored in gnttab_free_head. They are + * linked by grant ref, the last element contains GNTTAB_LIST_END. The number + * of free entries is stored in gnttab_free_count. + * Additionally there is a bitmap of free entries anchored in + * gnttab_free_bitmap. This is being used for simplifying allocation of + * multiple consecutive grants, which is needed e.g. for support of virtio. + * gnttab_last_free is used to add free entries of new frames at the end of + * the free list. + * gnttab_free_tail_ptr specifies the variable which references the start + * of consecutive free grants ending with gnttab_last_free. This pointer is + * updated in a rather defensive way, in order to avoid performance hits in + * hot paths. + * All those variables are protected by gnttab_list_lock. + */ static int gnttab_free_count; -static grant_ref_t gnttab_free_head; +static unsigned int gnttab_size; +static grant_ref_t gnttab_free_head = GNTTAB_LIST_END; +static grant_ref_t gnttab_last_free = GNTTAB_LIST_END; +static grant_ref_t *gnttab_free_tail_ptr; +static unsigned long *gnttab_free_bitmap; static DEFINE_SPINLOCK(gnttab_list_lock); + struct grant_frames xen_auto_xlat_grant_frames; static unsigned int xen_gnttab_version; module_param_named(version, xen_gnttab_version, uint, 0); @@ -170,16 +194,111 @@ static int get_free_entries(unsigned count) ref = head = gnttab_free_head; gnttab_free_count -= count; - while (count-- > 1) - head = gnttab_entry(head); + while (count--) { + bitmap_clear(gnttab_free_bitmap, head, 1); + if (gnttab_free_tail_ptr == __gnttab_entry(head)) + gnttab_free_tail_ptr = &gnttab_free_head; + if (count) + head = gnttab_entry(head); + } gnttab_free_head = gnttab_entry(head); gnttab_entry(head) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_last_free = GNTTAB_LIST_END; + gnttab_free_tail_ptr = NULL; + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); return ref; } +static int get_seq_entry_count(void) +{ + if (gnttab_last_free == GNTTAB_LIST_END || !gnttab_free_tail_ptr || + *gnttab_free_tail_ptr == GNTTAB_LIST_END) + return 0; + + return gnttab_last_free - *gnttab_free_tail_ptr + 1; +} + +/* Rebuilds the free grant list and tries to find count consecutive entries. */ +static int get_free_seq(unsigned int count) +{ + int ret = -ENOSPC; + unsigned int from, to; + grant_ref_t *last; + + gnttab_free_tail_ptr = &gnttab_free_head; + last = &gnttab_free_head; + + for (from = find_first_bit(gnttab_free_bitmap, gnttab_size); + from < gnttab_size; + from = find_next_bit(gnttab_free_bitmap, gnttab_size, to + 1)) { + to = find_next_zero_bit(gnttab_free_bitmap, gnttab_size, + from + 1); + if (ret < 0 && to - from >= count) { + ret = from; + bitmap_clear(gnttab_free_bitmap, ret, count); + from += count; + gnttab_free_count -= count; + if (from == to) + continue; + } + + while (from < to) { + *last = from; + last = __gnttab_entry(from); + gnttab_last_free = from; + from++; + } + if (to < gnttab_size) + gnttab_free_tail_ptr = __gnttab_entry(to - 1); + } + + *last = GNTTAB_LIST_END; + if (gnttab_last_free != gnttab_size - 1) + gnttab_free_tail_ptr = NULL; + + return ret; +} + +static int get_free_entries_seq(unsigned int count) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + if (gnttab_free_count < count) { + ret = gnttab_expand(count - gnttab_free_count); + if (ret < 0) + goto out; + } + + if (get_seq_entry_count() < count) { + ret = get_free_seq(count); + if (ret >= 0) + goto out; + ret = gnttab_expand(count - get_seq_entry_count()); + if (ret < 0) + goto out; + } + + ret = *gnttab_free_tail_ptr; + *gnttab_free_tail_ptr = gnttab_entry(ret + count - 1); + gnttab_free_count -= count; + if (!gnttab_free_count) + gnttab_free_tail_ptr = NULL; + bitmap_clear(gnttab_free_bitmap, ret, count); + + out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); + + return ret; +} + static void do_free_callbacks(void) { struct gnttab_free_callback *callback, *next; @@ -206,17 +325,48 @@ static inline void check_free_callbacks(void) do_free_callbacks(); } -static void put_free_entry(grant_ref_t ref) +static void put_free_entry_locked(grant_ref_t ref) { - unsigned long flags; - spin_lock_irqsave(&gnttab_list_lock, flags); gnttab_entry(ref) = gnttab_free_head; gnttab_free_head = ref; + if (!gnttab_free_count) + gnttab_last_free = ref; + if (gnttab_free_tail_ptr == &gnttab_free_head) + gnttab_free_tail_ptr = __gnttab_entry(ref); gnttab_free_count++; + bitmap_set(gnttab_free_bitmap, ref, 1); +} + +static void put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + put_free_entry_locked(ref); check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } +static void gnttab_set_free(unsigned int start, unsigned int n) +{ + unsigned int i; + + for (i = start; i < start + n - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(i) = GNTTAB_LIST_END; + if (!gnttab_free_count) { + gnttab_free_head = start; + gnttab_free_tail_ptr = &gnttab_free_head; + } else { + gnttab_entry(gnttab_last_free) = start; + } + gnttab_free_count += n; + gnttab_last_free = i; + + bitmap_set(gnttab_free_bitmap, start, n); +} + /* * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. * Introducing a valid entry into the grant table: @@ -448,23 +598,31 @@ void gnttab_free_grant_references(grant_ref_t head) { grant_ref_t ref; unsigned long flags; - int count = 1; - if (head == GNTTAB_LIST_END) - return; + spin_lock_irqsave(&gnttab_list_lock, flags); - ref = head; - while (gnttab_entry(ref) != GNTTAB_LIST_END) { - ref = gnttab_entry(ref); - count++; + while (head != GNTTAB_LIST_END) { + ref = gnttab_entry(head); + put_free_entry_locked(head); + head = ref; } - gnttab_entry(ref) = gnttab_free_head; - gnttab_free_head = head; - gnttab_free_count += count; check_free_callbacks(); spin_unlock_irqrestore(&gnttab_list_lock, flags); } EXPORT_SYMBOL_GPL(gnttab_free_grant_references); +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count) +{ + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (i = count; i > 0; i--) + put_free_entry_locked(head + i - 1); + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference_seq); + int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) { int h = get_free_entries(count); @@ -478,6 +636,24 @@ int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) } EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first) +{ + int h; + + if (count == 1) + h = get_free_entries(1); + else + h = get_free_entries_seq(count); + + if (h < 0) + return -ENOSPC; + + *first = h; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_reference_seq); + int gnttab_empty_grant_references(const grant_ref_t *private_head) { return (*private_head == GNTTAB_LIST_END); @@ -570,16 +746,13 @@ static int grow_gnttab_list(unsigned int more_frames) goto grow_nomem; } + gnttab_set_free(gnttab_size, extra_entries); - for (i = grefs_per_frame * nr_grant_frames; - i < grefs_per_frame * new_nr_grant_frames - 1; i++) - gnttab_entry(i) = i + 1; - - gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = grefs_per_frame * nr_grant_frames; - gnttab_free_count += extra_entries; + if (!gnttab_free_tail_ptr) + gnttab_free_tail_ptr = __gnttab_entry(gnttab_size); nr_grant_frames = new_nr_grant_frames; + gnttab_size += extra_entries; check_free_callbacks(); @@ -1424,7 +1597,6 @@ int gnttab_init(void) int i; unsigned long max_nr_grant_frames; unsigned int max_nr_glist_frames, nr_glist_frames; - unsigned int nr_init_grefs; int ret; gnttab_request_version(); @@ -1452,6 +1624,13 @@ int gnttab_init(void) } } + i = gnttab_interface->grefs_per_grant_frame * max_nr_grant_frames; + gnttab_free_bitmap = bitmap_zalloc(i, GFP_KERNEL); + if (!gnttab_free_bitmap) { + ret = -ENOMEM; + goto ini_nomem; + } + ret = arch_gnttab_init(max_nr_grant_frames, nr_status_frames(max_nr_grant_frames)); if (ret < 0) @@ -1462,15 +1641,9 @@ int gnttab_init(void) goto ini_nomem; } - nr_init_grefs = nr_grant_frames * - gnttab_interface->grefs_per_grant_frame; - - for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) - gnttab_entry(i) = i + 1; + gnttab_size = nr_grant_frames * gnttab_interface->grefs_per_grant_frame; - gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; - gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; - gnttab_free_head = NR_RESERVED_ENTRIES; + gnttab_set_free(NR_RESERVED_ENTRIES, gnttab_size - NR_RESERVED_ENTRIES); printk("Grant table initialized\n"); return 0; @@ -1479,6 +1652,7 @@ int gnttab_init(void) for (i--; i >= 0; i--) free_page((unsigned long)gnttab_list[i]); kfree(gnttab_list); + bitmap_free(gnttab_free_bitmap); return ret; } EXPORT_SYMBOL_GPL(gnttab_init); diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index dfd5bf3..d815e1d 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -129,10 +129,14 @@ int gnttab_try_end_foreign_access(grant_ref_t ref); */ int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); +int gnttab_alloc_grant_reference_seq(unsigned int count, grant_ref_t *first); + void gnttab_free_grant_reference(grant_ref_t ref); void gnttab_free_grant_references(grant_ref_t head); +void gnttab_free_grant_reference_seq(grant_ref_t head, unsigned int count); + int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);