
[v2,2/2] KVM: s390: Fix storage attributes migration with memory slots

Message ID 1516280198-25386-3-git-send-email-imbrenda@linux.vnet.ibm.com (mailing list archive)
State New, archived

Commit Message

Claudio Imbrenda Jan. 18, 2018, 12:56 p.m. UTC
This is a fix for several issues that were found in the original code
for storage attributes migration.

A separate bitmap is no longer allocated to keep track of dirty storage
attributes; the extra bits of the per-memslot dirty bitmap, which are
always present anyway, are now used for this purpose.
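
Note: the cmma_bitmap() helper used below is not part of this patch; it is
presumably added by patch 1/2 of this series.  As a rough illustration only,
it amounts to returning the second half of the per-memslot dirty bitmap,
the half that is otherwise only used by x86:

static inline unsigned long *cmma_bitmap(struct kvm_memory_slot *slot)
{
	/*
	 * Sketch: the dirty bitmap is allocated with twice the needed
	 * size, and the second half is unused on s390, so it can hold
	 * the dirty-CMMA bits.
	 */
	unsigned long len = kvm_dirty_bitmap_bytes(slot);

	return slot->dirty_bitmap + len / sizeof(*slot->dirty_bitmap);
}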

The code has also been refactored a little to improve readability.

Fixes: 190df4a212a ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Fixes: 4036e3874a1 ("KVM: s390: ioctls to get and set guest storage attributes")
Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |   9 +-
 arch/s390/kvm/kvm-s390.c         | 265 ++++++++++++++++++++++-----------------
 arch/s390/kvm/priv.c             |  26 ++--
 3 files changed, 168 insertions(+), 132 deletions(-)

Comments

Christian Borntraeger Jan. 19, 2018, 3:41 p.m. UTC | #1
On 01/18/2018 01:56 PM, Claudio Imbrenda wrote:
> This is a fix for several issues that were found in the original code
> for storage attributes migration.
> 
> A separate bitmap is no longer allocated to keep track of dirty storage
> attributes; the extra bits of the per-memslot dirty bitmap, which are
> always present anyway, are now used for this purpose.
> 
> The code has also been refactored a little to improve readability.
> 
> Fixes: 190df4a212a ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
> Fixes: 4036e3874a1 ("KVM: s390: ioctls to get and set guest storage attributes")
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
[..]
> +	/* mark all the pages in active slots as dirty */
> +	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
> +		sl = slots->memslots + slotnr;
>  		/*
> -		 * Get the last slot. They should be sorted by base_gfn, so the
> -		 * last slot is also the one at the end of the address space.
> -		 * We have verified above that at least one slot is present.

Can you please rebase on top of 4.15-rc8 (or master)? This code has already
received some fixes.

[...]

> +static unsigned long kvm_s390_next_dirty_cmma(struct kvm *kvm,
> +					      unsigned long cur)
> +{
> +	struct kvm_memslots *slots = kvm_memslots(kvm);
> +	struct kvm_memory_slot *sl;
> +	unsigned long ofs;
> +	int slotidx;
> +
> +	slotidx = gfn_to_memslot_approx(kvm, cur);
> +	sl = slots->memslots + slotidx;
> +
> +	if (sl->base_gfn + sl->npages <= cur) {
> +		slotidx--;
> +		/* If we are above the highest slot, wrap around */
> +		if (slotidx < 0)
> +			slotidx = slots->used_slots - 1;
> +
> +		sl = slots->memslots + slotidx;
> +		cur = sl->base_gfn;
> +	}
> +	ofs = find_next_bit(cmma_bitmap(sl), sl->npages, cur - sl->base_gfn);
> +	while ((slotidx > 0) && (ofs >= sl->npages)) {
> +		slotidx--;
> +		sl = slots->memslots + slotidx;
> +		ofs = find_next_bit(cmma_bitmap(sl), sl->npages, 0);
> +	}
> +	return sl->base_gfn + ofs;
> +}
> +
> +static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
> +			     u8 *res, unsigned long bufsize)
> +{
> +	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
> +	struct kvm_memslots *slots = kvm_memslots(kvm);
> +	struct kvm_memory_slot *sl;
> +
> +	cur_gfn = kvm_s390_next_dirty_cmma(kvm, args->start_gfn);
> +	sl = gfn_to_memslot(kvm, cur_gfn);
> +	args->count = 0;
> +	args->start_gfn = cur_gfn;
> +	if (!sl)
> +		return 0;
> +	next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
> +	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
> +
> +	while (args->count < bufsize) {
> +		hva = gfn_to_hva(kvm, cur_gfn);
> +		if (kvm_is_error_hva(hva))
> +			break;
> +		/* decrement only if we actually flipped the bit to 0 */
> +		if (test_and_clear_bit(cur_gfn - sl->base_gfn, cmma_bitmap(sl)))
> +			atomic64_dec(&kvm->arch.cmma_dirty_pages);
> +		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
> +			pgstev = 0;
> +		/* save the value */
> +		res[args->count++] = (pgstev >> 24) & 0x43;
> +		/*
> +		 * if the next bit is too far away, stop.
> +		 * if we reached the previous "next", find the next one
> +		 */
> +		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
> +			break;
> +		if (cur_gfn == next_gfn)
> +			next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
> +		/* reached the end of memory or of the buffer, stop */
> +		if ((next_gfn >= mem_end) ||
> +		    (next_gfn - args->start_gfn >= bufsize))
> +			break;
> +		cur_gfn++;
> +		if (cur_gfn - sl->base_gfn >= sl->npages) {
> +			sl = gfn_to_memslot(kvm, cur_gfn);
> +			if (!sl)
> +				break;
> +		}
> +	}

I have to say that this code does not get easier to understand. I fear that I won't
be able to review this in time for 4.15.
Christian Borntraeger Jan. 22, 2018, 2:26 p.m. UTC | #2
On 01/18/2018 01:56 PM, Claudio Imbrenda wrote:
> This is a fix for several issues that were found in the original code
> for storage attributes migration.
> 
> A separate bitmap is no longer allocated to keep track of dirty storage
> attributes; the extra bits of the per-memslot dirty bitmap, which are
> always present anyway, are now used for this purpose.
> 
> The code has also been refactored a little to improve readability.
> 
> Fixes: 190df4a212a ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
> Fixes: 4036e3874a1 ("KVM: s390: ioctls to get and set guest storage attributes")
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
> ---
>  arch/s390/include/asm/kvm_host.h |   9 +-
>  arch/s390/kvm/kvm-s390.c         | 265 ++++++++++++++++++++++-----------------
>  arch/s390/kvm/priv.c             |  26 ++--
>  3 files changed, 168 insertions(+), 132 deletions(-)
> 
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index 819838c..2ca73b0 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -763,12 +763,6 @@ struct kvm_s390_vsie {
>  	struct page *pages[KVM_MAX_VCPUS];
>  };
> 
> -struct kvm_s390_migration_state {
> -	unsigned long bitmap_size;	/* in bits (number of guest pages) */
> -	atomic64_t dirty_pages;		/* number of dirty pages */
> -	unsigned long *pgste_bitmap;
> -};
> -
>  struct kvm_arch{
>  	void *sca;
>  	int use_esca;
> @@ -796,7 +790,8 @@ struct kvm_arch{
>  	struct kvm_s390_vsie vsie;
>  	u8 epdx;
>  	u64 epoch;
> -	struct kvm_s390_migration_state *migration_state;
> +	atomic_t migration_mode;
> +	atomic64_t cmma_dirty_pages;
>  	/* subset of available cpu features enabled by user space */
>  	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
>  	struct kvm_s390_gisa *gisa;
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 5a69334..85448a2 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -770,53 +770,37 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
>   */
>  static int kvm_s390_vm_start_migration(struct kvm *kvm)
>  {
> -	struct kvm_s390_migration_state *mgs;
> -	struct kvm_memory_slot *ms;
> -	/* should be the only one */
>  	struct kvm_memslots *slots;
> -	unsigned long ram_pages;
> +	struct kvm_memory_slot *sl;
> +	unsigned long ram_pages = 0;
>  	int slotnr;
> 
>  	/* migration mode already enabled */
> -	if (kvm->arch.migration_state)
> +	if (atomic_read(&kvm->arch.migration_mode))
>  		return 0;

Being atomic does not help here, because you only use plain reads and writes
(and no atomic ops). We are protected by a mutex anyway, so maybe
just use a normal bool variable and add a comment.
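
To illustrate what I mean (untested sketch, assuming the callers keep holding
the protecting mutex; which mutex that should be is discussed below), e.g. for
the stop path:

/* in struct kvm_arch, instead of the atomic_t: */
bool migration_mode;	/* protected by a mutex, not by atomicity */

static int kvm_s390_vm_stop_migration(struct kvm *kvm)
{
	/* migration mode already disabled */
	if (!kvm->arch.migration_mode)
		return 0;
	kvm->arch.migration_mode = false;
	if (kvm->arch.use_cmma)
		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
	return 0;
}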

Please also have a look at
  "KVM: s390: add proper locking for CMMA migration bitmap",
which changes the locking. Do we want to keep some of these
changes (e.g. use the slots_lock)? Or do we want to schedule that fix
for stable and use your patches only for 4.16 and later?
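
For reference, the idea there is to take the slots lock around the ioctls
that touch the migration state, roughly along these lines (sketch only,
not the exact patch):

static int kvm_s390_vm_set_migration(struct kvm *kvm,
				     struct kvm_device_attr *attr)
{
	int res = -ENXIO;

	mutex_lock(&kvm->slots_lock);
	switch (attr->attr) {
	case KVM_S390_VM_MIGRATION_START:
		res = kvm_s390_vm_start_migration(kvm);
		break;
	case KVM_S390_VM_MIGRATION_STOP:
		res = kvm_s390_vm_stop_migration(kvm);
		break;
	default:
		break;
	}
	mutex_unlock(&kvm->slots_lock);

	return res;
}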


[...]

> +	atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
> +	atomic_set(&kvm->arch.migration_mode, 1);
> +	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
>  	return 0;
>  }
> 
> @@ -826,19 +810,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
>   */
>  static int kvm_s390_vm_stop_migration(struct kvm *kvm)
>  {
> -	struct kvm_s390_migration_state *mgs;
> -
>  	/* migration mode already disabled */
> -	if (!kvm->arch.migration_state)
> +	if (!atomic_read(&kvm->arch.migration_mode))
>  		return 0;
> -	mgs = kvm->arch.migration_state;
> -	kvm->arch.migration_state = NULL;
> -
> -	if (kvm->arch.use_cmma) {
> +	atomic_set(&kvm->arch.migration_mode, 0);
> +	if (kvm->arch.use_cmma)
>  		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
> -		vfree(mgs->pgste_bitmap);
> -	}
> -	kfree(mgs);
>  	return 0;
>  }
> 
> @@ -868,7 +845,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm,
>  static int kvm_s390_vm_get_migration(struct kvm *kvm,
>  				     struct kvm_device_attr *attr)
>  {
> -	u64 mig = (kvm->arch.migration_state != NULL);
> +	u64 mig = atomic_read(&kvm->arch.migration_mode);
> 
>  	if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
>  		return -ENXIO;
> @@ -1543,6 +1520,108 @@ static int gfn_to_memslot_approx(struct kvm *kvm, gfn_t gfn)
>  	return start;
>  }
> 
> +static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
> +			      u8 *res, unsigned long bufsize)
> +{
> +	unsigned long pgstev, cur_gfn, hva;
> +
> +	cur_gfn = args->start_gfn;
> +	args->count = 0;
> +	while (args->count < bufsize) {
> +		hva = gfn_to_hva(kvm, cur_gfn);
> +		/*
> +		 * We return an error if the first value was invalid, but we
> +		 * return successfully if at least one value was copied.
> +		 */
> +		if (kvm_is_error_hva(hva))
> +			return args->count ? 0 : -EFAULT;
> +		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
> +			pgstev = 0;
> +		res[args->count++] = (pgstev >> 24) & 0x43;
> +		cur_gfn++;
> +	}
> +
> +	return 0;
> +}
> +
> +static unsigned long kvm_s390_next_dirty_cmma(struct kvm *kvm,
> +					      unsigned long cur)
> +{
> +	struct kvm_memslots *slots = kvm_memslots(kvm);
> +	struct kvm_memory_slot *sl;
> +	unsigned long ofs;
> +	int slotidx;
> +
> +	slotidx = gfn_to_memslot_approx(kvm, cur);
> +	sl = slots->memslots + slotidx;
> +
> +	if (sl->base_gfn + sl->npages <= cur) {
> +		slotidx--;
> +		/* If we are above the highest slot, wrap around */
> +		if (slotidx < 0)
> +			slotidx = slots->used_slots - 1;
> +
> +		sl = slots->memslots + slotidx;
> +		cur = sl->base_gfn;
> +	}
> +	ofs = find_next_bit(cmma_bitmap(sl), sl->npages, cur - sl->base_gfn);
> +	while ((slotidx > 0) && (ofs >= sl->npages)) {
> +		slotidx--;
> +		sl = slots->memslots + slotidx;
> +		ofs = find_next_bit(cmma_bitmap(sl), sl->npages, 0);
> +	}
> +	return sl->base_gfn + ofs;
> +}
> +
> +static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
> +			     u8 *res, unsigned long bufsize)
> +{
> +	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
> +	struct kvm_memslots *slots = kvm_memslots(kvm);
> +	struct kvm_memory_slot *sl;
> +
> +	cur_gfn = kvm_s390_next_dirty_cmma(kvm, args->start_gfn);
> +	sl = gfn_to_memslot(kvm, cur_gfn);
> +	args->count = 0;
> +	args->start_gfn = cur_gfn;
> +	if (!sl)
> +		return 0;
> +	next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
> +	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
> +
> +	while (args->count < bufsize) {
> +		hva = gfn_to_hva(kvm, cur_gfn);
> +		if (kvm_is_error_hva(hva))
> +			break;
> +		/* decrement only if we actually flipped the bit to 0 */
> +		if (test_and_clear_bit(cur_gfn - sl->base_gfn, cmma_bitmap(sl)))
> +			atomic64_dec(&kvm->arch.cmma_dirty_pages);
> +		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
> +			pgstev = 0;
> +		/* save the value */
> +		res[args->count++] = (pgstev >> 24) & 0x43;
> +		/*
> +		 * if the next bit is too far away, stop.
> +		 * if we reached the previous "next", find the next one
> +		 */
> +		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
> +			break;
> +		if (cur_gfn == next_gfn)
> +			next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
> +		/* reached the end of memory or of the buffer, stop */
> +		if ((next_gfn >= mem_end) ||
> +		    (next_gfn - args->start_gfn >= bufsize))
> +			break;
> +		cur_gfn++;
> +		if (cur_gfn - sl->base_gfn >= sl->npages) {
> +			sl = gfn_to_memslot(kvm, cur_gfn);
> +			if (!sl)
> +				break;
> +		}
> +	}
> +	return 0;
> +}
> +
>  /*
>   * This function searches for the next page with dirty CMMA attributes, and
>   * saves the attributes in the buffer up to either the end of the buffer or
> @@ -1554,97 +1633,53 @@ static int gfn_to_memslot_approx(struct kvm *kvm, gfn_t gfn)
>  static int kvm_s390_get_cmma_bits(struct kvm *kvm,
>  				  struct kvm_s390_cmma_log *args)
>  {
> -	struct kvm_s390_migration_state *s = kvm->arch.migration_state;
> -	unsigned long bufsize, hva, pgstev, i, next, cur;
> -	int srcu_idx, peek, r = 0, rr;
> -	u8 *res;
> +	unsigned long bufsize;
> +	int srcu_idx, peek, migr, ret;
> +	u8 *values;
> 
> -	cur = args->start_gfn;
> -	i = next = pgstev = 0;
> +	migr = atomic_read(&kvm->arch.migration_mode);
> 
>  	if (unlikely(!kvm->arch.use_cmma))
>  		return -ENXIO;
>  	/* Invalid/unsupported flags were specified */
> -	if (args->flags & ~KVM_S390_CMMA_PEEK)
> +	if (unlikely(args->flags & ~KVM_S390_CMMA_PEEK))
>  		return -EINVAL;
>  	/* Migration mode query, and we are not doing a migration */
>  	peek = !!(args->flags & KVM_S390_CMMA_PEEK);
> -	if (!peek && !s)
> +	if (unlikely(!peek && !migr))
>  		return -EINVAL;
>  	/* CMMA is disabled or was not used, or the buffer has length zero */
>  	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
> -	if (!bufsize || !kvm->mm->context.use_cmma) {
> +	if (unlikely(!bufsize || !kvm->mm->context.use_cmma)) {
>  		memset(args, 0, sizeof(*args));
>  		return 0;
>  	}
> -
> -	if (!peek) {
> -		/* We are not peeking, and there are no dirty pages */
> -		if (!atomic64_read(&s->dirty_pages)) {
> -			memset(args, 0, sizeof(*args));
> -			return 0;
> -		}
> -		cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
> -				    args->start_gfn);
> -		if (cur >= s->bitmap_size)	/* nothing found, loop back */
> -			cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
> -		if (cur >= s->bitmap_size) {	/* again! (very unlikely) */
> -			memset(args, 0, sizeof(*args));
> -			return 0;
> -		}
> -		next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
> +	/* We are not peeking, and there are no dirty pages */
> +	if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) {
> +		memset(args, 0, sizeof(*args));
> +		return 0;
>  	}
> 
> -	res = vmalloc(bufsize);
> -	if (!res)
> +	values = vmalloc(bufsize);
> +	if (!values)
>  		return -ENOMEM;
> 
> -	args->start_gfn = cur;
> -
>  	down_read(&kvm->mm->mmap_sem);
>  	srcu_idx = srcu_read_lock(&kvm->srcu);
> -	while (i < bufsize) {
> -		hva = gfn_to_hva(kvm, cur);
> -		if (kvm_is_error_hva(hva)) {
> -			r = -EFAULT;
> -			break;
> -		}
> -		/* decrement only if we actually flipped the bit to 0 */
> -		if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
> -			atomic64_dec(&s->dirty_pages);
> -		r = get_pgste(kvm->mm, hva, &pgstev);
> -		if (r < 0)
> -			pgstev = 0;
> -		/* save the value */
> -		res[i++] = (pgstev >> 24) & 0x43;
> -		/*
> -		 * if the next bit is too far away, stop.
> -		 * if we reached the previous "next", find the next one
> -		 */
> -		if (!peek) {
> -			if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
> -				break;
> -			if (cur == next)
> -				next = find_next_bit(s->pgste_bitmap,
> -						     s->bitmap_size, cur + 1);
> -		/* reached the end of the bitmap or of the buffer, stop */
> -			if ((next >= s->bitmap_size) ||
> -			    (next >= args->start_gfn + bufsize))
> -				break;
> -		}
> -		cur++;
> -	}
> +	if (peek)
> +		ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
> +	else
> +		ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
>  	srcu_read_unlock(&kvm->srcu, srcu_idx);
>  	up_read(&kvm->mm->mmap_sem);
> -	args->count = i;
> -	args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
> 
> -	rr = copy_to_user((void __user *)args->values, res, args->count);
> -	if (rr)
> -		r = -EFAULT;
> +	args->remaining = migr ? atomic64_read(&kvm->arch.cmma_dirty_pages) : 0;
> 
> -	vfree(res);
> -	return r;
> +	if (copy_to_user((void __user *)args->values, values, args->count))
> +		ret = -EFAULT;
> +
> +	vfree(values);
> +	return ret;
>  }
> 
>  /*
> @@ -2088,10 +2123,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  	kvm_s390_destroy_adapters(kvm);
>  	kvm_s390_clear_float_irqs(kvm);
>  	kvm_s390_vsie_destroy(kvm);
> -	if (kvm->arch.migration_state) {
> -		vfree(kvm->arch.migration_state->pgste_bitmap);
> -		kfree(kvm->arch.migration_state);
> -	}
>  	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
>  }
> 
> diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
> index 572496c..7ae4893 100644
> --- a/arch/s390/kvm/priv.c
> +++ b/arch/s390/kvm/priv.c
> @@ -954,9 +954,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
>  	return 0;
>  }
> 
> -static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
> +/*
> + * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
> + */
> +static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
>  {
> -	struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
>  	int r1, r2, nappended, entries;
>  	unsigned long gfn, hva, res, pgstev, ptev;
>  	unsigned long *cbrlo;
> @@ -1007,9 +1009,11 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
>  	}
> 
>  	if (orc) {
> -		/* increment only if we are really flipping the bit to 1 */
> -		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
> -			atomic64_inc(&ms->dirty_pages);
> +		struct kvm_memory_slot *s = gfn_to_memslot(vcpu->kvm, gfn);
> +
> +		/* Increment only if we are really flipping the bit */
> +		if (s && !test_and_set_bit(gfn - s->base_gfn, cmma_bitmap(s)))
> +			atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
>  	}
> 
>  	return nappended;
> @@ -1038,7 +1042,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
>  						: ESSA_SET_STABLE_IF_RESIDENT))
>  		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
> 
> -	if (likely(!vcpu->kvm->arch.migration_state)) {
> +	if (likely(!atomic_read(&vcpu->kvm->arch.migration_mode))) {
>  		/*
>  		 * CMMA is enabled in the KVM settings, but is disabled in
>  		 * the SIE block and in the mm_context, and we are not doing
> @@ -1066,10 +1070,16 @@ static int handle_essa(struct kvm_vcpu *vcpu)
>  		/* Retry the ESSA instruction */
>  		kvm_s390_retry_instr(vcpu);
>  	} else {
> -		/* Account for the possible extra cbrl entry */
> -		i = do_essa(vcpu, orc);
> +		int srcu_idx;
> +
> +		down_read(&vcpu->kvm->mm->mmap_sem);
> +		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +		i = __do_essa(vcpu, orc);
> +		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +		up_read(&vcpu->kvm->mm->mmap_sem);
>  		if (i < 0)
>  			return i;
> +		/* Account for the possible extra cbrl entry */
>  		entries += i;
>  	}
>  	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */
>

Patch

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 819838c..2ca73b0 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -763,12 +763,6 @@  struct kvm_s390_vsie {
 	struct page *pages[KVM_MAX_VCPUS];
 };
 
-struct kvm_s390_migration_state {
-	unsigned long bitmap_size;	/* in bits (number of guest pages) */
-	atomic64_t dirty_pages;		/* number of dirty pages */
-	unsigned long *pgste_bitmap;
-};
-
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -796,7 +790,8 @@  struct kvm_arch{
 	struct kvm_s390_vsie vsie;
 	u8 epdx;
 	u64 epoch;
-	struct kvm_s390_migration_state *migration_state;
+	atomic_t migration_mode;
+	atomic64_t cmma_dirty_pages;
 	/* subset of available cpu features enabled by user space */
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 	struct kvm_s390_gisa *gisa;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5a69334..85448a2 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -770,53 +770,37 @@  static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
  */
 static int kvm_s390_vm_start_migration(struct kvm *kvm)
 {
-	struct kvm_s390_migration_state *mgs;
-	struct kvm_memory_slot *ms;
-	/* should be the only one */
 	struct kvm_memslots *slots;
-	unsigned long ram_pages;
+	struct kvm_memory_slot *sl;
+	unsigned long ram_pages = 0;
 	int slotnr;
 
 	/* migration mode already enabled */
-	if (kvm->arch.migration_state)
+	if (atomic_read(&kvm->arch.migration_mode))
 		return 0;
-
 	slots = kvm_memslots(kvm);
 	if (!slots || !slots->used_slots)
 		return -EINVAL;
 
-	mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
-	if (!mgs)
-		return -ENOMEM;
-	kvm->arch.migration_state = mgs;
-
-	if (kvm->arch.use_cmma) {
+	if (!kvm->arch.use_cmma) {
+		atomic_set(&kvm->arch.migration_mode, 1);
+		return 0;
+	}
+	/* mark all the pages in active slots as dirty */
+	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
+		sl = slots->memslots + slotnr;
 		/*
-		 * Get the last slot. They should be sorted by base_gfn, so the
-		 * last slot is also the one at the end of the address space.
-		 * We have verified above that at least one slot is present.
+		 * The second half of the bitmap is only used on x86,
+		 * and would be wasted otherwise, so we put it to good
+		 * use here to keep track of the state of the storage
+		 * attributes.
 		 */
-		ms = slots->memslots + slots->used_slots - 1;
-		/* round up so we only use full longs */
-		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
-		/* allocate enough bytes to store all the bits */
-		mgs->pgste_bitmap = vmalloc(ram_pages / 8);
-		if (!mgs->pgste_bitmap) {
-			kfree(mgs);
-			kvm->arch.migration_state = NULL;
-			return -ENOMEM;
-		}
-
-		mgs->bitmap_size = ram_pages;
-		atomic64_set(&mgs->dirty_pages, ram_pages);
-		/* mark all the pages in active slots as dirty */
-		for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
-			ms = slots->memslots + slotnr;
-			bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
-		}
-
-		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
+		memset(cmma_bitmap(sl), 0xff, kvm_dirty_bitmap_bytes(sl));
+		ram_pages += sl->npages;
 	}
+	atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
+	atomic_set(&kvm->arch.migration_mode, 1);
+	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
 	return 0;
 }
 
@@ -826,19 +810,12 @@  static int kvm_s390_vm_start_migration(struct kvm *kvm)
  */
 static int kvm_s390_vm_stop_migration(struct kvm *kvm)
 {
-	struct kvm_s390_migration_state *mgs;
-
 	/* migration mode already disabled */
-	if (!kvm->arch.migration_state)
+	if (!atomic_read(&kvm->arch.migration_mode))
 		return 0;
-	mgs = kvm->arch.migration_state;
-	kvm->arch.migration_state = NULL;
-
-	if (kvm->arch.use_cmma) {
+	atomic_set(&kvm->arch.migration_mode, 0);
+	if (kvm->arch.use_cmma)
 		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
-		vfree(mgs->pgste_bitmap);
-	}
-	kfree(mgs);
 	return 0;
 }
 
@@ -868,7 +845,7 @@  static int kvm_s390_vm_set_migration(struct kvm *kvm,
 static int kvm_s390_vm_get_migration(struct kvm *kvm,
 				     struct kvm_device_attr *attr)
 {
-	u64 mig = (kvm->arch.migration_state != NULL);
+	u64 mig = atomic_read(&kvm->arch.migration_mode);
 
 	if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
 		return -ENXIO;
@@ -1543,6 +1520,108 @@  static int gfn_to_memslot_approx(struct kvm *kvm, gfn_t gfn)
 	return start;
 }
 
+static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
+			      u8 *res, unsigned long bufsize)
+{
+	unsigned long pgstev, cur_gfn, hva;
+
+	cur_gfn = args->start_gfn;
+	args->count = 0;
+	while (args->count < bufsize) {
+		hva = gfn_to_hva(kvm, cur_gfn);
+		/*
+		 * We return an error if the first value was invalid, but we
+		 * return successfully if at least one value was copied.
+		 */
+		if (kvm_is_error_hva(hva))
+			return args->count ? 0 : -EFAULT;
+		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
+			pgstev = 0;
+		res[args->count++] = (pgstev >> 24) & 0x43;
+		cur_gfn++;
+	}
+
+	return 0;
+}
+
+static unsigned long kvm_s390_next_dirty_cmma(struct kvm *kvm,
+					      unsigned long cur)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *sl;
+	unsigned long ofs;
+	int slotidx;
+
+	slotidx = gfn_to_memslot_approx(kvm, cur);
+	sl = slots->memslots + slotidx;
+
+	if (sl->base_gfn + sl->npages <= cur) {
+		slotidx--;
+		/* If we are above the highest slot, wrap around */
+		if (slotidx < 0)
+			slotidx = slots->used_slots - 1;
+
+		sl = slots->memslots + slotidx;
+		cur = sl->base_gfn;
+	}
+	ofs = find_next_bit(cmma_bitmap(sl), sl->npages, cur - sl->base_gfn);
+	while ((slotidx > 0) && (ofs >= sl->npages)) {
+		slotidx--;
+		sl = slots->memslots + slotidx;
+		ofs = find_next_bit(cmma_bitmap(sl), sl->npages, 0);
+	}
+	return sl->base_gfn + ofs;
+}
+
+static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
+			     u8 *res, unsigned long bufsize)
+{
+	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *sl;
+
+	cur_gfn = kvm_s390_next_dirty_cmma(kvm, args->start_gfn);
+	sl = gfn_to_memslot(kvm, cur_gfn);
+	args->count = 0;
+	args->start_gfn = cur_gfn;
+	if (!sl)
+		return 0;
+	next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
+	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
+
+	while (args->count < bufsize) {
+		hva = gfn_to_hva(kvm, cur_gfn);
+		if (kvm_is_error_hva(hva))
+			break;
+		/* decrement only if we actually flipped the bit to 0 */
+		if (test_and_clear_bit(cur_gfn - sl->base_gfn, cmma_bitmap(sl)))
+			atomic64_dec(&kvm->arch.cmma_dirty_pages);
+		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
+			pgstev = 0;
+		/* save the value */
+		res[args->count++] = (pgstev >> 24) & 0x43;
+		/*
+		 * if the next bit is too far away, stop.
+		 * if we reached the previous "next", find the next one
+		 */
+		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
+			break;
+		if (cur_gfn == next_gfn)
+			next_gfn = kvm_s390_next_dirty_cmma(kvm, cur_gfn + 1);
+		/* reached the end of memory or of the buffer, stop */
+		if ((next_gfn >= mem_end) ||
+		    (next_gfn - args->start_gfn >= bufsize))
+			break;
+		cur_gfn++;
+		if (cur_gfn - sl->base_gfn >= sl->npages) {
+			sl = gfn_to_memslot(kvm, cur_gfn);
+			if (!sl)
+				break;
+		}
+	}
+	return 0;
+}
+
 /*
  * This function searches for the next page with dirty CMMA attributes, and
  * saves the attributes in the buffer up to either the end of the buffer or
@@ -1554,97 +1633,53 @@  static int gfn_to_memslot_approx(struct kvm *kvm, gfn_t gfn)
 static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 				  struct kvm_s390_cmma_log *args)
 {
-	struct kvm_s390_migration_state *s = kvm->arch.migration_state;
-	unsigned long bufsize, hva, pgstev, i, next, cur;
-	int srcu_idx, peek, r = 0, rr;
-	u8 *res;
+	unsigned long bufsize;
+	int srcu_idx, peek, migr, ret;
+	u8 *values;
 
-	cur = args->start_gfn;
-	i = next = pgstev = 0;
+	migr = atomic_read(&kvm->arch.migration_mode);
 
 	if (unlikely(!kvm->arch.use_cmma))
 		return -ENXIO;
 	/* Invalid/unsupported flags were specified */
-	if (args->flags & ~KVM_S390_CMMA_PEEK)
+	if (unlikely(args->flags & ~KVM_S390_CMMA_PEEK))
 		return -EINVAL;
 	/* Migration mode query, and we are not doing a migration */
 	peek = !!(args->flags & KVM_S390_CMMA_PEEK);
-	if (!peek && !s)
+	if (unlikely(!peek && !migr))
 		return -EINVAL;
 	/* CMMA is disabled or was not used, or the buffer has length zero */
 	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
-	if (!bufsize || !kvm->mm->context.use_cmma) {
+	if (unlikely(!bufsize || !kvm->mm->context.use_cmma)) {
 		memset(args, 0, sizeof(*args));
 		return 0;
 	}
-
-	if (!peek) {
-		/* We are not peeking, and there are no dirty pages */
-		if (!atomic64_read(&s->dirty_pages)) {
-			memset(args, 0, sizeof(*args));
-			return 0;
-		}
-		cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
-				    args->start_gfn);
-		if (cur >= s->bitmap_size)	/* nothing found, loop back */
-			cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
-		if (cur >= s->bitmap_size) {	/* again! (very unlikely) */
-			memset(args, 0, sizeof(*args));
-			return 0;
-		}
-		next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
+	/* We are not peeking, and there are no dirty pages */
+	if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) {
+		memset(args, 0, sizeof(*args));
+		return 0;
 	}
 
-	res = vmalloc(bufsize);
-	if (!res)
+	values = vmalloc(bufsize);
+	if (!values)
 		return -ENOMEM;
 
-	args->start_gfn = cur;
-
 	down_read(&kvm->mm->mmap_sem);
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	while (i < bufsize) {
-		hva = gfn_to_hva(kvm, cur);
-		if (kvm_is_error_hva(hva)) {
-			r = -EFAULT;
-			break;
-		}
-		/* decrement only if we actually flipped the bit to 0 */
-		if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
-			atomic64_dec(&s->dirty_pages);
-		r = get_pgste(kvm->mm, hva, &pgstev);
-		if (r < 0)
-			pgstev = 0;
-		/* save the value */
-		res[i++] = (pgstev >> 24) & 0x43;
-		/*
-		 * if the next bit is too far away, stop.
-		 * if we reached the previous "next", find the next one
-		 */
-		if (!peek) {
-			if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
-				break;
-			if (cur == next)
-				next = find_next_bit(s->pgste_bitmap,
-						     s->bitmap_size, cur + 1);
-		/* reached the end of the bitmap or of the buffer, stop */
-			if ((next >= s->bitmap_size) ||
-			    (next >= args->start_gfn + bufsize))
-				break;
-		}
-		cur++;
-	}
+	if (peek)
+		ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
+	else
+		ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	up_read(&kvm->mm->mmap_sem);
-	args->count = i;
-	args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
 
-	rr = copy_to_user((void __user *)args->values, res, args->count);
-	if (rr)
-		r = -EFAULT;
+	args->remaining = migr ? atomic64_read(&kvm->arch.cmma_dirty_pages) : 0;
 
-	vfree(res);
-	return r;
+	if (copy_to_user((void __user *)args->values, values, args->count))
+		ret = -EFAULT;
+
+	vfree(values);
+	return ret;
 }
 
 /*
@@ -2088,10 +2123,6 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	kvm_s390_vsie_destroy(kvm);
-	if (kvm->arch.migration_state) {
-		vfree(kvm->arch.migration_state->pgste_bitmap);
-		kfree(kvm->arch.migration_state);
-	}
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 572496c..7ae4893 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -954,9 +954,11 @@  static int handle_pfmf(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
+/*
+ * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
+ */
+static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
 {
-	struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
 	int r1, r2, nappended, entries;
 	unsigned long gfn, hva, res, pgstev, ptev;
 	unsigned long *cbrlo;
@@ -1007,9 +1009,11 @@  static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
 	}
 
 	if (orc) {
-		/* increment only if we are really flipping the bit to 1 */
-		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
-			atomic64_inc(&ms->dirty_pages);
+		struct kvm_memory_slot *s = gfn_to_memslot(vcpu->kvm, gfn);
+
+		/* Increment only if we are really flipping the bit */
+		if (s && !test_and_set_bit(gfn - s->base_gfn, cmma_bitmap(s)))
+			atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
 	}
 
 	return nappended;
@@ -1038,7 +1042,7 @@  static int handle_essa(struct kvm_vcpu *vcpu)
 						: ESSA_SET_STABLE_IF_RESIDENT))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	if (likely(!vcpu->kvm->arch.migration_state)) {
+	if (likely(!atomic_read(&vcpu->kvm->arch.migration_mode))) {
 		/*
 		 * CMMA is enabled in the KVM settings, but is disabled in
 		 * the SIE block and in the mm_context, and we are not doing
@@ -1066,10 +1070,16 @@  static int handle_essa(struct kvm_vcpu *vcpu)
 		/* Retry the ESSA instruction */
 		kvm_s390_retry_instr(vcpu);
 	} else {
-		/* Account for the possible extra cbrl entry */
-		i = do_essa(vcpu, orc);
+		int srcu_idx;
+
+		down_read(&vcpu->kvm->mm->mmap_sem);
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		i = __do_essa(vcpu, orc);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		up_read(&vcpu->kvm->mm->mmap_sem);
 		if (i < 0)
 			return i;
+		/* Account for the possible extra cbrl entry */
 		entries += i;
 	}
 	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */