diff mbox series

[v2,09/16] KVM: PPC: Book3S HV: XIVE: add a control to dirty the XIVE EQ pages

Message ID 20190222112840.25000-10-clg@kaod.org (mailing list archive)
State New, archived
Headers show
Series KVM: PPC: Book3S HV: add XIVE native exploitation mode | expand

Commit Message

Cédric Le Goater Feb. 22, 2019, 11:28 a.m. UTC
When migration of a VM is initiated, a first copy of the RAM is
transferred to the destination before the VM is stopped, but there is
no guarantee that the EQ pages in which the event notification are
queued have not been modified.

To make sure migration will capture a consistent memory state, the
XIVE device should perform a XIVE quiesce sequence to stop the flow of
event notifications and stabilize the EQs. This is the purpose of the
KVM_DEV_XIVE_EQ_SYNC control which will also marks the EQ pages dirty
to force their transfer.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/uapi/asm/kvm.h        |  1 +
 arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
 Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
 3 files changed, 97 insertions(+)

Comments

David Gibson Feb. 25, 2019, 2:53 a.m. UTC | #1
On Fri, Feb 22, 2019 at 12:28:33PM +0100, Cédric Le Goater wrote:
> When migration of a VM is initiated, a first copy of the RAM is
> transferred to the destination before the VM is stopped, but there is
> no guarantee that the EQ pages in which the event notification are
> queued have not been modified.
> 
> To make sure migration will capture a consistent memory state, the
> XIVE device should perform a XIVE quiesce sequence to stop the flow of
> event notifications and stabilize the EQs. This is the purpose of the
> KVM_DEV_XIVE_EQ_SYNC control which will also marks the EQ pages dirty
> to force their transfer.
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/uapi/asm/kvm.h        |  1 +
>  arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
>  Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
>  3 files changed, 97 insertions(+)
> 
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> index 289c504b7c1d..cd78ad1020fe 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -678,6 +678,7 @@ struct kvm_ppc_cpu_char {
>  /* POWER9 XIVE Native Interrupt Controller */
>  #define KVM_DEV_XIVE_GRP_CTRL		1
>  #define   KVM_DEV_XIVE_RESET		1
> +#define   KVM_DEV_XIVE_EQ_SYNC		2
>  #define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source attributes */
>  #define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit eq attributes */
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> index dd2a9d411fe7..3debc876d5a0 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -640,6 +640,70 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
>  	return 0;
>  }
>  
> +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
> +{
> +	int j;
> +
> +	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
> +		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
> +		struct xive_irq_data *xd;
> +		u32 hw_num;
> +
> +		if (!state->valid)
> +			continue;
> +		if (state->act_priority == MASKED)

Is this correct?  If you masked an irq, then immediately did a sync,
couldn't there still be some of the irqs in flight?  I thought the
reason we needed a sync was that masking and other such operations
_didn't_ implicitly synchronize.

> +			continue;
> +
> +		arch_spin_lock(&sb->lock);
> +		kvmppc_xive_select_irq(state, &hw_num, &xd);
> +		xive_native_sync_source(hw_num);
> +		xive_native_sync_queue(hw_num);
> +		arch_spin_unlock(&sb->lock);
> +	}
> +}
> +
> +static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +	unsigned int prio;
> +
> +	if (!xc)
> +		return -ENOENT;
> +
> +	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
> +		struct xive_q *q = &xc->queues[prio];
> +
> +		if (!q->qpage)
> +			continue;
> +
> +		/* Mark EQ page dirty for migration */
> +		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qpage));
> +	}
> +	return 0;
> +}
> +
> +static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
> +{
> +	struct kvm *kvm = xive->kvm;
> +	struct kvm_vcpu *vcpu;
> +	unsigned int i;
> +
> +	pr_devel("%s\n", __func__);
> +
> +	for (i = 0; i <= xive->max_sbid; i++) {
> +		if (xive->src_blocks[i])
> +			kvmppc_xive_native_sync_sources(xive->src_blocks[i]);
> +	}
> +
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		kvmppc_xive_native_vcpu_eq_sync(vcpu);
> +	}
> +	mutex_unlock(&kvm->lock);
> +
> +	return 0;
> +}
> +
>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>  				       struct kvm_device_attr *attr)
>  {
> @@ -650,6 +714,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>  		switch (attr->attr) {
>  		case KVM_DEV_XIVE_RESET:
>  			return kvmppc_xive_reset(xive);
> +		case KVM_DEV_XIVE_EQ_SYNC:
> +			return kvmppc_xive_native_eq_sync(xive);
>  		}
>  		break;
>  	case KVM_DEV_XIVE_GRP_SOURCE:
> @@ -688,6 +754,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>  	case KVM_DEV_XIVE_GRP_CTRL:
>  		switch (attr->attr) {
>  		case KVM_DEV_XIVE_RESET:
> +		case KVM_DEV_XIVE_EQ_SYNC:
>  			return 0;
>  		}
>  		break;
> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
> index 267634eae9e0..a26be635cff9 100644
> --- a/Documentation/virtual/kvm/devices/xive.txt
> +++ b/Documentation/virtual/kvm/devices/xive.txt
> @@ -23,6 +23,12 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>      queues. To be used by kexec and kdump.
>      Errors: none
>  
> +    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
> +    Sync all the sources and queues and mark the EQ pages dirty. This
> +    to make sure that a consistent memory state is captured when
> +    migrating the VM.
> +    Errors: none
> +
>    2. KVM_DEV_XIVE_GRP_SOURCE (write only)
>    Initializes a new source in the XIVE device and mask it.
>    Attributes:
> @@ -95,3 +101,26 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>      -ENOENT: Unknown source number
>      -EINVAL: Not initialized source number, invalid priority or
>               invalid CPU number.
> +
> +* Migration:
> +
> +  Saving the state of a VM using the XIVE native exploitation mode
> +  should follow a specific sequence. When the VM is stopped :
> +
> +  1. Mask all sources (PQ=01) to stop the flow of events.
> +
> +  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
> +  flush any in-flight event notification and to stabilize the EQs. At
> +  this stage, the EQ pages are marked dirty to make sure they are
> +  transferred in the migration sequence.
> +
> +  3. Capture the state of the source targeting, the EQs configuration
> +  and the state of thread interrupt context registers.
> +
> +  Restore is similar :
> +
> +  1. Restore the EQ configuration. As targeting depends on it.
> +  2. Restore targeting
> +  3. Restore the thread interrupt contexts
> +  4. Restore the source states
> +  5. Let the vCPU run
Cédric Le Goater March 13, 2019, 11:48 a.m. UTC | #2
On 2/25/19 3:53 AM, David Gibson wrote:
> On Fri, Feb 22, 2019 at 12:28:33PM +0100, Cédric Le Goater wrote:
>> When migration of a VM is initiated, a first copy of the RAM is
>> transferred to the destination before the VM is stopped, but there is
>> no guarantee that the EQ pages in which the event notification are
>> queued have not been modified.
>>
>> To make sure migration will capture a consistent memory state, the
>> XIVE device should perform a XIVE quiesce sequence to stop the flow of
>> event notifications and stabilize the EQs. This is the purpose of the
>> KVM_DEV_XIVE_EQ_SYNC control which will also marks the EQ pages dirty
>> to force their transfer.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/include/uapi/asm/kvm.h        |  1 +
>>  arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
>>  Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
>>  3 files changed, 97 insertions(+)
>>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index 289c504b7c1d..cd78ad1020fe 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -678,6 +678,7 @@ struct kvm_ppc_cpu_char {
>>  /* POWER9 XIVE Native Interrupt Controller */
>>  #define KVM_DEV_XIVE_GRP_CTRL		1
>>  #define   KVM_DEV_XIVE_RESET		1
>> +#define   KVM_DEV_XIVE_EQ_SYNC		2
>>  #define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
>>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source attributes */
>>  #define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit eq attributes */
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index dd2a9d411fe7..3debc876d5a0 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -640,6 +640,70 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
>>  	return 0;
>>  }
>>  
>> +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
>> +{
>> +	int j;
>> +
>> +	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
>> +		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
>> +		struct xive_irq_data *xd;
>> +		u32 hw_num;
>> +
>> +		if (!state->valid)
>> +			continue;
>> +		if (state->act_priority == MASKED)
> 
> Is this correct?  If you masked an irq, then immediately did a sync,
> couldn't there still be some of the irqs in flight?  I thought the
> reason we needed a sync was that masking and other such operations
> _didn't_ implicitly synchronize.

The struct kvmppc_xive_irq_state reflects the state of the EAS 
configuration and not the state of the source. The source is masked 
setting the PQ bits to '-Q', which is what is being done before calling 
the KVM_DEV_XIVE_EQ_SYNC control. 

If a source EAS is configured, OPAL syncs the XIVE IC of the source and
the XIVE IC of the previous target if any.  

So I think we are fine.

C.
  

  
>> +			continue;
>> +
>> +		arch_spin_lock(&sb->lock);
>> +		kvmppc_xive_select_irq(state, &hw_num, &xd);
>> +		xive_native_sync_source(hw_num);
>> +		xive_native_sync_queue(hw_num);
>> +		arch_spin_unlock(&sb->lock);
>> +	}
>> +}
>> +
>> +static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> +	unsigned int prio;
>> +
>> +	if (!xc)
>> +		return -ENOENT;
>> +
>> +	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
>> +		struct xive_q *q = &xc->queues[prio];
>> +
>> +		if (!q->qpage)
>> +			continue;
>> +
>> +		/* Mark EQ page dirty for migration */
>> +		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qpage));
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
>> +{
>> +	struct kvm *kvm = xive->kvm;
>> +	struct kvm_vcpu *vcpu;
>> +	unsigned int i;
>> +
>> +	pr_devel("%s\n", __func__);
>> +
>> +	for (i = 0; i <= xive->max_sbid; i++) {
>> +		if (xive->src_blocks[i])
>> +			kvmppc_xive_native_sync_sources(xive->src_blocks[i]);
>> +	}
>> +
>> +	mutex_lock(&kvm->lock);
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		kvmppc_xive_native_vcpu_eq_sync(vcpu);
>> +	}
>> +	mutex_unlock(&kvm->lock);
>> +
>> +	return 0;
>> +}
>> +
>>  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  				       struct kvm_device_attr *attr)
>>  {
>> @@ -650,6 +714,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>>  		switch (attr->attr) {
>>  		case KVM_DEV_XIVE_RESET:
>>  			return kvmppc_xive_reset(xive);
>> +		case KVM_DEV_XIVE_EQ_SYNC:
>> +			return kvmppc_xive_native_eq_sync(xive);
>>  		}
>>  		break;
>>  	case KVM_DEV_XIVE_GRP_SOURCE:
>> @@ -688,6 +754,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>>  	case KVM_DEV_XIVE_GRP_CTRL:
>>  		switch (attr->attr) {
>>  		case KVM_DEV_XIVE_RESET:
>> +		case KVM_DEV_XIVE_EQ_SYNC:
>>  			return 0;
>>  		}
>>  		break;
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>> index 267634eae9e0..a26be635cff9 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -23,6 +23,12 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      queues. To be used by kexec and kdump.
>>      Errors: none
>>  
>> +    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
>> +    Sync all the sources and queues and mark the EQ pages dirty. This
>> +    to make sure that a consistent memory state is captured when
>> +    migrating the VM.
>> +    Errors: none
>> +
>>    2. KVM_DEV_XIVE_GRP_SOURCE (write only)
>>    Initializes a new source in the XIVE device and mask it.
>>    Attributes:
>> @@ -95,3 +101,26 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>>      -ENOENT: Unknown source number
>>      -EINVAL: Not initialized source number, invalid priority or
>>               invalid CPU number.
>> +
>> +* Migration:
>> +
>> +  Saving the state of a VM using the XIVE native exploitation mode
>> +  should follow a specific sequence. When the VM is stopped :
>> +
>> +  1. Mask all sources (PQ=01) to stop the flow of events.
>> +
>> +  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
>> +  flush any in-flight event notification and to stabilize the EQs. At
>> +  this stage, the EQ pages are marked dirty to make sure they are
>> +  transferred in the migration sequence.
>> +
>> +  3. Capture the state of the source targeting, the EQs configuration
>> +  and the state of thread interrupt context registers.
>> +
>> +  Restore is similar :
>> +
>> +  1. Restore the EQ configuration. As targeting depends on it.
>> +  2. Restore targeting
>> +  3. Restore the thread interrupt contexts
>> +  4. Restore the source states
>> +  5. Let the vCPU run
>
David Gibson March 14, 2019, 2:33 a.m. UTC | #3
On Wed, Mar 13, 2019 at 12:48:57PM +0100, Cédric Le Goater wrote:
> On 2/25/19 3:53 AM, David Gibson wrote:
> > On Fri, Feb 22, 2019 at 12:28:33PM +0100, Cédric Le Goater wrote:
> >> When migration of a VM is initiated, a first copy of the RAM is
> >> transferred to the destination before the VM is stopped, but there is
> >> no guarantee that the EQ pages in which the event notification are
> >> queued have not been modified.
> >>
> >> To make sure migration will capture a consistent memory state, the
> >> XIVE device should perform a XIVE quiesce sequence to stop the flow of
> >> event notifications and stabilize the EQs. This is the purpose of the
> >> KVM_DEV_XIVE_EQ_SYNC control which will also marks the EQ pages dirty
> >> to force their transfer.
> >>
> >> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> >> ---
> >>  arch/powerpc/include/uapi/asm/kvm.h        |  1 +
> >>  arch/powerpc/kvm/book3s_xive_native.c      | 67 ++++++++++++++++++++++
> >>  Documentation/virtual/kvm/devices/xive.txt | 29 ++++++++++
> >>  3 files changed, 97 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
> >> index 289c504b7c1d..cd78ad1020fe 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -678,6 +678,7 @@ struct kvm_ppc_cpu_char {
> >>  /* POWER9 XIVE Native Interrupt Controller */
> >>  #define KVM_DEV_XIVE_GRP_CTRL		1
> >>  #define   KVM_DEV_XIVE_RESET		1
> >> +#define   KVM_DEV_XIVE_EQ_SYNC		2
> >>  #define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
> >>  #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source attributes */
> >>  #define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit eq attributes */
> >> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
> >> index dd2a9d411fe7..3debc876d5a0 100644
> >> --- a/arch/powerpc/kvm/book3s_xive_native.c
> >> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> >> @@ -640,6 +640,70 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
> >>  	return 0;
> >>  }
> >>  
> >> +static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
> >> +{
> >> +	int j;
> >> +
> >> +	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
> >> +		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
> >> +		struct xive_irq_data *xd;
> >> +		u32 hw_num;
> >> +
> >> +		if (!state->valid)
> >> +			continue;
> >> +		if (state->act_priority == MASKED)
> > 
> > Is this correct?  If you masked an irq, then immediately did a sync,
> > couldn't there still be some of the irqs in flight?  I thought the
> > reason we needed a sync was that masking and other such operations
> > _didn't_ implicitly synchronize.
> 
> The struct kvmppc_xive_irq_state reflects the state of the EAS 
> configuration and not the state of the source. The source is masked 
> setting the PQ bits to '-Q', which is what is being done before calling 
> the KVM_DEV_XIVE_EQ_SYNC control. 
> 
> If a source EAS is configured, OPAL syncs the XIVE IC of the source and
> the XIVE IC of the previous target if any.  
> 
> So I think we are fine.

Ok.
diff mbox series

Patch

diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 289c504b7c1d..cd78ad1020fe 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -678,6 +678,7 @@  struct kvm_ppc_cpu_char {
 /* POWER9 XIVE Native Interrupt Controller */
 #define KVM_DEV_XIVE_GRP_CTRL		1
 #define   KVM_DEV_XIVE_RESET		1
+#define   KVM_DEV_XIVE_EQ_SYNC		2
 #define KVM_DEV_XIVE_GRP_SOURCE		2	/* 64-bit source attributes */
 #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG	3	/* 64-bit source attributes */
 #define KVM_DEV_XIVE_GRP_EQ_CONFIG	4	/* 64-bit eq attributes */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index dd2a9d411fe7..3debc876d5a0 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -640,6 +640,70 @@  static int kvmppc_xive_reset(struct kvmppc_xive *xive)
 	return 0;
 }
 
+static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
+{
+	int j;
+
+	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
+		struct xive_irq_data *xd;
+		u32 hw_num;
+
+		if (!state->valid)
+			continue;
+		if (state->act_priority == MASKED)
+			continue;
+
+		arch_spin_lock(&sb->lock);
+		kvmppc_xive_select_irq(state, &hw_num, &xd);
+		xive_native_sync_source(hw_num);
+		xive_native_sync_queue(hw_num);
+		arch_spin_unlock(&sb->lock);
+	}
+}
+
+static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	unsigned int prio;
+
+	if (!xc)
+		return -ENOENT;
+
+	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+		struct xive_q *q = &xc->queues[prio];
+
+		if (!q->qpage)
+			continue;
+
+		/* Mark EQ page dirty for migration */
+		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qpage));
+	}
+	return 0;
+}
+
+static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	unsigned int i;
+
+	pr_devel("%s\n", __func__);
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		if (xive->src_blocks[i])
+			kvmppc_xive_native_sync_sources(xive->src_blocks[i]);
+	}
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvmppc_xive_native_vcpu_eq_sync(vcpu);
+	}
+	mutex_unlock(&kvm->lock);
+
+	return 0;
+}
+
 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
 				       struct kvm_device_attr *attr)
 {
@@ -650,6 +714,8 @@  static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
 		switch (attr->attr) {
 		case KVM_DEV_XIVE_RESET:
 			return kvmppc_xive_reset(xive);
+		case KVM_DEV_XIVE_EQ_SYNC:
+			return kvmppc_xive_native_eq_sync(xive);
 		}
 		break;
 	case KVM_DEV_XIVE_GRP_SOURCE:
@@ -688,6 +754,7 @@  static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
 	case KVM_DEV_XIVE_GRP_CTRL:
 		switch (attr->attr) {
 		case KVM_DEV_XIVE_RESET:
+		case KVM_DEV_XIVE_EQ_SYNC:
 			return 0;
 		}
 		break;
diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
index 267634eae9e0..a26be635cff9 100644
--- a/Documentation/virtual/kvm/devices/xive.txt
+++ b/Documentation/virtual/kvm/devices/xive.txt
@@ -23,6 +23,12 @@  the legacy interrupt mode, referred as XICS (POWER7/8).
     queues. To be used by kexec and kdump.
     Errors: none
 
+    1.2 KVM_DEV_XIVE_EQ_SYNC (write only)
+    Sync all the sources and queues and mark the EQ pages dirty. This
+    to make sure that a consistent memory state is captured when
+    migrating the VM.
+    Errors: none
+
   2. KVM_DEV_XIVE_GRP_SOURCE (write only)
   Initializes a new source in the XIVE device and mask it.
   Attributes:
@@ -95,3 +101,26 @@  the legacy interrupt mode, referred as XICS (POWER7/8).
     -ENOENT: Unknown source number
     -EINVAL: Not initialized source number, invalid priority or
              invalid CPU number.
+
+* Migration:
+
+  Saving the state of a VM using the XIVE native exploitation mode
+  should follow a specific sequence. When the VM is stopped :
+
+  1. Mask all sources (PQ=01) to stop the flow of events.
+
+  2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to
+  flush any in-flight event notification and to stabilize the EQs. At
+  this stage, the EQ pages are marked dirty to make sure they are
+  transferred in the migration sequence.
+
+  3. Capture the state of the source targeting, the EQs configuration
+  and the state of thread interrupt context registers.
+
+  Restore is similar :
+
+  1. Restore the EQ configuration. As targeting depends on it.
+  2. Restore targeting
+  3. Restore the thread interrupt contexts
+  4. Restore the source states
+  5. Let the vCPU run