diff mbox series

[v3,2/2] x86/hvm: finish IOREQs correctly on completion path

Message ID 1552602649-14358-2-git-send-email-igor.druzhinin@citrix.com (mailing list archive)
State Superseded
Headers show
Series [v3,1/2] x86/hvm: split all linear reads and writes at page boundary | expand

Commit Message

Igor Druzhinin March 14, 2019, 10:30 p.m. UTC
Since the introduction of linear_{read,write}() helpers in 3bdec530a5
(x86/HVM: split page straddling emulated accesses in more cases) the
completion path for IOREQs has been broken: if there is an IOREQ in
progress but hvm_copy_{to,from}_guest_linear() returns HVMTRANS_okay
(e.g. when P2M type of source/destination has been changed by IOREQ
handler) the execution will never re-enter hvmemul_do_io() where
IOREQs are completed. This usually results in a domain crash upon
the execution of the next IOREQ entering hvmemul_do_io() and finding
the remnants of the previous IOREQ in the state machine.

This particular issue has been discovered in relation to p2m_ioreq_server
type where an emulator changed the memory type between p2m_ioreq_server
and p2m_ram_rw in the process of responding to an IOREQ, which made
hvm_copy_..() behave differently on the way back.

Fix it for now by checking if IOREQ completion is required (which
can be identified by querying MMIO cache) before trying to finish
a memory access immediately through hvm_copy_..(), re-enter
hvmemul_do_io() otherwise. This change alone addresses IOREQ
completion issue where P2M type is modified in the middle of emulation
but is not enough for a more general case where machine state
arbitrarily changes behind our back.

Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com>
---
Changes in v3:
* made it more clear that it's still a partial fix in the commit description
* other minor suggestions
---
 xen/arch/x86/hvm/emulate.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

Comments

Paul Durrant March 15, 2019, 9:28 a.m. UTC | #1
> -----Original Message-----
> From: Igor Druzhinin [mailto:igor.druzhinin@citrix.com]
> Sent: 14 March 2019 22:31
> To: xen-devel@lists.xenproject.org
> Cc: Paul Durrant <Paul.Durrant@citrix.com>; jbeulich@suse.com; Andrew Cooper
> <Andrew.Cooper3@citrix.com>; Wei Liu <wei.liu2@citrix.com>; Roger Pau Monne <roger.pau@citrix.com>;
> Igor Druzhinin <igor.druzhinin@citrix.com>
> Subject: [PATCH v3 2/2] x86/hvm: finish IOREQs correctly on completion path
> 
> Since the introduction of linear_{read,write}() helpers in 3bdec530a5
> (x86/HVM: split page straddling emulated accesses in more cases) the
> completion path for IOREQs has been broken: if there is an IOREQ in
> progress but hvm_copy_{to,from}_guest_linear() returns HVMTRANS_okay
> (e.g. when P2M type of source/destination has been changed by IOREQ
> handler) the execution will never re-enter hvmemul_do_io() where
> IOREQs are completed. This usually results in a domain crash upon
> the execution of the next IOREQ entering hvmemul_do_io() and finding
> the remnants of the previous IOREQ in the state machine.
> 
> This particular issue has been discovered in relation to p2m_ioreq_server
> type where an emulator changed the memory type between p2m_ioreq_server
> and p2m_ram_rw in process of responding to IOREQ which made
> hvm_copy_..() to behave differently on the way back.
> 
> Fix it for now by checking if IOREQ completion is required (which
> can be identified by quering MMIO cache) before trying to finish

^ querying

> a memory access immediately through hvm_copy_..(), re-enter
> hvmemul_do_io() otherwise. This change alone addresses IOREQ
> completion issue where P2M type is modified in the middle of emulation
> but is not enough for a more general case where machine state
> arbitrarely changes behind our back.

^ arbitrarily

> 
> Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com>
> ---
> Changes in v3:
> * made it more clear that it's still a partial fix in the commit description
> * other minor suggestions
> ---
>  xen/arch/x86/hvm/emulate.c | 31 +++++++++++++++++++++++++------
>  1 file changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
> index 4879ccb..92a9b82 100644
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -952,7 +952,7 @@ static int hvmemul_phys_mmio_access(
>   * cache indexed by linear MMIO address.
>   */
>  static struct hvm_mmio_cache *hvmemul_find_mmio_cache(
> -    struct hvm_vcpu_io *vio, unsigned long gla, uint8_t dir)
> +    struct hvm_vcpu_io *vio, unsigned long gla, uint8_t dir, bool create)
>  {
>      unsigned int i;
>      struct hvm_mmio_cache *cache;
> @@ -966,6 +966,9 @@ static struct hvm_mmio_cache *hvmemul_find_mmio_cache(
>              return cache;
>      }
> 
> +    if ( !create )
> +        return NULL;
> +
>      i = vio->mmio_cache_count;
>      if( i == ARRAY_SIZE(vio->mmio_cache) )
>          return NULL;
> @@ -1000,7 +1003,7 @@ static int hvmemul_linear_mmio_access(
>  {
>      struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
>      unsigned long offset = gla & ~PAGE_MASK;
> -    struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(vio, gla, dir);
> +    struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(vio, gla, dir, true);
>      unsigned int chunk, buffer_offset = 0;
>      paddr_t gpa;
>      unsigned long one_rep = 1;
> @@ -1089,8 +1092,9 @@ static int linear_read(unsigned long addr, unsigned int bytes, void *p_data,
>                         uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
>  {
>      pagefault_info_t pfinfo;
> +    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
>      unsigned int offset = addr & ~PAGE_MASK;
> -    int rc;
> +    int rc = HVMTRANS_bad_gfn_to_mfn;
> 
>      if ( offset + bytes > PAGE_SIZE )
>      {
> @@ -1104,7 +1108,14 @@ static int linear_read(unsigned long addr, unsigned int bytes, void *p_data,
>          return rc;
>      }
> 
> -    rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
> +    /*
> +     * If there is an MMIO cache entry for that access then we must be re-issuing

^ s/that/the

> +     * an access that was previously handled as MMIO. Thus it is imperative that
> +     * we handle this access in the same way to guarantee completion and hence
> +     * clean up any interim state.
> +     */
> +    if ( !hvmemul_find_mmio_cache(vio, addr, IOREQ_READ, false) )
> +        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
> 
>      switch ( rc )
>      {
> @@ -1134,8 +1145,9 @@ static int linear_write(unsigned long addr, unsigned int bytes, void *p_data,
>                          uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
>  {
>      pagefault_info_t pfinfo;
> +    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
>      unsigned int offset = addr & ~PAGE_MASK;
> -    int rc;
> +    int rc = HVMTRANS_bad_gfn_to_mfn;
> 
>      if ( offset + bytes > PAGE_SIZE )
>      {
> @@ -1149,7 +1161,14 @@ static int linear_write(unsigned long addr, unsigned int bytes, void *p_data,
>          return rc;
>      }
> 
> -    rc = hvm_copy_to_guest_linear(addr, p_data, bytes, pfec, &pfinfo);
> +    /*
> +     * If there is an MMIO cache entry for that acces then we must be re-issuing

Same here.

With these fixed...

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>


> +     * an access that was previously handled as MMIO. Thus it is imperative that
> +     * we handle this access in the same way to guarantee completion and hence
> +     * clean up any interim state.
> +     */
> +    if ( !hvmemul_find_mmio_cache(vio, addr, IOREQ_WRITE, false) )
> +        rc = hvm_copy_to_guest_linear(addr, p_data, bytes, pfec, &pfinfo);
> 
>      switch ( rc )
>      {
> --
> 2.7.4
Jan Beulich March 15, 2019, 12:27 p.m. UTC | #2
>>> On 14.03.19 at 23:30, <igor.druzhinin@citrix.com> wrote:
> Since the introduction of linear_{read,write}() helpers in 3bdec530a5
> (x86/HVM: split page straddling emulated accesses in more cases) the
> completion path for IOREQs has been broken: if there is an IOREQ in
> progress but hvm_copy_{to,from}_guest_linear() returns HVMTRANS_okay
> (e.g. when P2M type of source/destination has been changed by IOREQ
> handler) the execution will never re-enter hvmemul_do_io() where
> IOREQs are completed. This usually results in a domain crash upon
> the execution of the next IOREQ entering hvmemul_do_io() and finding
> the remnants of the previous IOREQ in the state machine.

From an archeological pov I'm not sure you point at the offending
commit: I'd rather expect d7bff2bc00 ("x86/HVM: __hvm_copy()
should not write to p2m_ioreq_server pages") to be the culprit,
which went in two months later.

> This particular issue has been discovered in relation to p2m_ioreq_server
> type where an emulator changed the memory type between p2m_ioreq_server
> and p2m_ram_rw in process of responding to IOREQ which made
> hvm_copy_..() to behave differently on the way back.
> 
> Fix it for now by checking if IOREQ completion is required (which
> can be identified by quering MMIO cache) before trying to finish
> a memory access immediately through hvm_copy_..(), re-enter
> hvmemul_do_io() otherwise. This change alone addresses IOREQ
> completion issue where P2M type is modified in the middle of emulation
> but is not enough for a more general case where machine state
> arbitrarely changes behind our back.

I'm afraid this still claims to address cases which don't get fixed
here. For example, take a page changing _to_ p2m_ioreq_server
behind our backs: You won't find an MMIO cache entry for it,
hvm_copy_to_guest_linear() will fail, and you'll try to issue an
MMIO write when in reality the write was already done (emulated
for whatever other reason, e.g. introspection). This example
may be pretty contrived, but Andrew's ballooning scenario really
applies both ways (balloon-in and balloon-out), while the change
deals only with the balloon-in case.

So while I'm fine with the code change, I'd still like to ask to
further refine the description.

Jan
Igor Druzhinin March 15, 2019, 1:05 p.m. UTC | #3
On 15/03/2019 12:27, Jan Beulich wrote:
>>>> On 14.03.19 at 23:30, <igor.druzhinin@citrix.com> wrote:
>> Since the introduction of linear_{read,write}() helpers in 3bdec530a5
>> (x86/HVM: split page straddling emulated accesses in more cases) the
>> completion path for IOREQs has been broken: if there is an IOREQ in
>> progress but hvm_copy_{to,from}_guest_linear() returns HVMTRANS_okay
>> (e.g. when P2M type of source/destination has been changed by IOREQ
>> handler) the execution will never re-enter hvmemul_do_io() where
>> IOREQs are completed. This usually results in a domain crash upon
>> the execution of the next IOREQ entering hvmemul_do_io() and finding
>> the remnants of the previous IOREQ in the state machine.
> 
> From an archeological pov I'm not sure you point at the offending
> commit: I'd rather expect d7bff2bc00 ("x86/HVM: __hvm_copy()
> should not write to p2m_ioreq_server pages") to be the culprit,
> which went in two months later.
> 
>> This particular issue has been discovered in relation to p2m_ioreq_server
>> type where an emulator changed the memory type between p2m_ioreq_server
>> and p2m_ram_rw in process of responding to IOREQ which made
>> hvm_copy_..() to behave differently on the way back.
>>
>> Fix it for now by checking if IOREQ completion is required (which
>> can be identified by quering MMIO cache) before trying to finish
>> a memory access immediately through hvm_copy_..(), re-enter
>> hvmemul_do_io() otherwise. This change alone addresses IOREQ
>> completion issue where P2M type is modified in the middle of emulation
>> but is not enough for a more general case where machine state
>> arbitrarely changes behind our back.
> 
> I'm afraid this still claims to address cases which don't get fixed
> here. For example, take a page changing _to_ p2m_ioreq_server
> behind our backs: You won't find an MMIO cache entry for it,
> hvm_copy_to_guest_linear() will fail, and you'll try to issue an
> MMIO write when in reality the write was already done (emulated
> for whatever other reason, e.g. introspection). This example
> may be pretty contrived, but Andrew's ballooning scenario really
> applies both ways (balloon-in and balloon-out), while the change
> deals only with the balloon-in case.
> 
> So while I'm fine with the code change, I'd still like to ask to
> further refine the description.

Thanks for clarification. I discussed with Paul - there is definitely
still a hole in general case where 1st half of the instruction is memory
and 2nd half is MMIO and the 1st half is changed *to* MMIO. But it's
hard to deal with these types of accesses without a complete re-write of
MMIO cache into general insn access cache - so to lift it up to
linear_{read,write} layer. I hope my understanding is now correct and
I'll put into the description. Until then the fix should do fine with
scenarios we're seeing.

Igor
Jan Beulich March 15, 2019, 1:16 p.m. UTC | #4
>>> On 15.03.19 at 14:05, <igor.druzhinin@citrix.com> wrote:
> Thanks for clarification. I discussed with Paul - there is definitely
> still a hole in general case where 1st half of the instruction is memory
> and 2nd half is MMIO and the 1st half is changed *to* MMIO. But it's
> hard to deal with these types of accesses without a complete re-write of
> MMIO cache into general insn access cache - so to lift it up to
> linear_{read,write} layer. I hope my understanding is now correct and
> I'll put into the description.

Well, mostly. With patch 1 there's no dependency anymore on an
access to be page straddling afaict. I.e. the scenarios I gave
should apply also to aligned accesses.

> Until then the fix should do fine with scenarios we're seeing.

Indeed.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 4879ccb..92a9b82 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -952,7 +952,7 @@  static int hvmemul_phys_mmio_access(
  * cache indexed by linear MMIO address.
  */
 static struct hvm_mmio_cache *hvmemul_find_mmio_cache(
-    struct hvm_vcpu_io *vio, unsigned long gla, uint8_t dir)
+    struct hvm_vcpu_io *vio, unsigned long gla, uint8_t dir, bool create)
 {
     unsigned int i;
     struct hvm_mmio_cache *cache;
@@ -966,6 +966,9 @@  static struct hvm_mmio_cache *hvmemul_find_mmio_cache(
             return cache;
     }
 
+    if ( !create )
+        return NULL;
+
     i = vio->mmio_cache_count;
     if( i == ARRAY_SIZE(vio->mmio_cache) )
         return NULL;
@@ -1000,7 +1003,7 @@  static int hvmemul_linear_mmio_access(
 {
     struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
     unsigned long offset = gla & ~PAGE_MASK;
-    struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(vio, gla, dir);
+    struct hvm_mmio_cache *cache = hvmemul_find_mmio_cache(vio, gla, dir, true);
     unsigned int chunk, buffer_offset = 0;
     paddr_t gpa;
     unsigned long one_rep = 1;
@@ -1089,8 +1092,9 @@  static int linear_read(unsigned long addr, unsigned int bytes, void *p_data,
                        uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     pagefault_info_t pfinfo;
+    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
     unsigned int offset = addr & ~PAGE_MASK;
-    int rc;
+    int rc = HVMTRANS_bad_gfn_to_mfn;
 
     if ( offset + bytes > PAGE_SIZE )
     {
@@ -1104,7 +1108,14 @@  static int linear_read(unsigned long addr, unsigned int bytes, void *p_data,
         return rc;
     }
 
-    rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
+    /*
+     * If there is an MMIO cache entry for the access then we must be re-issuing
+     * an access that was previously handled as MMIO. Thus it is imperative that
+     * we handle this access in the same way to guarantee completion and hence
+     * clean up any interim state.
+     */
+    if ( !hvmemul_find_mmio_cache(vio, addr, IOREQ_READ, false) )
+        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
 
     switch ( rc )
     {
@@ -1134,8 +1145,9 @@  static int linear_write(unsigned long addr, unsigned int bytes, void *p_data,
                         uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     pagefault_info_t pfinfo;
+    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
     unsigned int offset = addr & ~PAGE_MASK;
-    int rc;
+    int rc = HVMTRANS_bad_gfn_to_mfn;
 
     if ( offset + bytes > PAGE_SIZE )
     {
@@ -1149,7 +1161,14 @@  static int linear_write(unsigned long addr, unsigned int bytes, void *p_data,
         return rc;
     }
 
-    rc = hvm_copy_to_guest_linear(addr, p_data, bytes, pfec, &pfinfo);
+    /*
+     * If there is an MMIO cache entry for the access then we must be re-issuing
+     * an access that was previously handled as MMIO. Thus it is imperative that
+     * we handle this access in the same way to guarantee completion and hence
+     * clean up any interim state.
+     */
+    if ( !hvmemul_find_mmio_cache(vio, addr, IOREQ_WRITE, false) )
+        rc = hvm_copy_to_guest_linear(addr, p_data, bytes, pfec, &pfinfo);
 
     switch ( rc )
     {