[2/2] x86/livepatch: Prevent patching with active waitqueues

Message ID 20191105194317.16232-3-andrew.cooper3@citrix.com (mailing list archive)
State New, archived

Commit Message

Andrew Cooper Nov. 5, 2019, 7:43 p.m. UTC
The safety of livepatching depends on every stack having been unwound, but
there is one corner case where this is not true.  The Sharing/Paging/Monitor
infrastructure may use waitqueues, which copy the stack frame sideways and
longjmp() to a different vcpu.

This case is rare, and can be worked around by pausing the offending
domain(s), waiting for their rings to drain, then performing a livepatch.

In the case that there is an active waitqueue, fail the livepatch attempt with
-EBUSY, which is preferable to the fireworks which occur from trying to unwind
the old stack frame at a later point.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Ross Lagerwall <ross.lagerwall@citrix.com>
CC: Juergen Gross <jgross@suse.com>

This fix wants backporting, and is long overdue for posting upstream.
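
For reference, the workaround above can be driven from dom0 with libxc.  A
minimal sketch (the domid list, error handling and the ring-drain step are
illustrative placeholders only, not part of this patch):

  /* Illustrative only: pause the offending domain(s), let their vm_event
   * rings drain, apply the livepatch, then unpause. */
  #include <stdint.h>
  #include <xenctrl.h>

  static int pause_for_livepatch(const uint32_t *domids, unsigned int nr)
  {
      xc_interface *xch = xc_interface_open(NULL, NULL, 0);
      unsigned int i;

      if ( !xch )
          return -1;

      for ( i = 0; i < nr; i++ )
          xc_domain_pause(xch, domids[i]);

      /* Wait for the vm_event rings to drain - consumer specific, not shown. */

      /* ... apply the livepatch here (e.g. via xen-livepatch) ... */

      for ( i = 0; i < nr; i++ )
          xc_domain_unpause(xch, domids[i]);

      xc_interface_close(xch);
      return 0;
  }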
---
 xen/arch/arm/livepatch.c    |  5 +++++
 xen/arch/x86/livepatch.c    | 39 +++++++++++++++++++++++++++++++++++++++
 xen/common/livepatch.c      |  7 +++++++
 xen/include/xen/livepatch.h |  1 +
 4 files changed, 52 insertions(+)

Comments

Konrad Rzeszutek Wilk Nov. 6, 2019, 2:25 p.m. UTC | #1
On Tue, Nov 05, 2019 at 07:43:17PM +0000, Andrew Cooper wrote:
> The safety of livepatching depends on every stack having been unwound, but
> there is one corner case where this is not true.  The Sharing/Paging/Monitor
> infrastructure may use waitqueues, which copy the stack frame sideways and
> longjmp() to a different vcpu.
> 
> This case is rare, and can be worked around by pausing the offending
> domain(s), waiting for their rings to drain, then performing a livepatch.
> 
> In the case that there is an active waitqueue, fail the livepatch attempt with
> -EBUSY, which is preferable to the fireworks which occur from trying to unwind
> the old stack frame at a later point.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

> CC: Ross Lagerwall <ross.lagerwall@citrix.com>
> CC: Juergen Gross <jgross@suse.com>
> 
> This fix wants backporting, and is long overdue for posting upstream.
> ---
>  xen/arch/arm/livepatch.c    |  5 +++++
>  xen/arch/x86/livepatch.c    | 39 +++++++++++++++++++++++++++++++++++++++
>  xen/common/livepatch.c      |  7 +++++++
>  xen/include/xen/livepatch.h |  1 +
>  4 files changed, 52 insertions(+)
> 
> diff --git a/xen/arch/arm/livepatch.c b/xen/arch/arm/livepatch.c
> index 00c5e2bc45..915e9d926a 100644
> --- a/xen/arch/arm/livepatch.c
> +++ b/xen/arch/arm/livepatch.c
> @@ -18,6 +18,11 @@
>  
>  void *vmap_of_xen_text;
>  
> +int arch_livepatch_safety_check(void)
> +{
> +    return 0;
> +}
> +
>  int arch_livepatch_quiesce(void)
>  {
>      mfn_t text_mfn;
> diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c
> index c82cf53b9e..0f129fa6b2 100644
> --- a/xen/arch/x86/livepatch.c
> +++ b/xen/arch/x86/livepatch.c
> @@ -14,6 +14,45 @@
>  #include <asm/nmi.h>
>  #include <asm/livepatch.h>
>  
> +static bool has_active_waitqueue(const struct vm_event_domain *ved)
> +{
> +    /* ved may be xzalloc()'d without INIT_LIST_HEAD() yet. */
> +    return (ved && !list_head_is_null(&ved->wq.list) &&
> +            !list_empty(&ved->wq.list));
> +}
> +
> +/*
> + * x86's implementation of waitqueue violates the livepatching safety principle
> + * of having unwound every CPU's stack before modifying live content.
> + *
> + * Search through every domain and check that no vCPUs have an active
> + * waitqueue.
> + */
> +int arch_livepatch_safety_check(void)
> +{
> +    struct domain *d;
> +
> +    for_each_domain ( d )
> +    {
> +#ifdef CONFIG_MEM_SHARING
> +        if ( has_active_waitqueue(d->vm_event_share) )
> +            goto fail;
> +#endif
> +#ifdef CONFIG_MEM_PAGING
> +        if ( has_active_waitqueue(d->vm_event_paging) )
> +            goto fail;
> +#endif
> +        if ( has_active_waitqueue(d->vm_event_monitor) )
> +            goto fail;
> +    }
> +
> +    return 0;
> +
> + fail:
> +    printk(XENLOG_ERR LIVEPATCH "%pd found with active waitqueue\n", d);
> +    return -EBUSY;
> +}
> +
>  int arch_livepatch_quiesce(void)
>  {
>      /* Disable WP to allow changes to read-only pages. */
> diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
> index 962647616a..27ee5bdeb7 100644
> --- a/xen/common/livepatch.c
> +++ b/xen/common/livepatch.c
> @@ -1060,6 +1060,13 @@ static int apply_payload(struct payload *data)
>      unsigned int i;
>      int rc;
>  
> +    rc = arch_livepatch_safety_check();
> +    if ( rc )
> +    {
> +        printk(XENLOG_ERR LIVEPATCH "%s: Safety checks failed\n", data->name);
> +        return rc;
> +    }
> +
>      printk(XENLOG_INFO LIVEPATCH "%s: Applying %u functions\n",
>              data->name, data->nfuncs);
>  
> diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h
> index 1b1817ca0d..69ede75d20 100644
> --- a/xen/include/xen/livepatch.h
> +++ b/xen/include/xen/livepatch.h
> @@ -104,6 +104,7 @@ static inline int livepatch_verify_distance(const struct livepatch_func *func)
>   * These functions are called around the critical region patching live code,
>   * for an architecture to make appropriate global state adjustments.
>   */
> +int arch_livepatch_safety_check(void);
>  int arch_livepatch_quiesce(void);
>  void arch_livepatch_revive(void);
>  
> -- 
> 2.11.0
>
Ross Lagerwall Nov. 8, 2019, 10:18 a.m. UTC | #2
On 11/5/19 7:43 PM, Andrew Cooper wrote:
> The safety of livepatching depends on every stack having been unwound, but
> there is one corner case where this is not true.  The Sharing/Paging/Monitor
> infrastructure may use waitqueues, which copy the stack frame sideways and
> longjmp() to a different vcpu.
> 
> This case is rare, and can be worked around by pausing the offending
> domain(s), waiting for their rings to drain, then performing a livepatch.
> 
> In the case that there is an active waitqueue, fail the livepatch attempt with
> -EBUSY, which is preferable to the fireworks which occur from trying to unwind
> the old stack frame at a later point.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
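
The has_active_waitqueue() check below copes with a struct vm_event_domain
that has been xzalloc()'d but not yet had INIT_LIST_HEAD() run on its
waitqueue, i.e. whose list head is still all zeroes.  list_head_is_null()
(presumably introduced by patch 1 of this series) is assumed to distinguish
that state from an initialised empty list; a minimal sketch of such a helper:

  /* Assumed semantics: a zero-filled (never initialised) list head has NULL
   * next/prev pointers, whereas an initialised empty list points back at
   * itself. */
  static inline bool list_head_is_null(const struct list_head *head)
  {
      return !head->next && !head->prev;
  }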

Patch

diff --git a/xen/arch/arm/livepatch.c b/xen/arch/arm/livepatch.c
index 00c5e2bc45..915e9d926a 100644
--- a/xen/arch/arm/livepatch.c
+++ b/xen/arch/arm/livepatch.c
@@ -18,6 +18,11 @@ 
 
 void *vmap_of_xen_text;
 
+int arch_livepatch_safety_check(void)
+{
+    return 0;
+}
+
 int arch_livepatch_quiesce(void)
 {
     mfn_t text_mfn;
diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c
index c82cf53b9e..0f129fa6b2 100644
--- a/xen/arch/x86/livepatch.c
+++ b/xen/arch/x86/livepatch.c
@@ -14,6 +14,45 @@ 
 #include <asm/nmi.h>
 #include <asm/livepatch.h>
 
+static bool has_active_waitqueue(const struct vm_event_domain *ved)
+{
+    /* ved may be xzalloc()'d without INIT_LIST_HEAD() yet. */
+    return (ved && !list_head_is_null(&ved->wq.list) &&
+            !list_empty(&ved->wq.list));
+}
+
+/*
+ * x86's implementation of waitqueue violates the livepatching safety principle
+ * of having unwound every CPU's stack before modifying live content.
+ *
+ * Search through every domain and check that no vCPUs have an active
+ * waitqueue.
+ */
+int arch_livepatch_safety_check(void)
+{
+    struct domain *d;
+
+    for_each_domain ( d )
+    {
+#ifdef CONFIG_MEM_SHARING
+        if ( has_active_waitqueue(d->vm_event_share) )
+            goto fail;
+#endif
+#ifdef CONFIG_MEM_PAGING
+        if ( has_active_waitqueue(d->vm_event_paging) )
+            goto fail;
+#endif
+        if ( has_active_waitqueue(d->vm_event_monitor) )
+            goto fail;
+    }
+
+    return 0;
+
+ fail:
+    printk(XENLOG_ERR LIVEPATCH "%pd found with active waitqueue\n", d);
+    return -EBUSY;
+}
+
 int arch_livepatch_quiesce(void)
 {
     /* Disable WP to allow changes to read-only pages. */
diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
index 962647616a..27ee5bdeb7 100644
--- a/xen/common/livepatch.c
+++ b/xen/common/livepatch.c
@@ -1060,6 +1060,13 @@  static int apply_payload(struct payload *data)
     unsigned int i;
     int rc;
 
+    rc = arch_livepatch_safety_check();
+    if ( rc )
+    {
+        printk(XENLOG_ERR LIVEPATCH "%s: Safety checks failed\n", data->name);
+        return rc;
+    }
+
     printk(XENLOG_INFO LIVEPATCH "%s: Applying %u functions\n",
             data->name, data->nfuncs);
 
diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h
index 1b1817ca0d..69ede75d20 100644
--- a/xen/include/xen/livepatch.h
+++ b/xen/include/xen/livepatch.h
@@ -104,6 +104,7 @@  static inline int livepatch_verify_distance(const struct livepatch_func *func)
  * These functions are called around the critical region patching live code,
  * for an architecture to make appropriate global state adjustments.
  */
+int arch_livepatch_safety_check(void);
 int arch_livepatch_quiesce(void);
 void arch_livepatch_revive(void);