diff mbox series

[v7,06/10] microcode: split out apply_microcode() from cpu_request_microcode()

Message ID 1558945891-3015-7-git-send-email-chao.gao@intel.com (mailing list archive)
State Superseded
Headers show
Series improve late microcode loading | expand

Commit Message

Chao Gao May 27, 2019, 8:31 a.m. UTC
During late microcode update, apply_microcode() is invoked in
cpu_request_microcode(). To make late microcode update more reliable,
we want to put the apply_microcode() into stop_machine context. So
we split out it from cpu_request_microcode(). As a consequence,
apply_microcode() should be invoked explicitly in the common code.

Previously, apply_microcode() gets the microcode patch to be applied from
the microcode cache. Now, the patch is passed as a function argument and
a patch is cached for cpu-hotplug and cpu resuming, only after it has
been loaded to a cpu without any error. As a consequence, the
'match_cpu' check in microcode_update_cache is removed, which otherwise
would fail.

Assuming that all CPUs have the same signature, one patch matching with
current CPU should match with others. Then parsing microcode only needs
to be done once; cpu_request_microcode() is also moved out of
microcode_update_cpu().

On AMD side, svm_host_osvw_init() is supposed to be called after
microcode update. As apply_micrcode() won't be called by
cpu_request_microcode() now, svm_host_osvw_init() is moved to the
end of apply_microcode().

Signed-off-by: Chao Gao <chao.gao@intel.com>
---
Changes in v7:
 - to handle load failure, unvalidated patches won't be cached. They
 are passed as function arguments. So if update failed, we needn't
 any cleanup to microcode cache.
 - microcode_info which passes microcode blob to be parsed to each CPU is
 replaced by microcode_patch.

Changes in v6:
 - during early microcode update, BSP and APs call different functions.
   Thus AP can bypass parsing microcode blob.
---
 xen/arch/x86/acpi/power.c       |   2 +-
 xen/arch/x86/microcode.c        | 209 ++++++++++++++++++++++++++--------------
 xen/arch/x86/microcode_amd.c    |  41 ++++----
 xen/arch/x86/microcode_intel.c  |  69 ++++++-------
 xen/arch/x86/smpboot.c          |   5 +-
 xen/include/asm-x86/microcode.h |   8 +-
 xen/include/asm-x86/processor.h |   3 +-
 7 files changed, 193 insertions(+), 144 deletions(-)

Comments

Jan Beulich June 5, 2019, 12:37 p.m. UTC | #1
>>> On 27.05.19 at 10:31, <chao.gao@intel.com> wrote:
> During late microcode update, apply_microcode() is invoked in
> cpu_request_microcode(). To make late microcode update more reliable,
> we want to put the apply_microcode() into stop_machine context. So
> we split out it from cpu_request_microcode(). As a consequence,
> apply_microcode() should be invoked explicitly in the common code.
> 
> Previously, apply_microcode() gets the microcode patch to be applied from
> the microcode cache. Now, the patch is passed as a function argument and
> a patch is cached for cpu-hotplug and cpu resuming, only after it has
> been loaded to a cpu without any error. As a consequence, the
> 'match_cpu' check in microcode_update_cache is removed, which otherwise
> would fail.

The "only after it has been loaded to a cpu without any error" is a
problem, precisely for the case where ucode on the different cores
is not in sync initially. I would actually like to put up this question:
When a core has no ucode loaded at all yet and only strictly older
(than loaded on some other cores) ucode is found to be available,
whether then it wouldn't still be better to apply that ucode to
_at least_ the cores that have none loaded yet.

To get the system into "sane" state it may even be necessary to
downgrade ucode on the cores which did have it loaded already,
in such a situation.

> On AMD side, svm_host_osvw_init() is supposed to be called after
> microcode update. As apply_micrcode() won't be called by
> cpu_request_microcode() now, svm_host_osvw_init() is moved to the
> end of apply_microcode().

I guess this really ought to become a vendor hook as well, but I
wouldn't insist on you doing so here.

> --- a/xen/arch/x86/acpi/power.c
> +++ b/xen/arch/x86/acpi/power.c
> @@ -253,7 +253,7 @@ static int enter_state(u32 state)
>  
>      console_end_sync();
>  
> -    microcode_resume_cpu();
> +    early_microcode_update_cpu();

The use here, the (changed) use in start_secondary(), and the dropping
of its __init suggest to make an attempt to find a better name for the
function. Maybe microcode_update_one()?

> +/*
> + * Load a microcode update to current CPU.
> + *
> + * If no patch is provided, the cached patch will be loaded. Microcode update
> + * during APs bringup and CPU resuming falls into this case.
> + */
> +static int microcode_update_cpu(struct microcode_patch *patch)

const?

>  {
> -    int err;
> -    struct cpu_signature *sig = &this_cpu(cpu_sig);
> +    int ret = microcode_ops->collect_cpu_info(&this_cpu(cpu_sig));
>  
> -    if ( !microcode_ops )
> -        return 0;
> +    if ( unlikely(ret) )
> +        return ret;
>  
>      spin_lock(&microcode_mutex);
>  
> -    err = microcode_ops->collect_cpu_info(sig);
> -    if ( likely(!err) )
> -        err = microcode_ops->apply_microcode();
> -    spin_unlock(&microcode_mutex);
> +    if ( patch )
> +    {
> +        /*
> +         * If a patch is specified, it should has newer revision than
> +         * that of the patch cached.
> +         */
> +        if ( microcode_cache &&
> +             microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
> +        {
> +            spin_unlock(&microcode_mutex);
> +            return -EINVAL;
> +        }
>  
> -    return err;
> -}
> +        ret = microcode_ops->apply_microcode(patch);

There's no printk() here but ...

> +    }
> +    else if ( microcode_cache )
> +    {
> +        ret = microcode_ops->apply_microcode(microcode_cache);
> +        if ( ret == -EIO )
> +            printk("Update failed. Reboot needed\n");

... you emit a log message here. Why the difference? And wouldn't
it be better to have just a single call to ->apply anyway, by simply
assigning "microcode_cache" to "patch" and moving the call a little
further down?

> +    }
> +    else
> +        /* No patch to update */
> +        ret = -EINVAL;

-ENOENT?

> @@ -247,46 +270,32 @@ bool microcode_update_cache(struct microcode_patch *patch)
>      return true;
>  }
>  
> -static int microcode_update_cpu(const void *buf, size_t size)
> +static long do_microcode_update(void *patch)
>  {
> -    int err;
> -    unsigned int cpu = smp_processor_id();
> -    struct cpu_signature *sig = &per_cpu(cpu_sig, cpu);
> -
> -    spin_lock(&microcode_mutex);
> +    int error, cpu;

While "int" is fine for error codes, it almost certainly wants to be
"unsigned int" for "cpu". The more that it had been that was before.
I also don't see why you need to switch from "err" to "error" - oh,
...

> -    err = microcode_ops->collect_cpu_info(sig);
> -    if ( likely(!err) )
> -        err = microcode_ops->cpu_request_microcode(buf, size);
> -    spin_unlock(&microcode_mutex);
> -
> -    return err;
> -}
> -
> -static long do_microcode_update(void *_info)
> -{
> -    struct microcode_info *info = _info;
> -    int error;
> +    error = microcode_update_cpu(patch);

... there was a pre-existing variable of that name here.

> +    if ( error )
> +    {
> +        microcode_ops->free_patch(microcode_cache);

Does this also set "microcode_cache" to NULL? I didn't think so.
It's anyway not really clear why _all_ forms of errors should lead
to clearing of the cache. However - looking at the code further
down, don't you rather mean to free "patch" here anyway?

> +        return error;
> +    }
>  
> -    BUG_ON(info->cpu != smp_processor_id());
>  
> -    error = microcode_update_cpu(info->buffer, info->buffer_size);
> -    if ( error )
> -        info->error = error;
> +    cpu = cpumask_next(smp_processor_id(), &cpu_online_map);
> +    if ( cpu < nr_cpu_ids )
> +        return continue_hypercall_on_cpu(cpu, do_microcode_update, patch);
>  
> -    info->cpu = cpumask_next(info->cpu, &cpu_online_map);
> -    if ( info->cpu < nr_cpu_ids )
> -        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
> +    microcode_update_cache(patch);

Independent of my remarks at the top I would think that updating of
the cache should happen after the first successful loading on a CPU,
not after all CPUs have been updated successfully. There would then
also not be any need to pass "patch" on to continue_hypercall_on_cpu()
a few lines up from here (albeit from a general logic perspective it may
indeed be easier to keep it that way).

> @@ -294,32 +303,49 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
>      if ( microcode_ops == NULL )
>          return -EINVAL;
>  
> -    info = xmalloc_bytes(sizeof(*info) + len);
> -    if ( info == NULL )
> -        return -ENOMEM;
> -
> -    ret = copy_from_guest(info->buffer, buf, len);
> -    if ( ret != 0 )
> +    buffer = xmalloc_bytes(len);
> +    if ( !buffer )
>      {
> -        xfree(info);
> -        return ret;
> +        ret = -ENOMEM;
> +        goto free;
>      }
>  
> -    info->buffer_size = len;
> -    info->error = 0;
> -    info->cpu = cpumask_first(&cpu_online_map);
> +    if ( copy_from_guest(buffer, buf, len) )
> +    {
> +        ret = -EFAULT;
> +        goto free;
> +    }
>  
>      if ( microcode_ops->start_update )
>      {
>          ret = microcode_ops->start_update();
>          if ( ret != 0 )
> -        {
> -            xfree(info);
> -            return ret;
> -        }
> +            goto free;
>      }
>  
> -    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
> +    patch = microcode_parse_blob(buffer, len);
> +    if ( IS_ERR(patch) )
> +    {
> +        printk(XENLOG_ERR "Parsing microcode blob error %ld\n", PTR_ERR(patch));
> +        ret = PTR_ERR(patch);

So I assume we would get here when the system already has the
newest (or even newer) ucode loaded. That's not an error, and
imo no log message suggesting so should be issued. Perhaps the
parsing code could return NULL to indicate so? Although, judging
by the code further down you already expect NULL potentially
coming back, but I can't seem to be able to figure the condition(s).

Also please switch the two lines around and use "ret" in the printk()
invocation.

> +        goto free;
> +    }
> +
> +    if ( !microcode_ops->match_cpu(patch) )
> +    {
> +        printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");

I assume the "matching" here is meant to cover the CPU signature?
The wording is ambiguous this way, because it could also mean there
was no ucode found matching that which is already loaded (which, as
per above, may end up being relevant).

Furthermore - why this check, when microcode_parse_blob() already
looks for something that's newer than what is currently loaded (and
matches the CPU signature)?

> @@ -362,13 +397,41 @@ int __init early_microcode_update_cpu(bool start_update)
>      }
>      if ( data )
>      {
> -        if ( start_update && microcode_ops->start_update )
> +        struct microcode_patch *patch;
> +
> +        if ( microcode_ops->start_update )
>              rc = microcode_ops->start_update();
>  
>          if ( rc )
>              return rc;
>  
> -        return microcode_update_cpu(data, len);
> +        patch = microcode_parse_blob(data, len);
> +        if ( IS_ERR(patch) )
> +        {
> +            printk(XENLOG_ERR "Parsing microcode blob error %ld\n",
> +                   PTR_ERR(patch));
> +            return PTR_ERR(patch);
> +        }
> +
> +        if ( !microcode_ops->match_cpu(patch) )
> +        {
> +            printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");
> +            if ( patch )
> +                microcode_ops->free_patch(patch);
> +            return -EINVAL;
> +        }

Same remarks here then.

> @@ -292,6 +291,10 @@ static int apply_microcode(void)
>  
>      sig->rev = rev;
>  
> +#ifdef CONFIG_HVM
> +    svm_host_osvw_init();
> +#endif
> +
>      return 0;
>  }

While this now sits on the success path only, ...

> @@ -592,17 +596,10 @@ static int cpu_request_microcode(const void *buf, size_t bufsize)
>      }
>  
>    out:
> -#if CONFIG_HVM
> -    svm_host_osvw_init();
> -#endif
> +    if ( error && !patch )
> +        patch = ERR_PTR(error);
>  
> -    /*
> -     * In some cases we may return an error even if processor's microcode has
> -     * been updated. For example, the first patch in a container file is loaded
> -     * successfully but subsequent container file processing encounters a
> -     * failure.
> -     */
> -    return error;
> +    return patch;
>  }

... previously it has also been invoked in the error case. See the
comment in start_update().

> --- a/xen/arch/x86/microcode_intel.c
> +++ b/xen/arch/x86/microcode_intel.c
> @@ -273,46 +273,27 @@ static enum microcode_match_result compare_patch(
>                                    old_header->pf, old_header->rev);
>  }
>  
> -/*
> - * return 0 - no update found
> - * return 1 - found update
> - * return < 0 - error
> - */
> -static int get_matching_microcode(const void *mc)
> +static struct microcode_patch *allow_microcode_patch(

Did you perhaps mean this to be alloc_microcode_patch()?

> @@ -388,26 +368,39 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
>      return offset + total_size;
>  }
>  
> -static int cpu_request_microcode(const void *buf, size_t size)
> +static struct microcode_patch *cpu_request_microcode(const void *buf,
> +                                                     size_t size)
>  {
>      long offset = 0;
>      int error = 0;
>      void *mc;
> +    struct microcode_patch *patch = NULL;
>  
>      while ( (offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0 )
>      {
> +        struct microcode_patch *new_patch;
> +
>          error = microcode_sanity_check(mc);
>          if ( error )
>              break;
> -        error = get_matching_microcode(mc);
> -        if ( error < 0 )
> +
> +        new_patch = allow_microcode_patch(mc);
> +        if ( IS_ERR(new_patch) )
> +        {
> +            error = PTR_ERR(new_patch);
>              break;
> -        /*
> -         * It's possible the data file has multiple matching ucode,
> -         * lets keep searching till the latest version
> -         */
> -        if ( error == 1 )
> -            error = 0;
> +        }
> +
> +        /* Compare patches and store the one with higher revision */
> +        if ( !patch && match_cpu(new_patch) )
> +            patch = new_patch;
> +        else if ( patch && (compare_patch(new_patch, patch) == NEW_UCODE) )

At least the appearance of this is misleading, but unless I'm missing
something it's actually wrong: You seem to imply that from
match_cpu(patch1) returning true and compare_patch(patch2, patch1)
returning true it follows that also match_cpu(patch2) would return
true. But I don't think that's the case, because of how the "pf" field
works.

Even in case I've overlooked an implicit match_cpu(new_patch) I'd
like to ask for this to be clarified, either by changing the code
structure, or by attaching a suitable comment.

> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -363,10 +363,7 @@ void start_secondary(void *unused)
>  
>      initialize_cpu_data(cpu);
>  
> -    if ( system_state <= SYS_STATE_smp_boot )
> -        early_microcode_update_cpu(false);
> -    else
> -        microcode_resume_cpu();
> +    early_microcode_update_cpu();

I'm struggling to understand how you get away without the "false"
argument that was passed here before. You look to now be calling
->start_update() unconditionally (so long as the hook is not NULL),
which I don't think is correct. This should be called only once by
the CPU _leading_ an update (the BSP during boot, and the CPU
the hypercall gets invoked on (or the first CPU an update gets
issued one) for a late update. Am I missing something?

Jan
Chao Gao June 11, 2019, 3:32 a.m. UTC | #2
On Wed, Jun 05, 2019 at 06:37:27AM -0600, Jan Beulich wrote:
>>>> On 27.05.19 at 10:31, <chao.gao@intel.com> wrote:
>> During late microcode update, apply_microcode() is invoked in
>> cpu_request_microcode(). To make late microcode update more reliable,
>> we want to put the apply_microcode() into stop_machine context. So
>> we split out it from cpu_request_microcode(). As a consequence,
>> apply_microcode() should be invoked explicitly in the common code.
>> 
>> Previously, apply_microcode() gets the microcode patch to be applied from
>> the microcode cache. Now, the patch is passed as a function argument and
>> a patch is cached for cpu-hotplug and cpu resuming, only after it has
>> been loaded to a cpu without any error. As a consequence, the
>> 'match_cpu' check in microcode_update_cache is removed, which otherwise
>> would fail.
>
>The "only after it has been loaded to a cpu without any error" is a
>problem, precisely for the case where ucode on the different cores
>is not in sync initially. I would actually like to put up this question:
>When a core has no ucode loaded at all yet and only strictly older
>(than loaded on some other cores) ucode is found to be available,
>whether then it wouldn't still be better to apply that ucode to
>_at least_ the cores that have none loaded yet.

Yes, it is better for this special case. And I agree to support this case.

This in v7, a patch is loaded only if its revision is newer than that
loaded to current CPU. And it is stored only if it has been loaded
successfully. But, as you described, a broken bios might puts the system
in an inconsistent state (multiple microcode revision in the system) and
furthermore in this case, if no or an older microcode update is
provided, early loading cannot get the system into sane state. So for
both early and late microcode loading, we could face a situation that
the patch to be loaded has equal or old revision than microcode of some
CPUs.

Changes I plan to make in next version are:
1. For early microcode, a patch would be stored if it covers current CPU's
signature. All CPUs would try to load from the cache.
2. For late microcode, a patch is loaded only if its revision is newer than
*the patch cached*. And it is stored only if has been loaded without an
"EIO" error.
3. Cache replacement remains the same.

But it is a temperary solution, especially for CSPs. A better way
might be getting the newest ucode or upgrading to the newest bios,
even downgrading the bios to an older version which wouldn't put
the system into "insane" state.

>
>To get the system into "sane" state it may even be necessary to
>downgrade ucode on the cores which did have it loaded already,
>in such a situation.
>
>> On AMD side, svm_host_osvw_init() is supposed to be called after
>> microcode update. As apply_micrcode() won't be called by
>> cpu_request_microcode() now, svm_host_osvw_init() is moved to the
>> end of apply_microcode().
>
>I guess this really ought to become a vendor hook as well, but I
>wouldn't insist on you doing so here.
>
>> --- a/xen/arch/x86/acpi/power.c
>> +++ b/xen/arch/x86/acpi/power.c
>> @@ -253,7 +253,7 @@ static int enter_state(u32 state)
>>  
>>      console_end_sync();
>>  
>> -    microcode_resume_cpu();
>> +    early_microcode_update_cpu();
>
>The use here, the (changed) use in start_secondary(), and the dropping
>of its __init suggest to make an attempt to find a better name for the
>function. Maybe microcode_update_one()?

Will do.

>> +/*
>> + * Load a microcode update to current CPU.
>> + *
>> + * If no patch is provided, the cached patch will be loaded. Microcode update
>> + * during APs bringup and CPU resuming falls into this case.
>> + */
>> +static int microcode_update_cpu(struct microcode_patch *patch)
>
>const?
>
>>  {
>> -    int err;
>> -    struct cpu_signature *sig = &this_cpu(cpu_sig);
>> +    int ret = microcode_ops->collect_cpu_info(&this_cpu(cpu_sig));
>>  
>> -    if ( !microcode_ops )
>> -        return 0;
>> +    if ( unlikely(ret) )
>> +        return ret;
>>  
>>      spin_lock(&microcode_mutex);
>>  
>> -    err = microcode_ops->collect_cpu_info(sig);
>> -    if ( likely(!err) )
>> -        err = microcode_ops->apply_microcode();
>> -    spin_unlock(&microcode_mutex);
>> +    if ( patch )
>> +    {
>> +        /*
>> +         * If a patch is specified, it should has newer revision than
>> +         * that of the patch cached.
>> +         */
>> +        if ( microcode_cache &&
>> +             microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
>> +        {
>> +            spin_unlock(&microcode_mutex);
>> +            return -EINVAL;
>> +        }
>>  
>> -    return err;
>> -}
>> +        ret = microcode_ops->apply_microcode(patch);
>
>There's no printk() here but ...
>
>> +    }
>> +    else if ( microcode_cache )
>> +    {
>> +        ret = microcode_ops->apply_microcode(microcode_cache);
>> +        if ( ret == -EIO )
>> +            printk("Update failed. Reboot needed\n");
>
>... you emit a log message here. Why the difference? And wouldn't
>it be better to have just a single call to ->apply anyway, by simply
>assigning "microcode_cache" to "patch" and moving the call a little
>further down?

-EIO means we loaded the patch but the revision didn't change. This
error code indicates an error in the patch. It is unlikely to happen.
And in this v7, a patch is stored after being loading successfully
on a CPU. To simplify handling of load failure and avoid cleanup to the
global cache (if a patch has a success rate, we need to consider which
one to save when we have a new patch with 95% rate and an old one with
100% rate, it would be really complex), we assume that loading the cache
(being loaded successfully on a CPU) shouldn't return EIO. Otherwise,
an error message is prompted.

>
>> +    }
>> +    else
>> +        /* No patch to update */
>> +        ret = -EINVAL;
>
>-ENOENT?
>
>> @@ -247,46 +270,32 @@ bool microcode_update_cache(struct microcode_patch *patch)
>>      return true;
>>  }
>>  
>> -static int microcode_update_cpu(const void *buf, size_t size)
>> +static long do_microcode_update(void *patch)
>>  {
>> -    int err;
>> -    unsigned int cpu = smp_processor_id();
>> -    struct cpu_signature *sig = &per_cpu(cpu_sig, cpu);
>> -
>> -    spin_lock(&microcode_mutex);
>> +    int error, cpu;
>
>While "int" is fine for error codes, it almost certainly wants to be
>"unsigned int" for "cpu". The more that it had been that was before.
>I also don't see why you need to switch from "err" to "error" - oh,
>...
>
>> -    err = microcode_ops->collect_cpu_info(sig);
>> -    if ( likely(!err) )
>> -        err = microcode_ops->cpu_request_microcode(buf, size);
>> -    spin_unlock(&microcode_mutex);
>> -
>> -    return err;
>> -}
>> -
>> -static long do_microcode_update(void *_info)
>> -{
>> -    struct microcode_info *info = _info;
>> -    int error;
>> +    error = microcode_update_cpu(patch);
>
>... there was a pre-existing variable of that name here.
>
>> +    if ( error )
>> +    {
>> +        microcode_ops->free_patch(microcode_cache);
>
>Does this also set "microcode_cache" to NULL? I didn't think so.
>It's anyway not really clear why _all_ forms of errors should lead
>to clearing of the cache. However - looking at the code further
>down, don't you rather mean to free "patch" here anyway?

Sorry, it should be "patch".

>
>> +        return error;
>> +    }
>>  
>> -    BUG_ON(info->cpu != smp_processor_id());
>>  
>> -    error = microcode_update_cpu(info->buffer, info->buffer_size);
>> -    if ( error )
>> -        info->error = error;
>> +    cpu = cpumask_next(smp_processor_id(), &cpu_online_map);
>> +    if ( cpu < nr_cpu_ids )
>> +        return continue_hypercall_on_cpu(cpu, do_microcode_update, patch);
>>  
>> -    info->cpu = cpumask_next(info->cpu, &cpu_online_map);
>> -    if ( info->cpu < nr_cpu_ids )
>> -        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
>> +    microcode_update_cache(patch);
>
>Independent of my remarks at the top I would think that updating of
>the cache should happen after the first successful loading on a CPU,
>not after all CPUs have been updated successfully. There would then
>also not be any need to pass "patch" on to continue_hypercall_on_cpu()
>a few lines up from here (albeit from a general logic perspective it may
>indeed be easier to keep it that way).

The logic is removed in the next patch. So I prefer to keep it the same.

>
>> @@ -294,32 +303,49 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
>>      if ( microcode_ops == NULL )
>>          return -EINVAL;
>>  
>> -    info = xmalloc_bytes(sizeof(*info) + len);
>> -    if ( info == NULL )
>> -        return -ENOMEM;
>> -
>> -    ret = copy_from_guest(info->buffer, buf, len);
>> -    if ( ret != 0 )
>> +    buffer = xmalloc_bytes(len);
>> +    if ( !buffer )
>>      {
>> -        xfree(info);
>> -        return ret;
>> +        ret = -ENOMEM;
>> +        goto free;
>>      }
>>  
>> -    info->buffer_size = len;
>> -    info->error = 0;
>> -    info->cpu = cpumask_first(&cpu_online_map);
>> +    if ( copy_from_guest(buffer, buf, len) )
>> +    {
>> +        ret = -EFAULT;
>> +        goto free;
>> +    }
>>  
>>      if ( microcode_ops->start_update )
>>      {
>>          ret = microcode_ops->start_update();
>>          if ( ret != 0 )
>> -        {
>> -            xfree(info);
>> -            return ret;
>> -        }
>> +            goto free;
>>      }
>>  
>> -    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
>> +    patch = microcode_parse_blob(buffer, len);
>> +    if ( IS_ERR(patch) )
>> +    {
>> +        printk(XENLOG_ERR "Parsing microcode blob error %ld\n", PTR_ERR(patch));
>> +        ret = PTR_ERR(patch);
>
>So I assume we would get here when the system already has the
>newest (or even newer) ucode loaded. That's not an error, and
>imo no log message suggesting so should be issued. Perhaps the
>parsing code could return NULL to indicate so?

Yes. 'patch' is set to indicate an error only if no matching patch has
been found and an error (for example, -ENOMEM) happens during parsing.
If no patch in the blob covers the current cpu signature, the parsing
code will return NULL.

>Although, judging
>by the code further down you already expect NULL potentially
>coming back, but I can't seem to be able to figure the condition(s).
>
>Also please switch the two lines around and use "ret" in the printk()
>invocation.
>
>> +        goto free;
>> +    }
>> +
>> +    if ( !microcode_ops->match_cpu(patch) )
>> +    {
>> +        printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");
>
>I assume the "matching" here is meant to cover the CPU signature?

Yes.

>The wording is ambiguous this way, because it could also mean there
>was no ucode found matching that which is already loaded (which, as
>per above, may end up being relevant).

Then I suppose it is fine to remove the "matching or".

>
>Furthermore - why this check, when microcode_parse_blob() already
>looks for something that's newer than what is currently loaded (and
>matches the CPU signature)?

Yes. It can be replaced with a null check.

>
>> @@ -362,13 +397,41 @@ int __init early_microcode_update_cpu(bool start_update)
>>      }
>>      if ( data )
>>      {
>> -        if ( start_update && microcode_ops->start_update )
>> +        struct microcode_patch *patch;
>> +
>> +        if ( microcode_ops->start_update )
>>              rc = microcode_ops->start_update();
>>  
>>          if ( rc )
>>              return rc;
>>  
>> -        return microcode_update_cpu(data, len);
>> +        patch = microcode_parse_blob(data, len);
>> +        if ( IS_ERR(patch) )
>> +        {
>> +            printk(XENLOG_ERR "Parsing microcode blob error %ld\n",
>> +                   PTR_ERR(patch));
>> +            return PTR_ERR(patch);
>> +        }
>> +
>> +        if ( !microcode_ops->match_cpu(patch) )
>> +        {
>> +            printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");
>> +            if ( patch )
>> +                microcode_ops->free_patch(patch);
>> +            return -EINVAL;
>> +        }
>
>Same remarks here then.
>
>> @@ -292,6 +291,10 @@ static int apply_microcode(void)
>>  
>>      sig->rev = rev;
>>  
>> +#ifdef CONFIG_HVM
>> +    svm_host_osvw_init();
>> +#endif
>> +
>>      return 0;
>>  }
>
>While this now sits on the success path only, ...
>
>> @@ -592,17 +596,10 @@ static int cpu_request_microcode(const void *buf, size_t bufsize)
>>      }
>>  
>>    out:
>> -#if CONFIG_HVM
>> -    svm_host_osvw_init();
>> -#endif
>> +    if ( error && !patch )
>> +        patch = ERR_PTR(error);
>>  
>> -    /*
>> -     * In some cases we may return an error even if processor's microcode has
>> -     * been updated. For example, the first patch in a container file is loaded
>> -     * successfully but subsequent container file processing encounters a
>> -     * failure.
>> -     */
>> -    return error;
>> +    return patch;
>>  }
>
>... previously it has also been invoked in the error case. See the
>comment in start_update().

It depends on the scope of related MSRs. But I failed to find such
information in AMD SDM. So I will find a better place for
svm_host_osvw_init(). The last resort is to introduce another callback,
probably ->end_update().

>
>> --- a/xen/arch/x86/microcode_intel.c
>> +++ b/xen/arch/x86/microcode_intel.c
>> @@ -273,46 +273,27 @@ static enum microcode_match_result compare_patch(
>>                                    old_header->pf, old_header->rev);
>>  }
>>  
>> -/*
>> - * return 0 - no update found
>> - * return 1 - found update
>> - * return < 0 - error
>> - */
>> -static int get_matching_microcode(const void *mc)
>> +static struct microcode_patch *allow_microcode_patch(
>
>Did you perhaps mean this to be alloc_microcode_patch()?
>
>> @@ -388,26 +368,39 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
>>      return offset + total_size;
>>  }
>>  
>> -static int cpu_request_microcode(const void *buf, size_t size)
>> +static struct microcode_patch *cpu_request_microcode(const void *buf,
>> +                                                     size_t size)
>>  {
>>      long offset = 0;
>>      int error = 0;
>>      void *mc;
>> +    struct microcode_patch *patch = NULL;
>>  
>>      while ( (offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0 )
>>      {
>> +        struct microcode_patch *new_patch;
>> +
>>          error = microcode_sanity_check(mc);
>>          if ( error )
>>              break;
>> -        error = get_matching_microcode(mc);
>> -        if ( error < 0 )
>> +
>> +        new_patch = allow_microcode_patch(mc);
>> +        if ( IS_ERR(new_patch) )
>> +        {
>> +            error = PTR_ERR(new_patch);
>>              break;
>> -        /*
>> -         * It's possible the data file has multiple matching ucode,
>> -         * lets keep searching till the latest version
>> -         */
>> -        if ( error == 1 )
>> -            error = 0;
>> +        }
>> +
>> +        /* Compare patches and store the one with higher revision */
>> +        if ( !patch && match_cpu(new_patch) )
>> +            patch = new_patch;
>> +        else if ( patch && (compare_patch(new_patch, patch) == NEW_UCODE) )
>
>At least the appearance of this is misleading, but unless I'm missing
>something it's actually wrong: You seem to imply that from
>match_cpu(patch1) returning true and compare_patch(patch2, patch1)
>returning true it follows that also match_cpu(patch2) would return
>true. But I don't think that's the case, because of how the "pf" field
>works.
>
>Even in case I've overlooked an implicit match_cpu(new_patch) I'd
>like to ask for this to be clarified, either by changing the code
>structure, or by attaching a suitable comment.

I did take it for granted. Checking whether each patch covers current
signature before the comparison can solve this problem.

>
>> --- a/xen/arch/x86/smpboot.c
>> +++ b/xen/arch/x86/smpboot.c
>> @@ -363,10 +363,7 @@ void start_secondary(void *unused)
>>  
>>      initialize_cpu_data(cpu);
>>  
>> -    if ( system_state <= SYS_STATE_smp_boot )
>> -        early_microcode_update_cpu(false);
>> -    else
>> -        microcode_resume_cpu();
>> +    early_microcode_update_cpu();
>
>I'm struggling to understand how you get away without the "false"
>argument that was passed here before. You look to now be calling
>->start_update() unconditionally (so long as the hook is not NULL),
>which I don't think is correct. This should be called only once by
>the CPU _leading_ an update (the BSP during boot, and the CPU
>the hypercall gets invoked on (or the first CPU an update gets
>issued one) for a late update. Am I missing something?

BSP and APs call different functions, early_microcode_parse_and_update_cpu()
and early_microcode_update_cpu() respectively. The latter won't call
->start_update().

Thanks
Chao
Jan Beulich June 11, 2019, 7:08 a.m. UTC | #3
>>> On 11.06.19 at 05:32, <chao.gao@intel.com> wrote:
> On Wed, Jun 05, 2019 at 06:37:27AM -0600, Jan Beulich wrote:
>>>>> On 27.05.19 at 10:31, <chao.gao@intel.com> wrote:
>>> During late microcode update, apply_microcode() is invoked in
>>> cpu_request_microcode(). To make late microcode update more reliable,
>>> we want to put the apply_microcode() into stop_machine context. So
>>> we split out it from cpu_request_microcode(). As a consequence,
>>> apply_microcode() should be invoked explicitly in the common code.
>>> 
>>> Previously, apply_microcode() gets the microcode patch to be applied from
>>> the microcode cache. Now, the patch is passed as a function argument and
>>> a patch is cached for cpu-hotplug and cpu resuming, only after it has
>>> been loaded to a cpu without any error. As a consequence, the
>>> 'match_cpu' check in microcode_update_cache is removed, which otherwise
>>> would fail.
>>
>>The "only after it has been loaded to a cpu without any error" is a
>>problem, precisely for the case where ucode on the different cores
>>is not in sync initially. I would actually like to put up this question:
>>When a core has no ucode loaded at all yet and only strictly older
>>(than loaded on some other cores) ucode is found to be available,
>>whether then it wouldn't still be better to apply that ucode to
>>_at least_ the cores that have none loaded yet.
> 
> Yes, it is better for this special case. And I agree to support this case.
> 
> This in v7, a patch is loaded only if its revision is newer than that
> loaded to current CPU. And it is stored only if it has been loaded
> successfully. But, as you described, a broken bios might puts the system
> in an inconsistent state (multiple microcode revision in the system) and
> furthermore in this case, if no or an older microcode update is
> provided, early loading cannot get the system into sane state. So for
> both early and late microcode loading, we could face a situation that
> the patch to be loaded has equal or old revision than microcode of some
> CPUs.
> 
> Changes I plan to make in next version are:
> 1. For early microcode, a patch would be stored if it covers current CPU's
> signature. All CPUs would try to load from the cache.
> 2. For late microcode, a patch is loaded only if its revision is newer than
> *the patch cached*. And it is stored only if has been loaded without an
> "EIO" error.
> 3. Cache replacement remains the same.

Why the difference between early and late loading?

> But it is a temperary solution, especially for CSPs. A better way
> might be getting the newest ucode or upgrading to the newest bios,
> even downgrading the bios to an older version which wouldn't put
> the system into "insane" state.

On the quoted system, all BIOS versions I've ever been provided
had the same odd behavior.

>>> +static int microcode_update_cpu(struct microcode_patch *patch)
>>>  {
>>> -    int err;
>>> -    struct cpu_signature *sig = &this_cpu(cpu_sig);
>>> +    int ret = microcode_ops->collect_cpu_info(&this_cpu(cpu_sig));
>>>  
>>> -    if ( !microcode_ops )
>>> -        return 0;
>>> +    if ( unlikely(ret) )
>>> +        return ret;
>>>  
>>>      spin_lock(&microcode_mutex);
>>>  
>>> -    err = microcode_ops->collect_cpu_info(sig);
>>> -    if ( likely(!err) )
>>> -        err = microcode_ops->apply_microcode();
>>> -    spin_unlock(&microcode_mutex);
>>> +    if ( patch )
>>> +    {
>>> +        /*
>>> +         * If a patch is specified, it should has newer revision than
>>> +         * that of the patch cached.
>>> +         */
>>> +        if ( microcode_cache &&
>>> +             microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
>>> +        {
>>> +            spin_unlock(&microcode_mutex);
>>> +            return -EINVAL;
>>> +        }
>>>  
>>> -    return err;
>>> -}
>>> +        ret = microcode_ops->apply_microcode(patch);
>>
>>There's no printk() here but ...
>>
>>> +    }
>>> +    else if ( microcode_cache )
>>> +    {
>>> +        ret = microcode_ops->apply_microcode(microcode_cache);
>>> +        if ( ret == -EIO )
>>> +            printk("Update failed. Reboot needed\n");
>>
>>... you emit a log message here. Why the difference? And wouldn't
>>it be better to have just a single call to ->apply anyway, by simply
>>assigning "microcode_cache" to "patch" and moving the call a little
>>further down?
> 
> -EIO means we loaded the patch but the revision didn't change. This
> error code indicates an error in the patch. It is unlikely to happen.
> And in this v7, a patch is stored after being loading successfully
> on a CPU. To simplify handling of load failure and avoid cleanup to the
> global cache (if a patch has a success rate, we need to consider which
> one to save when we have a new patch with 95% rate and an old one with
> 100% rate, it would be really complex), we assume that loading the cache
> (being loaded successfully on a CPU) shouldn't return EIO. Otherwise,
> an error message is prompted.

But then the log message itself needs to be more specific. That'll
then either allow to understand why one is wanted here but not
elsewhere, or additionally a comment should be attached.

>>> --- a/xen/arch/x86/smpboot.c
>>> +++ b/xen/arch/x86/smpboot.c
>>> @@ -363,10 +363,7 @@ void start_secondary(void *unused)
>>>  
>>>      initialize_cpu_data(cpu);
>>>  
>>> -    if ( system_state <= SYS_STATE_smp_boot )
>>> -        early_microcode_update_cpu(false);
>>> -    else
>>> -        microcode_resume_cpu();
>>> +    early_microcode_update_cpu();
>>
>>I'm struggling to understand how you get away without the "false"
>>argument that was passed here before. You look to now be calling
>>->start_update() unconditionally (so long as the hook is not NULL),
>>which I don't think is correct. This should be called only once by
>>the CPU _leading_ an update (the BSP during boot, and the CPU
>>the hypercall gets invoked on (or the first CPU an update gets
>>issued one) for a late update. Am I missing something?
> 
> BSP and APs call different functions, early_microcode_parse_and_update_cpu()
> and early_microcode_update_cpu() respectively. The latter won't call
> ->start_update().

Oh, I see - I'm sorry. I've been mislead by the patch context
indicators. But still, by their names there's not supposed to be
such a difference. I wonder whether we wouldn't better stick
to just one function (early_microcode_update_cpu()), having
it take a boolean as is the case now. That would control both
parsing and ->start_update() invocation.

Jan
Chao Gao June 11, 2019, 8:53 a.m. UTC | #4
On Tue, Jun 11, 2019 at 01:08:36AM -0600, Jan Beulich wrote:
>>>> On 11.06.19 at 05:32, <chao.gao@intel.com> wrote:
>> On Wed, Jun 05, 2019 at 06:37:27AM -0600, Jan Beulich wrote:
>>>>>> On 27.05.19 at 10:31, <chao.gao@intel.com> wrote:
>>>> During late microcode update, apply_microcode() is invoked in
>>>> cpu_request_microcode(). To make late microcode update more reliable,
>>>> we want to put the apply_microcode() into stop_machine context. So
>>>> we split out it from cpu_request_microcode(). As a consequence,
>>>> apply_microcode() should be invoked explicitly in the common code.
>>>> 
>>>> Previously, apply_microcode() gets the microcode patch to be applied from
>>>> the microcode cache. Now, the patch is passed as a function argument and
>>>> a patch is cached for cpu-hotplug and cpu resuming, only after it has
>>>> been loaded to a cpu without any error. As a consequence, the
>>>> 'match_cpu' check in microcode_update_cache is removed, which otherwise
>>>> would fail.
>>>
>>>The "only after it has been loaded to a cpu without any error" is a
>>>problem, precisely for the case where ucode on the different cores
>>>is not in sync initially. I would actually like to put up this question:
>>>When a core has no ucode loaded at all yet and only strictly older
>>>(than loaded on some other cores) ucode is found to be available,
>>>whether then it wouldn't still be better to apply that ucode to
>>>_at least_ the cores that have none loaded yet.
>> 
>> Yes, it is better for this special case. And I agree to support this case.
>> 
>> This in v7, a patch is loaded only if its revision is newer than that
>> loaded to current CPU. And it is stored only if it has been loaded
>> successfully. But, as you described, a broken bios might puts the system
>> in an inconsistent state (multiple microcode revision in the system) and
>> furthermore in this case, if no or an older microcode update is
>> provided, early loading cannot get the system into sane state. So for
>> both early and late microcode loading, we could face a situation that
>> the patch to be loaded has equal or old revision than microcode of some
>> CPUs.
>> 
>> Changes I plan to make in next version are:
>> 1. For early microcode, a patch would be stored if it covers current CPU's
>> signature. All CPUs would try to load from the cache.
>> 2. For late microcode, a patch is loaded only if its revision is newer than
>> *the patch cached*. And it is stored only if has been loaded without an
>> "EIO" error.
>> 3. Cache replacement remains the same.
>
>Why the difference between early and late loading?

Storing a patch without loading it is problematic. We need complex logics
to restore the old patch if the current patch is proved to be broken.
I really want to avoid going this way. So for late microcode, we still
stick to the rule: storing a patch only after it has been loaded. For
late loading, we can try to load a patch as long as the patch covers
current cpu signature to avoid missing any possible update. But thanks
to early loading, the oldest microcode revision on all online CPUs
shouldn't be older than the cache. So as an optimization, we initiate an
update system-wide only if the patch's revision is newer than the cache.

For early loading, to avoid discarding a potential useful patch, an
exception is made to store the newest matching patch without loading it
and all CPus try to load the patch. One problem is if a broken patch
with very high revision is provided, any subsequent attempt of late
loading would fail. It is unlikely to happen, so I plan to leave it
aside. Otherwise, we can clean up the cache in microcode_init() if no
cpu has loaded this patch (we need a global variable to track the status).

>
>> But it is a temperary solution, especially for CSPs. A better way
>> might be getting the newest ucode or upgrading to the newest bios,
>> even downgrading the bios to an older version which wouldn't put
>> the system into "insane" state.
>
>On the quoted system, all BIOS versions I've ever been provided
>had the same odd behavior.
>
>>>> +static int microcode_update_cpu(struct microcode_patch *patch)
>>>>  {
>>>> -    int err;
>>>> -    struct cpu_signature *sig = &this_cpu(cpu_sig);
>>>> +    int ret = microcode_ops->collect_cpu_info(&this_cpu(cpu_sig));
>>>>  
>>>> -    if ( !microcode_ops )
>>>> -        return 0;
>>>> +    if ( unlikely(ret) )
>>>> +        return ret;
>>>>  
>>>>      spin_lock(&microcode_mutex);
>>>>  
>>>> -    err = microcode_ops->collect_cpu_info(sig);
>>>> -    if ( likely(!err) )
>>>> -        err = microcode_ops->apply_microcode();
>>>> -    spin_unlock(&microcode_mutex);
>>>> +    if ( patch )
>>>> +    {
>>>> +        /*
>>>> +         * If a patch is specified, it should has newer revision than
>>>> +         * that of the patch cached.
>>>> +         */
>>>> +        if ( microcode_cache &&
>>>> +             microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
>>>> +        {
>>>> +            spin_unlock(&microcode_mutex);
>>>> +            return -EINVAL;
>>>> +        }
>>>>  
>>>> -    return err;
>>>> -}
>>>> +        ret = microcode_ops->apply_microcode(patch);
>>>
>>>There's no printk() here but ...
>>>
>>>> +    }
>>>> +    else if ( microcode_cache )
>>>> +    {
>>>> +        ret = microcode_ops->apply_microcode(microcode_cache);
>>>> +        if ( ret == -EIO )
>>>> +            printk("Update failed. Reboot needed\n");
>>>
>>>... you emit a log message here. Why the difference? And wouldn't
>>>it be better to have just a single call to ->apply anyway, by simply
>>>assigning "microcode_cache" to "patch" and moving the call a little
>>>further down?
>> 
>> -EIO means we loaded the patch but the revision didn't change. This
>> error code indicates an error in the patch. It is unlikely to happen.
>> And in this v7, a patch is stored after being loading successfully
>> on a CPU. To simplify handling of load failure and avoid cleanup to the
>> global cache (if a patch has a success rate, we need to consider which
>> one to save when we have a new patch with 95% rate and an old one with
>> 100% rate, it would be really complex), we assume that loading the cache
>> (being loaded successfully on a CPU) shouldn't return EIO. Otherwise,
>> an error message is prompted.
>
>But then the log message itself needs to be more specific. That'll
>then either allow to understand why one is wanted here but not
>elsewhere, or additionally a comment should be attached.
>
>>>> --- a/xen/arch/x86/smpboot.c
>>>> +++ b/xen/arch/x86/smpboot.c
>>>> @@ -363,10 +363,7 @@ void start_secondary(void *unused)
>>>>  
>>>>      initialize_cpu_data(cpu);
>>>>  
>>>> -    if ( system_state <= SYS_STATE_smp_boot )
>>>> -        early_microcode_update_cpu(false);
>>>> -    else
>>>> -        microcode_resume_cpu();
>>>> +    early_microcode_update_cpu();
>>>
>>>I'm struggling to understand how you get away without the "false"
>>>argument that was passed here before. You look to now be calling
>>>->start_update() unconditionally (so long as the hook is not NULL),
>>>which I don't think is correct. This should be called only once by
>>>the CPU _leading_ an update (the BSP during boot, and the CPU
>>>the hypercall gets invoked on (or the first CPU an update gets
>>>issued one) for a late update. Am I missing something?
>> 
>> BSP and APs call different functions, early_microcode_parse_and_update_cpu()
>> and early_microcode_update_cpu() respectively. The latter won't call
>> ->start_update().
>
>Oh, I see - I'm sorry. I've been mislead by the patch context
>indicators. But still, by their names there's not supposed to be
>such a difference. I wonder whether we wouldn't better stick
>to just one function (early_microcode_update_cpu()), having
>it take a boolean as is the case now. That would control both
>parsing and ->start_update() invocation.

Will do

Thanks
Chao
>
Jan Beulich June 11, 2019, 9:15 a.m. UTC | #5
>>> On 11.06.19 at 10:53, <chao.gao@intel.com> wrote:
> On Tue, Jun 11, 2019 at 01:08:36AM -0600, Jan Beulich wrote:
>>>>> On 11.06.19 at 05:32, <chao.gao@intel.com> wrote:
>>> On Wed, Jun 05, 2019 at 06:37:27AM -0600, Jan Beulich wrote:
>>>>>>> On 27.05.19 at 10:31, <chao.gao@intel.com> wrote:
>>>>> During late microcode update, apply_microcode() is invoked in
>>>>> cpu_request_microcode(). To make late microcode update more reliable,
>>>>> we want to put the apply_microcode() into stop_machine context. So
>>>>> we split out it from cpu_request_microcode(). As a consequence,
>>>>> apply_microcode() should be invoked explicitly in the common code.
>>>>> 
>>>>> Previously, apply_microcode() gets the microcode patch to be applied from
>>>>> the microcode cache. Now, the patch is passed as a function argument and
>>>>> a patch is cached for cpu-hotplug and cpu resuming, only after it has
>>>>> been loaded to a cpu without any error. As a consequence, the
>>>>> 'match_cpu' check in microcode_update_cache is removed, which otherwise
>>>>> would fail.
>>>>
>>>>The "only after it has been loaded to a cpu without any error" is a
>>>>problem, precisely for the case where ucode on the different cores
>>>>is not in sync initially. I would actually like to put up this question:
>>>>When a core has no ucode loaded at all yet and only strictly older
>>>>(than loaded on some other cores) ucode is found to be available,
>>>>whether then it wouldn't still be better to apply that ucode to
>>>>_at least_ the cores that have none loaded yet.
>>> 
>>> Yes, it is better for this special case. And I agree to support this case.
>>> 
>>> This in v7, a patch is loaded only if its revision is newer than that
>>> loaded to current CPU. And it is stored only if it has been loaded
>>> successfully. But, as you described, a broken bios might puts the system
>>> in an inconsistent state (multiple microcode revision in the system) and
>>> furthermore in this case, if no or an older microcode update is
>>> provided, early loading cannot get the system into sane state. So for
>>> both early and late microcode loading, we could face a situation that
>>> the patch to be loaded has equal or old revision than microcode of some
>>> CPUs.
>>> 
>>> Changes I plan to make in next version are:
>>> 1. For early microcode, a patch would be stored if it covers current CPU's
>>> signature. All CPUs would try to load from the cache.
>>> 2. For late microcode, a patch is loaded only if its revision is newer than
>>> *the patch cached*. And it is stored only if has been loaded without an
>>> "EIO" error.
>>> 3. Cache replacement remains the same.
>>
>>Why the difference between early and late loading?
> 
> Storing a patch without loading it is problematic. We need complex logics
> to restore the old patch if the current patch is proved to be broken.
> I really want to avoid going this way. So for late microcode, we still
> stick to the rule: storing a patch only after it has been loaded. For
> late loading, we can try to load a patch as long as the patch covers
> current cpu signature to avoid missing any possible update. But thanks
> to early loading, the oldest microcode revision on all online CPUs
> shouldn't be older than the cache. So as an optimization, we initiate an
> update system-wide only if the patch's revision is newer than the cache.

Well, you seem to be considering only one out of two possible cases.
What if no ucode was loaded early at all? There's no "thanks to early
loading" then.

> For early loading, to avoid discarding a potential useful patch, an
> exception is made to store the newest matching patch without loading it
> and all CPus try to load the patch. One problem is if a broken patch
> with very high revision is provided, any subsequent attempt of late
> loading would fail. It is unlikely to happen, so I plan to leave it
> aside. Otherwise, we can clean up the cache in microcode_init() if no
> cpu has loaded this patch (we need a global variable to track the status).

Yes, if a patch that claims to be newer than what any CPU has
already loaded fails to apply on every CPU, then yes, it surely
should be purged from the cache.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
index 4f21903..9583172 100644
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -253,7 +253,7 @@  static int enter_state(u32 state)
 
     console_end_sync();
 
-    microcode_resume_cpu();
+    early_microcode_update_cpu();
 
     if ( !recheck_cpu_features(0) )
         panic("Missing previously available feature(s)\n");
diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index 16a6d50..23cf550 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -189,36 +189,62 @@  static DEFINE_SPINLOCK(microcode_mutex);
 
 DEFINE_PER_CPU(struct cpu_signature, cpu_sig);
 
-struct microcode_info {
-    unsigned int cpu;
-    uint32_t buffer_size;
-    int error;
-    char buffer[1];
-};
+/*
+ * Return the patch with the highest revision id among all matching
+ * patches in the blob. Return NULL if no suitable patch.
+ */
+static struct microcode_patch *microcode_parse_blob(const char *buf,
+                                                    uint32_t len)
+{
+    if ( likely(!microcode_ops->collect_cpu_info(&this_cpu(cpu_sig))) )
+        return microcode_ops->cpu_request_microcode(buf, len);
 
-int microcode_resume_cpu(void)
+    return NULL;
+}
+
+/*
+ * Load a microcode update to current CPU.
+ *
+ * If no patch is provided, the cached patch will be loaded. Microcode update
+ * during APs bringup and CPU resuming falls into this case.
+ */
+static int microcode_update_cpu(struct microcode_patch *patch)
 {
-    int err;
-    struct cpu_signature *sig = &this_cpu(cpu_sig);
+    int ret = microcode_ops->collect_cpu_info(&this_cpu(cpu_sig));
 
-    if ( !microcode_ops )
-        return 0;
+    if ( unlikely(ret) )
+        return ret;
 
     spin_lock(&microcode_mutex);
 
-    err = microcode_ops->collect_cpu_info(sig);
-    if ( likely(!err) )
-        err = microcode_ops->apply_microcode();
-    spin_unlock(&microcode_mutex);
+    if ( patch )
+    {
+        /*
+         * If a patch is specified, it should has newer revision than
+         * that of the patch cached.
+         */
+        if ( microcode_cache &&
+             microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE )
+        {
+            spin_unlock(&microcode_mutex);
+            return -EINVAL;
+        }
 
-    return err;
-}
+        ret = microcode_ops->apply_microcode(patch);
+    }
+    else if ( microcode_cache )
+    {
+        ret = microcode_ops->apply_microcode(microcode_cache);
+        if ( ret == -EIO )
+            printk("Update failed. Reboot needed\n");
+    }
+    else
+        /* No patch to update */
+        ret = -EINVAL;
 
-const struct microcode_patch *microcode_get_cache(void)
-{
-    ASSERT(spin_is_locked(&microcode_mutex));
+    spin_unlock(&microcode_mutex);
 
-    return microcode_cache;
+    return ret;
 }
 
 /* Return true if cache gets updated. Otherwise, return false */
@@ -227,9 +253,6 @@  bool microcode_update_cache(struct microcode_patch *patch)
 
     ASSERT(spin_is_locked(&microcode_mutex));
 
-    if ( !microcode_ops->match_cpu(patch) )
-        return false;
-
     if ( !microcode_cache )
         microcode_cache = patch;
     else if ( microcode_ops->compare_patch(patch, microcode_cache) ==
@@ -247,46 +270,32 @@  bool microcode_update_cache(struct microcode_patch *patch)
     return true;
 }
 
-static int microcode_update_cpu(const void *buf, size_t size)
+static long do_microcode_update(void *patch)
 {
-    int err;
-    unsigned int cpu = smp_processor_id();
-    struct cpu_signature *sig = &per_cpu(cpu_sig, cpu);
-
-    spin_lock(&microcode_mutex);
+    int error, cpu;
 
-    err = microcode_ops->collect_cpu_info(sig);
-    if ( likely(!err) )
-        err = microcode_ops->cpu_request_microcode(buf, size);
-    spin_unlock(&microcode_mutex);
-
-    return err;
-}
-
-static long do_microcode_update(void *_info)
-{
-    struct microcode_info *info = _info;
-    int error;
+    error = microcode_update_cpu(patch);
+    if ( error )
+    {
+        microcode_ops->free_patch(microcode_cache);
+        return error;
+    }
 
-    BUG_ON(info->cpu != smp_processor_id());
 
-    error = microcode_update_cpu(info->buffer, info->buffer_size);
-    if ( error )
-        info->error = error;
+    cpu = cpumask_next(smp_processor_id(), &cpu_online_map);
+    if ( cpu < nr_cpu_ids )
+        return continue_hypercall_on_cpu(cpu, do_microcode_update, patch);
 
-    info->cpu = cpumask_next(info->cpu, &cpu_online_map);
-    if ( info->cpu < nr_cpu_ids )
-        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    microcode_update_cache(patch);
 
-    error = info->error;
-    xfree(info);
     return error;
 }
 
 int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
 {
     int ret;
-    struct microcode_info *info;
+    void *buffer;
+    struct microcode_patch *patch;
 
     if ( len != (uint32_t)len )
         return -E2BIG;
@@ -294,32 +303,49 @@  int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
     if ( microcode_ops == NULL )
         return -EINVAL;
 
-    info = xmalloc_bytes(sizeof(*info) + len);
-    if ( info == NULL )
-        return -ENOMEM;
-
-    ret = copy_from_guest(info->buffer, buf, len);
-    if ( ret != 0 )
+    buffer = xmalloc_bytes(len);
+    if ( !buffer )
     {
-        xfree(info);
-        return ret;
+        ret = -ENOMEM;
+        goto free;
     }
 
-    info->buffer_size = len;
-    info->error = 0;
-    info->cpu = cpumask_first(&cpu_online_map);
+    if ( copy_from_guest(buffer, buf, len) )
+    {
+        ret = -EFAULT;
+        goto free;
+    }
 
     if ( microcode_ops->start_update )
     {
         ret = microcode_ops->start_update();
         if ( ret != 0 )
-        {
-            xfree(info);
-            return ret;
-        }
+            goto free;
     }
 
-    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    patch = microcode_parse_blob(buffer, len);
+    if ( IS_ERR(patch) )
+    {
+        printk(XENLOG_ERR "Parsing microcode blob error %ld\n", PTR_ERR(patch));
+        ret = PTR_ERR(patch);
+        goto free;
+    }
+
+    if ( !microcode_ops->match_cpu(patch) )
+    {
+        printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");
+        if ( patch )
+            microcode_ops->free_patch(patch);
+        ret = -EINVAL;
+        goto free;
+    }
+
+    ret = continue_hypercall_on_cpu(cpumask_first(&cpu_online_map),
+                                    do_microcode_update, patch);
+
+ free:
+    xfree(buffer);
+    return ret;
 }
 
 static int __init microcode_init(void)
@@ -344,7 +370,16 @@  static int __init microcode_init(void)
 }
 __initcall(microcode_init);
 
-int __init early_microcode_update_cpu(bool start_update)
+int early_microcode_update_cpu(void)
+{
+    return microcode_ops ? microcode_update_cpu(NULL) : 0;
+}
+
+/*
+ * BSP needs to parse the ucode blob and then apply an update.
+ * APs just apply an update by calling early_microcode_update_cpu().
+ */
+static int __init early_microcode_parse_and_update_cpu(void)
 {
     int rc = 0;
     void *data = NULL;
@@ -362,13 +397,41 @@  int __init early_microcode_update_cpu(bool start_update)
     }
     if ( data )
     {
-        if ( start_update && microcode_ops->start_update )
+        struct microcode_patch *patch;
+
+        if ( microcode_ops->start_update )
             rc = microcode_ops->start_update();
 
         if ( rc )
             return rc;
 
-        return microcode_update_cpu(data, len);
+        patch = microcode_parse_blob(data, len);
+        if ( IS_ERR(patch) )
+        {
+            printk(XENLOG_ERR "Parsing microcode blob error %ld\n",
+                   PTR_ERR(patch));
+            return PTR_ERR(patch);
+        }
+
+        if ( !microcode_ops->match_cpu(patch) )
+        {
+            printk(XENLOG_ERR "No matching or newer ucode found. Update aborted!\n");
+            if ( patch )
+                microcode_ops->free_patch(patch);
+            return -EINVAL;
+        }
+
+        rc = microcode_update_cpu(patch);
+        if ( !rc )
+        {
+            spin_lock(&microcode_mutex);
+            microcode_update_cache(patch);
+            spin_unlock(&microcode_mutex);
+        }
+        else
+            microcode_ops->free_patch(patch);
+
+        return rc;
     }
     else
         return -ENOMEM;
@@ -387,8 +450,10 @@  int __init early_microcode_init(void)
         return rc;
 
     if ( microcode_ops )
+    {
         if ( ucode_mod.mod_end || ucode_blob.size )
-            rc = early_microcode_update_cpu(true);
+            rc = early_microcode_parse_and_update_cpu();
+    }
 
     return rc;
 }
diff --git a/xen/arch/x86/microcode_amd.c b/xen/arch/x86/microcode_amd.c
index 0144df1..c819028 100644
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -249,7 +249,7 @@  static enum microcode_match_result compare_patch(
     return MIS_UCODE;
 }
 
-static int apply_microcode(void)
+static int apply_microcode(const struct microcode_patch *patch)
 {
     unsigned long flags;
     uint32_t rev;
@@ -257,7 +257,6 @@  static int apply_microcode(void)
     unsigned int cpu = smp_processor_id();
     struct cpu_signature *sig = &per_cpu(cpu_sig, cpu);
     const struct microcode_header_amd *hdr;
-    const struct microcode_patch *patch = microcode_get_cache();
 
     if ( !match_cpu(patch) )
         return -EINVAL;
@@ -292,6 +291,10 @@  static int apply_microcode(void)
 
     sig->rev = rev;
 
+#ifdef CONFIG_HVM
+    svm_host_osvw_init();
+#endif
+
     return 0;
 }
 
@@ -449,9 +452,11 @@  static bool check_final_patch_levels(void)
     return 0;
 }
 
-static int cpu_request_microcode(const void *buf, size_t bufsize)
+static struct microcode_patch *cpu_request_microcode(const void *buf,
+                                                     size_t bufsize)
 {
     struct microcode_amd *mc_amd;
+    struct microcode_patch *patch = NULL;
     size_t offset = 0;
     int error = 0;
     unsigned int current_cpu_id;
@@ -551,17 +556,16 @@  static int cpu_request_microcode(const void *buf, size_t bufsize)
             break;
         }
 
-        if ( match_cpu(new_patch) )
-            microcode_update_cache(new_patch);
-        else
-            free_patch(new_patch);
-
-        if ( match_cpu(microcode_get_cache()) )
+        /* Compare patches and store the one with higher revision */
+        if ( !patch && match_cpu(new_patch) )
+            patch = new_patch;
+        else if ( patch && (compare_patch(new_patch, patch) == NEW_UCODE) )
         {
-            error = apply_microcode();
-            if ( error )
-                break;
+            free_patch(patch);
+            patch = new_patch;
         }
+        else
+            free_patch(new_patch);
 
         if ( offset >= bufsize )
             break;
@@ -592,17 +596,10 @@  static int cpu_request_microcode(const void *buf, size_t bufsize)
     }
 
   out:
-#if CONFIG_HVM
-    svm_host_osvw_init();
-#endif
+    if ( error && !patch )
+        patch = ERR_PTR(error);
 
-    /*
-     * In some cases we may return an error even if processor's microcode has
-     * been updated. For example, the first patch in a container file is loaded
-     * successfully but subsequent container file processing encounters a
-     * failure.
-     */
-    return error;
+    return patch;
 }
 
 static int start_update(void)
diff --git a/xen/arch/x86/microcode_intel.c b/xen/arch/x86/microcode_intel.c
index b66844d..650495d 100644
--- a/xen/arch/x86/microcode_intel.c
+++ b/xen/arch/x86/microcode_intel.c
@@ -273,46 +273,27 @@  static enum microcode_match_result compare_patch(
                                   old_header->pf, old_header->rev);
 }
 
-/*
- * return 0 - no update found
- * return 1 - found update
- * return < 0 - error
- */
-static int get_matching_microcode(const void *mc)
+static struct microcode_patch *allow_microcode_patch(
+    const struct microcode_header_intel *mc_header)
 {
-    const struct microcode_header_intel *mc_header = mc;
     unsigned long total_size = get_totalsize(mc_header);
     void *new_mc = xmalloc_bytes(total_size);
     struct microcode_patch *new_patch = xmalloc(struct microcode_patch);
-    unsigned int __maybe_unused cpu = smp_processor_id();
 
     if ( !new_patch || !new_mc )
     {
         xfree(new_patch);
         xfree(new_mc);
         printk(XENLOG_ERR "microcode: Can not allocate memory\n");
-        return -ENOMEM;
+        return ERR_PTR(-ENOMEM);
     }
-    memcpy(new_mc, mc, total_size);
+    memcpy(new_mc, mc_header, total_size);
     new_patch->mc_intel = new_mc;
 
-    if ( !match_cpu(new_patch) )
-    {
-        free_patch(new_patch);
-        return 0;
-    }
-
-    if ( !microcode_update_cache(new_patch) )
-        return 0;
-
-    pr_debug("microcode: CPU%d found a matching microcode update with"
-             " version %#x (current=%#x)\n",
-             cpu, mc_header->rev, this_cpu(cpu_sig).rev);
-
-    return 1;
+    return new_patch;
 }
 
-static int apply_microcode(void)
+static int apply_microcode(const struct microcode_patch *patch)
 {
     unsigned long flags;
     uint64_t msr_content;
@@ -320,7 +301,6 @@  static int apply_microcode(void)
     unsigned int cpu_num = raw_smp_processor_id();
     struct cpu_signature *sig = &this_cpu(cpu_sig);
     const struct microcode_intel *mc_intel;
-    const struct microcode_patch *patch = microcode_get_cache();
 
     if ( !match_cpu(patch) )
         return -EINVAL;
@@ -388,26 +368,39 @@  static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
     return offset + total_size;
 }
 
-static int cpu_request_microcode(const void *buf, size_t size)
+static struct microcode_patch *cpu_request_microcode(const void *buf,
+                                                     size_t size)
 {
     long offset = 0;
     int error = 0;
     void *mc;
+    struct microcode_patch *patch = NULL;
 
     while ( (offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0 )
     {
+        struct microcode_patch *new_patch;
+
         error = microcode_sanity_check(mc);
         if ( error )
             break;
-        error = get_matching_microcode(mc);
-        if ( error < 0 )
+
+        new_patch = allow_microcode_patch(mc);
+        if ( IS_ERR(new_patch) )
+        {
+            error = PTR_ERR(new_patch);
             break;
-        /*
-         * It's possible the data file has multiple matching ucode,
-         * lets keep searching till the latest version
-         */
-        if ( error == 1 )
-            error = 0;
+        }
+
+        /* Compare patches and store the one with higher revision */
+        if ( !patch && match_cpu(new_patch) )
+            patch = new_patch;
+        else if ( patch && (compare_patch(new_patch, patch) == NEW_UCODE) )
+        {
+            free_patch(patch);
+            patch = new_patch;
+        }
+        else
+            free_patch(new_patch);
 
         xfree(mc);
     }
@@ -416,10 +409,10 @@  static int cpu_request_microcode(const void *buf, size_t size)
     if ( offset < 0 )
         error = offset;
 
-    if ( !error && match_cpu(microcode_get_cache()) )
-        error = apply_microcode();
+    if ( error && !patch )
+        patch = ERR_PTR(error);
 
-    return error;
+    return patch;
 }
 
 static const struct microcode_ops microcode_intel_ops = {
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index de19b67..cd5f9cc 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -363,10 +363,7 @@  void start_secondary(void *unused)
 
     initialize_cpu_data(cpu);
 
-    if ( system_state <= SYS_STATE_smp_boot )
-        early_microcode_update_cpu(false);
-    else
-        microcode_resume_cpu();
+    early_microcode_update_cpu();
 
     /*
      * If MSR_SPEC_CTRL is available, apply Xen's default setting and discard
diff --git a/xen/include/asm-x86/microcode.h b/xen/include/asm-x86/microcode.h
index e6842d4..3fa3acd 100644
--- a/xen/include/asm-x86/microcode.h
+++ b/xen/include/asm-x86/microcode.h
@@ -19,9 +19,10 @@  struct microcode_patch {
 };
 
 struct microcode_ops {
-    int (*cpu_request_microcode)(const void *buf, size_t size);
+    struct microcode_patch *(*cpu_request_microcode)(const void *buf,
+                                                     size_t size);
     int (*collect_cpu_info)(struct cpu_signature *csig);
-    int (*apply_microcode)(void);
+    int (*apply_microcode)(const struct microcode_patch *patch);
     int (*start_update)(void);
     void (*free_patch)(struct microcode_patch *patch);
     bool (*match_cpu)(const struct microcode_patch *patch);
@@ -39,7 +40,4 @@  struct cpu_signature {
 DECLARE_PER_CPU(struct cpu_signature, cpu_sig);
 extern const struct microcode_ops *microcode_ops;
 
-const struct microcode_patch *microcode_get_cache(void);
-bool microcode_update_cache(struct microcode_patch *patch);
-
 #endif /* ASM_X86__MICROCODE_H */
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 8b7e484..d8668e6 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -573,8 +573,7 @@  int guest_wrmsr_xen(struct vcpu *v, uint32_t idx, uint64_t val);
 
 void microcode_set_module(unsigned int);
 int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len);
-int microcode_resume_cpu(void);
-int early_microcode_update_cpu(bool start_update);
+int early_microcode_update_cpu(void);
 int early_microcode_init(void);
 int microcode_init_intel(void);
 int microcode_init_amd(void);