
[v4,4/8] mm: Scrub memory from idle loop

Message ID 1495209040-11101-5-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky May 19, 2017, 3:50 p.m. UTC
Instead of scrubbing pages during guest destruction (from
free_heap_pages()) do this opportunistically, from the idle loop.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
Changes in v4:
* Be careful with tasklets in idle_loop()
* Use per-cpu mapcache override
* Update node_to_scrub() algorithm to select closest node (and add comment
   explaining what it does)
* Put buddy back in the heap directly (as opposed to using merge_and_free_buddy()
  which is dropped anyway)
* Don't stop scrubbing immediately when softirq is pending, try to scrub at least
  a few (8) pages.

 xen/arch/arm/domain.c      |  16 ++++---
 xen/arch/x86/domain.c      |   3 +-
 xen/arch/x86/domain_page.c |   8 ++--
 xen/common/page_alloc.c    | 113 +++++++++++++++++++++++++++++++++++++++------
 xen/include/xen/mm.h       |   1 +
 5 files changed, 117 insertions(+), 24 deletions(-)

Comments

Jan Beulich June 12, 2017, 8:08 a.m. UTC | #1
>>> On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
> Instead of scrubbing pages during guest destruction (from
> free_heap_pages()) do this opportunistically, from the idle loop.

This is too brief for my taste. In particular the re-ordering ...

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -118,8 +118,9 @@ static void idle_loop(void)
>      {
>          if ( cpu_is_offline(smp_processor_id()) )
>              play_dead();
> -        (*pm_idle)();
>          do_tasklet();
> +        if ( cpu_is_haltable(smp_processor_id()) && !scrub_free_pages() )
> +            (*pm_idle)();
>          do_softirq();

... here (and its correctness / safety) needs an explanation. Not
processing tasklets right after idle wakeup is a not obviously
correct change. Nor is it immediately clear why this needs to be
pulled ahead for your purposes.

> --- a/xen/arch/x86/domain_page.c
> +++ b/xen/arch/x86/domain_page.c
> @@ -18,12 +18,14 @@
>  #include <asm/hardirq.h>
>  #include <asm/setup.h>
>  
> -static struct vcpu *__read_mostly override;
> +static DEFINE_PER_CPU(struct vcpu *, override);
>  
>  static inline struct vcpu *mapcache_current_vcpu(void)
>  {
> +    struct vcpu *v, *this_vcpu = this_cpu(override);
> +
>      /* In the common case we use the mapcache of the running VCPU. */
> -    struct vcpu *v = override ?: current;
> +    v = this_vcpu ?: current;

What's wrong with

    struct vcpu *v = this_cpu(override) ?: current;

? And this, btw, is another change which should have an explanation
in the commit message.

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1010,15 +1010,79 @@ static int reserve_offlined_page(struct page_info *head)
>      return count;
>  }
>  
> -static void scrub_free_pages(unsigned int node)
> +static nodemask_t node_scrubbing;
> +
> +/*
> + * If get_node is true this will return closest node that needs to be scrubbed,
> + * with appropriate bit in node_scrubbing set.
> + * If get_node is not set, this will return *a* node that needs to be scrubbed.
> + * node_scrubbing bitmask will no be updated.
> + * If no node needs scrubbing then NUMA_NO_NODE is returned.
> + */
> +static unsigned int node_to_scrub(bool get_node)
>  {
> -    struct page_info *pg;
> -    unsigned int zone;
> +    nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
> +    nodeid_t closest = NUMA_NO_NODE;
> +    u8 dist, shortest = 0xff;
>  
> -    ASSERT(spin_is_locked(&heap_lock));
> +    if ( node == NUMA_NO_NODE )
> +        node = 0;
>  
> -    if ( !node_need_scrub[node] )
> -        return;
> +    if ( node_need_scrub[node] &&
> +         (!get_node || !node_test_and_set(node, node_scrubbing)) )
> +        return node;
> +
> +    /*
> +     * See if there are memory-only nodes that need scrubbing and choose
> +     * the closest one.
> +     */
> +    local_node = node;
> +    while ( 1 )

As some compilers / compiler versions warn about such constructs
("condition is always true"), I generally recommend using "for ( ; ; )"
instead.

> +    {
> +        do {
> +            node = cycle_node(node, node_online_map);
> +        } while ( !cpumask_empty(&node_to_cpumask(node)) &&
> +                  (node != local_node) );
> +
> +        if ( node == local_node )
> +            break;
> +
> +        if ( node_need_scrub[node] )
> +        {
> +            if ( !get_node )
> +                return node;
> +
> +            dist = __node_distance(local_node, node);
> +            if ( dist < shortest || closest == NUMA_NO_NODE )
> +            {
> +                if ( !node_test_and_set(node, node_scrubbing) )
> +                {
> +                    if ( closest != NUMA_NO_NODE )
> +                        node_clear(closest, node_scrubbing);

You call this function with no locks held, yet you temporarily set a
bit in node_scrubbing which you then may clear again here. That'll
prevent another idle CPU from doing scrubbing on this node afaict,
which, while not a bug, is not optimal. Therefore I think for this
second part of the function you actually want to acquire the heap
lock.

> +                    shortest = dist;
> +                    closest = node;
> +                }
> +            }

Also note that two if()s like the above, when the inner one also
has no else, can and should be combined into one.
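
For illustration, folding the two if()s as suggested (a sketch against the v4 code, leaving the locking question above aside) would look roughly like:

            dist = __node_distance(local_node, node);
            if ( (dist < shortest || closest == NUMA_NO_NODE) &&
                 !node_test_and_set(node, node_scrubbing) )
            {
                if ( closest != NUMA_NO_NODE )
                    node_clear(closest, node_scrubbing);
                shortest = dist;
                closest = node;
            }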

> @@ -1035,22 +1099,46 @@ static void scrub_free_pages(unsigned int node)
>  
>                  for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
>                  {
> +                    cnt++;
>                      if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
>                      {
>                          scrub_one_page(&pg[i]);
>                          pg[i].count_info &= ~PGC_need_scrub;
>                          node_need_scrub[node]--;
> +                        cnt += 100; /* scrubbed pages add heavier weight. */
>                      }

While it doesn't matter much, I would guess that your intention
really was to either do the "cnt++" in an "else" to this "if()", or
use 99 instead of 100 above.
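
A sketch of the first variant, so a scrubbed page contributes exactly 100 to the weight rather than 101:

                    if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
                    {
                        scrub_one_page(&pg[i]);
                        pg[i].count_info &= ~PGC_need_scrub;
                        node_need_scrub[node]--;
                        cnt += 100; /* scrubbed pages add heavier weight. */
                    }
                    else
                        cnt++;   /* already-clean page: light weight. */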

> -                }
>  
> -                page_list_del(pg, &heap(node, zone, order));
> -                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
> +                    /*
> +                     * Scrub a few (8) pages before becoming eligible for
> +                     * preemtion. But also count non-scrubbing loop iteration

"preemption" and "iterations"

> +                     * so that we don't get stuck here with an almost clean
> +                     * heap.
> +                     */
> +                    if ( softirq_pending(cpu) && cnt > 800 )

Please switch both sides of the &&.
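
I.e., presumably so the cheaper counter check is evaluated first and softirq_pending() is only consulted once enough pages have been processed (a sketch of the requested change):

                    if ( cnt > 800 && softirq_pending(cpu) )
                    {
                        preempt = true;
                        break;
                    }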

> +                    {
> +                        preempt = true;
> +                        break;
> +                    }
> +                }
>  
> -                if ( node_need_scrub[node] == 0 )
> -                    return;
> +                if ( i == (1U << order) )
> +                {
> +                    page_list_del(pg, &heap(node, zone, order));
> +                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
> +                }
> +                else
> +                    pg->u.free.first_dirty = i + 1;

This seems wrong if you set "preempt" after scrubbing the last page
of a buddy - in that case the increment of i in the loop header is being
skipped yet the entire buddy is now clean, so the if() condition here
wants to be "i >= (1U << order) - 1" afaict. In any event it needs to
be impossible for first_dirty to obtain a value of 1U << order here.
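
One way to read this (a sketch of a possible fix along the lines Jan suggests, not the committed code) is to treat a break taken after processing the last page the same as a fully clean buddy:

                if ( i >= (1U << order) - 1 )
                {
                    /* All pages up to and including i were processed. */
                    page_list_del(pg, &heap(node, zone, order));
                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
                }
                else
                    pg->u.free.first_dirty = i + 1; /* stays below 1U << order */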

> +                if ( preempt || (node_need_scrub[node] == 0) )
> +                    goto out;
>              }
>          } while ( order-- != 0 );
>      }
> +
> + out:
> +    spin_unlock(&heap_lock);
> +    node_clear(node, node_scrubbing);

With the memory-less node comment above it may be necessary
to switch these two around.
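
I.e., if node_to_scrub() ends up updating node_scrubbing under the heap lock, the epilogue would presumably need to become (sketch):

 out:
    node_clear(node, node_scrubbing);
    spin_unlock(&heap_lock);
    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);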

Jan
Boris Ostrovsky June 12, 2017, 5:01 p.m. UTC | #2
On 06/12/2017 04:08 AM, Jan Beulich wrote:
>>>> On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
>> Instead of scrubbing pages during guest destruction (from
>> free_heap_pages()) do this opportunistically, from the idle loop.
> This is too brief for my taste. In particular the re-ordering ...
>
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -118,8 +118,9 @@ static void idle_loop(void)
>>      {
>>          if ( cpu_is_offline(smp_processor_id()) )
>>              play_dead();
>> -        (*pm_idle)();
>>          do_tasklet();
>> +        if ( cpu_is_haltable(smp_processor_id()) && !scrub_free_pages() )
>> +            (*pm_idle)();
>>          do_softirq();
> ... here (and its correctness / safety) needs an explanation. Not
> processing tasklets right after idle wakeup is a not obviously
> correct change. Nor is it immediately clear why this needs to be
> pulled ahead for your purposes.

Are you asking for a comment here (i.e. the explanation given by Dario
(added)  in
https://lists.xenproject.org/archives/html/xen-devel/2017-05/msg01215.html)
or are you saying something is wrong?


>
>> +        if ( node_need_scrub[node] )
>> +        {
>> +            if ( !get_node )
>> +                return node;
>> +
>> +            dist = __node_distance(local_node, node);
>> +            if ( dist < shortest || closest == NUMA_NO_NODE )
>> +            {
>> +                if ( !node_test_and_set(node, node_scrubbing) )
>> +                {
>> +                    if ( closest != NUMA_NO_NODE )
>> +                        node_clear(closest, node_scrubbing);
> You call this function with no locks held, yet you temporarily set a
> bit in node_scrubbing which you then may clear again here. That'll
> prevent another idle CPU to do scrubbing on this node afaict,
> which, while not a bug, is not optimal. Therefore I think for this
> second part of the function you actually want to acquire the heap
> lock.

I actually specifically didn't want to take the heap lock here since we
will be calling this routine quite often and in most cases no scrubbing
will be needed.


-boris
Dario Faggioli June 12, 2017, 9:28 p.m. UTC | #3
On Mon, 2017-06-12 at 13:01 -0400, Boris Ostrovsky wrote:
> On 06/12/2017 04:08 AM, Jan Beulich wrote:
> > > > > On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
> > > 
> > > Instead of scrubbing pages during guest destruction (from
> > > free_heap_pages()) do this opportunistically, from the idle loop.
> > 
> > This is too brief for my taste. In particular the re-ordering ...
> > 
> > > --- a/xen/arch/x86/domain.c
> > > +++ b/xen/arch/x86/domain.c
> > > @@ -118,8 +118,9 @@ static void idle_loop(void)
> > >      {
> > >          if ( cpu_is_offline(smp_processor_id()) )
> > >              play_dead();
> > > -        (*pm_idle)();
> > >          do_tasklet();
> > > +        if ( cpu_is_haltable(smp_processor_id()) &&
> > > !scrub_free_pages() )
> > > +            (*pm_idle)();
> > >          do_softirq();
> > 
> > ... here (and its correctness / safety) needs an explanation. Not
> > processing tasklets right after idle wakeup is a not obviously
> > correct change. 
>
Well, one can also see things the other way round, though. I.e.:
considering that do_tasklet() is here for when we force the idle vcpu
into execution right because we want to process tasklets, doing that
only after having tried to sleep is not obviously correct.

And in fact, there's an unwritten (AFAICT) requirement that every
implementation of pm_idle should not actually sleep if there are
tasklets pending.

Truth is, IMO, that we may be here for two reasons: 1) going to sleep
or 2) running tasklets, and the only thing we can do is guessing (and
ordering the call according to such guess) and checking whether we
guessed right or wrong. That is:
 - guess it's 1. Check whether it's really 1. If it is, go ahead with  
    it; if not, go for 2;
 - guess it's 2. Check whether it's really 2. If it is, go ahead with
   it, if not, go for 1;

Now scrubbing is kind of a third reason why we may be here, and doing
as done in the code above (although I'm not super happy with the final
look of the result either) should make all the use cases happy.

Also, what's the scenario where you think this may be problematic?
AFAICT, tasklets are vcpu context, or softirq context. If some softirq
context tasklet work is scheduled for a CPU while it is sleeping,
TASKLET_SOFTIRQ is raised, and the call to do_softirq() --which still
happens right after the wakeup-- will take care of it.

If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is raised.
do_softirq() will call the scheduler, which will see that there is vcpu
tasklet work to do, and hence confirm in execution the idle vcpu, which
will get to execute do_tasklet().

Anyway...

> > Nor is it immediately clear why this needs to be
> > pulled ahead for your purposes.
> 
> Are you asking for a comment here (i.e. the explanation given by
> Dario
> (added)  in
> https://lists.xenproject.org/archives/html/xen-devel/2017-05/msg01215
> .html)
> or are you saying something is wrong?
> 
...If it's more commenting that's being asked, either in the code or in
the changelog, that would indeed improve things, I agree.

Regards,
Dario
Jan Beulich June 13, 2017, 8:12 a.m. UTC | #4
>>> On 12.06.17 at 19:01, <boris.ostrovsky@oracle.com> wrote:
> On 06/12/2017 04:08 AM, Jan Beulich wrote:
>>>>> On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
>>> Instead of scrubbing pages during guest destruction (from
>>> free_heap_pages()) do this opportunistically, from the idle loop.
>> This is too brief for my taste. In particular the re-ordering ...
>>
>>> --- a/xen/arch/x86/domain.c
>>> +++ b/xen/arch/x86/domain.c
>>> @@ -118,8 +118,9 @@ static void idle_loop(void)
>>>      {
>>>          if ( cpu_is_offline(smp_processor_id()) )
>>>              play_dead();
>>> -        (*pm_idle)();
>>>          do_tasklet();
>>> +        if ( cpu_is_haltable(smp_processor_id()) && !scrub_free_pages() )
>>> +            (*pm_idle)();
>>>          do_softirq();
>> ... here (and its correctness / safety) needs an explanation. Not
>> processing tasklets right after idle wakeup is a not obviously
>> correct change. Nor is it immediately clear why this needs to be
>> pulled ahead for your purposes.
> 
> Are you asking for a comment here (i.e. the explanation given by Dario
> (added)  in
> https://lists.xenproject.org/archives/html/xen-devel/2017-05/msg01215.html)
> or are you saying something is wrong?

To judge whether the change is correct I'd like to have an
explanation in the commit message.

>>> +        if ( node_need_scrub[node] )
>>> +        {
>>> +            if ( !get_node )
>>> +                return node;
>>> +
>>> +            dist = __node_distance(local_node, node);
>>> +            if ( dist < shortest || closest == NUMA_NO_NODE )
>>> +            {
>>> +                if ( !node_test_and_set(node, node_scrubbing) )
>>> +                {
>>> +                    if ( closest != NUMA_NO_NODE )
>>> +                        node_clear(closest, node_scrubbing);
>> You call this function with no locks held, yet you temporarily set a
>> bit in node_scrubbing which you then may clear again here. That'll
>> prevent another idle CPU to do scrubbing on this node afaict,
>> which, while not a bug, is not optimal. Therefore I think for this
>> second part of the function you actually want to acquire the heap
>> lock.
> 
> I actually specifically didn't want to take the heap lock here since we
> will be calling this routine quite often and in most cases no scrubbing
> will be needed.

I'm not convinced - memory-only nodes shouldn't be that common,
so in a presumably large share of cases we don't even need to
enter this second part of the function. And if a debatable choice
is being made, giving the reason in a short comment would help
reviewers judge for themselves whether indeed the chosen
approach is the lesser of two (or more) possible evils.

Jan
Jan Beulich June 13, 2017, 8:19 a.m. UTC | #5
>>> On 12.06.17 at 23:28, <dario.faggioli@citrix.com> wrote:
> On Mon, 2017-06-12 at 13:01 -0400, Boris Ostrovsky wrote:
>> On 06/12/2017 04:08 AM, Jan Beulich wrote:
>> > > > > On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
>> > > 
>> > > Instead of scrubbing pages during guest destruction (from
>> > > free_heap_pages()) do this opportunistically, from the idle loop.
>> > 
>> > This is too brief for my taste. In particular the re-ordering ...
>> > 
>> > > --- a/xen/arch/x86/domain.c
>> > > +++ b/xen/arch/x86/domain.c
>> > > @@ -118,8 +118,9 @@ static void idle_loop(void)
>> > >      {
>> > >          if ( cpu_is_offline(smp_processor_id()) )
>> > >              play_dead();
>> > > -        (*pm_idle)();
>> > >          do_tasklet();
>> > > +        if ( cpu_is_haltable(smp_processor_id()) &&
>> > > !scrub_free_pages() )
>> > > +            (*pm_idle)();
>> > >          do_softirq();
>> > 
>> > ... here (and its correctness / safety) needs an explanation. Not
>> > processing tasklets right after idle wakeup is a not obviously
>> > correct change. 
>>
> Well, one can also see things the other way round, though. I.e.:
> considering that do_tasklet() is here for when we force the idle vcpu
> into execution right because we want to process tasklets, doing that
> only after having tried to sleep is not obviously correct.

That's a valid point, but would then call for the re-ordering to
be done in a separate commit with proper explanation.

> And in fact, there's an unwritten (AFAICT) requirement that every
> implementation of pm_idle should not actually sleep if there are
> tasklets pending.

Unwritten or not - that check before actually going to sleep is
quite obviously required, as it needs to be done with interrupts
already disabled (i.e. can't be done _only_ here).

> Truth is, IMO, that we may be here for two reasons: 1) going to sleep
> or 2) running tasklet, and the only think we can do is guessing (and
> ordering the call according to such guess) and checking whether we
> guessed right or wrong. That is:
>  - guess it's 1. Check whether it's really 1. If it is, go ahead with  
>     it; if not, go for 2;
>  - guess it's 2. Check whether it's really 2. If it is, go ahead with
>    it, if not, go for 1;
> 
> Now scrubbing is kind of a third reason why we may be here, and doing
> as done in the code above (although I'm not super happy of the final
> look of the result either), should make all the use cases happy.
> 
> Also, what's the scenario where you think this may be problematic?

First of all I'm not sure there's anything problematic here. But with
no explanation given at all, the change also isn't obviously fine, as
it does alter behavior. If there's indeed nothing that can affect
what do_tasklet() would do and that might happen while we're in
the low level idle handler, then fine. But this needs to be proven.

> AFAICT, tasklets are vcpu context, or softirq context. If some softirq
> context tasklet work is scheduled for a CPU while it is sleeping,
> TASKLET_SOFTIRQ is raised, and the call to do_softirq() --which still
> happens right after the wakeup-- will take care of it.
> 
> If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is raised.
> do_softirq() will call the scheduler, which will see that there is vcpu
> tasklet work to do, and hence confirm in execution the idle vcpu, which
> will get to execute do_tasklet().

Right, so something along these lines will need to go into the commit
message.

Jan
Boris Ostrovsky June 13, 2017, 6:20 p.m. UTC | #6
>>>> +        if ( node_need_scrub[node] )
>>>> +        {
>>>> +            if ( !get_node )
>>>> +                return node;
>>>> +
>>>> +            dist = __node_distance(local_node, node);
>>>> +            if ( dist < shortest || closest == NUMA_NO_NODE )
>>>> +            {
>>>> +                if ( !node_test_and_set(node, node_scrubbing) )
>>>> +                {
>>>> +                    if ( closest != NUMA_NO_NODE )
>>>> +                        node_clear(closest, node_scrubbing);
>>> You call this function with no locks held, yet you temporarily set a
>>> bit in node_scrubbing which you then may clear again here. That'll
>>> prevent another idle CPU to do scrubbing on this node afaict,
>>> which, while not a bug, is not optimal. Therefore I think for this
>>> second part of the function you actually want to acquire the heap
>>> lock.
>> I actually specifically didn't want to take the heap lock here since we
>> will be calling this routine quite often and in most cases no scrubbing
>> will be needed.
> I'm not convinced - memory-only nodes shouldn't be that common,
> so in a presumably large share of cases we don't even need to
> enter this second part of the function. And if a debatable choice
> is being made, giving the reason in a short comment would help
> reviewers judge for themselves whether indeed the chosen
> approach is the lesser of two (or more) possible evils.

I realize that CPU-less nodes are rare, but if we are on such a system we
will always be adding pressure on the heap lock, especially on systems
with many CPUs.

Even if a CPU unnecessarily loses its turn to scrub because another
processor held the node and decided not to scrub it, the first CPU can
come back the next time it wakes up from sleep.

Another alternative could be adding a local lock, just for this routine.
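
If that route were taken, a minimal sketch (the lock name and exact placement are hypothetical, not part of the posted patch) might be:

    /* Hypothetical local lock, serializing only the closest-node search. */
    static DEFINE_SPINLOCK(node_scrub_lock);

    spin_lock(&node_scrub_lock);
    /* ... walk memory-only nodes, update 'closest' / node_scrubbing ... */
    spin_unlock(&node_scrub_lock);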

-boris
Boris Ostrovsky June 13, 2017, 6:39 p.m. UTC | #7
On 06/13/2017 04:19 AM, Jan Beulich wrote:
>>>> On 12.06.17 at 23:28, <dario.faggioli@citrix.com> wrote:
>> On Mon, 2017-06-12 at 13:01 -0400, Boris Ostrovsky wrote:
>>> On 06/12/2017 04:08 AM, Jan Beulich wrote:
>>>>>>> On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
>>>>> Instead of scrubbing pages during guest destruction (from
>>>>> free_heap_pages()) do this opportunistically, from the idle loop.
>>>> This is too brief for my taste. In particular the re-ordering ...
>>>>
>>>>> --- a/xen/arch/x86/domain.c
>>>>> +++ b/xen/arch/x86/domain.c
>>>>> @@ -118,8 +118,9 @@ static void idle_loop(void)
>>>>>      {
>>>>>          if ( cpu_is_offline(smp_processor_id()) )
>>>>>              play_dead();
>>>>> -        (*pm_idle)();
>>>>>          do_tasklet();
>>>>> +        if ( cpu_is_haltable(smp_processor_id()) &&
>>>>> !scrub_free_pages() )
>>>>> +            (*pm_idle)();
>>>>>          do_softirq();
>>>> ... here (and its correctness / safety) needs an explanation. Not
>>>> processing tasklets right after idle wakeup is a not obviously
>>>> correct change. 
>> Well, one can also see things the other way round, though. I.e.:
>> considering that do_tasklet() is here for when we force the idle vcpu
>> into execution right because we want to process tasklets, doing that
>> only after having tried to sleep is not obviously correct.
> That's a valid point, but would then call for the re-ordering to
> be done in a separate commit with proper explanation.
>
>> And in fact, there's an unwritten (AFAICT) requirement that every
>> implementation of pm_idle should not actually sleep if there are
>> tasklets pending.
> Unwritten or not - that check before actually going to sleep is
> quite obviously required, as it needs to be done with interrupts
> already disabled (i.e. can't be done _only_ here).
>
>> Truth is, IMO, that we may be here for two reasons: 1) going to sleep
>> or 2) running tasklet, and the only think we can do is guessing (and
>> ordering the call according to such guess) and checking whether we
>> guessed right or wrong. That is:
>>  - guess it's 1. Check whether it's really 1. If it is, go ahead with  
>>     it; if not, go for 2;
>>  - guess it's 2. Check whether it's really 2. If it is, go ahead with
>>    it, if not, go for 1;
>>
>> Now scrubbing is kind of a third reason why we may be here, and doing
>> as done in the code above (although I'm not super happy of the final
>> look of the result either), should make all the use cases happy.
>>
>> Also, what's the scenario where you think this may be problematic?
> First of all I'm not sure there's anything problematic here. But with
> no explanation given at all, the change also isn't obviously fine, as
> it does alter behavior. If there's indeed nothing that can affect
> what do_tasklet() would do and that might happen while we're in
> the low level idle handler, then fine. But this needs to be proven.
>
>> AFAICT, tasklets are vcpu context, or softirq context. If some softirq
>> context tasklet work is scheduled for a CPU while it is sleeping,
>> TASKLET_SOFTIRQ is raised, and the call to do_softirq() --which still
>> happens right after the wakeup-- will take care of it.
>>
>> If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is raised.
>> do_softirq() will call the scheduler, which will see that there is vcpu
>> tasklet work to do, and hence confirm in execution the idle vcpu, which
>> will get to execute do_tasklet().
> Right, so something along these lines will need to go into the commit
> message.


So would you then prefer to separate this into two patches, with the
first just moving do_tasklet() above sleeping?

-boris
Dario Faggioli June 13, 2017, 8:36 p.m. UTC | #8
On Tue, 2017-06-13 at 14:39 -0400, Boris Ostrovsky wrote:
> On 06/13/2017 04:19 AM, Jan Beulich wrote:
> > > > > On 12.06.17 at 23:28, <dario.faggioli@citrix.com> wrote:
> > > If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is
> > > raised.
> > > do_softirq() will call the scheduler, which will see that there
> > > is vcpu
> > > tasklet work to do, and hence confirm in execution the idle vcpu,
> > > which
> > > will get to execute do_tasklet().
> > 
> > Right, so something along these lines will need to go into the
> > commit
> > message.
> 
> 
> So would you then prefer to separate this into two patches, with the
> first just moving do_tasklet() above sleeping?
> 
If you want, I can send a patch to this effect (which can then either
just go in, or Boris can carry it in this series).

Regards,
Dario
Boris Ostrovsky June 13, 2017, 9:54 p.m. UTC | #9
On 06/13/2017 04:36 PM, Dario Faggioli wrote:
> On Tue, 2017-06-13 at 14:39 -0400, Boris Ostrovsky wrote:
>> On 06/13/2017 04:19 AM, Jan Beulich wrote:
>>>>>> On 12.06.17 at 23:28, <dario.faggioli@citrix.com> wrote:
>>>> If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is
>>>> raised.
>>>> do_softirq() will call the scheduler, which will see that there
>>>> is vcpu
>>>> tasklet work to do, and hence confirm in execution the idle vcpu,
>>>> which
>>>> will get to execute do_tasklet().
>>> Right, so something along these lines will need to go into the
>>> commit
>>> message.
>>
>> So would you then prefer to separate this into two patches, with the
>> first just moving do_tasklet() above sleeping?
>>
> If you want, I can send a patch to this effect (which can then either
> just go in, or Boris can carry it in this series).

Thanks for the offer, Dario.

We can do it either way, but FYI I am leaving for ~3 weeks next Friday, so
I am not sure I will be able to have this series ready and tested by then.

-boris
Jan Beulich June 14, 2017, 9:17 a.m. UTC | #10
>>> On 13.06.17 at 20:20, <boris.ostrovsky@oracle.com> wrote:

>>>>> +        if ( node_need_scrub[node] )
>>>>> +        {
>>>>> +            if ( !get_node )
>>>>> +                return node;
>>>>> +
>>>>> +            dist = __node_distance(local_node, node);
>>>>> +            if ( dist < shortest || closest == NUMA_NO_NODE )
>>>>> +            {
>>>>> +                if ( !node_test_and_set(node, node_scrubbing) )
>>>>> +                {
>>>>> +                    if ( closest != NUMA_NO_NODE )
>>>>> +                        node_clear(closest, node_scrubbing);
>>>> You call this function with no locks held, yet you temporarily set a
>>>> bit in node_scrubbing which you then may clear again here. That'll
>>>> prevent another idle CPU to do scrubbing on this node afaict,
>>>> which, while not a bug, is not optimal. Therefore I think for this
>>>> second part of the function you actually want to acquire the heap
>>>> lock.
>>> I actually specifically didn't want to take the heap lock here since we
>>> will be calling this routine quite often and in most cases no scrubbing
>>> will be needed.
>> I'm not convinced - memory-only nodes shouldn't be that common,
>> so in a presumably large share of cases we don't even need to
>> enter this second part of the function. And if a debatable choice
>> is being made, giving the reason in a short comment would help
>> reviewers judge for themselves whether indeed the chosen
>> approach is the lesser of two (or more) possible evils.
> 
> I realize that CPU-less nodes are rare but if we are on such a system we
> will always be adding pressure on the heap lock. Especially on systems
> with many CPUs.
> 
> Even if a CPU unnecessarily loses its turn to scrub because another
> processor held the node and decided not to scrub it the first CPU can
> come back next time when it wakes up from sleep.
> 
> Another alternative could be adding a local lock, just for this routine.

If no accesses elsewhere are affected, that's certainly an option.
But no matter what route you go, please reason about it in the
commit message.

Jan
Jan Beulich June 14, 2017, 9:18 a.m. UTC | #11
>>> On 13.06.17 at 20:39, <boris.ostrovsky@oracle.com> wrote:
> On 06/13/2017 04:19 AM, Jan Beulich wrote:
>>>>> On 12.06.17 at 23:28, <dario.faggioli@citrix.com> wrote:
>>> On Mon, 2017-06-12 at 13:01 -0400, Boris Ostrovsky wrote:
>>>> On 06/12/2017 04:08 AM, Jan Beulich wrote:
>>>>>>>> On 19.05.17 at 17:50, <boris.ostrovsky@oracle.com> wrote:
>>>>>> Instead of scrubbing pages during guest destruction (from
>>>>>> free_heap_pages()) do this opportunistically, from the idle loop.
>>>>> This is too brief for my taste. In particular the re-ordering ...
>>>>>
>>>>>> --- a/xen/arch/x86/domain.c
>>>>>> +++ b/xen/arch/x86/domain.c
>>>>>> @@ -118,8 +118,9 @@ static void idle_loop(void)
>>>>>>      {
>>>>>>          if ( cpu_is_offline(smp_processor_id()) )
>>>>>>              play_dead();
>>>>>> -        (*pm_idle)();
>>>>>>          do_tasklet();
>>>>>> +        if ( cpu_is_haltable(smp_processor_id()) &&
>>>>>> !scrub_free_pages() )
>>>>>> +            (*pm_idle)();
>>>>>>          do_softirq();
>>>>> ... here (and its correctness / safety) needs an explanation. Not
>>>>> processing tasklets right after idle wakeup is a not obviously
>>>>> correct change. 
>>> Well, one can also see things the other way round, though. I.e.:
>>> considering that do_tasklet() is here for when we force the idle vcpu
>>> into execution right because we want to process tasklets, doing that
>>> only after having tried to sleep is not obviously correct.
>> That's a valid point, but would then call for the re-ordering to
>> be done in a separate commit with proper explanation.
>>
>>> And in fact, there's an unwritten (AFAICT) requirement that every
>>> implementation of pm_idle should not actually sleep if there are
>>> tasklets pending.
>> Unwritten or not - that check before actually going to sleep is
>> quite obviously required, as it needs to be done with interrupts
>> already disabled (i.e. can't be done _only_ here).
>>
>>> Truth is, IMO, that we may be here for two reasons: 1) going to sleep
>>> or 2) running tasklet, and the only think we can do is guessing (and
>>> ordering the call according to such guess) and checking whether we
>>> guessed right or wrong. That is:
>>>  - guess it's 1. Check whether it's really 1. If it is, go ahead with  
>>>     it; if not, go for 2;
>>>  - guess it's 2. Check whether it's really 2. If it is, go ahead with
>>>    it, if not, go for 1;
>>>
>>> Now scrubbing is kind of a third reason why we may be here, and doing
>>> as done in the code above (although I'm not super happy of the final
>>> look of the result either), should make all the use cases happy.
>>>
>>> Also, what's the scenario where you think this may be problematic?
>> First of all I'm not sure there's anything problematic here. But with
>> no explanation given at all, the change also isn't obviously fine, as
>> it does alter behavior. If there's indeed nothing that can affect
>> what do_tasklet() would do and that might happen while we're in
>> the low level idle handler, then fine. But this needs to be proven.
>>
>>> AFAICT, tasklets are vcpu context, or softirq context. If some softirq
>>> context tasklet work is scheduled for a CPU while it is sleeping,
>>> TASKLET_SOFTIRQ is raised, and the call to do_softirq() --which still
>>> happens right after the wakeup-- will take care of it.
>>>
>>> If some vcpu context work is scheduled, SCHEDULE_SOFTIRQ is raised.
>>> do_softirq() will call the scheduler, which will see that there is vcpu
>>> tasklet work to do, and hence confirm in execution the idle vcpu, which
>>> will get to execute do_tasklet().
>> Right, so something along these lines will need to go into the commit
>> message.
> 
> 
> So would you then prefer to separate this into two patches, with the
> first just moving do_tasklet() above sleeping?

Yes, please (no matter who of you two does so).

Jan

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 76310ed..9931ca2 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -46,15 +46,19 @@  void idle_loop(void)
         if ( cpu_is_offline(smp_processor_id()) )
             stop_cpu();
 
-        local_irq_disable();
-        if ( cpu_is_haltable(smp_processor_id()) )
+        do_tasklet();
+
+        if ( cpu_is_haltable(smp_processor_id()) && !scrub_free_pages() )
         {
-            dsb(sy);
-            wfi();
+            local_irq_disable();
+            if ( cpu_is_haltable(smp_processor_id()) )
+            {
+                dsb(sy);
+                wfi();
+            }
+            local_irq_enable();
         }
-        local_irq_enable();
 
-        do_tasklet();
         do_softirq();
         /*
          * We MUST be last (or before dsb, wfi). Otherwise after we get the
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 13cdc50..229711f 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -118,8 +118,9 @@  static void idle_loop(void)
     {
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
-        (*pm_idle)();
         do_tasklet();
+        if ( cpu_is_haltable(smp_processor_id()) && !scrub_free_pages() )
+            (*pm_idle)();
         do_softirq();
         /*
          * We MUST be last (or before pm_idle). Otherwise after we get the
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 71baede..cfe7cc1 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -18,12 +18,14 @@ 
 #include <asm/hardirq.h>
 #include <asm/setup.h>
 
-static struct vcpu *__read_mostly override;
+static DEFINE_PER_CPU(struct vcpu *, override);
 
 static inline struct vcpu *mapcache_current_vcpu(void)
 {
+    struct vcpu *v, *this_vcpu = this_cpu(override);
+
     /* In the common case we use the mapcache of the running VCPU. */
-    struct vcpu *v = override ?: current;
+    v = this_vcpu ?: current;
 
     /*
      * When current isn't properly set up yet, this is equivalent to
@@ -59,7 +61,7 @@  static inline struct vcpu *mapcache_current_vcpu(void)
 
 void __init mapcache_override_current(struct vcpu *v)
 {
-    override = v;
+    this_cpu(override) = v;
 }
 
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index b7c7426..6e505b1 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1010,15 +1010,79 @@  static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
-static void scrub_free_pages(unsigned int node)
+static nodemask_t node_scrubbing;
+
+/*
+ * If get_node is true this will return closest node that needs to be scrubbed,
+ * with appropriate bit in node_scrubbing set.
+ * If get_node is not set, this will return *a* node that needs to be scrubbed.
+ * node_scrubbing bitmask will no be updated.
+ * If no node needs scrubbing then NUMA_NO_NODE is returned.
+ */
+static unsigned int node_to_scrub(bool get_node)
 {
-    struct page_info *pg;
-    unsigned int zone;
+    nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
+    nodeid_t closest = NUMA_NO_NODE;
+    u8 dist, shortest = 0xff;
 
-    ASSERT(spin_is_locked(&heap_lock));
+    if ( node == NUMA_NO_NODE )
+        node = 0;
 
-    if ( !node_need_scrub[node] )
-        return;
+    if ( node_need_scrub[node] &&
+         (!get_node || !node_test_and_set(node, node_scrubbing)) )
+        return node;
+
+    /*
+     * See if there are memory-only nodes that need scrubbing and choose
+     * the closest one.
+     */
+    local_node = node;
+    while ( 1 )
+    {
+        do {
+            node = cycle_node(node, node_online_map);
+        } while ( !cpumask_empty(&node_to_cpumask(node)) &&
+                  (node != local_node) );
+
+        if ( node == local_node )
+            break;
+
+        if ( node_need_scrub[node] )
+        {
+            if ( !get_node )
+                return node;
+
+            dist = __node_distance(local_node, node);
+            if ( dist < shortest || closest == NUMA_NO_NODE )
+            {
+                if ( !node_test_and_set(node, node_scrubbing) )
+                {
+                    if ( closest != NUMA_NO_NODE )
+                        node_clear(closest, node_scrubbing);
+                    shortest = dist;
+                    closest = node;
+                }
+            }
+        }
+    }
+
+    return closest;
+}
+
+bool scrub_free_pages(void)
+{
+    struct page_info *pg;
+    unsigned int zone;
+    unsigned int cpu = smp_processor_id();
+    bool preempt = false;
+    nodeid_t node;
+    unsigned int cnt = 0;
+  
+    node = node_to_scrub(true);
+    if ( node == NUMA_NO_NODE )
+        return false;
+ 
+    spin_lock(&heap_lock);
 
     for ( zone = 0; zone < NR_ZONES; zone++ )
     {
@@ -1035,22 +1099,46 @@  static void scrub_free_pages(unsigned int node)
 
                 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
                 {
+                    cnt++;
                     if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
                     {
                         scrub_one_page(&pg[i]);
                         pg[i].count_info &= ~PGC_need_scrub;
                         node_need_scrub[node]--;
+                        cnt += 100; /* scrubbed pages add heavier weight. */
                     }
-                }
 
-                page_list_del(pg, &heap(node, zone, order));
-                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+                    /*
+                     * Scrub a few (8) pages before becoming eligible for
+                     * preemtion. But also count non-scrubbing loop iteration
+                     * so that we don't get stuck here with an almost clean
+                     * heap.
+                     */
+                    if ( softirq_pending(cpu) && cnt > 800 )
+                    {
+                        preempt = true;
+                        break;
+                    }
+                }
 
-                if ( node_need_scrub[node] == 0 )
-                    return;
+                if ( i == (1U << order) )
+                {
+                    page_list_del(pg, &heap(node, zone, order));
+                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+                }
+                else
+                    pg->u.free.first_dirty = i + 1;
+ 
+                if ( preempt || (node_need_scrub[node] == 0) )
+                    goto out;
             }
         } while ( order-- != 0 );
     }
+
+ out:
+    spin_unlock(&heap_lock);
+    node_clear(node, node_scrubbing);
+    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
 }
 
 /* Free 2^@order set of pages. */
@@ -1166,9 +1254,6 @@  static void free_heap_pages(
     if ( tainted )
         reserve_offlined_page(pg);
 
-    if ( need_scrub )
-        scrub_free_pages(node);
-
     spin_unlock(&heap_lock);
 }
 
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 0d4b7c2..ed90a61 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -138,6 +138,7 @@  void init_xenheap_pages(paddr_t ps, paddr_t pe);
 void xenheap_max_mfn(unsigned long mfn);
 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags);
 void free_xenheap_pages(void *v, unsigned int order);
+bool scrub_free_pages(void);
 #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
 #define free_xenheap_page(v) (free_xenheap_pages(v,0))
 /* Map machine page range in Xen virtual address space. */