Message ID | 20220208081902.3550911-5-yuzhao@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Multigenerational LRU Framework | expand |
Hello Yu, On Tue, Feb 08, 2022 at 01:18:54AM -0700, Yu Zhao wrote: > @@ -92,11 +92,196 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) > return lru; > } > > +#ifdef CONFIG_LRU_GEN > + > +static inline bool lru_gen_enabled(void) > +{ > + return true; > +} > + > +static inline bool lru_gen_in_fault(void) > +{ > + return current->in_lru_fault; > +} > + > +static inline int lru_gen_from_seq(unsigned long seq) > +{ > + return seq % MAX_NR_GENS; > +} > + > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > +{ > + unsigned long max_seq = lruvec->lrugen.max_seq; > + > + VM_BUG_ON(gen >= MAX_NR_GENS); > + > + /* see the comment on MIN_NR_GENS */ > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > +} I'm still reading the series, so correct me if I'm wrong: the "active" set is split into two generations for the sole purpose of the second-chance policy for fresh faults, right? If so, it'd be better to have the comment here instead of down by MIN_NR_GENS. This is the place that defines what "active" is, so this is where the reader asks what it means and what it implies. The definition of MIN_NR_GENS can be briefer: "need at least two for second chance, see lru_gen_is_active() for details". > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > + int zone, long delta) > +{ > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > + > + lockdep_assert_held(&lruvec->lru_lock); > + WARN_ON_ONCE(delta != (int)delta); > + > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > +} This is a duplicate of update_lru_size(), please use that instead. Yeah technically you don't need the mem_cgroup_update_lru_size() but that's not worth sweating over, better to keep it simple. > +static inline void lru_gen_balance_size(struct lruvec *lruvec, struct folio *folio, > + int old_gen, int new_gen) lru_gen_update_lru_sizes() for this one would be more descriptive imo and in line with update_lru_size() that it's built on. > +{ > + int type = folio_is_file_lru(folio); > + int zone = folio_zonenum(folio); > + int delta = folio_nr_pages(folio); > + enum lru_list lru = type * LRU_INACTIVE_FILE; > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > + > + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); > + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); > + VM_BUG_ON(old_gen == -1 && new_gen == -1); Could be a bit easier to read quickly with high-level descriptions: > + if (old_gen >= 0) > + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], > + lrugen->nr_pages[old_gen][type][zone] - delta); > + if (new_gen >= 0) > + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], > + lrugen->nr_pages[new_gen][type][zone] + delta); > + /* Addition */ > + if (old_gen < 0) { > + if (lru_gen_is_active(lruvec, new_gen)) > + lru += LRU_ACTIVE; > + lru_gen_update_size(lruvec, lru, zone, delta); > + return; > + } > + /* Removal */ > + if (new_gen < 0) { > + if (lru_gen_is_active(lruvec, old_gen)) > + lru += LRU_ACTIVE; > + lru_gen_update_size(lruvec, lru, zone, -delta); > + return; > + } > + /* Promotion */ > + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { > + lru_gen_update_size(lruvec, lru, zone, -delta); > + lru_gen_update_size(lruvec, lru + LRU_ACTIVE, zone, delta); > + } > + > + /* Promotion is legit while a page is on an LRU list, but demotion isn't. */ /* Demotion happens during aging when pages are isolated, never on-LRU */ > + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); > +} On that note, please move introduction of the promotion and demotion bits to the next patch. They aren't used here yet, and I spent some time jumping around patches to verify the promotion callers and confirm the validy of the BUG_ON. > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > +{ > + int gen; > + unsigned long old_flags, new_flags; > + int type = folio_is_file_lru(folio); > + int zone = folio_zonenum(folio); > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > + > + if (folio_test_unevictable(folio) || !lrugen->enabled) > + return false; These two checks should be in the callsite and the function should return void. Otherwise you can't understand the callsite without drilling down into lrugen code, even if lrugen is disabled. folio_add_lru() gets it right. > + /* > + * There are three common cases for this page: > + * 1) If it shouldn't be evicted, e.g., it was just faulted in, add it > + * to the youngest generation. "shouldn't be evicted" makes it sound like mlock. But they should just be evicted last, right? Maybe: /* * Pages start in different generations depending on * advance knowledge we have about their hotness and * evictability: * * 1. Already active pages start out youngest. This can be * fresh faults, or refaults of previously hot pages. * 2. Cold pages that require writeback before becoming * evictable start on the second oldest generation. * 3. Everything else (clean, cold) starts old. */ On that note, I think #1 is reintroducing a problem we have fixed before, which is trashing the workingset with a flood of use-once mmapped pages. It's the classic scenario where LFU beats LRU. Mapped streaming IO isn't very common, but it does happen. See these commits: dfc8d636cdb95f7b792d5ba8c9f3b295809c125d 31c0569c3b0b6cc8a867ac6665ca081553f7984c 645747462435d84c6c6a64269ed49cc3015f753d From the changelog: The used-once mapped file page detection patchset. It is meant to help workloads with large amounts of shortly used file mappings, like rtorrent hashing a file or git when dealing with loose objects (git gc on a bigger site?). Right now, the VM activates referenced mapped file pages on first encounter on the inactive list and it takes a full memory cycle to reclaim them again. When those pages dominate memory, the system no longer has a meaningful notion of 'working set' and is required to give up the active list to make reclaim progress. Obviously, this results in rather bad scanning latencies and the wrong pages being reclaimed. This patch makes the VM be more careful about activating mapped file pages in the first place. The minimum granted lifetime without another memory access becomes an inactive list cycle instead of the full memory cycle, which is more natural given the mentioned loads. Translating this to multigen, it seems fresh faults should really start on the second oldest rather than on the youngest generation, to get a second chance but without jeopardizing the workingset if they don't take it. > + * 2) If it can't be evicted immediately, i.e., it's an anon page and > + * not in swapcache, or a dirty page pending writeback, add it to the > + * second oldest generation. > + * 3) If it may be evicted immediately, e.g., it's a clean page, add it > + * to the oldest generation. > + */ > + if (folio_test_active(folio)) > + gen = lru_gen_from_seq(lrugen->max_seq); > + else if ((!type && !folio_test_swapcache(folio)) || > + (folio_test_reclaim(folio) && > + (folio_test_dirty(folio) || folio_test_writeback(folio)))) > + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); > + else > + gen = lru_gen_from_seq(lrugen->min_seq[type]); Condition #2 is not quite clear to me, and the comment is incomplete: The code does put dirty/writeback pages on the oldest gen as long as they haven't been marked for immediate reclaim by the scanner yet. HOWEVER, once the scanner does see those pages and sets PG_reclaim, it will also activate them to move them out of the way until writeback finishes (see shrink_page_list()) - at which point we'll trigger #1. So that second part of #2 appears unreachable. It could be a good exercise to describe how cache pages move through the generations, similar to the comment on lru_deactivate_file_fn(). It's a good example of intent vs implementation. On another note, "!type" meaning "anon" is a bit rough. Please follow the "bool file" convention used elsewhere. > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > { > enum lru_list lru = folio_lru_list(folio); > > + if (lru_gen_add_folio(lruvec, folio, true)) > + return; > + bool parameters are notoriously hard to follow in the callsite. Can you please add lru_gen_add_folio_tail() instead and have them use a common helper? > @@ -127,6 +315,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, > static __always_inline > void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) > { > + if (lru_gen_del_folio(lruvec, folio, false)) > + return; > + > list_del(&folio->lru); > update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), > -folio_nr_pages(folio)); > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index aed44e9b5d89..0f5e8a995781 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -303,6 +303,78 @@ enum lruvec_flags { > */ > }; > > +struct lruvec; > + > +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > + > +#ifdef CONFIG_LRU_GEN > + > +#define MIN_LRU_BATCH BITS_PER_LONG > +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) Those two aren't used in this patch, so it's hard to say whether they are chosen correctly. > + * Evictable pages are divided into multiple generations. The youngest and the > + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. > + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An > + * offset within MAX_NR_GENS, gen, indexes the LRU list of the corresponding > + * generation. The gen counter in folio->flags stores gen+1 while a page is on > + * one of lrugen->lists[]. Otherwise it stores 0. > + * > + * A page is added to the youngest generation on faulting. The aging needs to > + * check the accessed bit at least twice before handing this page over to the > + * eviction. The first check takes care of the accessed bit set on the initial > + * fault; the second check makes sure this page hasn't been used since then. > + * This process, AKA second chance, requires a minimum of two generations, > + * hence MIN_NR_GENS. And to be compatible with the active/inactive LRU, these > + * two generations are mapped to the active; the rest of generations, if they > + * exist, are mapped to the inactive. PG_active is always cleared while a page > + * is on one of lrugen->lists[] so that demotion, which happens consequently > + * when the aging produces a new generation, needs not to worry about it. > + */ > +#define MIN_NR_GENS 2U > +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) > + > +struct lru_gen_struct { struct lrugen? In fact, "lrugen" for the general function and variable namespace might be better, the _ doesn't seem to pull its weight. CONFIG_LRUGEN struct lrugen lrugen_foo() etc. > + /* the aging increments the youngest generation number */ > + unsigned long max_seq; > + /* the eviction increments the oldest generation numbers */ > + unsigned long min_seq[ANON_AND_FILE]; The singular max_seq vs the split min_seq raises questions. Please add a comment that explains or points to an explanation. > + /* the birth time of each generation in jiffies */ > + unsigned long timestamps[MAX_NR_GENS]; This isn't in use until the thrashing-based OOM killing patch. > + /* the multigenerational LRU lists */ > + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > + /* the sizes of the above lists */ > + unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > + /* whether the multigenerational LRU is enabled */ > + bool enabled; Not (really) in use until the runtime switch. Best to keep everybody checking the global flag for now, and have the runtime switch patch introduce this flag and switch necessary callsites over. > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); "state" is what we usually init :) How about lrugen_init_lruvec()? You can drop the memcg parameter and use lruvec_memcg(). > +#ifdef CONFIG_MEMCG > +void lru_gen_init_memcg(struct mem_cgroup *memcg); > +void lru_gen_free_memcg(struct mem_cgroup *memcg); This should be either init+exit, or alloc+free. Thanks
On Tue, Feb 08, 2022 at 01:18:54AM -0700, Yu Zhao wrote: > Evictable pages are divided into multiple generations for each lruvec. > The youngest generation number is stored in lrugen->max_seq for both > anon and file types as they're aged on an equal footing. The oldest > generation numbers are stored in lrugen->min_seq[] separately for anon > and file types as clean file pages can be evicted regardless of swap > constraints. These three variables are monotonically increasing. > > Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits > in order to fit into the gen counter in folio->flags. Each truncated > generation number is an index to lrugen->lists[]. The sliding window > technique is used to track at least MIN_NR_GENS and at most > MAX_NR_GENS generations. The gen counter stores (seq%MAX_NR_GENS)+1 > while a page is on one of lrugen->lists[]. Otherwise it stores 0. > > There are two conceptually independent processes (as in the > manufacturing process): "the aging", which produces young generations, > and "the eviction", which consumes old generations. They form a > closed-loop system, i.e., "the page reclaim". Both processes can be > invoked from userspace for the purposes of working set estimation and > proactive reclaim. These features are required to optimize job > scheduling (bin packing) in data centers. The variable size of the > sliding window is designed for such use cases [1][2]. > > To avoid confusions, the terms "hot" and "cold" will be applied to the > multigenerational LRU, as a new convention; the terms "active" and > "inactive" will be applied to the active/inactive LRU, as usual. [...] > +++ b/include/linux/page-flags-layout.h > @@ -26,6 +26,14 @@ > > #define ZONES_WIDTH ZONES_SHIFT > > +#ifdef CONFIG_LRU_GEN > +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ > +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) > +#else > +#define LRU_GEN_WIDTH 0 > +#define LRU_REFS_WIDTH 0 > +#endif /* CONFIG_LRU_GEN */ I'm concerned about the number of bits being used in page->flags. It seems to me that we already have six bits in use to aid us in choosing which pages to reclaim: referenced, lru, active, workingset, reclaim, unevictable. What I was hoping to see from this patch set was reuse of those bits. That would give us 32 queues in total. Some would be special (eg pages cannot migrate out of the unevictable queue), but it seems to me that you effectively have 4 queues for active and 4 queues for inactive at this point (unless I misunderstood that). I think we need special numbers for: Not on the LRU and Unevictable, but that still leaves us with 30 generations to split between active & inactive. But maybe we still need some of those bits? Perhaps it's not OK to say that queue id 0 is !LRU, queue 1 is unevictable, queue #2 is workingset, queues 3-7 are active, queues 8-15 are various degrees of inactive. I'm assuming that it's not sensible to have a page that's marked as both "reclaim" and "workingset", but perhaps it is. Anyway, I don't understand this area well enough. I was just hoping for some simplification.
On Thu, Feb 10, 2022 at 09:37:25PM +0000, Matthew Wilcox wrote: > On Tue, Feb 08, 2022 at 01:18:54AM -0700, Yu Zhao wrote: [...] > > +++ b/include/linux/page-flags-layout.h > > @@ -26,6 +26,14 @@ > > > > #define ZONES_WIDTH ZONES_SHIFT > > > > +#ifdef CONFIG_LRU_GEN > > +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ > > +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) > > +#else > > +#define LRU_GEN_WIDTH 0 > > +#define LRU_REFS_WIDTH 0 > > +#endif /* CONFIG_LRU_GEN */ > > I'm concerned about the number of bits being used in page->flags. > It seems to me that we already have six bits in use to aid us in choosing > which pages to reclaim: referenced, lru, active, workingset, reclaim, > unevictable. > > What I was hoping to see from this patch set was reuse of those bits. Agreed. I have a plan to *reduce* some of those bits but it's a relatively low priority item on my to-do list. > That would give us 32 queues in total. Some would be special (eg pages > cannot migrate out of the unevictable queue), but it seems to me that you > effectively have 4 queues for active and 4 queues for inactive at this > point (unless I misunderstood that). I think we need special numbers > for: Not on the LRU and Unevictable, but that still leaves us with 30 > generations to split between active & inactive. > > But maybe we still need some of those bits? Perhaps it's not OK to say > that queue id 0 is !LRU, queue 1 is unevictable, queue #2 is workingset, > queues 3-7 are active, queues 8-15 are various degrees of inactive. > I'm assuming that it's not sensible to have a page that's marked as both > "reclaim" and "workingset", but perhaps it is. > > Anyway, I don't understand this area well enough. I was just hoping > for some simplification. I plan to use the spare bits in folio->lru to indicate which lru list a folio is on, i.e., active/inactive or generations or unevictable. In addition, swapbacked could go to folio->mapping -- we wouldn't need it if there were no MADV_FREE, i.e., it would be equivalent to PageAnon() || shmem_mapping(). These two work items can be done separately and in parallel with everything else that's been going on lately. It'd awesome if somebody volunteers, and I can find some resource from our side to test/review the code so that we can have them done sooner. The rest, in theory, can also be moved to somewhere but, IMO, it's not really worth the effort given the situation isn't dire at the moment. referenced and workingset are already reused between the active/inactive lru and the multigenerational lru; reclaim is reused for readahead, but readahead could be split out as an xa tag; lru is reused for isolation synchronization.
On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: Thanks for reviewing. > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > +{ > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > + > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > + > > + /* see the comment on MIN_NR_GENS */ > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > +} > > I'm still reading the series, so correct me if I'm wrong: the "active" > set is split into two generations for the sole purpose of the > second-chance policy for fresh faults, right? To be precise, the active/inactive notion on top of generations is just for ABI compatibility, e.g., the counters in /proc/vmstat. Otherwise, this function wouldn't be needed. > If so, it'd be better to have the comment here instead of down by > MIN_NR_GENS. This is the place that defines what "active" is, so this > is where the reader asks what it means and what it implies. The > definition of MIN_NR_GENS can be briefer: "need at least two for > second chance, see lru_gen_is_active() for details". This could be understood this way. It'd be more appropriate to see this function as an auxiliary and MIN_NR_GENS as something fundamental. Therefore the former should refer to the latter. Specifically, the "see the comment on MIN_NR_GENS" refers to this part: And to be compatible with the active/inactive LRU, these two generations are mapped to the active; the rest of generations, if they exist, are mapped to the inactive. > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > + int zone, long delta) > > +{ > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > + > > + lockdep_assert_held(&lruvec->lru_lock); > > + WARN_ON_ONCE(delta != (int)delta); > > + > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > +} > > This is a duplicate of update_lru_size(), please use that instead. > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > that's not worth sweating over, better to keep it simple. I agree we don't need the mem_cgroup_update_lru_size() -- let me spell out why: this function is not needed here because it updates the counters used only by the active/inactive lru code, i.e., get_scan_count(). However, we can't reuse update_lru_size() because MGLRU can trip the WARN_ONCE() in mem_cgroup_update_lru_size(). Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. To move a page to a different generation, the gen counter in page->flags is updated first, which doesn't require the LRU lock. The second step, i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it usually isn't done immediately due to batching. Meanwhile, if this page is, for example, isolated, nr_pages[] becomes temporarily unbalanced. And this trips the WARN_ONCE(). <snipped> > /* Promotion */ > > + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { > > + lru_gen_update_size(lruvec, lru, zone, -delta); > > + lru_gen_update_size(lruvec, lru + LRU_ACTIVE, zone, delta); > > + } > > + > > + /* Promotion is legit while a page is on an LRU list, but demotion isn't. */ > > /* Demotion happens during aging when pages are isolated, never on-LRU */ > > + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); > > +} > > On that note, please move introduction of the promotion and demotion > bits to the next patch. They aren't used here yet, and I spent some > time jumping around patches to verify the promotion callers and > confirm the validy of the BUG_ON. Will do. > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > +{ > > + int gen; > > + unsigned long old_flags, new_flags; > > + int type = folio_is_file_lru(folio); > > + int zone = folio_zonenum(folio); > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > + > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > + return false; > > These two checks should be in the callsite and the function should > return void. Otherwise you can't understand the callsite without > drilling down into lrugen code, even if lrugen is disabled. I agree it's a bit of nuisance this way. The alternative is we'd need ifdef or another helper at the call sites because lrugen->enabled is specific to lrugen. > > + /* > > + * There are three common cases for this page: > > + * 1) If it shouldn't be evicted, e.g., it was just faulted in, add it > > + * to the youngest generation. > > "shouldn't be evicted" makes it sound like mlock. But they should just > be evicted last, right? Maybe: > > /* > * Pages start in different generations depending on > * advance knowledge we have about their hotness and > * evictability: > * > * 1. Already active pages start out youngest. This can be > * fresh faults, or refaults of previously hot pages. > * 2. Cold pages that require writeback before becoming > * evictable start on the second oldest generation. > * 3. Everything else (clean, cold) starts old. > */ Will do. > On that note, I think #1 is reintroducing a problem we have fixed > before, which is trashing the workingset with a flood of use-once > mmapped pages. It's the classic scenario where LFU beats LRU. > > Mapped streaming IO isn't very common, but it does happen. See these > commits: > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > 645747462435d84c6c6a64269ed49cc3015f753d > > From the changelog: > > The used-once mapped file page detection patchset. > > It is meant to help workloads with large amounts of shortly used file > mappings, like rtorrent hashing a file or git when dealing with loose > objects (git gc on a bigger site?). > > Right now, the VM activates referenced mapped file pages on first > encounter on the inactive list and it takes a full memory cycle to > reclaim them again. When those pages dominate memory, the system > no longer has a meaningful notion of 'working set' and is required > to give up the active list to make reclaim progress. Obviously, > this results in rather bad scanning latencies and the wrong pages > being reclaimed. > > This patch makes the VM be more careful about activating mapped file > pages in the first place. The minimum granted lifetime without > another memory access becomes an inactive list cycle instead of the > full memory cycle, which is more natural given the mentioned loads. > > Translating this to multigen, it seems fresh faults should really > start on the second oldest rather than on the youngest generation, to > get a second chance but without jeopardizing the workingset if they > don't take it. This is a good point, and I had worked on a similar idea but failed to measure its benefits. In addition to placing mmapped file pages in older generations, I also tried placing refaulted anon pages in older generations. My conclusion was that the initial LRU positions of NFU pages are not a bottleneck for workloads I've tested. The efficiency of testing/clearing the accessed bit is. And some applications are smart enough to leverage MADV_SEQUENTIAL. In this case, MGLRU does place mmapped file pages in the oldest generation. I have an oversimplified script that uses memcached to mimic a non-streaming workload and fio a (mmapped) streaming workload: 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. (It shouldn't, and I haven't figured out why.) 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 time faster when using MGLRU. Both MGLRU and the baseline swapped a lot. MADV_SEQUENTIAL non-streaming ops/sec (memcached) rc3 yes 292k rc3 no 203k rc3+v7 yes 1967k rc3+v7 no 436k cat mmap.sh modprobe brd rd_nr=2 rd_size=56623104 mkswap /dev/ram0 swapon /dev/ram0 mkfs.ext4 /dev/ram1 mount -t ext4 /dev/ram1 /mnt memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=P:P -c 1 \ -t 36 --ratio 1:0 --pipeline 8 -d 2000 # streaming workload: --fadvise_hint=0 disables MADV_SEQUENTIAL fio -name=mglru --numjobs=12 --directory=/mnt --size=4224m --buffered=1 \ --ioengine=mmap --iodepth=128 --iodepth_batch_submit=32 \ --iodepth_batch_complete=32 --rw=read --time_based --ramp_time=10m \ --runtime=180m --group_reporting & pid=$! sleep 200 # non-streaming workload memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=R:R \ -c 1 -t 36 --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed kill -INT $pid wait > > + * 2) If it can't be evicted immediately, i.e., it's an anon page and > > + * not in swapcache, or a dirty page pending writeback, add it to the > > + * second oldest generation. > > + * 3) If it may be evicted immediately, e.g., it's a clean page, add it > > + * to the oldest generation. > > + */ > > + if (folio_test_active(folio)) > > + gen = lru_gen_from_seq(lrugen->max_seq); > > + else if ((!type && !folio_test_swapcache(folio)) || > > + (folio_test_reclaim(folio) && > > + (folio_test_dirty(folio) || folio_test_writeback(folio)))) > > + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); > > + else > > + gen = lru_gen_from_seq(lrugen->min_seq[type]); > > Condition #2 is not quite clear to me, and the comment is incomplete: > The code does put dirty/writeback pages on the oldest gen as long as > they haven't been marked for immediate reclaim by the scanner > yet. Right. > HOWEVER, once the scanner does see those pages and sets > PG_reclaim, it will also activate them to move them out of the way > until writeback finishes (see shrink_page_list()) - at which point > we'll trigger #1. So that second part of #2 appears unreachable. Yes, dirty file pages go to #1; dirty pages in swapcache go to #2. (Ideally we want dirty file pages go to #2 too. IMO, the code would be cleaner that way.) > It could be a good exercise to describe how cache pages move through > the generations, similar to the comment on lru_deactivate_file_fn(). > It's a good example of intent vs implementation. Will do. > On another note, "!type" meaning "anon" is a bit rough. Please follow > the "bool file" convention used elsewhere. Originally I used "file", e.g., in v2: https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ But I was told to renamed it since "file" usually means file. Let me rename it back to "file", unless somebody still objects. > > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > > { > > enum lru_list lru = folio_lru_list(folio); > > > > + if (lru_gen_add_folio(lruvec, folio, true)) > > + return; > > + > > bool parameters are notoriously hard to follow in the callsite. Can > you please add lru_gen_add_folio_tail() instead and have them use a > common helper? I'm not sure -- there are several places like this one. My question is whether we want to do it throughout this patchset. We'd end up with many helpers and duplicate code. E.g., in this file alone, we have two functions taking bool parameters: lru_gen_add_folio(..., bool reclaiming) lru_gen_del_folio(..., bool reclaiming) I can't say they are very readable; at least they are very compact right now. My concern is that we might loose the latter without having enough of the former. Perhaps this is something that we could revisit after you've finished reviewing the entire patchset? > > @@ -127,6 +315,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, > > static __always_inline > > void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) > > { > > + if (lru_gen_del_folio(lruvec, folio, false)) > > + return; > > + > > list_del(&folio->lru); > > update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), > > -folio_nr_pages(folio)); > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index aed44e9b5d89..0f5e8a995781 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -303,6 +303,78 @@ enum lruvec_flags { > > */ > > }; > > > > +struct lruvec; > > + > > +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > > +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > > + > > +#ifdef CONFIG_LRU_GEN > > + > > +#define MIN_LRU_BATCH BITS_PER_LONG > > +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) > > Those two aren't used in this patch, so it's hard to say whether they > are chosen correctly. Right. They slipped during the v6/v7 refactoring. Will move them to the next patch. > > + * Evictable pages are divided into multiple generations. The youngest and the > > + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. > > + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An > > + * offset within MAX_NR_GENS, gen, indexes the LRU list of the corresponding > > + * generation. The gen counter in folio->flags stores gen+1 while a page is on > > + * one of lrugen->lists[]. Otherwise it stores 0. > > + * > > + * A page is added to the youngest generation on faulting. The aging needs to > > + * check the accessed bit at least twice before handing this page over to the > > + * eviction. The first check takes care of the accessed bit set on the initial > > + * fault; the second check makes sure this page hasn't been used since then. > > + * This process, AKA second chance, requires a minimum of two generations, > > + * hence MIN_NR_GENS. And to be compatible with the active/inactive LRU, these > > + * two generations are mapped to the active; the rest of generations, if they > > + * exist, are mapped to the inactive. PG_active is always cleared while a page > > + * is on one of lrugen->lists[] so that demotion, which happens consequently > > + * when the aging produces a new generation, needs not to worry about it. > > + */ > > +#define MIN_NR_GENS 2U > > +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) > > + > > +struct lru_gen_struct { > > struct lrugen? > > In fact, "lrugen" for the general function and variable namespace > might be better, the _ doesn't seem to pull its weight. > > CONFIG_LRUGEN > struct lrugen > lrugen_foo() > etc. No strong opinion here. I usually add underscores to functions and types so that grep doesn't end up with tons of local variables. > > + /* the aging increments the youngest generation number */ > > + unsigned long max_seq; > > + /* the eviction increments the oldest generation numbers */ > > + unsigned long min_seq[ANON_AND_FILE]; > > The singular max_seq vs the split min_seq raises questions. Please add > a comment that explains or points to an explanation. Will do. > > + /* the birth time of each generation in jiffies */ > > + unsigned long timestamps[MAX_NR_GENS]; > > This isn't in use until the thrashing-based OOM killing patch. Will move it there. > > + /* the multigenerational LRU lists */ > > + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > > + /* the sizes of the above lists */ > > + unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > > + /* whether the multigenerational LRU is enabled */ > > + bool enabled; > > Not (really) in use until the runtime switch. Best to keep everybody > checking the global flag for now, and have the runtime switch patch > introduce this flag and switch necessary callsites over. Will do. > > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); > > "state" is what we usually init :) How about lrugen_init_lruvec()? Same story as "file", lol -- this used to be lru_gen_init_lruvec(): https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ Naming is hard. Hopefully we can finalize it this time. > You can drop the memcg parameter and use lruvec_memcg(). lruvec_memcg() isn't available yet when pgdat_init_internals() calls this function because mem_cgroup_disabled() is initialized afterward. > > +#ifdef CONFIG_MEMCG > > +void lru_gen_init_memcg(struct mem_cgroup *memcg); > > +void lru_gen_free_memcg(struct mem_cgroup *memcg); > > This should be either init+exit, or alloc+free. Will do.
Hi Yu, On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > +{ > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > + > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > + > > > + /* see the comment on MIN_NR_GENS */ > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > +} > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > set is split into two generations for the sole purpose of the > > second-chance policy for fresh faults, right? > > To be precise, the active/inactive notion on top of generations is > just for ABI compatibility, e.g., the counters in /proc/vmstat. > Otherwise, this function wouldn't be needed. Ah! would you mind adding this as a comment to the function? But AFAICS there is the lru_gen_del_folio() callsite that maps it to the PG_active flag - which in turn gets used by add_folio() to place the thing back on the max_seq generation. So I suppose there is a secondary purpose of the function for remembering the page's rough age for non-reclaim isolation. It would be good to capture that as well in a comment on the function. > > If so, it'd be better to have the comment here instead of down by > > MIN_NR_GENS. This is the place that defines what "active" is, so this > > is where the reader asks what it means and what it implies. The > > definition of MIN_NR_GENS can be briefer: "need at least two for > > second chance, see lru_gen_is_active() for details". > > This could be understood this way. It'd be more appropriate to see > this function as an auxiliary and MIN_NR_GENS as something fundamental. > Therefore the former should refer to the latter. Specifically, the > "see the comment on MIN_NR_GENS" refers to this part: > And to be compatible with the active/inactive LRU, these two > generations are mapped to the active; the rest of generations, if > they exist, are mapped to the inactive. I agree, thanks for enlightening me. > > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > > + int zone, long delta) > > > +{ > > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > > + > > > + lockdep_assert_held(&lruvec->lru_lock); > > > + WARN_ON_ONCE(delta != (int)delta); > > > + > > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > > +} > > > > This is a duplicate of update_lru_size(), please use that instead. > > > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > > that's not worth sweating over, better to keep it simple. > > I agree we don't need the mem_cgroup_update_lru_size() -- let me spell > out why: > this function is not needed here because it updates the counters used > only by the active/inactive lru code, i.e., get_scan_count(). > > However, we can't reuse update_lru_size() because MGLRU can trip the > WARN_ONCE() in mem_cgroup_update_lru_size(). > > Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. > To move a page to a different generation, the gen counter in page->flags > is updated first, which doesn't require the LRU lock. The second step, > i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it > usually isn't done immediately due to batching. Meanwhile, if this page > is, for example, isolated, nr_pages[] becomes temporarily unbalanced. > And this trips the WARN_ONCE(). Good insight. But in that case, I'd still think it's better to use update_lru_size() and gate the memcg update on lrugen-enabled, with a short comment saying that lrugen has its own per-cgroup counts already. It's just a bit too error prone to duplicate the stat updates. Even better would be: static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lrugen_enabled(lruvec)) lrugen_add_folio(lruvec, folio); else list_add(&folio->lru, &lruvec->lists[lru]); } But it does mean you'd have to handle unevictable pages. I'm reviewing from the position that mglru is going to supplant the existing reclaim algorithm in the long term, though, so being more comprehensive and eliminating special cases where possible is all-positive, IMO. Up to you. I'd only insist on reusing update_lru_size() at least. > > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > > +{ > > > + int gen; > > > + unsigned long old_flags, new_flags; > > > + int type = folio_is_file_lru(folio); > > > + int zone = folio_zonenum(folio); > > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > > + > > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > > + return false; > > > > These two checks should be in the callsite and the function should > > return void. Otherwise you can't understand the callsite without > > drilling down into lrugen code, even if lrugen is disabled. > > I agree it's a bit of nuisance this way. The alternative is we'd need > ifdef or another helper at the call sites because lrugen->enabled is > specific to lrugen. Coming from memcg, my experience has been that when you have a compile time-optional MM extension like this, you'll sooner or later need a config-independent helper to gate callbacks in generic code. So I think it's a good idea to add one now. One of these? lruvec_on_lrugen() lruvec_using_lrugen() lruvec_lrugen_enabled() lruvec_has_generations() :-) > > On that note, I think #1 is reintroducing a problem we have fixed > > before, which is trashing the workingset with a flood of use-once > > mmapped pages. It's the classic scenario where LFU beats LRU. > > > > Mapped streaming IO isn't very common, but it does happen. See these > > commits: > > > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > > 645747462435d84c6c6a64269ed49cc3015f753d > > > > From the changelog: > > > > The used-once mapped file page detection patchset. > > > > It is meant to help workloads with large amounts of shortly used file > > mappings, like rtorrent hashing a file or git when dealing with loose > > objects (git gc on a bigger site?). > > > > Right now, the VM activates referenced mapped file pages on first > > encounter on the inactive list and it takes a full memory cycle to > > reclaim them again. When those pages dominate memory, the system > > no longer has a meaningful notion of 'working set' and is required > > to give up the active list to make reclaim progress. Obviously, > > this results in rather bad scanning latencies and the wrong pages > > being reclaimed. > > > > This patch makes the VM be more careful about activating mapped file > > pages in the first place. The minimum granted lifetime without > > another memory access becomes an inactive list cycle instead of the > > full memory cycle, which is more natural given the mentioned loads. > > > > Translating this to multigen, it seems fresh faults should really > > start on the second oldest rather than on the youngest generation, to > > get a second chance but without jeopardizing the workingset if they > > don't take it. > > This is a good point, and I had worked on a similar idea but failed > to measure its benefits. In addition to placing mmapped file pages in > older generations, I also tried placing refaulted anon pages in older > generations. My conclusion was that the initial LRU positions of NFU > pages are not a bottleneck for workloads I've tested. The efficiency > of testing/clearing the accessed bit is. The concern isn't the scan overhead, but jankiness from the workingset being flooded out by streaming IO. The concrete usecase at the time was a torrent client hashing a downloaded file and thereby kicking out the desktop environment, which caused jankiness. The hashing didn't benefit from caching - the file wouldn't have fit into RAM anyway - so this was pointless to boot. Essentially, the tradeoff is this: 1) If you treat new pages as hot, you accelerate workingset transitions, but on the flipside you risk unnecessary refaults in running applications when those new pages are one-off. 2) If you take new pages with a grain of salt, you protect existing applications better from one-off floods, but risk refaults in NEW application while they're trying to start up. There are two arguments for why 2) is preferable: 1) Users are tolerant of cache misses when applications first launch, much less so after they've been running for hours. 2) Workingset transitions (and associated jankiness) are bounded by the amount of RAM you need to repopulate. But streaming IO is bounded by storage, and datasets are routinely several times the amount of RAM. Uncacheable sets in excess of RAM can produce an infinite stream of "new" references; not protecting the workingset from that means longer or even sustained jankiness. > And some applications are smart enough to leverage MADV_SEQUENTIAL. > In this case, MGLRU does place mmapped file pages in the oldest > generation. Yes, it makes sense to optimize when MADV_SEQUENTIAL is requested. But that hint isn't reliably there, so it matters that we don't do poorly when it's missing. > I have an oversimplified script that uses memcached to mimic a > non-streaming workload and fio a (mmapped) streaming workload: Looking at the paramters and observed behavior, let me say up front that this looks like a useful benchmark, but doesn't capture the scenario I was talking about above. For one, the presence of swapping in both kernels suggests that the "streaming IO" component actually has repeat access that could benefit from caching. Second, I would expect memcache is accessing its memory frequently and consistently, and so could withstand workingset challenges from streaming IO better than, say, a desktop environment. More on that below. > 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times > faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. > (It shouldn't, and I haven't figured out why.) Baseline swaps when there are cache refaults. This is regardless of the hint: you may say you're accessing these pages sequentially, but the refaults say you're reusing them, with a frequency that suggests they might be cacheable. So it tries to cache them. I'd be curious if that results in fio being faster, or whether it's all just pointless thrashing. Can you share the fio results too? We could patch baseline to prioritize MADV_SEQUENTIAL more, but... > 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 > time faster when using MGLRU. Both MGLRU and the baseline swapped > a lot. ...in practice I think this scenario will matter to a lot more users. I would again be interested in the fio results. > MADV_SEQUENTIAL non-streaming ops/sec (memcached) > rc3 yes 292k > rc3 no 203k > rc3+v7 yes 1967k > rc3+v7 no 436k > > cat mmap.sh > modprobe brd rd_nr=2 rd_size=56623104 > > mkswap /dev/ram0 > swapon /dev/ram0 > > mkfs.ext4 /dev/ram1 > mount -t ext4 /dev/ram1 /mnt > > memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ > -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=P:P -c 1 \ > -t 36 --ratio 1:0 --pipeline 8 -d 2000 > > # streaming workload: --fadvise_hint=0 disables MADV_SEQUENTIAL > fio -name=mglru --numjobs=12 --directory=/mnt --size=4224m --buffered=1 \ > --ioengine=mmap --iodepth=128 --iodepth_batch_submit=32 \ > --iodepth_batch_complete=32 --rw=read --time_based --ramp_time=10m \ > --runtime=180m --group_reporting & As per above, I think this would be closer to a cacheable workingset than a streaming IO pattern. It depends on total RAM of course, but size=4G and time_based should loop around pretty quickly. Would you mind rerunning with files larger than RAM, to avoid repeat accesses (or at least only repeat with large distances)? Depending on how hot memcache runs, it may or may not be able to hold onto its workingset. Testing interactivity is notoriously hard, but using a smaller, intermittent workload is probably more representative of overall responsiveness. Let fio ramp until memory is full, then do perf stat -r 10 /bin/sh -c 'git shortlog v5.0.. >/dev/null; sleep 1' I'll try to reproduce this again too. Back then, that workload gave me a very janky desktop experience, and the patch very obvious relief. > > > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > > > { > > > enum lru_list lru = folio_lru_list(folio); > > > > > > + if (lru_gen_add_folio(lruvec, folio, true)) > > > + return; > > > + > > > > bool parameters are notoriously hard to follow in the callsite. Can > > you please add lru_gen_add_folio_tail() instead and have them use a > > common helper? > > I'm not sure -- there are several places like this one. My question is > whether we want to do it throughout this patchset. We'd end up with > many helpers and duplicate code. E.g., in this file alone, we have two > functions taking bool parameters: > lru_gen_add_folio(..., bool reclaiming) > lru_gen_del_folio(..., bool reclaiming) > > I can't say they are very readable; at least they are very compact > right now. My concern is that we might loose the latter without having > enough of the former. > > Perhaps this is something that we could revisit after you've finished > reviewing the entire patchset? Sure, fair enough. > > > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); > > > > "state" is what we usually init :) How about lrugen_init_lruvec()? > > Same story as "file", lol -- this used to be lru_gen_init_lruvec(): > https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ > > Naming is hard. Hopefully we can finalize it this time. Was that internal feedback? The revisions show this function went through several names, but I don't see reviews requesting those. If they weren't public I'm gonna pretend they didn't happen ;-) > > You can drop the memcg parameter and use lruvec_memcg(). > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > this function because mem_cgroup_disabled() is initialized afterward. Good catch. That'll container_of() into garbage. However, we have to assume that somebody's going to try that simplification again, so we should set up the code now to prevent issues. cgroup_disable parsing is self-contained, so we can pull it ahead in the init sequence. How about this? diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9d05c3ca2d5e..b544d768edc8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) break; } } - return 1; + return 0; } -__setup("cgroup_disable=", cgroup_disable); +early_param("cgroup_disable", cgroup_disable); void __init __weak enable_debug_cgroup(void) { } Thanks!
On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > Hi Yu, > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > +{ > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > + > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > + > > > > + /* see the comment on MIN_NR_GENS */ > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > +} > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > set is split into two generations for the sole purpose of the > > > second-chance policy for fresh faults, right? > > > > To be precise, the active/inactive notion on top of generations is > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > Otherwise, this function wouldn't be needed. > > Ah! would you mind adding this as a comment to the function? Will do. > But AFAICS there is the lru_gen_del_folio() callsite that maps it to > the PG_active flag - which in turn gets used by add_folio() to place > the thing back on the max_seq generation. So I suppose there is a > secondary purpose of the function for remembering the page's rough age > for non-reclaim isolation.> Yes, e.g., migration. > It would be good to capture that as well in a comment on the function. Will do. > > > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > > > + int zone, long delta) > > > > +{ > > > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > > > + > > > > + lockdep_assert_held(&lruvec->lru_lock); > > > > + WARN_ON_ONCE(delta != (int)delta); > > > > + > > > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > > > +} > > > > > > This is a duplicate of update_lru_size(), please use that instead. > > > > > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > > > that's not worth sweating over, better to keep it simple. > > > > I agree we don't need the mem_cgroup_update_lru_size() -- let me spell > > out why: > > this function is not needed here because it updates the counters used > > only by the active/inactive lru code, i.e., get_scan_count(). > > > > However, we can't reuse update_lru_size() because MGLRU can trip the > > WARN_ONCE() in mem_cgroup_update_lru_size(). > > > > Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. > > To move a page to a different generation, the gen counter in page->flags > > is updated first, which doesn't require the LRU lock. The second step, > > i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it > > usually isn't done immediately due to batching. Meanwhile, if this page > > is, for example, isolated, nr_pages[] becomes temporarily unbalanced. > > And this trips the WARN_ONCE(). > > Good insight. > > But in that case, I'd still think it's better to use update_lru_size() > and gate the memcg update on lrugen-enabled, with a short comment > saying that lrugen has its own per-cgroup counts already. It's just a > bit too error prone to duplicate the stat updates. > > Even better would be: > > static __always_inline > void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) > { > enum lru_list lru = folio_lru_list(folio); > > update_lru_size(lruvec, lru, folio_zonenum(folio), > folio_nr_pages(folio)); > if (lrugen_enabled(lruvec)) > lrugen_add_folio(lruvec, folio); > else > list_add(&folio->lru, &lruvec->lists[lru]); > } > > But it does mean you'd have to handle unevictable pages. I'm reviewing > from the position that mglru is going to supplant the existing reclaim > algorithm in the long term, though, so being more comprehensive and > eliminating special cases where possible is all-positive, IMO. > > Up to you. I'd only insist on reusing update_lru_size() at least. Will do. > > > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > > > +{ > > > > + int gen; > > > > + unsigned long old_flags, new_flags; > > > > + int type = folio_is_file_lru(folio); > > > > + int zone = folio_zonenum(folio); > > > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > > > + > > > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > > > + return false; > > > > > > These two checks should be in the callsite and the function should > > > return void. Otherwise you can't understand the callsite without > > > drilling down into lrugen code, even if lrugen is disabled. > > > > I agree it's a bit of nuisance this way. The alternative is we'd need > > ifdef or another helper at the call sites because lrugen->enabled is > > specific to lrugen. > > Coming from memcg, my experience has been that when you have a compile > time-optional MM extension like this, you'll sooner or later need a > config-independent helper to gate callbacks in generic code. So I > think it's a good idea to add one now. > > One of these? > > lruvec_on_lrugen() SGTM. Personally I'd reuse lru_gen_enabled(), by passing NULL/lruvec. But my guess is you wouldn't like it. > lruvec_using_lrugen() > lruvec_lrugen_enabled() > > lruvec_has_generations() :-) > > > > On that note, I think #1 is reintroducing a problem we have fixed > > > before, which is trashing the workingset with a flood of use-once > > > mmapped pages. It's the classic scenario where LFU beats LRU. > > > > > > Mapped streaming IO isn't very common, but it does happen. See these > > > commits: > > > > > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > > > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > > > 645747462435d84c6c6a64269ed49cc3015f753d > > > > > > From the changelog: > > > > > > The used-once mapped file page detection patchset. > > > > > > It is meant to help workloads with large amounts of shortly used file > > > mappings, like rtorrent hashing a file or git when dealing with loose > > > objects (git gc on a bigger site?). > > > > > > Right now, the VM activates referenced mapped file pages on first > > > encounter on the inactive list and it takes a full memory cycle to > > > reclaim them again. When those pages dominate memory, the system > > > no longer has a meaningful notion of 'working set' and is required > > > to give up the active list to make reclaim progress. Obviously, > > > this results in rather bad scanning latencies and the wrong pages > > > being reclaimed. > > > > > > This patch makes the VM be more careful about activating mapped file > > > pages in the first place. The minimum granted lifetime without > > > another memory access becomes an inactive list cycle instead of the > > > full memory cycle, which is more natural given the mentioned loads. > > > > > > Translating this to multigen, it seems fresh faults should really > > > start on the second oldest rather than on the youngest generation, to > > > get a second chance but without jeopardizing the workingset if they > > > don't take it. > > > > This is a good point, and I had worked on a similar idea but failed > > to measure its benefits. In addition to placing mmapped file pages in > > older generations, I also tried placing refaulted anon pages in older > > generations. My conclusion was that the initial LRU positions of NFU > > pages are not a bottleneck for workloads I've tested. The efficiency > > of testing/clearing the accessed bit is. > > The concern isn't the scan overhead, but jankiness from the workingset > being flooded out by streaming IO. Yes, MGLRU uses a different approach to solve this problem, and for its approach, the scan overhead is the concern. MGLRU detects (defines) the working set by scanning the entire memory for each generation, and it counters the flooding by accelerating the creation of generations. IOW, all mapped pages have an equal chance to get scanned, no matter which generation they are in. This is a design difference compared with the active/inactive LRU, which tries to scans the active/inactive lists less/more frequently. > The concrete usecase at the time was a torrent client hashing a > downloaded file and thereby kicking out the desktop environment, which > caused jankiness. The hashing didn't benefit from caching - the file > wouldn't have fit into RAM anyway - so this was pointless to boot. > > Essentially, the tradeoff is this: > > 1) If you treat new pages as hot, you accelerate workingset > transitions, but on the flipside you risk unnecessary refaults in > running applications when those new pages are one-off. > > 2) If you take new pages with a grain of salt, you protect existing > applications better from one-off floods, but risk refaults in NEW > application while they're trying to start up. Agreed. > There are two arguments for why 2) is preferable: > > 1) Users are tolerant of cache misses when applications first launch, > much less so after they've been running for hours. Our CUJs (Critical User Journeys) respectfully disagree :) They are built on the observation that once users have moved onto another tab/app, they are more likely to stay with the new tab/app rather than go back to the old ones. Speaking for myself, this is generally the case. > 2) Workingset transitions (and associated jankiness) are bounded by > the amount of RAM you need to repopulate. But streaming IO is > bounded by storage, and datasets are routinely several times the > amount of RAM. Uncacheable sets in excess of RAM can produce an > infinite stream of "new" references; not protecting the workingset > from that means longer or even sustained jankiness. I'd argue the opposite -- we shouldn't risk refaulting fresh hot pages just to accommodate this concrete yet minor use case, especially considering torrent has been given the means (MADV_SEQUENTIAL) to help itself. I appreciate all your points here. The bottom line is we agree this is a trade off. For what disagree about, we could be both right -- it comes down to what workloads we care about *more*. To move forward, I propose we look at it from a non-technical POV: would we want to offer users an alternative trade off so that they can have greater flexibility? > > And some applications are smart enough to leverage MADV_SEQUENTIAL. > > In this case, MGLRU does place mmapped file pages in the oldest > > generation. > > Yes, it makes sense to optimize when MADV_SEQUENTIAL is requested. But > that hint isn't reliably there, so it matters that we don't do poorly > when it's missing. Agreed. > > I have an oversimplified script that uses memcached to mimic a > > non-streaming workload and fio a (mmapped) streaming workload: > > Looking at the paramters and observed behavior, let me say up front > that this looks like a useful benchmark, but doesn't capture the > scenario I was talking about above. > > For one, the presence of swapping in both kernels suggests that the > "streaming IO" component actually has repeat access that could benefit > from caching. Second, I would expect memcache is accessing its memory > frequently and consistently, and so could withstand workingset > challenges from streaming IO better than, say, a desktop environment. The fio workload is a real streaming workload, but the memcached workload might have been too large to be a typical desktop workload. More below. > More on that below. > > > 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times > > faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. > > (It shouldn't, and I haven't figured out why.) > > Baseline swaps when there are cache refaults. This is regardless of > the hint: you may say you're accessing these pages sequentially, but > the refaults say you're reusing them, with a frequency that suggests > they might be cacheable. So it tries to cache them. > > I'd be curious if that results in fio being faster, or whether it's > all just pointless thrashing. Can you share the fio results too? More below. > We could patch baseline to prioritize MADV_SEQUENTIAL more, but... > > > 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 > > time faster when using MGLRU. Both MGLRU and the baseline swapped > > a lot. > > ...in practice I think this scenario will matter to a lot more users. I strongly feel we should prioritize what's advertised on a man page over an unspecified (performance) behavior. > I would again be interested in the fio results. > > > MADV_SEQUENTIAL non-streaming ops/sec (memcached) > > rc3 yes 292k > > rc3 no 203k > > rc3+v7 yes 1967k > > rc3+v7 no 436k > > > > cat mmap.sh > > modprobe brd rd_nr=2 rd_size=56623104 > > > > mkswap /dev/ram0 > > swapon /dev/ram0 > > > > mkfs.ext4 /dev/ram1 > > mount -t ext4 /dev/ram1 /mnt > > > > memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ > > -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=P:P -c 1 \ > > -t 36 --ratio 1:0 --pipeline 8 -d 2000 > > > > # streaming workload: --fadvise_hint=0 disables MADV_SEQUENTIAL > > fio -name=mglru --numjobs=12 --directory=/mnt --size=4224m --buffered=1 \ > > --ioengine=mmap --iodepth=128 --iodepth_batch_submit=32 \ > > --iodepth_batch_complete=32 --rw=read --time_based --ramp_time=10m \ > > --runtime=180m --group_reporting & > > As per above, I think this would be closer to a cacheable workingset > than a streaming IO pattern. It depends on total RAM of course, but > size=4G and time_based should loop around pretty quickly. The file size here shouldn't matter since fio is smart enough to invalidate page cache before it rewinds (for sequential access): https://fio.readthedocs.io/en/latest/fio_doc.html#cmdoption-arg-invalidate https://github.com/axboe/fio/blob/master/filesetup.c#L602 I think the problem might have been the memory size for memcached was too large (100GB) to be all hot (limited by memory bandwidth). > Would you mind rerunning with files larger than RAM, to avoid repeat > accesses (or at least only repeat with large distances)? Retested with the same free memory (120GB) for 40GB memcached and 200GB fio. MADV_SEQUENTIAL FADV_DONTNEED memcached fio rc4 no yes 4716k 232k rc4+v7 no yes 4307k 265k delta -9% +14% MGLRU lost with memcached but won with fio for the same reason: it doesn't have any heuristics to detect the streaming characteristic (and therefore lost with memcached) but relies on faster scanning (and therefore won with fio) to keep the working set in memory. The baseline didn't swap this time (MGLRU did slightly), but it lost with fio because it had to walk the rmap for each page in the entire 200GB VMA, at least once, even for this streaming workload. This reflects the design difference I mentioned earlier. cat test.sh modprobe brd rd_nr=1 rd_size=268435456 mkfs.ext4 /dev/ram0 mount -t ext4 /dev/ram0 /mnt fallocate -l 40g /mnt/swapfile mkswap /mnt/swapfile swapon /mnt/swapfile fio -name=mglru --numjobs=1 --directory=/mnt --size=204800m \ --buffered=1 --ioengine=mmap --fadvise_hint=0 --iodepth=128 \ --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ --rw=read --time_based --ramp_time=10m --runtime=180m \ --group_reporting & pid=$! sleep 600 # load objects memtier_benchmark -S /var/run/memcached/memcached.sock \ -P memcache_binary -n allkeys --key-minimum=1 \ --key-maximum=20000000 --key-pattern=P:P -c 1 -t 36 \ --ratio 1:0 --pipeline 8 -d 2000 # read objects memtier_benchmark -S /var/run/memcached/memcached.sock \ -P memcache_binary -n allkeys --key-minimum=1 \ --key-maximum=20000000 --key-pattern=R:R -c 1 -t 36 \ --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed kill -INT $pid > Depending on how hot memcache runs, it may or may not be able to hold > onto its workingset. Agreed. > Testing interactivity is notoriously hard, but > using a smaller, intermittent workload is probably more representative > of overall responsiveness. Let fio ramp until memory is full, then do > perf stat -r 10 /bin/sh -c 'git shortlog v5.0.. >/dev/null; sleep 1' I'll also check with the downstream maintainers to see if they have heard any complaints about streaming workloads negatively impacting user experience. > I'll try to reproduce this again too. Back then, that workload gave me > a very janky desktop experience, and the patch very obvious relief. SGTM. > > > > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > > > > { > > > > enum lru_list lru = folio_lru_list(folio); > > > > > > > > + if (lru_gen_add_folio(lruvec, folio, true)) > > > > + return; > > > > + > > > > > > bool parameters are notoriously hard to follow in the callsite. Can > > > you please add lru_gen_add_folio_tail() instead and have them use a > > > common helper? > > > > I'm not sure -- there are several places like this one. My question is > > whether we want to do it throughout this patchset. We'd end up with > > many helpers and duplicate code. E.g., in this file alone, we have two > > functions taking bool parameters: > > lru_gen_add_folio(..., bool reclaiming) > > lru_gen_del_folio(..., bool reclaiming) > > > > I can't say they are very readable; at least they are very compact > > right now. My concern is that we might loose the latter without having > > enough of the former. > > > > Perhaps this is something that we could revisit after you've finished > > reviewing the entire patchset? > > Sure, fair enough. > > > > > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); > > > > > > "state" is what we usually init :) How about lrugen_init_lruvec()? > > > > Same story as "file", lol -- this used to be lru_gen_init_lruvec(): > > https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ > > > > Naming is hard. Hopefully we can finalize it this time. > > Was that internal feedback? The revisions show this function went > through several names, but I don't see reviews requesting those. If > they weren't public I'm gonna pretend they didn't happen ;-) Indeed. I lost track. > > > You can drop the memcg parameter and use lruvec_memcg(). > > > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > > this function because mem_cgroup_disabled() is initialized afterward. > > Good catch. That'll container_of() into garbage. However, we have to > assume that somebody's going to try that simplification again, so we > should set up the code now to prevent issues. > > cgroup_disable parsing is self-contained, so we can pull it ahead in > the init sequence. How about this? > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 9d05c3ca2d5e..b544d768edc8 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) > break; > } > } > - return 1; > + return 0; > } > -__setup("cgroup_disable=", cgroup_disable); > +early_param("cgroup_disable", cgroup_disable); I think early_param() is still after pgdat_init_internals(), no? Thanks!
. On Mon, Feb 21, 2022 at 1:14 AM Yu Zhao <yuzhao@google.com> wrote: > > On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > > Hi Yu, > > > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > +{ > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > + > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > + > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > +} > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > set is split into two generations for the sole purpose of the > > > > second-chance policy for fresh faults, right? > > > > > > To be precise, the active/inactive notion on top of generations is > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > Otherwise, this function wouldn't be needed. > > > > Ah! would you mind adding this as a comment to the function? > > Will do. > > > But AFAICS there is the lru_gen_del_folio() callsite that maps it to > > the PG_active flag - which in turn gets used by add_folio() to place > > the thing back on the max_seq generation. So I suppose there is a > > secondary purpose of the function for remembering the page's rough age > > for non-reclaim isolation.> > > Yes, e.g., migration. > > > It would be good to capture that as well in a comment on the function. > > Will do. > > > > > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > > > > + int zone, long delta) > > > > > +{ > > > > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > > > > + > > > > > + lockdep_assert_held(&lruvec->lru_lock); > > > > > + WARN_ON_ONCE(delta != (int)delta); > > > > > + > > > > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > > > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > > > > +} > > > > > > > > This is a duplicate of update_lru_size(), please use that instead. > > > > > > > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > > > > that's not worth sweating over, better to keep it simple. > > > > > > I agree we don't need the mem_cgroup_update_lru_size() -- let me spell > > > out why: > > > this function is not needed here because it updates the counters used > > > only by the active/inactive lru code, i.e., get_scan_count(). > > > > > > However, we can't reuse update_lru_size() because MGLRU can trip the > > > WARN_ONCE() in mem_cgroup_update_lru_size(). > > > > > > Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. > > > To move a page to a different generation, the gen counter in page->flags > > > is updated first, which doesn't require the LRU lock. The second step, > > > i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it > > > usually isn't done immediately due to batching. Meanwhile, if this page > > > is, for example, isolated, nr_pages[] becomes temporarily unbalanced. > > > And this trips the WARN_ONCE(). > > > > Good insight. > > > > But in that case, I'd still think it's better to use update_lru_size() > > and gate the memcg update on lrugen-enabled, with a short comment > > saying that lrugen has its own per-cgroup counts already. It's just a > > bit too error prone to duplicate the stat updates. > > > > Even better would be: > > > > static __always_inline > > void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) > > { > > enum lru_list lru = folio_lru_list(folio); > > > > update_lru_size(lruvec, lru, folio_zonenum(folio), > > folio_nr_pages(folio)); > > if (lrugen_enabled(lruvec)) > > lrugen_add_folio(lruvec, folio); > > else > > list_add(&folio->lru, &lruvec->lists[lru]); > > } > > > > But it does mean you'd have to handle unevictable pages. I'm reviewing > > from the position that mglru is going to supplant the existing reclaim > > algorithm in the long term, though, so being more comprehensive and > > eliminating special cases where possible is all-positive, IMO. > > > > Up to you. I'd only insist on reusing update_lru_size() at least. > > Will do. > > > > > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > > > > +{ > > > > > + int gen; > > > > > + unsigned long old_flags, new_flags; > > > > > + int type = folio_is_file_lru(folio); > > > > > + int zone = folio_zonenum(folio); > > > > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > > > > + > > > > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > > > > + return false; > > > > > > > > These two checks should be in the callsite and the function should > > > > return void. Otherwise you can't understand the callsite without > > > > drilling down into lrugen code, even if lrugen is disabled. > > > > > > I agree it's a bit of nuisance this way. The alternative is we'd need > > > ifdef or another helper at the call sites because lrugen->enabled is > > > specific to lrugen. > > > > Coming from memcg, my experience has been that when you have a compile > > time-optional MM extension like this, you'll sooner or later need a > > config-independent helper to gate callbacks in generic code. So I > > think it's a good idea to add one now. > > > > One of these? > > > > lruvec_on_lrugen() > > SGTM. > > Personally I'd reuse lru_gen_enabled(), by passing NULL/lruvec. But > my guess is you wouldn't like it. > > > lruvec_using_lrugen() > > lruvec_lrugen_enabled() > > > > lruvec_has_generations() :-) > > > > > > On that note, I think #1 is reintroducing a problem we have fixed > > > > before, which is trashing the workingset with a flood of use-once > > > > mmapped pages. It's the classic scenario where LFU beats LRU. > > > > > > > > Mapped streaming IO isn't very common, but it does happen. See these > > > > commits: > > > > > > > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > > > > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > > > > 645747462435d84c6c6a64269ed49cc3015f753d > > > > > > > > From the changelog: > > > > > > > > The used-once mapped file page detection patchset. > > > > > > > > It is meant to help workloads with large amounts of shortly used file > > > > mappings, like rtorrent hashing a file or git when dealing with loose > > > > objects (git gc on a bigger site?). > > > > > > > > Right now, the VM activates referenced mapped file pages on first > > > > encounter on the inactive list and it takes a full memory cycle to > > > > reclaim them again. When those pages dominate memory, the system > > > > no longer has a meaningful notion of 'working set' and is required > > > > to give up the active list to make reclaim progress. Obviously, > > > > this results in rather bad scanning latencies and the wrong pages > > > > being reclaimed. > > > > > > > > This patch makes the VM be more careful about activating mapped file > > > > pages in the first place. The minimum granted lifetime without > > > > another memory access becomes an inactive list cycle instead of the > > > > full memory cycle, which is more natural given the mentioned loads. > > > > > > > > Translating this to multigen, it seems fresh faults should really > > > > start on the second oldest rather than on the youngest generation, to > > > > get a second chance but without jeopardizing the workingset if they > > > > don't take it. > > > > > > This is a good point, and I had worked on a similar idea but failed > > > to measure its benefits. In addition to placing mmapped file pages in > > > older generations, I also tried placing refaulted anon pages in older > > > generations. My conclusion was that the initial LRU positions of NFU > > > pages are not a bottleneck for workloads I've tested. The efficiency > > > of testing/clearing the accessed bit is. > > > > The concern isn't the scan overhead, but jankiness from the workingset > > being flooded out by streaming IO. > > Yes, MGLRU uses a different approach to solve this problem, and for > its approach, the scan overhead is the concern. > > MGLRU detects (defines) the working set by scanning the entire memory > for each generation, and it counters the flooding by accelerating the > creation of generations. IOW, all mapped pages have an equal chance to > get scanned, no matter which generation they are in. This is a design > difference compared with the active/inactive LRU, which tries to scans > the active/inactive lists less/more frequently. > > > The concrete usecase at the time was a torrent client hashing a > > downloaded file and thereby kicking out the desktop environment, which > > caused jankiness. The hashing didn't benefit from caching - the file > > wouldn't have fit into RAM anyway - so this was pointless to boot. > > > > Essentially, the tradeoff is this: > > > > 1) If you treat new pages as hot, you accelerate workingset > > transitions, but on the flipside you risk unnecessary refaults in > > running applications when those new pages are one-off. > > > > 2) If you take new pages with a grain of salt, you protect existing > > applications better from one-off floods, but risk refaults in NEW > > application while they're trying to start up. > > Agreed. > > > There are two arguments for why 2) is preferable: > > > > 1) Users are tolerant of cache misses when applications first launch, > > much less so after they've been running for hours. > > Our CUJs (Critical User Journeys) respectfully disagree :) > > They are built on the observation that once users have moved onto > another tab/app, they are more likely to stay with the new tab/app > rather than go back to the old ones. Speaking for myself, this is > generally the case. > > > 2) Workingset transitions (and associated jankiness) are bounded by > > the amount of RAM you need to repopulate. But streaming IO is > > bounded by storage, and datasets are routinely several times the > > amount of RAM. Uncacheable sets in excess of RAM can produce an > > infinite stream of "new" references; not protecting the workingset > > from that means longer or even sustained jankiness. > > I'd argue the opposite -- we shouldn't risk refaulting fresh hot pages > just to accommodate this concrete yet minor use case, especially > considering torrent has been given the means (MADV_SEQUENTIAL) to help > itself. > > I appreciate all your points here. The bottom line is we agree this is > a trade off. For what disagree about, we could be both right -- it > comes down to what workloads we care about *more*. > > To move forward, I propose we look at it from a non-technical POV: > would we want to offer users an alternative trade off so that they can > have greater flexibility? > > > > And some applications are smart enough to leverage MADV_SEQUENTIAL. > > > In this case, MGLRU does place mmapped file pages in the oldest > > > generation. > > > > Yes, it makes sense to optimize when MADV_SEQUENTIAL is requested. But > > that hint isn't reliably there, so it matters that we don't do poorly > > when it's missing. > > Agreed. > > > > I have an oversimplified script that uses memcached to mimic a > > > non-streaming workload and fio a (mmapped) streaming workload: > > > > Looking at the paramters and observed behavior, let me say up front > > that this looks like a useful benchmark, but doesn't capture the > > scenario I was talking about above. > > > > For one, the presence of swapping in both kernels suggests that the > > "streaming IO" component actually has repeat access that could benefit > > from caching. Second, I would expect memcache is accessing its memory > > frequently and consistently, and so could withstand workingset > > challenges from streaming IO better than, say, a desktop environment. > > The fio workload is a real streaming workload, but the memcached > workload might have been too large to be a typical desktop workload. > > More below. > > > More on that below. > > > > > 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times > > > faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. > > > (It shouldn't, and I haven't figured out why.) > > > > Baseline swaps when there are cache refaults. This is regardless of > > the hint: you may say you're accessing these pages sequentially, but > > the refaults say you're reusing them, with a frequency that suggests > > they might be cacheable. So it tries to cache them. > > > > I'd be curious if that results in fio being faster, or whether it's > > all just pointless thrashing. Can you share the fio results too? > > More below. > > > We could patch baseline to prioritize MADV_SEQUENTIAL more, but... > > > > > 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 > > > time faster when using MGLRU. Both MGLRU and the baseline swapped > > > a lot. > > > > ...in practice I think this scenario will matter to a lot more users. > > I strongly feel we should prioritize what's advertised on a man page > over an unspecified (performance) behavior. > > > I would again be interested in the fio results. > > > > > MADV_SEQUENTIAL non-streaming ops/sec (memcached) > > > rc3 yes 292k > > > rc3 no 203k > > > rc3+v7 yes 1967k > > > rc3+v7 no 436k Appending a few notes on the baseline results: 1. Apparently FADV_DONTNEED rejects mmaped pages -- I found no reasons from the man page or the original commit why it should. I propose we remove the page_mapped() check in lru_deactivate_file_fn(). Adding Minchan to see how he thinks about this. 2. For MADV_SEQUENTIAL, I made workingset_refault() check (VM_SEQ_READ | VM_RAND_READ) and return if they are set, and the performance got a lot better (x3). 3. In page_referenced_one(), I think we should also exclude VM_RAND_READ in addition to VM_SEQ_READ. > > > cat mmap.sh > > > modprobe brd rd_nr=2 rd_size=56623104 > > > > > > mkswap /dev/ram0 > > > swapon /dev/ram0 > > > > > > mkfs.ext4 /dev/ram1 > > > mount -t ext4 /dev/ram1 /mnt > > > > > > memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ > > > -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=P:P -c 1 \ > > > -t 36 --ratio 1:0 --pipeline 8 -d 2000 > > > > > > # streaming workload: --fadvise_hint=0 disables MADV_SEQUENTIAL > > > fio -name=mglru --numjobs=12 --directory=/mnt --size=4224m --buffered=1 \ > > > --ioengine=mmap --iodepth=128 --iodepth_batch_submit=32 \ > > > --iodepth_batch_complete=32 --rw=read --time_based --ramp_time=10m \ > > > --runtime=180m --group_reporting & > > > > As per above, I think this would be closer to a cacheable workingset > > than a streaming IO pattern. It depends on total RAM of course, but > > size=4G and time_based should loop around pretty quickly. > > The file size here shouldn't matter since fio is smart enough to > invalidate page cache before it rewinds (for sequential access): > > https://fio.readthedocs.io/en/latest/fio_doc.html#cmdoption-arg-invalidate > https://github.com/axboe/fio/blob/master/filesetup.c#L602 > > I think the problem might have been the memory size for memcached was > too large (100GB) to be all hot (limited by memory bandwidth). > > > Would you mind rerunning with files larger than RAM, to avoid repeat > > accesses (or at least only repeat with large distances)? > > Retested with the same free memory (120GB) for 40GB memcached and 200GB > fio. > > MADV_SEQUENTIAL FADV_DONTNEED memcached fio > rc4 no yes 4716k 232k > rc4+v7 no yes 4307k 265k > delta -9% +14% > > MGLRU lost with memcached but won with fio for the same reason: it > doesn't have any heuristics to detect the streaming characteristic > (and therefore lost with memcached) but relies on faster scanning > (and therefore won with fio) to keep the working set in memory. > > The baseline didn't swap this time (MGLRU did slightly), but it lost > with fio because it had to walk the rmap for each page in the entire > 200GB VMA, at least once, even for this streaming workload. > > This reflects the design difference I mentioned earlier. > > cat test.sh > modprobe brd rd_nr=1 rd_size=268435456 > > mkfs.ext4 /dev/ram0 > mount -t ext4 /dev/ram0 /mnt > > fallocate -l 40g /mnt/swapfile > mkswap /mnt/swapfile > swapon /mnt/swapfile > > fio -name=mglru --numjobs=1 --directory=/mnt --size=204800m \ > --buffered=1 --ioengine=mmap --fadvise_hint=0 --iodepth=128 \ > --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ > --rw=read --time_based --ramp_time=10m --runtime=180m \ > --group_reporting & > pid=$! > > sleep 600 > > # load objects > memtier_benchmark -S /var/run/memcached/memcached.sock \ > -P memcache_binary -n allkeys --key-minimum=1 \ > --key-maximum=20000000 --key-pattern=P:P -c 1 -t 36 \ > --ratio 1:0 --pipeline 8 -d 2000 > # read objects > memtier_benchmark -S /var/run/memcached/memcached.sock \ > -P memcache_binary -n allkeys --key-minimum=1 \ > --key-maximum=20000000 --key-pattern=R:R -c 1 -t 36 \ > --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed > > kill -INT $pid > > > Depending on how hot memcache runs, it may or may not be able to hold > > onto its workingset. > > Agreed. > > > Testing interactivity is notoriously hard, but > > using a smaller, intermittent workload is probably more representative > > of overall responsiveness. Let fio ramp until memory is full, then do > > perf stat -r 10 /bin/sh -c 'git shortlog v5.0.. >/dev/null; sleep 1' > > I'll also check with the downstream maintainers to see if they have > heard any complaints about streaming workloads negatively impacting > user experience. > > > I'll try to reproduce this again too. Back then, that workload gave me > > a very janky desktop experience, and the patch very obvious relief. > > SGTM. > > > > > > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > > > > > { > > > > > enum lru_list lru = folio_lru_list(folio); > > > > > > > > > > + if (lru_gen_add_folio(lruvec, folio, true)) > > > > > + return; > > > > > + > > > > > > > > bool parameters are notoriously hard to follow in the callsite. Can > > > > you please add lru_gen_add_folio_tail() instead and have them use a > > > > common helper? > > > > > > I'm not sure -- there are several places like this one. My question is > > > whether we want to do it throughout this patchset. We'd end up with > > > many helpers and duplicate code. E.g., in this file alone, we have two > > > functions taking bool parameters: > > > lru_gen_add_folio(..., bool reclaiming) > > > lru_gen_del_folio(..., bool reclaiming) > > > > > > I can't say they are very readable; at least they are very compact > > > right now. My concern is that we might loose the latter without having > > > enough of the former. > > > > > > Perhaps this is something that we could revisit after you've finished > > > reviewing the entire patchset? > > > > Sure, fair enough. > > > > > > > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); > > > > > > > > "state" is what we usually init :) How about lrugen_init_lruvec()? > > > > > > Same story as "file", lol -- this used to be lru_gen_init_lruvec(): > > > https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ > > > > > > Naming is hard. Hopefully we can finalize it this time. > > > > Was that internal feedback? The revisions show this function went > > through several names, but I don't see reviews requesting those. If > > they weren't public I'm gonna pretend they didn't happen ;-) > > Indeed. I lost track. > > > > > You can drop the memcg parameter and use lruvec_memcg(). > > > > > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > > > this function because mem_cgroup_disabled() is initialized afterward. > > > > Good catch. That'll container_of() into garbage. However, we have to > > assume that somebody's going to try that simplification again, so we > > should set up the code now to prevent issues. > > > > cgroup_disable parsing is self-contained, so we can pull it ahead in > > the init sequence. How about this? > > > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > > index 9d05c3ca2d5e..b544d768edc8 100644 > > --- a/kernel/cgroup/cgroup.c > > +++ b/kernel/cgroup/cgroup.c > > @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) > > break; > > } > > } > > - return 1; > > + return 0; > > } > > -__setup("cgroup_disable=", cgroup_disable); > > +early_param("cgroup_disable", cgroup_disable); > > I think early_param() is still after pgdat_init_internals(), no? > > Thanks!
On Wed, Feb 23, 2022 at 02:18:24PM -0700, Yu Zhao wrote: > . > On Mon, Feb 21, 2022 at 1:14 AM Yu Zhao <yuzhao@google.com> wrote: > > > > On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > > > Hi Yu, > > > > > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > > +{ > > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > > + > > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > > + > > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > > +} > > > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > > set is split into two generations for the sole purpose of the > > > > > second-chance policy for fresh faults, right? > > > > > > > > To be precise, the active/inactive notion on top of generations is > > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > > Otherwise, this function wouldn't be needed. > > > > > > Ah! would you mind adding this as a comment to the function? > > > > Will do. > > > > > But AFAICS there is the lru_gen_del_folio() callsite that maps it to > > > the PG_active flag - which in turn gets used by add_folio() to place > > > the thing back on the max_seq generation. So I suppose there is a > > > secondary purpose of the function for remembering the page's rough age > > > for non-reclaim isolation.> > > > > Yes, e.g., migration. > > > > > It would be good to capture that as well in a comment on the function. > > > > Will do. > > > > > > > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > > > > > + int zone, long delta) > > > > > > +{ > > > > > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > > > > > + > > > > > > + lockdep_assert_held(&lruvec->lru_lock); > > > > > > + WARN_ON_ONCE(delta != (int)delta); > > > > > > + > > > > > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > > > > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > > > > > +} > > > > > > > > > > This is a duplicate of update_lru_size(), please use that instead. > > > > > > > > > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > > > > > that's not worth sweating over, better to keep it simple. > > > > > > > > I agree we don't need the mem_cgroup_update_lru_size() -- let me spell > > > > out why: > > > > this function is not needed here because it updates the counters used > > > > only by the active/inactive lru code, i.e., get_scan_count(). > > > > > > > > However, we can't reuse update_lru_size() because MGLRU can trip the > > > > WARN_ONCE() in mem_cgroup_update_lru_size(). > > > > > > > > Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. > > > > To move a page to a different generation, the gen counter in page->flags > > > > is updated first, which doesn't require the LRU lock. The second step, > > > > i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it > > > > usually isn't done immediately due to batching. Meanwhile, if this page > > > > is, for example, isolated, nr_pages[] becomes temporarily unbalanced. > > > > And this trips the WARN_ONCE(). > > > > > > Good insight. > > > > > > But in that case, I'd still think it's better to use update_lru_size() > > > and gate the memcg update on lrugen-enabled, with a short comment > > > saying that lrugen has its own per-cgroup counts already. It's just a > > > bit too error prone to duplicate the stat updates. > > > > > > Even better would be: > > > > > > static __always_inline > > > void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) > > > { > > > enum lru_list lru = folio_lru_list(folio); > > > > > > update_lru_size(lruvec, lru, folio_zonenum(folio), > > > folio_nr_pages(folio)); > > > if (lrugen_enabled(lruvec)) > > > lrugen_add_folio(lruvec, folio); > > > else > > > list_add(&folio->lru, &lruvec->lists[lru]); > > > } > > > > > > But it does mean you'd have to handle unevictable pages. I'm reviewing > > > from the position that mglru is going to supplant the existing reclaim > > > algorithm in the long term, though, so being more comprehensive and > > > eliminating special cases where possible is all-positive, IMO. > > > > > > Up to you. I'd only insist on reusing update_lru_size() at least. > > > > Will do. > > > > > > > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > > > > > +{ > > > > > > + int gen; > > > > > > + unsigned long old_flags, new_flags; > > > > > > + int type = folio_is_file_lru(folio); > > > > > > + int zone = folio_zonenum(folio); > > > > > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > > > > > + > > > > > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > > > > > + return false; > > > > > > > > > > These two checks should be in the callsite and the function should > > > > > return void. Otherwise you can't understand the callsite without > > > > > drilling down into lrugen code, even if lrugen is disabled. > > > > > > > > I agree it's a bit of nuisance this way. The alternative is we'd need > > > > ifdef or another helper at the call sites because lrugen->enabled is > > > > specific to lrugen. > > > > > > Coming from memcg, my experience has been that when you have a compile > > > time-optional MM extension like this, you'll sooner or later need a > > > config-independent helper to gate callbacks in generic code. So I > > > think it's a good idea to add one now. > > > > > > One of these? > > > > > > lruvec_on_lrugen() > > > > SGTM. > > > > Personally I'd reuse lru_gen_enabled(), by passing NULL/lruvec. But > > my guess is you wouldn't like it. > > > > > lruvec_using_lrugen() > > > lruvec_lrugen_enabled() > > > > > > lruvec_has_generations() :-) > > > > > > > > On that note, I think #1 is reintroducing a problem we have fixed > > > > > before, which is trashing the workingset with a flood of use-once > > > > > mmapped pages. It's the classic scenario where LFU beats LRU. > > > > > > > > > > Mapped streaming IO isn't very common, but it does happen. See these > > > > > commits: > > > > > > > > > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > > > > > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > > > > > 645747462435d84c6c6a64269ed49cc3015f753d > > > > > > > > > > From the changelog: > > > > > > > > > > The used-once mapped file page detection patchset. > > > > > > > > > > It is meant to help workloads with large amounts of shortly used file > > > > > mappings, like rtorrent hashing a file or git when dealing with loose > > > > > objects (git gc on a bigger site?). > > > > > > > > > > Right now, the VM activates referenced mapped file pages on first > > > > > encounter on the inactive list and it takes a full memory cycle to > > > > > reclaim them again. When those pages dominate memory, the system > > > > > no longer has a meaningful notion of 'working set' and is required > > > > > to give up the active list to make reclaim progress. Obviously, > > > > > this results in rather bad scanning latencies and the wrong pages > > > > > being reclaimed. > > > > > > > > > > This patch makes the VM be more careful about activating mapped file > > > > > pages in the first place. The minimum granted lifetime without > > > > > another memory access becomes an inactive list cycle instead of the > > > > > full memory cycle, which is more natural given the mentioned loads. > > > > > > > > > > Translating this to multigen, it seems fresh faults should really > > > > > start on the second oldest rather than on the youngest generation, to > > > > > get a second chance but without jeopardizing the workingset if they > > > > > don't take it. > > > > > > > > This is a good point, and I had worked on a similar idea but failed > > > > to measure its benefits. In addition to placing mmapped file pages in > > > > older generations, I also tried placing refaulted anon pages in older > > > > generations. My conclusion was that the initial LRU positions of NFU > > > > pages are not a bottleneck for workloads I've tested. The efficiency > > > > of testing/clearing the accessed bit is. > > > > > > The concern isn't the scan overhead, but jankiness from the workingset > > > being flooded out by streaming IO. > > > > Yes, MGLRU uses a different approach to solve this problem, and for > > its approach, the scan overhead is the concern. > > > > MGLRU detects (defines) the working set by scanning the entire memory > > for each generation, and it counters the flooding by accelerating the > > creation of generations. IOW, all mapped pages have an equal chance to > > get scanned, no matter which generation they are in. This is a design > > difference compared with the active/inactive LRU, which tries to scans > > the active/inactive lists less/more frequently. > > > > > The concrete usecase at the time was a torrent client hashing a > > > downloaded file and thereby kicking out the desktop environment, which > > > caused jankiness. The hashing didn't benefit from caching - the file > > > wouldn't have fit into RAM anyway - so this was pointless to boot. > > > > > > Essentially, the tradeoff is this: > > > > > > 1) If you treat new pages as hot, you accelerate workingset > > > transitions, but on the flipside you risk unnecessary refaults in > > > running applications when those new pages are one-off. > > > > > > 2) If you take new pages with a grain of salt, you protect existing > > > applications better from one-off floods, but risk refaults in NEW > > > application while they're trying to start up. > > > > Agreed. > > > > > There are two arguments for why 2) is preferable: > > > > > > 1) Users are tolerant of cache misses when applications first launch, > > > much less so after they've been running for hours. > > > > Our CUJs (Critical User Journeys) respectfully disagree :) > > > > They are built on the observation that once users have moved onto > > another tab/app, they are more likely to stay with the new tab/app > > rather than go back to the old ones. Speaking for myself, this is > > generally the case. > > > > > 2) Workingset transitions (and associated jankiness) are bounded by > > > the amount of RAM you need to repopulate. But streaming IO is > > > bounded by storage, and datasets are routinely several times the > > > amount of RAM. Uncacheable sets in excess of RAM can produce an > > > infinite stream of "new" references; not protecting the workingset > > > from that means longer or even sustained jankiness. > > > > I'd argue the opposite -- we shouldn't risk refaulting fresh hot pages > > just to accommodate this concrete yet minor use case, especially > > considering torrent has been given the means (MADV_SEQUENTIAL) to help > > itself. > > > > I appreciate all your points here. The bottom line is we agree this is > > a trade off. For what disagree about, we could be both right -- it > > comes down to what workloads we care about *more*. > > > > To move forward, I propose we look at it from a non-technical POV: > > would we want to offer users an alternative trade off so that they can > > have greater flexibility? > > > > > > And some applications are smart enough to leverage MADV_SEQUENTIAL. > > > > In this case, MGLRU does place mmapped file pages in the oldest > > > > generation. > > > > > > Yes, it makes sense to optimize when MADV_SEQUENTIAL is requested. But > > > that hint isn't reliably there, so it matters that we don't do poorly > > > when it's missing. > > > > Agreed. > > > > > > I have an oversimplified script that uses memcached to mimic a > > > > non-streaming workload and fio a (mmapped) streaming workload: > > > > > > Looking at the paramters and observed behavior, let me say up front > > > that this looks like a useful benchmark, but doesn't capture the > > > scenario I was talking about above. > > > > > > For one, the presence of swapping in both kernels suggests that the > > > "streaming IO" component actually has repeat access that could benefit > > > from caching. Second, I would expect memcache is accessing its memory > > > frequently and consistently, and so could withstand workingset > > > challenges from streaming IO better than, say, a desktop environment. > > > > The fio workload is a real streaming workload, but the memcached > > workload might have been too large to be a typical desktop workload. > > > > More below. > > > > > More on that below. > > > > > > > 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times > > > > faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. > > > > (It shouldn't, and I haven't figured out why.) > > > > > > Baseline swaps when there are cache refaults. This is regardless of > > > the hint: you may say you're accessing these pages sequentially, but > > > the refaults say you're reusing them, with a frequency that suggests > > > they might be cacheable. So it tries to cache them. > > > > > > I'd be curious if that results in fio being faster, or whether it's > > > all just pointless thrashing. Can you share the fio results too? > > > > More below. > > > > > We could patch baseline to prioritize MADV_SEQUENTIAL more, but... > > > > > > > 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 > > > > time faster when using MGLRU. Both MGLRU and the baseline swapped > > > > a lot. > > > > > > ...in practice I think this scenario will matter to a lot more users. > > > > I strongly feel we should prioritize what's advertised on a man page > > over an unspecified (performance) behavior. > > > > > I would again be interested in the fio results. > > > > > > > MADV_SEQUENTIAL non-streaming ops/sec (memcached) > > > > rc3 yes 292k > > > > rc3 no 203k > > > > rc3+v7 yes 1967k > > > > rc3+v7 no 436k > > Appending a few notes on the baseline results: > 1. Apparently FADV_DONTNEED rejects mmaped pages -- I found no reasons > from the man page or the original commit why it should. I propose we > remove the page_mapped() check in lru_deactivate_file_fn(). Adding > Minchan to see how he thinks about this. Hi Yu, It's quite old code. I already forgot all the details. Maybe, I wanted t minimize behavior changes since invalidate_inode_page already fiter mapped pages out. I don't have any strong reason not to move mapped pages for deactivation but if we do, it would be better to move them into head of inactive list instead of tail to give a promote chance since other processes are still mappping the page in their address space. Thanks.
Hi Yu, On Mon, Feb 21, 2022 at 01:14:24AM -0700, Yu Zhao wrote: > On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > +{ > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > + > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > + > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > +} > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > set is split into two generations for the sole purpose of the > > > > second-chance policy for fresh faults, right? > > > > > > To be precise, the active/inactive notion on top of generations is > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > Otherwise, this function wouldn't be needed. > > > > Ah! would you mind adding this as a comment to the function? > > Will do. > > > But AFAICS there is the lru_gen_del_folio() callsite that maps it to > > the PG_active flag - which in turn gets used by add_folio() to place > > the thing back on the max_seq generation. So I suppose there is a > > secondary purpose of the function for remembering the page's rough age > > for non-reclaim isolation.> > > Yes, e.g., migration. Ok, thanks for clarifying. That should also be in the comment. On scan resistance: > > The concern isn't the scan overhead, but jankiness from the workingset > > being flooded out by streaming IO. > > Yes, MGLRU uses a different approach to solve this problem, and for > its approach, the scan overhead is the concern. > > MGLRU detects (defines) the working set by scanning the entire memory > for each generation, and it counters the flooding by accelerating the > creation of generations. IOW, all mapped pages have an equal chance to > get scanned, no matter which generation they are in. This is a design > difference compared with the active/inactive LRU, which tries to scans > the active/inactive lists less/more frequently. > > > The concrete usecase at the time was a torrent client hashing a > > downloaded file and thereby kicking out the desktop environment, which > > caused jankiness. The hashing didn't benefit from caching - the file > > wouldn't have fit into RAM anyway - so this was pointless to boot. > > > > Essentially, the tradeoff is this: > > > > 1) If you treat new pages as hot, you accelerate workingset > > transitions, but on the flipside you risk unnecessary refaults in > > running applications when those new pages are one-off. > > > > 2) If you take new pages with a grain of salt, you protect existing > > applications better from one-off floods, but risk refaults in NEW > > application while they're trying to start up. > > Agreed. > > > There are two arguments for why 2) is preferable: > > > > 1) Users are tolerant of cache misses when applications first launch, > > much less so after they've been running for hours. > > Our CUJs (Critical User Journeys) respectfully disagree :) > > They are built on the observation that once users have moved onto > another tab/app, they are more likely to stay with the new tab/app > rather than go back to the old ones. Speaking for myself, this is > generally the case. That's in line with what I said. Where is the disagreement? > > 2) Workingset transitions (and associated jankiness) are bounded by > > the amount of RAM you need to repopulate. But streaming IO is > > bounded by storage, and datasets are routinely several times the > > amount of RAM. Uncacheable sets in excess of RAM can produce an > > infinite stream of "new" references; not protecting the workingset > > from that means longer or even sustained jankiness. > > I'd argue the opposite -- we shouldn't risk refaulting fresh hot pages > just to accommodate this concrete yet minor use case, especially > considering torrent has been given the means (MADV_SEQUENTIAL) to help > itself. > > I appreciate all your points here. The bottom line is we agree this is > a trade off. For what disagree about, we could be both right -- it > comes down to what workloads we care about *more*. It's a straight-forward question: How does MGLRU avoid cache pollution from scans? Your answer above seems to be "it just does". Your answer here seems to be "it doesn't, but it doesn't matter". Forgive me if I'm misreading what you're saying. But it's not a minor concern. Read the motivation behind any modern cache algorithm - ARC, LIRS, Clock-Pro, LRU-K, 2Q - and scan resistance is the reason for why they all exist in the first place. "The LRU-K algorithm surpasses conventional buffering algorithms in discriminating between frequently and infrequently referenced pages." - The LRU-K page replacement algorithm for database disk buffering, O'Neil et al, 1993 "Although LRU replacement policy has been commonly used in the buffer cache management, it is well known for its inability to cope with access patterns with weak locality." - LIRS: an efficient low inter-reference recency set replacement policy to improve buffer cache performance, Jiang, Zhang, 2002 "The self-tuning, low-overhead, scan-resistant adaptive replacement cache algorithm outperforms the least-recently-used algorithm by dynamically responding to changing access patterns and continually balancing between workload recency and frequency features." - Outperforming LRU with an adaptive replacement cache algorithm, Megiddo, Modha, 2004 "Over the last three decades, the inability of LRU as well as CLOCK to handle weak locality accesses has become increasingly serious, and an effective fix becomes increasingly desirable. - CLOCK-Pro: An Effective Improvement of the CLOCK Replacement, Jiang et al, 2005 We can't rely on MADV_SEQUENTIAL alone. Not all accesses know in advance that they'll be one-off; it can be a group of uncoordinated tasks causing the pattern etc. This is a pretty fundamental issue. It would be good to get a more satisfying answer on this. > > > > You can drop the memcg parameter and use lruvec_memcg(). > > > > > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > > > this function because mem_cgroup_disabled() is initialized afterward. > > > > Good catch. That'll container_of() into garbage. However, we have to > > assume that somebody's going to try that simplification again, so we > > should set up the code now to prevent issues. > > > > cgroup_disable parsing is self-contained, so we can pull it ahead in > > the init sequence. How about this? > > > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > > index 9d05c3ca2d5e..b544d768edc8 100644 > > --- a/kernel/cgroup/cgroup.c > > +++ b/kernel/cgroup/cgroup.c > > @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) > > break; > > } > > } > > - return 1; > > + return 0; > > } > > -__setup("cgroup_disable=", cgroup_disable); > > +early_param("cgroup_disable", cgroup_disable); > > I think early_param() is still after pgdat_init_internals(), no? It's called twice for some reason, but AFAICS the first one is always called before pgdat_init_internals(): start_kernel() setup_arch() parse_early_param() x86_init.paging.pagetable_init(); paging_init() zone_sizes_init() free_area_init() free_area_init_node() free_area_init_core() pgdat_init_internals() parse_early_param() It's the same/similar for arm, sparc and mips.
On Thu, Mar 3, 2022 at 8:29 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > Hi Yu, > > On Mon, Feb 21, 2022 at 01:14:24AM -0700, Yu Zhao wrote: > > On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > > +{ > > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > > + > > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > > + > > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > > +} > > > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > > set is split into two generations for the sole purpose of the > > > > > second-chance policy for fresh faults, right? > > > > > > > > To be precise, the active/inactive notion on top of generations is > > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > > Otherwise, this function wouldn't be needed. > > > > > > Ah! would you mind adding this as a comment to the function? > > > > Will do. > > > > > But AFAICS there is the lru_gen_del_folio() callsite that maps it to > > > the PG_active flag - which in turn gets used by add_folio() to place > > > the thing back on the max_seq generation. So I suppose there is a > > > secondary purpose of the function for remembering the page's rough age > > > for non-reclaim isolation.> > > > > Yes, e.g., migration. > > Ok, thanks for clarifying. That should also be in the comment. Thanks. Will do. > On scan resistance: > > > > The concern isn't the scan overhead, but jankiness from the workingset > > > being flooded out by streaming IO. > > > > Yes, MGLRU uses a different approach to solve this problem, and for > > its approach, the scan overhead is the concern. > > > > MGLRU detects (defines) the working set by scanning the entire memory > > for each generation, and it counters the flooding by accelerating the > > creation of generations. IOW, all mapped pages have an equal chance to > > get scanned, no matter which generation they are in. This is a design > > difference compared with the active/inactive LRU, which tries to scans > > the active/inactive lists less/more frequently. > > > > > The concrete usecase at the time was a torrent client hashing a > > > downloaded file and thereby kicking out the desktop environment, which > > > caused jankiness. The hashing didn't benefit from caching - the file > > > wouldn't have fit into RAM anyway - so this was pointless to boot. > > > > > > Essentially, the tradeoff is this: > > > > > > 1) If you treat new pages as hot, you accelerate workingset > > > transitions, but on the flipside you risk unnecessary refaults in > > > running applications when those new pages are one-off. > > > > > > 2) If you take new pages with a grain of salt, you protect existing > > > applications better from one-off floods, but risk refaults in NEW > > > application while they're trying to start up. > > > > Agreed. > > > > > There are two arguments for why 2) is preferable: > > > > > > 1) Users are tolerant of cache misses when applications first launch, > > > much less so after they've been running for hours. > > > > Our CUJs (Critical User Journeys) respectfully disagree :) > > > > They are built on the observation that once users have moved onto > > another tab/app, they are more likely to stay with the new tab/app > > rather than go back to the old ones. Speaking for myself, this is > > generally the case. > > That's in line with what I said. Where is the disagreement? Probably I've misinterpreted what you meant. The reasoning behind 1) sounds to me like: Cache misses of existing apps are more detrimental to user experience, and therefore we choose to sacrifice the performance of newly launched apps to avoid flooding. My argument is that (phone/laptop/desktop) users usually care more about the performance of newly launched apps -- this sounds a contradiction of what 1) says -- and therefore sacrificing the performance of newly launched apps is not generally a good idea. > > > 2) Workingset transitions (and associated jankiness) are bounded by > > > the amount of RAM you need to repopulate. But streaming IO is > > > bounded by storage, and datasets are routinely several times the > > > amount of RAM. Uncacheable sets in excess of RAM can produce an > > > infinite stream of "new" references; not protecting the workingset > > > from that means longer or even sustained jankiness. > > > > I'd argue the opposite -- we shouldn't risk refaulting fresh hot pages > > just to accommodate this concrete yet minor use case, especially > > considering torrent has been given the means (MADV_SEQUENTIAL) to help > > itself. > > > > I appreciate all your points here. The bottom line is we agree this is > > a trade off. For what disagree about, we could be both right -- it > > comes down to what workloads we care about *more*. > > It's a straight-forward question: How does MGLRU avoid cache pollution > from scans? > > Your answer above seems to be "it just does". Your answer here seems > to be "it doesn't, but it doesn't matter". Forgive me if I'm > misreading what you're saying. This might have gotten lost. At the beginning, I explained: > > Yes, MGLRU uses a different approach to solve this problem, and for > > its approach, the scan overhead is the concern. > > > > MGLRU detects (defines) the working set by scanning the entire memory > > for each generation, and it counters the flooding by accelerating the > > creation of generations. IOW, all mapped pages have an equal chance to > > get scanned, no matter which generation they are in. This is a design > > difference compared with the active/inactive LRU, which tries to scan > > the active/inactive lists less/more frequently. To summarize: MGLRU counteracts flooding by shortening the time interval that defines an entire working set. In contrast, the active/inactive LRU uses a longer time interval for the active (established working set) and a shorter one for the inactive (flood or new working set). > But it's not a minor concern. This depends on the POV :) For what workloads I care about more, i.e., the majority of phone/laptop/desktop users and top open-source memory hogs running on servers, I've heard no complaints (yet). > Read the motivation behind any modern > cache algorithm - ARC, LIRS, Clock-Pro, LRU-K, 2Q - and scan > resistance is the reason for why they all exist in the first place. I agree that flooding is a concern (major from your POV; minor from my POV). I assume that you agree that the solution is always a tradeoff. I'm merely suggesting both tradeoffs I summarized above have their merits and we shouldn't put all eggs in one bucket. > "The LRU-K algorithm surpasses conventional buffering algorithms > in discriminating between frequently and infrequently referenced > pages." > > - The LRU-K page replacement algorithm for database disk > buffering, O'Neil et al, 1993 > > "Although LRU replacement policy has been commonly used in the > buffer cache management, it is well known for its inability to > cope with access patterns with weak locality." > > - LIRS: an efficient low inter-reference recency set > replacement policy to improve buffer cache performance, > Jiang, Zhang, 2002 > > "The self-tuning, low-overhead, scan-resistant adaptive > replacement cache algorithm outperforms the least-recently-used > algorithm by dynamically responding to changing access patterns > and continually balancing between workload recency and frequency > features." > > - Outperforming LRU with an adaptive replacement cache > algorithm, Megiddo, Modha, 2004 > > "Over the last three decades, the inability of LRU as well as > CLOCK to handle weak locality accesses has become increasingly > serious, and an effective fix becomes increasingly desirable. > > - CLOCK-Pro: An Effective Improvement of the CLOCK > Replacement, Jiang et al, 2005 > > > We can't rely on MADV_SEQUENTIAL alone. Agreed. > Not all accesses know in > advance that they'll be one-off; it can be a group of uncoordinated > tasks causing the pattern etc. For what we know, MADV_SEQUENTIAL can be a solution. There are things we don't know, hence the "agreed" above :) > This is a pretty fundamental issue. Agreed. My points are: 1. The practice value of this fundamental issue (minor or major). 2. The merits of different tradeoffs (better or worse). IMO, both depend on the POV. > It would be good to get a more > satisfying answer on this. Agreed. > > > > > You can drop the memcg parameter and use lruvec_memcg(). > > > > > > > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > > > > this function because mem_cgroup_disabled() is initialized afterward. > > > > > > Good catch. That'll container_of() into garbage. However, we have to > > > assume that somebody's going to try that simplification again, so we > > > should set up the code now to prevent issues. > > > > > > cgroup_disable parsing is self-contained, so we can pull it ahead in > > > the init sequence. How about this? > > > > > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > > > index 9d05c3ca2d5e..b544d768edc8 100644 > > > --- a/kernel/cgroup/cgroup.c > > > +++ b/kernel/cgroup/cgroup.c > > > @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) > > > break; > > > } > > > } > > > - return 1; > > > + return 0; > > > } > > > -__setup("cgroup_disable=", cgroup_disable); > > > +early_param("cgroup_disable", cgroup_disable); > > > > I think early_param() is still after pgdat_init_internals(), no? > > It's called twice for some reason, but AFAICS the first one is always > called before pgdat_init_internals(): > > start_kernel() > setup_arch() > parse_early_param() > x86_init.paging.pagetable_init(); > paging_init() > zone_sizes_init() > free_area_init() > free_area_init_node() > free_area_init_core() > pgdat_init_internals() > parse_early_param() > > It's the same/similar for arm, sparc and mips. Thanks for checking. But I'd rather live with an additional parameter than risk breaking some archs.
On Thu, Mar 03, 2022 at 12:26:45PM -0700, Yu Zhao wrote: > On Thu, Mar 3, 2022 at 8:29 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Mon, Feb 21, 2022 at 01:14:24AM -0700, Yu Zhao wrote: > > > On Tue, Feb 15, 2022 at 04:53:56PM -0500, Johannes Weiner wrote: > > > > On Tue, Feb 15, 2022 at 02:43:05AM -0700, Yu Zhao wrote: > > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > You can drop the memcg parameter and use lruvec_memcg(). > > > > > > > > > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > > > > > this function because mem_cgroup_disabled() is initialized afterward. > > > > > > > > Good catch. That'll container_of() into garbage. However, we have to > > > > assume that somebody's going to try that simplification again, so we > > > > should set up the code now to prevent issues. > > > > > > > > cgroup_disable parsing is self-contained, so we can pull it ahead in > > > > the init sequence. How about this? > > > > > > > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > > > > index 9d05c3ca2d5e..b544d768edc8 100644 > > > > --- a/kernel/cgroup/cgroup.c > > > > +++ b/kernel/cgroup/cgroup.c > > > > @@ -6464,9 +6464,9 @@ static int __init cgroup_disable(char *str) > > > > break; > > > > } > > > > } > > > > - return 1; > > > > + return 0; > > > > } > > > > -__setup("cgroup_disable=", cgroup_disable); > > > > +early_param("cgroup_disable", cgroup_disable); > > > > > > I think early_param() is still after pgdat_init_internals(), no? > > > > It's called twice for some reason, but AFAICS the first one is always > > called before pgdat_init_internals(): > > > > start_kernel() > > setup_arch() > > parse_early_param() > > x86_init.paging.pagetable_init(); > > paging_init() > > zone_sizes_init() > > free_area_init() > > free_area_init_node() > > free_area_init_core() > > pgdat_init_internals() > > parse_early_param() > > > > It's the same/similar for arm, sparc and mips. > > Thanks for checking. But I'd rather live with an additional parameter > than risk breaking some archs. As per above, somebody is going to try to make that simplification again in the future. It doesn't make a lot of sense to have a reviewer trip over it, have a discussion about just how subtle this dependency is, and then still leave it in for others. parse_early_param() is documented to be called by arch code early on, there isn't a good reason to mistrust our own codebase like that. And special-casing this situation just complicates maintainability and hackability. Please just fix the ordering and use lruvec_memcg(), thanks.
On Tue, Feb 15, 2022 at 10:43 PM Yu Zhao <yuzhao@google.com> wrote: > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > Thanks for reviewing. > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > +{ > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > + > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > + > > > + /* see the comment on MIN_NR_GENS */ > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > +} > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > set is split into two generations for the sole purpose of the > > second-chance policy for fresh faults, right? > > To be precise, the active/inactive notion on top of generations is > just for ABI compatibility, e.g., the counters in /proc/vmstat. > Otherwise, this function wouldn't be needed. Hi Yu, I am still quite confused as i am seeing both active/inactive and lru_gen. eg: root@ubuntu:~# cat /proc/vmstat | grep active nr_zone_inactive_anon 22797 nr_zone_active_anon 578405 nr_zone_inactive_file 0 nr_zone_active_file 4156 nr_inactive_anon 22800 nr_active_anon 578574 nr_inactive_file 0 nr_active_file 4215 and: root@ubuntu:~# cat /sys//kernel/debug/lru_gen ... memcg 36 /user.slice/user-0.slice/user@0.service node 0 20 18820 22 0 21 7452 0 0 22 7448 0 0 memcg 33 /user.slice/user-0.slice/user@0.service/app.slice node 0 0 2171452 0 0 1 2171452 0 0 2 2171452 0 0 3 2171452 0 0 memcg 37 /user.slice/user-0.slice/session-1.scope node 0 42 51804 102127 0 43 18840 275622 0 44 16104 216805 1 Does it mean one page could be in both one of the generations and one of the active/inactive lists? Do we have some mapping relationship between active/inactive lists with generations? > > > If so, it'd be better to have the comment here instead of down by > > MIN_NR_GENS. This is the place that defines what "active" is, so this > > is where the reader asks what it means and what it implies. The > > definition of MIN_NR_GENS can be briefer: "need at least two for > > second chance, see lru_gen_is_active() for details". > > This could be understood this way. It'd be more appropriate to see > this function as an auxiliary and MIN_NR_GENS as something fundamental. > Therefore the former should refer to the latter. Specifically, the > "see the comment on MIN_NR_GENS" refers to this part: > And to be compatible with the active/inactive LRU, these two > generations are mapped to the active; the rest of generations, if > they exist, are mapped to the inactive. > > > > +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, > > > + int zone, long delta) > > > +{ > > > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > > > + > > > + lockdep_assert_held(&lruvec->lru_lock); > > > + WARN_ON_ONCE(delta != (int)delta); > > > + > > > + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); > > > + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); > > > +} > > > > This is a duplicate of update_lru_size(), please use that instead. > > > > Yeah technically you don't need the mem_cgroup_update_lru_size() but > > that's not worth sweating over, better to keep it simple. > > I agree we don't need the mem_cgroup_update_lru_size() -- let me spell > out why: > this function is not needed here because it updates the counters used > only by the active/inactive lru code, i.e., get_scan_count(). > > However, we can't reuse update_lru_size() because MGLRU can trip the > WARN_ONCE() in mem_cgroup_update_lru_size(). > > Unlike lru_zone_size[], lrugen->nr_pages[] is eventually consistent. > To move a page to a different generation, the gen counter in page->flags > is updated first, which doesn't require the LRU lock. The second step, > i.e., the update of lrugen->nr_pages[], requires the LRU lock, and it > usually isn't done immediately due to batching. Meanwhile, if this page > is, for example, isolated, nr_pages[] becomes temporarily unbalanced. > And this trips the WARN_ONCE(). > > <snipped> > > > /* Promotion */ > > > + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { > > > + lru_gen_update_size(lruvec, lru, zone, -delta); > > > + lru_gen_update_size(lruvec, lru + LRU_ACTIVE, zone, delta); > > > + } > > > + > > > + /* Promotion is legit while a page is on an LRU list, but demotion isn't. */ > > > > /* Demotion happens during aging when pages are isolated, never on-LRU */ > > > + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); > > > +} > > > > On that note, please move introduction of the promotion and demotion > > bits to the next patch. They aren't used here yet, and I spent some > > time jumping around patches to verify the promotion callers and > > confirm the validy of the BUG_ON. > > Will do. > > > > +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) > > > +{ > > > + int gen; > > > + unsigned long old_flags, new_flags; > > > + int type = folio_is_file_lru(folio); > > > + int zone = folio_zonenum(folio); > > > + struct lru_gen_struct *lrugen = &lruvec->lrugen; > > > + > > > + if (folio_test_unevictable(folio) || !lrugen->enabled) > > > + return false; > > > > These two checks should be in the callsite and the function should > > return void. Otherwise you can't understand the callsite without > > drilling down into lrugen code, even if lrugen is disabled. > > I agree it's a bit of nuisance this way. The alternative is we'd need > ifdef or another helper at the call sites because lrugen->enabled is > specific to lrugen. > > > > + /* > > > + * There are three common cases for this page: > > > + * 1) If it shouldn't be evicted, e.g., it was just faulted in, add it > > > + * to the youngest generation. > > > > "shouldn't be evicted" makes it sound like mlock. But they should just > > be evicted last, right? Maybe: > > > > /* > > * Pages start in different generations depending on > > * advance knowledge we have about their hotness and > > * evictability: > > * > > * 1. Already active pages start out youngest. This can be > > * fresh faults, or refaults of previously hot pages. > > * 2. Cold pages that require writeback before becoming > > * evictable start on the second oldest generation. > > * 3. Everything else (clean, cold) starts old. > > */ > > Will do. > > > On that note, I think #1 is reintroducing a problem we have fixed > > before, which is trashing the workingset with a flood of use-once > > mmapped pages. It's the classic scenario where LFU beats LRU. > > > > Mapped streaming IO isn't very common, but it does happen. See these > > commits: > > > > dfc8d636cdb95f7b792d5ba8c9f3b295809c125d > > 31c0569c3b0b6cc8a867ac6665ca081553f7984c > > 645747462435d84c6c6a64269ed49cc3015f753d > > > > From the changelog: > > > > The used-once mapped file page detection patchset. > > > > It is meant to help workloads with large amounts of shortly used file > > mappings, like rtorrent hashing a file or git when dealing with loose > > objects (git gc on a bigger site?). > > > > Right now, the VM activates referenced mapped file pages on first > > encounter on the inactive list and it takes a full memory cycle to > > reclaim them again. When those pages dominate memory, the system > > no longer has a meaningful notion of 'working set' and is required > > to give up the active list to make reclaim progress. Obviously, > > this results in rather bad scanning latencies and the wrong pages > > being reclaimed. > > > > This patch makes the VM be more careful about activating mapped file > > pages in the first place. The minimum granted lifetime without > > another memory access becomes an inactive list cycle instead of the > > full memory cycle, which is more natural given the mentioned loads. > > > > Translating this to multigen, it seems fresh faults should really > > start on the second oldest rather than on the youngest generation, to > > get a second chance but without jeopardizing the workingset if they > > don't take it. > > This is a good point, and I had worked on a similar idea but failed > to measure its benefits. In addition to placing mmapped file pages in > older generations, I also tried placing refaulted anon pages in older > generations. My conclusion was that the initial LRU positions of NFU > pages are not a bottleneck for workloads I've tested. The efficiency > of testing/clearing the accessed bit is. > > And some applications are smart enough to leverage MADV_SEQUENTIAL. > In this case, MGLRU does place mmapped file pages in the oldest > generation. > > I have an oversimplified script that uses memcached to mimic a > non-streaming workload and fio a (mmapped) streaming workload: > 1. With MADV_SEQUENTIAL, the non-streaming workload is about 5 times > faster when using MGLRU. Somehow the baseline (rc3) swapped a lot. > (It shouldn't, and I haven't figured out why.) > 2. Without MADV_SEQUENTIAL, the non-streaming workload is about 1 > time faster when using MGLRU. Both MGLRU and the baseline swapped > a lot. > > MADV_SEQUENTIAL non-streaming ops/sec (memcached) > rc3 yes 292k > rc3 no 203k > rc3+v7 yes 1967k > rc3+v7 no 436k > > cat mmap.sh > modprobe brd rd_nr=2 rd_size=56623104 > > mkswap /dev/ram0 > swapon /dev/ram0 > > mkfs.ext4 /dev/ram1 > mount -t ext4 /dev/ram1 /mnt > > memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ > -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=P:P -c 1 \ > -t 36 --ratio 1:0 --pipeline 8 -d 2000 > > # streaming workload: --fadvise_hint=0 disables MADV_SEQUENTIAL > fio -name=mglru --numjobs=12 --directory=/mnt --size=4224m --buffered=1 \ > --ioengine=mmap --iodepth=128 --iodepth_batch_submit=32 \ > --iodepth_batch_complete=32 --rw=read --time_based --ramp_time=10m \ > --runtime=180m --group_reporting & > pid=$! > > sleep 200 > > # non-streaming workload > memtier_benchmark -S /var/run/memcached/memcached.sock -P memcache_binary \ > -n allkeys --key-minimum=1 --key-maximum=50000000 --key-pattern=R:R \ > -c 1 -t 36 --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed > > kill -INT $pid > wait We used to put a faulted file page in inactive, if we access it a second time, it can be promoted to active. then in recent years, we have also applied this to anon pages while kernel adds workingset protection for anon pages. so basically both anon and file pages go into the inactive list for the 1st time, if we access it for the second time, they go to the active list. if we don't access it any more, they are likely to be reclaimed as they are inactive. we do have some special fastpath for code section, executable file pages are kept on active list as long as they are accessed. so all of the above concerns are actually not that correct? > > > > + * 2) If it can't be evicted immediately, i.e., it's an anon page and > > > + * not in swapcache, or a dirty page pending writeback, add it to the > > > + * second oldest generation. > > > + * 3) If it may be evicted immediately, e.g., it's a clean page, add it > > > + * to the oldest generation. > > > + */ > > > + if (folio_test_active(folio)) > > > + gen = lru_gen_from_seq(lrugen->max_seq); > > > + else if ((!type && !folio_test_swapcache(folio)) || > > > + (folio_test_reclaim(folio) && > > > + (folio_test_dirty(folio) || folio_test_writeback(folio)))) > > > + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); > > > + else > > > + gen = lru_gen_from_seq(lrugen->min_seq[type]); > > > > Condition #2 is not quite clear to me, and the comment is incomplete: > > The code does put dirty/writeback pages on the oldest gen as long as > > they haven't been marked for immediate reclaim by the scanner > > yet. > > Right. > > > HOWEVER, once the scanner does see those pages and sets > > PG_reclaim, it will also activate them to move them out of the way > > until writeback finishes (see shrink_page_list()) - at which point > > we'll trigger #1. So that second part of #2 appears unreachable. > > Yes, dirty file pages go to #1; dirty pages in swapcache go to #2. > (Ideally we want dirty file pages go to #2 too. IMO, the code would > be cleaner that way.) > > > It could be a good exercise to describe how cache pages move through > > the generations, similar to the comment on lru_deactivate_file_fn(). > > It's a good example of intent vs implementation. > > Will do. > > > On another note, "!type" meaning "anon" is a bit rough. Please follow > > the "bool file" convention used elsewhere. > > Originally I used "file", e.g., in v2: > https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ > > But I was told to renamed it since "file" usually means file. Let me > rename it back to "file", unless somebody still objects. > > > > @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) > > > { > > > enum lru_list lru = folio_lru_list(folio); > > > > > > + if (lru_gen_add_folio(lruvec, folio, true)) > > > + return; > > > + > > > > bool parameters are notoriously hard to follow in the callsite. Can > > you please add lru_gen_add_folio_tail() instead and have them use a > > common helper? > > I'm not sure -- there are several places like this one. My question is > whether we want to do it throughout this patchset. We'd end up with > many helpers and duplicate code. E.g., in this file alone, we have two > functions taking bool parameters: > lru_gen_add_folio(..., bool reclaiming) > lru_gen_del_folio(..., bool reclaiming) > > I can't say they are very readable; at least they are very compact > right now. My concern is that we might loose the latter without having > enough of the former. > > Perhaps this is something that we could revisit after you've finished > reviewing the entire patchset? > > > > @@ -127,6 +315,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, > > > static __always_inline > > > void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) > > > { > > > + if (lru_gen_del_folio(lruvec, folio, false)) > > > + return; > > > + > > > list_del(&folio->lru); > > > update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), > > > -folio_nr_pages(folio)); > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > > index aed44e9b5d89..0f5e8a995781 100644 > > > --- a/include/linux/mmzone.h > > > +++ b/include/linux/mmzone.h > > > @@ -303,6 +303,78 @@ enum lruvec_flags { > > > */ > > > }; > > > > > > +struct lruvec; > > > + > > > +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) > > > +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) > > > + > > > +#ifdef CONFIG_LRU_GEN > > > + > > > +#define MIN_LRU_BATCH BITS_PER_LONG > > > +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) > > > > Those two aren't used in this patch, so it's hard to say whether they > > are chosen correctly. > > Right. They slipped during the v6/v7 refactoring. Will move them to > the next patch. > > > > + * Evictable pages are divided into multiple generations. The youngest and the > > > + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. > > > + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An > > > + * offset within MAX_NR_GENS, gen, indexes the LRU list of the corresponding > > > + * generation. The gen counter in folio->flags stores gen+1 while a page is on > > > + * one of lrugen->lists[]. Otherwise it stores 0. > > > + * > > > + * A page is added to the youngest generation on faulting. The aging needs to > > > + * check the accessed bit at least twice before handing this page over to the > > > + * eviction. The first check takes care of the accessed bit set on the initial > > > + * fault; the second check makes sure this page hasn't been used since then. > > > + * This process, AKA second chance, requires a minimum of two generations, > > > + * hence MIN_NR_GENS. And to be compatible with the active/inactive LRU, these > > > + * two generations are mapped to the active; the rest of generations, if they > > > + * exist, are mapped to the inactive. PG_active is always cleared while a page > > > + * is on one of lrugen->lists[] so that demotion, which happens consequently > > > + * when the aging produces a new generation, needs not to worry about it. > > > + */ > > > +#define MIN_NR_GENS 2U > > > +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) > > > + > > > +struct lru_gen_struct { > > > > struct lrugen? > > > > In fact, "lrugen" for the general function and variable namespace > > might be better, the _ doesn't seem to pull its weight. > > > > CONFIG_LRUGEN > > struct lrugen > > lrugen_foo() > > etc. > > No strong opinion here. I usually add underscores to functions and > types so that grep doesn't end up with tons of local variables. > > > > + /* the aging increments the youngest generation number */ > > > + unsigned long max_seq; > > > + /* the eviction increments the oldest generation numbers */ > > > + unsigned long min_seq[ANON_AND_FILE]; > > > > The singular max_seq vs the split min_seq raises questions. Please add > > a comment that explains or points to an explanation. > > Will do. > > > > + /* the birth time of each generation in jiffies */ > > > + unsigned long timestamps[MAX_NR_GENS]; > > > > This isn't in use until the thrashing-based OOM killing patch. > > Will move it there. > > > > + /* the multigenerational LRU lists */ > > > + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > > > + /* the sizes of the above lists */ > > > + unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; > > > + /* whether the multigenerational LRU is enabled */ > > > + bool enabled; > > > > Not (really) in use until the runtime switch. Best to keep everybody > > checking the global flag for now, and have the runtime switch patch > > introduce this flag and switch necessary callsites over. > > Will do. > > > > +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); > > > > "state" is what we usually init :) How about lrugen_init_lruvec()? > > Same story as "file", lol -- this used to be lru_gen_init_lruvec(): > https://lore.kernel.org/linux-mm/20210413065633.2782273-9-yuzhao@google.com/ > > Naming is hard. Hopefully we can finalize it this time. > > > You can drop the memcg parameter and use lruvec_memcg(). > > lruvec_memcg() isn't available yet when pgdat_init_internals() calls > this function because mem_cgroup_disabled() is initialized afterward. > > > > +#ifdef CONFIG_MEMCG > > > +void lru_gen_init_memcg(struct mem_cgroup *memcg); > > > +void lru_gen_free_memcg(struct mem_cgroup *memcg); > > > > This should be either init+exit, or alloc+free. > > Will do. Thanks Barry
On Fri, Mar 11, 2022 at 3:16 AM Barry Song <21cnbao@gmail.com> wrote: > > On Tue, Feb 15, 2022 at 10:43 PM Yu Zhao <yuzhao@google.com> wrote: > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > Thanks for reviewing. > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > +{ > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > + > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > + > > > > + /* see the comment on MIN_NR_GENS */ > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > +} > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > set is split into two generations for the sole purpose of the > > > second-chance policy for fresh faults, right? > > > > To be precise, the active/inactive notion on top of generations is > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > Otherwise, this function wouldn't be needed. > > Hi Yu, > I am still quite confused as i am seeing both active/inactive and lru_gen. > eg: > > root@ubuntu:~# cat /proc/vmstat | grep active > nr_zone_inactive_anon 22797 > nr_zone_active_anon 578405 > nr_zone_inactive_file 0 > nr_zone_active_file 4156 > nr_inactive_anon 22800 > nr_active_anon 578574 > nr_inactive_file 0 > nr_active_file 4215 Yes, this is expected. We have to maintain the ABI, i.e., the *_active/inactive_* counters. > and: > > root@ubuntu:~# cat /sys//kernel/debug/lru_gen > > ... > memcg 36 /user.slice/user-0.slice/user@0.service > node 0 > 20 18820 22 0 > 21 7452 0 0 > 22 7448 0 0 > memcg 33 /user.slice/user-0.slice/user@0.service/app.slice > node 0 > 0 2171452 0 0 > 1 2171452 0 0 > 2 2171452 0 0 > 3 2171452 0 0 > memcg 37 /user.slice/user-0.slice/session-1.scope > node 0 > 42 51804 102127 0 > 43 18840 275622 0 > 44 16104 216805 1 > > Does it mean one page could be in both one of the generations and one > of the active/inactive lists? In terms of the data structure, evictable pages are either on lruvec->lists or lrugen->lists. > Do we have some mapping relationship between active/inactive lists > with generations? For the counters, yes -- pages in max_seq and max_seq-1 are counted as active, and the rest are inactive. > We used to put a faulted file page in inactive, if we access it a > second time, it can be promoted > to active. then in recent years, we have also applied this to anon > pages while kernel adds > workingset protection for anon pages. so basically both anon and file > pages go into the inactive > list for the 1st time, if we access it for the second time, they go to > the active list. if we don't access > it any more, they are likely to be reclaimed as they are inactive. > we do have some special fastpath for code section, executable file > pages are kept on active list > as long as they are accessed. Yes. > so all of the above concerns are actually not that correct? They are valid concerns but I don't know any popular workloads that care about them.
On Sat, Mar 12, 2022 at 12:45 PM Yu Zhao <yuzhao@google.com> wrote: > > On Fri, Mar 11, 2022 at 3:16 AM Barry Song <21cnbao@gmail.com> wrote: > > > > On Tue, Feb 15, 2022 at 10:43 PM Yu Zhao <yuzhao@google.com> wrote: > > > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > Thanks for reviewing. > > > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > +{ > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > + > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > + > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > +} > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > set is split into two generations for the sole purpose of the > > > > second-chance policy for fresh faults, right? > > > > > > To be precise, the active/inactive notion on top of generations is > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > Otherwise, this function wouldn't be needed. > > > > Hi Yu, > > I am still quite confused as i am seeing both active/inactive and lru_gen. > > eg: > > > > root@ubuntu:~# cat /proc/vmstat | grep active > > nr_zone_inactive_anon 22797 > > nr_zone_active_anon 578405 > > nr_zone_inactive_file 0 > > nr_zone_active_file 4156 > > nr_inactive_anon 22800 > > nr_active_anon 578574 > > nr_inactive_file 0 > > nr_active_file 4215 > > Yes, this is expected. We have to maintain the ABI, i.e., the > *_active/inactive_* counters. > > > and: > > > > root@ubuntu:~# cat /sys//kernel/debug/lru_gen > > > > ... > > memcg 36 /user.slice/user-0.slice/user@0.service > > node 0 > > 20 18820 22 0 > > 21 7452 0 0 > > 22 7448 0 0 > > memcg 33 /user.slice/user-0.slice/user@0.service/app.slice > > node 0 > > 0 2171452 0 0 > > 1 2171452 0 0 > > 2 2171452 0 0 > > 3 2171452 0 0 > > memcg 37 /user.slice/user-0.slice/session-1.scope > > node 0 > > 42 51804 102127 0 > > 43 18840 275622 0 > > 44 16104 216805 1 > > > > Does it mean one page could be in both one of the generations and one > > of the active/inactive lists? > > In terms of the data structure, evictable pages are either on > lruvec->lists or lrugen->lists. > > > Do we have some mapping relationship between active/inactive lists > > with generations? > > For the counters, yes -- pages in max_seq and max_seq-1 are counted as > active, and the rest are inactive. > > > We used to put a faulted file page in inactive, if we access it a > > second time, it can be promoted > > to active. then in recent years, we have also applied this to anon > > pages while kernel adds > > workingset protection for anon pages. so basically both anon and file > > pages go into the inactive > > list for the 1st time, if we access it for the second time, they go to > > the active list. if we don't access > > it any more, they are likely to be reclaimed as they are inactive. > > we do have some special fastpath for code section, executable file > > pages are kept on active list > > as long as they are accessed. > > Yes. > > > so all of the above concerns are actually not that correct? > > They are valid concerns but I don't know any popular workloads that > care about them. Hi Yu, here we can get a workload in Kim's patchset while he added workingset protection for anon pages: https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ anon pages used to go to active rather than inactive, but kim's patchset moved to use inactive first. then only after the anon page is accessed second time, it can move to active. "In current implementation, newly created or swap-in anonymous page is started on the active list. Growing the active list results in rebalancing active/inactive list so old pages on the active list are demoted to the inactive list. Hence, hot page on the active list isn't protected at all. Following is an example of this situation. Assume that 50 hot pages on active list and system can contain total 100 pages. Numbers denote the number of pages on active/inactive list (active | inactive). (h) stands for hot pages and (uo) stands for used-once pages. 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(uo) | 50(h) 3. workload: another 50 newly created (used-once) pages 50(uo) | 50(uo), swap-out 50(h) As we can see, hot pages are swapped-out and it would cause swap-in later." Is MGLRU able to avoid the swap-out of the 50 hot pages? since MGLRU is putting faulted pages to the youngest generation directly, do we have the risk mentioned in Kim's patchset? Thanks Barry
On Sat, Mar 12, 2022 at 3:37 AM Barry Song <21cnbao@gmail.com> wrote: > > On Sat, Mar 12, 2022 at 12:45 PM Yu Zhao <yuzhao@google.com> wrote: > > > > On Fri, Mar 11, 2022 at 3:16 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > On Tue, Feb 15, 2022 at 10:43 PM Yu Zhao <yuzhao@google.com> wrote: > > > > > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > > > Thanks for reviewing. > > > > > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > > +{ > > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > > + > > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > > + > > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > > +} > > > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > > set is split into two generations for the sole purpose of the > > > > > second-chance policy for fresh faults, right? > > > > > > > > To be precise, the active/inactive notion on top of generations is > > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > > Otherwise, this function wouldn't be needed. > > > > > > Hi Yu, > > > I am still quite confused as i am seeing both active/inactive and lru_gen. > > > eg: > > > > > > root@ubuntu:~# cat /proc/vmstat | grep active > > > nr_zone_inactive_anon 22797 > > > nr_zone_active_anon 578405 > > > nr_zone_inactive_file 0 > > > nr_zone_active_file 4156 > > > nr_inactive_anon 22800 > > > nr_active_anon 578574 > > > nr_inactive_file 0 > > > nr_active_file 4215 > > > > Yes, this is expected. We have to maintain the ABI, i.e., the > > *_active/inactive_* counters. > > > > > and: > > > > > > root@ubuntu:~# cat /sys//kernel/debug/lru_gen > > > > > > ... > > > memcg 36 /user.slice/user-0.slice/user@0.service > > > node 0 > > > 20 18820 22 0 > > > 21 7452 0 0 > > > 22 7448 0 0 > > > memcg 33 /user.slice/user-0.slice/user@0.service/app.slice > > > node 0 > > > 0 2171452 0 0 > > > 1 2171452 0 0 > > > 2 2171452 0 0 > > > 3 2171452 0 0 > > > memcg 37 /user.slice/user-0.slice/session-1.scope > > > node 0 > > > 42 51804 102127 0 > > > 43 18840 275622 0 > > > 44 16104 216805 1 > > > > > > Does it mean one page could be in both one of the generations and one > > > of the active/inactive lists? > > > > In terms of the data structure, evictable pages are either on > > lruvec->lists or lrugen->lists. > > > > > Do we have some mapping relationship between active/inactive lists > > > with generations? > > > > For the counters, yes -- pages in max_seq and max_seq-1 are counted as > > active, and the rest are inactive. > > > > > We used to put a faulted file page in inactive, if we access it a > > > second time, it can be promoted > > > to active. then in recent years, we have also applied this to anon > > > pages while kernel adds > > > workingset protection for anon pages. so basically both anon and file > > > pages go into the inactive > > > list for the 1st time, if we access it for the second time, they go to > > > the active list. if we don't access > > > it any more, they are likely to be reclaimed as they are inactive. > > > we do have some special fastpath for code section, executable file > > > pages are kept on active list > > > as long as they are accessed. > > > > Yes. > > > > > so all of the above concerns are actually not that correct? > > > > They are valid concerns but I don't know any popular workloads that > > care about them. > > Hi Yu, > here we can get a workload in Kim's patchset while he added workingset > protection > for anon pages: > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ Thanks. I wouldn't call that a workload because it's not a real application. By popular workloads, I mean applications that the majority of people actually run on phones, in cloud, etc. > anon pages used to go to active rather than inactive, but kim's patchset > moved to use inactive first. then only after the anon page is accessed > second time, it can move to active. Yes. To clarify, the A-bit doesn't really mean the first or second access. It can be many accesses each time it's set. > "In current implementation, newly created or swap-in anonymous page is > > started on the active list. Growing the active list results in rebalancing > active/inactive list so old pages on the active list are demoted to the > inactive list. Hence, hot page on the active list isn't protected at all. > > Following is an example of this situation. > > Assume that 50 hot pages on active list and system can contain total > 100 pages. Numbers denote the number of pages on active/inactive > list (active | inactive). (h) stands for hot pages and (uo) stands for > used-once pages. > > 1. 50 hot pages on active list > 50(h) | 0 > > 2. workload: 50 newly created (used-once) pages > 50(uo) | 50(h) > > 3. workload: another 50 newly created (used-once) pages > 50(uo) | 50(uo), swap-out 50(h) > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > Is MGLRU able to avoid the swap-out of the 50 hot pages? I think the real question is why the 50 hot pages can be moved to the inactive list. If they are really hot, the A-bit should protect them. > since MGLRU > is putting faulted pages to the youngest generation directly, do we have the > risk mentioned in Kim's patchset? There are always risks :) I could imagine a thousand ways to make VM suffer, but all of them could be irrelevant to how it actually does in production. So a concrete use case of yours would be much appreciated for this discussion.
On Sun, Mar 13, 2022 at 10:12 AM Yu Zhao <yuzhao@google.com> wrote: > > On Sat, Mar 12, 2022 at 3:37 AM Barry Song <21cnbao@gmail.com> wrote: > > > > On Sat, Mar 12, 2022 at 12:45 PM Yu Zhao <yuzhao@google.com> wrote: > > > > > > On Fri, Mar 11, 2022 at 3:16 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > On Tue, Feb 15, 2022 at 10:43 PM Yu Zhao <yuzhao@google.com> wrote: > > > > > > > > > > On Thu, Feb 10, 2022 at 03:41:57PM -0500, Johannes Weiner wrote: > > > > > > > > > > Thanks for reviewing. > > > > > > > > > > > > +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) > > > > > > > +{ > > > > > > > + unsigned long max_seq = lruvec->lrugen.max_seq; > > > > > > > + > > > > > > > + VM_BUG_ON(gen >= MAX_NR_GENS); > > > > > > > + > > > > > > > + /* see the comment on MIN_NR_GENS */ > > > > > > > + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); > > > > > > > +} > > > > > > > > > > > > I'm still reading the series, so correct me if I'm wrong: the "active" > > > > > > set is split into two generations for the sole purpose of the > > > > > > second-chance policy for fresh faults, right? > > > > > > > > > > To be precise, the active/inactive notion on top of generations is > > > > > just for ABI compatibility, e.g., the counters in /proc/vmstat. > > > > > Otherwise, this function wouldn't be needed. > > > > > > > > Hi Yu, > > > > I am still quite confused as i am seeing both active/inactive and lru_gen. > > > > eg: > > > > > > > > root@ubuntu:~# cat /proc/vmstat | grep active > > > > nr_zone_inactive_anon 22797 > > > > nr_zone_active_anon 578405 > > > > nr_zone_inactive_file 0 > > > > nr_zone_active_file 4156 > > > > nr_inactive_anon 22800 > > > > nr_active_anon 578574 > > > > nr_inactive_file 0 > > > > nr_active_file 4215 > > > > > > Yes, this is expected. We have to maintain the ABI, i.e., the > > > *_active/inactive_* counters. > > > > > > > and: > > > > > > > > root@ubuntu:~# cat /sys//kernel/debug/lru_gen > > > > > > > > ... > > > > memcg 36 /user.slice/user-0.slice/user@0.service > > > > node 0 > > > > 20 18820 22 0 > > > > 21 7452 0 0 > > > > 22 7448 0 0 > > > > memcg 33 /user.slice/user-0.slice/user@0.service/app.slice > > > > node 0 > > > > 0 2171452 0 0 > > > > 1 2171452 0 0 > > > > 2 2171452 0 0 > > > > 3 2171452 0 0 > > > > memcg 37 /user.slice/user-0.slice/session-1.scope > > > > node 0 > > > > 42 51804 102127 0 > > > > 43 18840 275622 0 > > > > 44 16104 216805 1 > > > > > > > > Does it mean one page could be in both one of the generations and one > > > > of the active/inactive lists? > > > > > > In terms of the data structure, evictable pages are either on > > > lruvec->lists or lrugen->lists. > > > > > > > Do we have some mapping relationship between active/inactive lists > > > > with generations? > > > > > > For the counters, yes -- pages in max_seq and max_seq-1 are counted as > > > active, and the rest are inactive. > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > second time, it can be promoted > > > > to active. then in recent years, we have also applied this to anon > > > > pages while kernel adds > > > > workingset protection for anon pages. so basically both anon and file > > > > pages go into the inactive > > > > list for the 1st time, if we access it for the second time, they go to > > > > the active list. if we don't access > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > we do have some special fastpath for code section, executable file > > > > pages are kept on active list > > > > as long as they are accessed. > > > > > > Yes. > > > > > > > so all of the above concerns are actually not that correct? > > > > > > They are valid concerns but I don't know any popular workloads that > > > care about them. > > > > Hi Yu, > > here we can get a workload in Kim's patchset while he added workingset > > protection > > for anon pages: > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > Thanks. I wouldn't call that a workload because it's not a real > application. By popular workloads, I mean applications that the > majority of people actually run on phones, in cloud, etc. > > > anon pages used to go to active rather than inactive, but kim's patchset > > moved to use inactive first. then only after the anon page is accessed > > second time, it can move to active. > > Yes. To clarify, the A-bit doesn't really mean the first or second > access. It can be many accesses each time it's set. > > > "In current implementation, newly created or swap-in anonymous page is > > > > started on the active list. Growing the active list results in rebalancing > > active/inactive list so old pages on the active list are demoted to the > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > Following is an example of this situation. > > > > Assume that 50 hot pages on active list and system can contain total > > 100 pages. Numbers denote the number of pages on active/inactive > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > used-once pages. > > > > 1. 50 hot pages on active list > > 50(h) | 0 > > > > 2. workload: 50 newly created (used-once) pages > > 50(uo) | 50(h) > > > > 3. workload: another 50 newly created (used-once) pages > > 50(uo) | 50(uo), swap-out 50(h) > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > I think the real question is why the 50 hot pages can be moved to the > inactive list. If they are really hot, the A-bit should protect them. This is a good question. I guess it is probably because the current lru is trying to maintain a balance between the sizes of active and inactive lists. Thus, it can shrink active list even though pages might be still "hot" but not the recently accessed ones. 1. 50 hot pages on active list 50(h) | 0 2. workload: 50 newly created (used-once) pages 50(uo) | 50(h) 3. workload: another 50 newly created (used-once) pages 50(uo) | 50(uo), swap-out 50(h) the old kernel without anon workingset protection put workload 2 on active, so pushed 50 hot pages from active to inactive. workload 3 would further contribute to evict the 50 hot pages. it seems mglru doesn't demote pages from the youngest generation to older generation only in order to balance the list size? so mglru is probably safe in these cases. I will run some tests mentioned in Kim's patchset and report the result to you afterwards. > > > since MGLRU > > is putting faulted pages to the youngest generation directly, do we have the > > risk mentioned in Kim's patchset? > > There are always risks :) I could imagine a thousand ways to make VM > suffer, but all of them could be irrelevant to how it actually does in > production. So a concrete use case of yours would be much appreciated > for this discussion. Thanks Barry
> > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > second time, it can be promoted > > > > > to active. then in recent years, we have also applied this to anon > > > > > pages while kernel adds > > > > > workingset protection for anon pages. so basically both anon and file > > > > > pages go into the inactive > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > the active list. if we don't access > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > we do have some special fastpath for code section, executable file > > > > > pages are kept on active list > > > > > as long as they are accessed. > > > > > > > > Yes. > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > care about them. > > > > > > Hi Yu, > > > here we can get a workload in Kim's patchset while he added workingset > > > protection > > > for anon pages: > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > Thanks. I wouldn't call that a workload because it's not a real > > application. By popular workloads, I mean applications that the > > majority of people actually run on phones, in cloud, etc. > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > moved to use inactive first. then only after the anon page is accessed > > > second time, it can move to active. > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > access. It can be many accesses each time it's set. > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > started on the active list. Growing the active list results in rebalancing > > > active/inactive list so old pages on the active list are demoted to the > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > Following is an example of this situation. > > > > > > Assume that 50 hot pages on active list and system can contain total > > > 100 pages. Numbers denote the number of pages on active/inactive > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > used-once pages. > > > > > > 1. 50 hot pages on active list > > > 50(h) | 0 > > > > > > 2. workload: 50 newly created (used-once) pages > > > 50(uo) | 50(h) > > > > > > 3. workload: another 50 newly created (used-once) pages > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > I think the real question is why the 50 hot pages can be moved to the > > inactive list. If they are really hot, the A-bit should protect them. > > This is a good question. > > I guess it is probably because the current lru is trying to maintain a balance > between the sizes of active and inactive lists. Thus, it can shrink active list > even though pages might be still "hot" but not the recently accessed ones. > > 1. 50 hot pages on active list > 50(h) | 0 > > 2. workload: 50 newly created (used-once) pages > 50(uo) | 50(h) > > 3. workload: another 50 newly created (used-once) pages > 50(uo) | 50(uo), swap-out 50(h) > > the old kernel without anon workingset protection put workload 2 on active, so > pushed 50 hot pages from active to inactive. workload 3 would further contribute > to evict the 50 hot pages. > > it seems mglru doesn't demote pages from the youngest generation to older > generation only in order to balance the list size? so mglru is probably safe > in these cases. > > I will run some tests mentioned in Kim's patchset and report the result to you > afterwards. > Hi Yu, I did find putting faulted pages to the youngest generation lead to some regression in the case ebizzy Kim's patchset mentioned while he tried to support workingset protection for anon pages. i did a little bit modification for rand_chunk() which is probably similar with the modifcation() Kim mentioned in his patchset. The modification can be found here: https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef The test env is a x86 machine in which I have set memory size to 2.5GB and set zRAM to 2GB and disabled external disk swap. with the vanilla kernel: \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) typical result: Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" User time (seconds): 36.19 System time (seconds): 229.72 Percent of CPU this job got: 371% Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 2166196 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 9990128 Minor (reclaiming a frame) page faults: 33315945 Voluntary context switches: 59144 Involuntary context switches: 167754 Swaps: 0 File system inputs: 2760 File system outputs: 8 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 with gen_lru and lru_gen/enabled=0x3: typical result: Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" User time (seconds): 36.34 System time (seconds): 276.07 Percent of CPU this job got: 378% Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 **** 15% time + Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 2168120 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 13362810 ***** 30% page fault + Minor (reclaiming a frame) page faults: 33394617 Voluntary context switches: 55216 Involuntary context switches: 137220 Swaps: 0 File system inputs: 4088 File system outputs: 8 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 with gen_lru and lru_gen/enabled=0x7: typical result: Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" User time (seconds): 36.13 System time (seconds): 251.71 Percent of CPU this job got: 378% Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 *****better than enabled=0x3, worse than vanilla Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 2120988 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 12706512 Minor (reclaiming a frame) page faults: 33422243 Voluntary context switches: 49485 Involuntary context switches: 126765 Swaps: 0 File system inputs: 2976 File system outputs: 8 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 I can also reproduce the problem on arm64. I am not saying this is going to block mglru from being mainlined. But I am still curious if this is an issue worth being addressed somehow in mglru. > > > > > since MGLRU > > > is putting faulted pages to the youngest generation directly, do we have the > > > risk mentioned in Kim's patchset? > > > > There are always risks :) I could imagine a thousand ways to make VM > > suffer, but all of them could be irrelevant to how it actually does in > > production. So a concrete use case of yours would be much appreciated > > for this discussion. > Thanks Barry
On Mon, Mar 14, 2022 at 5:12 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > > second time, it can be promoted > > > > > > to active. then in recent years, we have also applied this to anon > > > > > > pages while kernel adds > > > > > > workingset protection for anon pages. so basically both anon and file > > > > > > pages go into the inactive > > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > > the active list. if we don't access > > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > > we do have some special fastpath for code section, executable file > > > > > > pages are kept on active list > > > > > > as long as they are accessed. > > > > > > > > > > Yes. > > > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > > care about them. > > > > > > > > Hi Yu, > > > > here we can get a workload in Kim's patchset while he added workingset > > > > protection > > > > for anon pages: > > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > > > Thanks. I wouldn't call that a workload because it's not a real > > > application. By popular workloads, I mean applications that the > > > majority of people actually run on phones, in cloud, etc. > > > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > > moved to use inactive first. then only after the anon page is accessed > > > > second time, it can move to active. > > > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > > access. It can be many accesses each time it's set. > > > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > > > started on the active list. Growing the active list results in rebalancing > > > > active/inactive list so old pages on the active list are demoted to the > > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > > > Following is an example of this situation. > > > > > > > > Assume that 50 hot pages on active list and system can contain total > > > > 100 pages. Numbers denote the number of pages on active/inactive > > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > > used-once pages. > > > > > > > > 1. 50 hot pages on active list > > > > 50(h) | 0 > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > 50(uo) | 50(h) > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > > > I think the real question is why the 50 hot pages can be moved to the > > > inactive list. If they are really hot, the A-bit should protect them. > > > > This is a good question. > > > > I guess it is probably because the current lru is trying to maintain a balance > > between the sizes of active and inactive lists. Thus, it can shrink active list > > even though pages might be still "hot" but not the recently accessed ones. > > > > 1. 50 hot pages on active list > > 50(h) | 0 > > > > 2. workload: 50 newly created (used-once) pages > > 50(uo) | 50(h) > > > > 3. workload: another 50 newly created (used-once) pages > > 50(uo) | 50(uo), swap-out 50(h) > > > > the old kernel without anon workingset protection put workload 2 on active, so > > pushed 50 hot pages from active to inactive. workload 3 would further contribute > > to evict the 50 hot pages. > > > > it seems mglru doesn't demote pages from the youngest generation to older > > generation only in order to balance the list size? so mglru is probably safe > > in these cases. > > > > I will run some tests mentioned in Kim's patchset and report the result to you > > afterwards. > > > > Hi Yu, > I did find putting faulted pages to the youngest generation lead to some > regression in the case ebizzy Kim's patchset mentioned while he tried > to support workingset protection for anon pages. > i did a little bit modification for rand_chunk() which is probably similar > with the modifcation() Kim mentioned in his patchset. The modification > can be found here: > https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef > > The test env is a x86 machine in which I have set memory size to 2.5GB and > set zRAM to 2GB and disabled external disk swap. > > with the vanilla kernel: > \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 > > so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) > > typical result: > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > User time (seconds): 36.19 > System time (seconds): 229.72 > Percent of CPU this job got: 371% > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > Average shared text size (kbytes): 0 > Average unshared data size (kbytes): 0 > Average stack size (kbytes): 0 > Average total size (kbytes): 0 > Maximum resident set size (kbytes): 2166196 > Average resident set size (kbytes): 0 > Major (requiring I/O) page faults: 9990128 > Minor (reclaiming a frame) page faults: 33315945 > Voluntary context switches: 59144 > Involuntary context switches: 167754 > Swaps: 0 > File system inputs: 2760 > File system outputs: 8 > Socket messages sent: 0 > Socket messages received: 0 > Signals delivered: 0 > Page size (bytes): 4096 > Exit status: 0 > > with gen_lru and lru_gen/enabled=0x3: > typical result: > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > User time (seconds): 36.34 > System time (seconds): 276.07 > Percent of CPU this job got: 378% > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 > **** 15% time + > Average shared text size (kbytes): 0 > Average unshared data size (kbytes): 0 > Average stack size (kbytes): 0 > Average total size (kbytes): 0 > Maximum resident set size (kbytes): 2168120 > Average resident set size (kbytes): 0 > Major (requiring I/O) page faults: 13362810 > ***** 30% page fault + > Minor (reclaiming a frame) page faults: 33394617 > Voluntary context switches: 55216 > Involuntary context switches: 137220 > Swaps: 0 > File system inputs: 4088 > File system outputs: 8 > Socket messages sent: 0 > Socket messages received: 0 > Signals delivered: 0 > Page size (bytes): 4096 > Exit status: 0 > > with gen_lru and lru_gen/enabled=0x7: > typical result: > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > User time (seconds): 36.13 > System time (seconds): 251.71 > Percent of CPU this job got: 378% > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 > *****better than enabled=0x3, worse than vanilla > Average shared text size (kbytes): 0 > Average unshared data size (kbytes): 0 > Average stack size (kbytes): 0 > Average total size (kbytes): 0 > Maximum resident set size (kbytes): 2120988 > Average resident set size (kbytes): 0 > Major (requiring I/O) page faults: 12706512 > Minor (reclaiming a frame) page faults: 33422243 > Voluntary context switches: 49485 > Involuntary context switches: 126765 > Swaps: 0 > File system inputs: 2976 > File system outputs: 8 > Socket messages sent: 0 > Socket messages received: 0 > Signals delivered: 0 > Page size (bytes): 4096 > Exit status: 0 > > I can also reproduce the problem on arm64. > > I am not saying this is going to block mglru from being mainlined. But I am > still curious if this is an issue worth being addressed somehow in mglru. You've missed something very important: *thoughput* :) Dollars to doughnuts there was a large increase in throughput -- I haven't tried this benchmark but I've seen many reports similar to this one.
On Tue, Mar 15, 2022 at 5:45 AM Yu Zhao <yuzhao@google.com> wrote: > > On Mon, Mar 14, 2022 at 5:12 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > > > second time, it can be promoted > > > > > > > to active. then in recent years, we have also applied this to anon > > > > > > > pages while kernel adds > > > > > > > workingset protection for anon pages. so basically both anon and file > > > > > > > pages go into the inactive > > > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > > > the active list. if we don't access > > > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > > > we do have some special fastpath for code section, executable file > > > > > > > pages are kept on active list > > > > > > > as long as they are accessed. > > > > > > > > > > > > Yes. > > > > > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > > > care about them. > > > > > > > > > > Hi Yu, > > > > > here we can get a workload in Kim's patchset while he added workingset > > > > > protection > > > > > for anon pages: > > > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > > > > > Thanks. I wouldn't call that a workload because it's not a real > > > > application. By popular workloads, I mean applications that the > > > > majority of people actually run on phones, in cloud, etc. > > > > > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > > > moved to use inactive first. then only after the anon page is accessed > > > > > second time, it can move to active. > > > > > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > > > access. It can be many accesses each time it's set. > > > > > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > > > > > started on the active list. Growing the active list results in rebalancing > > > > > active/inactive list so old pages on the active list are demoted to the > > > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > > > > > Following is an example of this situation. > > > > > > > > > > Assume that 50 hot pages on active list and system can contain total > > > > > 100 pages. Numbers denote the number of pages on active/inactive > > > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > > > used-once pages. > > > > > > > > > > 1. 50 hot pages on active list > > > > > 50(h) | 0 > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > 50(uo) | 50(h) > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > > > > > I think the real question is why the 50 hot pages can be moved to the > > > > inactive list. If they are really hot, the A-bit should protect them. > > > > > > This is a good question. > > > > > > I guess it is probably because the current lru is trying to maintain a balance > > > between the sizes of active and inactive lists. Thus, it can shrink active list > > > even though pages might be still "hot" but not the recently accessed ones. > > > > > > 1. 50 hot pages on active list > > > 50(h) | 0 > > > > > > 2. workload: 50 newly created (used-once) pages > > > 50(uo) | 50(h) > > > > > > 3. workload: another 50 newly created (used-once) pages > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > the old kernel without anon workingset protection put workload 2 on active, so > > > pushed 50 hot pages from active to inactive. workload 3 would further contribute > > > to evict the 50 hot pages. > > > > > > it seems mglru doesn't demote pages from the youngest generation to older > > > generation only in order to balance the list size? so mglru is probably safe > > > in these cases. > > > > > > I will run some tests mentioned in Kim's patchset and report the result to you > > > afterwards. > > > > > > > Hi Yu, > > I did find putting faulted pages to the youngest generation lead to some > > regression in the case ebizzy Kim's patchset mentioned while he tried > > to support workingset protection for anon pages. > > i did a little bit modification for rand_chunk() which is probably similar > > with the modifcation() Kim mentioned in his patchset. The modification > > can be found here: > > https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef > > > > The test env is a x86 machine in which I have set memory size to 2.5GB and > > set zRAM to 2GB and disabled external disk swap. > > > > with the vanilla kernel: > > \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 > > > > so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) > > > > typical result: > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > User time (seconds): 36.19 > > System time (seconds): 229.72 > > Percent of CPU this job got: 371% > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > Average shared text size (kbytes): 0 > > Average unshared data size (kbytes): 0 > > Average stack size (kbytes): 0 > > Average total size (kbytes): 0 > > Maximum resident set size (kbytes): 2166196 > > Average resident set size (kbytes): 0 > > Major (requiring I/O) page faults: 9990128 > > Minor (reclaiming a frame) page faults: 33315945 > > Voluntary context switches: 59144 > > Involuntary context switches: 167754 > > Swaps: 0 > > File system inputs: 2760 > > File system outputs: 8 > > Socket messages sent: 0 > > Socket messages received: 0 > > Signals delivered: 0 > > Page size (bytes): 4096 > > Exit status: 0 > > > > with gen_lru and lru_gen/enabled=0x3: > > typical result: > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > User time (seconds): 36.34 > > System time (seconds): 276.07 > > Percent of CPU this job got: 378% > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 > > **** 15% time + > > Average shared text size (kbytes): 0 > > Average unshared data size (kbytes): 0 > > Average stack size (kbytes): 0 > > Average total size (kbytes): 0 > > Maximum resident set size (kbytes): 2168120 > > Average resident set size (kbytes): 0 > > Major (requiring I/O) page faults: 13362810 > > ***** 30% page fault + > > Minor (reclaiming a frame) page faults: 33394617 > > Voluntary context switches: 55216 > > Involuntary context switches: 137220 > > Swaps: 0 > > File system inputs: 4088 > > File system outputs: 8 > > Socket messages sent: 0 > > Socket messages received: 0 > > Signals delivered: 0 > > Page size (bytes): 4096 > > Exit status: 0 > > > > with gen_lru and lru_gen/enabled=0x7: > > typical result: > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > User time (seconds): 36.13 > > System time (seconds): 251.71 > > Percent of CPU this job got: 378% > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 > > *****better than enabled=0x3, worse than vanilla > > Average shared text size (kbytes): 0 > > Average unshared data size (kbytes): 0 > > Average stack size (kbytes): 0 > > Average total size (kbytes): 0 > > Maximum resident set size (kbytes): 2120988 > > Average resident set size (kbytes): 0 > > Major (requiring I/O) page faults: 12706512 > > Minor (reclaiming a frame) page faults: 33422243 > > Voluntary context switches: 49485 > > Involuntary context switches: 126765 > > Swaps: 0 > > File system inputs: 2976 > > File system outputs: 8 > > Socket messages sent: 0 > > Socket messages received: 0 > > Signals delivered: 0 > > Page size (bytes): 4096 > > Exit status: 0 > > > > I can also reproduce the problem on arm64. > > > > I am not saying this is going to block mglru from being mainlined. But I am > > still curious if this is an issue worth being addressed somehow in mglru. > > You've missed something very important: *thoughput* :) > noop :-) in the test case, there are 4 threads. they are searching a key in 10 chunks of memory. for each chunk, the size is 200MB. a "random" chunk index is returned for those threads to search. but chunk2 is the hottest, and chunk3, 7, 4 are relatively hotter than others. static inline unsigned int rand_chunk(void) { /* simulate hot and cold chunk */ unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, 7, 2, 2, 4}; static int nr = 0; return rand[nr++%16]; } each thread does search_mem(): static unsigned int search_mem(void) { record_t key, *found; record_t *src, *copy; unsigned int chunk; size_t copy_size = chunk_size; unsigned int i; unsigned int state = 0; /* run 160 loops or till timeout */ for (i = 0; threads_go == 1 && i < 160; i++) { chunk = rand_chunk(); src = mem[chunk]; ... copy = alloc_mem(copy_size); ... memcpy(copy, src, copy_size); key = rand_num(copy_size / record_size, &state); bsearch(&key, copy, copy_size / record_size, record_size, compare); /* Below check is mainly for memory corruption or other bug */ if (found == NULL) { fprintf(stderr, "Couldn't find key %zd\n", key); exit(1); } } /* end if ! touch_pages */ free_mem(copy, copy_size); } return (i); } each thread picks up a chunk, then allocates a new memory and copies the chunk to the new allocated memory, and searches a key in the allocated memory. as i have set time to rather big by -S, so each thread actually exits while it completes 160 loops. $ \time -v ./ebizzy -t 4 -s $((200*1024*1024)) -S 6000000 so the one who finishes the whole jobs earlier wins in throughput as well. > Dollars to doughnuts there was a large increase in throughput -- I > haven't tried this benchmark but I've seen many reports similar to > this one. I have no doubt about this. I am just trying to figure out some potential we can further achieve in mglru. Thanks, Barry
On Mon, Mar 14, 2022 at 5:38 PM Barry Song <21cnbao@gmail.com> wrote: > > On Tue, Mar 15, 2022 at 5:45 AM Yu Zhao <yuzhao@google.com> wrote: > > > > On Mon, Mar 14, 2022 at 5:12 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > > > > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > > > > second time, it can be promoted > > > > > > > > to active. then in recent years, we have also applied this to anon > > > > > > > > pages while kernel adds > > > > > > > > workingset protection for anon pages. so basically both anon and file > > > > > > > > pages go into the inactive > > > > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > > > > the active list. if we don't access > > > > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > > > > we do have some special fastpath for code section, executable file > > > > > > > > pages are kept on active list > > > > > > > > as long as they are accessed. > > > > > > > > > > > > > > Yes. > > > > > > > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > > > > care about them. > > > > > > > > > > > > Hi Yu, > > > > > > here we can get a workload in Kim's patchset while he added workingset > > > > > > protection > > > > > > for anon pages: > > > > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > > > > > > > Thanks. I wouldn't call that a workload because it's not a real > > > > > application. By popular workloads, I mean applications that the > > > > > majority of people actually run on phones, in cloud, etc. > > > > > > > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > > > > moved to use inactive first. then only after the anon page is accessed > > > > > > second time, it can move to active. > > > > > > > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > > > > access. It can be many accesses each time it's set. > > > > > > > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > > > > > > > started on the active list. Growing the active list results in rebalancing > > > > > > active/inactive list so old pages on the active list are demoted to the > > > > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > > > > > > > Following is an example of this situation. > > > > > > > > > > > > Assume that 50 hot pages on active list and system can contain total > > > > > > 100 pages. Numbers denote the number of pages on active/inactive > > > > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > > > > used-once pages. > > > > > > > > > > > > 1. 50 hot pages on active list > > > > > > 50(h) | 0 > > > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > > 50(uo) | 50(h) > > > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > > > > > > > I think the real question is why the 50 hot pages can be moved to the > > > > > inactive list. If they are really hot, the A-bit should protect them. > > > > > > > > This is a good question. > > > > > > > > I guess it is probably because the current lru is trying to maintain a balance > > > > between the sizes of active and inactive lists. Thus, it can shrink active list > > > > even though pages might be still "hot" but not the recently accessed ones. > > > > > > > > 1. 50 hot pages on active list > > > > 50(h) | 0 > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > 50(uo) | 50(h) > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > the old kernel without anon workingset protection put workload 2 on active, so > > > > pushed 50 hot pages from active to inactive. workload 3 would further contribute > > > > to evict the 50 hot pages. > > > > > > > > it seems mglru doesn't demote pages from the youngest generation to older > > > > generation only in order to balance the list size? so mglru is probably safe > > > > in these cases. > > > > > > > > I will run some tests mentioned in Kim's patchset and report the result to you > > > > afterwards. > > > > > > > > > > Hi Yu, > > > I did find putting faulted pages to the youngest generation lead to some > > > regression in the case ebizzy Kim's patchset mentioned while he tried > > > to support workingset protection for anon pages. > > > i did a little bit modification for rand_chunk() which is probably similar > > > with the modifcation() Kim mentioned in his patchset. The modification > > > can be found here: > > > https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef > > > > > > The test env is a x86 machine in which I have set memory size to 2.5GB and > > > set zRAM to 2GB and disabled external disk swap. > > > > > > with the vanilla kernel: > > > \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 > > > > > > so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) > > > > > > typical result: > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > User time (seconds): 36.19 > > > System time (seconds): 229.72 > > > Percent of CPU this job got: 371% > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > > Average shared text size (kbytes): 0 > > > Average unshared data size (kbytes): 0 > > > Average stack size (kbytes): 0 > > > Average total size (kbytes): 0 > > > Maximum resident set size (kbytes): 2166196 > > > Average resident set size (kbytes): 0 > > > Major (requiring I/O) page faults: 9990128 > > > Minor (reclaiming a frame) page faults: 33315945 > > > Voluntary context switches: 59144 > > > Involuntary context switches: 167754 > > > Swaps: 0 > > > File system inputs: 2760 > > > File system outputs: 8 > > > Socket messages sent: 0 > > > Socket messages received: 0 > > > Signals delivered: 0 > > > Page size (bytes): 4096 > > > Exit status: 0 > > > > > > with gen_lru and lru_gen/enabled=0x3: > > > typical result: > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > User time (seconds): 36.34 > > > System time (seconds): 276.07 > > > Percent of CPU this job got: 378% > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 > > > **** 15% time + > > > Average shared text size (kbytes): 0 > > > Average unshared data size (kbytes): 0 > > > Average stack size (kbytes): 0 > > > Average total size (kbytes): 0 > > > Maximum resident set size (kbytes): 2168120 > > > Average resident set size (kbytes): 0 > > > Major (requiring I/O) page faults: 13362810 > > > ***** 30% page fault + > > > Minor (reclaiming a frame) page faults: 33394617 > > > Voluntary context switches: 55216 > > > Involuntary context switches: 137220 > > > Swaps: 0 > > > File system inputs: 4088 > > > File system outputs: 8 > > > Socket messages sent: 0 > > > Socket messages received: 0 > > > Signals delivered: 0 > > > Page size (bytes): 4096 > > > Exit status: 0 > > > > > > with gen_lru and lru_gen/enabled=0x7: > > > typical result: > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > User time (seconds): 36.13 > > > System time (seconds): 251.71 > > > Percent of CPU this job got: 378% > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 > > > *****better than enabled=0x3, worse than vanilla > > > Average shared text size (kbytes): 0 > > > Average unshared data size (kbytes): 0 > > > Average stack size (kbytes): 0 > > > Average total size (kbytes): 0 > > > Maximum resident set size (kbytes): 2120988 > > > Average resident set size (kbytes): 0 > > > Major (requiring I/O) page faults: 12706512 > > > Minor (reclaiming a frame) page faults: 33422243 > > > Voluntary context switches: 49485 > > > Involuntary context switches: 126765 > > > Swaps: 0 > > > File system inputs: 2976 > > > File system outputs: 8 > > > Socket messages sent: 0 > > > Socket messages received: 0 > > > Signals delivered: 0 > > > Page size (bytes): 4096 > > > Exit status: 0 > > > > > > I can also reproduce the problem on arm64. > > > > > > I am not saying this is going to block mglru from being mainlined. But I am > > > still curious if this is an issue worth being addressed somehow in mglru. > > > > You've missed something very important: *thoughput* :) > > > > noop :-) > in the test case, there are 4 threads. they are searching a key in 10 chunks > of memory. for each chunk, the size is 200MB. > a "random" chunk index is returned for those threads to search. but chunk2 > is the hottest, and chunk3, 7, 4 are relatively hotter than others. > static inline unsigned int rand_chunk(void) > { > /* simulate hot and cold chunk */ > unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, 7, 2, 2, 4}; This is sequential access, not what you claim above, because you have a repeating sequence. In this case MGLRU is expected to be slower because it doesn't try to optimize it, as discussed before [1]. The reason is, with a manageable complexity, we can only optimize so many things. And MGLRU chose to optimize (arguably) popular workloads, since, AFAIK, no real-world applications streams anon memory. To verify this is indeed sequential access, you could make rand[] larger, e.g., 160, with the same portions of 2s, 3s, 4s, etc, but their positions are random. The following change shows MGLRU is ~20% faster on my Snapdragon 7c + 2.5G DRAM + 2GB zram. static inline unsigned int rand_chunk(void) { /* simulate hot and cold chunk */ - unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, 7, 2, 2, 4}; + unsigned int rand[160] = { + 2, 4, 7, 3, 4, 2, 7, 2, 7, 8, 6, 9, 7, 6, 5, 4, + 6, 2, 6, 4, 2, 9, 2, 5, 5, 4, 7, 2, 7, 7, 5, 2, + 4, 4, 3, 3, 2, 4, 2, 2, 5, 2, 4, 2, 8, 2, 2, 3, + 2, 2, 2, 2, 2, 8, 4, 2, 2, 4, 2, 2, 2, 2, 3, 2, + 8, 5, 2, 2, 3, 2, 8, 2, 6, 2, 4, 8, 5, 2, 9, 2, + 8, 7, 9, 2, 4, 4, 3, 3, 2, 8, 2, 2, 3, 3, 2, 7, + 7, 5, 2, 2, 8, 2, 2, 2, 5, 2, 4, 3, 2, 3, 6, 3, + 3, 3, 9, 4, 2, 3, 9, 7, 7, 6, 2, 2, 4, 2, 6, 2, + 9, 7, 7, 7, 9, 3, 4, 2, 3, 2, 7, 3, 2, 2, 2, 6, + 8, 3, 7, 6, 2, 2, 2, 4, 7, 2, 5, 7, 4, 7, 9, 9, + }; static int nr = 0; - return rand[nr++%16]; + return rand[nr++%160]; } Yet better, you could use some standard benchmark suites, written by reputable organizations, e.g., memtier, YCSB, to generate more realistic distributions, as I've suggested before [2]. > static int nr = 0; > return rand[nr++%16]; > } > > each thread does search_mem(): > static unsigned int search_mem(void) > { > record_t key, *found; > record_t *src, *copy; > unsigned int chunk; > size_t copy_size = chunk_size; > unsigned int i; > unsigned int state = 0; > > /* run 160 loops or till timeout */ > for (i = 0; threads_go == 1 && i < 160; i++) { I see you've modified the original benchmark. But with "-S 200000", should this test finish within an hour instead of the following? Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > chunk = rand_chunk(); > src = mem[chunk]; > ... > copy = alloc_mem(copy_size); > ... > memcpy(copy, src, copy_size); > > key = rand_num(copy_size / record_size, &state); > > bsearch(&key, copy, copy_size / record_size, > record_size, compare); > > /* Below check is mainly for memory corruption or other bug */ > if (found == NULL) { > fprintf(stderr, "Couldn't find key %zd\n", key); > exit(1); > } > } /* end if ! touch_pages */ > > free_mem(copy, copy_size); > } > > return (i); > } > > each thread picks up a chunk, then allocates a new memory and copies the chunk to the > new allocated memory, and searches a key in the allocated memory. > > as i have set time to rather big by -S, so each thread actually exits while it > completes 160 loops. > $ \time -v ./ebizzy -t 4 -s $((200*1024*1024)) -S 6000000 Ok, you actually used "-S 6000000". [1] https://lore.kernel.org/linux-mm/YhNJ4LVWpmZgLh4I@google.com/ [2] https://lore.kernel.org/linux-mm/YgggI+vvtNvh3jBY@google.com/
On Tue, Mar 15, 2022 at 6:18 PM Yu Zhao <yuzhao@google.com> wrote: > > On Mon, Mar 14, 2022 at 5:38 PM Barry Song <21cnbao@gmail.com> wrote: > > > > On Tue, Mar 15, 2022 at 5:45 AM Yu Zhao <yuzhao@google.com> wrote: > > > > > > On Mon, Mar 14, 2022 at 5:12 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > > > > > > > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > > > > > second time, it can be promoted > > > > > > > > > to active. then in recent years, we have also applied this to anon > > > > > > > > > pages while kernel adds > > > > > > > > > workingset protection for anon pages. so basically both anon and file > > > > > > > > > pages go into the inactive > > > > > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > > > > > the active list. if we don't access > > > > > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > > > > > we do have some special fastpath for code section, executable file > > > > > > > > > pages are kept on active list > > > > > > > > > as long as they are accessed. > > > > > > > > > > > > > > > > Yes. > > > > > > > > > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > > > > > care about them. > > > > > > > > > > > > > > Hi Yu, > > > > > > > here we can get a workload in Kim's patchset while he added workingset > > > > > > > protection > > > > > > > for anon pages: > > > > > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > > > > > > > > > Thanks. I wouldn't call that a workload because it's not a real > > > > > > application. By popular workloads, I mean applications that the > > > > > > majority of people actually run on phones, in cloud, etc. > > > > > > > > > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > > > > > moved to use inactive first. then only after the anon page is accessed > > > > > > > second time, it can move to active. > > > > > > > > > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > > > > > access. It can be many accesses each time it's set. > > > > > > > > > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > > > > > > > > > started on the active list. Growing the active list results in rebalancing > > > > > > > active/inactive list so old pages on the active list are demoted to the > > > > > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > > > > > > > > > Following is an example of this situation. > > > > > > > > > > > > > > Assume that 50 hot pages on active list and system can contain total > > > > > > > 100 pages. Numbers denote the number of pages on active/inactive > > > > > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > > > > > used-once pages. > > > > > > > > > > > > > > 1. 50 hot pages on active list > > > > > > > 50(h) | 0 > > > > > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > > > 50(uo) | 50(h) > > > > > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > > > > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > > > > > > > > > I think the real question is why the 50 hot pages can be moved to the > > > > > > inactive list. If they are really hot, the A-bit should protect them. > > > > > > > > > > This is a good question. > > > > > > > > > > I guess it is probably because the current lru is trying to maintain a balance > > > > > between the sizes of active and inactive lists. Thus, it can shrink active list > > > > > even though pages might be still "hot" but not the recently accessed ones. > > > > > > > > > > 1. 50 hot pages on active list > > > > > 50(h) | 0 > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > 50(uo) | 50(h) > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > the old kernel without anon workingset protection put workload 2 on active, so > > > > > pushed 50 hot pages from active to inactive. workload 3 would further contribute > > > > > to evict the 50 hot pages. > > > > > > > > > > it seems mglru doesn't demote pages from the youngest generation to older > > > > > generation only in order to balance the list size? so mglru is probably safe > > > > > in these cases. > > > > > > > > > > I will run some tests mentioned in Kim's patchset and report the result to you > > > > > afterwards. > > > > > > > > > > > > > Hi Yu, > > > > I did find putting faulted pages to the youngest generation lead to some > > > > regression in the case ebizzy Kim's patchset mentioned while he tried > > > > to support workingset protection for anon pages. > > > > i did a little bit modification for rand_chunk() which is probably similar > > > > with the modifcation() Kim mentioned in his patchset. The modification > > > > can be found here: > > > > https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef > > > > > > > > The test env is a x86 machine in which I have set memory size to 2.5GB and > > > > set zRAM to 2GB and disabled external disk swap. > > > > > > > > with the vanilla kernel: > > > > \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 > > > > > > > > so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) > > > > > > > > typical result: > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > User time (seconds): 36.19 > > > > System time (seconds): 229.72 > > > > Percent of CPU this job got: 371% > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > > > Average shared text size (kbytes): 0 > > > > Average unshared data size (kbytes): 0 > > > > Average stack size (kbytes): 0 > > > > Average total size (kbytes): 0 > > > > Maximum resident set size (kbytes): 2166196 > > > > Average resident set size (kbytes): 0 > > > > Major (requiring I/O) page faults: 9990128 > > > > Minor (reclaiming a frame) page faults: 33315945 > > > > Voluntary context switches: 59144 > > > > Involuntary context switches: 167754 > > > > Swaps: 0 > > > > File system inputs: 2760 > > > > File system outputs: 8 > > > > Socket messages sent: 0 > > > > Socket messages received: 0 > > > > Signals delivered: 0 > > > > Page size (bytes): 4096 > > > > Exit status: 0 > > > > > > > > with gen_lru and lru_gen/enabled=0x3: > > > > typical result: > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > User time (seconds): 36.34 > > > > System time (seconds): 276.07 > > > > Percent of CPU this job got: 378% > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 > > > > **** 15% time + > > > > Average shared text size (kbytes): 0 > > > > Average unshared data size (kbytes): 0 > > > > Average stack size (kbytes): 0 > > > > Average total size (kbytes): 0 > > > > Maximum resident set size (kbytes): 2168120 > > > > Average resident set size (kbytes): 0 > > > > Major (requiring I/O) page faults: 13362810 > > > > ***** 30% page fault + > > > > Minor (reclaiming a frame) page faults: 33394617 > > > > Voluntary context switches: 55216 > > > > Involuntary context switches: 137220 > > > > Swaps: 0 > > > > File system inputs: 4088 > > > > File system outputs: 8 > > > > Socket messages sent: 0 > > > > Socket messages received: 0 > > > > Signals delivered: 0 > > > > Page size (bytes): 4096 > > > > Exit status: 0 > > > > > > > > with gen_lru and lru_gen/enabled=0x7: > > > > typical result: > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > User time (seconds): 36.13 > > > > System time (seconds): 251.71 > > > > Percent of CPU this job got: 378% > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 > > > > *****better than enabled=0x3, worse than vanilla > > > > Average shared text size (kbytes): 0 > > > > Average unshared data size (kbytes): 0 > > > > Average stack size (kbytes): 0 > > > > Average total size (kbytes): 0 > > > > Maximum resident set size (kbytes): 2120988 > > > > Average resident set size (kbytes): 0 > > > > Major (requiring I/O) page faults: 12706512 > > > > Minor (reclaiming a frame) page faults: 33422243 > > > > Voluntary context switches: 49485 > > > > Involuntary context switches: 126765 > > > > Swaps: 0 > > > > File system inputs: 2976 > > > > File system outputs: 8 > > > > Socket messages sent: 0 > > > > Socket messages received: 0 > > > > Signals delivered: 0 > > > > Page size (bytes): 4096 > > > > Exit status: 0 > > > > > > > > I can also reproduce the problem on arm64. > > > > > > > > I am not saying this is going to block mglru from being mainlined. But I am > > > > still curious if this is an issue worth being addressed somehow in mglru. > > > > > > You've missed something very important: *thoughput* :) > > > > > > > noop :-) > > in the test case, there are 4 threads. they are searching a key in 10 chunks > > of memory. for each chunk, the size is 200MB. > > a "random" chunk index is returned for those threads to search. but chunk2 > > is the hottest, and chunk3, 7, 4 are relatively hotter than others. > > static inline unsigned int rand_chunk(void) > > { > > /* simulate hot and cold chunk */ > > unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, 7, 2, 2, 4}; > > This is sequential access, not what you claim above, because you have > a repeating sequence. > > In this case MGLRU is expected to be slower because it doesn't try to > optimize it, as discussed before [1]. The reason is, with a manageable > complexity, we can only optimize so many things. And MGLRU chose to > optimize (arguably) popular workloads, since, AFAIK, no real-world > applications streams anon memory. > > To verify this is indeed sequential access, you could make rand[] > larger, e.g., 160, with the same portions of 2s, 3s, 4s, etc, but > their positions are random. The following change shows MGLRU is ~20% > faster on my Snapdragon 7c + 2.5G DRAM + 2GB zram. > > static inline unsigned int rand_chunk(void) > { > /* simulate hot and cold chunk */ > - unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, > 7, 2, 2, 4}; > + unsigned int rand[160] = { > + 2, 4, 7, 3, 4, 2, 7, 2, 7, 8, 6, 9, 7, 6, 5, 4, > + 6, 2, 6, 4, 2, 9, 2, 5, 5, 4, 7, 2, 7, 7, 5, 2, > + 4, 4, 3, 3, 2, 4, 2, 2, 5, 2, 4, 2, 8, 2, 2, 3, > + 2, 2, 2, 2, 2, 8, 4, 2, 2, 4, 2, 2, 2, 2, 3, 2, > + 8, 5, 2, 2, 3, 2, 8, 2, 6, 2, 4, 8, 5, 2, 9, 2, > + 8, 7, 9, 2, 4, 4, 3, 3, 2, 8, 2, 2, 3, 3, 2, 7, > + 7, 5, 2, 2, 8, 2, 2, 2, 5, 2, 4, 3, 2, 3, 6, 3, > + 3, 3, 9, 4, 2, 3, 9, 7, 7, 6, 2, 2, 4, 2, 6, 2, > + 9, 7, 7, 7, 9, 3, 4, 2, 3, 2, 7, 3, 2, 2, 2, 6, > + 8, 3, 7, 6, 2, 2, 2, 4, 7, 2, 5, 7, 4, 7, 9, 9, > + }; > static int nr = 0; > - return rand[nr++%16]; > + return rand[nr++%160]; > } > > Yet better, you could use some standard benchmark suites, written by > reputable organizations, e.g., memtier, YCSB, to generate more > realistic distributions, as I've suggested before [2]. > > > static int nr = 0; > > return rand[nr++%16]; > > } > > > > each thread does search_mem(): > > static unsigned int search_mem(void) > > { > > record_t key, *found; > > record_t *src, *copy; > > unsigned int chunk; > > size_t copy_size = chunk_size; > > unsigned int i; > > unsigned int state = 0; > > > > /* run 160 loops or till timeout */ > > for (i = 0; threads_go == 1 && i < 160; i++) { > > I see you've modified the original benchmark. But with "-S 200000", > should this test finish within an hour instead of the following? > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > > chunk = rand_chunk(); > > src = mem[chunk]; > > ... > > copy = alloc_mem(copy_size); > > ... > > memcpy(copy, src, copy_size); > > > > key = rand_num(copy_size / record_size, &state); > > > > bsearch(&key, copy, copy_size / record_size, > > record_size, compare); > > > > /* Below check is mainly for memory corruption or other bug */ > > if (found == NULL) { > > fprintf(stderr, "Couldn't find key %zd\n", key); > > exit(1); > > } > > } /* end if ! touch_pages */ > > > > free_mem(copy, copy_size); > > } > > > > return (i); > > } > > > > each thread picks up a chunk, then allocates a new memory and copies the chunk to the > > new allocated memory, and searches a key in the allocated memory. > > > > as i have set time to rather big by -S, so each thread actually exits while it > > completes 160 loops. > > $ \time -v ./ebizzy -t 4 -s $((200*1024*1024)) -S 6000000 > > Ok, you actually used "-S 6000000". I have two exits, either 160 loops have been done or -S gets timeout. Since -S is very big, the process exits from the completion of 160 loops. I am seeing mglru is getting very similar speed with vanilla lru by using your rand_chunk() with 160 entries. the command is like: \time -v ./a.out -t 4 -s $((200*1024*1024)) -S 600000 -m The time to complete jobs begins to be more random, but on average, mglru seems to be 5% faster. actually, i am seeing mglru can be faster than vanilla even with more page faults. for example, MGLRU: Command being timed: "./mt.out -t 4 -s 209715200 -S 600000 -m" User time (seconds): 32.68 System time (seconds): 227.19 Percent of CPU this job got: 370% Elapsed (wall clock) time (h:mm:ss or m:ss): 1:10.23 Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 2175292 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 10977244 Minor (reclaiming a frame) page faults: 33447638 Voluntary context switches: 44466 Involuntary context switches: 108413 Swaps: 0 File system inputs: 7704 File system outputs: 8 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 VANILLA: Command being timed: "./mt.out -t 4 -s 209715200 -S 600000 -m" User time (seconds): 32.20 System time (seconds): 248.18 Percent of CPU this job got: 371% Elapsed (wall clock) time (h:mm:ss or m:ss): 1:15.55 Average shared text size (kbytes): 0 Average unshared data size (kbytes): 0 Average stack size (kbytes): 0 Average total size (kbytes): 0 Maximum resident set size (kbytes): 2174384 Average resident set size (kbytes): 0 Major (requiring I/O) page faults: 10002206 Minor (reclaiming a frame) page faults: 33392151 Voluntary context switches: 76966 Involuntary context switches: 184841 Swaps: 0 File system inputs: 2032 File system outputs: 8 Socket messages sent: 0 Socket messages received: 0 Signals delivered: 0 Page size (bytes): 4096 Exit status: 0 I guess the main cause of the regression for the previous sequence with 16 entries is that the ebizzy has a new allocated copy in search_mem(), which is mapped and used only once in each loop. and the temp copy can push out those hot chunks. Anyway, I understand it is a trade-off between warmly embracing new pages and holding old pages tightly. Real user cases from phone, server, desktop will be judging this better. > > [1] https://lore.kernel.org/linux-mm/YhNJ4LVWpmZgLh4I@google.com/ > [2] https://lore.kernel.org/linux-mm/YgggI+vvtNvh3jBY@google.com/ Thanks Barry
On Tue, Mar 15, 2022 at 10:27 PM Barry Song <21cnbao@gmail.com> wrote: > > On Tue, Mar 15, 2022 at 6:18 PM Yu Zhao <yuzhao@google.com> wrote: > > > > On Mon, Mar 14, 2022 at 5:38 PM Barry Song <21cnbao@gmail.com> wrote: > > > > > > On Tue, Mar 15, 2022 at 5:45 AM Yu Zhao <yuzhao@google.com> wrote: > > > > > > > > On Mon, Mar 14, 2022 at 5:12 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > > > > > > > > > > > > > > > > > > > We used to put a faulted file page in inactive, if we access it a > > > > > > > > > > second time, it can be promoted > > > > > > > > > > to active. then in recent years, we have also applied this to anon > > > > > > > > > > pages while kernel adds > > > > > > > > > > workingset protection for anon pages. so basically both anon and file > > > > > > > > > > pages go into the inactive > > > > > > > > > > list for the 1st time, if we access it for the second time, they go to > > > > > > > > > > the active list. if we don't access > > > > > > > > > > it any more, they are likely to be reclaimed as they are inactive. > > > > > > > > > > we do have some special fastpath for code section, executable file > > > > > > > > > > pages are kept on active list > > > > > > > > > > as long as they are accessed. > > > > > > > > > > > > > > > > > > Yes. > > > > > > > > > > > > > > > > > > > so all of the above concerns are actually not that correct? > > > > > > > > > > > > > > > > > > They are valid concerns but I don't know any popular workloads that > > > > > > > > > care about them. > > > > > > > > > > > > > > > > Hi Yu, > > > > > > > > here we can get a workload in Kim's patchset while he added workingset > > > > > > > > protection > > > > > > > > for anon pages: > > > > > > > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > > > > > > > > > > > Thanks. I wouldn't call that a workload because it's not a real > > > > > > > application. By popular workloads, I mean applications that the > > > > > > > majority of people actually run on phones, in cloud, etc. > > > > > > > > > > > > > > > anon pages used to go to active rather than inactive, but kim's patchset > > > > > > > > moved to use inactive first. then only after the anon page is accessed > > > > > > > > second time, it can move to active. > > > > > > > > > > > > > > Yes. To clarify, the A-bit doesn't really mean the first or second > > > > > > > access. It can be many accesses each time it's set. > > > > > > > > > > > > > > > "In current implementation, newly created or swap-in anonymous page is > > > > > > > > > > > > > > > > started on the active list. Growing the active list results in rebalancing > > > > > > > > active/inactive list so old pages on the active list are demoted to the > > > > > > > > inactive list. Hence, hot page on the active list isn't protected at all. > > > > > > > > > > > > > > > > Following is an example of this situation. > > > > > > > > > > > > > > > > Assume that 50 hot pages on active list and system can contain total > > > > > > > > 100 pages. Numbers denote the number of pages on active/inactive > > > > > > > > list (active | inactive). (h) stands for hot pages and (uo) stands for > > > > > > > > used-once pages. > > > > > > > > > > > > > > > > 1. 50 hot pages on active list > > > > > > > > 50(h) | 0 > > > > > > > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > > > > 50(uo) | 50(h) > > > > > > > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > > > > > > > As we can see, hot pages are swapped-out and it would cause swap-in later." > > > > > > > > > > > > > > > > Is MGLRU able to avoid the swap-out of the 50 hot pages? > > > > > > > > > > > > > > I think the real question is why the 50 hot pages can be moved to the > > > > > > > inactive list. If they are really hot, the A-bit should protect them. > > > > > > > > > > > > This is a good question. > > > > > > > > > > > > I guess it is probably because the current lru is trying to maintain a balance > > > > > > between the sizes of active and inactive lists. Thus, it can shrink active list > > > > > > even though pages might be still "hot" but not the recently accessed ones. > > > > > > > > > > > > 1. 50 hot pages on active list > > > > > > 50(h) | 0 > > > > > > > > > > > > 2. workload: 50 newly created (used-once) pages > > > > > > 50(uo) | 50(h) > > > > > > > > > > > > 3. workload: another 50 newly created (used-once) pages > > > > > > 50(uo) | 50(uo), swap-out 50(h) > > > > > > > > > > > > the old kernel without anon workingset protection put workload 2 on active, so > > > > > > pushed 50 hot pages from active to inactive. workload 3 would further contribute > > > > > > to evict the 50 hot pages. > > > > > > > > > > > > it seems mglru doesn't demote pages from the youngest generation to older > > > > > > generation only in order to balance the list size? so mglru is probably safe > > > > > > in these cases. > > > > > > > > > > > > I will run some tests mentioned in Kim's patchset and report the result to you > > > > > > afterwards. > > > > > > > > > > > > > > > > Hi Yu, > > > > > I did find putting faulted pages to the youngest generation lead to some > > > > > regression in the case ebizzy Kim's patchset mentioned while he tried > > > > > to support workingset protection for anon pages. > > > > > i did a little bit modification for rand_chunk() which is probably similar > > > > > with the modifcation() Kim mentioned in his patchset. The modification > > > > > can be found here: > > > > > https://github.com/21cnbao/ltp/commit/7134413d747bfa9ef > > > > > > > > > > The test env is a x86 machine in which I have set memory size to 2.5GB and > > > > > set zRAM to 2GB and disabled external disk swap. > > > > > > > > > > with the vanilla kernel: > > > > > \time -v ./a.out -vv -t 4 -s 209715200 -S 200000 > > > > > > > > > > so we have 10 chunks and 4 threads, each trunk is 209715200(200MB) > > > > > > > > > > typical result: > > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > > User time (seconds): 36.19 > > > > > System time (seconds): 229.72 > > > > > Percent of CPU this job got: 371% > > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > > > > Average shared text size (kbytes): 0 > > > > > Average unshared data size (kbytes): 0 > > > > > Average stack size (kbytes): 0 > > > > > Average total size (kbytes): 0 > > > > > Maximum resident set size (kbytes): 2166196 > > > > > Average resident set size (kbytes): 0 > > > > > Major (requiring I/O) page faults: 9990128 > > > > > Minor (reclaiming a frame) page faults: 33315945 > > > > > Voluntary context switches: 59144 > > > > > Involuntary context switches: 167754 > > > > > Swaps: 0 > > > > > File system inputs: 2760 > > > > > File system outputs: 8 > > > > > Socket messages sent: 0 > > > > > Socket messages received: 0 > > > > > Signals delivered: 0 > > > > > Page size (bytes): 4096 > > > > > Exit status: 0 > > > > > > > > > > with gen_lru and lru_gen/enabled=0x3: > > > > > typical result: > > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > > User time (seconds): 36.34 > > > > > System time (seconds): 276.07 > > > > > Percent of CPU this job got: 378% > > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:22.46 > > > > > **** 15% time + > > > > > Average shared text size (kbytes): 0 > > > > > Average unshared data size (kbytes): 0 > > > > > Average stack size (kbytes): 0 > > > > > Average total size (kbytes): 0 > > > > > Maximum resident set size (kbytes): 2168120 > > > > > Average resident set size (kbytes): 0 > > > > > Major (requiring I/O) page faults: 13362810 > > > > > ***** 30% page fault + > > > > > Minor (reclaiming a frame) page faults: 33394617 > > > > > Voluntary context switches: 55216 > > > > > Involuntary context switches: 137220 > > > > > Swaps: 0 > > > > > File system inputs: 4088 > > > > > File system outputs: 8 > > > > > Socket messages sent: 0 > > > > > Socket messages received: 0 > > > > > Signals delivered: 0 > > > > > Page size (bytes): 4096 > > > > > Exit status: 0 > > > > > > > > > > with gen_lru and lru_gen/enabled=0x7: > > > > > typical result: > > > > > Command being timed: "./a.out -vv -t 4 -s 209715200 -S 200000" > > > > > User time (seconds): 36.13 > > > > > System time (seconds): 251.71 > > > > > Percent of CPU this job got: 378% > > > > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:16.00 > > > > > *****better than enabled=0x3, worse than vanilla > > > > > Average shared text size (kbytes): 0 > > > > > Average unshared data size (kbytes): 0 > > > > > Average stack size (kbytes): 0 > > > > > Average total size (kbytes): 0 > > > > > Maximum resident set size (kbytes): 2120988 > > > > > Average resident set size (kbytes): 0 > > > > > Major (requiring I/O) page faults: 12706512 > > > > > Minor (reclaiming a frame) page faults: 33422243 > > > > > Voluntary context switches: 49485 > > > > > Involuntary context switches: 126765 > > > > > Swaps: 0 > > > > > File system inputs: 2976 > > > > > File system outputs: 8 > > > > > Socket messages sent: 0 > > > > > Socket messages received: 0 > > > > > Signals delivered: 0 > > > > > Page size (bytes): 4096 > > > > > Exit status: 0 > > > > > > > > > > I can also reproduce the problem on arm64. > > > > > > > > > > I am not saying this is going to block mglru from being mainlined. But I am > > > > > still curious if this is an issue worth being addressed somehow in mglru. > > > > > > > > You've missed something very important: *thoughput* :) > > > > > > > > > > noop :-) > > > in the test case, there are 4 threads. they are searching a key in 10 chunks > > > of memory. for each chunk, the size is 200MB. > > > a "random" chunk index is returned for those threads to search. but chunk2 > > > is the hottest, and chunk3, 7, 4 are relatively hotter than others. > > > static inline unsigned int rand_chunk(void) > > > { > > > /* simulate hot and cold chunk */ > > > unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, 7, 2, 2, 4}; > > > > This is sequential access, not what you claim above, because you have > > a repeating sequence. > > > > In this case MGLRU is expected to be slower because it doesn't try to > > optimize it, as discussed before [1]. The reason is, with a manageable > > complexity, we can only optimize so many things. And MGLRU chose to > > optimize (arguably) popular workloads, since, AFAIK, no real-world > > applications streams anon memory. > > > > To verify this is indeed sequential access, you could make rand[] > > larger, e.g., 160, with the same portions of 2s, 3s, 4s, etc, but > > their positions are random. The following change shows MGLRU is ~20% > > faster on my Snapdragon 7c + 2.5G DRAM + 2GB zram. > > > > static inline unsigned int rand_chunk(void) > > { > > /* simulate hot and cold chunk */ > > - unsigned int rand[16] = {2, 2, 3, 4, 5, 2, 6, 7, 9, 2, 8, 3, > > 7, 2, 2, 4}; > > + unsigned int rand[160] = { > > + 2, 4, 7, 3, 4, 2, 7, 2, 7, 8, 6, 9, 7, 6, 5, 4, > > + 6, 2, 6, 4, 2, 9, 2, 5, 5, 4, 7, 2, 7, 7, 5, 2, > > + 4, 4, 3, 3, 2, 4, 2, 2, 5, 2, 4, 2, 8, 2, 2, 3, > > + 2, 2, 2, 2, 2, 8, 4, 2, 2, 4, 2, 2, 2, 2, 3, 2, > > + 8, 5, 2, 2, 3, 2, 8, 2, 6, 2, 4, 8, 5, 2, 9, 2, > > + 8, 7, 9, 2, 4, 4, 3, 3, 2, 8, 2, 2, 3, 3, 2, 7, > > + 7, 5, 2, 2, 8, 2, 2, 2, 5, 2, 4, 3, 2, 3, 6, 3, > > + 3, 3, 9, 4, 2, 3, 9, 7, 7, 6, 2, 2, 4, 2, 6, 2, > > + 9, 7, 7, 7, 9, 3, 4, 2, 3, 2, 7, 3, 2, 2, 2, 6, > > + 8, 3, 7, 6, 2, 2, 2, 4, 7, 2, 5, 7, 4, 7, 9, 9, > > + }; > > static int nr = 0; > > - return rand[nr++%16]; > > + return rand[nr++%160]; > > } > > > > Yet better, you could use some standard benchmark suites, written by > > reputable organizations, e.g., memtier, YCSB, to generate more > > realistic distributions, as I've suggested before [2]. > > > > > static int nr = 0; > > > return rand[nr++%16]; > > > } > > > > > > each thread does search_mem(): > > > static unsigned int search_mem(void) > > > { > > > record_t key, *found; > > > record_t *src, *copy; > > > unsigned int chunk; > > > size_t copy_size = chunk_size; > > > unsigned int i; > > > unsigned int state = 0; > > > > > > /* run 160 loops or till timeout */ > > > for (i = 0; threads_go == 1 && i < 160; i++) { > > > > I see you've modified the original benchmark. But with "-S 200000", > > should this test finish within an hour instead of the following? > > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:11.59 > > > > > chunk = rand_chunk(); > > > src = mem[chunk]; > > > ... > > > copy = alloc_mem(copy_size); > > > ... > > > memcpy(copy, src, copy_size); > > > > > > key = rand_num(copy_size / record_size, &state); > > > > > > bsearch(&key, copy, copy_size / record_size, > > > record_size, compare); > > > > > > /* Below check is mainly for memory corruption or other bug */ > > > if (found == NULL) { > > > fprintf(stderr, "Couldn't find key %zd\n", key); > > > exit(1); > > > } > > > } /* end if ! touch_pages */ > > > > > > free_mem(copy, copy_size); > > > } > > > > > > return (i); > > > } > > > > > > each thread picks up a chunk, then allocates a new memory and copies the chunk to the > > > new allocated memory, and searches a key in the allocated memory. > > > > > > as i have set time to rather big by -S, so each thread actually exits while it > > > completes 160 loops. > > > $ \time -v ./ebizzy -t 4 -s $((200*1024*1024)) -S 6000000 > > > > Ok, you actually used "-S 6000000". > > I have two exits, either 160 loops have been done or -S gets timeout. > Since -S is very big, the process exits from the completion of 160 > loops. > > I am seeing mglru is getting very similar speed with vanilla lru by > using your rand_chunk() with 160 entries. the command is like: > \time -v ./a.out -t 4 -s $((200*1024*1024)) -S 600000 -m > > The time to complete jobs begins to be more random, but on average, > mglru seems to be 5% faster. actually, i am seeing mglru can be faster > than vanilla even with more page faults. for example, > > MGLRU: > Command being timed: "./mt.out -t 4 -s 209715200 -S 600000 -m" > User time (seconds): 32.68 > System time (seconds): 227.19 > Percent of CPU this job got: 370% > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:10.23 > Average shared text size (kbytes): 0 > Average unshared data size (kbytes): 0 > Average stack size (kbytes): 0 > Average total size (kbytes): 0 > Maximum resident set size (kbytes): 2175292 > Average resident set size (kbytes): 0 > Major (requiring I/O) page faults: 10977244 > Minor (reclaiming a frame) page faults: 33447638 > Voluntary context switches: 44466 > Involuntary context switches: 108413 > Swaps: 0 > File system inputs: 7704 > File system outputs: 8 > Socket messages sent: 0 > Socket messages received: 0 > Signals delivered: 0 > Page size (bytes): 4096 > Exit status: 0 > > > VANILLA: > Command being timed: "./mt.out -t 4 -s 209715200 -S 600000 -m" > User time (seconds): 32.20 > System time (seconds): 248.18 > Percent of CPU this job got: 371% > Elapsed (wall clock) time (h:mm:ss or m:ss): 1:15.55 > Average shared text size (kbytes): 0 > Average unshared data size (kbytes): 0 > Average stack size (kbytes): 0 > Average total size (kbytes): 0 > Maximum resident set size (kbytes): 2174384 > Average resident set size (kbytes): 0 > Major (requiring I/O) page faults: 10002206 > Minor (reclaiming a frame) page faults: 33392151 > Voluntary context switches: 76966 > Involuntary context switches: 184841 > Swaps: 0 > File system inputs: 2032 > File system outputs: 8 > Socket messages sent: 0 > Socket messages received: 0 > Signals delivered: 0 > Page size (bytes): 4096 > Exit status: 0 > basically a perf comparison: vanilla: 23.81% [lz4_compress] [k] LZ4_compress_fast_extState 14.15% [kernel] [k] LZ4_decompress_safe 10.48% libc-2.33.so [.] __memmove_avx_unaligned_erms 2.49% [kernel] [k] native_queued_spin_lock_slowpath 2.05% [kernel] [k] clear_page_erms 1.69% [kernel] [k] native_irq_return_iret 1.49% [kernel] [k] mem_cgroup_css_rstat_flush 1.05% [kernel] [k] _raw_spin_lock 1.05% [kernel] [k] sync_regs 1.00% [kernel] [k] smp_call_function_many_cond 0.97% [kernel] [k] memset_erms 0.95% [zram] [k] zram_bvec_rw.constprop.0 0.91% [kernel] [k] down_read_trylock 0.90% [kernel] [k] memcpy_erms 0.89% [zram] [k] __zram_bvec_read.constprop.0 0.88% [kernel] [k] psi_group_change 0.84% [kernel] [k] isolate_lru_pages 0.78% [kernel] [k] zs_map_object 0.76% [kernel] [k] __handle_mm_fault 0.72% [kernel] [k] page_vma_mapped_walk mglru: 23.43% [lz4_compress] [k] LZ4_compress_fast_extState 16.90% [kernel] [k] LZ4_decompress_safe 12.60% libc-2.33.so [.] __memmove_avx_unaligned_erms 2.26% [kernel] [k] clear_page_erms 2.06% [kernel] [k] native_queued_spin_lock_slowpath 1.77% [kernel] [k] native_irq_return_iret 1.18% [kernel] [k] sync_regs 1.12% [zram] [k] __zram_bvec_read.constprop.0 0.98% [kernel] [k] psi_group_change 0.97% [zram] [k] zram_bvec_rw.constprop.0 0.96% [kernel] [k] memset_erms 0.95% [kernel] [k] isolate_folios 0.92% [kernel] [k] zs_map_object 0.92% [kernel] [k] _raw_spin_lock 0.87% [kernel] [k] memcpy_erms 0.83% [kernel] [k] smp_call_function_many_cond 0.83% [kernel] [k] __handle_mm_fault 0.78% [kernel] [k] unmap_page_range 0.71% [kernel] [k] rmqueue_bulk 0.70% [kernel] [k] page_counter_uncharge it seems vanilla kernel puts more time on native_queued_spin_lock_slowpath(), down_read_trylock(), mem_cgroup_css_rstat_flush(), isolate_lru_pages() and page_vma_mapped_walk(), but mglru puts more time on decompress, memmove and isolate_folios(). That is probably why mglru can be a bit faster even with more major page faults. > > I guess the main cause of the regression for the previous sequence > with 16 entries is that the ebizzy has a new allocated copy in > search_mem(), which is mapped and used only once in each loop. > and the temp copy can push out those hot chunks. > > Anyway, I understand it is a trade-off between warmly embracing new > pages and holding old pages tightly. Real user cases from phone, server, > desktop will be judging this better. > > > > > [1] https://lore.kernel.org/linux-mm/YhNJ4LVWpmZgLh4I@google.com/ > > [2] https://lore.kernel.org/linux-mm/YgggI+vvtNvh3jBY@google.com/ > Thanks Barry
On Tue, Mar 15, 2022 at 4:29 AM Barry Song <21cnbao@gmail.com> wrote: <snipped> > > I guess the main cause of the regression for the previous sequence > > with 16 entries is that the ebizzy has a new allocated copy in > > search_mem(), which is mapped and used only once in each loop. > > and the temp copy can push out those hot chunks. > > > > Anyway, I understand it is a trade-off between warmly embracing new > > pages and holding old pages tightly. Real user cases from phone, server, > > desktop will be judging this better. Thanks for all the details. I looked into them today and found no regressions when running with your original program. After I explain why, I hope you'd be convinced that using programs like this one is not a good way to measure things :) Problems: 1) Given the 2.5GB configuration and a sequence of cold/hot chunks, I assume your program tries to simulate a handful of apps running on a phone. A short repeating sequence is closer to sequential access than to real user behaviors, as I suggested last time. You could check out how something similar is done here [1]. 2) Under the same assumption (phone), C programs are very different from Android apps in terms of runtime memory behaviors, e.g., JVM GC [2]. 3) Assuming you are interested in the runtime memory behavior of C/C++ programs, your program is still not very representative. All C/C++ programs I'm familiar with choose to link against TCmalloc, jemalloc or implement their own allocators. GNU libc, IMO, has a small market share nowadays. 4) TCmalloc/jemalloc are not only optimized for multithreading, they are also THP aware. THP is very important when benchmarking page reclaim, e.g., two similarly warm THPs can comprise 511+1 or 1+511 of warm+cold 4K pages. The LRU algorithm that chooses more of the former is at the disadvantage. Unless it's recommended by the applications you are trying to benchmark, THP should be disabled. (Android generally doesn't use THP.) 5) Swap devices are also important. Zram should NOT be used unless you know your benchmark doesn't generate incompressible data. The LRU algorithm that chooses more incompressible pages is at disadvantage. Here is my result: on the same Snapdragon 7c + 2.5GB RAM + 1.5GB ramdisk swap, with your original program compiled against libc malloc and TCMalloc, to 32-bit and 64-bit binaries: # cat /sys/kernel/mm/lru_gen/enabled 0x0003 # cat /sys/kernel/mm/transparent_hugepage/enabled always madvise [never] # modprobe brd rd_nr=1 rd_size=1572864 # if=/dev/zero of=/dev/ram0 bs=1M # mkswap /dev/ram0 # swapoff -a # swapon /dev/ram0 # ldd test_absl_32 linux-vdso.so.1 (0xf6e7f000) libabsl_malloc.so.2103.0.1 => /usr/lib/libabsl_malloc.so.2103.0.1 (0xf6e23000) libpthread.so.0 => /lib/libpthread.so.0 (0xf6dff000) libc.so.6 => /lib/libc.so.6 (0xf6d07000) /lib/ld-linux-armhf.so.3 (0x09df0000) libabsl_base.so.2103.0.1 => /usr/lib/libabsl_base.so.2103.0.1 (0xf6ce5000) libabsl_raw_logging.so.2103.0.1 => /usr/lib/libabsl_raw_logging.so.2103.0.1 (0xf6cc4000) libabsl_spinlock_wait.so.2103.0.1 => /usr/lib/libabsl_spinlock_wait.so.2103.0.1 (0xf6ca3000) libc++.so.1 => /usr/lib/libc++.so.1 (0xf6c04000) libc++abi.so.1 => /usr/lib/libc++abi.so.1 (0xf6bcd000) # file test_absl_64 test_absl_64: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), statically linked # ldd test_gnu_32 linux-vdso.so.1 (0xeabef000) libpthread.so.0 => /lib/libpthread.so.0 (0xeab92000) libc.so.6 => /lib/libc.so.6 (0xeaa9a000) /lib/ld-linux-armhf.so.3 (0x05690000) # file test_gnu_64 test_gnu_64: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), statically linked ### baseline 5.17-rc8 # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 10 records/s real 59.00 s user 39.83 s sys 174.18 s 18.51% [.] memcpy 15.98% [k] __pi_clear_page 5.59% [k] rmqueue_pcplist 5.19% [k] do_raw_spin_lock 5.09% [k] memmove 4.60% [k] _raw_spin_unlock_irq 3.62% [k] _raw_spin_unlock_irqrestore 3.61% [k] free_unref_page_list 3.29% [k] zap_pte_range 2.53% [k] local_daif_restore 2.50% [k] down_read_trylock 1.41% [k] handle_mm_fault 1.32% [k] do_anonymous_page 1.31% [k] up_read 1.03% [k] free_swap_cache ### MGLRU v9 # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 11 records/s real 57.00 s user 39.39 s 19.36% [.] memcpy 16.50% [k] __pi_clear_page 6.21% [k] memmove 5.57% [k] rmqueue_pcplist 5.07% [k] do_raw_spin_lock 4.96% [k] _raw_spin_unlock_irqrestore 4.25% [k] free_unref_page_list 3.80% [k] zap_pte_range 3.69% [k] _raw_spin_unlock_irq 2.71% [k] local_daif_restore 2.10% [k] down_read_trylock 1.50% [k] handle_mm_fault 1.29% [k] do_anonymous_page 1.17% [k] free_swap_cache 1.08% [k] up_read [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/+/refs/heads/main/src/chromiumos/tast/local/memory/mempressure/mempressure.go [2] https://developer.android.com/topic/performance/memory-overview
On Wed, Mar 16, 2022 at 3:47 PM Yu Zhao <yuzhao@google.com> wrote: > > On Tue, Mar 15, 2022 at 4:29 AM Barry Song <21cnbao@gmail.com> wrote: > > <snipped> > > > > I guess the main cause of the regression for the previous sequence > > > with 16 entries is that the ebizzy has a new allocated copy in > > > search_mem(), which is mapped and used only once in each loop. > > > and the temp copy can push out those hot chunks. > > > > > > Anyway, I understand it is a trade-off between warmly embracing new > > > pages and holding old pages tightly. Real user cases from phone, server, > > > desktop will be judging this better. > > Thanks for all the details. I looked into them today and found no > regressions when running with your original program. > > After I explain why, I hope you'd be convinced that using programs > like this one is not a good way to measure things :) > Yep. I agree ebizzy might not be a good one to measure things. I chose it only because Kim's patchset which moved anon pages to inactive at the first detected access was using it. Before kim's patchset, anon pages were placed in the active list from the first beginning: https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ in ebizzy, there is a used-once allocated memory in each search_mem(). I guess that is why Kim's patchset chose it. > Problems: > 1) Given the 2.5GB configuration and a sequence of cold/hot chunks, I > assume your program tries to simulate a handful of apps running on a > phone. A short repeating sequence is closer to sequential access than > to real user behaviors, as I suggested last time. You could check out > how something similar is done here [1]. > 2) Under the same assumption (phone), C programs are very different > from Android apps in terms of runtime memory behaviors, e.g., JVM GC > [2]. > 3) Assuming you are interested in the runtime memory behavior of C/C++ > programs, your program is still not very representative. All C/C++ > programs I'm familiar with choose to link against TCmalloc, jemalloc > or implement their own allocators. GNU libc, IMO, has a small market > share nowadays. > 4) TCmalloc/jemalloc are not only optimized for multithreading, they > are also THP aware. THP is very important when benchmarking page > reclaim, e.g., two similarly warm THPs can comprise 511+1 or 1+511 of > warm+cold 4K pages. The LRU algorithm that chooses more of the former > is at the disadvantage. Unless it's recommended by the applications > you are trying to benchmark, THP should be disabled. (Android > generally doesn't use THP.) > 5) Swap devices are also important. Zram should NOT be used unless you > know your benchmark doesn't generate incompressible data. The LRU > algorithm that chooses more incompressible pages is at disadvantage. > Thanks for all the information above. very useful. > Here is my result: on the same Snapdragon 7c + 2.5GB RAM + 1.5GB > ramdisk swap, with your original program compiled against libc malloc > and TCMalloc, to 32-bit and 64-bit binaries: I noticed an important difference is that you are using ramdisk, so there is no cost on "i/o". I assume compression/decompression is the i/o cost to zRAM. > > # cat /sys/kernel/mm/lru_gen/enabled > 0x0003 > # cat /sys/kernel/mm/transparent_hugepage/enabled > always madvise [never] > > # modprobe brd rd_nr=1 rd_size=1572864 > # if=/dev/zero of=/dev/ram0 bs=1M > # mkswap /dev/ram0 > # swapoff -a > # swapon /dev/ram0 > > # ldd test_absl_32 > linux-vdso.so.1 (0xf6e7f000) > libabsl_malloc.so.2103.0.1 => > /usr/lib/libabsl_malloc.so.2103.0.1 (0xf6e23000) > libpthread.so.0 => /lib/libpthread.so.0 (0xf6dff000) > libc.so.6 => /lib/libc.so.6 (0xf6d07000) > /lib/ld-linux-armhf.so.3 (0x09df0000) > libabsl_base.so.2103.0.1 => /usr/lib/libabsl_base.so.2103.0.1 > (0xf6ce5000) > libabsl_raw_logging.so.2103.0.1 => > /usr/lib/libabsl_raw_logging.so.2103.0.1 (0xf6cc4000) > libabsl_spinlock_wait.so.2103.0.1 => > /usr/lib/libabsl_spinlock_wait.so.2103.0.1 (0xf6ca3000) > libc++.so.1 => /usr/lib/libc++.so.1 (0xf6c04000) > libc++abi.so.1 => /usr/lib/libc++abi.so.1 (0xf6bcd000) > # file test_absl_64 > test_absl_64: ELF 64-bit LSB executable, ARM aarch64, version 1 > (SYSV), statically linked > # ldd test_gnu_32 > linux-vdso.so.1 (0xeabef000) > libpthread.so.0 => /lib/libpthread.so.0 (0xeab92000) > libc.so.6 => /lib/libc.so.6 (0xeaa9a000) > /lib/ld-linux-armhf.so.3 (0x05690000) > # file test_gnu_64 > test_gnu_64: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), > statically linked > > ### baseline 5.17-rc8 > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > 10 records/s > real 59.00 s > user 39.83 s > sys 174.18 s > > 18.51% [.] memcpy > 15.98% [k] __pi_clear_page > 5.59% [k] rmqueue_pcplist > 5.19% [k] do_raw_spin_lock > 5.09% [k] memmove > 4.60% [k] _raw_spin_unlock_irq > 3.62% [k] _raw_spin_unlock_irqrestore > 3.61% [k] free_unref_page_list > 3.29% [k] zap_pte_range > 2.53% [k] local_daif_restore > 2.50% [k] down_read_trylock > 1.41% [k] handle_mm_fault > 1.32% [k] do_anonymous_page > 1.31% [k] up_read > 1.03% [k] free_swap_cache > > ### MGLRU v9 > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > 11 records/s > real 57.00 s > user 39.39 s > > 19.36% [.] memcpy > 16.50% [k] __pi_clear_page > 6.21% [k] memmove > 5.57% [k] rmqueue_pcplist > 5.07% [k] do_raw_spin_lock > 4.96% [k] _raw_spin_unlock_irqrestore > 4.25% [k] free_unref_page_list > 3.80% [k] zap_pte_range > 3.69% [k] _raw_spin_unlock_irq > 2.71% [k] local_daif_restore > 2.10% [k] down_read_trylock > 1.50% [k] handle_mm_fault > 1.29% [k] do_anonymous_page > 1.17% [k] free_swap_cache > 1.08% [k] up_read > I think your result is right. but if you take a look at the number of major faults, will you find mglru have more page faults? i ask this question because i can see mglru even wins with lower hit ratio in the previous report I sent. > [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/+/refs/heads/main/src/chromiumos/tast/local/memory/mempressure/mempressure.go > [2] https://developer.android.com/topic/performance/memory-overview Thanks Barry
On Tue, Mar 15, 2022 at 10:37 PM Barry Song <21cnbao@gmail.com> wrote: > > On Wed, Mar 16, 2022 at 3:47 PM Yu Zhao <yuzhao@google.com> wrote: > > > > On Tue, Mar 15, 2022 at 4:29 AM Barry Song <21cnbao@gmail.com> wrote: > > > > <snipped> > > > > > > I guess the main cause of the regression for the previous sequence > > > > with 16 entries is that the ebizzy has a new allocated copy in > > > > search_mem(), which is mapped and used only once in each loop. > > > > and the temp copy can push out those hot chunks. > > > > > > > > Anyway, I understand it is a trade-off between warmly embracing new > > > > pages and holding old pages tightly. Real user cases from phone, server, > > > > desktop will be judging this better. > > > > Thanks for all the details. I looked into them today and found no > > regressions when running with your original program. > > > > After I explain why, I hope you'd be convinced that using programs > > like this one is not a good way to measure things :) > > > > Yep. I agree ebizzy might not be a good one to measure things. > I chose it only because Kim's patchset which moved anon pages > to inactive at the first detected access was using it. Before kim's > patchset, anon pages were placed in the active list from the first > beginning: > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > in ebizzy, there is a used-once allocated memory in each > search_mem(). I guess that is why Kim's patchset chose > it. > > > Problems: > > 1) Given the 2.5GB configuration and a sequence of cold/hot chunks, I > > assume your program tries to simulate a handful of apps running on a > > phone. A short repeating sequence is closer to sequential access than > > to real user behaviors, as I suggested last time. You could check out > > how something similar is done here [1]. > > 2) Under the same assumption (phone), C programs are very different > > from Android apps in terms of runtime memory behaviors, e.g., JVM GC > > [2]. > > 3) Assuming you are interested in the runtime memory behavior of C/C++ > > programs, your program is still not very representative. All C/C++ > > programs I'm familiar with choose to link against TCmalloc, jemalloc > > or implement their own allocators. GNU libc, IMO, has a small market > > share nowadays. > > 4) TCmalloc/jemalloc are not only optimized for multithreading, they > > are also THP aware. THP is very important when benchmarking page > > reclaim, e.g., two similarly warm THPs can comprise 511+1 or 1+511 of > > warm+cold 4K pages. The LRU algorithm that chooses more of the former > > is at the disadvantage. Unless it's recommended by the applications > > you are trying to benchmark, THP should be disabled. (Android > > generally doesn't use THP.) > > 5) Swap devices are also important. Zram should NOT be used unless you > > know your benchmark doesn't generate incompressible data. The LRU > > algorithm that chooses more incompressible pages is at disadvantage. > > > > Thanks for all the information above. very useful. > > > Here is my result: on the same Snapdragon 7c + 2.5GB RAM + 1.5GB > > ramdisk swap, with your original program compiled against libc malloc > > and TCMalloc, to 32-bit and 64-bit binaries: > > I noticed an important difference is that you are using ramdisk, so there > is no cost on "i/o". I assume compression/decompression is the i/o cost to > zRAM. The cost is not the point; the fairness is: 1) Ramdisk is fair to both LRU algorithms. 2) Zram punishes the LRU algorithm that chooses incompressible pages. IOW, this algorithm needs to compress more pages in order to save the same amount of memory. > > # cat /sys/kernel/mm/lru_gen/enabled > > 0x0003 > > # cat /sys/kernel/mm/transparent_hugepage/enabled > > always madvise [never] > > > > # modprobe brd rd_nr=1 rd_size=1572864 > > # if=/dev/zero of=/dev/ram0 bs=1M > > # mkswap /dev/ram0 > > # swapoff -a > > # swapon /dev/ram0 > > > > # ldd test_absl_32 > > linux-vdso.so.1 (0xf6e7f000) > > libabsl_malloc.so.2103.0.1 => > > /usr/lib/libabsl_malloc.so.2103.0.1 (0xf6e23000) > > libpthread.so.0 => /lib/libpthread.so.0 (0xf6dff000) > > libc.so.6 => /lib/libc.so.6 (0xf6d07000) > > /lib/ld-linux-armhf.so.3 (0x09df0000) > > libabsl_base.so.2103.0.1 => /usr/lib/libabsl_base.so.2103.0.1 > > (0xf6ce5000) > > libabsl_raw_logging.so.2103.0.1 => > > /usr/lib/libabsl_raw_logging.so.2103.0.1 (0xf6cc4000) > > libabsl_spinlock_wait.so.2103.0.1 => > > /usr/lib/libabsl_spinlock_wait.so.2103.0.1 (0xf6ca3000) > > libc++.so.1 => /usr/lib/libc++.so.1 (0xf6c04000) > > libc++abi.so.1 => /usr/lib/libc++abi.so.1 (0xf6bcd000) > > # file test_absl_64 > > test_absl_64: ELF 64-bit LSB executable, ARM aarch64, version 1 > > (SYSV), statically linked > > # ldd test_gnu_32 > > linux-vdso.so.1 (0xeabef000) > > libpthread.so.0 => /lib/libpthread.so.0 (0xeab92000) > > libc.so.6 => /lib/libc.so.6 (0xeaa9a000) > > /lib/ld-linux-armhf.so.3 (0x05690000) > > # file test_gnu_64 > > test_gnu_64: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), > > statically linked > > > > ### baseline 5.17-rc8 > > > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > > 10 records/s > > real 59.00 s > > user 39.83 s > > sys 174.18 s > > > > 18.51% [.] memcpy > > 15.98% [k] __pi_clear_page > > 5.59% [k] rmqueue_pcplist > > 5.19% [k] do_raw_spin_lock > > 5.09% [k] memmove > > 4.60% [k] _raw_spin_unlock_irq > > 3.62% [k] _raw_spin_unlock_irqrestore > > 3.61% [k] free_unref_page_list > > 3.29% [k] zap_pte_range > > 2.53% [k] local_daif_restore > > 2.50% [k] down_read_trylock > > 1.41% [k] handle_mm_fault > > 1.32% [k] do_anonymous_page > > 1.31% [k] up_read > > 1.03% [k] free_swap_cache > > > > ### MGLRU v9 > > > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > > 11 records/s > > real 57.00 s > > user 39.39 s > > > > 19.36% [.] memcpy > > 16.50% [k] __pi_clear_page > > 6.21% [k] memmove > > 5.57% [k] rmqueue_pcplist > > 5.07% [k] do_raw_spin_lock > > 4.96% [k] _raw_spin_unlock_irqrestore > > 4.25% [k] free_unref_page_list > > 3.80% [k] zap_pte_range > > 3.69% [k] _raw_spin_unlock_irq > > 2.71% [k] local_daif_restore > > 2.10% [k] down_read_trylock > > 1.50% [k] handle_mm_fault > > 1.29% [k] do_anonymous_page > > 1.17% [k] free_swap_cache > > 1.08% [k] up_read > > > > I think your result is right. but if you take a look at the number of > major faults, will you find mglru have more page faults? > i ask this question because i can see mglru even wins with lower > hit ratio in the previous report I sent. Yes, I did see the elevated major faults: # baseline total 11503878 majfault 4745116 pgsteal_kswapd 3056793 pgsteal_direct 3701969 # MGLRU total 11928659 pgmajfault 5762213 pgsteal_kswapd 2098253 pgsteal_direct 4068193
On Wed, Mar 16, 2022 at 6:44 PM Yu Zhao <yuzhao@google.com> wrote: > > On Tue, Mar 15, 2022 at 10:37 PM Barry Song <21cnbao@gmail.com> wrote: > > > > On Wed, Mar 16, 2022 at 3:47 PM Yu Zhao <yuzhao@google.com> wrote: > > > > > > On Tue, Mar 15, 2022 at 4:29 AM Barry Song <21cnbao@gmail.com> wrote: > > > > > > <snipped> > > > > > > > > I guess the main cause of the regression for the previous sequence > > > > > with 16 entries is that the ebizzy has a new allocated copy in > > > > > search_mem(), which is mapped and used only once in each loop. > > > > > and the temp copy can push out those hot chunks. > > > > > > > > > > Anyway, I understand it is a trade-off between warmly embracing new > > > > > pages and holding old pages tightly. Real user cases from phone, server, > > > > > desktop will be judging this better. > > > > > > Thanks for all the details. I looked into them today and found no > > > regressions when running with your original program. > > > > > > After I explain why, I hope you'd be convinced that using programs > > > like this one is not a good way to measure things :) > > > > > > > Yep. I agree ebizzy might not be a good one to measure things. > > I chose it only because Kim's patchset which moved anon pages > > to inactive at the first detected access was using it. Before kim's > > patchset, anon pages were placed in the active list from the first > > beginning: > > https://patchwork.kernel.org/project/linux-mm/cover/1581401993-20041-1-git-send-email-iamjoonsoo.kim@lge.com/ > > > > in ebizzy, there is a used-once allocated memory in each > > search_mem(). I guess that is why Kim's patchset chose > > it. > > > > > Problems: > > > 1) Given the 2.5GB configuration and a sequence of cold/hot chunks, I > > > assume your program tries to simulate a handful of apps running on a > > > phone. A short repeating sequence is closer to sequential access than > > > to real user behaviors, as I suggested last time. You could check out > > > how something similar is done here [1]. > > > 2) Under the same assumption (phone), C programs are very different > > > from Android apps in terms of runtime memory behaviors, e.g., JVM GC > > > [2]. > > > 3) Assuming you are interested in the runtime memory behavior of C/C++ > > > programs, your program is still not very representative. All C/C++ > > > programs I'm familiar with choose to link against TCmalloc, jemalloc > > > or implement their own allocators. GNU libc, IMO, has a small market > > > share nowadays. > > > 4) TCmalloc/jemalloc are not only optimized for multithreading, they > > > are also THP aware. THP is very important when benchmarking page > > > reclaim, e.g., two similarly warm THPs can comprise 511+1 or 1+511 of > > > warm+cold 4K pages. The LRU algorithm that chooses more of the former > > > is at the disadvantage. Unless it's recommended by the applications > > > you are trying to benchmark, THP should be disabled. (Android > > > generally doesn't use THP.) > > > 5) Swap devices are also important. Zram should NOT be used unless you > > > know your benchmark doesn't generate incompressible data. The LRU > > > algorithm that chooses more incompressible pages is at disadvantage. > > > > > > > Thanks for all the information above. very useful. > > > > > Here is my result: on the same Snapdragon 7c + 2.5GB RAM + 1.5GB > > > ramdisk swap, with your original program compiled against libc malloc > > > and TCMalloc, to 32-bit and 64-bit binaries: > > > > I noticed an important difference is that you are using ramdisk, so there > > is no cost on "i/o". I assume compression/decompression is the i/o cost to > > zRAM. > > The cost is not the point; the fairness is: > > 1) Ramdisk is fair to both LRU algorithms. > 2) Zram punishes the LRU algorithm that chooses incompressible pages. > IOW, this algorithm needs to compress more pages in order to save the > same amount of memory. I see your point. but my point is that with higher I/O cost to swap in and swap out pages, more major faults(lower hit ratio) will contribute to the loss of final performance. So for the particular case, if we move to a real disk as a swap device, we might see the same result as zRAM I was using since you also reported more page faults. > > > > # cat /sys/kernel/mm/lru_gen/enabled > > > 0x0003 > > > # cat /sys/kernel/mm/transparent_hugepage/enabled > > > always madvise [never] > > > > > > # modprobe brd rd_nr=1 rd_size=1572864 > > > # if=/dev/zero of=/dev/ram0 bs=1M > > > # mkswap /dev/ram0 > > > # swapoff -a > > > # swapon /dev/ram0 > > > > > > # ldd test_absl_32 > > > linux-vdso.so.1 (0xf6e7f000) > > > libabsl_malloc.so.2103.0.1 => > > > /usr/lib/libabsl_malloc.so.2103.0.1 (0xf6e23000) > > > libpthread.so.0 => /lib/libpthread.so.0 (0xf6dff000) > > > libc.so.6 => /lib/libc.so.6 (0xf6d07000) > > > /lib/ld-linux-armhf.so.3 (0x09df0000) > > > libabsl_base.so.2103.0.1 => /usr/lib/libabsl_base.so.2103.0.1 > > > (0xf6ce5000) > > > libabsl_raw_logging.so.2103.0.1 => > > > /usr/lib/libabsl_raw_logging.so.2103.0.1 (0xf6cc4000) > > > libabsl_spinlock_wait.so.2103.0.1 => > > > /usr/lib/libabsl_spinlock_wait.so.2103.0.1 (0xf6ca3000) > > > libc++.so.1 => /usr/lib/libc++.so.1 (0xf6c04000) > > > libc++abi.so.1 => /usr/lib/libc++abi.so.1 (0xf6bcd000) > > > # file test_absl_64 > > > test_absl_64: ELF 64-bit LSB executable, ARM aarch64, version 1 > > > (SYSV), statically linked > > > # ldd test_gnu_32 > > > linux-vdso.so.1 (0xeabef000) > > > libpthread.so.0 => /lib/libpthread.so.0 (0xeab92000) > > > libc.so.6 => /lib/libc.so.6 (0xeaa9a000) > > > /lib/ld-linux-armhf.so.3 (0x05690000) > > > # file test_gnu_64 > > > test_gnu_64: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), > > > statically linked > > > > > > ### baseline 5.17-rc8 > > > > > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > > > 10 records/s > > > real 59.00 s > > > user 39.83 s > > > sys 174.18 s > > > > > > 18.51% [.] memcpy > > > 15.98% [k] __pi_clear_page > > > 5.59% [k] rmqueue_pcplist > > > 5.19% [k] do_raw_spin_lock > > > 5.09% [k] memmove > > > 4.60% [k] _raw_spin_unlock_irq > > > 3.62% [k] _raw_spin_unlock_irqrestore > > > 3.61% [k] free_unref_page_list > > > 3.29% [k] zap_pte_range > > > 2.53% [k] local_daif_restore > > > 2.50% [k] down_read_trylock > > > 1.41% [k] handle_mm_fault > > > 1.32% [k] do_anonymous_page > > > 1.31% [k] up_read > > > 1.03% [k] free_swap_cache > > > > > > ### MGLRU v9 > > > > > > # perf record ./test_gnu_64 -t 4 -s $((200*1024*1024)) -S 6000000 > > > 11 records/s > > > real 57.00 s > > > user 39.39 s > > > > > > 19.36% [.] memcpy > > > 16.50% [k] __pi_clear_page > > > 6.21% [k] memmove > > > 5.57% [k] rmqueue_pcplist > > > 5.07% [k] do_raw_spin_lock > > > 4.96% [k] _raw_spin_unlock_irqrestore Enabling ARM64_PSEUDO_NMI and irqchip.gicv3_pseudo_nmi= might help figure out the real code which is taking CPU time in a spin_lock_irqsave area. > > > 4.25% [k] free_unref_page_list > > > 3.80% [k] zap_pte_range > > > 3.69% [k] _raw_spin_unlock_irq > > > 2.71% [k] local_daif_restore > > > 2.10% [k] down_read_trylock > > > 1.50% [k] handle_mm_fault > > > 1.29% [k] do_anonymous_page > > > 1.17% [k] free_swap_cache > > > 1.08% [k] up_read > > > > > > > I think your result is right. but if you take a look at the number of > > major faults, will you find mglru have more page faults? > > i ask this question because i can see mglru even wins with lower > > hit ratio in the previous report I sent. > > Yes, I did see the elevated major faults: > > # baseline total 11503878 > majfault 4745116 > pgsteal_kswapd 3056793 > pgsteal_direct 3701969 > > # MGLRU total 11928659 > pgmajfault 5762213 > pgsteal_kswapd 2098253 > pgsteal_direct 4068193 This is a really good sign. Thanks to MGLRU's good implementation, it seems the kernel is spending more time on useful jobs, regardless of the hit ratio. Thanks Barry
On Wed, Mar 16, 2022 at 12:06 AM Barry Song <21cnbao@gmail.com> wrote: < snipped> > > The cost is not the point; the fairness is: > > > > 1) Ramdisk is fair to both LRU algorithms. > > 2) Zram punishes the LRU algorithm that chooses incompressible pages. > > IOW, this algorithm needs to compress more pages in order to save the > > same amount of memory. > > I see your point. but my point is that with higher I/O cost to swap > in and swap out pages, more major faults(lower hit ratio) will > contribute to the loss of final performance. > > So for the particular case, if we move to a real disk as a swap > device, we might see the same result as zRAM I was using > since you also reported more page faults. If we wanted to talk about I/O cost, we would need to consider the number of writes and writeback patterns as well. The LRU algorithm that *unconsciously* picks more clean pages has an advantage because writes are usually slower than reads. Similarly, the LRU algorithm that *unconsciously* picks a cluster of cold pages that later would be faulted in together also has the advantage because sequential reads are faster than random reads. Do we want to go into this rabbit hole? I think not. That's exactly why I suggested we focus on the fairness. But, just outta curiosity, MGLRU was faster when swapping to a slow MMC disk. # mmc cid read /sys/class/mmc_host/mmc1/mmc1:0001 type: 'MMC' manufacturer: 'SanDisk-Toshiba Corporation' '' product: 'DA4064' 1.24400152 serial: 0x00000000 manfacturing date: 2006 aug # baseline + THP=never 0 records/s real 872.00 s user 51.69 s sys 483.09 s 13.07% __memcpy_neon 11.37% __pi_clear_page 9.35% _raw_spin_unlock_irq 5.52% mod_delayed_work_on 5.17% _raw_spin_unlock_irqrestore 3.95% do_raw_spin_lock 3.87% rmqueue_pcplist 3.60% local_daif_restore 3.17% free_unref_page_list 2.74% zap_pte_range 2.00% handle_mm_fault 1.19% do_anonymous_page # MGLRU + THP=never 0 records/s real 821.00 s user 44.45 s sys 428.21 s 13.28% __memcpy_neon 12.78% __pi_clear_page 9.14% _raw_spin_unlock_irq 5.95% _raw_spin_unlock_irqrestore 5.08% mod_delayed_work_on 4.45% do_raw_spin_lock 3.86% local_daif_restore 3.81% rmqueue_pcplist 3.32% free_unref_page_list 2.89% zap_pte_range 1.89% handle_mm_fault 1.10% do_anonymous_page # baseline + THP=madvise 0 records/s real 1341.00 s user 68.15 s sys 681.42 s 12.33% __memcpy_neon 11.78% _raw_spin_unlock_irq 8.79% __pi_clear_page 7.63% mod_delayed_work_on 5.49% _raw_spin_unlock_irqrestore 3.23% local_daif_restore 3.00% do_raw_spin_lock 2.83% rmqueue_pcplist 2.21% handle_mm_fault 2.00% zap_pte_range 1.51% free_unref_page_list 1.33% do_swap_page 1.17% do_anonymous_page # MGLRU + THP=madvise 0 records/s real 1315.00 s user 60.59 s sys 620.56 s 12.34% __memcpy_neon 12.17% _raw_spin_unlock_irq 9.33% __pi_clear_page 7.33% mod_delayed_work_on 6.01% _raw_spin_unlock_irqrestore 3.27% local_daif_restore 3.23% do_raw_spin_lock 2.98% rmqueue_pcplist 2.12% handle_mm_fault 2.04% zap_pte_range 1.65% free_unref_page_list 1.27% do_swap_page 1.11% do_anonymous_page
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460d..769139a8be86 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..05dd33265740 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1032,6 +1032,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..46f4fde0299f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -92,11 +92,196 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) return lru; } +#ifdef CONFIG_LRU_GEN + +static inline bool lru_gen_enabled(void) +{ + return true; +} + +static inline bool lru_gen_in_fault(void) +{ + return current->in_lru_fault; +} + +static inline int lru_gen_from_seq(unsigned long seq) +{ + return seq % MAX_NR_GENS; +} + +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +{ + unsigned long max_seq = lruvec->lrugen.max_seq; + + VM_BUG_ON(gen >= MAX_NR_GENS); + + /* see the comment on MIN_NR_GENS */ + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); +} + +static inline void lru_gen_update_size(struct lruvec *lruvec, enum lru_list lru, + int zone, long delta) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + lockdep_assert_held(&lruvec->lru_lock); + WARN_ON_ONCE(delta != (int)delta); + + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, delta); + __mod_zone_page_state(&pgdat->node_zones[zone], NR_ZONE_LRU_BASE + lru, delta); +} + +static inline void lru_gen_balance_size(struct lruvec *lruvec, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + enum lru_list lru = type * LRU_INACTIVE_FILE; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); + VM_BUG_ON(old_gen == -1 && new_gen == -1); + + if (old_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], + lrugen->nr_pages[old_gen][type][zone] - delta); + if (new_gen >= 0) + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], + lrugen->nr_pages[new_gen][type][zone] + delta); + + if (old_gen < 0) { + if (lru_gen_is_active(lruvec, new_gen)) + lru += LRU_ACTIVE; + lru_gen_update_size(lruvec, lru, zone, delta); + return; + } + + if (new_gen < 0) { + if (lru_gen_is_active(lruvec, old_gen)) + lru += LRU_ACTIVE; + lru_gen_update_size(lruvec, lru, zone, -delta); + return; + } + + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { + lru_gen_update_size(lruvec, lru, zone, -delta); + lru_gen_update_size(lruvec, lru + LRU_ACTIVE, zone, delta); + } + + /* Promotion is legit while a page is on an LRU list, but demotion isn't. */ + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int gen; + unsigned long old_flags, new_flags; + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + if (folio_test_unevictable(folio) || !lrugen->enabled) + return false; + /* + * There are three common cases for this page: + * 1) If it shouldn't be evicted, e.g., it was just faulted in, add it + * to the youngest generation. + * 2) If it can't be evicted immediately, i.e., it's an anon page and + * not in swapcache, or a dirty page pending writeback, add it to the + * second oldest generation. + * 3) If it may be evicted immediately, e.g., it's a clean page, add it + * to the oldest generation. + */ + if (folio_test_active(folio)) + gen = lru_gen_from_seq(lrugen->max_seq); + else if ((!type && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); + else + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + do { + new_flags = old_flags = READ_ONCE(folio->flags); + VM_BUG_ON_FOLIO(new_flags & LRU_GEN_MASK, folio); + + new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (cmpxchg(&folio->flags, old_flags, new_flags) != old_flags); + + lru_gen_balance_size(lruvec, folio, -1, gen); + /* for folio_rotate_reclaimable() */ + if (reclaiming) + list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + else + list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + + return true; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int gen; + unsigned long old_flags, new_flags; + + do { + new_flags = old_flags = READ_ONCE(folio->flags); + if (!(new_flags & LRU_GEN_MASK)) + return false; + + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); + + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); + else if (lru_gen_is_active(lruvec, gen)) + new_flags |= BIT(PG_active); + } while (cmpxchg(&folio->flags, old_flags, new_flags) != old_flags); + + lru_gen_balance_size(lruvec, folio, gen, -1); + list_del(&folio->lru); + + return true; +} + +#else + +static inline bool lru_gen_enabled(void) +{ + return false; +} + +static inline bool lru_gen_in_fault(void) +{ + return false; +} + +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, false)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); list_add(&folio->lru, &lruvec->lists[lru]); @@ -113,6 +298,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + if (lru_gen_add_folio(lruvec, folio, true)) + return; + update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); list_add_tail(&folio->lru, &lruvec->lists[lru]); @@ -127,6 +315,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { + if (lru_gen_del_folio(lruvec, folio, false)) + return; + list_del(&folio->lru); update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), -folio_nr_pages(folio)); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aed44e9b5d89..0f5e8a995781 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -303,6 +303,78 @@ enum lruvec_flags { */ }; +struct lruvec; + +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) + +#ifdef CONFIG_LRU_GEN + +#define MIN_LRU_BATCH BITS_PER_LONG +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) + +/* + * Evictable pages are divided into multiple generations. The youngest and the + * oldest generation numbers, max_seq and min_seq, are monotonically increasing. + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An + * offset within MAX_NR_GENS, gen, indexes the LRU list of the corresponding + * generation. The gen counter in folio->flags stores gen+1 while a page is on + * one of lrugen->lists[]. Otherwise it stores 0. + * + * A page is added to the youngest generation on faulting. The aging needs to + * check the accessed bit at least twice before handing this page over to the + * eviction. The first check takes care of the accessed bit set on the initial + * fault; the second check makes sure this page hasn't been used since then. + * This process, AKA second chance, requires a minimum of two generations, + * hence MIN_NR_GENS. And to be compatible with the active/inactive LRU, these + * two generations are mapped to the active; the rest of generations, if they + * exist, are mapped to the inactive. PG_active is always cleared while a page + * is on one of lrugen->lists[] so that demotion, which happens consequently + * when the aging produces a new generation, needs not to worry about it. + */ +#define MIN_NR_GENS 2U +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + +struct lru_gen_struct { + /* the aging increments the youngest generation number */ + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; + /* the birth time of each generation in jiffies */ + unsigned long timestamps[MAX_NR_GENS]; + /* the multigenerational LRU lists */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the above lists */ + unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* whether the multigenerational LRU is enabled */ + bool enabled; +}; + +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg); +void lru_gen_free_memcg(struct mem_cgroup *memcg); +#endif + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) +{ +} +#endif + +#endif /* CONFIG_LRU_GEN */ + struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ @@ -320,6 +392,10 @@ struct lruvec { unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; +#ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ + struct lru_gen_struct lrugen; +#endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index ef1e3e736e14..8cdbbdccb5ad 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -26,6 +26,14 @@ #define ZONES_WIDTH ZONES_SHIFT +#ifdef CONFIG_LRU_GEN +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) +#else +#define LRU_GEN_WIDTH 0 +#define LRU_REFS_WIDTH 0 +#endif /* CONFIG_LRU_GEN */ + #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) @@ -55,7 +63,8 @@ #define SECTIONS_WIDTH 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ + <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" @@ -89,8 +98,8 @@ #define LAST_CPUPID_SHIFT 0 #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ - <= BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else #define LAST_CPUPID_WIDTH 0 @@ -100,8 +109,8 @@ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ - > BITS_PER_LONG - NR_PAGEFLAGS +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #error "Not enough bits in page flags" #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1c3b6e5c8bfd..a95518ca98eb 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -935,7 +935,7 @@ __PAGEFLAG(Isolated, isolated, PF_ANY); 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. @@ -946,7 +946,7 @@ __PAGEFLAG(Isolated, isolated, PF_ANY); * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (PAGEFLAGS_MASK & ~__PG_HWPOISON) + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248..e7fe784b11aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -914,6 +914,10 @@ struct task_struct { #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..aba13aa7336c 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,9 @@ int main(void) DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); +#ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); +#endif /* End of constants */ return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 406a3c28c026..3df389fd307f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struct page *head, int tail, #ifdef CONFIG_64BIT (1L << PG_arch_2) | #endif - (1L << PG_dirty))); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first tail page is compound_mapcount */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09d342c7cbd0..cabb5085531b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5121,6 +5121,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_free_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5180,6 +5181,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); diff --git a/mm/memory.c b/mm/memory.c index a7379196a47e..d27e5f1a2533 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4754,6 +4754,27 @@ static inline void mm_account_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } +#ifdef CONFIG_LRU_GEN +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ + /* the LRU algorithm doesn't apply to sequential or random reads */ + current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); +} + +static void lru_gen_exit_fault(void) +{ + current->in_lru_fault = false; +} +#else +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ +} + +static void lru_gen_exit_fault(void) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * By the time we get here, we already hold the mm semaphore * @@ -4785,11 +4806,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); + lru_gen_enter_fault(vma); + if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); + lru_gen_exit_fault(); + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 9ddaf0e1b0ab..0d7b2bd2454a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) shift = 8 * sizeof(unsigned long); width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, KASAN_TAG_WIDTH, + LRU_GEN_WIDTH, + LRU_REFS_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d31..a3faa8c02c07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7480,6 +7480,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); + lru_gen_init_state(NULL, &pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56..e2ef2acccc74 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -462,6 +462,11 @@ void folio_add_lru(struct folio *folio) VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + /* see the comment in lru_gen_add_folio() */ + if (lru_gen_enabled() && !folio_test_unevictable(folio) && + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) + folio_set_active(folio); + folio_get(folio); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); diff --git a/mm/vmscan.c b/mm/vmscan.c index b7228b73e1b3..d75a5738d1dc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3040,6 +3040,91 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return can_demote(pgdat->node_id, sc); } +#ifdef CONFIG_LRU_GEN + +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ + +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* for hotadd_new_pgdat() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; + + return lruvec; + } +#endif + return pgdat ? &pgdat->__lruvec : NULL; +} + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_init_state(memcg, lruvec); + } +} + +void lru_gen_free_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + VM_BUG_ON(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + } +} +#endif + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + return 0; +}; +late_initcall(init_lru_gen); + +#endif /* CONFIG_LRU_GEN */ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS];