Message ID | 20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org (mailing list archive)
State | New
Series | mm: swap: mTHP swap allocator base on swap cluster order
On Wed, Aug 7, 2024 at 6:14 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Chris Li <chrisl@kernel.org> writes:
>
> > Track the nonfull clusters as well as the empty clusters on lists.
> > Each order has one nonfull cluster list.
> >
> > The cluster will remember which order it was used for during new
> > cluster allocation.
> >
> > When a cluster has a free entry, add it to the nonfull[order] list.
> > When the free cluster list is empty, also allocate from the nonfull
> > list of that order.
> >
> > This improves the mTHP swap allocation success rate.
> >
> > There are limitations if the distribution of the numbers of
> > different mTHP orders changes a lot, e.g. a lot of nonfull clusters
> > are assigned to order A while later there are a lot of order B
> > allocations and very little order A allocation. Currently a cluster
> > used by order A will not be reused by order B unless the cluster
> > becomes 100% empty.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
> > ---
> >  include/linux/swap.h |  4 ++++
> >  mm/swapfile.c        | 38 +++++++++++++++++++++++++++++++++++---
> >  2 files changed, 39 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index edafd52d7ac4..6716ef236766 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -254,9 +254,11 @@ struct swap_cluster_info {
> >  	 */
> >  	u16 count;
> >  	u8 flags;
> > +	u8 order;
> >  	struct list_head list;
> >  };
> >  #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
> > +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
> >
> >  /*
> >   * The first page in the swap file is the swap header, which is always marked
> > @@ -295,6 +297,8 @@ struct swap_info_struct {
> >  	unsigned long *zeromap;	/* vmalloc'ed bitmap to track zero pages */
> >  	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
> >  	struct list_head free_clusters; /* free clusters list */
> > +	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
> > +	/* list of cluster that contains at least one free slot */
> >  	unsigned int lowest_bit;	/* index of first free in swap_map */
> >  	unsigned int highest_bit;	/* index of last free in swap_map */
> >  	unsigned int pages;		/* total of usable pages of swap */
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index bceead7f9e3c..dcf09eb549db 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -361,14 +361,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
> >  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
> >  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> >
> > -	list_add_tail(&ci->list, &si->discard_clusters);
> > +	VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
> > +	if (ci->flags & CLUSTER_FLAG_NONFULL)
> > +		list_move_tail(&ci->list, &si->discard_clusters);
> > +	else
> > +		list_add_tail(&ci->list, &si->discard_clusters);
> > +	ci->flags = 0;
>
> As Ryan pointed out before, it's better to clear the specific bit
> instead of assigning 0. This will make code future proof.

Ack. BTW, I plan to change the bit to a list number in the future. I can
define a list bit mask for now.
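A minimal, compilable sketch of the review point above: clearing only the
relevant bit leaves any other flag bits intact, which is what makes the
code future proof. The first two values match the patch;
CLUSTER_FLAG_FUTURE is a made-up placeholder, not anything in the kernel.

#include <stdio.h>

#define CLUSTER_FLAG_FREE    1
#define CLUSTER_FLAG_NONFULL 2
#define CLUSTER_FLAG_FUTURE  4	/* hypothetical future bit */

int main(void)
{
	unsigned char flags = CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FUTURE;

	/* "flags = 0;" would also wipe CLUSTER_FLAG_FUTURE; clearing
	 * only the nonfull bit preserves the unrelated state. */
	flags &= ~CLUSTER_FLAG_NONFULL;

	printf("flags = %#x\n", flags);	/* prints 0x4 */
	return 0;
}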
>
> >  	schedule_work(&si->discard_work);
> >  }
> >
> >  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
> >  {
> > +	if (ci->flags & CLUSTER_FLAG_NONFULL)
> > +		list_move_tail(&ci->list, &si->free_clusters);
> > +	else
> > +		list_add_tail(&ci->list, &si->free_clusters);
> >  	ci->flags = CLUSTER_FLAG_FREE;
> > -	list_add_tail(&ci->list, &si->free_clusters);
> >  }
> >
> >  /*
> > @@ -491,8 +499,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
> >  	VM_BUG_ON(ci->count == 0);
> >  	ci->count--;
> >
> > -	if (!ci->count)
> > +	if (!ci->count) {
> >  		free_cluster(p, ci);
> > +		return;
> > +	}
> > +
> > +	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
> > +		list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
> > +		ci->flags |= CLUSTER_FLAG_NONFULL;
> > +	}
> >  }
> >
> >  /*
> > @@ -553,6 +568,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> >  	if (tmp == SWAP_NEXT_INVALID) {
> >  		if (!list_empty(&si->free_clusters)) {
> >  			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> > +			list_del(&ci->list);
> > +			spin_lock(&ci->lock);
> > +			ci->order = order;
> > +			ci->flags = 0;
> > +			spin_unlock(&ci->lock);
> > +			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
> > +		} else if (!list_empty(&si->nonfull_clusters[order])) {
> > +			ci = list_first_entry(&si->nonfull_clusters[order],
> > +					      struct swap_cluster_info, list);
> > +			list_del(&ci->list);
> > +			spin_lock(&ci->lock);
> > +			ci->flags = 0;
> > +			spin_unlock(&ci->lock);
> >  			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
> >  		} else if (!list_empty(&si->discard_clusters)) {
>
> We should check discard_clusters before nonfull clusters.

And the reason behind that is?

I see that discard_cluster can take a long time. It requires a
synchronous wait while the discard command is issued. Why not just use
the nonfull list and return immediately? When the discard command
finishes, the cluster will show up in the free list anyway.

BTW, what is your take on my previous analysis that the current
preference for writing new clusters can wear out the SSD faster?

I think it might be useful to provide users an option to write to the
nonfull list first. The trade-off is friendlier to SSD wear than
preferring to write new blocks. If you keep swapping long enough, there
will be no new free cluster anyway.

The example I give in this email:

https://lore.kernel.org/linux-mm/CACePvbXGBNC9WzzL4s2uB2UciOkV6nb4bKKkc5TBZP6QuHS_aQ@mail.gmail.com/

Chris

>
> >  		/*
> > @@ -967,6 +995,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >  	ci = lock_cluster(si, offset);
> >  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> >  	ci->count = 0;
> > +	ci->order = 0;
> >  	ci->flags = 0;
> >  	free_cluster(si, ci);
> >  	unlock_cluster(ci);
> > @@ -2922,6 +2951,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >  	INIT_LIST_HEAD(&p->free_clusters);
> >  	INIT_LIST_HEAD(&p->discard_clusters);
> >
> > +	for (i = 0; i < SWAP_NR_ORDERS; i++)
> > +		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
> > +
> >  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >  		unsigned int page_nr = swap_header->info.badpages[i];
> >  		if (page_nr == 0 || page_nr > swap_header->info.last_page)
>
> --
> Best Regards,
> Huang, Ying
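To make the ordering question above concrete, here is a toy,
self-contained model of the fallback being argued for: free list first,
then the per-order nonfull list, never waiting synchronously on discard.
This is an illustrative sketch only; the real logic is the
scan_swap_map_try_ssd_cluster() hunk quoted above, and the names here are
made up.

#include <stdbool.h>
#include <stdio.h>

enum pick { PICK_FREE, PICK_NONFULL, PICK_NONE };

/* Choose where the next cluster comes from. Clusters queued for
 * discard are not waited on; once the (potentially slow) discard
 * completes, they reappear on the free list and get picked up on a
 * later call. */
static enum pick pick_cluster(int nr_free, int nr_nonfull, bool discard_pending)
{
	if (nr_free > 0)
		return PICK_FREE;	/* brand-new empty cluster */
	if (nr_nonfull > 0)
		return PICK_NONFULL;	/* partly used cluster of this order */
	(void)discard_pending;		/* do not block on it here */
	return PICK_NONE;
}

int main(void)
{
	/* No free clusters, three nonfull ones, a discard in flight. */
	printf("%d\n", pick_cluster(0, 3, true));	/* 1 == PICK_NONFULL */
	return 0;
}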
Chris Li <chrisl@kernel.org> writes:

> On Wed, Aug 7, 2024 at 6:14 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Chris Li <chrisl@kernel.org> writes:
>> [snip]
>> >
>> >  /*
>> > @@ -553,6 +568,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
>> >  	if (tmp == SWAP_NEXT_INVALID) {
>> >  		if (!list_empty(&si->free_clusters)) {
>> >  			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
>> > +			list_del(&ci->list);
>> > +			spin_lock(&ci->lock);
>> > +			ci->order = order;
>> > +			ci->flags = 0;
>> > +			spin_unlock(&ci->lock);
>> > +			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
>> > +		} else if (!list_empty(&si->nonfull_clusters[order])) {
>> > +			ci = list_first_entry(&si->nonfull_clusters[order],
>> > +					      struct swap_cluster_info, list);
>> > +			list_del(&ci->list);
>> > +			spin_lock(&ci->lock);
>> > +			ci->flags = 0;
>> > +			spin_unlock(&ci->lock);
>> >  			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
>> >  		} else if (!list_empty(&si->discard_clusters)) {
>>
>> We should check discard_clusters before nonfull clusters.
>
> And the reason behind that is?
>
> I see that discard_cluster can take a long time. It requires a
> synchronous wait while the discard command is issued. Why not just use
> the nonfull list and return immediately? When the discard command
> finishes, the cluster will show up in the free list anyway.

I think that you are right. We don't need to wait for discard here.

> BTW, what is your take on my previous analysis that the current
> preference for writing new clusters can wear out the SSD faster?

No. I don't agree with you on that. However, my knowledge of SSD
wear-out algorithms is quite limited.

> I think it might be useful to provide users an option to write to the
> nonfull list first. The trade-off is friendlier to SSD wear than
> preferring to write new blocks. If you keep swapping long enough, there
> will be no new free cluster anyway.

It depends on workloads. Some workloads may demonstrate better spatial
locality.

> The example I give in this email:
>
> https://lore.kernel.org/linux-mm/CACePvbXGBNC9WzzL4s2uB2UciOkV6nb4bKKkc5TBZP6QuHS_aQ@mail.gmail.com/
>
> Chris
>>
>> >  		/*
>> > @@ -967,6 +995,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>> >  	ci = lock_cluster(si, offset);
>> >  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
>> >  	ci->count = 0;
>> > +	ci->order = 0;
>> >  	ci->flags = 0;
>> >  	free_cluster(si, ci);
>> >  	unlock_cluster(ci);
>> > @@ -2922,6 +2951,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
>> >  	INIT_LIST_HEAD(&p->free_clusters);
>> >  	INIT_LIST_HEAD(&p->discard_clusters);
>> >
>> > +	for (i = 0; i < SWAP_NR_ORDERS; i++)
>> > +		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
>> > +
>> >  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
>> >  		unsigned int page_nr = swap_header->info.badpages[i];
>> >  		if (page_nr == 0 || page_nr > swap_header->info.last_page)

--
Best Regards,
Huang, Ying
On Mon, Aug 19, 2024 at 1:11 AM Huang, Ying <ying.huang@intel.com> wrote:
> > BTW, what is your take on my previous analysis that the current
> > preference for writing new clusters can wear out the SSD faster?
>
> No. I don't agree with you on that. However, my knowledge of SSD
> wear-out algorithms is quite limited.

Hi Ying,

Can you please clarify? You said you have limited knowledge of SSD wear
internals. Does that mean you have low confidence in your verdict?

I would like to understand your reasoning for the disagreement, starting
from which part of my analysis you disagree with.

At the same time, we can consult someone who works in the SSD space to
understand SSD internal wear better.

I see this as a serious issue for using SSDs as swap in data center use
cases. In your laptop use case, you are not running LLM training 24/7,
right? So it still fits the usage model of an occasional user of the
swap file, and it might not be as big a deal. In a data center workload,
e.g. Google's, swap is written 24/7, and the amount of data swapped out
is much higher than typical laptop usage as well. There the SSD wear
issue is much bigger, because the SSD is under constant write with much
larger swap usage.

I am claiming that *some* SSDs would have a higher internal write
amplification factor when doing random 4K writes all over the drive than
when doing random 4K writes to a small area of the drive. I do believe
having a different swap-out policy controlling the preference for old
vs. new clusters is beneficial to the data center SSD swap usage case.

It comes down to:

1) SSDs are slow to erase, so for performance most SSDs erase at a huge
   erase-block size.
2) The SSD remaps logical block addresses to internal erase blocks. Most
   newly written data, regardless of its logical block address on the
   drive, is grouped together and written to an erase block.
3) When new data overwrites an old logical block address, the SSD
   firmware marks the overwritten data as obsolete. The discard command
   has a similar effect without introducing new data.
4) When the SSD runs out of fresh erase blocks, it needs to GC the old
   fragmented erase blocks and partially rewrite the old data to make
   room for new erase blocks. This is where the discard command can be
   beneficial: it tells the SSD firmware which parts of the old data the
   GC process can simply ignore and skip rewriting.

GC of obsolete logical blocks is a generally hard problem for SSDs.

I am not claiming every SSD has this kind of behavior, but it is common
enough to be worth providing an option.

> > I think it might be useful to provide users an option to write to
> > the nonfull list first. The trade-off is friendlier to SSD wear than
> > preferring to write new blocks. If you keep swapping long enough,
> > there will be no new free cluster anyway.
>
> It depends on workloads. Some workloads may demonstrate better spatial
> locality.

Yes, I agree that it may or may not happen depending on the workload.
The random distribution of swap entries is a common pattern we need to
consider as well, and the odds are against us: as in the quoted email
where I did the calculation, the odds of getting a whole cluster free in
the random model are very low, about 4.4e-15, even if we are only using
1/16 of the swap entries in the swapfile.
Chris

>
> > The example I give in this email:
> >
> > https://lore.kernel.org/linux-mm/CACePvbXGBNC9WzzL4s2uB2UciOkV6nb4bKKkc5TBZP6QuHS_aQ@mail.gmail.com/
> >
> > Chris
> >>
> >> >  		/*
> >> > @@ -967,6 +995,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> >> >  	ci = lock_cluster(si, offset);
> >> >  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> >> >  	ci->count = 0;
> >> > +	ci->order = 0;
> >> >  	ci->flags = 0;
> >> >  	free_cluster(si, ci);
> >> >  	unlock_cluster(ci);
> >> > @@ -2922,6 +2951,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
> >> >  	INIT_LIST_HEAD(&p->free_clusters);
> >> >  	INIT_LIST_HEAD(&p->discard_clusters);
> >> >
> >> > +	for (i = 0; i < SWAP_NR_ORDERS; i++)
> >> > +		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
> >> > +
> >> >  	for (i = 0; i < swap_header->info.nr_badpages; i++) {
> >> >  		unsigned int page_nr = swap_header->info.badpages[i];
> >> >  		if (page_nr == 0 || page_nr > swap_header->info.last_page)
>
> --
> Best Regards,
> Huang, Ying
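As a sanity check of the 4.4e-15 figure above, here is a small
back-of-the-envelope program. It assumes SWAPFILE_CLUSTER = 512 slots per
cluster (THP_SWAP with 4 KiB pages and 2 MiB PMDs) and slots occupied
independently and uniformly at 1/16 utilization; both are assumptions
read from the thread, not something stated in the patch.

#include <math.h>
#include <stdio.h>

int main(void)
{
	const int slots_per_cluster = 512;
	const double p_used = 1.0 / 16.0;

	/* Probability that every slot of one cluster is free. */
	double p_all_free = pow(1.0 - p_used, slots_per_cluster);

	printf("P(cluster fully free) ~= %.1e\n", p_all_free);	/* ~4.5e-15 */
	return 0;
}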
Chris Li <chrisl@kernel.org> writes:

> On Mon, Aug 19, 2024 at 1:11 AM Huang, Ying <ying.huang@intel.com> wrote:
>> > BTW, what is your take on my previous analysis that the current
>> > preference for writing new clusters can wear out the SSD faster?
>>
>> No. I don't agree with you on that. However, my knowledge of SSD
>> wear-out algorithms is quite limited.
>
> Hi Ying,
>
> Can you please clarify? You said you have limited knowledge of SSD wear
> internals. Does that mean you have low confidence in your verdict?

Yes.

> I would like to understand your reasoning for the disagreement,
> starting from which part of my analysis you disagree with.
>
> At the same time, we can consult someone who works in the SSD space to
> understand SSD internal wear better.

I think that is a good idea.

> I see this as a serious issue for using SSDs as swap in data center use
> cases. In your laptop use case, you are not running LLM training 24/7,
> right? So it still fits the usage model of an occasional user of the
> swap file, and it might not be as big a deal. In a data center
> workload, e.g. Google's, swap is written 24/7, and the amount of data
> swapped out is much higher than typical laptop usage as well. There the
> SSD wear issue is much bigger, because the SSD is under constant write
> with much larger swap usage.
>
> I am claiming that *some* SSDs would have a higher internal write
> amplification factor when doing random 4K writes all over the drive
> than when doing random 4K writes to a small area of the drive. I do
> believe having a different swap-out policy controlling the preference
> for old vs. new clusters is beneficial to the data center SSD swap
> usage case.
>
> It comes down to:
>
> 1) SSDs are slow to erase, so for performance most SSDs erase at a
>    huge erase-block size.
> 2) The SSD remaps logical block addresses to internal erase blocks.
>    Most newly written data, regardless of its logical block address on
>    the drive, is grouped together and written to an erase block.
> 3) When new data overwrites an old logical block address, the SSD
>    firmware marks the overwritten data as obsolete. The discard
>    command has a similar effect without introducing new data.
> 4) When the SSD runs out of fresh erase blocks, it needs to GC the old
>    fragmented erase blocks and partially rewrite the old data to make
>    room for new erase blocks. This is where the discard command can be
>    beneficial: it tells the SSD firmware which parts of the old data
>    the GC process can simply ignore and skip rewriting.
>
> GC of obsolete logical blocks is a generally hard problem for SSDs.
>
> I am not claiming every SSD has this kind of behavior, but it is
> common enough to be worth providing an option.
>
>> > I think it might be useful to provide users an option to write to
>> > the nonfull list first. The trade-off is friendlier to SSD wear
>> > than preferring to write new blocks. If you keep swapping long
>> > enough, there will be no new free cluster anyway.
>>
>> It depends on workloads. Some workloads may demonstrate better spatial
>> locality.
>
> Yes, I agree that it may or may not happen depending on the workload.
> The random distribution of swap entries is a common pattern we need to
> consider as well, and the odds are against us: as in the quoted email
> where I did the calculation, the odds of getting a whole cluster free
> in the random model are very low, about 4.4e-15, even if we are only
> using 1/16 of the swap entries in the swapfile.

Do you have real workloads?
For example, some trace?

--
Best Regards,
Huang, Ying
diff --git a/include/linux/swap.h b/include/linux/swap.h
index edafd52d7ac4..6716ef236766 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,9 +254,11 @@ struct swap_cluster_info {
 	 */
 	u16 count;
 	u8 flags;
+	u8 order;
 	struct list_head list;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
+#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -295,6 +297,8 @@ struct swap_info_struct {
 	unsigned long *zeromap;	/* vmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
+	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
+	/* list of cluster that contains at least one free slot */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bceead7f9e3c..dcf09eb549db 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -361,14 +361,22 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
 	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
 			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
 
-	list_add_tail(&ci->list, &si->discard_clusters);
+	VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
+	if (ci->flags & CLUSTER_FLAG_NONFULL)
+		list_move_tail(&ci->list, &si->discard_clusters);
+	else
+		list_add_tail(&ci->list, &si->discard_clusters);
+	ci->flags = 0;
 	schedule_work(&si->discard_work);
 }
 
 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
+	if (ci->flags & CLUSTER_FLAG_NONFULL)
+		list_move_tail(&ci->list, &si->free_clusters);
+	else
+		list_add_tail(&ci->list, &si->free_clusters);
 	ci->flags = CLUSTER_FLAG_FREE;
-	list_add_tail(&ci->list, &si->free_clusters);
 }
 
 /*
@@ -491,8 +499,15 @@ static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluste
 	VM_BUG_ON(ci->count == 0);
 	ci->count--;
 
-	if (!ci->count)
+	if (!ci->count) {
 		free_cluster(p, ci);
+		return;
+	}
+
+	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
+		list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+		ci->flags |= CLUSTER_FLAG_NONFULL;
+	}
 }
 
 /*
@@ -553,6 +568,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	if (tmp == SWAP_NEXT_INVALID) {
 		if (!list_empty(&si->free_clusters)) {
 			ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+			list_del(&ci->list);
+			spin_lock(&ci->lock);
+			ci->order = order;
+			ci->flags = 0;
+			spin_unlock(&ci->lock);
+			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
+		} else if (!list_empty(&si->nonfull_clusters[order])) {
+			ci = list_first_entry(&si->nonfull_clusters[order],
+					      struct swap_cluster_info, list);
+			list_del(&ci->list);
+			spin_lock(&ci->lock);
+			ci->flags = 0;
+			spin_unlock(&ci->lock);
 			tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
 		} else if (!list_empty(&si->discard_clusters)) {
 			/*
@@ -967,6 +995,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	ci = lock_cluster(si, offset);
 	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
 	ci->count = 0;
+	ci->order = 0;
 	ci->flags = 0;
 	free_cluster(si, ci);
 	unlock_cluster(ci);
@@ -2922,6 +2951,9 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	INIT_LIST_HEAD(&p->free_clusters);
 	INIT_LIST_HEAD(&p->discard_clusters);
 
+	for (i = 0; i < SWAP_NR_ORDERS; i++)
+		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)
Track the nonfull clusters as well as the empty clusters on lists. Each
order has one nonfull cluster list.

The cluster will remember which order it was used for during new cluster
allocation.

When a cluster has a free entry, add it to the nonfull[order] list. When
the free cluster list is empty, also allocate from the nonfull list of
that order.

This improves the mTHP swap allocation success rate.

There are limitations if the distribution of the numbers of different
mTHP orders changes a lot, e.g. a lot of nonfull clusters are assigned to
order A while later there are a lot of order B allocations and very
little order A allocation. Currently a cluster used by order A will not
be reused by order B unless the cluster becomes 100% empty.

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 include/linux/swap.h |  4 ++++
 mm/swapfile.c        | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 3 deletions(-)
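For readers skimming the patch, here is a toy, self-contained model of
the bookkeeping the description above introduces: when a slot is freed in
a cluster that is neither empty nor already listed, the cluster is parked
on the nonfull list of its order so later same-order allocations can
reuse it. The array-based lists and names are illustrative only, not the
kernel's list_head machinery.

#include <stdio.h>

#define TOY_NR_ORDERS   4
#define TOY_NR_CLUSTERS 8

struct toy_cluster {
	int count;		/* allocated slots in this cluster */
	int order;		/* order this cluster was handed out for */
	int on_nonfull;		/* already on its nonfull list? */
};

static struct toy_cluster clusters[TOY_NR_CLUSTERS];
static int nonfull[TOY_NR_ORDERS][TOY_NR_CLUSTERS];
static int nonfull_len[TOY_NR_ORDERS];

/* Mirrors the idea of the dec_cluster_info_page() hunk above. Unlinking
 * an emptied cluster from its nonfull list is elided in this sketch. */
static void toy_free_slot(int idx)
{
	struct toy_cluster *ci = &clusters[idx];

	ci->count--;
	if (ci->count == 0)
		return;		/* would move back to the free list */
	if (!ci->on_nonfull) {
		nonfull[ci->order][nonfull_len[ci->order]++] = idx;
		ci->on_nonfull = 1;
	}
}

int main(void)
{
	clusters[3] = (struct toy_cluster){ .count = 2, .order = 2 };
	toy_free_slot(3);
	printf("order-2 nonfull list length: %d\n", nonfull_len[2]);	/* 1 */
	return 0;
}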