Message ID | 20241112083414.78174-1-ryncsn@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm, swap: fix allocation and scanning race with swapoff | expand |
Kairui Song <ryncsn@gmail.com> writes: > From: Kairui Song <kasong@tencent.com> > > There are two flags used to synchronize allocation and scanning with > swapoff: SWP_WRITEOK and SWP_SCANNING. > > SWP_WRITEOK: Swapoff will first unset this flag, at this point any > further swap allocation or scanning on this device should just abort > so no more new entries will be referencing this device. Swapoff > will then unuse all existing swap entries. > > SWP_SCANNING: This flag is set when the device is being scanned. Swapoff > will wait for all scanners to stop before the final release of the swap > device structures to avoid UAF. Note this flag is the highest used bit > of si->flags so it could be added up arithmetically, if there are > multiple scanners. > > commit 5f843a9a3a1e ("mm: swap: separate SSD allocation from > scan_swap_map_slots()") ignored the SWP_SCANNING and SWP_WRITEOK flags while > separating the cluster allocation path from the old allocation path. Add > the flags back to fix the swapoff race. The race is hard to trigger as > si->lock prevents most parallel operations, but si->lock could be > dropped for reclaim or discard. This issue was found during code review. > > This commit fixes this problem. For SWP_SCANNING, just like before, > set the flag before the scan and remove it afterwards. > > For SWP_WRITEOK, there are several places where si->lock could > be dropped; it would be error-prone and make the code hard to follow > if we tried to cover these places one by one. So just do one check before > the real allocation, which is also very similar to before. > With the new cluster allocator it may waste a bit of time iterating > the clusters, but that won't take long, and swapoff is not performance > sensitive. 
> > Reported-by: "Huang, Ying" <ying.huang@intel.com> > Closes: https://lore.kernel.org/linux-mm/87a5es3f1f.fsf@yhuang6-desk2.ccr.corp.intel.com/ > Fixes: 5f843a9a3a1e ("mm: swap: separate SSD allocation from scan_swap_map_slots()") > Signed-off-by: Kairui Song <kasong@tencent.com> > --- > mm/swapfile.c | 22 +++++++++++++++++++--- > 1 file changed, 19 insertions(+), 3 deletions(-) > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 9c85bd46ab7f..b0a9071cfe1d 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -664,12 +664,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, > return true; > } > > -static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, > +static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, > unsigned int start, unsigned char usage, > unsigned int order) > { > unsigned int nr_pages = 1 << order; > > + if (!(si->flags & SWP_WRITEOK)) > + return false; > + > if (cluster_is_free(ci)) { > if (nr_pages < SWAPFILE_CLUSTER) { > list_move_tail(&ci->list, &si->nonfull_clusters[order]); > @@ -690,6 +693,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster > list_move_tail(&ci->list, &si->full_clusters); > ci->flags = CLUSTER_FLAG_FULL; > } > + > + return true; > } > > static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, > @@ -713,7 +718,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne > > while (offset <= end) { > if (cluster_scan_range(si, ci, offset, nr_pages)) { > - cluster_alloc_range(si, ci, offset, usage, order); > + if (!cluster_alloc_range(si, ci, offset, usage, order)) { > + offset = SWAP_NEXT_INVALID; We set offset to SWAP_NEXT_INVALID for 3 times in this function. Can we use a local variable to remove the duplication? For example, unsigned long ret = SWAP_NEXT_INVALID; And set ret if we allocate swap entry successfully. 
We can do this in a separate patch. Otherwise, LGTM, Thanks! Feel free to add, Reviewed-by: "Huang, Ying" <ying.huang@intel.com> > + goto done; > + } > *foundp = offset; > if (ci->count == SWAPFILE_CLUSTER) { > offset = SWAP_NEXT_INVALID; > @@ -805,7 +813,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o > if (!list_empty(&si->free_clusters)) { > ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); > offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); > - VM_BUG_ON(!found); > + /* > + * Either we didn't touch the cluster due to swapoff, > + * or the allocation must success. > + */ > + VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); > goto done; > } > > @@ -1041,6 +1053,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, > > VM_BUG_ON(!si->cluster_info); > > + si->flags += SWP_SCANNING; > + > while (n_ret < nr) { > unsigned long offset = cluster_alloc_swap_entry(si, order, usage); > > @@ -1049,6 +1063,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, > slots[n_ret++] = swp_entry(si->type, offset); > } > > + si->flags -= SWP_SCANNING; > + > return n_ret; > } -- Best Regards, Huang, Ying
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c85bd46ab7f..b0a9071cfe1d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -664,12 +664,15 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } -static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, +static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) { unsigned int nr_pages = 1 << order; + if (!(si->flags & SWP_WRITEOK)) + return false; + if (cluster_is_free(ci)) { if (nr_pages < SWAPFILE_CLUSTER) { list_move_tail(&ci->list, &si->nonfull_clusters[order]); @@ -690,6 +693,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster list_move_tail(&ci->list, &si->full_clusters); ci->flags = CLUSTER_FLAG_FULL; } + + return true; } static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, @@ -713,7 +718,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne while (offset <= end) { if (cluster_scan_range(si, ci, offset, nr_pages)) { - cluster_alloc_range(si, ci, offset, usage, order); + if (!cluster_alloc_range(si, ci, offset, usage, order)) { + offset = SWAP_NEXT_INVALID; + goto done; + } *foundp = offset; if (ci->count == SWAPFILE_CLUSTER) { offset = SWAP_NEXT_INVALID; @@ -805,7 +813,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (!list_empty(&si->free_clusters)) { ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - VM_BUG_ON(!found); + /* + * Either we didn't touch the cluster due to swapoff, + * or the allocation must success. 
+ */ + VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); goto done; } @@ -1041,6 +1053,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, VM_BUG_ON(!si->cluster_info); + si->flags += SWP_SCANNING; + while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); @@ -1049,6 +1063,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si, slots[n_ret++] = swp_entry(si->type, offset); } + si->flags -= SWP_SCANNING; + return n_ret; }