@@ -709,6 +709,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#endif
#ifdef CONFIG_64BIT
[ilog2(VM_SEALED)] = "sl",
+#endif
+#ifdef CONFIG_NEED_VM_DROPPABLE
+ [ilog2(VM_DROPPABLE)] = "dp",
#endif
};
size_t i;
@@ -321,12 +321,14 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -357,6 +359,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_SHADOW_STACK VM_NONE
#endif
+#ifdef CONFIG_NEED_VM_DROPPABLE
+# define VM_DROPPABLE VM_HIGH_ARCH_6
+#else
+# define VM_DROPPABLE VM_NONE
+#endif
+
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
@@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3)
# define IF_HAVE_UFFD_MINOR(flag, name)
#endif
+#ifdef CONFIG_NEED_VM_DROPPABLE
+# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name},
+#else
+# define IF_HAVE_VM_DROPPABLE(flag, name)
+#endif
+
#define __def_vmaflag_names \
{VM_READ, "read" }, \
{VM_WRITE, "write" }, \
@@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \
{VM_HUGEPAGE, "hugepage" }, \
{VM_NOHUGEPAGE, "nohugepage" }, \
+IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
{VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \
@@ -1056,6 +1056,9 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
bool
+config NEED_VM_DROPPABLE
+ select ARCH_USES_HIGH_VMA_FLAGS
+ bool
config ARCH_USES_PG_ARCH_X
bool
@@ -623,7 +623,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
may_expand_vm(mm, oldflags, nrpages))
return -ENOMEM;
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
- VM_SHARED|VM_NORESERVE))) {
+ VM_SHARED|VM_NORESERVE|VM_DROPPABLE))) {
charged = nrpages;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
@@ -1397,7 +1397,10 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_BUG_ON_VMA(address < vma->vm_start ||
address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
- __folio_set_swapbacked(folio);
+ /* VM_DROPPABLE mappings don't swap; instead they're just dropped when
+ * under memory pressure. */
+ if (!(vma->vm_flags & VM_DROPPABLE))
+ __folio_set_swapbacked(folio);
__folio_set_anon(folio, vma, address, true);
if (likely(!folio_test_large(folio))) {
@@ -1841,7 +1844,11 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* plus the rmap(s) (dropped by discard:).
*/
if (ref_count == 1 + map_count &&
- !folio_test_dirty(folio)) {
+ (!folio_test_dirty(folio) ||
+ /* Unlike MADV_FREE mappings, VM_DROPPABLE
+ * ones can be dropped even if they've
+ * been dirtied. */
+ (vma->vm_flags & VM_DROPPABLE))) {
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
@@ -1851,7 +1858,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* discarded. Remap the page to page table.
*/
set_pte_at(mm, address, pvmw.pte, pteval);
- folio_set_swapbacked(folio);
+ /* Unlike MADV_FREE mappings, VM_DROPPABLE ones
+ * never get swap backed on failure to drop. */
+ if (!(vma->vm_flags & VM_DROPPABLE))
+ folio_set_swapbacked(folio);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
The vDSO getrandom() implementation works with a buffer allocated with a new system call that has certain requirements: - It shouldn't be written to core dumps. * Easy: VM_DONTDUMP. - It should be zeroed on fork. * Easy: VM_WIPEONFORK. - It shouldn't be written to swap. * Uh-oh: mlock is rlimited. * Uh-oh: mlock isn't inherited by forks. It turns out that the vDSO getrandom() function has three really nice characteristics that we can exploit to solve this problem: 1) Due to being wiped during fork(), the vDSO code is already robust to having the contents of the pages it reads zeroed out midway through the function's execution. 2) In the absolute worst case of whatever contingency we're coding for, we have the option to fallback to the getrandom() syscall, and everything is fine. 3) The buffers the function uses are only ever useful for a maximum of 60 seconds -- a sort of cache, rather than a long term allocation. These characteristics mean that we can introduce VM_DROPPABLE, which has the following semantics: a) It never is written out to swap. b) Under memory pressure, mm can just drop the pages (so that they're zero when read back again). c) It is inherited by fork. d) It doesn't count against the mlock budget, since nothing is locked. This is fairly simple to implement, with the one snag that we have to use 64-bit VM_* flags, but this shouldn't be a problem, since the only consumers will probably be 64-bit anyway. This way, allocations used by vDSO getrandom() can use: VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE And there will be no problem with using memory when not in use, not wiping on fork(), coredumps, or writing out to swap. When this functionality is exposed via vgetrandom_alloc() later in this series, a userspace selftest is added that verifies this is working as intended. Cc: linux-mm@kvack.org Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> --- fs/proc/task_mmu.c | 3 +++ include/linux/mm.h | 8 ++++++++ include/trace/events/mmflags.h | 7 +++++++ mm/Kconfig | 3 +++ mm/mprotect.c | 2 +- mm/rmap.c | 16 +++++++++++++--- 6 files changed, 35 insertions(+), 4 deletions(-)