@@ -264,7 +264,8 @@ extern unsigned int kobjsize(const void *objp);
#define MM_SEAL_ALL ( \
MM_SEAL_SEAL | \
MM_SEAL_BASE | \
- MM_SEAL_PROT_PKEY)
+ MM_SEAL_PROT_PKEY | \
+ MM_SEAL_DISCARD_RO_ANON)
/*
* PROT_SEAL_ALL is all supported flags in mmap().
@@ -273,7 +274,8 @@ extern unsigned int kobjsize(const void *objp);
#define PROT_SEAL_ALL ( \
PROT_SEAL_SEAL | \
PROT_SEAL_BASE | \
- PROT_SEAL_PROT_PKEY)
+ PROT_SEAL_PROT_PKEY | \
+ PROT_SEAL_DISCARD_RO_ANON)
/*
* vm_flags in vm_area_struct, see mm_types.h.
@@ -3354,6 +3356,9 @@ extern bool can_modify_mm(struct mm_struct *mm, unsigned long start,
extern bool can_modify_vma(struct vm_area_struct *vma,
unsigned long checkSeals);
+extern bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior);
+
/*
* Convert prot field of mmap to vm_seals type.
*/
@@ -3362,9 +3367,9 @@ static inline unsigned long convert_mmap_seals(unsigned long prot)
unsigned long seals = 0;
/*
- * set SEAL_PROT_PKEY implies SEAL_BASE.
+ * Setting SEAL_PROT_PKEY or SEAL_DISCARD_RO_ANON implies SEAL_BASE.
*/
- if (prot & PROT_SEAL_PROT_PKEY)
+ if (prot & (PROT_SEAL_PROT_PKEY | PROT_SEAL_DISCARD_RO_ANON))
prot |= PROT_SEAL_BASE;
/*
@@ -3407,6 +3412,12 @@ static inline bool can_modify_vma(struct vm_area_struct *vma,
return true;
}
+/*
+ * Stub variant of can_modify_mm_madv(): performs no seal check and always
+ * reports that the madvise() request is allowed.
+ * NOTE(review): presumably this is the fallback used when memory sealing is
+ * compiled out; the guarding preprocessor conditional is outside this hunk,
+ * so confirm against the full header.
+ */
+static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
+{
+ return true;
+}
+
static inline void update_vma_seals(struct vm_area_struct *vma, unsigned long vm_seals)
{
}
@@ -29,6 +29,8 @@
#define PROT_SEAL_SEAL _BITUL(PROT_SEAL_BIT_BEGIN) /* 0x04000000 seal seal */
#define PROT_SEAL_BASE _BITUL(PROT_SEAL_BIT_BEGIN + 1) /* 0x08000000 base for all sealing types */
#define PROT_SEAL_PROT_PKEY _BITUL(PROT_SEAL_BIT_BEGIN + 2) /* 0x10000000 seal prot and pkey */
+/* seal destructive madvise for non-writeable anonymous memory. */
+#define PROT_SEAL_DISCARD_RO_ANON _BITUL(PROT_SEAL_BIT_BEGIN + 3) /* 0x20000000 */
/* 0x01 - 0x03 are defined in linux/mman.h */
#define MAP_TYPE 0x0f /* Mask for type of mapping */
@@ -58,5 +58,6 @@ struct cachestat {
#define MM_SEAL_SEAL _BITUL(0)
#define MM_SEAL_BASE _BITUL(1)
#define MM_SEAL_PROT_PKEY _BITUL(2)
+#define MM_SEAL_DISCARD_RO_ANON _BITUL(3)
#endif /* _UAPI_LINUX_MMAN_H */
@@ -1403,6 +1403,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
+ * -EACCES - memory is sealed.
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
@@ -1446,10 +1447,21 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr_remote(mm, start);
end = start + len;
+ /*
+ * Check if the address range is sealed for do_madvise().
+ * can_modify_mm_madv assumes we have acquired the lock on MM.
+ */
+ if (!can_modify_mm_madv(mm, start, end, behavior)) {
+ error = -EACCES;
+ goto out;
+ }
+
blk_start_plug(&plug);
error = madvise_walk_vmas(mm, start, end, behavior,
madvise_vma_behavior);
blk_finish_plug(&plug);
+
+out:
if (write)
mmap_write_unlock(mm);
else
@@ -11,6 +11,7 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
@@ -66,6 +67,55 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end,
return true;
}
+/*
+ * Return true if @behavior is one of the madvise() operations that
+ * MM_SEAL_DISCARD_RO_ANON is meant to block.
+ *
+ * MADV_* advice values are enumerated integers, not bit flags, so each one
+ * has to be compared for equality.  OR-ing them into a mask and testing with
+ * '&' would also match unrelated advice (with the asm-generic values the
+ * mask becomes 0x1f, which matches e.g. MADV_RANDOM == 1 and
+ * MADV_WILLNEED == 3).
+ */
+static bool is_madv_discard(int behavior)
+{
+	switch (behavior) {
+	case MADV_FREE:
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+	case MADV_REMOVE:
+	case MADV_DONTFORK:
+	case MADV_WIPEONFORK:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Return true when @vma is non-shared anonymous memory that cannot be
+ * written: either VM_WRITE is clear, or the arch access check (e.g. PKRU)
+ * does not permit a write.
+ */
+static bool is_ro_anon(struct vm_area_struct *vma)
+{
+	bool writable;
+
+	/* File-backed or shared mappings are not covered. */
+	if (vma->vm_file)
+		return false;
+	if (vma->vm_flags & VM_SHARED)
+		return false;
+
+	/*
+	 * Writable only if VM_WRITE is set and the arch check allows a
+	 * write; the arch check is consulted only when VM_WRITE is set.
+	 */
+	writable = (vma->vm_flags & VM_WRITE) &&
+		   arch_vma_access_permitted(vma, true, false, false);
+
+	return !writable;
+}
+
+/*
+ * Decide whether madvise(@behavior) may be applied to [@start, @end) of @mm.
+ * The range may contain gaps (unmapped addresses); only the VMAs found in
+ * it are inspected.
+ * Returns true if the operation is allowed, false if a VMA in the range is
+ * read-only anonymous memory sealed with MM_SEAL_DISCARD_RO_ANON.
+ */
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
+		int behavior)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, start);
+
+	/* Advice values that do not discard anything are never blocked. */
+	if (!is_madv_discard(behavior))
+		return true;
+
+	for_each_vma_range(vmi, vma, end) {
+		/* The seal only concerns read-only anonymous VMAs. */
+		if (!is_ro_anon(vma))
+			continue;
+
+		if (!can_modify_vma(vma, MM_SEAL_DISCARD_RO_ANON))
+			return false;
+	}
+
+	/* Nothing in the range objected: allow. */
+	return true;
+}
+
/*
* Check if a seal type can be added to VMA.
*/
@@ -76,6 +126,12 @@ static bool can_add_vma_seals(struct vm_area_struct *vma, unsigned long newSeals
(newSeals & ~(vma_seals(vma))))
return false;
+ /*
+ * For simplicity, we allow adding all sealing types during mmap or mseal.
+ * The actual sealing check happens later, when a particular action is
+ * attempted. E.g. MM_SEAL_DISCARD_RO_ANON can always be added; whether its
+ * sealing condition is met is checked at the time of the madvise() call.
+ */
return true;
}
@@ -225,15 +281,22 @@ static int apply_mm_seal(unsigned long start, unsigned long end,
* mprotect() and pkey_mprotect() will be denied if the memory is
* sealed with MM_SEAL_PROT_PKEY.
*
- * The MM_SEAL_SEAL
- * MM_SEAL_SEAL denies adding a new seal for an VMA.
- *
* The kernel will remember which seal types are applied, and the
* application doesn’t need to repeat all existing seal types in
* the next mseal(). Once a seal type is applied, it can’t be
* unsealed. Call mseal() on an existing seal type is a no-action,
* not a failure.
*
+ * MM_SEAL_DISCARD_RO_ANON: block some destructive madvise()
+ * behavior, such as MADV_DONTNEED, which can effectively
+ * alter region contents by discarding pages. Such operations
+ * are blocked if users don't have write access to the memory and
+ * the memory is anonymous memory.
+ * Setting this implies MM_SEAL_BASE is also set.
+ *
+ * The MM_SEAL_SEAL
+ * MM_SEAL_SEAL denies adding a new seal for a VMA.
+ *
* flags: reserved.
*
* return values:
@@ -264,8 +327,8 @@ static int do_mseal(unsigned long start, size_t len_in, unsigned long types,
struct mm_struct *mm = current->mm;
size_t len;
- /* MM_SEAL_BASE is set when other seal types are set. */
- if (types & MM_SEAL_PROT_PKEY)
+ /* MM_SEAL_BASE is set when other seal types are set */
+ if (types & (MM_SEAL_PROT_PKEY | MM_SEAL_DISCARD_RO_ANON))
types |= MM_SEAL_BASE;
if (!can_do_mseal(types, flags))