diff mbox series

[v3,kvm/queue,01/16] mm/shmem: Introduce F_SEAL_INACCESSIBLE

Message ID 20211223123011.41044-2-chao.p.peng@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng Dec. 23, 2021, 12:29 p.m. UTC
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

Introduce a new seal, F_SEAL_INACCESSIBLE, indicating that the content
of the file is inaccessible from userspace by any means, such as
read(), write() or mmap().

It provides the semantics required for KVM guest private memory
support: a file descriptor with this seal set is going to be used as
the source of guest memory in confidential computing environments such
as Intel TDX/AMD SEV, but may not be accessible from host userspace.

At this time only shmem implements this seal.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/uapi/linux/fcntl.h |  1 +
 mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

Comments

David Hildenbrand Jan. 4, 2022, 2:22 p.m. UTC | #1
On 23.12.21 13:29, Chao Peng wrote:
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> 
> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> the file is inaccessible from userspace in any possible ways like
> read(),write() or mmap() etc.
> 
> It provides semantics required for KVM guest private memory support
> that a file descriptor with this seal set is going to be used as the
> source of guest memory in confidential computing environments such
> as Intel TDX/AMD SEV but may not be accessible from host userspace.
> 
> At this time only shmem implements this seal.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  include/uapi/linux/fcntl.h |  1 +
>  mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++++++--
>  2 files changed, 36 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..e2bad051936f 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -43,6 +43,7 @@
>  #define F_SEAL_GROW	0x0004	/* prevent file from growing */
>  #define F_SEAL_WRITE	0x0008	/* prevent writes */
>  #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
> +#define F_SEAL_INACCESSIBLE	0x0020  /* prevent file from accessing */

I think this needs more clarification: the file content can still be
accessed using in-kernel mechanisms such as MEMFD_OPS for KVM. It
effectively disallows traditional access to a file (read/write/mmap)
that will result in ordinary MMU access to file content.

Not sure how to best clarify that: maybe, prevent ordinary MMU access
(e.g., read/write/mmap) to file content?

>  /* (1U << 31) is reserved for signed error codes */
>  
>  /*
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 18f93c2d68f1..faa7e9b1b9bc 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1098,6 +1098,10 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
>  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
>  			return -EPERM;
>  
> +		if ((info->seals & F_SEAL_INACCESSIBLE) &&
> +		    (newsize & ~PAGE_MASK))
> +			return -EINVAL;
> +

What happens when sealing and there are existing mmaps?
Chao Peng Jan. 6, 2022, 1:06 p.m. UTC | #2
On Tue, Jan 04, 2022 at 03:22:07PM +0100, David Hildenbrand wrote:
> On 23.12.21 13:29, Chao Peng wrote:
> > From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > 
> > Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
> > the file is inaccessible from userspace in any possible ways like
> > read(),write() or mmap() etc.
> > 
> > It provides semantics required for KVM guest private memory support
> > that a file descriptor with this seal set is going to be used as the
> > source of guest memory in confidential computing environments such
> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> > 
> > At this time only shmem implements this seal.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  include/uapi/linux/fcntl.h |  1 +
> >  mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++++++--
> >  2 files changed, 36 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > index 2f86b2ad6d7e..e2bad051936f 100644
> > --- a/include/uapi/linux/fcntl.h
> > +++ b/include/uapi/linux/fcntl.h
> > @@ -43,6 +43,7 @@
> >  #define F_SEAL_GROW	0x0004	/* prevent file from growing */
> >  #define F_SEAL_WRITE	0x0008	/* prevent writes */
> >  #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
> > +#define F_SEAL_INACCESSIBLE	0x0020  /* prevent file from accessing */
> 
> I think this needs more clarification: the file content can still be
> accessed using in-kernel mechanisms such as MEMFD_OPS for KVM. It
> effectively disallows traditional access to a file (read/write/mmap)
> that will result in ordinary MMU access to file content.
> 
> Not sure how to best clarify that: maybe, prevent ordinary MMU access
> (e.g., read/write/mmap) to file content?

Or: prevent userspace access (e.g., read/write/mmap) to file content?
> 
> >  /* (1U << 31) is reserved for signed error codes */
> >  
> >  /*
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 18f93c2d68f1..faa7e9b1b9bc 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1098,6 +1098,10 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
> >  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
> >  			return -EPERM;
> >  
> > +		if ((info->seals & F_SEAL_INACCESSIBLE) &&
> > +		    (newsize & ~PAGE_MASK))
> > +			return -EINVAL;
> > +
> 
> What happens when sealing and there are existing mmaps?

I think this is similar to ftruncate: in either case we just allow it.
The existing mmaps will be unmapped, and KVM will be notified to
invalidate the mapping in the secondary MMU as well. This assumes we
trust userspace even though it cannot access the file content.

Thanks,
Chao
> 
> 
> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand Jan. 13, 2022, 3:56 p.m. UTC | #3
On 06.01.22 14:06, Chao Peng wrote:
> On Tue, Jan 04, 2022 at 03:22:07PM +0100, David Hildenbrand wrote:
>> On 23.12.21 13:29, Chao Peng wrote:
>>> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>>>
>>> Introduce a new seal F_SEAL_INACCESSIBLE indicating the content of
>>> the file is inaccessible from userspace in any possible ways like
>>> read(),write() or mmap() etc.
>>>
>>> It provides semantics required for KVM guest private memory support
>>> that a file descriptor with this seal set is going to be used as the
>>> source of guest memory in confidential computing environments such
>>> as Intel TDX/AMD SEV but may not be accessible from host userspace.
>>>
>>> At this time only shmem implements this seal.
>>>
>>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>>> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
>>> ---
>>>  include/uapi/linux/fcntl.h |  1 +
>>>  mm/shmem.c                 | 37 +++++++++++++++++++++++++++++++++++--
>>>  2 files changed, 36 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
>>> index 2f86b2ad6d7e..e2bad051936f 100644
>>> --- a/include/uapi/linux/fcntl.h
>>> +++ b/include/uapi/linux/fcntl.h
>>> @@ -43,6 +43,7 @@
>>>  #define F_SEAL_GROW	0x0004	/* prevent file from growing */
>>>  #define F_SEAL_WRITE	0x0008	/* prevent writes */
>>>  #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
>>> +#define F_SEAL_INACCESSIBLE	0x0020  /* prevent file from accessing */
>>
>> I think this needs more clarification: the file content can still be
>> accessed using in-kernel mechanisms such as MEMFD_OPS for KVM. It
>> effectively disallows traditional access to a file (read/write/mmap)
>> that will result in ordinary MMU access to file content.
>>
>> Not sure how to best clarify that: maybe, prevent ordinary MMU access
>> (e.g., read/write/mmap) to file content?
> 
> Or: prevent userspace access (e.g., read/write/mmap) to file content?

The issue with that phrasing is that userspace will be able to access
that content, just via a different mechanism eventually ... e.g., via
the KVM MMU indirectly. If that makes it clearer what I mean :)

>>
>>>  /* (1U << 31) is reserved for signed error codes */
>>>  
>>>  /*
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 18f93c2d68f1..faa7e9b1b9bc 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -1098,6 +1098,10 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
>>>  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
>>>  			return -EPERM;
>>>  
>>> +		if ((info->seals & F_SEAL_INACCESSIBLE) &&
>>> +		    (newsize & ~PAGE_MASK))
>>> +			return -EINVAL;
>>> +
>>
>> What happens when sealing and there are existing mmaps?
> 
> I think this is similar to ftruncate, in either case we just allow that.
> The existing mmaps will be unmapped and KVM will be notified to
> invalidate the mapping in the secondary MMU as well. This assume we
> trust the userspace even though it can not access the file content.

Can't we simply check+forbid instead?
diff mbox series

Patch

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..e2bad051936f 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@ 
 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
 #define F_SEAL_WRITE	0x0008	/* prevent writes */
 #define F_SEAL_FUTURE_WRITE	0x0010  /* prevent future writes while mapped */
+#define F_SEAL_INACCESSIBLE	0x0020  /* prevent file from accessing */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..faa7e9b1b9bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1098,6 +1098,10 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
 			return -EPERM;
 
+		if ((info->seals & F_SEAL_INACCESSIBLE) &&
+		    (newsize & ~PAGE_MASK))
+			return -EINVAL;
+
 		if (newsize != oldsize) {
 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
 					oldsize, newsize);
@@ -1364,6 +1368,8 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		goto redirty;
 	if (!total_swap_pages)
 		goto redirty;
+	if (info->seals & F_SEAL_INACCESSIBLE)
+		goto redirty;
 
 	/*
 	 * Our capabilities prevent regular writeback or sync from ever calling
@@ -2262,6 +2268,9 @@  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
+	if (info->seals & F_SEAL_INACCESSIBLE)
+		return -EPERM;
+
 	/* arm64 - allow memory tagging on RAM-based files */
 	vma->vm_flags |= VM_MTE_ALLOWED;
 
@@ -2459,12 +2468,15 @@  shmem_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 
 	/* i_rwsem is held by caller */
-	if (unlikely(info->seals & (F_SEAL_GROW |
-				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+	if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
+				    F_SEAL_FUTURE_WRITE |
+				    F_SEAL_INACCESSIBLE))) {
 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
 			return -EPERM;
 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
 			return -EPERM;
+		if (info->seals & F_SEAL_INACCESSIBLE)
+			return -EPERM;
 	}
 
 	return shmem_getpage(inode, index, pagep, SGP_WRITE);
@@ -2538,6 +2550,21 @@  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		end_index = i_size >> PAGE_SHIFT;
 		if (index > end_index)
 			break;
+
+		/*
+		 * inode_lock protects setting up seals as well as writes to
+		 * i_size. Setting F_SEAL_INACCESSIBLE is only allowed with
+		 * i_size == 0.
+		 *
+		 * Check F_SEAL_INACCESSIBLE after i_size. This effectively
+		 * serializes read vs. setting F_SEAL_INACCESSIBLE without
+		 * taking inode_lock in the read path.
+		 */
+		if (SHMEM_I(inode)->seals & F_SEAL_INACCESSIBLE) {
+			error = -EPERM;
+			break;
+		}
+
 		if (index == end_index) {
 			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset)
@@ -2663,6 +2690,12 @@  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 			goto out;
 		}
 
+		if ((info->seals & F_SEAL_INACCESSIBLE) &&
+		    (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;