diff mbox

[v4,2/3] mm: introduce MAP_VALIDATE a mechanism for adding new mmap flags

Message ID 150277753660.23945.11500026891611444016.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams Aug. 15, 2017, 6:12 a.m. UTC
The mmap syscall suffers from the ABI anti-pattern of not validating
unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
mechanism to define new behavior that is known to fail on older kernels
without the feature. Use the fact that specifying MAP_SHARED and
MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
set of validated flags to be introduced.

This also introduces the ->fmmap() file operation that is ->mmap() plus
flags. Each ->fmmap() implementation must fail requests when a locally
unsupported flag is specified.

Cc: Jan Kara <jack@suse.cz>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/fs.h                     |    7 +++++++
 include/linux/mm.h                     |    2 +-
 include/linux/mman.h                   |    3 +++
 include/uapi/asm-generic/mman-common.h |    1 +
 mm/mmap.c                              |   20 +++++++++++++++++---
 5 files changed, 29 insertions(+), 4 deletions(-)

Comments

Jan Kara Aug. 15, 2017, 12:27 p.m. UTC | #1
On Mon 14-08-17 23:12:16, Dan Williams wrote:
> The mmap syscall suffers from the ABI anti-pattern of not validating
> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
> mechanism to define new behavior that is known to fail on older kernels
> without the feature. Use the fact that specifying MAP_SHARED and
> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
> set of validated flags to be introduced.
> 
> This also introduces the ->fmmap() file operation that is ->mmap() plus
> flags. Each ->fmmap() implementation must fail requests when a locally
> unsupported flag is specified.
...
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 1104e5df39ef..bbe755d0caee 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1674,6 +1674,7 @@ struct file_operations {
>  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
>  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
>  	int (*mmap) (struct file *, struct vm_area_struct *);
> +	int (*fmmap) (struct file *, struct vm_area_struct *, unsigned long);
>  	int (*open) (struct inode *, struct file *);
>  	int (*flush) (struct file *, fl_owner_t id);
>  	int (*release) (struct inode *, struct file *);
> @@ -1748,6 +1749,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
>  	return file->f_op->mmap(file, vma);
>  }
>  
> +static inline int call_fmmap(struct file *file, struct vm_area_struct *vma,
> +		unsigned long flags)
> +{
> +	return file->f_op->fmmap(file, vma, flags);
> +}
> +

Hum, I dislike a new file op for this when the only problem with ->mmap is
that it misses 'flags' argument. I understand there are lots of ->mmap
implementations out there and modifying prototype of them all is painful
but is it so bad? Coccinelle patch for this should be rather easy...

Also for MAP_SYNC I want the flag to be copied in VMA anyway so for that I
don't need additional flags argument anyway. And I wonder how you want to
make things work without VMA flag in case of MAP_DIRECT as well - VMAs can
be split, partially unmapped etc. and so without VMA flag you are going to
have hard time to detect whether there's any mapping left which blocks
block mapping changes.

								Honza
Dan Williams Aug. 15, 2017, 4:24 p.m. UTC | #2
On Tue, Aug 15, 2017 at 5:27 AM, Jan Kara <jack@suse.cz> wrote:
> On Mon 14-08-17 23:12:16, Dan Williams wrote:
>> The mmap syscall suffers from the ABI anti-pattern of not validating
>> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
>> mechanism to define new behavior that is known to fail on older kernels
>> without the feature. Use the fact that specifying MAP_SHARED and
>> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
>> set of validated flags to be introduced.
>>
>> This also introduces the ->fmmap() file operation that is ->mmap() plus
>> flags. Each ->fmmap() implementation must fail requests when a locally
>> unsupported flag is specified.
> ...
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 1104e5df39ef..bbe755d0caee 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1674,6 +1674,7 @@ struct file_operations {
>>       long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
>>       long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
>>       int (*mmap) (struct file *, struct vm_area_struct *);
>> +     int (*fmmap) (struct file *, struct vm_area_struct *, unsigned long);
>>       int (*open) (struct inode *, struct file *);
>>       int (*flush) (struct file *, fl_owner_t id);
>>       int (*release) (struct inode *, struct file *);
>> @@ -1748,6 +1749,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
>>       return file->f_op->mmap(file, vma);
>>  }
>>
>> +static inline int call_fmmap(struct file *file, struct vm_area_struct *vma,
>> +             unsigned long flags)
>> +{
>> +     return file->f_op->fmmap(file, vma, flags);
>> +}
>> +
>
> Hum, I dislike a new file op for this when the only problem with ->mmap is
> that it misses 'flags' argument. I understand there are lots of ->mmap
> implementations out there and modifying prototype of them all is painful
> but is it so bad? Coccinelle patch for this should be rather easy...

Changing the prototype is relatively easy with Coccinelle, but we
still need the code in each ->mmap() implementation to validate a
local list of supported flags. How about adding a 'supported mmap
flags' field to 'struct file_operations' so that the validation code
can be made generic? I'll go with that since it's a bit less
surprising than a new operation type, and not as messy as teaching
every mmap implementation in the kernel to validate flags that they
will likely never care about.

> Also for MAP_SYNC I want the flag to be copied in VMA anyway so for that I
> don't need additional flags argument anyway. And I wonder how you want to
> make things work without VMA flag in case of MAP_DIRECT as well - VMAs can
> be split, partially unmapped etc. and so without VMA flag you are going to
> have hard time to detect whether there's any mapping left which blocks
> block mapping changes.

Outside of requiring a 64-bit arch, we're out of vm_flags. Also, the
core mm does not really care about MAP_DIRECT or MAP_SYNC so that's
why I added a new ->fs_flags field since these are more filesystem
properties than core mm.

The problem of tracking MAP_DIRECT over vma splits appears to already
be handled. __split_vma does:

        /* most fields are the same, copy all, and then fixup */
        *new = *vma;
...

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

In ->open() I'm checking if 'new' has MAP_DIRECT in ->fs_flags and
taking a reference against the S_IOMAP_SEALED flag.
Andy Lutomirski Aug. 15, 2017, 4:28 p.m. UTC | #3
On Mon, Aug 14, 2017 at 11:12 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> The mmap syscall suffers from the ABI anti-pattern of not validating
> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
> mechanism to define new behavior that is known to fail on older kernels
> without the feature. Use the fact that specifying MAP_SHARED and
> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
> set of validated flags to be introduced.

While this is cute, is it actually better than a new syscall?
Dan Williams Aug. 15, 2017, 10:31 p.m. UTC | #4
On Tue, Aug 15, 2017 at 9:28 AM, Andy Lutomirski <luto@kernel.org> wrote:
> On Mon, Aug 14, 2017 at 11:12 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>> The mmap syscall suffers from the ABI anti-pattern of not validating
>> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
>> mechanism to define new behavior that is known to fail on older kernels
>> without the feature. Use the fact that specifying MAP_SHARED and
>> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
>> set of validated flags to be introduced.
>
> While this is cute, is it actually better than a new syscall?

After playing with MAP_DIRECT defined as (MAP_SHARED|MAP_PRIVATE|0x40)
I think a new syscall is better. It's very easy to make the mistake
that "MAP_DIRECT" defines a single flag vs representing a multi-bit
encoding.
kernel test robot Aug. 17, 2017, 8:06 a.m. UTC | #5
Hi Dan,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.13-rc5 next-20170816]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Dan-Williams/fs-xfs-introduce-S_IOMAP_SEALED/20170817-114711
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   mm/mmap.c: In function 'do_mmap':
>> mm/mmap.c:1391:8: error: 'MAP_VALIDATE' undeclared (first use in this function)
      case MAP_VALIDATE:
           ^
   mm/mmap.c:1391:8: note: each undeclared identifier is reported only once for each function it appears in

vim +/MAP_VALIDATE +1391 mm/mmap.c

  1316	
  1317	/*
  1318	 * The caller must hold down_write(&current->mm->mmap_sem).
  1319	 */
  1320	unsigned long do_mmap(struct file *file, unsigned long addr,
  1321				unsigned long len, unsigned long prot,
  1322				unsigned long flags, vm_flags_t vm_flags,
  1323				unsigned long pgoff, unsigned long *populate,
  1324				struct list_head *uf)
  1325	{
  1326		struct mm_struct *mm = current->mm;
  1327		int pkey = 0;
  1328	
  1329		*populate = 0;
  1330	
  1331		if (!len)
  1332			return -EINVAL;
  1333	
  1334		/*
  1335		 * Does the application expect PROT_READ to imply PROT_EXEC?
  1336		 *
  1337		 * (the exception is when the underlying filesystem is noexec
  1338		 *  mounted, in which case we dont add PROT_EXEC.)
  1339		 */
  1340		if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
  1341			if (!(file && path_noexec(&file->f_path)))
  1342				prot |= PROT_EXEC;
  1343	
  1344		if (!(flags & MAP_FIXED))
  1345			addr = round_hint_to_min(addr);
  1346	
  1347		/* Careful about overflows.. */
  1348		len = PAGE_ALIGN(len);
  1349		if (!len)
  1350			return -ENOMEM;
  1351	
  1352		/* offset overflow? */
  1353		if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  1354			return -EOVERFLOW;
  1355	
  1356		/* Too many mappings? */
  1357		if (mm->map_count > sysctl_max_map_count)
  1358			return -ENOMEM;
  1359	
  1360		/* Obtain the address to map to. we verify (or select) it and ensure
  1361		 * that it represents a valid section of the address space.
  1362		 */
  1363		addr = get_unmapped_area(file, addr, len, pgoff, flags);
  1364		if (offset_in_page(addr))
  1365			return addr;
  1366	
  1367		if (prot == PROT_EXEC) {
  1368			pkey = execute_only_pkey(mm);
  1369			if (pkey < 0)
  1370				pkey = 0;
  1371		}
  1372	
  1373		/* Do simple checking here so the lower-level routines won't have
  1374		 * to. we assume access permissions have been handled by the open
  1375		 * of the memory object, so we don't do any here.
  1376		 */
  1377		vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
  1378				mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  1379	
  1380		if (flags & MAP_LOCKED)
  1381			if (!can_do_mlock())
  1382				return -EPERM;
  1383	
  1384		if (mlock_future_check(mm, vm_flags, len))
  1385			return -EAGAIN;
  1386	
  1387		if (file) {
  1388			struct inode *inode = file_inode(file);
  1389	
  1390			switch (flags & MAP_TYPE) {
> 1391			case MAP_VALIDATE:
  1392				if (flags & ~(MAP_SUPPORTED_MASK | MAP_VALIDATE))
  1393					return -EINVAL;
  1394				if (!file->f_op->fmmap)
  1395					return -EOPNOTSUPP;
  1396				/* fall through */
  1397			case MAP_SHARED:
  1398				if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  1399					return -EACCES;
  1400	
  1401				/*
  1402				 * Make sure we don't allow writing to an append-only
  1403				 * file..
  1404				 */
  1405				if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  1406					return -EACCES;
  1407	
  1408				/*
  1409				 * Make sure there are no mandatory locks on the file.
  1410				 */
  1411				if (locks_verify_locked(file))
  1412					return -EAGAIN;
  1413	
  1414				vm_flags |= VM_SHARED | VM_MAYSHARE;
  1415				if (!(file->f_mode & FMODE_WRITE))
  1416					vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  1417	
  1418				/* fall through */
  1419			case MAP_PRIVATE:
  1420				if (!(file->f_mode & FMODE_READ))
  1421					return -EACCES;
  1422				if (path_noexec(&file->f_path)) {
  1423					if (vm_flags & VM_EXEC)
  1424						return -EPERM;
  1425					vm_flags &= ~VM_MAYEXEC;
  1426				}
  1427	
  1428				if (!file->f_op->mmap)
  1429					return -ENODEV;
  1430				if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  1431					return -EINVAL;
  1432				break;
  1433	
  1434			default:
  1435				return -EINVAL;
  1436			}
  1437		} else {
  1438			switch (flags & MAP_TYPE) {
  1439			case MAP_SHARED:
  1440				if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  1441					return -EINVAL;
  1442				/*
  1443				 * Ignore pgoff.
  1444				 */
  1445				pgoff = 0;
  1446				vm_flags |= VM_SHARED | VM_MAYSHARE;
  1447				break;
  1448			case MAP_PRIVATE:
  1449				/*
  1450				 * Set pgoff according to addr for anon_vma.
  1451				 */
  1452				pgoff = addr >> PAGE_SHIFT;
  1453				break;
  1454			default:
  1455				return -EINVAL;
  1456			}
  1457		}
  1458	
  1459		/*
  1460		 * Set 'VM_NORESERVE' if we should not account for the
  1461		 * memory use of this mapping.
  1462		 */
  1463		if (flags & MAP_NORESERVE) {
  1464			/* We honor MAP_NORESERVE if allowed to overcommit */
  1465			if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
  1466				vm_flags |= VM_NORESERVE;
  1467	
  1468			/* hugetlb applies strict overcommit unless MAP_NORESERVE */
  1469			if (file && is_file_hugepages(file))
  1470				vm_flags |= VM_NORESERVE;
  1471		}
  1472	
  1473		if ((flags & MAP_VALIDATE) == MAP_VALIDATE)
  1474			flags &= MAP_SUPPORTED_MASK;
  1475		else
  1476			flags = 0;
  1477	
  1478		addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, flags);
  1479		if (!IS_ERR_VALUE(addr) &&
  1480		    ((vm_flags & VM_LOCKED) ||
  1481		     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
  1482			*populate = len;
  1483		return addr;
  1484	}
  1485	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Dan Williams Sept. 17, 2017, 3:44 a.m. UTC | #6
On Tue, Aug 15, 2017 at 5:27 AM, Jan Kara <jack@suse.cz> wrote:
> On Mon 14-08-17 23:12:16, Dan Williams wrote:
>> The mmap syscall suffers from the ABI anti-pattern of not validating
>> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
>> mechanism to define new behavior that is known to fail on older kernels
>> without the feature. Use the fact that specifying MAP_SHARED and
>> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
>> set of validated flags to be introduced.
>>
>> This also introduces the ->fmmap() file operation that is ->mmap() plus
>> flags. Each ->fmmap() implementation must fail requests when a locally
>> unsupported flag is specified.
> ...
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 1104e5df39ef..bbe755d0caee 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1674,6 +1674,7 @@ struct file_operations {
>>       long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
>>       long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
>>       int (*mmap) (struct file *, struct vm_area_struct *);
>> +     int (*fmmap) (struct file *, struct vm_area_struct *, unsigned long);
>>       int (*open) (struct inode *, struct file *);
>>       int (*flush) (struct file *, fl_owner_t id);
>>       int (*release) (struct inode *, struct file *);
>> @@ -1748,6 +1749,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
>>       return file->f_op->mmap(file, vma);
>>  }
>>
>> +static inline int call_fmmap(struct file *file, struct vm_area_struct *vma,
>> +             unsigned long flags)
>> +{
>> +     return file->f_op->fmmap(file, vma, flags);
>> +}
>> +
>
> Hum, I dislike a new file op for this when the only problem with ->mmap is
> that it misses 'flags' argument. I understand there are lots of ->mmap
> implementations out there and modifying prototype of them all is painful
> but is it so bad? Coccinelle patch for this should be rather easy...

So it wasn't all that easy, and Linus declined to take it. I think we
should add a new ->mmap_validate() file operation and save the
tree-wide cleanup until later.
Christoph Hellwig Sept. 17, 2017, 5:39 p.m. UTC | #7
On Sat, Sep 16, 2017 at 08:44:14PM -0700, Dan Williams wrote:
> So it wasn't all that easy, and Linus declined to take it. I think we
> should add a new ->mmap_validate() file operation and save the
> tree-wide cleanup until later.

Note that we already have a mmap_capabilities callout for nommu,
I wonder if we could generalize that.
Jan Kara Sept. 18, 2017, 9:26 a.m. UTC | #8
On Sat 16-09-17 20:44:14, Dan Williams wrote:
> On Tue, Aug 15, 2017 at 5:27 AM, Jan Kara <jack@suse.cz> wrote:
> > On Mon 14-08-17 23:12:16, Dan Williams wrote:
> >> The mmap syscall suffers from the ABI anti-pattern of not validating
> >> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
> >> mechanism to define new behavior that is known to fail on older kernels
> >> without the feature. Use the fact that specifying MAP_SHARED and
> >> MAP_PRIVATE at the same time is invalid as a cute hack to allow a new
> >> set of validated flags to be introduced.
> >>
> >> This also introduces the ->fmmap() file operation that is ->mmap() plus
> >> flags. Each ->fmmap() implementation must fail requests when a locally
> >> unsupported flag is specified.
> > ...
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 1104e5df39ef..bbe755d0caee 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -1674,6 +1674,7 @@ struct file_operations {
> >>       long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
> >>       long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
> >>       int (*mmap) (struct file *, struct vm_area_struct *);
> >> +     int (*fmmap) (struct file *, struct vm_area_struct *, unsigned long);
> >>       int (*open) (struct inode *, struct file *);
> >>       int (*flush) (struct file *, fl_owner_t id);
> >>       int (*release) (struct inode *, struct file *);
> >> @@ -1748,6 +1749,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> >>       return file->f_op->mmap(file, vma);
> >>  }
> >>
> >> +static inline int call_fmmap(struct file *file, struct vm_area_struct *vma,
> >> +             unsigned long flags)
> >> +{
> >> +     return file->f_op->fmmap(file, vma, flags);
> >> +}
> >> +
> >
> > Hum, I dislike a new file op for this when the only problem with ->mmap is
> > that it misses 'flags' argument. I understand there are lots of ->mmap
> > implementations out there and modifying prototype of them all is painful
> > but is it so bad? Coccinelle patch for this should be rather easy...
> 
> So it wasn't all that easy, and Linus declined to take it. I think we
> should add a new ->mmap_validate() file operation and save the
> tree-wide cleanup until later.

Well, we don't even strictly need the flags passed to ->mmap callback if we
are willing to use VMA flags. I want to use it for MAP_SYNC anyway... So
bumping vma->flags to u64 and using a flag is also an option (and frankly
I'd personally just go for that).

								Honza
Jan Kara Sept. 18, 2017, 9:31 a.m. UTC | #9
On Sun 17-09-17 19:39:45, Christoph Hellwig wrote:
> On Sat, Sep 16, 2017 at 08:44:14PM -0700, Dan Williams wrote:
> > So it wasn't all that easy, and Linus declined to take it. I think we
> > should add a new ->mmap_validate() file operation and save the
> > tree-wide cleanup until later.
> 
> Note that we already have a mmap_capabilities callout for nommu,
> I wonder if we could generalize that.

So if I understood Dan right, Linus refused to merge the patch which adds
'flags' argument to ->mmap callback. That is actually logically independent
change from validating flags passed to mmap(2) syscall. Dan did it just to
save himself from adding a VMA flag for MAP_DIRECT.

For validating flags passed to mmap(2), I agree we could use
->mmap_capabilities() instead of mmap_supported_mask Dan has added. But I
don't have a strong opinion there.

								Honza
Dan Williams Sept. 18, 2017, 3:47 p.m. UTC | #10
On Mon, Sep 18, 2017 at 2:31 AM, Jan Kara <jack@suse.cz> wrote:
> On Sun 17-09-17 19:39:45, Christoph Hellwig wrote:
>> On Sat, Sep 16, 2017 at 08:44:14PM -0700, Dan Williams wrote:
>> > So it wasn't all that easy, and Linus declined to take it. I think we
>> > should add a new ->mmap_validate() file operation and save the
>> > tree-wide cleanup until later.
>>
>> Note that we already have a mmap_capabilities callout for nommu,
>> I wonder if we could generalize that.
>
> So if I understood Dan right, Linus refused to merge the patch which adds
> 'flags' argument to ->mmap callback. That is actually logically independent
> change from validating flags passed to mmap(2) syscall. Dan did it just to
> save himself from adding a VMA flag for MAP_DIRECT.
>
> For validating flags passed to mmap(2), I agree we could use
> ->mmap_capabilities() instead of mmap_supported_mask Dan has added. But I
> don't have a strong opinion there.

The drawback I see with mmap_capabilities is that it requires all mmap
flags to have a corresponding vm_flag. After the cold reaction the
VM_DAX flag received I'd want to be sure they were on board with this
direction.
diff mbox

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1104e5df39ef..bbe755d0caee 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1674,6 +1674,7 @@  struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
+	int (*fmmap) (struct file *, struct vm_area_struct *, unsigned long);
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
@@ -1748,6 +1749,12 @@  static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }
 
+static inline int call_fmmap(struct file *file, struct vm_area_struct *vma,
+		unsigned long flags)
+{
+	return file->f_op->fmmap(file, vma, flags);
+}
+
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      unsigned long nr_segs, unsigned long fast_segs,
 			      struct iovec *fast_pointer,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..49eef48da4b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2090,7 +2090,7 @@  extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-	struct list_head *uf);
+	struct list_head *uf, unsigned long flags);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
 	vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
diff --git a/include/linux/mman.h b/include/linux/mman.h
index c8367041fafd..73d4ac7e7136 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -7,6 +7,9 @@ 
 #include <linux/atomic.h>
 #include <uapi/linux/mman.h>
 
+/* the MAP_VALIDATE set of supported flags */
+#define	MAP_SUPPORTED_MASK (0)
+
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..8bf8c7828275 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -24,6 +24,7 @@ 
 #else
 # define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
 #endif
+#define MAP_VALIDATE (MAP_SHARED|MAP_PRIVATE) /* mechanism to define new shared semantics */
 
 /*
  * Flags for mlock
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..d2919a9e25bf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1388,6 +1388,12 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 		struct inode *inode = file_inode(file);
 
 		switch (flags & MAP_TYPE) {
+		case MAP_VALIDATE:
+			if (flags & ~(MAP_SUPPORTED_MASK | MAP_VALIDATE))
+				return -EINVAL;
+			if (!file->f_op->fmmap)
+				return -EOPNOTSUPP;
+			/* fall through */
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 				return -EACCES;
@@ -1464,7 +1470,12 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
-	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
+	if ((flags & MAP_VALIDATE) == MAP_VALIDATE)
+		flags &= MAP_SUPPORTED_MASK;
+	else
+		flags = 0;
+
+	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, flags);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1601,7 +1612,7 @@  static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-		struct list_head *uf)
+		struct list_head *uf, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1686,7 +1697,10 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		 * new file must not have been exposed to user-space, yet.
 		 */
 		vma->vm_file = get_file(file);
-		error = call_mmap(file, vma);
+		if (flags)
+			error = call_fmmap(file, vma, flags);
+		else
+			error = call_mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;