Message ID | 20210119162204.2081137-3-mszeredi@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | capability conversion fixes | expand |
Miklos Szeredi <mszeredi@redhat.com> writes: > If a capability is stored on disk in v2 format cap_inode_getsecurity() will > currently return in v2 format unconditionally. > > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, > and so the same conversions performed on it. > > If the rootid cannot be mapped v3 is returned unconverted. Fix this so > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs > user namespace in case of v2) cannot be mapped in the current user > namespace. This looks like a good cleanup. I do wonder how well this works with stacking. In particular ovl_xattr_set appears to call vfs_getxattr without overriding the creds. What the purpose of that is I haven't quite figured out. It looks like it is just a probe to see if an xattr is present so maybe it is ok. Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> > --- > security/commoncap.c | 67 ++++++++++++++++++++++++++++---------------- > 1 file changed, 43 insertions(+), 24 deletions(-) > > diff --git a/security/commoncap.c b/security/commoncap.c > index bacc1111d871..c9d99f8f4c82 100644 > --- a/security/commoncap.c > +++ b/security/commoncap.c > @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > { > int size, ret; > kuid_t kroot; > + __le32 nsmagic, magic; > uid_t root, mappedroot; > char *tmpbuf = NULL; > struct vfs_cap_data *cap; > - struct vfs_ns_cap_data *nscap; > + struct vfs_ns_cap_data *nscap = NULL; > struct dentry *dentry; > struct user_namespace *fs_ns; > > @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > fs_ns = inode->i_sb->s_user_ns; > cap = (struct vfs_cap_data *) tmpbuf; > if (is_v2header((size_t) ret, cap)) { > - /* If this is sizeof(vfs_cap_data) then we're ok with the > - * on-disk value, so return that. */ > - if (alloc) > - *buffer = tmpbuf; > - else > - kfree(tmpbuf); > - return ret; > - } else if (!is_v3header((size_t) ret, cap)) { > - kfree(tmpbuf); > - return -EINVAL; > + root = 0; > + } else if (is_v3header((size_t) ret, cap)) { > + nscap = (struct vfs_ns_cap_data *) tmpbuf; > + root = le32_to_cpu(nscap->rootid); > + } else { > + size = -EINVAL; > + goto out_free; > } > > - nscap = (struct vfs_ns_cap_data *) tmpbuf; > - root = le32_to_cpu(nscap->rootid); > kroot = make_kuid(fs_ns, root); > > /* If the root kuid maps to a valid uid in current ns, then return > * this as a nscap. */ > mappedroot = from_kuid(current_user_ns(), kroot); > if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { > + size = sizeof(struct vfs_ns_cap_data); > if (alloc) { > - *buffer = tmpbuf; > + if (!nscap) { > + /* v2 -> v3 conversion */ > + nscap = kzalloc(size, GFP_ATOMIC); > + if (!nscap) { > + size = -ENOMEM; > + goto out_free; > + } > + nsmagic = VFS_CAP_REVISION_3; > + magic = le32_to_cpu(cap->magic_etc); > + if (magic & VFS_CAP_FLAGS_EFFECTIVE) > + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; > + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); > + nscap->magic_etc = cpu_to_le32(nsmagic); > + } else { > + /* use allocated v3 buffer */ > + tmpbuf = NULL; > + } > nscap->rootid = cpu_to_le32(mappedroot); > - } else > - kfree(tmpbuf); > - return size; > + *buffer = nscap; > + } > + goto out_free; > } > > if (!rootid_owns_currentns(kroot)) { > - kfree(tmpbuf); > - return -EOPNOTSUPP; > + size = -EOVERFLOW; > + goto out_free; > } > > /* This comes from a parent namespace. Return as a v2 capability */ > size = sizeof(struct vfs_cap_data); > if (alloc) { > - *buffer = kmalloc(size, GFP_ATOMIC); > - if (*buffer) { > - struct vfs_cap_data *cap = *buffer; > - __le32 nsmagic, magic; > + if (nscap) { > + /* v3 -> v2 conversion */ > + cap = kzalloc(size, GFP_ATOMIC); > + if (!cap) { > + size = -ENOMEM; > + goto out_free; > + } > magic = VFS_CAP_REVISION_2; > nsmagic = le32_to_cpu(nscap->magic_etc); > if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) > @@ -443,9 +459,12 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); > cap->magic_etc = cpu_to_le32(magic); > } else { > - size = -ENOMEM; > + /* use unconverted v2 */ > + tmpbuf = NULL; > } > + *buffer = cap; > } > +out_free: > kfree(tmpbuf); > return size; > }
On Wed, Jan 20, 2021 at 2:39 AM Eric W. Biederman <ebiederm@xmission.com> wrote: > > Miklos Szeredi <mszeredi@redhat.com> writes: > > > If a capability is stored on disk in v2 format cap_inode_getsecurity() will > > currently return in v2 format unconditionally. > > > > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, > > and so the same conversions performed on it. > > > > If the rootid cannot be mapped v3 is returned unconverted. Fix this so > > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs > > user namespace in case of v2) cannot be mapped in the current user > > namespace. > > This looks like a good cleanup. > > I do wonder how well this works with stacking. In particular > ovl_xattr_set appears to call vfs_getxattr without overriding the creds. > What the purpose of that is I haven't quite figured out. It looks like > it is just a probe to see if an xattr is present so maybe it is ok. Yeah, it's checking in the removexattr case whether copy-up is needed or not (i.e. if trying to remove a non-existent xattr, then no need to copy up). But for consistency it should also be wrapped in override creds. Adding fix to this series. I'll also audit for any remaining omissions. One known and documented case is vfs_ioctl(FS_IOC_{[SG]ETFLAGS,FS[SG]ETXATTR}), but that shouldn't be affected by user namespaces. Thanks, Miklos
Hi Miklos, I love your patch! Perhaps something to improve: [auto build test WARNING on security/next-testing] [also build test WARNING on linux/master linus/master v5.11-rc4 next-20210120] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Miklos-Szeredi/capability-conversion-fixes/20210120-152933 base: https://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-testing config: xtensa-randconfig-s032-20210120 (attached as .config) compiler: xtensa-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # apt-get install sparse # sparse version: v0.6.3-208-g46a52ca4-dirty # https://github.com/0day-ci/linux/commit/bcf70adf8bcc3e52cb1b262ae2e1d9154da75097 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Miklos-Szeredi/capability-conversion-fixes/20210120-152933 git checkout bcf70adf8bcc3e52cb1b262ae2e1d9154da75097 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=xtensa If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot <lkp@intel.com> "sparse warnings: (new ones prefixed by >>)" security/commoncap.c:424:41: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] nsmagic @@ got int @@ security/commoncap.c:424:41: sparse: expected restricted __le32 [usertype] nsmagic security/commoncap.c:424:41: sparse: got int >> security/commoncap.c:425:39: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] magic @@ got unsigned int @@ security/commoncap.c:425:39: sparse: expected restricted __le32 [usertype] magic security/commoncap.c:425:39: sparse: got unsigned int security/commoncap.c:426:37: sparse: sparse: restricted __le32 degrades to integer security/commoncap.c:427:49: sparse: sparse: invalid assignment: |= security/commoncap.c:427:49: sparse: left side has type restricted __le32 security/commoncap.c:427:49: sparse: right side has type int security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 >> security/commoncap.c:429:52: sparse: sparse: incorrect type in argument 1 (different base types) @@ expected unsigned int [usertype] val @@ got restricted __le32 [usertype] nsmagic @@ security/commoncap.c:429:52: sparse: expected unsigned int [usertype] val security/commoncap.c:429:52: sparse: got restricted __le32 [usertype] nsmagic security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 security/commoncap.c:455:31: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] magic @@ got int @@ security/commoncap.c:455:31: sparse: expected restricted __le32 [usertype] magic security/commoncap.c:455:31: sparse: got int security/commoncap.c:456:33: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] nsmagic @@ got unsigned int @@ security/commoncap.c:456:33: sparse: expected restricted __le32 [usertype] nsmagic security/commoncap.c:456:33: sparse: got unsigned int security/commoncap.c:457:29: sparse: sparse: restricted __le32 degrades to integer security/commoncap.c:458:39: sparse: sparse: invalid assignment: |= security/commoncap.c:458:39: sparse: left side has type restricted __le32 security/commoncap.c:458:39: sparse: right side has type int security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:460:42: sparse: sparse: incorrect type in argument 1 (different base types) @@ expected unsigned int [usertype] val @@ got restricted __le32 [usertype] magic @@ security/commoncap.c:460:42: sparse: expected unsigned int [usertype] val security/commoncap.c:460:42: sparse: got restricted __le32 [usertype] magic security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:1281:41: sparse: sparse: dubious: !x | y vim +425 security/commoncap.c 357 358 /* 359 * getsecurity: We are called for security.* before any attempt to read the 360 * xattr from the inode itself. 361 * 362 * This gives us a chance to read the on-disk value and convert it. If we 363 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. 364 * 365 * Note we are not called by vfs_getxattr_alloc(), but that is only called 366 * by the integrity subsystem, which really wants the unconverted values - 367 * so that's good. 368 */ 369 int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, 370 bool alloc) 371 { 372 int size, ret; 373 kuid_t kroot; 374 __le32 nsmagic, magic; 375 uid_t root, mappedroot; 376 char *tmpbuf = NULL; 377 struct vfs_cap_data *cap; 378 struct vfs_ns_cap_data *nscap = NULL; 379 struct dentry *dentry; 380 struct user_namespace *fs_ns; 381 382 if (strcmp(name, "capability") != 0) 383 return -EOPNOTSUPP; 384 385 dentry = d_find_any_alias(inode); 386 if (!dentry) 387 return -EINVAL; 388 389 size = sizeof(struct vfs_ns_cap_data); 390 ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, 391 &tmpbuf, size, GFP_NOFS); 392 dput(dentry); 393 394 if (ret < 0) 395 return ret; 396 397 fs_ns = inode->i_sb->s_user_ns; 398 cap = (struct vfs_cap_data *) tmpbuf; 399 if (is_v2header((size_t) ret, cap)) { 400 root = 0; 401 } else if (is_v3header((size_t) ret, cap)) { 402 nscap = (struct vfs_ns_cap_data *) tmpbuf; 403 root = le32_to_cpu(nscap->rootid); 404 } else { 405 size = -EINVAL; 406 goto out_free; 407 } 408 409 kroot = make_kuid(fs_ns, root); 410 411 /* If the root kuid maps to a valid uid in current ns, then return 412 * this as a nscap. */ 413 mappedroot = from_kuid(current_user_ns(), kroot); 414 if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { 415 size = sizeof(struct vfs_ns_cap_data); 416 if (alloc) { 417 if (!nscap) { 418 /* v2 -> v3 conversion */ 419 nscap = kzalloc(size, GFP_ATOMIC); 420 if (!nscap) { 421 size = -ENOMEM; 422 goto out_free; 423 } 424 nsmagic = VFS_CAP_REVISION_3; > 425 magic = le32_to_cpu(cap->magic_etc); 426 if (magic & VFS_CAP_FLAGS_EFFECTIVE) 427 nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; 428 memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); > 429 nscap->magic_etc = cpu_to_le32(nsmagic); 430 } else { 431 /* use allocated v3 buffer */ 432 tmpbuf = NULL; 433 } 434 nscap->rootid = cpu_to_le32(mappedroot); 435 *buffer = nscap; 436 } 437 goto out_free; 438 } 439 440 if (!rootid_owns_currentns(kroot)) { 441 size = -EOVERFLOW; 442 goto out_free; 443 } 444 445 /* This comes from a parent namespace. Return as a v2 capability */ 446 size = sizeof(struct vfs_cap_data); 447 if (alloc) { 448 if (nscap) { 449 /* v3 -> v2 conversion */ 450 cap = kzalloc(size, GFP_ATOMIC); 451 if (!cap) { 452 size = -ENOMEM; 453 goto out_free; 454 } 455 magic = VFS_CAP_REVISION_2; 456 nsmagic = le32_to_cpu(nscap->magic_etc); 457 if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) 458 magic |= VFS_CAP_FLAGS_EFFECTIVE; 459 memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); 460 cap->magic_etc = cpu_to_le32(magic); 461 } else { 462 /* use unconverted v2 */ 463 tmpbuf = NULL; 464 } 465 *buffer = cap; 466 } 467 out_free: 468 kfree(tmpbuf); 469 return size; 470 } 471 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Hi Miklos, I love your patch! Perhaps something to improve: [auto build test WARNING on security/next-testing] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Miklos-Szeredi/capability-conversion-fixes/20210120-152933 base: https://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-testing config: x86_64-randconfig-s022-20210120 (attached as .config) compiler: gcc-9 (Debian 9.3.0-15) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.3-208-g46a52ca4-dirty # https://github.com/0day-ci/linux/commit/bcf70adf8bcc3e52cb1b262ae2e1d9154da75097 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Miklos-Szeredi/capability-conversion-fixes/20210120-152933 git checkout bcf70adf8bcc3e52cb1b262ae2e1d9154da75097 # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot <lkp@intel.com> "sparse warnings: (new ones prefixed by >>)" >> security/commoncap.c:424:41: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] nsmagic @@ got int @@ security/commoncap.c:424:41: sparse: expected restricted __le32 [usertype] nsmagic security/commoncap.c:424:41: sparse: got int >> security/commoncap.c:425:39: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] magic @@ got unsigned int [usertype] @@ security/commoncap.c:425:39: sparse: expected restricted __le32 [usertype] magic security/commoncap.c:425:39: sparse: got unsigned int [usertype] security/commoncap.c:426:37: sparse: sparse: restricted __le32 degrades to integer security/commoncap.c:427:49: sparse: sparse: invalid assignment: |= security/commoncap.c:427:49: sparse: left side has type restricted __le32 security/commoncap.c:427:49: sparse: right side has type int security/commoncap.c:429:52: sparse: sparse: cast from restricted __le32 security/commoncap.c:455:31: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] magic @@ got int @@ security/commoncap.c:455:31: sparse: expected restricted __le32 [usertype] magic security/commoncap.c:455:31: sparse: got int security/commoncap.c:456:33: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __le32 [usertype] nsmagic @@ got unsigned int [usertype] @@ security/commoncap.c:456:33: sparse: expected restricted __le32 [usertype] nsmagic security/commoncap.c:456:33: sparse: got unsigned int [usertype] security/commoncap.c:457:29: sparse: sparse: restricted __le32 degrades to integer security/commoncap.c:458:39: sparse: sparse: invalid assignment: |= security/commoncap.c:458:39: sparse: left side has type restricted __le32 security/commoncap.c:458:39: sparse: right side has type int security/commoncap.c:460:42: sparse: sparse: cast from restricted __le32 security/commoncap.c:1281:41: sparse: sparse: dubious: !x | y vim +424 security/commoncap.c 357 358 /* 359 * getsecurity: We are called for security.* before any attempt to read the 360 * xattr from the inode itself. 361 * 362 * This gives us a chance to read the on-disk value and convert it. If we 363 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. 364 * 365 * Note we are not called by vfs_getxattr_alloc(), but that is only called 366 * by the integrity subsystem, which really wants the unconverted values - 367 * so that's good. 368 */ 369 int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, 370 bool alloc) 371 { 372 int size, ret; 373 kuid_t kroot; 374 __le32 nsmagic, magic; 375 uid_t root, mappedroot; 376 char *tmpbuf = NULL; 377 struct vfs_cap_data *cap; 378 struct vfs_ns_cap_data *nscap = NULL; 379 struct dentry *dentry; 380 struct user_namespace *fs_ns; 381 382 if (strcmp(name, "capability") != 0) 383 return -EOPNOTSUPP; 384 385 dentry = d_find_any_alias(inode); 386 if (!dentry) 387 return -EINVAL; 388 389 size = sizeof(struct vfs_ns_cap_data); 390 ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, 391 &tmpbuf, size, GFP_NOFS); 392 dput(dentry); 393 394 if (ret < 0) 395 return ret; 396 397 fs_ns = inode->i_sb->s_user_ns; 398 cap = (struct vfs_cap_data *) tmpbuf; 399 if (is_v2header((size_t) ret, cap)) { 400 root = 0; 401 } else if (is_v3header((size_t) ret, cap)) { 402 nscap = (struct vfs_ns_cap_data *) tmpbuf; 403 root = le32_to_cpu(nscap->rootid); 404 } else { 405 size = -EINVAL; 406 goto out_free; 407 } 408 409 kroot = make_kuid(fs_ns, root); 410 411 /* If the root kuid maps to a valid uid in current ns, then return 412 * this as a nscap. */ 413 mappedroot = from_kuid(current_user_ns(), kroot); 414 if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { 415 size = sizeof(struct vfs_ns_cap_data); 416 if (alloc) { 417 if (!nscap) { 418 /* v2 -> v3 conversion */ 419 nscap = kzalloc(size, GFP_ATOMIC); 420 if (!nscap) { 421 size = -ENOMEM; 422 goto out_free; 423 } > 424 nsmagic = VFS_CAP_REVISION_3; > 425 magic = le32_to_cpu(cap->magic_etc); 426 if (magic & VFS_CAP_FLAGS_EFFECTIVE) 427 nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; 428 memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); 429 nscap->magic_etc = cpu_to_le32(nsmagic); 430 } else { 431 /* use allocated v3 buffer */ 432 tmpbuf = NULL; 433 } 434 nscap->rootid = cpu_to_le32(mappedroot); 435 *buffer = nscap; 436 } 437 goto out_free; 438 } 439 440 if (!rootid_owns_currentns(kroot)) { 441 size = -EOVERFLOW; 442 goto out_free; 443 } 444 445 /* This comes from a parent namespace. Return as a v2 capability */ 446 size = sizeof(struct vfs_cap_data); 447 if (alloc) { 448 if (nscap) { 449 /* v3 -> v2 conversion */ 450 cap = kzalloc(size, GFP_ATOMIC); 451 if (!cap) { 452 size = -ENOMEM; 453 goto out_free; 454 } 455 magic = VFS_CAP_REVISION_2; 456 nsmagic = le32_to_cpu(nscap->magic_etc); 457 if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) 458 magic |= VFS_CAP_FLAGS_EFFECTIVE; 459 memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); 460 cap->magic_etc = cpu_to_le32(magic); 461 } else { 462 /* use unconverted v2 */ 463 tmpbuf = NULL; 464 } 465 *buffer = cap; 466 } 467 out_free: 468 kfree(tmpbuf); 469 return size; 470 } 471 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: > Miklos Szeredi <mszeredi@redhat.com> writes: > > > If a capability is stored on disk in v2 format cap_inode_getsecurity() will > > currently return in v2 format unconditionally. > > > > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, > > and so the same conversions performed on it. > > > > If the rootid cannot be mapped v3 is returned unconverted. Fix this so > > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs > > user namespace in case of v2) cannot be mapped in the current user > > namespace. > > This looks like a good cleanup. Sorry, I'm not following. Why is this a good cleanup? Why should the xattr be shown as faked v3 in this case? A separate question below. > I do wonder how well this works with stacking. In particular > ovl_xattr_set appears to call vfs_getxattr without overriding the creds. > What the purpose of that is I haven't quite figured out. It looks like > it is just a probe to see if an xattr is present so maybe it is ok. > > Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> > > > > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> > > --- > > security/commoncap.c | 67 ++++++++++++++++++++++++++++---------------- > > 1 file changed, 43 insertions(+), 24 deletions(-) > > > > diff --git a/security/commoncap.c b/security/commoncap.c > > index bacc1111d871..c9d99f8f4c82 100644 > > --- a/security/commoncap.c > > +++ b/security/commoncap.c > > @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > > { > > int size, ret; > > kuid_t kroot; > > + __le32 nsmagic, magic; > > uid_t root, mappedroot; > > char *tmpbuf = NULL; > > struct vfs_cap_data *cap; > > - struct vfs_ns_cap_data *nscap; > > + struct vfs_ns_cap_data *nscap = NULL; > > struct dentry *dentry; > > struct user_namespace *fs_ns; > > > > @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > > fs_ns = inode->i_sb->s_user_ns; > > cap = (struct vfs_cap_data *) tmpbuf; > > if (is_v2header((size_t) ret, cap)) { > > - /* If this is sizeof(vfs_cap_data) then we're ok with the > > - * on-disk value, so return that. */ > > - if (alloc) > > - *buffer = tmpbuf; > > - else > > - kfree(tmpbuf); > > - return ret; > > - } else if (!is_v3header((size_t) ret, cap)) { > > - kfree(tmpbuf); > > - return -EINVAL; > > + root = 0; > > + } else if (is_v3header((size_t) ret, cap)) { > > + nscap = (struct vfs_ns_cap_data *) tmpbuf; > > + root = le32_to_cpu(nscap->rootid); > > + } else { > > + size = -EINVAL; > > + goto out_free; > > } > > > > - nscap = (struct vfs_ns_cap_data *) tmpbuf; > > - root = le32_to_cpu(nscap->rootid); > > kroot = make_kuid(fs_ns, root); > > > > /* If the root kuid maps to a valid uid in current ns, then return > > * this as a nscap. */ > > mappedroot = from_kuid(current_user_ns(), kroot); > > if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { > > + size = sizeof(struct vfs_ns_cap_data); > > if (alloc) { > > - *buffer = tmpbuf; > > + if (!nscap) { > > + /* v2 -> v3 conversion */ > > + nscap = kzalloc(size, GFP_ATOMIC); > > + if (!nscap) { > > + size = -ENOMEM; > > + goto out_free; > > + } > > + nsmagic = VFS_CAP_REVISION_3; > > + magic = le32_to_cpu(cap->magic_etc); > > + if (magic & VFS_CAP_FLAGS_EFFECTIVE) > > + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; > > + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); > > + nscap->magic_etc = cpu_to_le32(nsmagic); > > + } else { > > + /* use allocated v3 buffer */ > > + tmpbuf = NULL; > > + } > > nscap->rootid = cpu_to_le32(mappedroot); > > - } else > > - kfree(tmpbuf); > > - return size; > > + *buffer = nscap; > > + } > > + goto out_free; > > } > > > > if (!rootid_owns_currentns(kroot)) { > > - kfree(tmpbuf); > > - return -EOPNOTSUPP; > > + size = -EOVERFLOW; Why this change? Christian (cc:d) noticed that this is a user visible change. Without this change, if you are in a userns which has different rootid, the EOVERFLOW tells vfs_getxattr to vall back to __vfs_getxattr() and so you can see the v3 capability with its rootid. With this change, you instead just get EOVERFLOW. > > + goto out_free; > > } > > > > /* This comes from a parent namespace. Return as a v2 capability */ > > size = sizeof(struct vfs_cap_data); > > if (alloc) { > > - *buffer = kmalloc(size, GFP_ATOMIC); > > - if (*buffer) { > > - struct vfs_cap_data *cap = *buffer; > > - __le32 nsmagic, magic; > > + if (nscap) { > > + /* v3 -> v2 conversion */ > > + cap = kzalloc(size, GFP_ATOMIC); > > + if (!cap) { > > + size = -ENOMEM; > > + goto out_free; > > + } > > magic = VFS_CAP_REVISION_2; > > nsmagic = le32_to_cpu(nscap->magic_etc); > > if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) > > @@ -443,9 +459,12 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, > > memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); > > cap->magic_etc = cpu_to_le32(magic); > > } else { > > - size = -ENOMEM; > > + /* use unconverted v2 */ > > + tmpbuf = NULL; > > } > > + *buffer = cap; > > } > > +out_free: > > kfree(tmpbuf); > > return size; > > }
"Serge E. Hallyn" <serge@hallyn.com> writes: > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: >> Miklos Szeredi <mszeredi@redhat.com> writes: >> >> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will >> > currently return in v2 format unconditionally. >> > >> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, >> > and so the same conversions performed on it. >> > >> > If the rootid cannot be mapped v3 is returned unconverted. Fix this so >> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs >> > user namespace in case of v2) cannot be mapped in the current user >> > namespace. >> >> This looks like a good cleanup. > > Sorry, I'm not following. Why is this a good cleanup? Why should > the xattr be shown as faked v3 in this case? If the reader is in &init_user_ns. If the filesystem was mounted in a user namespace. Then the reader looses the information that the capability xattr only applies to a subset of user namespaces. A trivial place where this would be important is if userspace was to copy the file and the associated capability xattr to another filesystem, that is mounted differently. <aside> From our previous discussions I would also argue it would be good if there was a bypass that skipped all conversions if the reader and the filesystem are in the same user namespace. </aside> > A separate question below. > >> I do wonder how well this works with stacking. In particular >> ovl_xattr_set appears to call vfs_getxattr without overriding the creds. >> What the purpose of that is I haven't quite figured out. It looks like >> it is just a probe to see if an xattr is present so maybe it is ok. >> >> Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> >> >> > >> > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> >> > --- >> > security/commoncap.c | 67 ++++++++++++++++++++++++++++---------------- >> > 1 file changed, 43 insertions(+), 24 deletions(-) >> > >> > diff --git a/security/commoncap.c b/security/commoncap.c >> > index bacc1111d871..c9d99f8f4c82 100644 >> > --- a/security/commoncap.c >> > +++ b/security/commoncap.c >> > @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, >> > { >> > int size, ret; >> > kuid_t kroot; >> > + __le32 nsmagic, magic; >> > uid_t root, mappedroot; >> > char *tmpbuf = NULL; >> > struct vfs_cap_data *cap; >> > - struct vfs_ns_cap_data *nscap; >> > + struct vfs_ns_cap_data *nscap = NULL; >> > struct dentry *dentry; >> > struct user_namespace *fs_ns; >> > >> > @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, >> > fs_ns = inode->i_sb->s_user_ns; >> > cap = (struct vfs_cap_data *) tmpbuf; >> > if (is_v2header((size_t) ret, cap)) { >> > - /* If this is sizeof(vfs_cap_data) then we're ok with the >> > - * on-disk value, so return that. */ >> > - if (alloc) >> > - *buffer = tmpbuf; >> > - else >> > - kfree(tmpbuf); >> > - return ret; >> > - } else if (!is_v3header((size_t) ret, cap)) { >> > - kfree(tmpbuf); >> > - return -EINVAL; >> > + root = 0; >> > + } else if (is_v3header((size_t) ret, cap)) { >> > + nscap = (struct vfs_ns_cap_data *) tmpbuf; >> > + root = le32_to_cpu(nscap->rootid); >> > + } else { >> > + size = -EINVAL; >> > + goto out_free; >> > } >> > >> > - nscap = (struct vfs_ns_cap_data *) tmpbuf; >> > - root = le32_to_cpu(nscap->rootid); >> > kroot = make_kuid(fs_ns, root); >> > >> > /* If the root kuid maps to a valid uid in current ns, then return >> > * this as a nscap. */ >> > mappedroot = from_kuid(current_user_ns(), kroot); >> > if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { >> > + size = sizeof(struct vfs_ns_cap_data); >> > if (alloc) { >> > - *buffer = tmpbuf; >> > + if (!nscap) { >> > + /* v2 -> v3 conversion */ >> > + nscap = kzalloc(size, GFP_ATOMIC); >> > + if (!nscap) { >> > + size = -ENOMEM; >> > + goto out_free; >> > + } >> > + nsmagic = VFS_CAP_REVISION_3; >> > + magic = le32_to_cpu(cap->magic_etc); >> > + if (magic & VFS_CAP_FLAGS_EFFECTIVE) >> > + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; >> > + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); >> > + nscap->magic_etc = cpu_to_le32(nsmagic); >> > + } else { >> > + /* use allocated v3 buffer */ >> > + tmpbuf = NULL; >> > + } >> > nscap->rootid = cpu_to_le32(mappedroot); >> > - } else >> > - kfree(tmpbuf); >> > - return size; >> > + *buffer = nscap; >> > + } >> > + goto out_free; >> > } >> > >> > if (!rootid_owns_currentns(kroot)) { >> > - kfree(tmpbuf); >> > - return -EOPNOTSUPP; >> > + size = -EOVERFLOW; > > Why this change? Christian (cc:d) noticed that this is a user visible change. > Without this change, if you are in a userns which has different rootid, the > EOVERFLOW tells vfs_getxattr to vall back to __vfs_getxattr() and so you can > see the v3 capability with its rootid. > > With this change, you instead just get EOVERFLOW. Returning EOVERFLOW is the desired behavior when the rootid can not be represented by the calling userspace. Today when you execute such a file from such a namespace the file will run without any file capabilities because get_vfs_caps_from_disk returns -ENODATA. However today if you copy the file will all of it's xattrs onto another filesystem the new file will have a v3 cap that will grant capabilities in some contexts. That mismatch is potentially a security problem. Which means the only sane thing to do is to fail so userspace does not think it can safely copy or comprehend all of the xattrs of the file. >> > + goto out_free; >> > } >> > >> > /* This comes from a parent namespace. Return as a v2 capability */ >> > size = sizeof(struct vfs_cap_data); >> > if (alloc) { >> > - *buffer = kmalloc(size, GFP_ATOMIC); >> > - if (*buffer) { >> > - struct vfs_cap_data *cap = *buffer; >> > - __le32 nsmagic, magic; >> > + if (nscap) { >> > + /* v3 -> v2 conversion */ >> > + cap = kzalloc(size, GFP_ATOMIC); >> > + if (!cap) { >> > + size = -ENOMEM; >> > + goto out_free; >> > + } >> > magic = VFS_CAP_REVISION_2; >> > nsmagic = le32_to_cpu(nscap->magic_etc); >> > if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) >> > @@ -443,9 +459,12 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, >> > memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); >> > cap->magic_etc = cpu_to_le32(magic); >> > } else { >> > - size = -ENOMEM; >> > + /* use unconverted v2 */ >> > + tmpbuf = NULL; >> > } >> > + *buffer = cap; >> > } >> > +out_free: >> > kfree(tmpbuf); >> > return size; >> > } Eric
On Thu, Jan 28, 2021 at 9:24 PM Eric W. Biederman <ebiederm@xmission.com> wrote: > <aside> > From our previous discussions I would also argue it would be good > if there was a bypass that skipped all conversions if the reader > and the filesystem are in the same user namespace. > </aside> That's however just an optimization (AFAICS) that only makes sense if it helps a read world workload. I'm not convinced that that's the case. Thanks, Miklos
Miklos Szeredi <miklos@szeredi.hu> writes: > On Thu, Jan 28, 2021 at 9:24 PM Eric W. Biederman <ebiederm@xmission.com> wrote: > >> <aside> >> From our previous discussions I would also argue it would be good >> if there was a bypass that skipped all conversions if the reader >> and the filesystem are in the same user namespace. >> </aside> > > That's however just an optimization (AFAICS) that only makes sense if > it helps a read world workload. I'm not convinced that that's the > case. It is definitely a different issue. From previous conversations with Serge, there is a concern with a sysadmin wanting to see what is actually on disk. In case there are bugs that care about the different layout. Just passing everything through when no translation is necessary will allow that kind of diagnosis. As your patch demonstrates we already have had bugs in this area so being able to get at the raw data may help people if they get into a situation where bugs matter. Eric
"Serge E. Hallyn" <serge@hallyn.com> writes: > On Thu, Jan 28, 2021 at 02:19:13PM -0600, Eric W. Biederman wrote: >> "Serge E. Hallyn" <serge@hallyn.com> writes: >> >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: >> >> Miklos Szeredi <mszeredi@redhat.com> writes: >> >> >> >> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will >> >> > currently return in v2 format unconditionally. >> >> > >> >> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, >> >> > and so the same conversions performed on it. >> >> > >> >> > If the rootid cannot be mapped v3 is returned unconverted. Fix this so >> >> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs >> >> > user namespace in case of v2) cannot be mapped in the current user >> >> > namespace. >> >> >> >> This looks like a good cleanup. >> > >> > Sorry, I'm not following. Why is this a good cleanup? Why should >> > the xattr be shown as faked v3 in this case? >> >> If the reader is in &init_user_ns. If the filesystem was mounted in a >> user namespace. Then the reader looses the information that the > > Can you be more precise about "filesystem was mounted in a user namespace"? > Is this a FUSE thing, the fs is marked as being mounted in a non-init userns? > If that's a possible case, then yes that must be represented as v3. Using > is_v2header() may be the simpler way to check for that, but the more accurate > check would be "is it v2 header and mounted by init_user_ns". I think the filesystems current relevant are fuse,overlayfs,ramfs,tmpfs. > Basically yes, in as many cases as possible we want to just give a v2 > cap because more userspace knows what to do with that, but a non-init-userns > mounted fs which provides a v2 fscap should have it represented as v3 cap > with rootid being the kuid that owns the userns. That is the case we that is being fixed in the patch. > Or am I still thinking wrongly? Wouldn't be entirely surprised :) No you got it. Eric
"Serge E. Hallyn" <serge@hallyn.com> writes: > On Thu, Jan 28, 2021 at 08:44:26PM +0100, Miklos Szeredi wrote: >> On Thu, Jan 28, 2021 at 6:09 PM Serge E. Hallyn <serge@hallyn.com> wrote: >> > >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: >> > > Miklos Szeredi <mszeredi@redhat.com> writes: >> > > >> > > > if (!rootid_owns_currentns(kroot)) { >> > > > - kfree(tmpbuf); >> > > > - return -EOPNOTSUPP; >> > > > + size = -EOVERFLOW; >> > >> > Why this change? Christian (cc:d) noticed that this is a user visible change. >> > Without this change, if you are in a userns which has different rootid, the >> > EOVERFLOW tells vfs_getxattr to vall back to __vfs_getxattr() and so you can >> > see the v3 capability with its rootid. >> > >> > With this change, you instead just get EOVERFLOW. >> >> Why would the user want to see nonsense (in its own userns) rootid and >> what would it do with it? > > They would know that the data is there. But an error of -EOVERFLOW still indicates data is there. You just don't get the data because it can not be represented. >> Please give an example where an untranslatable rootid would make any >> sense at all to the user. > > I may have accidentally, from init_user_ns, as uid 1000, set an > fscap with rootid 100001 instead of 100000, and wonder why the > cap is not working in the container where 100000 is root. Getting -EOVERFLOW when attempting to read the cap from inside the user namespace will immediately tell you what is wrong. The rootid does not map. That is how all the non-mapping situations are handled. Either -EOVERFLOW or returning INVALID_UID/the unmapped user id aka nobody. The existing code is wrong because it returns a completely untranslated uid, which is completely non-sense. An argument could be made for returning a rootid of 0xffffffff aka INVALID_UID in a v3 cap xattr when the rootid can not be mapped. I think that is what we do with posix_acls that contain ids that don't map. My sense is returning -EOVERFLOW inside the container and returning the v3 cap xattr outside the container will most quickly get the problem diagnosed, and will be the most likely to not cause problems. If there is a good case for returning a v3 cap with rootid of 0xffffffff instead of -EOVERFLOW we can do that. Right now I don't see anything that would be compelling in either direction. Eric
On Fri, Jan 29, 2021 at 05:11:53PM -0600, Eric W. Biederman wrote: > "Serge E. Hallyn" <serge@hallyn.com> writes: > > > On Thu, Jan 28, 2021 at 08:44:26PM +0100, Miklos Szeredi wrote: > >> On Thu, Jan 28, 2021 at 6:09 PM Serge E. Hallyn <serge@hallyn.com> wrote: > >> > > >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: > >> > > Miklos Szeredi <mszeredi@redhat.com> writes: > >> > > > >> > > > if (!rootid_owns_currentns(kroot)) { > >> > > > - kfree(tmpbuf); > >> > > > - return -EOPNOTSUPP; > >> > > > + size = -EOVERFLOW; > >> > > >> > Why this change? Christian (cc:d) noticed that this is a user visible change. > >> > Without this change, if you are in a userns which has different rootid, the > >> > EOVERFLOW tells vfs_getxattr to vall back to __vfs_getxattr() and so you can > >> > see the v3 capability with its rootid. > >> > > >> > With this change, you instead just get EOVERFLOW. > >> > >> Why would the user want to see nonsense (in its own userns) rootid and > >> what would it do with it? > > > > They would know that the data is there. > > But an error of -EOVERFLOW still indicates data is there. > You just don't get the data because it can not be represented. Ok - and this happens *after* the check for whether the rootid to maps into the current ns. That sounds reasonable, thanks. > >> Please give an example where an untranslatable rootid would make any > >> sense at all to the user. > > > > I may have accidentally, from init_user_ns, as uid 1000, set an > > fscap with rootid 100001 instead of 100000, and wonder why the > > cap is not working in the container where 100000 is root. > > Getting -EOVERFLOW when attempting to read the cap from inside > the user namespace will immediately tell you what is wrong. The rootid > does not map. > > That is how all the non-mapping situations are handled. Either > -EOVERFLOW or returning INVALID_UID/the unmapped user id aka nobody. > > The existing code is wrong because it returns a completely untranslated > uid, which is completely non-sense. > > An argument could be made for returning a rootid of 0xffffffff aka > INVALID_UID in a v3 cap xattr when the rootid can not be mapped. I > think that is what we do with posix_acls that contain ids that don't > map. My sense is returning -EOVERFLOW inside the container and > returning the v3 cap xattr outside the container will most quickly get > the problem diagnosed, and will be the most likely to not cause > problems. > > If there is a good case for returning a v3 cap with rootid of 0xffffffff > instead of -EOVERFLOW we can do that. Right now I don't see anything > that would be compelling in either direction. > > Eric > > >
On Fri, Jan 29, 2021 at 04:55:29PM -0600, Eric W. Biederman wrote: > "Serge E. Hallyn" <serge@hallyn.com> writes: > > > On Thu, Jan 28, 2021 at 02:19:13PM -0600, Eric W. Biederman wrote: > >> "Serge E. Hallyn" <serge@hallyn.com> writes: > >> > >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: > >> >> Miklos Szeredi <mszeredi@redhat.com> writes: > >> >> > >> >> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will > >> >> > currently return in v2 format unconditionally. > >> >> > > >> >> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, > >> >> > and so the same conversions performed on it. > >> >> > > >> >> > If the rootid cannot be mapped v3 is returned unconverted. Fix this so > >> >> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs > >> >> > user namespace in case of v2) cannot be mapped in the current user > >> >> > namespace. > >> >> > >> >> This looks like a good cleanup. > >> > > >> > Sorry, I'm not following. Why is this a good cleanup? Why should > >> > the xattr be shown as faked v3 in this case? > >> > >> If the reader is in &init_user_ns. If the filesystem was mounted in a > >> user namespace. Then the reader looses the information that the > > > > Can you be more precise about "filesystem was mounted in a user namespace"? > > Is this a FUSE thing, the fs is marked as being mounted in a non-init userns? > > If that's a possible case, then yes that must be represented as v3. Using > > is_v2header() may be the simpler way to check for that, but the more accurate > > check would be "is it v2 header and mounted by init_user_ns". > > I think the filesystems current relevant are fuse,overlayfs,ramfs,tmpfs. > > > Basically yes, in as many cases as possible we want to just give a v2 > > cap because more userspace knows what to do with that, but a non-init-userns > > mounted fs which provides a v2 fscap should have it represented as v3 cap > > with rootid being the kuid that owns the userns. > > That is the case we that is being fixed in the patch. > > > Or am I still thinking wrongly? Wouldn't be entirely surprised :) > > No you got it. So then can we make faking a v3 gated on whether sb->s_user_ns != &init_user_ns ?
"Serge E. Hallyn" <serge@hallyn.com> writes: > On Fri, Jan 29, 2021 at 04:55:29PM -0600, Eric W. Biederman wrote: >> "Serge E. Hallyn" <serge@hallyn.com> writes: >> >> > On Thu, Jan 28, 2021 at 02:19:13PM -0600, Eric W. Biederman wrote: >> >> "Serge E. Hallyn" <serge@hallyn.com> writes: >> >> >> >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote: >> >> >> Miklos Szeredi <mszeredi@redhat.com> writes: >> >> >> >> >> >> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will >> >> >> > currently return in v2 format unconditionally. >> >> >> > >> >> >> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, >> >> >> > and so the same conversions performed on it. >> >> >> > >> >> >> > If the rootid cannot be mapped v3 is returned unconverted. Fix this so >> >> >> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs >> >> >> > user namespace in case of v2) cannot be mapped in the current user >> >> >> > namespace. >> >> >> >> >> >> This looks like a good cleanup. >> >> > >> >> > Sorry, I'm not following. Why is this a good cleanup? Why should >> >> > the xattr be shown as faked v3 in this case? >> >> >> >> If the reader is in &init_user_ns. If the filesystem was mounted in a >> >> user namespace. Then the reader looses the information that the >> > >> > Can you be more precise about "filesystem was mounted in a user namespace"? >> > Is this a FUSE thing, the fs is marked as being mounted in a non-init userns? >> > If that's a possible case, then yes that must be represented as v3. Using >> > is_v2header() may be the simpler way to check for that, but the more accurate >> > check would be "is it v2 header and mounted by init_user_ns". >> >> I think the filesystems current relevant are fuse,overlayfs,ramfs,tmpfs. >> >> > Basically yes, in as many cases as possible we want to just give a v2 >> > cap because more userspace knows what to do with that, but a non-init-userns >> > mounted fs which provides a v2 fscap should have it represented as v3 cap >> > with rootid being the kuid that owns the userns. >> >> That is the case we that is being fixed in the patch. >> >> > Or am I still thinking wrongly? Wouldn't be entirely surprised :) >> >> No you got it. > > So then can we make faking a v3 gated on whether > sb->s_user_ns != &init_user_ns ? Sort of. What Miklos's patch implements is always treating a v2 cap xattr on disk as v3 internally. > if (is_v2header((size_t) ret, cap)) { > root = 0; > } else if (is_v3header((size_t) ret, cap)) { > nscap = (struct vfs_ns_cap_data *) tmpbuf; > root = le32_to_cpu(nscap->rootid); > } else { > size = -EINVAL; > goto out_free; > } Then v3 is returned if: > /* If the root kuid maps to a valid uid in current ns, then return > * this as a nscap. */ > mappedroot = from_kuid(current_user_ns(), kroot); > if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { After that we verify that the fs capability can be seen by the caller as a v2 cap xattr with: > > if (!rootid_owns_currentns(kroot)) { > > size = -EOVERFLOW; > > goto out_free; Anything that passes that test and does not encounter a memory allocation error is returned as a v2. ... Which in practice does mean that if sb->s_user_ns != &init_user_ns, then mappedroot != 0, and is returned as a v3. The rest of the logic takes care of all of the other crazy silly combinations. Like a user namespace that identity maps uid 0, and then mounts a filesystem. Eric
diff --git a/security/commoncap.c b/security/commoncap.c index bacc1111d871..c9d99f8f4c82 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, { int size, ret; kuid_t kroot; + __le32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; - struct vfs_ns_cap_data *nscap; + struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { - /* If this is sizeof(vfs_cap_data) then we're ok with the - * on-disk value, so return that. */ - if (alloc) - *buffer = tmpbuf; - else - kfree(tmpbuf); - return ret; - } else if (!is_v3header((size_t) ret, cap)) { - kfree(tmpbuf); - return -EINVAL; + root = 0; + } else if (is_v3header((size_t) ret, cap)) { + nscap = (struct vfs_ns_cap_data *) tmpbuf; + root = le32_to_cpu(nscap->rootid); + } else { + size = -EINVAL; + goto out_free; } - nscap = (struct vfs_ns_cap_data *) tmpbuf; - root = le32_to_cpu(nscap->rootid); kroot = make_kuid(fs_ns, root); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { + size = sizeof(struct vfs_ns_cap_data); if (alloc) { - *buffer = tmpbuf; + if (!nscap) { + /* v2 -> v3 conversion */ + nscap = kzalloc(size, GFP_ATOMIC); + if (!nscap) { + size = -ENOMEM; + goto out_free; + } + nsmagic = VFS_CAP_REVISION_3; + magic = le32_to_cpu(cap->magic_etc); + if (magic & VFS_CAP_FLAGS_EFFECTIVE) + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + nscap->magic_etc = cpu_to_le32(nsmagic); + } else { + /* use allocated v3 buffer */ + tmpbuf = NULL; + } nscap->rootid = cpu_to_le32(mappedroot); - } else - kfree(tmpbuf); - return size; + *buffer = nscap; + } + goto out_free; } if (!rootid_owns_currentns(kroot)) { - kfree(tmpbuf); - return -EOPNOTSUPP; + size = -EOVERFLOW; + goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { - *buffer = kmalloc(size, GFP_ATOMIC); - if (*buffer) { - struct vfs_cap_data *cap = *buffer; - __le32 nsmagic, magic; + if (nscap) { + /* v3 -> v2 conversion */ + cap = kzalloc(size, GFP_ATOMIC); + if (!cap) { + size = -ENOMEM; + goto out_free; + } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) @@ -443,9 +459,12 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { - size = -ENOMEM; + /* use unconverted v2 */ + tmpbuf = NULL; } + *buffer = cap; } +out_free: kfree(tmpbuf); return size; }
If a capability is stored on disk in v2 format cap_inode_getsecurity() will currently return in v2 format unconditionally. This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, and so the same conversions performed on it. If the rootid cannot be mapped v3 is returned unconverted. Fix this so that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs user namespace in case of v2) cannot be mapped in the current user namespace. Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> --- security/commoncap.c | 67 ++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 24 deletions(-)