diff mbox

[v2,2/3] /dev/dax, core: file operations and dax-mmap

Message ID 146329358943.17948.12857084981536373315.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams May 15, 2016, 6:26 a.m. UTC
The "Device DAX" core enables dax mappings of performance / feature
differentiated memory.  An open mapping or file handle keeps the backing
struct device live, but new mappings are only possible while the device
is enabled.   Faults are handled under rcu_read_lock to synchronize
with the enabled state of the device.

Similar to the filesystem-dax case the backing memory may optionally
have struct page entries.  However, unlike fs-dax there is no support
for private mappings, or mappings that are not backed by media (see
use of zero-page in fs-dax).

Mappings are always guaranteed to match the alignment of the dax_region.
If the dax_region is configured to have a 2MB alignment, all mappings
are guaranteed to be backed by a pmd entry.  Contrast this determinism
with the fs-dax case where pmd mappings are opportunistic.  If userspace
attempts to force a misaligned mapping, the driver will fail the mmap
attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
like MAP_PRIVATE mappings.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/Kconfig |    1 
 drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/huge_memory.c    |    1 
 mm/hugetlb.c        |    1 
 4 files changed, 319 insertions(+)


--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Johannes Thumshirn May 17, 2016, 10:57 a.m. UTC | #1
On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
> The "Device DAX" core enables dax mappings of performance / feature
> differentiated memory.  An open mapping or file handle keeps the backing
> struct device live, but new mappings are only possible while the device
> is enabled.   Faults are handled under rcu_read_lock to synchronize
> with the enabled state of the device.
> 
> Similar to the filesystem-dax case the backing memory may optionally
> have struct page entries.  However, unlike fs-dax there is no support
> for private mappings, or mappings that are not backed by media (see
> use of zero-page in fs-dax).
> 
> Mappings are always guaranteed to match the alignment of the dax_region.
> If the dax_region is configured to have a 2MB alignment, all mappings
> are guaranteed to be backed by a pmd entry.  Contrast this determinism
> with the fs-dax case where pmd mappings are opportunistic.  If userspace
> attempts to force a misaligned mapping, the driver will fail the mmap
> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
> like MAP_PRIVATE mappings.
> 
> Cc: Jeff Moyer <jmoyer@redhat.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/dax/Kconfig |    1 
>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/huge_memory.c    |    1 
>  mm/hugetlb.c        |    1 
>  4 files changed, 319 insertions(+)
> 
> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> index 86ffbaa891ad..cedab7572de3 100644
> --- a/drivers/dax/Kconfig
> +++ b/drivers/dax/Kconfig
> @@ -1,6 +1,7 @@
>  menuconfig DEV_DAX
>  	tristate "DAX: direct access to differentiated memory"
>  	default m if NVDIMM_DAX
> +	depends on TRANSPARENT_HUGEPAGE
>  	help
>  	  Support raw access to differentiated (persistence, bandwidth,
>  	  latency...) memory via an mmap(2) capable character
> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> index 8207fb33a992..b2fe8a0ce866 100644
> --- a/drivers/dax/dax.c
> +++ b/drivers/dax/dax.c
> @@ -49,6 +49,7 @@ struct dax_region {
>   * @region - parent region
>   * @dev - device backing the character device
>   * @kref - enable this data to be tracked in filp->private_data
> + * @alive - !alive + rcu grace period == no new mappings can be established
>   * @id - child id in the region
>   * @num_resources - number of physical address extents in this device
>   * @res - array of physical address ranges
> @@ -57,6 +58,7 @@ struct dax_dev {
>  	struct dax_region *region;
>  	struct device *dev;
>  	struct kref kref;
> +	bool alive;
>  	int id;
>  	int num_resources;
>  	struct resource res[0];
> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>  
>  	dev_dbg(dev, "%s\n", __func__);
>  
> +	/* disable and flush fault handlers, TODO unmap inodes */
> +	dax_dev->alive = false;
> +	synchronize_rcu();
> +

IIRC RCU is only protecting a pointer, not the content of the pointer, so this
looks wrong to me.

>  	get_device(dev);
>  	device_unregister(dev);
>  	ida_simple_remove(&dax_region->ida, dax_dev->id);
> @@ -173,6 +179,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>  	memcpy(dax_dev->res, res, sizeof(*res) * count);
>  	dax_dev->num_resources = count;
>  	kref_init(&dax_dev->kref);
> +	dax_dev->alive = true;
>  	dax_dev->region = dax_region;
>  	kref_get(&dax_region->kref);
>  
> @@ -217,9 +224,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>  }
>  EXPORT_SYMBOL_GPL(devm_create_dax_dev);
>  
> +/* return an unmapped area aligned to the dax region specified alignment */
> +static unsigned long dax_dev_get_unmapped_area(struct file *filp,
> +		unsigned long addr, unsigned long len, unsigned long pgoff,
> +		unsigned long flags)
> +{
> +	unsigned long off, off_end, off_align, len_align, addr_align, align;
> +	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
> +	struct dax_region *dax_region;
> +
> +	if (!dax_dev || addr)
> +		goto out;
> +
> +	dax_region = dax_dev->region;
> +	align = dax_region->align;
> +	off = pgoff << PAGE_SHIFT;
> +	off_end = off + len;
> +	off_align = round_up(off, align);
> +
> +	if ((off_end <= off_align) || ((off_end - off_align) < align))
> +		goto out;
> +
> +	len_align = len + align;
> +	if ((off + len_align) < off)
> +		goto out;
> +
> +	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
> +			pgoff, flags);
> +	if (!IS_ERR_VALUE(addr_align)) {
> +		addr_align += (off - addr_align) & (align - 1);
> +		return addr_align;
> +	}
> + out:
> +	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
> +}
> +
> +static int __match_devt(struct device *dev, const void *data)
> +{
> +	const dev_t *devt = data;
> +
> +	return dev->devt == *devt;
> +}
> +
> +static struct device *dax_dev_find(dev_t dev_t)
> +{
> +	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
> +}
> +
> +static int dax_dev_open(struct inode *inode, struct file *filp)
> +{
> +	struct dax_dev *dax_dev = NULL;
> +	struct device *dev;
> +
> +	dev = dax_dev_find(inode->i_rdev);
> +	if (!dev)
> +		return -ENXIO;
> +
> +	device_lock(dev);
> +	dax_dev = dev_get_drvdata(dev);
> +	if (dax_dev) {
> +		dev_dbg(dev, "%s\n", __func__);
> +		filp->private_data = dax_dev;
> +		kref_get(&dax_dev->kref);
> +		inode->i_flags = S_DAX;
> +	}
> +	device_unlock(dev);
> +

Does this block really need to be protected by the dev mutex? If yes, have you
considered re-ordering it like this?

	device_lock(dev);
	dax_dev = dev_get_drvdata(dev);
	if (!dax_dev) {
		device_unlock(dev);
		goto out_put;
	}
	filp->private_data = dax_dev;
	kref_get(&dax_dev->kref); // or get_dax_device(dax_dev)
	inode->i_flags = S_DAX;
	device_unlock(dev);

	return 0;

out_put:
	put_device(dev);
	return -ENXIO;

The only thing I see that could be needed to be protected here, is the
inode->i_flags and shouldn't that be protected by the inode->i_mutex? But I'm
not sure, hence the question. Also S_DAX is the only valid flag for a DAX
device, isn't it?

> +	if (!dax_dev) {
> +		put_device(dev);
> +		return -ENXIO;
> +	}
> +	return 0;
> +}
> +
> +static int dax_dev_release(struct inode *inode, struct file *filp)
> +{
> +	struct dax_dev *dax_dev = filp->private_data;
> +	struct device *dev = dax_dev->dev;
> +
> +	dev_dbg(dax_dev->dev, "%s\n", __func__);
> +	dax_dev_put(dax_dev);
> +	put_device(dev);
> +

For reasons of consistency one could reset the inode's i_flags here.

> +	return 0;
> +}
> +
> +static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
> +		const char *func)
> +{
> +	struct dax_region *dax_region = dax_dev->region;
> +	struct device *dev = dax_dev->dev;
> +	unsigned long mask;
> +
> +	if (!dax_dev->alive)
> +		return -ENXIO;
> +
> +	/* prevent private / writable mappings from being established */
> +	if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
> +		dev_dbg(dev, "%s: %s: fail, attempted private mapping\n",
> +				current->comm, func);

This deserves a higher log-level than debug, IMHO.

> +		return -EINVAL;
> +	}
> +
> +	mask = dax_region->align - 1;
> +	if (vma->vm_start & mask || vma->vm_end & mask) {
> +		dev_dbg(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
> +				current->comm, func, vma->vm_start, vma->vm_end,
> +				mask);

Ditto.

> +		return -EINVAL;
> +	}
> +
> +	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
> +			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
> +		dev_dbg(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
> +				current->comm, func);

Ditto.

> +		return -EINVAL;
> +	}
> +
> +	if (!vma_is_dax(vma)) {
> +		dev_dbg(dev, "%s: %s: fail, vma is not DAX capable\n",
> +				current->comm, func);

Ditto.

> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
> +		unsigned long size)
> +{
> +	struct resource *res;
> +	phys_addr_t phys;
> +	int i;
> +
> +	for (i = 0; i < dax_dev->num_resources; i++) {
> +		res = &dax_dev->res[i];
> +		phys = pgoff * PAGE_SIZE + res->start;
> +		if (phys >= res->start && phys <= res->end)
> +			break;
> +		pgoff -= PHYS_PFN(resource_size(res));
> +	}
> +
> +	if (i < dax_dev->num_resources) {
> +		res = &dax_dev->res[i];
> +		if (phys + size - 1 <= res->end)
> +			return phys;
> +	}
> +
> +	return -1;
> +}
> +
> +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
> +		struct vm_fault *vmf)
> +{
> +	unsigned long vaddr = (unsigned long) vmf->virtual_address;
> +	struct device *dev = dax_dev->dev;
> +	struct dax_region *dax_region;
> +	int rc = VM_FAULT_SIGBUS;
> +	phys_addr_t phys;
> +	pfn_t pfn;
> +
> +	if (check_vma(dax_dev, vma, __func__))
> +		return VM_FAULT_SIGBUS;
> +
> +	dax_region = dax_dev->region;
> +	if (dax_region->align > PAGE_SIZE) {
> +		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
> +	if (phys == -1) {
> +		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
> +				vmf->pgoff);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
> +
> +	rc = vm_insert_mixed(vma, vaddr, pfn);
> +
> +	if (rc == -ENOMEM)
> +		return VM_FAULT_OOM;
> +	if (rc < 0 && rc != -EBUSY)
> +		return VM_FAULT_SIGBUS;
> +
> +	return VM_FAULT_NOPAGE;
> +}
> +
> +static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	int rc;
> +	struct file *filp = vma->vm_file;
> +	struct dax_dev *dax_dev = filp->private_data;
> +
> +	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
> +			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
> +			? "write" : "read", vma->vm_start, vma->vm_end);
> +	rcu_read_lock();
> +	rc = __dax_dev_fault(dax_dev, vma, vmf);
> +	rcu_read_unlock();

Similarly, what are you protecting? I just see you're locking something to be
read, but don't do a rcu_dereference() to actually access a rcu protected
pointer. Or am I missing something totally here?

> +
> +	return rc;
> +}
> +
> +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
> +		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
> +		unsigned int flags)
> +{
> +	unsigned long pmd_addr = addr & PMD_MASK;
> +	struct device *dev = dax_dev->dev;
> +	struct dax_region *dax_region;
> +	phys_addr_t phys;
> +	pgoff_t pgoff;
> +	pfn_t pfn;
> +
> +	if (check_vma(dax_dev, vma, __func__))
> +		return VM_FAULT_SIGBUS;
> +
> +	dax_region = dax_dev->region;
> +	if (dax_region->align > PMD_SIZE) {
> +		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	/* dax pmd mappings require pfn_t_devmap() */
> +	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
> +		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	pgoff = linear_page_index(vma, pmd_addr);
> +	phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
> +	if (phys == -1) {
> +		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
> +				pgoff);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
> +
> +	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
> +			flags & FAULT_FLAG_WRITE);
> +}
> +
> +static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
> +		pmd_t *pmd, unsigned int flags)
> +{
> +	int rc;
> +	struct file *filp = vma->vm_file;
> +	struct dax_dev *dax_dev = filp->private_data;
> +
> +	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
> +			current->comm, (flags & FAULT_FLAG_WRITE)
> +			? "write" : "read", vma->vm_start, vma->vm_end);
> +
> +	rcu_read_lock();
> +	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
> +	rcu_read_unlock();
> +
> +	return rc;
> +}
> +
> +static void dax_dev_vm_open(struct vm_area_struct *vma)
> +{
> +	struct file *filp = vma->vm_file;
> +	struct dax_dev *dax_dev = filp->private_data;
> +
> +	dev_dbg(dax_dev->dev, "%s\n", __func__);
> +	kref_get(&dax_dev->kref);
> +}
> +
> +static void dax_dev_vm_close(struct vm_area_struct *vma)
> +{
> +	struct file *filp = vma->vm_file;
> +	struct dax_dev *dax_dev = filp->private_data;
> +
> +	dev_dbg(dax_dev->dev, "%s\n", __func__);
> +	dax_dev_put(dax_dev);
> +}
> +
> +static const struct vm_operations_struct dax_dev_vm_ops = {
> +	.fault = dax_dev_fault,
> +	.pmd_fault = dax_dev_pmd_fault,
> +	.open = dax_dev_vm_open,
> +	.close = dax_dev_vm_close,
> +};
> +
> +static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	struct dax_dev *dax_dev = filp->private_data;
> +	int rc;
> +
> +	dev_dbg(dax_dev->dev, "%s\n", __func__);
> +
> +	rc = check_vma(dax_dev, vma, __func__);
> +	if (rc)
> +		return rc;
> +
> +	kref_get(&dax_dev->kref);
> +	vma->vm_ops = &dax_dev_vm_ops;
> +	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> +	return 0;
> +
> +}
> +
>  static const struct file_operations dax_fops = {
>  	.llseek = noop_llseek,
>  	.owner = THIS_MODULE,
> +	.open = dax_dev_open,
> +	.release = dax_dev_release,
> +	.get_unmapped_area = dax_dev_get_unmapped_area,
> +	.mmap = dax_dev_mmap,
>  };
>  
>  static int __init dax_init(void)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 86f9f8b82f8e..52ea012d8a80 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
>  	return VM_FAULT_NOPAGE;
>  }
> +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
>  
>  static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
>  		pmd_t *pmd)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 19d0d08b396f..b14e98129b07 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
>  {
>  	return vma_hugecache_offset(hstate_vma(vma), vma, address);
>  }
> +EXPORT_SYMBOL_GPL(linear_hugepage_index);
>  
>  /*
>   * Return the size of the pages allocated when backing a VMA. In the majority
> 
> _______________________________________________
> Linux-nvdimm mailing list
> Linux-nvdimm@lists.01.org
> https://lists.01.org/mailman/listinfo/linux-nvdimm
Dan Williams May 17, 2016, 10:19 p.m. UTC | #2
On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
>> The "Device DAX" core enables dax mappings of performance / feature
>> differentiated memory.  An open mapping or file handle keeps the backing
>> struct device live, but new mappings are only possible while the device
>> is enabled.   Faults are handled under rcu_read_lock to synchronize
>> with the enabled state of the device.
>>
>> Similar to the filesystem-dax case the backing memory may optionally
>> have struct page entries.  However, unlike fs-dax there is no support
>> for private mappings, or mappings that are not backed by media (see
>> use of zero-page in fs-dax).
>>
>> Mappings are always guaranteed to match the alignment of the dax_region.
>> If the dax_region is configured to have a 2MB alignment, all mappings
>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
>> attempts to force a misaligned mapping, the driver will fail the mmap
>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
>> like MAP_PRIVATE mappings.
>>
>> Cc: Jeff Moyer <jmoyer@redhat.com>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> ---
>>  drivers/dax/Kconfig |    1
>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  mm/huge_memory.c    |    1
>>  mm/hugetlb.c        |    1
>>  4 files changed, 319 insertions(+)
>>
>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
>> index 86ffbaa891ad..cedab7572de3 100644
>> --- a/drivers/dax/Kconfig
>> +++ b/drivers/dax/Kconfig
>> @@ -1,6 +1,7 @@
>>  menuconfig DEV_DAX
>>       tristate "DAX: direct access to differentiated memory"
>>       default m if NVDIMM_DAX
>> +     depends on TRANSPARENT_HUGEPAGE
>>       help
>>         Support raw access to differentiated (persistence, bandwidth,
>>         latency...) memory via an mmap(2) capable character
>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
>> index 8207fb33a992..b2fe8a0ce866 100644
>> --- a/drivers/dax/dax.c
>> +++ b/drivers/dax/dax.c
>> @@ -49,6 +49,7 @@ struct dax_region {
>>   * @region - parent region
>>   * @dev - device backing the character device
>>   * @kref - enable this data to be tracked in filp->private_data
>> + * @alive - !alive + rcu grace period == no new mappings can be established
>>   * @id - child id in the region
>>   * @num_resources - number of physical address extents in this device
>>   * @res - array of physical address ranges
>> @@ -57,6 +58,7 @@ struct dax_dev {
>>       struct dax_region *region;
>>       struct device *dev;
>>       struct kref kref;
>> +     bool alive;
>>       int id;
>>       int num_resources;
>>       struct resource res[0];
>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>>
>>       dev_dbg(dev, "%s\n", __func__);
>>
>> +     /* disable and flush fault handlers, TODO unmap inodes */
>> +     dax_dev->alive = false;
>> +     synchronize_rcu();
>> +
>
> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
> looks wrong to me.

The driver is using RCU to guarantee that all currently running fault
handlers have either completed or will see the new state of ->alive
when they start.  Reference counts are protecting the actual dax_dev
object.

>>       get_device(dev);
>>       device_unregister(dev);
>>       ida_simple_remove(&dax_region->ida, dax_dev->id);
>> @@ -173,6 +179,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>>       memcpy(dax_dev->res, res, sizeof(*res) * count);
>>       dax_dev->num_resources = count;
>>       kref_init(&dax_dev->kref);
>> +     dax_dev->alive = true;
>>       dax_dev->region = dax_region;
>>       kref_get(&dax_region->kref);
>>
>> @@ -217,9 +224,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>>  }
>>  EXPORT_SYMBOL_GPL(devm_create_dax_dev);
>>
>> +/* return an unmapped area aligned to the dax region specified alignment */
>> +static unsigned long dax_dev_get_unmapped_area(struct file *filp,
>> +             unsigned long addr, unsigned long len, unsigned long pgoff,
>> +             unsigned long flags)
>> +{
>> +     unsigned long off, off_end, off_align, len_align, addr_align, align;
>> +     struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
>> +     struct dax_region *dax_region;
>> +
>> +     if (!dax_dev || addr)
>> +             goto out;
>> +
>> +     dax_region = dax_dev->region;
>> +     align = dax_region->align;
>> +     off = pgoff << PAGE_SHIFT;
>> +     off_end = off + len;
>> +     off_align = round_up(off, align);
>> +
>> +     if ((off_end <= off_align) || ((off_end - off_align) < align))
>> +             goto out;
>> +
>> +     len_align = len + align;
>> +     if ((off + len_align) < off)
>> +             goto out;
>> +
>> +     addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
>> +                     pgoff, flags);
>> +     if (!IS_ERR_VALUE(addr_align)) {
>> +             addr_align += (off - addr_align) & (align - 1);
>> +             return addr_align;
>> +     }
>> + out:
>> +     return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>> +}
>> +
>> +static int __match_devt(struct device *dev, const void *data)
>> +{
>> +     const dev_t *devt = data;
>> +
>> +     return dev->devt == *devt;
>> +}
>> +
>> +static struct device *dax_dev_find(dev_t dev_t)
>> +{
>> +     return class_find_device(dax_class, NULL, &dev_t, __match_devt);
>> +}
>> +
>> +static int dax_dev_open(struct inode *inode, struct file *filp)
>> +{
>> +     struct dax_dev *dax_dev = NULL;
>> +     struct device *dev;
>> +
>> +     dev = dax_dev_find(inode->i_rdev);
>> +     if (!dev)
>> +             return -ENXIO;
>> +
>> +     device_lock(dev);
>> +     dax_dev = dev_get_drvdata(dev);
>> +     if (dax_dev) {
>> +             dev_dbg(dev, "%s\n", __func__);
>> +             filp->private_data = dax_dev;
>> +             kref_get(&dax_dev->kref);
>> +             inode->i_flags = S_DAX;
>> +     }
>> +     device_unlock(dev);
>> +
>
> Does this block really need to be protected by the dev mutex? If yes, have you
> considered re-ordering it like this?

We need to make sure the dax_dev is not freed while we're trying to
take a reference against it, hence the lock.

>
>         device_lock(dev);
>         dax_dev = dev_get_drvdata(dev);
>         if (!dax_dev) {
>                 device_unlock(dev);
>                 goto out_put;
>         }
>         filp->private_data = dax_dev;
>         kref_get(&dax_dev->kref); // or get_dax_device(dax_dev)

dax_dev may already be invalid at this point if open() is racing
device_unregister().

>         inode->i_flags = S_DAX;
>         device_unlock(dev);
>
>         return 0;
>
> out_put:
>         put_device(dev);
>         return -ENXIO;
>
> The only thing I see that could be needed to be protected here, is the
> inode->i_flags and shouldn't that be protected by the inode->i_mutex? But I'm
> not sure, hence the question.

Nothing else should be mutating the inode flags so I don't think we
need protection.

> Also S_DAX is the only valid flag for a DAX
> device, isn't it?

Correct, we need that capability so that vma_is_dax() will recognize
these mappings.

>
>> +     if (!dax_dev) {
>> +             put_device(dev);
>> +             return -ENXIO;
>> +     }
>> +     return 0;
>> +}
>> +
>> +static int dax_dev_release(struct inode *inode, struct file *filp)
>> +{
>> +     struct dax_dev *dax_dev = filp->private_data;
>> +     struct device *dev = dax_dev->dev;
>> +
>> +     dev_dbg(dax_dev->dev, "%s\n", __func__);
>> +     dax_dev_put(dax_dev);
>> +     put_device(dev);
>> +
>
> For reasons of consistency one could reset the inode's i_flags here.

That change seems like a bug to me, it doesn't stop being a dax inode
just because an open file is released, right?

>> +     return 0;
>> +}
>> +
>> +static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
>> +             const char *func)
>> +{
>> +     struct dax_region *dax_region = dax_dev->region;
>> +     struct device *dev = dax_dev->dev;
>> +     unsigned long mask;
>> +
>> +     if (!dax_dev->alive)
>> +             return -ENXIO;
>> +
>> +     /* prevent private / writable mappings from being established */
>> +     if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
>> +             dev_dbg(dev, "%s: %s: fail, attempted private mapping\n",
>> +                             current->comm, func);
>
> This deserves a higher log-level than debug, IMHO.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     mask = dax_region->align - 1;
>> +     if (vma->vm_start & mask || vma->vm_end & mask) {
>> +             dev_dbg(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
>> +                             current->comm, func, vma->vm_start, vma->vm_end,
>> +                             mask);
>
> Ditto.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
>> +                     && (vma->vm_flags & VM_DONTCOPY) == 0) {
>> +             dev_dbg(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
>> +                             current->comm, func);
>
> Ditto.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     if (!vma_is_dax(vma)) {
>> +             dev_dbg(dev, "%s: %s: fail, vma is not DAX capable\n",
>> +                             current->comm, func);
>
> Ditto.

Ok, will bump these up to dev_info.

>> +             return -EINVAL;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
>> +             unsigned long size)
>> +{
>> +     struct resource *res;
>> +     phys_addr_t phys;
>> +     int i;
>> +
>> +     for (i = 0; i < dax_dev->num_resources; i++) {
>> +             res = &dax_dev->res[i];
>> +             phys = pgoff * PAGE_SIZE + res->start;
>> +             if (phys >= res->start && phys <= res->end)
>> +                     break;
>> +             pgoff -= PHYS_PFN(resource_size(res));
>> +     }
>> +
>> +     if (i < dax_dev->num_resources) {
>> +             res = &dax_dev->res[i];
>> +             if (phys + size - 1 <= res->end)
>> +                     return phys;
>> +     }
>> +
>> +     return -1;
>> +}
>> +
>> +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
>> +             struct vm_fault *vmf)
>> +{
>> +     unsigned long vaddr = (unsigned long) vmf->virtual_address;
>> +     struct device *dev = dax_dev->dev;
>> +     struct dax_region *dax_region;
>> +     int rc = VM_FAULT_SIGBUS;
>> +     phys_addr_t phys;
>> +     pfn_t pfn;
>> +
>> +     if (check_vma(dax_dev, vma, __func__))
>> +             return VM_FAULT_SIGBUS;
>> +
>> +     dax_region = dax_dev->region;
>> +     if (dax_region->align > PAGE_SIZE) {
>> +             dev_dbg(dev, "%s: alignment > fault size\n", __func__);
>> +             return VM_FAULT_SIGBUS;
>> +     }
>> +
>> +     phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
>> +     if (phys == -1) {
>> +             dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
>> +                             vmf->pgoff);
>> +             return VM_FAULT_SIGBUS;
>> +     }
>> +
>> +     pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>> +
>> +     rc = vm_insert_mixed(vma, vaddr, pfn);
>> +
>> +     if (rc == -ENOMEM)
>> +             return VM_FAULT_OOM;
>> +     if (rc < 0 && rc != -EBUSY)
>> +             return VM_FAULT_SIGBUS;
>> +
>> +     return VM_FAULT_NOPAGE;
>> +}
>> +
>> +static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>> +{
>> +     int rc;
>> +     struct file *filp = vma->vm_file;
>> +     struct dax_dev *dax_dev = filp->private_data;
>> +
>> +     dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
>> +                     current->comm, (vmf->flags & FAULT_FLAG_WRITE)
>> +                     ? "write" : "read", vma->vm_start, vma->vm_end);
>> +     rcu_read_lock();
>> +     rc = __dax_dev_fault(dax_dev, vma, vmf);
>> +     rcu_read_unlock();
>
> Similarly, what are you protecting? I just see you're locking something to be
> read, but don't do a rcu_dereference() to actually access a rcu protected
> pointer. Or am I missing something totally here?

Like I mentioned above, I'm protecting the fault handler vs shutdown.
I need this building-block-guarantee later when I go to fix the vfs to
unmap inodes on device-removal.  Same bug currently in filesystem-DAX
[1].

[1]: http://thread.gmane.org/gmane.linux.file-systems/103333/focus=72179

[..]

Thanks for the review Johannes.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Reinecke May 18, 2016, 8:07 a.m. UTC | #3
On 05/18/2016 12:19 AM, Dan Williams wrote:
> On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
>>> The "Device DAX" core enables dax mappings of performance / feature
>>> differentiated memory.  An open mapping or file handle keeps the backing
>>> struct device live, but new mappings are only possible while the device
>>> is enabled.   Faults are handled under rcu_read_lock to synchronize
>>> with the enabled state of the device.
>>>
>>> Similar to the filesystem-dax case the backing memory may optionally
>>> have struct page entries.  However, unlike fs-dax there is no support
>>> for private mappings, or mappings that are not backed by media (see
>>> use of zero-page in fs-dax).
>>>
>>> Mappings are always guaranteed to match the alignment of the dax_region.
>>> If the dax_region is configured to have a 2MB alignment, all mappings
>>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
>>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
>>> attempts to force a misaligned mapping, the driver will fail the mmap
>>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
>>> like MAP_PRIVATE mappings.
>>>
>>> Cc: Jeff Moyer <jmoyer@redhat.com>
>>> Cc: Christoph Hellwig <hch@lst.de>
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>>> ---
>>>  drivers/dax/Kconfig |    1
>>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  mm/huge_memory.c    |    1
>>>  mm/hugetlb.c        |    1
>>>  4 files changed, 319 insertions(+)
>>>
>>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
>>> index 86ffbaa891ad..cedab7572de3 100644
>>> --- a/drivers/dax/Kconfig
>>> +++ b/drivers/dax/Kconfig
>>> @@ -1,6 +1,7 @@
>>>  menuconfig DEV_DAX
>>>       tristate "DAX: direct access to differentiated memory"
>>>       default m if NVDIMM_DAX
>>> +     depends on TRANSPARENT_HUGEPAGE
>>>       help
>>>         Support raw access to differentiated (persistence, bandwidth,
>>>         latency...) memory via an mmap(2) capable character
>>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
>>> index 8207fb33a992..b2fe8a0ce866 100644
>>> --- a/drivers/dax/dax.c
>>> +++ b/drivers/dax/dax.c
>>> @@ -49,6 +49,7 @@ struct dax_region {
>>>   * @region - parent region
>>>   * @dev - device backing the character device
>>>   * @kref - enable this data to be tracked in filp->private_data
>>> + * @alive - !alive + rcu grace period == no new mappings can be established
>>>   * @id - child id in the region
>>>   * @num_resources - number of physical address extents in this device
>>>   * @res - array of physical address ranges
>>> @@ -57,6 +58,7 @@ struct dax_dev {
>>>       struct dax_region *region;
>>>       struct device *dev;
>>>       struct kref kref;
>>> +     bool alive;
>>>       int id;
>>>       int num_resources;
>>>       struct resource res[0];
>>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>>>
>>>       dev_dbg(dev, "%s\n", __func__);
>>>
>>> +     /* disable and flush fault handlers, TODO unmap inodes */
>>> +     dax_dev->alive = false;
>>> +     synchronize_rcu();
>>> +
>>
>> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
>> looks wrong to me.
> 
> The driver is using RCU to guarantee that all currently running fault
> handlers have either completed or will see the new state of ->alive
> when they start.  Reference counts are protecting the actual dax_dev
> object.
> 
Hmm.
This is the same 'creative' RCU usage Mike Snitzer has been trying
when trying to improve device-mapper performance.

From my understanding RCU is protecting the _pointer_, not the
values of the structure pointed to.
IOW we are guaranteed to have a valid pointer at any time.
But at the same time _no_ guarantee is made about the _contents_ of
the structure.
It might well be that using 'synchronize_rcu' giving you similar
results (as synchronize_rcu() is essentially waiting a SMP grace
period, after which all CPUs should be seeing the update).
However, I haven't been able to find that this is a guaranteed
behaviour.
So from my understanding you have to use locking primitives
protecting the contents of the structure or exchange the _entire_
structure if you want to rely on RCU here.

Can we get some clarification here?
Paul?

Cheers,

Hannes
Paul Mackerras May 18, 2016, 9:10 a.m. UTC | #4
On Wed, May 18, 2016 at 10:07:19AM +0200, Hannes Reinecke wrote:
> On 05/18/2016 12:19 AM, Dan Williams wrote:
> > On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> >> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
> >>> The "Device DAX" core enables dax mappings of performance / feature
> >>> differentiated memory.  An open mapping or file handle keeps the backing
> >>> struct device live, but new mappings are only possible while the device
> >>> is enabled.   Faults are handled under rcu_read_lock to synchronize
> >>> with the enabled state of the device.
> >>>
> >>> Similar to the filesystem-dax case the backing memory may optionally
> >>> have struct page entries.  However, unlike fs-dax there is no support
> >>> for private mappings, or mappings that are not backed by media (see
> >>> use of zero-page in fs-dax).
> >>>
> >>> Mappings are always guaranteed to match the alignment of the dax_region.
> >>> If the dax_region is configured to have a 2MB alignment, all mappings
> >>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
> >>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
> >>> attempts to force a misaligned mapping, the driver will fail the mmap
> >>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
> >>> like MAP_PRIVATE mappings.
> >>>
> >>> Cc: Jeff Moyer <jmoyer@redhat.com>
> >>> Cc: Christoph Hellwig <hch@lst.de>
> >>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> >>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >>> ---
> >>>  drivers/dax/Kconfig |    1
> >>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>  mm/huge_memory.c    |    1
> >>>  mm/hugetlb.c        |    1
> >>>  4 files changed, 319 insertions(+)
> >>>
> >>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> >>> index 86ffbaa891ad..cedab7572de3 100644
> >>> --- a/drivers/dax/Kconfig
> >>> +++ b/drivers/dax/Kconfig
> >>> @@ -1,6 +1,7 @@
> >>>  menuconfig DEV_DAX
> >>>       tristate "DAX: direct access to differentiated memory"
> >>>       default m if NVDIMM_DAX
> >>> +     depends on TRANSPARENT_HUGEPAGE
> >>>       help
> >>>         Support raw access to differentiated (persistence, bandwidth,
> >>>         latency...) memory via an mmap(2) capable character
> >>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> >>> index 8207fb33a992..b2fe8a0ce866 100644
> >>> --- a/drivers/dax/dax.c
> >>> +++ b/drivers/dax/dax.c
> >>> @@ -49,6 +49,7 @@ struct dax_region {
> >>>   * @region - parent region
> >>>   * @dev - device backing the character device
> >>>   * @kref - enable this data to be tracked in filp->private_data
> >>> + * @alive - !alive + rcu grace period == no new mappings can be established
> >>>   * @id - child id in the region
> >>>   * @num_resources - number of physical address extents in this device
> >>>   * @res - array of physical address ranges
> >>> @@ -57,6 +58,7 @@ struct dax_dev {
> >>>       struct dax_region *region;
> >>>       struct device *dev;
> >>>       struct kref kref;
> >>> +     bool alive;
> >>>       int id;
> >>>       int num_resources;
> >>>       struct resource res[0];
> >>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
> >>>
> >>>       dev_dbg(dev, "%s\n", __func__);
> >>>
> >>> +     /* disable and flush fault handlers, TODO unmap inodes */
> >>> +     dax_dev->alive = false;
> >>> +     synchronize_rcu();
> >>> +
> >>
> >> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
> >> looks wrong to me.
> > 
> > The driver is using RCU to guarantee that all currently running fault
> > handlers have either completed or will see the new state of ->alive
> > when they start.  Reference counts are protecting the actual dax_dev
> > object.
> > 
> Hmm.
> This is the same 'creative' RCU usage Mike Snitzer has been trying
> when trying to improve device-mapper performance.
> 
> >From my understanding RCU is protecting the _pointer_, not the
> values of the structure pointed to.
> IOW we are guaranteed to have a valid pointer at any time.
> But at the same time _no_ guarantee is made about the _contents_ of
> the structure.
> It might well be that using 'synchronize_rcu' giving you similar
> results (as synchronize_rcu() is essentially waiting a SMP grace
> period, after which all CPUs should be seeing the update).
> However, I haven't been able to find that this is a guaranteed
> behaviour.
> So from my understanding you have to use locking primitives
> protecting the contents of the structure or exchange the _entire_
> structure if you want to rely on RCU here.
> 
> Can we get some clarification here?
> Paul?

I think you want the other Paul, Paul McKenney.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Reinecke May 18, 2016, 9:15 a.m. UTC | #5
On 05/18/2016 11:10 AM, Paul Mackerras wrote:
> On Wed, May 18, 2016 at 10:07:19AM +0200, Hannes Reinecke wrote:
>> On 05/18/2016 12:19 AM, Dan Williams wrote:
>>> On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>>>> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
>>>>> The "Device DAX" core enables dax mappings of performance / feature
>>>>> differentiated memory.  An open mapping or file handle keeps the backing
>>>>> struct device live, but new mappings are only possible while the device
>>>>> is enabled.   Faults are handled under rcu_read_lock to synchronize
>>>>> with the enabled state of the device.
>>>>>
>>>>> Similar to the filesystem-dax case the backing memory may optionally
>>>>> have struct page entries.  However, unlike fs-dax there is no support
>>>>> for private mappings, or mappings that are not backed by media (see
>>>>> use of zero-page in fs-dax).
>>>>>
>>>>> Mappings are always guaranteed to match the alignment of the dax_region.
>>>>> If the dax_region is configured to have a 2MB alignment, all mappings
>>>>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
>>>>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
>>>>> attempts to force a misaligned mapping, the driver will fail the mmap
>>>>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
>>>>> like MAP_PRIVATE mappings.
>>>>>
>>>>> Cc: Jeff Moyer <jmoyer@redhat.com>
>>>>> Cc: Christoph Hellwig <hch@lst.de>
>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>>>>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>>>>> ---
>>>>>  drivers/dax/Kconfig |    1
>>>>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  mm/huge_memory.c    |    1
>>>>>  mm/hugetlb.c        |    1
>>>>>  4 files changed, 319 insertions(+)
>>>>>
>>>>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
>>>>> index 86ffbaa891ad..cedab7572de3 100644
>>>>> --- a/drivers/dax/Kconfig
>>>>> +++ b/drivers/dax/Kconfig
>>>>> @@ -1,6 +1,7 @@
>>>>>  menuconfig DEV_DAX
>>>>>       tristate "DAX: direct access to differentiated memory"
>>>>>       default m if NVDIMM_DAX
>>>>> +     depends on TRANSPARENT_HUGEPAGE
>>>>>       help
>>>>>         Support raw access to differentiated (persistence, bandwidth,
>>>>>         latency...) memory via an mmap(2) capable character
>>>>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
>>>>> index 8207fb33a992..b2fe8a0ce866 100644
>>>>> --- a/drivers/dax/dax.c
>>>>> +++ b/drivers/dax/dax.c
>>>>> @@ -49,6 +49,7 @@ struct dax_region {
>>>>>   * @region - parent region
>>>>>   * @dev - device backing the character device
>>>>>   * @kref - enable this data to be tracked in filp->private_data
>>>>> + * @alive - !alive + rcu grace period == no new mappings can be established
>>>>>   * @id - child id in the region
>>>>>   * @num_resources - number of physical address extents in this device
>>>>>   * @res - array of physical address ranges
>>>>> @@ -57,6 +58,7 @@ struct dax_dev {
>>>>>       struct dax_region *region;
>>>>>       struct device *dev;
>>>>>       struct kref kref;
>>>>> +     bool alive;
>>>>>       int id;
>>>>>       int num_resources;
>>>>>       struct resource res[0];
>>>>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>>>>>
>>>>>       dev_dbg(dev, "%s\n", __func__);
>>>>>
>>>>> +     /* disable and flush fault handlers, TODO unmap inodes */
>>>>> +     dax_dev->alive = false;
>>>>> +     synchronize_rcu();
>>>>> +
>>>>
>>>> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
>>>> looks wrong to me.
>>>
>>> The driver is using RCU to guarantee that all currently running fault
>>> handlers have either completed or will see the new state of ->alive
>>> when they start.  Reference counts are protecting the actual dax_dev
>>> object.
>>>
>> Hmm.
>> This is the same 'creative' RCU usage Mike Snitzer has been trying
>> when trying to improve device-mapper performance.
>>
>> >From my understanding RCU is protecting the _pointer_, not the
>> values of the structure pointed to.
>> IOW we are guaranteed to have a valid pointer at any time.
>> But at the same time _no_ guarantee is made about the _contents_ of
>> the structure.
>> It might well be that using 'synchronize_rcu' giving you similar
>> results (as synchronize_rcu() is essentially waiting a SMP grace
>> period, after which all CPUs should be seeing the update).
>> However, I haven't been able to find that this is a guaranteed
>> behaviour.
>> So from my understanding you have to use locking primitives
>> protecting the contents of the structure or exchange the _entire_
>> structure if you want to rely on RCU here.
>>
>> Can we get some clarification here?
>> Paul?
> 
> I think you want the other Paul, Paul McKenney.
> 
I think you are in fact right.
Sorry for the Paul-confusion :-)

Cheers,

Hannes
Paul E. McKenney May 18, 2016, 5:12 p.m. UTC | #6
On Wed, May 18, 2016 at 11:15:11AM +0200, Hannes Reinecke wrote:
> On 05/18/2016 11:10 AM, Paul Mackerras wrote:
> > On Wed, May 18, 2016 at 10:07:19AM +0200, Hannes Reinecke wrote:
> >> On 05/18/2016 12:19 AM, Dan Williams wrote:
> >>> On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> >>>> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
> >>>>> The "Device DAX" core enables dax mappings of performance / feature
> >>>>> differentiated memory.  An open mapping or file handle keeps the backing
> >>>>> struct device live, but new mappings are only possible while the device
> >>>>> is enabled.   Faults are handled under rcu_read_lock to synchronize
> >>>>> with the enabled state of the device.
> >>>>>
> >>>>> Similar to the filesystem-dax case the backing memory may optionally
> >>>>> have struct page entries.  However, unlike fs-dax there is no support
> >>>>> for private mappings, or mappings that are not backed by media (see
> >>>>> use of zero-page in fs-dax).
> >>>>>
> >>>>> Mappings are always guaranteed to match the alignment of the dax_region.
> >>>>> If the dax_region is configured to have a 2MB alignment, all mappings
> >>>>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
> >>>>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
> >>>>> attempts to force a misaligned mapping, the driver will fail the mmap
> >>>>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
> >>>>> like MAP_PRIVATE mappings.
> >>>>>
> >>>>> Cc: Jeff Moyer <jmoyer@redhat.com>
> >>>>> Cc: Christoph Hellwig <hch@lst.de>
> >>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >>>>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> >>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >>>>> ---
> >>>>>  drivers/dax/Kconfig |    1
> >>>>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>  mm/huge_memory.c    |    1
> >>>>>  mm/hugetlb.c        |    1
> >>>>>  4 files changed, 319 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> >>>>> index 86ffbaa891ad..cedab7572de3 100644
> >>>>> --- a/drivers/dax/Kconfig
> >>>>> +++ b/drivers/dax/Kconfig
> >>>>> @@ -1,6 +1,7 @@
> >>>>>  menuconfig DEV_DAX
> >>>>>       tristate "DAX: direct access to differentiated memory"
> >>>>>       default m if NVDIMM_DAX
> >>>>> +     depends on TRANSPARENT_HUGEPAGE
> >>>>>       help
> >>>>>         Support raw access to differentiated (persistence, bandwidth,
> >>>>>         latency...) memory via an mmap(2) capable character
> >>>>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> >>>>> index 8207fb33a992..b2fe8a0ce866 100644
> >>>>> --- a/drivers/dax/dax.c
> >>>>> +++ b/drivers/dax/dax.c
> >>>>> @@ -49,6 +49,7 @@ struct dax_region {
> >>>>>   * @region - parent region
> >>>>>   * @dev - device backing the character device
> >>>>>   * @kref - enable this data to be tracked in filp->private_data
> >>>>> + * @alive - !alive + rcu grace period == no new mappings can be established
> >>>>>   * @id - child id in the region
> >>>>>   * @num_resources - number of physical address extents in this device
> >>>>>   * @res - array of physical address ranges
> >>>>> @@ -57,6 +58,7 @@ struct dax_dev {
> >>>>>       struct dax_region *region;
> >>>>>       struct device *dev;
> >>>>>       struct kref kref;
> >>>>> +     bool alive;
> >>>>>       int id;
> >>>>>       int num_resources;
> >>>>>       struct resource res[0];
> >>>>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
> >>>>>
> >>>>>       dev_dbg(dev, "%s\n", __func__);
> >>>>>
> >>>>> +     /* disable and flush fault handlers, TODO unmap inodes */
> >>>>> +     dax_dev->alive = false;
> >>>>> +     synchronize_rcu();

If you need to wait for fault handlers, you need synchronize_sched()
instead of synchronize_rcu().  Please note that synchronize_rcu() is
guaranteed to wait only for tasks that have done rcu_read_lock() to reach
the corresponding rcu_read_unlock().  In contrast, synchronize_sched()
is guaranteed to wait for any non-idle preempt-disable region of code
to complete, regardless of exactly what is disabling preemptiong.

And the "non-idle" is not an idle qualifier.  If you need to wait on fault
handlers that somehow occur from an idle hardware thread, you will need
those fault handlers to do rcu_irq_enter() on entry and rcu_irq_exit()
on exit.  (My guess is that you cannot take faults in the idle loop,
but I have learned not to trust such guesses all that far.)

And last, but definitely not least, synchronize_sched() waits only
for pre-existing preempt-disable regions of code.  So if you do
synchronize_sched(), and immediately after a fault handler starts,
synchronize_sched() won't necessarily wait on it.  However, you -are-
guaranteed that synchronize_shced() -will- wait for any fault handler
that might possibly see dax_dev->alive with a non-false value.

Are these the guarantees you are looking for?  (Yes, I did recently
watch "A New Hope".  Why do you ask?)

> >>>>> +
> >>>>
> >>>> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
> >>>> looks wrong to me.

RCU can be, and usually is, used to protect pointers, but it can be and
sometimes is used for other things as well.  At its core, RCU is about
waiting for pre-existing RCU readers to complete.

> >>> The driver is using RCU to guarantee that all currently running fault
> >>> handlers have either completed or will see the new state of ->alive
> >>> when they start.  Reference counts are protecting the actual dax_dev
> >>> object.
> >>>
> >> Hmm.
> >> This is the same 'creative' RCU usage Mike Snitzer has been trying
> >> when trying to improve device-mapper performance.

To repeat, unless all your fault handlers begin with rcu_read_lock()
and end with rcu_read_unlock(), and as long as you don't care about not
waiting for fault handlers that are currently executing just before
the rcu_read_lock() and just after the rcu_read_unlock(), you need
synchronize_sched() rather than synchronize_rcu() for this job.

> >> >From my understanding RCU is protecting the _pointer_, not the
> >> values of the structure pointed to.
> >> IOW we are guaranteed to have a valid pointer at any time.
> >> But at the same time _no_ guarantee is made about the _contents_ of
> >> the structure.
> >> It might well be that using 'synchronize_rcu' giving you similar
> >> results (as synchronize_rcu() is essentially waiting a SMP grace
> >> period, after which all CPUs should be seeing the update).
> >> However, I haven't been able to find that this is a guaranteed
> >> behaviour.
> >> So from my understanding you have to use locking primitives
> >> protecting the contents of the structure or exchange the _entire_
> >> structure if you want to rely on RCU here.
> >>
> >> Can we get some clarification here?

Maybe...  What exactly is your synchronization design needing here?

> >> Paul?
> > 
> > I think you want the other Paul, Paul McKenney.
> > 
> I think you are in fact right.
> Sorry for the Paul-confusion :-)

Did I keep my end of the confusion up?  ;-)

								Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams May 18, 2016, 5:26 p.m. UTC | #7
On Wed, May 18, 2016 at 10:12 AM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Wed, May 18, 2016 at 11:15:11AM +0200, Hannes Reinecke wrote:
>> On 05/18/2016 11:10 AM, Paul Mackerras wrote:
>> > On Wed, May 18, 2016 at 10:07:19AM +0200, Hannes Reinecke wrote:
>> >> On 05/18/2016 12:19 AM, Dan Williams wrote:
>> >>> On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
>> >>>> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
>> >>>>> The "Device DAX" core enables dax mappings of performance / feature
>> >>>>> differentiated memory.  An open mapping or file handle keeps the backing
>> >>>>> struct device live, but new mappings are only possible while the device
>> >>>>> is enabled.   Faults are handled under rcu_read_lock to synchronize
>> >>>>> with the enabled state of the device.
>> >>>>>
>> >>>>> Similar to the filesystem-dax case the backing memory may optionally
>> >>>>> have struct page entries.  However, unlike fs-dax there is no support
>> >>>>> for private mappings, or mappings that are not backed by media (see
>> >>>>> use of zero-page in fs-dax).
>> >>>>>
>> >>>>> Mappings are always guaranteed to match the alignment of the dax_region.
>> >>>>> If the dax_region is configured to have a 2MB alignment, all mappings
>> >>>>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
>> >>>>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
>> >>>>> attempts to force a misaligned mapping, the driver will fail the mmap
>> >>>>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
>> >>>>> like MAP_PRIVATE mappings.
>> >>>>>
>> >>>>> Cc: Jeff Moyer <jmoyer@redhat.com>
>> >>>>> Cc: Christoph Hellwig <hch@lst.de>
>> >>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> >>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
>> >>>>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
>> >>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>> >>>>> ---
>> >>>>>  drivers/dax/Kconfig |    1
>> >>>>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> >>>>>  mm/huge_memory.c    |    1
>> >>>>>  mm/hugetlb.c        |    1
>> >>>>>  4 files changed, 319 insertions(+)
>> >>>>>
>> >>>>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
>> >>>>> index 86ffbaa891ad..cedab7572de3 100644
>> >>>>> --- a/drivers/dax/Kconfig
>> >>>>> +++ b/drivers/dax/Kconfig
>> >>>>> @@ -1,6 +1,7 @@
>> >>>>>  menuconfig DEV_DAX
>> >>>>>       tristate "DAX: direct access to differentiated memory"
>> >>>>>       default m if NVDIMM_DAX
>> >>>>> +     depends on TRANSPARENT_HUGEPAGE
>> >>>>>       help
>> >>>>>         Support raw access to differentiated (persistence, bandwidth,
>> >>>>>         latency...) memory via an mmap(2) capable character
>> >>>>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
>> >>>>> index 8207fb33a992..b2fe8a0ce866 100644
>> >>>>> --- a/drivers/dax/dax.c
>> >>>>> +++ b/drivers/dax/dax.c
>> >>>>> @@ -49,6 +49,7 @@ struct dax_region {
>> >>>>>   * @region - parent region
>> >>>>>   * @dev - device backing the character device
>> >>>>>   * @kref - enable this data to be tracked in filp->private_data
>> >>>>> + * @alive - !alive + rcu grace period == no new mappings can be established
>> >>>>>   * @id - child id in the region
>> >>>>>   * @num_resources - number of physical address extents in this device
>> >>>>>   * @res - array of physical address ranges
>> >>>>> @@ -57,6 +58,7 @@ struct dax_dev {
>> >>>>>       struct dax_region *region;
>> >>>>>       struct device *dev;
>> >>>>>       struct kref kref;
>> >>>>> +     bool alive;
>> >>>>>       int id;
>> >>>>>       int num_resources;
>> >>>>>       struct resource res[0];
>> >>>>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>> >>>>>
>> >>>>>       dev_dbg(dev, "%s\n", __func__);
>> >>>>>
>> >>>>> +     /* disable and flush fault handlers, TODO unmap inodes */
>> >>>>> +     dax_dev->alive = false;
>> >>>>> +     synchronize_rcu();
>
> If you need to wait for fault handlers, you need synchronize_sched()
> instead of synchronize_rcu().  Please note that synchronize_rcu() is
> guaranteed to wait only for tasks that have done rcu_read_lock() to reach
> the corresponding rcu_read_unlock().  In contrast, synchronize_sched()
> is guaranteed to wait for any non-idle preempt-disable region of code
> to complete, regardless of exactly what is disabling preemptiong.
>
> And the "non-idle" is not an idle qualifier.  If you need to wait on fault
> handlers that somehow occur from an idle hardware thread, you will need
> those fault handlers to do rcu_irq_enter() on entry and rcu_irq_exit()
> on exit.  (My guess is that you cannot take faults in the idle loop,
> but I have learned not to trust such guesses all that far.)
>
> And last, but definitely not least, synchronize_sched() waits only
> for pre-existing preempt-disable regions of code.  So if you do
> synchronize_sched(), and immediately after a fault handler starts,
> synchronize_sched() won't necessarily wait on it.  However, you -are-
> guaranteed that synchronize_shced() -will- wait for any fault handler
> that might possibly see dax_dev->alive with a non-false value.
>
> Are these the guarantees you are looking for?  (Yes, I did recently
> watch "A New Hope".  Why do you ask?)

Spoken like a true rcu-Jedi.

So in this case the fault handlers are indeed running under
rcu_read_lock(), and I can't fathom how these faults would trigger
from an idle thread...

>
>> >>>>> +
>> >>>>
>> >>>> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
>> >>>> looks wrong to me.
>
> RCU can be, and usually is, used to protect pointers, but it can be and
> sometimes is used for other things as well.  At its core, RCU is about
> waiting for pre-existing RCU readers to complete.
>
>> >>> The driver is using RCU to guarantee that all currently running fault
>> >>> handlers have either completed or will see the new state of ->alive
>> >>> when they start.  Reference counts are protecting the actual dax_dev
>> >>> object.
>> >>>
>> >> Hmm.
>> >> This is the same 'creative' RCU usage Mike Snitzer has been trying
>> >> when trying to improve device-mapper performance.
>
> To repeat, unless all your fault handlers begin with rcu_read_lock()
> and end with rcu_read_unlock(), and as long as you don't care about not
> waiting for fault handlers that are currently executing just before
> the rcu_read_lock() and just after the rcu_read_unlock(), you need
> synchronize_sched() rather than synchronize_rcu() for this job.
>
>> >> >From my understanding RCU is protecting the _pointer_, not the
>> >> values of the structure pointed to.
>> >> IOW we are guaranteed to have a valid pointer at any time.
>> >> But at the same time _no_ guarantee is made about the _contents_ of
>> >> the structure.
>> >> It might well be that using 'synchronize_rcu' giving you similar
>> >> results (as synchronize_rcu() is essentially waiting a SMP grace
>> >> period, after which all CPUs should be seeing the update).
>> >> However, I haven't been able to find that this is a guaranteed
>> >> behaviour.
>> >> So from my understanding you have to use locking primitives
>> >> protecting the contents of the structure or exchange the _entire_
>> >> structure if you want to rely on RCU here.
>> >>
>> >> Can we get some clarification here?
>
> Maybe...  What exactly is your synchronization design needing here?
>
>> >> Paul?
>> >
>> > I think you want the other Paul, Paul McKenney.
>> >
>> I think you are in fact right.
>> Sorry for the Paul-confusion :-)
>
> Did I keep my end of the confusion up?  ;-)

Yes, I think we're good, but please double check I am not mistaken in
the following clarification comment:

@@ -150,6 +152,16 @@ static void destroy_dax_dev(void *_dev)

        dev_dbg(dev, "%s\n", __func__);

+       /*
+        * Note, rcu is not protecting the liveness of dax_dev, rcu is
+        * ensuring that any fault handlers that might have seen
+        * dax_dev->alive == true, have completed.  Any fault handlers
+        * that start after synchronize_rcu() has started will abort
+        * upon seeing dax_dev->alive == false.
+        */
+       dax_dev->alive = false;
+       synchronize_rcu();
+
        get_device(dev);
        device_unregister(dev);
        ida_simple_remove(&dax_region->ida, dax_dev->id);
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region
*dax_region, struct resource *re
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul E. McKenney May 18, 2016, 5:57 p.m. UTC | #8
On Wed, May 18, 2016 at 10:26:57AM -0700, Dan Williams wrote:
> On Wed, May 18, 2016 at 10:12 AM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> > On Wed, May 18, 2016 at 11:15:11AM +0200, Hannes Reinecke wrote:
> >> On 05/18/2016 11:10 AM, Paul Mackerras wrote:
> >> > On Wed, May 18, 2016 at 10:07:19AM +0200, Hannes Reinecke wrote:
> >> >> On 05/18/2016 12:19 AM, Dan Williams wrote:
> >> >>> On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> >> >>>> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
> >> >>>>> The "Device DAX" core enables dax mappings of performance / feature
> >> >>>>> differentiated memory.  An open mapping or file handle keeps the backing
> >> >>>>> struct device live, but new mappings are only possible while the device
> >> >>>>> is enabled.   Faults are handled under rcu_read_lock to synchronize
> >> >>>>> with the enabled state of the device.
> >> >>>>>
> >> >>>>> Similar to the filesystem-dax case the backing memory may optionally
> >> >>>>> have struct page entries.  However, unlike fs-dax there is no support
> >> >>>>> for private mappings, or mappings that are not backed by media (see
> >> >>>>> use of zero-page in fs-dax).
> >> >>>>>
> >> >>>>> Mappings are always guaranteed to match the alignment of the dax_region.
> >> >>>>> If the dax_region is configured to have a 2MB alignment, all mappings
> >> >>>>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
> >> >>>>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
> >> >>>>> attempts to force a misaligned mapping, the driver will fail the mmap
> >> >>>>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
> >> >>>>> like MAP_PRIVATE mappings.
> >> >>>>>
> >> >>>>> Cc: Jeff Moyer <jmoyer@redhat.com>
> >> >>>>> Cc: Christoph Hellwig <hch@lst.de>
> >> >>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >> >>>>> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> >> >>>>> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> >> >>>>> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> >> >>>>> ---
> >> >>>>>  drivers/dax/Kconfig |    1
> >> >>>>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >>>>>  mm/huge_memory.c    |    1
> >> >>>>>  mm/hugetlb.c        |    1
> >> >>>>>  4 files changed, 319 insertions(+)
> >> >>>>>
> >> >>>>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> >> >>>>> index 86ffbaa891ad..cedab7572de3 100644
> >> >>>>> --- a/drivers/dax/Kconfig
> >> >>>>> +++ b/drivers/dax/Kconfig
> >> >>>>> @@ -1,6 +1,7 @@
> >> >>>>>  menuconfig DEV_DAX
> >> >>>>>       tristate "DAX: direct access to differentiated memory"
> >> >>>>>       default m if NVDIMM_DAX
> >> >>>>> +     depends on TRANSPARENT_HUGEPAGE
> >> >>>>>       help
> >> >>>>>         Support raw access to differentiated (persistence, bandwidth,
> >> >>>>>         latency...) memory via an mmap(2) capable character
> >> >>>>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> >> >>>>> index 8207fb33a992..b2fe8a0ce866 100644
> >> >>>>> --- a/drivers/dax/dax.c
> >> >>>>> +++ b/drivers/dax/dax.c
> >> >>>>> @@ -49,6 +49,7 @@ struct dax_region {
> >> >>>>>   * @region - parent region
> >> >>>>>   * @dev - device backing the character device
> >> >>>>>   * @kref - enable this data to be tracked in filp->private_data
> >> >>>>> + * @alive - !alive + rcu grace period == no new mappings can be established
> >> >>>>>   * @id - child id in the region
> >> >>>>>   * @num_resources - number of physical address extents in this device
> >> >>>>>   * @res - array of physical address ranges
> >> >>>>> @@ -57,6 +58,7 @@ struct dax_dev {
> >> >>>>>       struct dax_region *region;
> >> >>>>>       struct device *dev;
> >> >>>>>       struct kref kref;
> >> >>>>> +     bool alive;
> >> >>>>>       int id;
> >> >>>>>       int num_resources;
> >> >>>>>       struct resource res[0];
> >> >>>>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
> >> >>>>>
> >> >>>>>       dev_dbg(dev, "%s\n", __func__);
> >> >>>>>
> >> >>>>> +     /* disable and flush fault handlers, TODO unmap inodes */
> >> >>>>> +     dax_dev->alive = false;
> >> >>>>> +     synchronize_rcu();
> >
> > If you need to wait for fault handlers, you need synchronize_sched()
> > instead of synchronize_rcu().  Please note that synchronize_rcu() is
> > guaranteed to wait only for tasks that have done rcu_read_lock() to reach
> > the corresponding rcu_read_unlock().  In contrast, synchronize_sched()
> > is guaranteed to wait for any non-idle preempt-disable region of code
> > to complete, regardless of exactly what is disabling preemptiong.
> >
> > And the "non-idle" is not an idle qualifier.  If you need to wait on fault
> > handlers that somehow occur from an idle hardware thread, you will need
> > those fault handlers to do rcu_irq_enter() on entry and rcu_irq_exit()
> > on exit.  (My guess is that you cannot take faults in the idle loop,
> > but I have learned not to trust such guesses all that far.)
> >
> > And last, but definitely not least, synchronize_sched() waits only
> > for pre-existing preempt-disable regions of code.  So if you do
> > synchronize_sched(), and immediately after a fault handler starts,
> > synchronize_sched() won't necessarily wait on it.  However, you -are-
> > guaranteed that synchronize_shced() -will- wait for any fault handler
> > that might possibly see dax_dev->alive with a non-false value.
> >
> > Are these the guarantees you are looking for?  (Yes, I did recently
> > watch "A New Hope".  Why do you ask?)
> 
> Spoken like a true rcu-Jedi.

;-)

> So in this case the fault handlers are indeed running under
> rcu_read_lock(), and I can't fathom how these faults would trigger
> from an idle thread...

OK, given all fault handlers use rcu_read_lock() and either:

1.	As you say, faults never trigger from idle, or

2.	The fault handlers call rcu_irq_enter() on entry and
	rcu_irq_exit() on exit

Then, yes, you can keep on using synchronize_rcu().

> >> >>>>> +
> >> >>>>
> >> >>>> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
> >> >>>> looks wrong to me.
> >
> > RCU can be, and usually is, used to protect pointers, but it can be and
> > sometimes is used for other things as well.  At its core, RCU is about
> > waiting for pre-existing RCU readers to complete.
> >
> >> >>> The driver is using RCU to guarantee that all currently running fault
> >> >>> handlers have either completed or will see the new state of ->alive
> >> >>> when they start.  Reference counts are protecting the actual dax_dev
> >> >>> object.
> >> >>>
> >> >> Hmm.
> >> >> This is the same 'creative' RCU usage Mike Snitzer has been trying
> >> >> when trying to improve device-mapper performance.
> >
> > To repeat, unless all your fault handlers begin with rcu_read_lock()
> > and end with rcu_read_unlock(), and as long as you don't care about not
> > waiting for fault handlers that are currently executing just before
> > the rcu_read_lock() and just after the rcu_read_unlock(), you need
> > synchronize_sched() rather than synchronize_rcu() for this job.
> >
> >> >> >From my understanding RCU is protecting the _pointer_, not the
> >> >> values of the structure pointed to.
> >> >> IOW we are guaranteed to have a valid pointer at any time.
> >> >> But at the same time _no_ guarantee is made about the _contents_ of
> >> >> the structure.
> >> >> It might well be that using 'synchronize_rcu' giving you similar
> >> >> results (as synchronize_rcu() is essentially waiting a SMP grace
> >> >> period, after which all CPUs should be seeing the update).
> >> >> However, I haven't been able to find that this is a guaranteed
> >> >> behaviour.
> >> >> So from my understanding you have to use locking primitives
> >> >> protecting the contents of the structure or exchange the _entire_
> >> >> structure if you want to rely on RCU here.
> >> >>
> >> >> Can we get some clarification here?
> >
> > Maybe...  What exactly is your synchronization design needing here?
> >
> >> >> Paul?
> >> >
> >> > I think you want the other Paul, Paul McKenney.
> >> >
> >> I think you are in fact right.
> >> Sorry for the Paul-confusion :-)
> >
> > Did I keep my end of the confusion up?  ;-)
> 
> Yes, I think we're good, but please double check I am not mistaken in
> the following clarification comment:
> 
> @@ -150,6 +152,16 @@ static void destroy_dax_dev(void *_dev)
> 
>         dev_dbg(dev, "%s\n", __func__);
> 
> +       /*
> +        * Note, rcu is not protecting the liveness of dax_dev, rcu is
> +        * ensuring that any fault handlers that might have seen
> +        * dax_dev->alive == true, have completed.  Any fault handlers
> +        * that start after synchronize_rcu() has started will abort
> +        * upon seeing dax_dev->alive == false.
> +        */
> +       dax_dev->alive = false;
> +       synchronize_rcu();
> +
>         get_device(dev);
>         device_unregister(dev);
>         ida_simple_remove(&dax_region->ida, dax_dev->id);
> @@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region
> *dax_region, struct resource *re

Given your comment and your statement that fault handlers never happen
on idle CPUs and are always protected by rcu_read_lock(), from an RCU
point of view:

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

							Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 86ffbaa891ad..cedab7572de3 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,6 +1,7 @@ 
 menuconfig DEV_DAX
 	tristate "DAX: direct access to differentiated memory"
 	default m if NVDIMM_DAX
+	depends on TRANSPARENT_HUGEPAGE
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 8207fb33a992..b2fe8a0ce866 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -49,6 +49,7 @@  struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@  struct dax_dev {
 	struct dax_region *region;
 	struct device *dev;
 	struct kref kref;
+	bool alive;
 	int id;
 	int num_resources;
 	struct resource res[0];
@@ -150,6 +152,10 @@  static void destroy_dax_dev(void *_dev)
 
 	dev_dbg(dev, "%s\n", __func__);
 
+	/* disable and flush fault handlers, TODO unmap inodes */
+	dax_dev->alive = false;
+	synchronize_rcu();
+
 	get_device(dev);
 	device_unregister(dev);
 	ida_simple_remove(&dax_region->ida, dax_dev->id);
@@ -173,6 +179,7 @@  int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 	memcpy(dax_dev->res, res, sizeof(*res) * count);
 	dax_dev->num_resources = count;
 	kref_init(&dax_dev->kref);
+	dax_dev->alive = true;
 	dax_dev->region = dax_region;
 	kref_get(&dax_region->kref);
 
@@ -217,9 +224,318 @@  int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	unsigned long off, off_end, off_align, len_align, addr_align, align;
+	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+	struct dax_region *dax_region;
+
+	if (!dax_dev || addr)
+		goto out;
+
+	dax_region = dax_dev->region;
+	align = dax_region->align;
+	off = pgoff << PAGE_SHIFT;
+	off_end = off + len;
+	off_align = round_up(off, align);
+
+	if ((off_end <= off_align) || ((off_end - off_align) < align))
+		goto out;
+
+	len_align = len + align;
+	if ((off + len_align) < off)
+		goto out;
+
+	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+			pgoff, flags);
+	if (!IS_ERR_VALUE(addr_align)) {
+		addr_align += (off - addr_align) & (align - 1);
+		return addr_align;
+	}
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+	const dev_t *devt = data;
+
+	return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = NULL;
+	struct device *dev;
+
+	dev = dax_dev_find(inode->i_rdev);
+	if (!dev)
+		return -ENXIO;
+
+	device_lock(dev);
+	dax_dev = dev_get_drvdata(dev);
+	if (dax_dev) {
+		dev_dbg(dev, "%s\n", __func__);
+		filp->private_data = dax_dev;
+		kref_get(&dax_dev->kref);
+		inode->i_flags = S_DAX;
+	}
+	device_unlock(dev);
+
+	if (!dax_dev) {
+		put_device(dev);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	struct device *dev = dax_dev->dev;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+	put_device(dev);
+
+	return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		const char *func)
+{
+	struct dax_region *dax_region = dax_dev->region;
+	struct device *dev = dax_dev->dev;
+	unsigned long mask;
+
+	if (!dax_dev->alive)
+		return -ENXIO;
+
+	/* prevent private / writable mappings from being established */
+	if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+		dev_dbg(dev, "%s: %s: fail, attempted private mapping\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	mask = dax_region->align - 1;
+	if (vma->vm_start & mask || vma->vm_end & mask) {
+		dev_dbg(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+				current->comm, func, vma->vm_start, vma->vm_end,
+				mask);
+		return -EINVAL;
+	}
+
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
+		dev_dbg(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (!vma_is_dax(vma)) {
+		dev_dbg(dev, "%s: %s: fail, vma is not DAX capable\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+		unsigned long size)
+{
+	struct resource *res;
+	phys_addr_t phys;
+	int i;
+
+	for (i = 0; i < dax_dev->num_resources; i++) {
+		res = &dax_dev->res[i];
+		phys = pgoff * PAGE_SIZE + res->start;
+		if (phys >= res->start && phys <= res->end)
+			break;
+		pgoff -= PHYS_PFN(resource_size(res));
+	}
+
+	if (i < dax_dev->num_resources) {
+		res = &dax_dev->res[i];
+		if (phys + size - 1 <= res->end)
+			return phys;
+	}
+
+	return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	unsigned long vaddr = (unsigned long) vmf->virtual_address;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	int rc = VM_FAULT_SIGBUS;
+	phys_addr_t phys;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PAGE_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				vmf->pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	rc = vm_insert_mixed(vma, vaddr, pfn);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+	rcu_read_lock();
+	rc = __dax_dev_fault(dax_dev, vma, vmf);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+		unsigned int flags)
+{
+	unsigned long pmd_addr = addr & PMD_MASK;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PMD_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pmd mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pgoff = linear_page_index(vma, pmd_addr);
+	phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+			flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+
+	rcu_read_lock();
+	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+	.fault = dax_dev_fault,
+	.pmd_fault = dax_dev_pmd_fault,
+	.open = dax_dev_vm_open,
+	.close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	int rc;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+	rc = check_vma(dax_dev, vma, __func__);
+	if (rc)
+		return rc;
+
+	kref_get(&dax_dev->kref);
+	vma->vm_ops = &dax_dev_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+
+}
+
 static const struct file_operations dax_fops = {
 	.llseek = noop_llseek,
 	.owner = THIS_MODULE,
+	.open = dax_dev_open,
+	.release = dax_dev_release,
+	.get_unmapped_area = dax_dev_get_unmapped_area,
+	.mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86f9f8b82f8e..52ea012d8a80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1013,6 +1013,7 @@  int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
 	return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..b14e98129b07 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -624,6 +624,7 @@  pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 {
 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
 }
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority