
[RFC,1/3] fs: dax.c: move fs hole signifier from DAX_ZERO_PAGE to XA_ZERO_ENTRY

Message ID CAE1WUT7ke9TR_H+et5_BUg93OYcDF0LD2ku+Cto59PhP6nz8qg@mail.gmail.com (mailing list archive)
State New, archived
Series Migrate zero page DAX bit from DAX_ZERO_PAGE to XA_ZERO_ENTRY

Commit Message

Amy Parker Nov. 29, 2020, 4:36 a.m. UTC
DAX uses the DAX_ZERO_PAGE bit to represent holes in files. It could instead use
a dedicated entry, XArray's XA_ZERO_ENTRY. This still distinguishes zero pages
and allows us to shift DAX_EMPTY down (see patch 2/3).

Signed-off-by: Amy Parker <enbyamy@gmail.com>
---
 fs/dax.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)


Comments

Jan Kara Nov. 30, 2020, 1:36 p.m. UTC | #1
On Sat 28-11-20 20:36:29, Amy Parker wrote:
> DAX uses the DAX_ZERO_PAGE bit to represent holes in files. It could instead use
> a dedicated entry, XArray's XA_ZERO_ENTRY. This still distinguishes zero pages
> and allows us to shift DAX_EMPTY down (see patch 2/3).
> 
> Signed-off-by: Amy Parker <enbyamy@gmail.com>

Thanks for the patch. The idea looks nice, however I think there are some
technical problems with the patch. See below.

> +/*
> + * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
> + * definition helps with checking if an entry is a PMD size.
> + */
> +#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
> +

Firstly, if you define a macro, we usually wrap it in parentheses like:

#define XA_ZERO_PMD_ENTRY (DAX_PMD | (unsigned long)XA_ZERO_ENTRY)

to avoid unexpected issues when the macro expands and a surrounding
operator has higher precedence.
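
To make the hazard concrete, here is a minimal userspace sketch, not kernel
code: XA_ZERO_ENTRY_VAL and the _BAD/_OK macro names are made up for this
example, with XA_ZERO_ENTRY stood in by its numeric value,
xa_mk_internal(257) == ((257 << 2) | 2), as noted below.

/*
 * Demonstrates why the macro body needs parentheses: '==' binds tighter
 * than '|', so the unparenthesized expansion changes the meaning of a
 * comparison it is used in.
 */
#include <stdio.h>

#define DAX_PMD			(1UL << 1)
#define XA_ZERO_ENTRY_VAL	((257UL << 2) | 2)

/* Unparenthesized, as in the patch: */
#define XA_ZERO_PMD_ENTRY_BAD	DAX_PMD | XA_ZERO_ENTRY_VAL
/* Parenthesized, as suggested above: */
#define XA_ZERO_PMD_ENTRY_OK	(DAX_PMD | XA_ZERO_ENTRY_VAL)

int main(void)
{
	unsigned long entry = 0x123;	/* arbitrary entry, not a zero page */

	/* Expands to (entry == DAX_PMD) | XA_ZERO_ENTRY_VAL, which is
	 * non-zero ("true") for every possible entry. */
	if (entry == XA_ZERO_PMD_ENTRY_BAD)
		printf("unparenthesized macro: bogus match\n");

	/* Expands to entry == (DAX_PMD | XA_ZERO_ENTRY_VAL): no match here. */
	if (entry == XA_ZERO_PMD_ENTRY_OK)
		printf("parenthesized macro: match\n");

	return 0;
}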

Secondly, I don't think you can combine XA_ZERO_ENTRY with DAX_PMD (or any
other bits for that matter). XA_ZERO_ENTRY is defined as
xa_mk_internal(257) which is ((257 << 2) | 2) - DAX bits will overlap with
the bits xarray internal entries are using and things will break.

Honestly, I find it somewhat cumbersome to use xarray internal entries for
DAX purposes since all the locking (using DAX_LOCKED) and size checking
(using DAX_PMD) functions will have to special-case internal entries to
operate on a different set of bits. It could be done, sure, but I'm not sure
it is worth the trouble for saving two bits (we could get rid of
DAX_ZERO_PAGE and DAX_EMPTY bits in this way) in DAX entries. But maybe
Matthew had some idea how to do this in an elegant way...

								Honza

>  static unsigned long dax_to_pfn(void *entry)
>  {
>      return xa_to_value(entry) >> DAX_SHIFT;
> @@ -114,7 +119,7 @@ static bool dax_is_pte_entry(void *entry)
> 
>  static int dax_is_zero_entry(void *entry)
>  {
> -    return xa_to_value(entry) & DAX_ZERO_PAGE;
> +    return xa_to_value(entry) & (unsigned long)XA_ZERO_ENTRY;
>  }
> 
>  static int dax_is_empty_entry(void *entry)
> @@ -738,7 +743,7 @@ static void *dax_insert_entry(struct xa_state *xas,
>      if (dirty)
>          __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> 
> -    if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
> +    if (dax_is_zero_entry(entry) && !(flags & (unsigned long)XA_ZERO_ENTRY)) {
>          unsigned long index = xas->xa_index;
>          /* we are replacing a zero page with block mapping */
>          if (dax_is_pmd_entry(entry))
> @@ -1047,7 +1052,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
>      vm_fault_t ret;
> 
>      *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> -            DAX_ZERO_PAGE, false);
> +            XA_ZERO_ENTRY, false);
> 
>      ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
>      trace_dax_load_hole(inode, vmf, ret);
> @@ -1434,7 +1439,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> 
>      pfn = page_to_pfn_t(zero_page);
>      *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> -            DAX_PMD | DAX_ZERO_PAGE, false);
> +            XA_ZERO_PMD_ENTRY, false);
> 
>      if (arch_needs_pgtable_deposit()) {
>          pgtable = pte_alloc_one(vma->vm_mm);
> -- 
> 2.29.2
Amy Parker Nov. 30, 2020, 2:22 p.m. UTC | #2
> > +/*
> > + * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
> > + * definition helps with checking if an entry is a PMD size.
> > + */
> > +#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
> > +
>
> Firstly, if you define a macro, we usually wrap it in parentheses like:
>
> #define XA_ZERO_PMD_ENTRY (DAX_PMD | (unsigned long)XA_ZERO_ENTRY)
>
> to avoid unexpected issues when the macro expands and a surrounding
> operator has higher precedence.

Oops! Must've missed that - I'll make sure to get on that when
revising this patch.

> Secondly, I don't think you can combine XA_ZERO_ENTRY with DAX_PMD (or any
> other bits for that matter). XA_ZERO_ENTRY is defined as
> xa_mk_internal(257) which is ((257 << 2) | 2) - DAX bits will overlap with
> the bits xarray internal entries are using and things will break.

Could you provide an example of this overlap? I can't seem to find any.

> Honestly, I find it somewhat cumbersome to use xarray internal entries for
> DAX purposes since all the locking (using DAX_LOCKED) and size checking
> (using DAX_PMD) functions will have to special-case internal entries to
> operate on a different set of bits. It could be done, sure, but I'm not sure
> it is worth the trouble for saving two bits (we could get rid of
> DAX_ZERO_PAGE and DAX_EMPTY bits in this way) in DAX entries. But maybe
> Matthew had some idea how to do this in an elegant way...

Yeah, that's likely the best we've got for now. Hopefully Matthew can
provide some insight into this.

Best regards,
Amy Parker
(she/her)

>
>                                                                 Honza
>
> >  static unsigned long dax_to_pfn(void *entry)
> >  {
> >      return xa_to_value(entry) >> DAX_SHIFT;
> > @@ -114,7 +119,7 @@ static bool dax_is_pte_entry(void *entry)
> >
> >  static int dax_is_zero_entry(void *entry)
> >  {
> > -    return xa_to_value(entry) & DAX_ZERO_PAGE;
> > +    return xa_to_value(entry) & (unsigned long)XA_ZERO_ENTRY;
> >  }
> >
> >  static int dax_is_empty_entry(void *entry)
> > @@ -738,7 +743,7 @@ static void *dax_insert_entry(struct xa_state *xas,
> >      if (dirty)
> >          __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> >
> > -    if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
> > +    if (dax_is_zero_entry(entry) && !(flags & (unsigned long)XA_ZERO_ENTRY)) {
> >          unsigned long index = xas->xa_index;
> >          /* we are replacing a zero page with block mapping */
> >          if (dax_is_pmd_entry(entry))
> > @@ -1047,7 +1052,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
> >      vm_fault_t ret;
> >
> >      *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> > -            DAX_ZERO_PAGE, false);
> > +            XA_ZERO_ENTRY, false);
> >
> >      ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> >      trace_dax_load_hole(inode, vmf, ret);
> > @@ -1434,7 +1439,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> >
> >      pfn = page_to_pfn_t(zero_page);
> >      *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> > -            DAX_PMD | DAX_ZERO_PAGE, false);
> > +            XA_ZERO_PMD_ENTRY, false);
> >
> >      if (arch_needs_pgtable_deposit()) {
> >          pgtable = pte_alloc_one(vma->vm_mm);
> > --
> > 2.29.2
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jan Kara Nov. 30, 2020, 3:09 p.m. UTC | #3
On Mon 30-11-20 06:22:42, Amy Parker wrote:
> > > +/*
> > > + * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
> > > + * definition helps with checking if an entry is a PMD size.
> > > + */
> > > +#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
> > > +
> >
> > Firstly, if you define a macro, we usually wrap it in parentheses like:
> >
> > #define XA_ZERO_PMD_ENTRY (DAX_PMD | (unsigned long)XA_ZERO_ENTRY)
> >
> > to avoid unexpected issues when the macro expands and a surrounding
> > operator has higher precedence.
> 
> Oops! Must've missed that - I'll make sure to get on that when
> revising this patch.
> 
> > Secondly, I don't think you can combine XA_ZERO_ENTRY with DAX_PMD (or any
> > other bits for that matter). XA_ZERO_ENTRY is defined as
> > xa_mk_internal(257) which is ((257 << 2) | 2) - DAX bits will overlap with
> > the bits xarray internal entries are using and things will break.
> 
> Could you provide an example of this overlap? I can't seem to find any.

Well, XA_ZERO_ENTRY | DAX_PMD == ((257 << 2) | 2) | (1 << 1). So with the way
you've defined XA_ZERO_PMD_ENTRY, the DAX_PMD bit will just get lost. AFAIU (but
Matthew might correct me here), for internal entries (and XA_ZERO_ENTRY is
one instance of such an entry) the low 10 bits of the entry values are
reserved for internal xarray usage, so DAX could use only the higher bits. For
classical value entries, only the lowest bit is reserved for xarray usage;
all the rest is available for the user (and so DAX uses it).
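
The same arithmetic as a small userspace sketch (not kernel code;
XA_ZERO_ENTRY_VAL is a made-up name holding the value quoted above):

#include <stdio.h>

#define DAX_PMD			(1UL << 1)
#define XA_ZERO_ENTRY_VAL	((257UL << 2) | 2)	/* xa_mk_internal(257) */

int main(void)
{
	unsigned long combined = XA_ZERO_ENTRY_VAL | DAX_PMD;

	/* Bit 1 is already set in the internal entry (the "| 2" part), so
	 * OR-ing in DAX_PMD adds nothing and the PMD size bit is lost. */
	printf("XA_ZERO_ENTRY          = %#lx\n", XA_ZERO_ENTRY_VAL);	/* 0x406 */
	printf("XA_ZERO_ENTRY | DAX_PMD = %#lx\n", combined);		/* 0x406 */
	printf("DAX_PMD recoverable? %s\n",
	       (combined & ~XA_ZERO_ENTRY_VAL) ? "yes" : "no");		/* no */

	return 0;
}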
 
								Honza
Amy Parker Nov. 30, 2020, 3:15 p.m. UTC | #4
On Mon, Nov 30, 2020 at 7:09 AM Jan Kara <jack@suse.cz> wrote:
>
> On Mon 30-11-20 06:22:42, Amy Parker wrote:
> > > > +/*
> > > > + * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
> > > > + * definition helps with checking if an entry is a PMD size.
> > > > + */
> > > > +#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
> > > > +
> > >
> > > Firstly, if you define a macro, we usually wrap it in parentheses like:
> > >
> > > #define XA_ZERO_PMD_ENTRY (DAX_PMD | (unsigned long)XA_ZERO_ENTRY)
> > >
> > > to avoid unexpected issues when the macro expands and a surrounding
> > > operator has higher precedence.
> >
> > Oops! Must've missed that - I'll make sure to get on that when
> > revising this patch.
> >
> > > Secondly, I don't think you can combine XA_ZERO_ENTRY with DAX_PMD (or any
> > > other bits for that matter). XA_ZERO_ENTRY is defined as
> > > xa_mk_internal(257) which is ((257 << 2) | 2) - DAX bits will overlap with
> > > the bits xarray internal entries are using and things will break.
> >
> > Could you provide an example of this overlap? I can't seem to find any.
>
> Well, XA_ZERO_ENTRY | DAX_PMD == ((257 << 2) | 2) | (1 << 1). So with the way
> you've defined XA_ZERO_PMD_ENTRY, the DAX_PMD bit will just get lost. AFAIU (but
> Matthew might correct me here), for internal entries (and XA_ZERO_ENTRY is
> one instance of such an entry) the low 10 bits of the entry values are
> reserved for internal xarray usage, so DAX could use only the higher bits. For
> classical value entries, only the lowest bit is reserved for xarray usage;
> all the rest is available for the user (and so DAX uses it).

Ah, thank you. I'm not as familiar with DAX as I'd like. So what should we do?

Best regards,
Amy Parker
(she/her)
Matthew Wilcox Nov. 30, 2020, 4:36 p.m. UTC | #5
On Mon, Nov 30, 2020 at 04:09:23PM +0100, Jan Kara wrote:
> On Mon 30-11-20 06:22:42, Amy Parker wrote:
> > > > +/*
> > > > + * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
> > > > + * definition helps with checking if an entry is a PMD size.
> > > > + */
> > > > +#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
> > > > +
> > >
> > > Firstly, if you define a macro, we usually wrap it in parentheses like:
> > >
> > > #define XA_ZERO_PMD_ENTRY (DAX_PMD | (unsigned long)XA_ZERO_ENTRY)
> > >
> > > to avoid unexpected issues when the macro expands and a surrounding
> > > operator has higher precedence.
> > 
> > Oops! Must've missed that - I'll make sure to get on that when
> > revising this patch.
> > 
> > > Secondly, I don't think you can combine XA_ZERO_ENTRY with DAX_PMD (or any
> > > other bits for that matter). XA_ZERO_ENTRY is defined as
> > > xa_mk_internal(257) which is ((257 << 2) | 2) - DAX bits will overlap with
> > > the bits xarray internal entries are using and things will break.
> > 
> > Could you provide an example of this overlap? I can't seem to find any.
> 
> > Well, XA_ZERO_ENTRY | DAX_PMD == ((257 << 2) | 2) | (1 << 1). So with the way
> > you've defined XA_ZERO_PMD_ENTRY, the DAX_PMD bit will just get lost. AFAIU (but
> > Matthew might correct me here), for internal entries (and XA_ZERO_ENTRY is
> > one instance of such an entry) the low 10 bits of the entry values are
> > reserved for internal xarray usage, so DAX could use only the higher bits. For
> > classical value entries, only the lowest bit is reserved for xarray usage;
> > all the rest is available for the user (and so DAX uses it).

The XArray entries are pretty inventive in how they are used ...

1. If bit 0 is set, it's a value entry.  That is, it encodes an integer
between 0 and LONG_MAX.
2. If bits 0 & 1 are clear, it's a pointer.
3. If bit 0 is clear and bit 1 is set, it's _either_ an internal entry,
_or_ it's a pointer that's only 2-byte aligned.  These can exist on m68k,
alas.

Internal entries above -MAX_ERRNO are used for returning errors.
Internal entries below 1024 (256 << 2) are used for sibling entries.
Internal entry 256 is the retry entry.
Internal entry 257 is the zero entry.
Internal entries 258-1023 are not currently used.
Internal entries between 4096 and MAX_ERRNO are pointers to the next
level of the tree.
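
Expressed as a decision procedure, the rules above look roughly like the
following userspace sketch. It is not the real xa_is_value()/xa_is_internal()
helpers, just the bit tests described in this mail; classify() is a made-up
name, and the payload shifts (>> 1 for value entries, >> 2 for internal
entries) are assumed from the xa_mk_internal(257) == ((257 << 2) | 2)
encoding quoted earlier in the thread.

#include <stdio.h>

static void classify(unsigned long entry)
{
	if (entry & 1)				/* bit 0 set: value entry */
		printf("%#lx: value entry, value %lu\n", entry, entry >> 1);
	else if ((entry & 3) == 0)		/* bits 0 and 1 clear: pointer */
		printf("%#lx: pointer\n", entry);
	else					/* bit 1 set, bit 0 clear */
		printf("%#lx: internal entry %lu (or 2-byte aligned pointer)\n",
		       entry, entry >> 2);
}

int main(void)
{
	classify((42UL << 1) | 1);	/* value entry encoding 42 */
	classify(0x1000);		/* word-aligned pointer */
	classify((256UL << 2) | 2);	/* internal entry 256: the retry entry */
	classify((257UL << 2) | 2);	/* internal entry 257: the zero entry */

	return 0;
}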

The m68k pointer problem is "solved" by only allowing them to be in a
node which is the bottom of the tree.  This means that the optimisation
of placing a single pointer at index 0 in the root of the tree has to be
disabled for these pointers.  That's unfortunate, but there's no other
way to solve it, given the need for RCU readers.  You also can't use
an m68k pointer for a multi-index entry.

There's also support for pointers tagged in their lower bits.  Those are
incompatible with value entries.  And you can't use pointer tag 2 ...

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..fa8ca1a71bbd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -77,9 +77,14 @@  fs_initcall(init_dax_wait_table);
 #define DAX_SHIFT    (4)
 #define DAX_LOCKED    (1UL << 0)
 #define DAX_PMD        (1UL << 1)
-#define DAX_ZERO_PAGE    (1UL << 2)
 #define DAX_EMPTY    (1UL << 3)

+/*
+ * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
+ * definition helps with checking if an entry is a PMD size.
+ */
+#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
+
 static unsigned long dax_to_pfn(void *entry)
 {
     return xa_to_value(entry) >> DAX_SHIFT;
@@ -114,7 +119,7 @@  static bool dax_is_pte_entry(void *entry)

 static int dax_is_zero_entry(void *entry)
 {
-    return xa_to_value(entry) & DAX_ZERO_PAGE;
+    return xa_to_value(entry) & (unsigned long)XA_ZERO_ENTRY;
 }

 static int dax_is_empty_entry(void *entry)
@@ -738,7 +743,7 @@  static void *dax_insert_entry(struct xa_state *xas,
     if (dirty)
         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

-    if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+    if (dax_is_zero_entry(entry) && !(flags & (unsigned long)XA_ZERO_ENTRY)) {
         unsigned long index = xas->xa_index;
         /* we are replacing a zero page with block mapping */
         if (dax_is_pmd_entry(entry))
@@ -1047,7 +1052,7 @@  static vm_fault_t dax_load_hole(struct xa_state *xas,
     vm_fault_t ret;

     *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
-            DAX_ZERO_PAGE, false);
+            XA_ZERO_ENTRY, false);

     ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
     trace_dax_load_hole(inode, vmf, ret);
@@ -1434,7 +1439,7 @@  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,

     pfn = page_to_pfn_t(zero_page);
     *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
-            DAX_PMD | DAX_ZERO_PAGE, false);
+            XA_ZERO_PMD_ENTRY, false);

     if (arch_needs_pgtable_deposit()) {