diff mbox series

[v3,1/5] Xarray: Do not return sibling entries from xas_find_marked()

Message ID 20241213122523.12764-2-shikemeng@huaweicloud.com (mailing list archive)
State New
Headers show
Series Fixes and cleanups to xarray | expand

Commit Message

Kemeng Shi Dec. 13, 2024, 12:25 p.m. UTC
Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
sibling entries from xa_load()"), we may return sibling entries from
xas_find_marked as following:
    Thread A:               Thread B:
                            xa_store_range(xa, entry, 6, 7, gfp);
			    xa_set_mark(xa, 6, mark)
    XA_STATE(xas, xa, 6);
    xas_find_marked(&xas, 7, mark);
    offset = xas_find_chunk(xas, advance, mark);
    [offset is 6 which points to a valid entry]
                            xa_store_range(xa, entry, 4, 7, gfp);
    entry = xa_entry(xa, node, 6);
    [entry is a sibling of 4]
    if (!xa_is_node(entry))
        return entry;

Skip sibling entry like xas_find() does to protect caller from seeing
sibling entry from xas_find_marked() or caller may use sibling entry
as a valid entry and crash the kernel.

Besides, load_race() test is modified to catch mentioned issue and modified
load_race() only passes after this fix is merged.

Here is an example how this bug could be triggerred in tmpfs which
enables large folio in mapping:
Let's take a look at involved racer:
1. How pages could be created and dirtied in shmem file.
write
 ksys_write
  vfs_write
   new_sync_write
    shmem_file_write_iter
     generic_perform_write
      shmem_write_begin
       shmem_get_folio
        shmem_allowable_huge_orders
        shmem_alloc_and_add_folios
        shmem_alloc_folio
        __folio_set_locked
        shmem_add_to_page_cache
         XA_STATE_ORDER(..., index, order)
         xax_store()
      shmem_write_end
       folio_mark_dirty()

2. How dirty pages could be deleted in shmem file.
ioctl
 do_vfs_ioctl
  file_ioctl
   ioctl_preallocate
    vfs_fallocate
     shmem_fallocate
      shmem_truncate_range
       shmem_undo_range
        truncate_inode_folio
         filemap_remove_folio
          page_cache_delete
           xas_store(&xas, NULL);

3. How dirty pages could be lockless searched
sync_file_range
 ksys_sync_file_range
  __filemap_fdatawrite_range
   filemap_fdatawrite_wbc
    do_writepages
     writeback_use_writepage
      writeback_iter
       writeback_get_folio
        filemap_get_folios_tag
         find_get_entry
          folio = xas_find_marked()
          folio_try_get(folio)

Kernel will crash as following:
1.Create               2.Search             3.Delete
/* write page 2,3 */
write
 ...
  shmem_write_begin
   XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
   xa_store(&xas, folio)
  shmem_write_end
   folio_mark_dirty()

                       /* sync page 2 and page 3 */
                       sync_file_range
                        ...
                         find_get_entry
                          folio = xas_find_marked()
                          /* offset will be 2 */
                          offset = xas_find_chunk()

                                             /* delete page 2 and page 3 */
                                             ioctl
                                              ...
                                               xas_store(&xas, NULL);

/* write page 0-3 */
write
 ...
  shmem_write_begin
   XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
   xa_store(&xas, folio)
  shmem_write_end
   folio_mark_dirty(folio)

                          /* get sibling entry from offset 2 */
                          entry = xa_entry(.., 2)
                          /* use sibling entry as folio and crash kernel */
                          folio_try_get(folio)

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
---
 lib/xarray.c                          | 2 ++
 tools/testing/radix-tree/multiorder.c | 4 ++++
 2 files changed, 6 insertions(+)

Comments

Baolin Wang Dec. 13, 2024, 6:12 a.m. UTC | #1
On 2024/12/13 20:25, Kemeng Shi wrote:
> Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
> sibling entries from xa_load()"), we may return sibling entries from
> xas_find_marked as following:
>      Thread A:               Thread B:
>                              xa_store_range(xa, entry, 6, 7, gfp);
> 			    xa_set_mark(xa, 6, mark)
>      XA_STATE(xas, xa, 6);
>      xas_find_marked(&xas, 7, mark);
>      offset = xas_find_chunk(xas, advance, mark);
>      [offset is 6 which points to a valid entry]
>                              xa_store_range(xa, entry, 4, 7, gfp);
>      entry = xa_entry(xa, node, 6);
>      [entry is a sibling of 4]
>      if (!xa_is_node(entry))
>          return entry;
> 
> Skip sibling entry like xas_find() does to protect caller from seeing
> sibling entry from xas_find_marked() or caller may use sibling entry
> as a valid entry and crash the kernel.
> 
> Besides, load_race() test is modified to catch mentioned issue and modified
> load_race() only passes after this fix is merged.
> 
> Here is an example how this bug could be triggerred in tmpfs which
> enables large folio in mapping:
> Let's take a look at involved racer:
> 1. How pages could be created and dirtied in shmem file.
> write
>   ksys_write
>    vfs_write
>     new_sync_write
>      shmem_file_write_iter
>       generic_perform_write
>        shmem_write_begin
>         shmem_get_folio
>          shmem_allowable_huge_orders
>          shmem_alloc_and_add_folios
>          shmem_alloc_folio
>          __folio_set_locked
>          shmem_add_to_page_cache
>           XA_STATE_ORDER(..., index, order)
>           xax_store()
>        shmem_write_end
>         folio_mark_dirty()
> 
> 2. How dirty pages could be deleted in shmem file.
> ioctl
>   do_vfs_ioctl
>    file_ioctl
>     ioctl_preallocate
>      vfs_fallocate
>       shmem_fallocate
>        shmem_truncate_range
>         shmem_undo_range
>          truncate_inode_folio
>           filemap_remove_folio
>            page_cache_delete
>             xas_store(&xas, NULL);
> 
> 3. How dirty pages could be lockless searched
> sync_file_range
>   ksys_sync_file_range
>    __filemap_fdatawrite_range
>     filemap_fdatawrite_wbc

Seems not a good example, IIUC, tmpfs doesn't support writeback 
(mapping_can_writeback() will return false), right?

>      do_writepages
>       writeback_use_writepage
>        writeback_iter
>         writeback_get_folio
>          filemap_get_folios_tag
>           find_get_entry
>            folio = xas_find_marked()
>            folio_try_get(folio)
>
Kemeng Shi Dec. 16, 2024, 7:05 a.m. UTC | #2
on 12/13/2024 2:12 PM, Baolin Wang wrote:
> 
> 
> On 2024/12/13 20:25, Kemeng Shi wrote:
>> Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
>> sibling entries from xa_load()"), we may return sibling entries from
>> xas_find_marked as following:
>>      Thread A:               Thread B:
>>                              xa_store_range(xa, entry, 6, 7, gfp);
>>                 xa_set_mark(xa, 6, mark)
>>      XA_STATE(xas, xa, 6);
>>      xas_find_marked(&xas, 7, mark);
>>      offset = xas_find_chunk(xas, advance, mark);
>>      [offset is 6 which points to a valid entry]
>>                              xa_store_range(xa, entry, 4, 7, gfp);
>>      entry = xa_entry(xa, node, 6);
>>      [entry is a sibling of 4]
>>      if (!xa_is_node(entry))
>>          return entry;
>>
>> Skip sibling entry like xas_find() does to protect caller from seeing
>> sibling entry from xas_find_marked() or caller may use sibling entry
>> as a valid entry and crash the kernel.
>>
>> Besides, load_race() test is modified to catch mentioned issue and modified
>> load_race() only passes after this fix is merged.
>>
>> Here is an example how this bug could be triggerred in tmpfs which
>> enables large folio in mapping:
>> Let's take a look at involved racer:
>> 1. How pages could be created and dirtied in shmem file.
>> write
>>   ksys_write
>>    vfs_write
>>     new_sync_write
>>      shmem_file_write_iter
>>       generic_perform_write
>>        shmem_write_begin
>>         shmem_get_folio
>>          shmem_allowable_huge_orders
>>          shmem_alloc_and_add_folios
>>          shmem_alloc_folio
>>          __folio_set_locked
>>          shmem_add_to_page_cache
>>           XA_STATE_ORDER(..., index, order)
>>           xax_store()
>>        shmem_write_end
>>         folio_mark_dirty()
>>
>> 2. How dirty pages could be deleted in shmem file.
>> ioctl
>>   do_vfs_ioctl
>>    file_ioctl
>>     ioctl_preallocate
>>      vfs_fallocate
>>       shmem_fallocate
>>        shmem_truncate_range
>>         shmem_undo_range
>>          truncate_inode_folio
>>           filemap_remove_folio
>>            page_cache_delete
>>             xas_store(&xas, NULL);
>>
>> 3. How dirty pages could be lockless searched
>> sync_file_range
>>   ksys_sync_file_range
>>    __filemap_fdatawrite_range
>>     filemap_fdatawrite_wbc
> 
> Seems not a good example, IIUC, tmpfs doesn't support writeback (mapping_can_writeback() will return false), right?
> 
Ahhh, right. Thank you for correcting me. Then I would like to use nfs as low-level
 filesystem in example and the potential crash could be triggered in the same steps.

Invovled racers:
1. How pages could be created and dirtied in nfs.
write
 ksys_write
  vfs_write
   new_sync_write
    nfs_file_write
     generic_perform_write
      nfs_write_begin
	   fgf_set_order
	   __filemap_get_folio
      nfs_write_end
       nfs_update_folio
	    nfs_writepage_setup
		 nfs_mark_request_dirty
		  filemap_dirty_folio
		   __folio_mark_dirty
		    __xa_set_mark

2. How dirty pages could be deleted in nfs.
ioctl
 do_vfs_ioctl
  file_ioctl
   ioctl_preallocate
    vfs_fallocate
     nfs42_fallocate
      nfs42_proc_deallocate
       truncate_pagecache_range
        truncate_inode_pages_range
         truncate_inode_folio
	  filemap_remove_folio
	   page_cache_delete
	    xas_store(&xas, NULL);


3. How dirty pages could be lockless searched
sync_file_range
 ksys_sync_file_range
  __filemap_fdatawrite_range
   filemap_fdatawrite_wbc
    do_writepages
     writeback_use_writepage
      writeback_iter
       writeback_get_folio
        filemap_get_folios_tag
         find_get_entry
          folio = xas_find_marked()
          folio_try_get(folio)

Steps to crash kernel:
1.Create               2.Search             3.Delete
/* write page 2,3 */
write
 ...
  nfs_write_begin
   fgf_set_order
   __filemap_get_folio
    ...
     xa_store(&xas, folio)
  nfs_write_end
   ...
    __folio_mark_dirty

                       /* sync page 2 and page 3 */
                       sync_file_range
                        ...
                         find_get_entry
                          folio = xas_find_marked()
                          /* offset will be 2 */
                          offset = xas_find_chunk()

                                             /* delete page 2 and page 3 */
                                             ioctl
                                              ...
                                               xas_store(&xas, NULL);

/* write page 0-3 */
write
 ...
  nfs_write_begin
   fgf_set_order
   __filemap_get_folio
    ...
     xa_store(&xas, folio)
  nfs_write_end
   ...
    __folio_mark_dirty

                          /* get sibling entry from offset 2 */
                          entry = xa_entry(.., 2)
                          /* use sibling entry as folio and crash kernel */
                          folio_try_get(folio)
Matthew Wilcox (Oracle) Dec. 16, 2024, 2:04 p.m. UTC | #3
On Mon, Dec 16, 2024 at 03:05:26PM +0800, Kemeng Shi wrote:
> Ahhh, right. Thank you for correcting me. Then I would like to use nfs as low-level
>  filesystem in example and the potential crash could be triggered in the same steps.

Have you actually seen this crash, or do you just think it can happen?
Kemeng Shi Dec. 17, 2024, 12:59 a.m. UTC | #4
on 12/16/2024 10:04 PM, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 03:05:26PM +0800, Kemeng Shi wrote:
>> Ahhh, right. Thank you for correcting me. Then I would like to use nfs as low-level
>>  filesystem in example and the potential crash could be triggered in the same steps.
> 
> Have you actually seen this crash, or do you just think it can happen?
> 
I just think it can happen. Sorry for confusion...
diff mbox series

Patch

diff --git a/lib/xarray.c b/lib/xarray.c
index 32d4bac8c94c..fa87949719a0 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1382,6 +1382,8 @@  void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
 		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
 		if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
 			continue;
+		if (xa_is_sibling(entry))
+			continue;
 		if (!xa_is_node(entry))
 			return entry;
 		xas->xa_node = xa_to_node(entry);
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index cffaf2245d4f..eaff1b036989 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -227,6 +227,7 @@  static void *load_creator(void *ptr)
 			unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
 						(1 << order);
 			item_insert_order(tree, index, order);
+			xa_set_mark(tree, index, XA_MARK_1);
 			item_delete_rcu(tree, index);
 		}
 	}
@@ -242,8 +243,11 @@  static void *load_worker(void *ptr)
 
 	rcu_register_thread();
 	while (!stop_iteration) {
+		unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
 		struct item *item = xa_load(ptr, index);
 		assert(!xa_is_internal(item));
+		item = xa_find(ptr, &find_index, index, XA_MARK_1);
+		assert(!xa_is_internal(item));
 	}
 	rcu_unregister_thread();