diff mbox series

[Resend,v3] ocfs2: fix kernel crash after mounting when journal doesn't ready

Message ID 20220318150401.2411-1-heming.zhao@suse.com (mailing list archive)
State New, archived
Headers show
Series [Resend,v3] ocfs2: fix kernel crash after mounting when journal doesn't ready | expand

Commit Message

heming.zhao@suse.com March 18, 2022, 3:04 p.m. UTC
Call trace:

  ocfs2: Registered cluster interface user
  dlm: no local IP address has been set
  dlm: cannot start dlm lowcomms -107
  (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
  (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
  (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
  BUG: kernel NULL pointer dereference, address: 0000000000000030
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: 0000 [#1] PREEMPT SMP NOPTI
  CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
openSUSE Tumbleweed b40...e84
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
  RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
  ...
  Call Trace:
   <TASK>
   ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
   evict+0xc0/0x1c0
   ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
   ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
   ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
   mount_bdev+0x182/0x1b0
   ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
   legacy_get_tree+0x24/0x40
   vfs_get_tree+0x22/0xb0
   path_mount+0x465/0xac0
   __x64_sys_mount+0x103/0x140
   do_syscall_64+0x59/0x80
   ? syscall_exit_to_user_mode+0x18/0x40
   ? do_syscall_64+0x69/0x80
   ? syscall_exit_to_user_mode+0x18/0x40
   ? do_syscall_64+0x69/0x80
   ? exc_page_fault+0x68/0x150
   entry_SYSCALL_64_after_hwframe+0x44/0xae

How to trigger:

  tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
  tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
             --cluster-name=ocfstst

             == there is no dlm running ==
  tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
             == kernel crash ==

Analysis:

After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
Journal init later than before, it makes NULL pointer access in free
routine.

ocfs2_fill_super
 + ocfs2_mount_volume
 |  ocfs2_dlm_init //failed, osb->journal still NULL.
 + ...
 + ocfs2_dismount_volume
    ocfs2_release_system_inodes
      ...
       evict
        ...
         ocfs2_clear_inode
          ocfs2_checkpoint_inode
           ocfs2_ci_fully_checkpointed
            time_after(journal->j_trans_id, ci->ci_last_trans)
             + journal is empty, crash!

How to fix:

create a new function _inode_direct_free() to handle journal non-exist
case.

Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
Signed-off-by: Heming Zhao <heming.zhao@suse.com>
---
v3: change patch subject
    change fix method:
    - v1/v2 focused on the points of accessing osb->journal.
    - v3 directly free inodes
v2: revise commit log
    add checking in ocfs2_dismount_volume()
---
 fs/ocfs2/localalloc.c | 12 ++++++++
 fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 4 deletions(-)

Comments

Joseph Qi March 20, 2022, 1:51 p.m. UTC | #1
I don't think it deserves a complex fix like this.
iput() is a common action and we don't want to re-create a private flow
in ocfs2. Also I think it's not safe.
I suggest we take the way to make sure ocfs2 journal is there as a
prerequisite.

Thanks,
Joseph

On 3/18/22 11:04 PM, Heming Zhao wrote:
> Call trace:
> 
>   ocfs2: Registered cluster interface user
>   dlm: no local IP address has been set
>   dlm: cannot start dlm lowcomms -107
>   (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>   (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>   (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>   BUG: kernel NULL pointer dereference, address: 0000000000000030
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   PGD 0 P4D 0
>   Oops: 0000 [#1] PREEMPT SMP NOPTI
>   CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
> openSUSE Tumbleweed b40...e84
>   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>   RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>   ...
>   Call Trace:
>    <TASK>
>    ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>    evict+0xc0/0x1c0
>    ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>    ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>    ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>    mount_bdev+0x182/0x1b0
>    ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>    legacy_get_tree+0x24/0x40
>    vfs_get_tree+0x22/0xb0
>    path_mount+0x465/0xac0
>    __x64_sys_mount+0x103/0x140
>    do_syscall_64+0x59/0x80
>    ? syscall_exit_to_user_mode+0x18/0x40
>    ? do_syscall_64+0x69/0x80
>    ? syscall_exit_to_user_mode+0x18/0x40
>    ? do_syscall_64+0x69/0x80
>    ? exc_page_fault+0x68/0x150
>    entry_SYSCALL_64_after_hwframe+0x44/0xae
> 
> How to trigger:
> 
>   tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>   tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>              --cluster-name=ocfstst
> 
>              == there is no dlm running ==
>   tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>              == kernel crash ==
> 
> Analysis:
> 
> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
> Journal init later than before, it makes NULL pointer access in free
> routine.
> 
> ocfs2_fill_super
>  + ocfs2_mount_volume
>  |  ocfs2_dlm_init //failed, osb->journal still NULL.
>  + ...
>  + ocfs2_dismount_volume
>     ocfs2_release_system_inodes
>       ...
>        evict
>         ...
>          ocfs2_clear_inode
>           ocfs2_checkpoint_inode
>            ocfs2_ci_fully_checkpointed
>             time_after(journal->j_trans_id, ci->ci_last_trans)
>              + journal is empty, crash!
> 
> How to fix:
> 
> create a new function _inode_direct_free() to handle journal non-exist
> case.
> 
> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> ---
> v3: change patch subject
>     change fix method:
>     - v1/v2 focused on the points of accessing osb->journal.
>     - v3 directly free inodes
> v2: revise commit log
>     add checking in ocfs2_dismount_volume()
> ---
>  fs/ocfs2/localalloc.c | 12 ++++++++
>  fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 75 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 5f6bacbeef6b..c7694c2a1000 100644
> --- a/fs/ocfs2/localalloc.c
> +++ b/fs/ocfs2/localalloc.c
> @@ -412,6 +412,18 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
>  		goto out_mutex;
>  	}
>  
> +	if (!osb->journal) {
> +		/* Does here make sense to handle bh? */
> +		if (osb->local_alloc_bh)
> +			brelse(osb->local_alloc_bh);
> +		osb->local_alloc_bh = NULL;
> +		osb->local_alloc_state = OCFS2_LA_UNUSED;
> +
> +		/*  After out_unlock calling iput() is safe, in theory system inode
> +		 * still be hold in dismount flow */
> +		goto out_unlock;
> +	}
> +
>  	/* WINDOW_MOVE_CREDITS is a bit heavy... */
>  	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
>  	if (IS_ERR(handle)) {
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 8bde30fa5387..65aedfdcb2f9 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -504,28 +504,79 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
>  	return status;
>  }
>  
> +/*
> + * This function does the directly release action, which makes sure
> + * there is no memory & lockres leak for inode.
> + *
> + * How it works:
> + *
> + * refer iput() flow, which will call following key functions:
> + *   ocfs2_drop_inode, ocfs2_evict_inode, ocfs2_clear_inode, ocfs2_free_inode.
> + *
> + * - ocfs2_drop_inode() & ocfs2_clear_inode() are useless when directly remove.
> + * - other ocfs2_xx_inode() should be mentioned in this function.
> + */
> +static void _inode_direct_free(struct ocfs2_super *osb, struct inode *inode)
> +{
> +	int status;
> +	struct ocfs2_inode_info *oi = OCFS2_I(inode);
> +
> +	/* 1> copy from ocfs2_evict_inode() */
> +	truncate_inode_pages_final(&inode->i_data);
> +
> +	ocfs2_open_unlock(inode);
> +
> +	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
> +	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
> +	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
> +
> +	status = ocfs2_drop_inode_locks(inode);
> +	if (status < 0)
> +		mlog_errno(status);
> +
> +	ocfs2_lock_res_free(&oi->ip_rw_lockres);
> +	ocfs2_lock_res_free(&oi->ip_inode_lockres);
> +	ocfs2_lock_res_free(&oi->ip_open_lockres);
> +	ocfs2_metadata_cache_exit(INODE_CACHE(inode));
> +
> +	oi->ip_flags = 0;
> +	oi->ip_dir_start_lookup = 0;
> +	oi->ip_blkno = 0ULL;
> +
> +	/* 2> free memory */
> +	ocfs2_free_inode(inode);
> +}
> +
> +static void _inode_iput_free(struct ocfs2_super *osb, struct inode *inode)
> +{
> +	iput(inode);
> +}
> +
>  static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
>  {
>  	int i;
>  	struct inode *inode;
> +	void (*_free_inode)(struct ocfs2_super *osb, struct inode *inode);
> +
> +	_free_inode = osb->journal ? _inode_iput_free : _inode_direct_free;
>  
>  	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
>  		inode = osb->global_system_inodes[i];
>  		if (inode) {
> -			iput(inode);
> +			_free_inode(osb, inode);
>  			osb->global_system_inodes[i] = NULL;
>  		}
>  	}
>  
>  	inode = osb->sys_root_inode;
>  	if (inode) {
> -		iput(inode);
> +		_free_inode(osb, inode);
>  		osb->sys_root_inode = NULL;
>  	}
>  
>  	inode = osb->root_inode;
>  	if (inode) {
> -		iput(inode);
> +		_free_inode(osb, inode);
>  		osb->root_inode = NULL;
>  	}
>  
> @@ -534,7 +585,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
>  
>  	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
>  		if (osb->local_system_inodes[i]) {
> -			iput(osb->local_system_inodes[i]);
> +			_free_inode(osb, osb->local_system_inodes[i]);
>  			osb->local_system_inodes[i] = NULL;
>  		}
>  	}
> @@ -1870,6 +1921,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>  	osb = OCFS2_SB(sb);
>  	BUG_ON(!osb);
>  
> +	if (!osb->journal) {
> +		ocfs2_shutdown_local_alloc(osb);
> +		ocfs2_recovery_exit(osb);
> +		ocfs2_purge_refcount_trees(osb);
> +		goto put_slot;
> +	}
> +
>  	/* Remove file check sysfs related directores/files,
>  	 * and wait for the pending file check operations */
>  	ocfs2_filecheck_remove_sysfs(osb);
> @@ -1897,6 +1955,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
>  
>  	ocfs2_purge_refcount_trees(osb);
>  
> +put_slot:
>  	/* No cluster connection means we've failed during mount, so skip
>  	 * all the steps which depended on that to complete. */
>  	if (osb->cconn) {
David Sterba via Ocfs2-devel March 20, 2022, 3:17 p.m. UTC | #2
On 3/20/22 21:51, Joseph Qi wrote:
> I don't think it deserves a complex fix like this.
> iput() is a common action and we don't want to re-create a private flow
> in ocfs2. Also I think it's not safe.

I have another viewpoint, it's not complex, the v3 is only related with "error mount"
or umount flow (I could change it only focus on error mount flow).
all modified functions: ocfs2_shutdown_local_alloc & ocfs2_release_system_inodes.
these two funcs won't be used in normal IOs flow. IMO the changes are safe.



> I suggest we take the way to make sure ocfs2 journal is there as a
> prerequisite.

with this idea, journal should be inited before any inode operations. it's tricky and
developers should take care of it all the time. commit da5e7c87827e8 is the cost of the
tricky code. But with v3 patch, (in theory) journal could be initialized at any time when
mounting. which could avoid similar bugs afterwards.

Thanks,
Heming

> 
> On 3/18/22 11:04 PM, Heming Zhao wrote:
>> Call trace:
>>
>>    ocfs2: Registered cluster interface user
>>    dlm: no local IP address has been set
>>    dlm: cannot start dlm lowcomms -107
>>    (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>    (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>    (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>    BUG: kernel NULL pointer dereference, address: 0000000000000030
>>    #PF: supervisor read access in kernel mode
>>    #PF: error_code(0x0000) - not-present page
>>    PGD 0 P4D 0
>>    Oops: 0000 [#1] PREEMPT SMP NOPTI
>>    CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>> openSUSE Tumbleweed b40...e84
>>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>    RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>    ...
>>    Call Trace:
>>     <TASK>
>>     ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>     evict+0xc0/0x1c0
>>     ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>     ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>     ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>     mount_bdev+0x182/0x1b0
>>     ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>     legacy_get_tree+0x24/0x40
>>     vfs_get_tree+0x22/0xb0
>>     path_mount+0x465/0xac0
>>     __x64_sys_mount+0x103/0x140
>>     do_syscall_64+0x59/0x80
>>     ? syscall_exit_to_user_mode+0x18/0x40
>>     ? do_syscall_64+0x69/0x80
>>     ? syscall_exit_to_user_mode+0x18/0x40
>>     ? do_syscall_64+0x69/0x80
>>     ? exc_page_fault+0x68/0x150
>>     entry_SYSCALL_64_after_hwframe+0x44/0xae
>>
>> How to trigger:
>>
>>    tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>    tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>               --cluster-name=ocfstst
>>
>>               == there is no dlm running ==
>>    tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>               == kernel crash ==
>>
>> Analysis:
>>
>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>> Journal init later than before, it makes NULL pointer access in free
>> routine.
>>
>> ocfs2_fill_super
>>   + ocfs2_mount_volume
>>   |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>   + ...
>>   + ocfs2_dismount_volume
>>      ocfs2_release_system_inodes
>>        ...
>>         evict
>>          ...
>>           ocfs2_clear_inode
>>            ocfs2_checkpoint_inode
>>             ocfs2_ci_fully_checkpointed
>>              time_after(journal->j_trans_id, ci->ci_last_trans)
>>               + journal is empty, crash!
>>
>> ...
David Sterba via Ocfs2-devel March 21, 2022, 8:28 a.m. UTC | #3
On 3/20/22 23:17, heming.zhao--- via Ocfs2-devel wrote:
> On 3/20/22 21:51, Joseph Qi wrote:
>> I don't think it deserves a complex fix like this.
>> iput() is a common action and we don't want to re-create a private flow
>> in ocfs2. Also I think it's not safe.
> 
> I have another viewpoint, it's not complex, the v3 is only related with "error mount"
> or umount flow (I could change it only focus on error mount flow).
> all modified functions: ocfs2_shutdown_local_alloc & ocfs2_release_system_inodes.
> these two funcs won't be used in normal IOs flow. IMO the changes are safe.
> 

v3 patch still needs to modify/polish, but the core idea doesn't change:
There will create a new func, which calls iput() or _inode_direct_free() according to the
osb->journal value.

Thanks,

> 
>> I suggest we take the way to make sure ocfs2 journal is there as a
>> prerequisite.
> 
> with this idea, journal should be inited before any inode operations. it's tricky and
> developers should take care of it all the time. commit da5e7c87827e8 is the cost of the
> tricky code. But with v3 patch, (in theory) journal could be initialized at any time when
> mounting. which could avoid similar bugs afterwards.
> 
> Thanks,
> Heming
> 
>>
>> On 3/18/22 11:04 PM, Heming Zhao wrote:
>>> Call trace:
>>>
>>>    ocfs2: Registered cluster interface user
>>>    dlm: no local IP address has been set
>>>    dlm: cannot start dlm lowcomms -107
>>>    (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>>    (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>>    (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>>    BUG: kernel NULL pointer dereference, address: 0000000000000030
>>>    #PF: supervisor read access in kernel mode
>>>    #PF: error_code(0x0000) - not-present page
>>>    PGD 0 P4D 0
>>>    Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>    CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>>> openSUSE Tumbleweed b40...e84
>>>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>>    RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>>    ...
>>>    Call Trace:
>>>     <TASK>
>>>     ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>>     evict+0xc0/0x1c0
>>>     ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>>     ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>>     ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>>     mount_bdev+0x182/0x1b0
>>>     ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>>     legacy_get_tree+0x24/0x40
>>>     vfs_get_tree+0x22/0xb0
>>>     path_mount+0x465/0xac0
>>>     __x64_sys_mount+0x103/0x140
>>>     do_syscall_64+0x59/0x80
>>>     ? syscall_exit_to_user_mode+0x18/0x40
>>>     ? do_syscall_64+0x69/0x80
>>>     ? syscall_exit_to_user_mode+0x18/0x40
>>>     ? do_syscall_64+0x69/0x80
>>>     ? exc_page_fault+0x68/0x150
>>>     entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>
>>> How to trigger:
>>>
>>>    tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>>    tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>>               --cluster-name=ocfstst
>>>
>>>               == there is no dlm running ==
>>>    tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>>               == kernel crash ==
>>>
>>> Analysis:
>>>
>>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>>> Journal init later than before, it makes NULL pointer access in free
>>> routine.
>>>
>>> ocfs2_fill_super
>>>   + ocfs2_mount_volume
>>>   |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>>   + ...
>>>   + ocfs2_dismount_volume
>>>      ocfs2_release_system_inodes
>>>        ...
>>>         evict
>>>          ...
>>>           ocfs2_clear_inode
>>>            ocfs2_checkpoint_inode
>>>             ocfs2_ci_fully_checkpointed
>>>              time_after(journal->j_trans_id, ci->ci_last_trans)
>>>               + journal is empty, crash!
>>>
>>> ...
> 
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
David Sterba via Ocfs2-devel March 27, 2022, 10:37 a.m. UTC | #4
Hello Joseph,

Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
Do you have another alternative patch?

On 3/20/22 21:51, Joseph Qi wrote:
> I don't think it deserves a complex fix like this.
> iput() is a common action and we don't want to re-create a private flow
> in ocfs2. Also I think it's not safe.
> I suggest we take the way to make sure ocfs2 journal is there as a
> prerequisite.

let's image the journal init before any inode operations. and then there will
create a gap between "journal init" and "dlm init". for a rule, any inode operation
should be under dlm lock protecting. if there is a failed before dlm init.
how to handle locking consistency?
the correct code logic should do journal init after dlm lock init. IMO the commit
ccd979bdbce9f ("OCFS2: The Second Oracle Cluster Filesystem") made the locking
mistake on first code version. But author wrote comment to explain the reason.

     ```
     /* FIXME
      * This should be done in ocfs2_journal_init(), but unknown
      * ordering issues will cause the filesystem to crash.
      * If anyone wants to figure out what part of the code
      * refers to osb->journal before ocfs2_journal_init() is run,
      * be my guest.
      */
     ```

We don't simply turn back to pick up the problematic code logic. we should
find a new method.

- Heming

> 
> On 3/18/22 11:04 PM, Heming Zhao wrote:
>> Call trace:
>>
>>    ocfs2: Registered cluster interface user
>>    dlm: no local IP address has been set
>>    dlm: cannot start dlm lowcomms -107
>>    (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>    (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>    (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>    BUG: kernel NULL pointer dereference, address: 0000000000000030
>>    #PF: supervisor read access in kernel mode
>>    #PF: error_code(0x0000) - not-present page
>>    PGD 0 P4D 0
>>    Oops: 0000 [#1] PREEMPT SMP NOPTI
>>    CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>> openSUSE Tumbleweed b40...e84
>>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>    RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>    ...
>>    Call Trace:
>>     <TASK>
>>     ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>     evict+0xc0/0x1c0
>>     ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>     ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>     ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>     mount_bdev+0x182/0x1b0
>>     ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>     legacy_get_tree+0x24/0x40
>>     vfs_get_tree+0x22/0xb0
>>     path_mount+0x465/0xac0
>>     __x64_sys_mount+0x103/0x140
>>     do_syscall_64+0x59/0x80
>>     ? syscall_exit_to_user_mode+0x18/0x40
>>     ? do_syscall_64+0x69/0x80
>>     ? syscall_exit_to_user_mode+0x18/0x40
>>     ? do_syscall_64+0x69/0x80
>>     ? exc_page_fault+0x68/0x150
>>     entry_SYSCALL_64_after_hwframe+0x44/0xae
>>
>> How to trigger:
>>
>>    tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>    tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>               --cluster-name=ocfstst
>>
>>               == there is no dlm running ==
>>    tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>               == kernel crash ==
>>
>> Analysis:
>>
>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>> Journal init later than before, it makes NULL pointer access in free
>> routine.
>>
>> ocfs2_fill_super
>>   + ocfs2_mount_volume
>>   |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>   + ...
>>   + ocfs2_dismount_volume
>>      ocfs2_release_system_inodes
>>        ...
>>         evict
>>          ...
>>           ocfs2_clear_inode
>>            ocfs2_checkpoint_inode
>>             ocfs2_ci_fully_checkpointed
>>              time_after(journal->j_trans_id, ci->ci_last_trans)
>>               + journal is empty, crash!
>>
>> How to fix:
>>
>> create a new function _inode_direct_free() to handle journal non-exist
>> case.
>>
>> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>> ---
>> v3: change patch subject
>>      change fix method:
>>      - v1/v2 focused on the points of accessing osb->journal.
>>      - v3 directly free inodes
>> v2: revise commit log
>>      add checking in ocfs2_dismount_volume()
>> ---
>>   fs/ocfs2/localalloc.c | 12 ++++++++
>>   fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>>   2 files changed, 75 insertions(+), 4 deletions(-)
>> ...
>
Joseph Qi March 27, 2022, 4:04 p.m. UTC | #5
On 3/27/22 6:37 PM, heming.zhao@suse.com wrote:
> Hello Joseph,
> 
> Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
> Do you have another alternative patch?
> 
As I've described in previous mail, I don't want to fix it in the way
you proposed.
And I am afraid we need do a thoroughly check since I've found there
are other issues in ocfs2_fill_super(), e.g.

ocfs2_fill_super
  ocfs2_initialize_super // fails in check max_slots

Then osb->osb_mount_event is not properly initialized, and then will
crash at wake_up(&osb->osb_mount_event).

Thanks,
Joseph

> On 3/20/22 21:51, Joseph Qi wrote:
>> I don't think it deserves a complex fix like this.
>> iput() is a common action and we don't want to re-create a private flow
>> in ocfs2. Also I think it's not safe.
>> I suggest we take the way to make sure ocfs2 journal is there as a
>> prerequisite.
> 
> let's image the journal init before any inode operations. and then there will
> create a gap between "journal init" and "dlm init". for a rule, any inode operation
> should be under dlm lock protecting. if there is a failed before dlm init.
> how to handle locking consistency?
> the correct code logic should do journal init after dlm lock init. IMO the commit
> ccd979bdbce9f ("OCFS2: The Second Oracle Cluster Filesystem") made the locking
> mistake on first code version. But author wrote comment to explain the reason.
> 
>     ```
>     /* FIXME
>      * This should be done in ocfs2_journal_init(), but unknown
>      * ordering issues will cause the filesystem to crash.
>      * If anyone wants to figure out what part of the code
>      * refers to osb->journal before ocfs2_journal_init() is run,
>      * be my guest.
>      */
>     ```
> 
> We don't simply turn back to pick up the problematic code logic. we should
> find a new method.
> 
> - Heming
> 
>>
>> On 3/18/22 11:04 PM, Heming Zhao wrote:
>>> Call trace:
>>>
>>>    ocfs2: Registered cluster interface user
>>>    dlm: no local IP address has been set
>>>    dlm: cannot start dlm lowcomms -107
>>>    (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>>    (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>>    (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>>    BUG: kernel NULL pointer dereference, address: 0000000000000030
>>>    #PF: supervisor read access in kernel mode
>>>    #PF: error_code(0x0000) - not-present page
>>>    PGD 0 P4D 0
>>>    Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>    CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>>> openSUSE Tumbleweed b40...e84
>>>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>>    RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>>    ...
>>>    Call Trace:
>>>     <TASK>
>>>     ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>>     evict+0xc0/0x1c0
>>>     ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>>     ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>>     ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>>     mount_bdev+0x182/0x1b0
>>>     ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>>     legacy_get_tree+0x24/0x40
>>>     vfs_get_tree+0x22/0xb0
>>>     path_mount+0x465/0xac0
>>>     __x64_sys_mount+0x103/0x140
>>>     do_syscall_64+0x59/0x80
>>>     ? syscall_exit_to_user_mode+0x18/0x40
>>>     ? do_syscall_64+0x69/0x80
>>>     ? syscall_exit_to_user_mode+0x18/0x40
>>>     ? do_syscall_64+0x69/0x80
>>>     ? exc_page_fault+0x68/0x150
>>>     entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>
>>> How to trigger:
>>>
>>>    tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>>    tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>>               --cluster-name=ocfstst
>>>
>>>               == there is no dlm running ==
>>>    tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>>               == kernel crash ==
>>>
>>> Analysis:
>>>
>>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>>> Journal init later than before, it makes NULL pointer access in free
>>> routine.
>>>
>>> ocfs2_fill_super
>>>   + ocfs2_mount_volume
>>>   |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>>   + ...
>>>   + ocfs2_dismount_volume
>>>      ocfs2_release_system_inodes
>>>        ...
>>>         evict
>>>          ...
>>>           ocfs2_clear_inode
>>>            ocfs2_checkpoint_inode
>>>             ocfs2_ci_fully_checkpointed
>>>              time_after(journal->j_trans_id, ci->ci_last_trans)
>>>               + journal is empty, crash!
>>>
>>> How to fix:
>>>
>>> create a new function _inode_direct_free() to handle journal non-exist
>>> case.
>>>
>>> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
>>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>>> ---
>>> v3: change patch subject
>>>      change fix method:
>>>      - v1/v2 focused on the points of accessing osb->journal.
>>>      - v3 directly free inodes
>>> v2: revise commit log
>>>      add checking in ocfs2_dismount_volume()
>>> ---
>>>   fs/ocfs2/localalloc.c | 12 ++++++++
>>>   fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>>>   2 files changed, 75 insertions(+), 4 deletions(-)
>>> ...
>>
David Sterba via Ocfs2-devel March 27, 2022, 4:56 p.m. UTC | #6
On 3/28/22 00:04, Joseph Qi wrote:
> 
> 
> On 3/27/22 6:37 PM, heming.zhao@suse.com wrote:
>> Hello Joseph,
>>
>> Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
>> Do you have another alternative patch?
>>
> As I've described in previous mail, I don't want to fix it in the way
> you proposed.

I am OK with your mind, and only hope to fix this problem ASAP.

> And I am afraid we need do a thoroughly check since I've found there
> are other issues in ocfs2_fill_super(), e.g.
> 
> ocfs2_fill_super
>    ocfs2_initialize_super // fails in check max_slots
> 
> Then osb->osb_mount_event is not properly initialized, and then will
> crash at wake_up(&osb->osb_mount_event).
> 

The error handling in ocfs2_fill_super() is too simple. IMO, there needs to add more
kinds of "goto lable" or replace some "goto" with "return" for directly breaking
init routine.

the ocfs2_initialize_super() looks too long to handle error cases gracefully.
I can image there could be two type solutions:
1> split ocfs2_initialize_super() into some smaller funcs. and ocfs2_fill_super()
    calls these splited funcs, handles different error cases.
2> ocfs2_initialize_super() returns more meaningful error number, then
    ocfs2_fill_super() could handle them case by case.
    i.e. add a new enum for ocfs2_initialize_super() return
    enum {

       OSB_ALLOC_ERR,

       MAX_SLOTS_ERR,

       RECOVERY_ERR,

       RESMAP_ERR,

       ...

    };

- Heming

> 
>> On 3/20/22 21:51, Joseph Qi wrote:
>>> I don't think it deserves a complex fix like this.
>>> iput() is a common action and we don't want to re-create a private flow
>>> in ocfs2. Also I think it's not safe.
>>> I suggest we take the way to make sure ocfs2 journal is there as a
>>> prerequisite.
>>
>> let's image the journal init before any inode operations. and then there will
>> create a gap between "journal init" and "dlm init". for a rule, any inode operation
>> should be under dlm lock protecting. if there is a failed before dlm init.
>> how to handle locking consistency?
>> the correct code logic should do journal init after dlm lock init. IMO the commit
>> ccd979bdbce9f ("OCFS2: The Second Oracle Cluster Filesystem") made the locking
>> mistake on first code version. But author wrote comment to explain the reason.
>>
>>      ```
>>      /* FIXME
>>       * This should be done in ocfs2_journal_init(), but unknown
>>       * ordering issues will cause the filesystem to crash.
>>       * If anyone wants to figure out what part of the code
>>       * refers to osb->journal before ocfs2_journal_init() is run,
>>       * be my guest.
>>       */
>>      ```
>>
>> We don't simply turn back to pick up the problematic code logic. we should
>> find a new method.
>>
>> - Heming
>>
>>>
>>> On 3/18/22 11:04 PM, Heming Zhao wrote:
>>>> Call trace:
>>>>
>>>>     ocfs2: Registered cluster interface user
>>>>     dlm: no local IP address has been set
>>>>     dlm: cannot start dlm lowcomms -107
>>>>     (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>>>     (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>>>     (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>>>     BUG: kernel NULL pointer dereference, address: 0000000000000030
>>>>     #PF: supervisor read access in kernel mode
>>>>     #PF: error_code(0x0000) - not-present page
>>>>     PGD 0 P4D 0
>>>>     Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>>     CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>>>> openSUSE Tumbleweed b40...e84
>>>>     Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>>>     RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>>>     ...
>>>>     Call Trace:
>>>>      <TASK>
>>>>      ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>>>      evict+0xc0/0x1c0
>>>>      ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>>>      ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>>>      ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>>>      mount_bdev+0x182/0x1b0
>>>>      ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>>>      legacy_get_tree+0x24/0x40
>>>>      vfs_get_tree+0x22/0xb0
>>>>      path_mount+0x465/0xac0
>>>>      __x64_sys_mount+0x103/0x140
>>>>      do_syscall_64+0x59/0x80
>>>>      ? syscall_exit_to_user_mode+0x18/0x40
>>>>      ? do_syscall_64+0x69/0x80
>>>>      ? syscall_exit_to_user_mode+0x18/0x40
>>>>      ? do_syscall_64+0x69/0x80
>>>>      ? exc_page_fault+0x68/0x150
>>>>      entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>>
>>>> How to trigger:
>>>>
>>>>     tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>>>     tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>>>                --cluster-name=ocfstst
>>>>
>>>>                == there is no dlm running ==
>>>>     tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>>>                == kernel crash ==
>>>>
>>>> Analysis:
>>>>
>>>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>>>> Journal init later than before, it makes NULL pointer access in free
>>>> routine.
>>>>
>>>> ocfs2_fill_super
>>>>    + ocfs2_mount_volume
>>>>    |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>>>    + ...
>>>>    + ocfs2_dismount_volume
>>>>       ocfs2_release_system_inodes
>>>>         ...
>>>>          evict
>>>>           ...
>>>>            ocfs2_clear_inode
>>>>             ocfs2_checkpoint_inode
>>>>              ocfs2_ci_fully_checkpointed
>>>>               time_after(journal->j_trans_id, ci->ci_last_trans)
>>>>                + journal is empty, crash!
>>>>
>>>> How to fix:
>>>>
>>>> create a new function _inode_direct_free() to handle journal non-exist
>>>> case.
>>>>
>>>> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
>>>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>>>> ---
>>>> v3: change patch subject
>>>>       change fix method:
>>>>       - v1/v2 focused on the points of accessing osb->journal.
>>>>       - v3 directly free inodes
>>>> v2: revise commit log
>>>>       add checking in ocfs2_dismount_volume()
>>>> ---
>>>>    fs/ocfs2/localalloc.c | 12 ++++++++
>>>>    fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>>>>    2 files changed, 75 insertions(+), 4 deletions(-)
>>>> ...
>>>
>
Joseph Qi March 28, 2022, 2:08 a.m. UTC | #7
On 3/28/22 12:56 AM, heming.zhao@suse.com wrote:
> On 3/28/22 00:04, Joseph Qi wrote:
>>
>>
>> On 3/27/22 6:37 PM, heming.zhao@suse.com wrote:
>>> Hello Joseph,
>>>
>>> Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
>>> Do you have another alternative patch?
>>>
>> As I've described in previous mail, I don't want to fix it in the way
>> you proposed.
> 
> I am OK with your mind, and only hope to fix this problem ASAP.
> 
>> And I am afraid we need do a thoroughly check since I've found there
>> are other issues in ocfs2_fill_super(), e.g.
>>
>> ocfs2_fill_super
>>    ocfs2_initialize_super // fails in check max_slots
>>
>> Then osb->osb_mount_event is not properly initialized, and then will
>> crash at wake_up(&osb->osb_mount_event).
>>
> 
> The error handling in ocfs2_fill_super() is too simple. IMO, there needs to add more
> kinds of "goto lable" or replace some "goto" with "return" for directly breaking
> init routine.

Agree, we need handle error cases more gracefully.
And we can't simply call ocfs2_dismount_volume() in error handling
since it involves too much actions.
So do you have interest to take this?

Thanks,
Joseph  

> 
> the ocfs2_initialize_super() looks too long to handle error cases gracefully.
> I can image there could be two type solutions:
> 1> split ocfs2_initialize_super() into some smaller funcs. and ocfs2_fill_super()
>    calls these splited funcs, handles different error cases.
> 2> ocfs2_initialize_super() returns more meaningful error number, then
>    ocfs2_fill_super() could handle them case by case.
>    i.e. add a new enum for ocfs2_initialize_super() return
>    enum {
> 
>       OSB_ALLOC_ERR,
> 
>       MAX_SLOTS_ERR,
> 
>       RECOVERY_ERR,
> 
>       RESMAP_ERR,
> 
>       ...
> 
>    };
> 
> - Heming
> 
>>
>>> On 3/20/22 21:51, Joseph Qi wrote:
>>>> I don't think it deserves a complex fix like this.
>>>> iput() is a common action and we don't want to re-create a private flow
>>>> in ocfs2. Also I think it's not safe.
>>>> I suggest we take the way to make sure ocfs2 journal is there as a
>>>> prerequisite.
>>>
>>> let's image the journal init before any inode operations. and then there will
>>> create a gap between "journal init" and "dlm init". for a rule, any inode operation
>>> should be under dlm lock protecting. if there is a failed before dlm init.
>>> how to handle locking consistency?
>>> the correct code logic should do journal init after dlm lock init. IMO the commit
>>> ccd979bdbce9f ("OCFS2: The Second Oracle Cluster Filesystem") made the locking
>>> mistake on first code version. But author wrote comment to explain the reason.
>>>
>>>      ```
>>>      /* FIXME
>>>       * This should be done in ocfs2_journal_init(), but unknown
>>>       * ordering issues will cause the filesystem to crash.
>>>       * If anyone wants to figure out what part of the code
>>>       * refers to osb->journal before ocfs2_journal_init() is run,
>>>       * be my guest.
>>>       */
>>>      ```
>>>
>>> We don't simply turn back to pick up the problematic code logic. we should
>>> find a new method.
>>>
>>> - Heming
>>>
>>>>
>>>> On 3/18/22 11:04 PM, Heming Zhao wrote:
>>>>> Call trace:
>>>>>
>>>>>     ocfs2: Registered cluster interface user
>>>>>     dlm: no local IP address has been set
>>>>>     dlm: cannot start dlm lowcomms -107
>>>>>     (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>>>>     (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>>>>     (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>>>>     BUG: kernel NULL pointer dereference, address: 0000000000000030
>>>>>     #PF: supervisor read access in kernel mode
>>>>>     #PF: error_code(0x0000) - not-present page
>>>>>     PGD 0 P4D 0
>>>>>     Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>>>     CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>>>>> openSUSE Tumbleweed b40...e84
>>>>>     Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>>>>     RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>>>>     ...
>>>>>     Call Trace:
>>>>>      <TASK>
>>>>>      ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>>>>      evict+0xc0/0x1c0
>>>>>      ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>>>>      ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>>>>      ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>>>>      mount_bdev+0x182/0x1b0
>>>>>      ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>>>>      legacy_get_tree+0x24/0x40
>>>>>      vfs_get_tree+0x22/0xb0
>>>>>      path_mount+0x465/0xac0
>>>>>      __x64_sys_mount+0x103/0x140
>>>>>      do_syscall_64+0x59/0x80
>>>>>      ? syscall_exit_to_user_mode+0x18/0x40
>>>>>      ? do_syscall_64+0x69/0x80
>>>>>      ? syscall_exit_to_user_mode+0x18/0x40
>>>>>      ? do_syscall_64+0x69/0x80
>>>>>      ? exc_page_fault+0x68/0x150
>>>>>      entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>>>
>>>>> How to trigger:
>>>>>
>>>>>     tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>>>>     tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>>>>                --cluster-name=ocfstst
>>>>>
>>>>>                == there is no dlm running ==
>>>>>     tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>>>>                == kernel crash ==
>>>>>
>>>>> Analysis:
>>>>>
>>>>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>>>>> Journal init later than before, it makes NULL pointer access in free
>>>>> routine.
>>>>>
>>>>> ocfs2_fill_super
>>>>>    + ocfs2_mount_volume
>>>>>    |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>>>>    + ...
>>>>>    + ocfs2_dismount_volume
>>>>>       ocfs2_release_system_inodes
>>>>>         ...
>>>>>          evict
>>>>>           ...
>>>>>            ocfs2_clear_inode
>>>>>             ocfs2_checkpoint_inode
>>>>>              ocfs2_ci_fully_checkpointed
>>>>>               time_after(journal->j_trans_id, ci->ci_last_trans)
>>>>>                + journal is empty, crash!
>>>>>
>>>>> How to fix:
>>>>>
>>>>> create a new function _inode_direct_free() to handle journal non-exist
>>>>> case.
>>>>>
>>>>> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
>>>>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>>>>> ---
>>>>> v3: change patch subject
>>>>>       change fix method:
>>>>>       - v1/v2 focused on the points of accessing osb->journal.
>>>>>       - v3 directly free inodes
>>>>> v2: revise commit log
>>>>>       add checking in ocfs2_dismount_volume()
>>>>> ---
>>>>>    fs/ocfs2/localalloc.c | 12 ++++++++
>>>>>    fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>>>>>    2 files changed, 75 insertions(+), 4 deletions(-)
>>>>> ...
>>>>
>>
David Sterba via Ocfs2-devel March 28, 2022, 2:44 a.m. UTC | #8
On 3/28/22 10:08, Joseph Qi wrote:
> 
> 
> On 3/28/22 12:56 AM, heming.zhao@suse.com wrote:
>> On 3/28/22 00:04, Joseph Qi wrote:
>>>
>>>
>>> On 3/27/22 6:37 PM, heming.zhao@suse.com wrote:
>>>> Hello Joseph,
>>>>
>>>> Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
>>>> Do you have another alternative patch?
>>>>
>>> As I've described in previous mail, I don't want to fix it in the way
>>> you proposed.
>>
>> I am OK with your mind, and only hope to fix this problem ASAP.
>>
>>> And I am afraid we need do a thoroughly check since I've found there
>>> are other issues in ocfs2_fill_super(), e.g.
>>>
>>> ocfs2_fill_super
>>>     ocfs2_initialize_super // fails in check max_slots
>>>
>>> Then osb->osb_mount_event is not properly initialized, and then will
>>> crash at wake_up(&osb->osb_mount_event).
>>>
>>
>> The error handling in ocfs2_fill_super() is too simple. IMO, there needs to add more
>> kinds of "goto lable" or replace some "goto" with "return" for directly breaking
>> init routine.
> 
> Agree, we need handle error cases more gracefully.
> And we can't simply call ocfs2_dismount_volume() in error handling
> since it involves too much actions.
> So do you have interest to take this?
> 

NO problem, Let me try.
I will start a new mail thread for the mounting error handling story.

- Heming

> 
>>
>> the ocfs2_initialize_super() looks too long to handle error cases gracefully.
>> I can image there could be two type solutions:
>> 1> split ocfs2_initialize_super() into some smaller funcs. and ocfs2_fill_super()
>>     calls these splited funcs, handles different error cases.
>> 2> ocfs2_initialize_super() returns more meaningful error number, then
>>     ocfs2_fill_super() could handle them case by case.
>>     i.e. add a new enum for ocfs2_initialize_super() return
>>     enum {
>>
>>        OSB_ALLOC_ERR,
>>
>>        MAX_SLOTS_ERR,
>>
>>        RECOVERY_ERR,
>>
>>        RESMAP_ERR,
>>
>>        ...
>>
>>     };
>>
>> - Heming
>>
>>>
>>>> On 3/20/22 21:51, Joseph Qi wrote:
>>>>> I don't think it deserves a complex fix like this.
>>>>> iput() is a common action and we don't want to re-create a private flow
>>>>> in ocfs2. Also I think it's not safe.
>>>>> I suggest we take the way to make sure ocfs2 journal is there as a
>>>>> prerequisite.
>>>>
>>>> let's image the journal init before any inode operations. and then there will
>>>> create a gap between "journal init" and "dlm init". for a rule, any inode operation
>>>> should be under dlm lock protecting. if there is a failed before dlm init.
>>>> how to handle locking consistency?
>>>> the correct code logic should do journal init after dlm lock init. IMO the commit
>>>> ccd979bdbce9f ("OCFS2: The Second Oracle Cluster Filesystem") made the locking
>>>> mistake on first code version. But author wrote comment to explain the reason.
>>>>
>>>>       ```
>>>>       /* FIXME
>>>>        * This should be done in ocfs2_journal_init(), but unknown
>>>>        * ordering issues will cause the filesystem to crash.
>>>>        * If anyone wants to figure out what part of the code
>>>>        * refers to osb->journal before ocfs2_journal_init() is run,
>>>>        * be my guest.
>>>>        */
>>>>       ```
>>>>
>>>> We don't simply turn back to pick up the problematic code logic. we should
>>>> find a new method.
>>>>
>>>> - Heming
>>>>
>>>>>
>>>>> On 3/18/22 11:04 PM, Heming Zhao wrote:
>>>>>> Call trace:
>>>>>>
>>>>>>      ocfs2: Registered cluster interface user
>>>>>>      dlm: no local IP address has been set
>>>>>>      dlm: cannot start dlm lowcomms -107
>>>>>>      (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>>>>>>      (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>>>>>>      (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>>>>>>      BUG: kernel NULL pointer dereference, address: 0000000000000030
>>>>>>      #PF: supervisor read access in kernel mode
>>>>>>      #PF: error_code(0x0000) - not-present page
>>>>>>      PGD 0 P4D 0
>>>>>>      Oops: 0000 [#1] PREEMPT SMP NOPTI
>>>>>>      CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
>>>>>> openSUSE Tumbleweed b40...e84
>>>>>>      Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>>>>>>      RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>>>>>>      ...
>>>>>>      Call Trace:
>>>>>>       <TASK>
>>>>>>       ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>>>>>>       evict+0xc0/0x1c0
>>>>>>       ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>>>>>>       ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>>>>>>       ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>>>>>>       mount_bdev+0x182/0x1b0
>>>>>>       ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>>>>>>       legacy_get_tree+0x24/0x40
>>>>>>       vfs_get_tree+0x22/0xb0
>>>>>>       path_mount+0x465/0xac0
>>>>>>       __x64_sys_mount+0x103/0x140
>>>>>>       do_syscall_64+0x59/0x80
>>>>>>       ? syscall_exit_to_user_mode+0x18/0x40
>>>>>>       ? do_syscall_64+0x69/0x80
>>>>>>       ? syscall_exit_to_user_mode+0x18/0x40
>>>>>>       ? do_syscall_64+0x69/0x80
>>>>>>       ? exc_page_fault+0x68/0x150
>>>>>>       entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>>>>
>>>>>> How to trigger:
>>>>>>
>>>>>>      tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>>>>>>      tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>>>>>>                 --cluster-name=ocfstst
>>>>>>
>>>>>>                 == there is no dlm running ==
>>>>>>      tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>>>>>>                 == kernel crash ==
>>>>>>
>>>>>> Analysis:
>>>>>>
>>>>>> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
>>>>>> Journal init later than before, it makes NULL pointer access in free
>>>>>> routine.
>>>>>>
>>>>>> ocfs2_fill_super
>>>>>>     + ocfs2_mount_volume
>>>>>>     |  ocfs2_dlm_init //failed, osb->journal still NULL.
>>>>>>     + ...
>>>>>>     + ocfs2_dismount_volume
>>>>>>        ocfs2_release_system_inodes
>>>>>>          ...
>>>>>>           evict
>>>>>>            ...
>>>>>>             ocfs2_clear_inode
>>>>>>              ocfs2_checkpoint_inode
>>>>>>               ocfs2_ci_fully_checkpointed
>>>>>>                time_after(journal->j_trans_id, ci->ci_last_trans)
>>>>>>                 + journal is empty, crash!
>>>>>>
>>>>>> How to fix:
>>>>>>
>>>>>> create a new function _inode_direct_free() to handle journal non-exist
>>>>>> case.
>>>>>>
>>>>>> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
>>>>>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>>>>>> ---
>>>>>> v3: change patch subject
>>>>>>        change fix method:
>>>>>>        - v1/v2 focused on the points of accessing osb->journal.
>>>>>>        - v3 directly free inodes
>>>>>> v2: revise commit log
>>>>>>        add checking in ocfs2_dismount_volume()
>>>>>> ---
>>>>>>     fs/ocfs2/localalloc.c | 12 ++++++++
>>>>>>     fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>>>>>>     2 files changed, 75 insertions(+), 4 deletions(-)
>>>>>> ...
>>>>>
>>>
>
Joseph Qi March 28, 2022, 3:18 a.m. UTC | #9
On 3/28/22 10:44 AM, heming.zhao--- via Ocfs2-devel wrote:
> On 3/28/22 10:08, Joseph Qi wrote:
>>
>>
>> On 3/28/22 12:56 AM, heming.zhao@suse.com wrote:
>>> On 3/28/22 00:04, Joseph Qi wrote:
>>>>
>>>>
>>>> On 3/27/22 6:37 PM, heming.zhao@suse.com wrote:
>>>>> Hello Joseph,
>>>>>
>>>>> Could we speed up this crash fixing? I could trigger this kind of crash at least two ways.
>>>>> Do you have another alternative patch?
>>>>>
>>>> As I've described in previous mail, I don't want to fix it in the way
>>>> you proposed.
>>>
>>> I am OK with your mind, and only hope to fix this problem ASAP.
>>>
>>>> And I am afraid we need do a thoroughly check since I've found there
>>>> are other issues in ocfs2_fill_super(), e.g.
>>>>
>>>> ocfs2_fill_super
>>>>     ocfs2_initialize_super // fails in check max_slots
>>>>
>>>> Then osb->osb_mount_event is not properly initialized, and then will
>>>> crash at wake_up(&osb->osb_mount_event).
>>>>
>>>
>>> The error handling in ocfs2_fill_super() is too simple. IMO, there needs to add more
>>> kinds of "goto lable" or replace some "goto" with "return" for directly breaking
>>> init routine.
>>
>> Agree, we need handle error cases more gracefully.
>> And we can't simply call ocfs2_dismount_volume() in error handling
>> since it involves too much actions.
>> So do you have interest to take this?
>>
> 
> NO problem, Let me try.
> I will start a new mail thread for the mounting error handling story.
> 
Glad to here that. You may split these into a series.
And I will try my best to review ASAP.

Thanks,
Joseph
diff mbox series

Patch

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f6bacbeef6b..c7694c2a1000 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -412,6 +412,18 @@  void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 		goto out_mutex;
 	}
 
+	if (!osb->journal) {
+		/* Does here make sense to handle bh? */
+		if (osb->local_alloc_bh)
+			brelse(osb->local_alloc_bh);
+		osb->local_alloc_bh = NULL;
+		osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+		/*  After out_unlock calling iput() is safe, in theory system inode
+		 * still be hold in dismount flow */
+		goto out_unlock;
+	}
+
 	/* WINDOW_MOVE_CREDITS is a bit heavy... */
 	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
 	if (IS_ERR(handle)) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8bde30fa5387..65aedfdcb2f9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -504,28 +504,79 @@  static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	return status;
 }
 
+/*
+ * This function does the directly release action, which makes sure
+ * there is no memory & lockres leak for inode.
+ *
+ * How it works:
+ *
+ * refer iput() flow, which will call following key functions:
+ *   ocfs2_drop_inode, ocfs2_evict_inode, ocfs2_clear_inode, ocfs2_free_inode.
+ *
+ * - ocfs2_drop_inode() & ocfs2_clear_inode() are useless when directly remove.
+ * - other ocfs2_xx_inode() should be mentioned in this function.
+ */
+static void _inode_direct_free(struct ocfs2_super *osb, struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* 1> copy from ocfs2_evict_inode() */
+	truncate_inode_pages_final(&inode->i_data);
+
+	ocfs2_open_unlock(inode);
+
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_inode_lockres);
+	ocfs2_lock_res_free(&oi->ip_open_lockres);
+	ocfs2_metadata_cache_exit(INODE_CACHE(inode));
+
+	oi->ip_flags = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+	/* 2> free memory */
+	ocfs2_free_inode(inode);
+}
+
+static void _inode_iput_free(struct ocfs2_super *osb, struct inode *inode)
+{
+	iput(inode);
+}
+
 static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 {
 	int i;
 	struct inode *inode;
+	void (*_free_inode)(struct ocfs2_super *osb, struct inode *inode);
+
+	_free_inode = osb->journal ? _inode_iput_free : _inode_direct_free;
 
 	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
 		inode = osb->global_system_inodes[i];
 		if (inode) {
-			iput(inode);
+			_free_inode(osb, inode);
 			osb->global_system_inodes[i] = NULL;
 		}
 	}
 
 	inode = osb->sys_root_inode;
 	if (inode) {
-		iput(inode);
+		_free_inode(osb, inode);
 		osb->sys_root_inode = NULL;
 	}
 
 	inode = osb->root_inode;
 	if (inode) {
-		iput(inode);
+		_free_inode(osb, inode);
 		osb->root_inode = NULL;
 	}
 
@@ -534,7 +585,7 @@  static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
 	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
 		if (osb->local_system_inodes[i]) {
-			iput(osb->local_system_inodes[i]);
+			_free_inode(osb, osb->local_system_inodes[i]);
 			osb->local_system_inodes[i] = NULL;
 		}
 	}
@@ -1870,6 +1921,13 @@  static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	if (!osb->journal) {
+		ocfs2_shutdown_local_alloc(osb);
+		ocfs2_recovery_exit(osb);
+		ocfs2_purge_refcount_trees(osb);
+		goto put_slot;
+	}
+
 	/* Remove file check sysfs related directores/files,
 	 * and wait for the pending file check operations */
 	ocfs2_filecheck_remove_sysfs(osb);
@@ -1897,6 +1955,7 @@  static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_purge_refcount_trees(osb);
 
+put_slot:
 	/* No cluster connection means we've failed during mount, so skip
 	 * all the steps which depended on that to complete. */
 	if (osb->cconn) {