diff mbox series

[v3] ocfs2: fix kernel crash after mounting when journal doesn't ready

Message ID 20220318134845.1749-1-heming.zhao@suse.com (mailing list archive)
State New, archived
Headers show
Series [v3] ocfs2: fix kernel crash after mounting when journal doesn't ready | expand

Commit Message

heming.zhao@suse.com March 18, 2022, 1:48 p.m. UTC
Call trace:

  ocfs2: Registered cluster interface user
  dlm: no local IP address has been set
  dlm: cannot start dlm lowcomms -107
  (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
  (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
  (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
  BUG: kernel NULL pointer dereference, address: 0000000000000030
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: 0000 [#1] PREEMPT SMP NOPTI
  CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
openSUSE Tumbleweed b40...e84
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
  RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
  ...
  Call Trace:
   <TASK>
   ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
   evict+0xc0/0x1c0
   ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
   ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
   ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
   mount_bdev+0x182/0x1b0
   ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
   legacy_get_tree+0x24/0x40
   vfs_get_tree+0x22/0xb0
   path_mount+0x465/0xac0
   __x64_sys_mount+0x103/0x140
   do_syscall_64+0x59/0x80
   ? syscall_exit_to_user_mode+0x18/0x40
   ? do_syscall_64+0x69/0x80
   ? syscall_exit_to_user_mode+0x18/0x40
   ? do_syscall_64+0x69/0x80
   ? exc_page_fault+0x68/0x150
   entry_SYSCALL_64_after_hwframe+0x44/0xae

How to trigger:

  tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
  tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
             --cluster-name=ocfstst

             == there is no dlm running ==
  tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
             == kernel crash ==

Analysis:

After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
Journal init later than before, it makes NULL pointer access in free
routine.

ocfs2_fill_super
 + ocfs2_mount_volume
 |  ocfs2_dlm_init //failed, osb->journal still NULL.
 + ...
 + ocfs2_dismount_volume
    ocfs2_release_system_inodes
      ...
       evict
        ...
         ocfs2_clear_inode
          ocfs2_checkpoint_inode
           ocfs2_ci_fully_checkpointed
            time_after(journal->j_trans_id, ci->ci_last_trans)
             + journal is empty, crash!

How to fix:

create a new function _inode_direct_free() to handle journal non-exist
case.

Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
Signed-off-by: Heming Zhao <heming.zhao@suse.com>
---
 fs/ocfs2/localalloc.c | 12 ++++++++
 fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 4 deletions(-)

Comments

David Sterba via Ocfs2-devel March 18, 2022, 2:39 p.m. UTC | #1
Sorry, I forgot to add changelog in patch v3, will resend it.
Please review with resent v3 patch.

Thanks

On 3/18/22 21:48, Heming Zhao wrote:
> Call trace:
> 
>    ocfs2: Registered cluster interface user
>    dlm: no local IP address has been set
>    dlm: cannot start dlm lowcomms -107
>    (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107
>    (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107
>    (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107
>    BUG: kernel NULL pointer dereference, address: 0000000000000030
>    #PF: supervisor read access in kernel mode
>    #PF: error_code(0x0000) - not-present page
>    PGD 0 P4D 0
>    Oops: 0000 [#1] PREEMPT SMP NOPTI
>    CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1
> openSUSE Tumbleweed b40...e84
>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ...
>    RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2]
>    ...
>    Call Trace:
>     <TASK>
>     ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281]
>     evict+0xc0/0x1c0
>     ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281]
>     ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281]
>     ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281]
>     mount_bdev+0x182/0x1b0
>     ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281]
>     legacy_get_tree+0x24/0x40
>     vfs_get_tree+0x22/0xb0
>     path_mount+0x465/0xac0
>     __x64_sys_mount+0x103/0x140
>     do_syscall_64+0x59/0x80
>     ? syscall_exit_to_user_mode+0x18/0x40
>     ? do_syscall_64+0x69/0x80
>     ? syscall_exit_to_user_mode+0x18/0x40
>     ? do_syscall_64+0x69/0x80
>     ? exc_page_fault+0x68/0x150
>     entry_SYSCALL_64_after_hwframe+0x44/0xae
> 
> How to trigger:
> 
>    tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct
>    tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \
>               --cluster-name=ocfstst
> 
>               == there is no dlm running ==
>    tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt
>               == kernel crash ==
> 
> Analysis:
> 
> After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"),
> Journal init later than before, it makes NULL pointer access in free
> routine.
> 
> ocfs2_fill_super
>   + ocfs2_mount_volume
>   |  ocfs2_dlm_init //failed, osb->journal still NULL.
>   + ...
>   + ocfs2_dismount_volume
>      ocfs2_release_system_inodes
>        ...
>         evict
>          ...
>           ocfs2_clear_inode
>            ocfs2_checkpoint_inode
>             ocfs2_ci_fully_checkpointed
>              time_after(journal->j_trans_id, ci->ci_last_trans)
>               + journal is empty, crash!
> 
> How to fix:
> 
> create a new function _inode_direct_free() to handle journal non-exist
> case.
> 
> Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown")
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> ---
>   fs/ocfs2/localalloc.c | 12 ++++++++
>   fs/ocfs2/super.c      | 67 ++++++++++++++++++++++++++++++++++++++++---
>   2 files changed, 75 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
> index 5f6bacbeef6b..c7694c2a1000 100644
> ...
>   
> +put_slot:
>   	/* No cluster connection means we've failed during mount, so skip
>   	 * all the steps which depended on that to complete. */
>   	if (osb->cconn) {
diff mbox series

Patch

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f6bacbeef6b..c7694c2a1000 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -412,6 +412,18 @@  void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 		goto out_mutex;
 	}
 
+	if (!osb->journal) {
+		/* Does here make sense to handle bh? */
+		if (osb->local_alloc_bh)
+			brelse(osb->local_alloc_bh);
+		osb->local_alloc_bh = NULL;
+		osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+		/*  After out_unlock calling iput() is safe, in theory system inode
+		 * still be hold in dismount flow */
+		goto out_unlock;
+	}
+
 	/* WINDOW_MOVE_CREDITS is a bit heavy... */
 	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
 	if (IS_ERR(handle)) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8bde30fa5387..65aedfdcb2f9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -504,28 +504,79 @@  static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	return status;
 }
 
+/*
+ * This function does the directly release action, which makes sure
+ * there is no memory & lockres leak for inode.
+ *
+ * How it works:
+ *
+ * refer iput() flow, which will call following key functions:
+ *   ocfs2_drop_inode, ocfs2_evict_inode, ocfs2_clear_inode, ocfs2_free_inode.
+ *
+ * - ocfs2_drop_inode() & ocfs2_clear_inode() are useless when directly remove.
+ * - other ocfs2_xx_inode() should be mentioned in this function.
+ */
+static void _inode_direct_free(struct ocfs2_super *osb, struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* 1> copy from ocfs2_evict_inode() */
+	truncate_inode_pages_final(&inode->i_data);
+
+	ocfs2_open_unlock(inode);
+
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_inode_lockres);
+	ocfs2_lock_res_free(&oi->ip_open_lockres);
+	ocfs2_metadata_cache_exit(INODE_CACHE(inode));
+
+	oi->ip_flags = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+	/* 2> free memory */
+	ocfs2_free_inode(inode);
+}
+
+static void _inode_iput_free(struct ocfs2_super *osb, struct inode *inode)
+{
+	iput(inode);
+}
+
 static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 {
 	int i;
 	struct inode *inode;
+	void (*_free_inode)(struct ocfs2_super *osb, struct inode *inode);
+
+	_free_inode = osb->journal ? _inode_iput_free : _inode_direct_free;
 
 	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
 		inode = osb->global_system_inodes[i];
 		if (inode) {
-			iput(inode);
+			_free_inode(osb, inode);
 			osb->global_system_inodes[i] = NULL;
 		}
 	}
 
 	inode = osb->sys_root_inode;
 	if (inode) {
-		iput(inode);
+		_free_inode(osb, inode);
 		osb->sys_root_inode = NULL;
 	}
 
 	inode = osb->root_inode;
 	if (inode) {
-		iput(inode);
+		_free_inode(osb, inode);
 		osb->root_inode = NULL;
 	}
 
@@ -534,7 +585,7 @@  static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
 	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
 		if (osb->local_system_inodes[i]) {
-			iput(osb->local_system_inodes[i]);
+			_free_inode(osb, osb->local_system_inodes[i]);
 			osb->local_system_inodes[i] = NULL;
 		}
 	}
@@ -1870,6 +1921,13 @@  static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	if (!osb->journal) {
+		ocfs2_shutdown_local_alloc(osb);
+		ocfs2_recovery_exit(osb);
+		ocfs2_purge_refcount_trees(osb);
+		goto put_slot;
+	}
+
 	/* Remove file check sysfs related directores/files,
 	 * and wait for the pending file check operations */
 	ocfs2_filecheck_remove_sysfs(osb);
@@ -1897,6 +1955,7 @@  static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_purge_refcount_trees(osb);
 
+put_slot:
 	/* No cluster connection means we've failed during mount, so skip
 	 * all the steps which depended on that to complete. */
 	if (osb->cconn) {