@@ -412,6 +412,18 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
goto out_mutex;
}
+ if (!osb->journal) {
+ /* Does here make sense to handle bh? */
+ if (osb->local_alloc_bh)
+ brelse(osb->local_alloc_bh);
+ osb->local_alloc_bh = NULL;
+ osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+ /* After out_unlock calling iput() is safe, in theory system inode
+ * still be hold in dismount flow */
+ goto out_unlock;
+ }
+
/* WINDOW_MOVE_CREDITS is a bit heavy... */
handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
@@ -504,28 +504,79 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
return status;
}
+/*
+ * This function does the directly release action, which makes sure
+ * there is no memory & lockres leak for inode.
+ *
+ * How it works:
+ *
+ * refer iput() flow, which will call following key functions:
+ * ocfs2_drop_inode, ocfs2_evict_inode, ocfs2_clear_inode, ocfs2_free_inode.
+ *
+ * - ocfs2_drop_inode() & ocfs2_clear_inode() are useless when directly remove.
+ * - other ocfs2_xx_inode() should be mentioned in this function.
+ */
+static void _inode_direct_free(struct ocfs2_super *osb, struct inode *inode)
+{
+ int status;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ /* 1> copy from ocfs2_evict_inode() */
+ truncate_inode_pages_final(&inode->i_data);
+
+ ocfs2_open_unlock(inode);
+
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+ status = ocfs2_drop_inode_locks(inode);
+ if (status < 0)
+ mlog_errno(status);
+
+ ocfs2_lock_res_free(&oi->ip_rw_lockres);
+ ocfs2_lock_res_free(&oi->ip_inode_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
+ ocfs2_metadata_cache_exit(INODE_CACHE(inode));
+
+ oi->ip_flags = 0;
+ oi->ip_dir_start_lookup = 0;
+ oi->ip_blkno = 0ULL;
+
+ /* 2> free memory */
+ ocfs2_free_inode(inode);
+}
+
+static void _inode_iput_free(struct ocfs2_super *osb, struct inode *inode)
+{
+ iput(inode);
+}
+
static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
{
int i;
struct inode *inode;
+ void (*_free_inode)(struct ocfs2_super *osb, struct inode *inode);
+
+ _free_inode = osb->journal ? _inode_iput_free : _inode_direct_free;
for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
inode = osb->global_system_inodes[i];
if (inode) {
- iput(inode);
+ _free_inode(osb, inode);
osb->global_system_inodes[i] = NULL;
}
}
inode = osb->sys_root_inode;
if (inode) {
- iput(inode);
+ _free_inode(osb, inode);
osb->sys_root_inode = NULL;
}
inode = osb->root_inode;
if (inode) {
- iput(inode);
+ _free_inode(osb, inode);
osb->root_inode = NULL;
}
@@ -534,7 +585,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
if (osb->local_system_inodes[i]) {
- iput(osb->local_system_inodes[i]);
+ _free_inode(osb, osb->local_system_inodes[i]);
osb->local_system_inodes[i] = NULL;
}
}
@@ -1870,6 +1921,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ if (!osb->journal) {
+ ocfs2_shutdown_local_alloc(osb);
+ ocfs2_recovery_exit(osb);
+ ocfs2_purge_refcount_trees(osb);
+ goto put_slot;
+ }
+
/* Remove file check sysfs related directores/files,
* and wait for the pending file check operations */
ocfs2_filecheck_remove_sysfs(osb);
@@ -1897,6 +1955,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_purge_refcount_trees(osb);
+put_slot:
/* No cluster connection means we've failed during mount, so skip
* all the steps which depended on that to complete. */
if (osb->cconn) {
Call trace: ocfs2: Registered cluster interface user dlm: no local IP address has been set dlm: cannot start dlm lowcomms -107 (mount.ocfs2,2225,0):ocfs2_dlm_init:3355 ERROR: status = -107 (mount.ocfs2,2225,0):ocfs2_mount_volume:1817 ERROR: status = -107 (mount.ocfs2,2225,0):ocfs2_fill_super:1186 ERROR: status = -107 BUG: kernel NULL pointer dereference, address: 0000000000000030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 2225 Comm: mount.ocfs2 Not tainted 5.16.2-1-default #1 openSUSE Tumbleweed b40...e84 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ... RIP: 0010:ocfs2_clear_inode+0x2e9/0x720 [ocfs2] ... Call Trace: <TASK> ? ocfs2_evict_inode+0x1fe/0x630 [ocfs2 411bc..281] evict+0xc0/0x1c0 ocfs2_release_system_inodes+0x21/0xc0 [ocfs2 411bc..281] ocfs2_dismount_volume+0x10b/0x2d0 [ocfs2 411bc..281] ocfs2_fill_super+0xaf/0x19e0 [ocfs2 411bc..281] mount_bdev+0x182/0x1b0 ? ocfs2_initialize_super.isra.0+0xf50/0xf50 [ocfs2 411bc..281] legacy_get_tree+0x24/0x40 vfs_get_tree+0x22/0xb0 path_mount+0x465/0xac0 __x64_sys_mount+0x103/0x140 do_syscall_64+0x59/0x80 ? syscall_exit_to_user_mode+0x18/0x40 ? do_syscall_64+0x69/0x80 ? syscall_exit_to_user_mode+0x18/0x40 ? do_syscall_64+0x69/0x80 ? exc_page_fault+0x68/0x150 entry_SYSCALL_64_after_hwframe+0x44/0xae How to trigger: tb-ocfs1 # dd if=/dev/zero of=/dev/vdb bs=1M count=20 oflag=direct tb-ocfs1 # mkfs.ocfs2 --cluster-stack=pcmk -N 4 /dev/vdb \ --cluster-name=ocfstst == there is no dlm running == tb-ocfs1 # mount -t ocfs2 /dev/vdb /mnt == kernel crash == Analysis: After commit da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown"), Journal init later than before, it makes NULL pointer access in free routine. ocfs2_fill_super + ocfs2_mount_volume | ocfs2_dlm_init //failed, osb->journal still NULL. + ... + ocfs2_dismount_volume ocfs2_release_system_inodes ... evict ... ocfs2_clear_inode ocfs2_checkpoint_inode ocfs2_ci_fully_checkpointed time_after(journal->j_trans_id, ci->ci_last_trans) + journal is empty, crash! How to fix: create a new function _inode_direct_free() to handle journal non-exist case. Fixes: da5e7c87827e8 ("ocfs2: cleanup journal init and shutdown") Signed-off-by: Heming Zhao <heming.zhao@suse.com> --- fs/ocfs2/localalloc.c | 12 ++++++++ fs/ocfs2/super.c | 67 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 4 deletions(-)