ocfs2: fix the application IO timeout when fstrim is running
diff mbox series

Message ID 20190111090014.31645-1-ghe@suse.com
State New
Headers show
Series
  • ocfs2: fix the application IO timeout when fstrim is running
Related show

Commit Message

Gang He Jan. 11, 2019, 9 a.m. UTC
The user reported this problem, the upper application IO was
timeout when fstrim was running on this ocfs2 partition. the
application monitoring resource agent considered that this
application did not work, then this node was fenced by the cluster
brain (e.g. pacemaker).
The root cause is that fstrim thread always holds main_bm meta-file
related locks until all the cluster groups are trimmed.
This patch will make fstrim thread release main_bm meta-file
related locks when each cluster group is trimmed, this will let
the current application IO has a chance to claim the clusters from
main_bm meta-file.

Signed-off-by: Gang He <ghe@suse.com>
---
 fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
 fs/ocfs2/dlmglue.c     |   5 ++
 fs/ocfs2/ocfs2.h       |   1 +
 fs/ocfs2/ocfs2_trace.h |   2 +
 fs/ocfs2/super.c       |   2 +
 5 files changed, 106 insertions(+), 63 deletions(-)

Comments

Changwei Ge Jan. 11, 2019, 9:54 a.m. UTC | #1
Hi Gang,

On 2019/1/11 17:01, Gang He wrote:
> The user reported this problem, the upper application IO was
> timeout when fstrim was running on this ocfs2 partition. the
> application monitoring resource agent considered that this
> application did not work, then this node was fenced by the cluster
> brain (e.g. pacemaker).
> The root cause is that fstrim thread always holds main_bm meta-file
> related locks until all the cluster groups are trimmed.
> This patch will make fstrim thread release main_bm meta-file
> related locks when each cluster group is trimmed, this will let
> the current application IO has a chance to claim the clusters from
> main_bm meta-file.

Sounds a good idea, I will look into this.

Thanks,
Changwei

> 
> Signed-off-by: Gang He <ghe@suse.com>
> ---
>   fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>   fs/ocfs2/dlmglue.c     |   5 ++
>   fs/ocfs2/ocfs2.h       |   1 +
>   fs/ocfs2/ocfs2_trace.h |   2 +
>   fs/ocfs2/super.c       |   2 +
>   5 files changed, 106 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index d1cbb27808e2..6f0999015a44 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>   	return count;
>   }
>   
> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +static
> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>   {
>   	struct ocfs2_super *osb = OCFS2_SB(sb);
> -	u64 start, len, trimmed, first_group, last_group, group;
> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>   	int ret, cnt;
>   	u32 first_bit, last_bit, minlen;
>   	struct buffer_head *main_bm_bh = NULL;
> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	struct buffer_head *gd_bh = NULL;
>   	struct ocfs2_dinode *main_bm;
>   	struct ocfs2_group_desc *gd = NULL;
> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>   
>   	start = range->start >> osb->s_clustersize_bits;
>   	len = range->len >> osb->s_clustersize_bits;
> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>   		return -EINVAL;
>   
> +	trace_ocfs2_trim_mainbm(start, len, minlen);
> +
> +next_group:
>   	main_bm_inode = ocfs2_get_system_file_inode(osb,
>   						    GLOBAL_BITMAP_SYSTEM_INODE,
>   						    OCFS2_INVALID_SLOT);
> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	}
>   	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>   
> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
> -		ret = -EINVAL;
> -		goto out_unlock;
> -	}
> -
> -	len = range->len >> osb->s_clustersize_bits;
> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
> -		len = le32_to_cpu(main_bm->i_clusters) - start;
> -
> -	trace_ocfs2_trim_fs(start, len, minlen);
> -
> -	ocfs2_trim_fs_lock_res_init(osb);
> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> -	if (ret < 0) {
> -		if (ret != -EAGAIN) {
> -			mlog_errno(ret);
> -			ocfs2_trim_fs_lock_res_uninit(osb);
> +	/*
> +	 * Do some check before trim the first group.
> +	 */
> +	if (!group) {
> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
> +			ret = -EINVAL;
>   			goto out_unlock;
>   		}
>   
> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> -		     "finish, which is running from another node.\n",
> -		     osb->dev_str);
> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> -		if (ret < 0) {
> -			mlog_errno(ret);
> -			ocfs2_trim_fs_lock_res_uninit(osb);
> -			goto out_unlock;
> -		}
> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>   
> -		if (info.tf_valid && info.tf_success &&
> -		    info.tf_start == start && info.tf_len == len &&
> -		    info.tf_minlen == minlen) {
> -			/* Avoid sending duplicated trim to a shared device */
> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
> -			     "just done from node (%u), return.\n",
> -			     osb->dev_str, info.tf_nodenum);
> -			range->len = info.tf_trimlen;
> -			goto out_trimunlock;
> -		}
> +		/*
> +		 * Determine first and last group to examine based on
> +		 * start and len
> +		 */
> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> +		if (first_group == osb->first_cluster_group_blkno)
> +			first_bit = start;
> +		else
> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
> +								first_group);
> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
> +						       start + len - 1);
> +		group = first_group;
>   	}
>   
> -	info.tf_nodenum = osb->node_num;
> -	info.tf_start = start;
> -	info.tf_len = len;
> -	info.tf_minlen = minlen;
> -
> -	/* Determine first and last group to examine based on start and len */
> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> -	if (first_group == osb->first_cluster_group_blkno)
> -		first_bit = start;
> -	else
> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> -	last_bit = osb->bitmap_cpg;
> -
> -	trimmed = 0;
> -	for (group = first_group; group <= last_group;) {
> +	do {
>   		if (first_bit + len >= osb->bitmap_cpg)
>   			last_bit = osb->bitmap_cpg;
>   		else
> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   		else
>   			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> -	}
> -	range->len = trimmed * sb->s_blocksize;
> +	} while (0);
>   
> -	info.tf_trimlen = range->len;
> -	info.tf_success = (ret ? 0 : 1);
> -	pinfo = &info;
> -out_trimunlock:
> -	ocfs2_trim_fs_unlock(osb, pinfo);
> -	ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   	ocfs2_inode_unlock(main_bm_inode, 0);
>   	brelse(main_bm_bh);
> +	main_bm_bh = NULL;
>   out_mutex:
>   	inode_unlock(main_bm_inode);
>   	iput(main_bm_inode);
> +
> +	/*
> +	 * If all the groups trim are not done or failed, but we should release
> +	 * main_bm related locks for avoiding the current IO starve, then go to
> +	 * trim the next group
> +	 */
> +	if (ret >= 0 && group <= last_group)
> +		goto next_group;
>   out:
> +	range->len = trimmed * sb->s_blocksize;
> +	return ret;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +	int ret;
> +	struct ocfs2_super *osb = OCFS2_SB(sb);
> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
> +
> +	ocfs2_trim_fs_lock_res_init(osb);
> +
> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
> +
> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> +	if (ret < 0) {
> +		if (ret != -EAGAIN) {
> +			mlog_errno(ret);
> +			ocfs2_trim_fs_lock_res_uninit(osb);
> +			return ret;
> +		}
> +
> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +		     "finish, which is running from another node.\n",
> +		     osb->dev_str);
> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> +		if (ret < 0) {
> +			mlog_errno(ret);
> +			ocfs2_trim_fs_lock_res_uninit(osb);
> +			return ret;
> +		}
> +
> +		if (info.tf_valid && info.tf_success &&
> +		    info.tf_start == range->start &&
> +		    info.tf_len == range->len &&
> +		    info.tf_minlen == range->minlen) {
> +			/* Avoid sending duplicated trim to a shared device */
> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
> +			     "just done from node (%u), return.\n",
> +			     osb->dev_str, info.tf_nodenum);
> +			range->len = info.tf_trimlen;
> +			goto out;
> +		}
> +	}
> +
> +	info.tf_nodenum = osb->node_num;
> +	info.tf_start = range->start;
> +	info.tf_len = range->len;
> +	info.tf_minlen = range->minlen;
> +
> +	ret = ocfs2_trim_mainbm(sb, range);
> +
> +	info.tf_trimlen = range->len;
> +	info.tf_success = (ret < 0 ? 0 : 1);
> +	pinfo = &info;
> +out:
> +	ocfs2_trim_fs_unlock(osb, pinfo);
> +	ocfs2_trim_fs_lock_res_uninit(osb);
>   	return ret;
>   }
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index 7c835824247e..af405586c5b1 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>   {
>   	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>   
> +	/* Only one trimfs thread are allowed to work at the same time. */
> +	mutex_lock(&osb->obs_trim_fs_mutex);
> +
>   	ocfs2_lock_res_init_once(lockres);
>   	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>   	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
>   
>   	ocfs2_simple_drop_lockres(osb, lockres);
>   	ocfs2_lock_res_free(lockres);
> +
> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>   }
>   
>   static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 4f86ac0027b5..1f029fbe8b8d 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -407,6 +407,7 @@ struct ocfs2_super
>   	struct ocfs2_lock_res osb_rename_lockres;
>   	struct ocfs2_lock_res osb_nfs_sync_lockres;
>   	struct ocfs2_lock_res osb_trim_fs_lockres;
> +	struct mutex obs_trim_fs_mutex;
>   	struct ocfs2_dlm_debug *osb_dlm_debug;
>   
>   	struct dentry *osb_debug_root;
> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
> index 2ee76a90ba8f..dc4bce1649c1 100644
> --- a/fs/ocfs2/ocfs2_trace.h
> +++ b/fs/ocfs2/ocfs2_trace.h
> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>   
>   DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>   
> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
> +
>   DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>   
>   /* End of trace events for fs/ocfs2/alloc.c. */
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 3415e0b09398..96ae7cedd487 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>   	if (ocfs2_is_hard_readonly(osb))
>   		goto leave;
>   
> +	mutex_init(&osb->obs_trim_fs_mutex);
> +
>   	status = ocfs2_dlm_init(osb);
>   	if (status < 0) {
>   		mlog_errno(status);
>
Changwei Ge Jan. 15, 2019, 3:50 a.m. UTC | #2
Hi Gang,

Most parts of this patch look sane to me, just a tiny question...

On 2019/1/11 17:01, Gang He wrote:
> The user reported this problem, the upper application IO was
> timeout when fstrim was running on this ocfs2 partition. the
> application monitoring resource agent considered that this
> application did not work, then this node was fenced by the cluster
> brain (e.g. pacemaker).
> The root cause is that fstrim thread always holds main_bm meta-file
> related locks until all the cluster groups are trimmed.
> This patch will make fstrim thread release main_bm meta-file
> related locks when each cluster group is trimmed, this will let
> the current application IO has a chance to claim the clusters from
> main_bm meta-file.
> 
> Signed-off-by: Gang He <ghe@suse.com>
> ---
>   fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>   fs/ocfs2/dlmglue.c     |   5 ++
>   fs/ocfs2/ocfs2.h       |   1 +
>   fs/ocfs2/ocfs2_trace.h |   2 +
>   fs/ocfs2/super.c       |   2 +
>   5 files changed, 106 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index d1cbb27808e2..6f0999015a44 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>   	return count;
>   }
>   
> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +static
> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>   {
>   	struct ocfs2_super *osb = OCFS2_SB(sb);
> -	u64 start, len, trimmed, first_group, last_group, group;
> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>   	int ret, cnt;
>   	u32 first_bit, last_bit, minlen;
>   	struct buffer_head *main_bm_bh = NULL;
> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	struct buffer_head *gd_bh = NULL;
>   	struct ocfs2_dinode *main_bm;
>   	struct ocfs2_group_desc *gd = NULL;
> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>   
>   	start = range->start >> osb->s_clustersize_bits;
>   	len = range->len >> osb->s_clustersize_bits;
> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>   		return -EINVAL;
>   
> +	trace_ocfs2_trim_mainbm(start, len, minlen);
> +
> +next_group:
>   	main_bm_inode = ocfs2_get_system_file_inode(osb,
>   						    GLOBAL_BITMAP_SYSTEM_INODE,
>   						    OCFS2_INVALID_SLOT);
> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   	}
>   	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>   
> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
> -		ret = -EINVAL;
> -		goto out_unlock;
> -	}
> -
> -	len = range->len >> osb->s_clustersize_bits;
> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
> -		len = le32_to_cpu(main_bm->i_clusters) - start;
> -
> -	trace_ocfs2_trim_fs(start, len, minlen);
> -
> -	ocfs2_trim_fs_lock_res_init(osb);
> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> -	if (ret < 0) {
> -		if (ret != -EAGAIN) {
> -			mlog_errno(ret);
> -			ocfs2_trim_fs_lock_res_uninit(osb);
> +	/*
> +	 * Do some check before trim the first group.
> +	 */
> +	if (!group) {
> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
> +			ret = -EINVAL;
>   			goto out_unlock;
>   		}
>   
> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> -		     "finish, which is running from another node.\n",
> -		     osb->dev_str);
> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> -		if (ret < 0) {
> -			mlog_errno(ret);
> -			ocfs2_trim_fs_lock_res_uninit(osb);
> -			goto out_unlock;
> -		}
> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>   
> -		if (info.tf_valid && info.tf_success &&
> -		    info.tf_start == start && info.tf_len == len &&
> -		    info.tf_minlen == minlen) {
> -			/* Avoid sending duplicated trim to a shared device */
> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
> -			     "just done from node (%u), return.\n",
> -			     osb->dev_str, info.tf_nodenum);
> -			range->len = info.tf_trimlen;
> -			goto out_trimunlock;
> -		}
> +		/*
> +		 * Determine first and last group to examine based on
> +		 * start and len
> +		 */
> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> +		if (first_group == osb->first_cluster_group_blkno)
> +			first_bit = start;
> +		else
> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
> +								first_group);
> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
> +						       start + len - 1);
> +		group = first_group;
>   	}
>   
> -	info.tf_nodenum = osb->node_num;
> -	info.tf_start = start;
> -	info.tf_len = len;
> -	info.tf_minlen = minlen;
> -
> -	/* Determine first and last group to examine based on start and len */
> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> -	if (first_group == osb->first_cluster_group_blkno)
> -		first_bit = start;
> -	else
> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> -	last_bit = osb->bitmap_cpg;
> -
> -	trimmed = 0;
> -	for (group = first_group; group <= last_group;) {
> +	do {
>   		if (first_bit + len >= osb->bitmap_cpg)
>   			last_bit = osb->bitmap_cpg;
>   		else
> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>   			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   		else
>   			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> -	}
> -	range->len = trimmed * sb->s_blocksize;
> +	} while (0);
>   
> -	info.tf_trimlen = range->len;
> -	info.tf_success = (ret ? 0 : 1);
> -	pinfo = &info;
> -out_trimunlock:
> -	ocfs2_trim_fs_unlock(osb, pinfo);
> -	ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   	ocfs2_inode_unlock(main_bm_inode, 0);
>   	brelse(main_bm_bh);
> +	main_bm_bh = NULL;
>   out_mutex:
>   	inode_unlock(main_bm_inode);
>   	iput(main_bm_inode);
> +
> +	/*
> +	 * If all the groups trim are not done or failed, but we should release
> +	 * main_bm related locks for avoiding the current IO starve, then go to
> +	 * trim the next group
> +	 */
> +	if (ret >= 0 && group <= last_group)
> +		goto next_group;
>   out:
> +	range->len = trimmed * sb->s_blocksize;
> +	return ret;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +	int ret;
> +	struct ocfs2_super *osb = OCFS2_SB(sb);
> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
> +
> +	ocfs2_trim_fs_lock_res_init(osb);
> +
> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
> +
> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> +	if (ret < 0) {
> +		if (ret != -EAGAIN) {
> +			mlog_errno(ret);
> +			ocfs2_trim_fs_lock_res_uninit(osb);
> +			return ret;
> +		}
> +
> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +		     "finish, which is running from another node.\n",
> +		     osb->dev_str);
> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> +		if (ret < 0) {
> +			mlog_errno(ret);
> +			ocfs2_trim_fs_lock_res_uninit(osb);
> +			return ret;
> +		}
> +
> +		if (info.tf_valid && info.tf_success &&
> +		    info.tf_start == range->start &&
> +		    info.tf_len == range->len &&
> +		    info.tf_minlen == range->minlen) {
> +			/* Avoid sending duplicated trim to a shared device */
> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
> +			     "just done from node (%u), return.\n",
> +			     osb->dev_str, info.tf_nodenum);
> +			range->len = info.tf_trimlen;
> +			goto out;
> +		}
> +	}
> +
> +	info.tf_nodenum = osb->node_num;
> +	info.tf_start = range->start;
> +	info.tf_len = range->len;
> +	info.tf_minlen = range->minlen;
> +
> +	ret = ocfs2_trim_mainbm(sb, range);
> +
> +	info.tf_trimlen = range->len;
> +	info.tf_success = (ret < 0 ? 0 : 1);
> +	pinfo = &info;
> +out:
> +	ocfs2_trim_fs_unlock(osb, pinfo);
> +	ocfs2_trim_fs_lock_res_uninit(osb);
>   	return ret;
>   }
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index 7c835824247e..af405586c5b1 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>   {
>   	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>   
> +	/* Only one trimfs thread are allowed to work at the same time. */
> +	mutex_lock(&osb->obs_trim_fs_mutex);
> +

Cluster lock of fstrim have a trylock behavior, will it be better if we trylock here?

Thanks,
Changwei

>   	ocfs2_lock_res_init_once(lockres);
>   	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>   	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
>   
>   	ocfs2_simple_drop_lockres(osb, lockres);
>   	ocfs2_lock_res_free(lockres);
> +
> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>   }
>   
>   static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 4f86ac0027b5..1f029fbe8b8d 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -407,6 +407,7 @@ struct ocfs2_super
>   	struct ocfs2_lock_res osb_rename_lockres;
>   	struct ocfs2_lock_res osb_nfs_sync_lockres;
>   	struct ocfs2_lock_res osb_trim_fs_lockres;
> +	struct mutex obs_trim_fs_mutex;
>   	struct ocfs2_dlm_debug *osb_dlm_debug;
>   
>   	struct dentry *osb_debug_root;
> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
> index 2ee76a90ba8f..dc4bce1649c1 100644
> --- a/fs/ocfs2/ocfs2_trace.h
> +++ b/fs/ocfs2/ocfs2_trace.h
> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>   
>   DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>   
> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
> +
>   DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>   
>   /* End of trace events for fs/ocfs2/alloc.c. */
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 3415e0b09398..96ae7cedd487 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>   	if (ocfs2_is_hard_readonly(osb))
>   		goto leave;
>   
> +	mutex_init(&osb->obs_trim_fs_mutex);
> +
>   	status = ocfs2_dlm_init(osb);
>   	if (status < 0) {
>   		mlog_errno(status);
>
Gang He Jan. 15, 2019, 5:49 a.m. UTC | #3
Hello Changewei,

>>> On 2019/1/15 at 11:50, in message
<63ADC13FD55D6546B7DECE290D39E3730127825EFD@H3CMLB12-EX.srv.huawei-3com.com>,
Changwei Ge <ge.changwei@h3c.com> wrote:
> Hi Gang,
> 
> Most parts of this patch look sane to me, just a tiny question...
> 
> On 2019/1/11 17:01, Gang He wrote:
>> The user reported this problem, the upper application IO was
>> timeout when fstrim was running on this ocfs2 partition. the
>> application monitoring resource agent considered that this
>> application did not work, then this node was fenced by the cluster
>> brain (e.g. pacemaker).
>> The root cause is that fstrim thread always holds main_bm meta-file
>> related locks until all the cluster groups are trimmed.
>> This patch will make fstrim thread release main_bm meta-file
>> related locks when each cluster group is trimmed, this will let
>> the current application IO has a chance to claim the clusters from
>> main_bm meta-file.
>> 
>> Signed-off-by: Gang He <ghe@suse.com>
>> ---
>>   fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>>   fs/ocfs2/dlmglue.c     |   5 ++
>>   fs/ocfs2/ocfs2.h       |   1 +
>>   fs/ocfs2/ocfs2_trace.h |   2 +
>>   fs/ocfs2/super.c       |   2 +
>>   5 files changed, 106 insertions(+), 63 deletions(-)
>> 
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index d1cbb27808e2..6f0999015a44 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>>   	return count;
>>   }
>>   
>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>> +static
>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>>   {
>>   	struct ocfs2_super *osb = OCFS2_SB(sb);
>> -	u64 start, len, trimmed, first_group, last_group, group;
>> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>>   	int ret, cnt;
>>   	u32 first_bit, last_bit, minlen;
>>   	struct buffer_head *main_bm_bh = NULL;
>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   	struct buffer_head *gd_bh = NULL;
>>   	struct ocfs2_dinode *main_bm;
>>   	struct ocfs2_group_desc *gd = NULL;
>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>   
>>   	start = range->start >> osb->s_clustersize_bits;
>>   	len = range->len >> osb->s_clustersize_bits;
>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>>   		return -EINVAL;
>>   
>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>> +
>> +next_group:
>>   	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>   						    GLOBAL_BITMAP_SYSTEM_INODE,
>>   						    OCFS2_INVALID_SLOT);
>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   	}
>>   	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>   
>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>> -		ret = -EINVAL;
>> -		goto out_unlock;
>> -	}
>> -
>> -	len = range->len >> osb->s_clustersize_bits;
>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>> -
>> -	trace_ocfs2_trim_fs(start, len, minlen);
>> -
>> -	ocfs2_trim_fs_lock_res_init(osb);
>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>> -	if (ret < 0) {
>> -		if (ret != -EAGAIN) {
>> -			mlog_errno(ret);
>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>> +	/*
>> +	 * Do some check before trim the first group.
>> +	 */
>> +	if (!group) {
>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>> +			ret = -EINVAL;
>>   			goto out_unlock;
>>   		}
>>   
>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> -		     "finish, which is running from another node.\n",
>> -		     osb->dev_str);
>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>> -		if (ret < 0) {
>> -			mlog_errno(ret);
>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>> -			goto out_unlock;
>> -		}
>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>   
>> -		if (info.tf_valid && info.tf_success &&
>> -		    info.tf_start == start && info.tf_len == len &&
>> -		    info.tf_minlen == minlen) {
>> -			/* Avoid sending duplicated trim to a shared device */
>> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
>> -			     "just done from node (%u), return.\n",
>> -			     osb->dev_str, info.tf_nodenum);
>> -			range->len = info.tf_trimlen;
>> -			goto out_trimunlock;
>> -		}
>> +		/*
>> +		 * Determine first and last group to examine based on
>> +		 * start and len
>> +		 */
>> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>> +		if (first_group == osb->first_cluster_group_blkno)
>> +			first_bit = start;
>> +		else
>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>> +								first_group);
>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>> +						       start + len - 1);
>> +		group = first_group;
>>   	}
>>   
>> -	info.tf_nodenum = osb->node_num;
>> -	info.tf_start = start;
>> -	info.tf_len = len;
>> -	info.tf_minlen = minlen;
>> -
>> -	/* Determine first and last group to examine based on start and len */
>> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>> -	if (first_group == osb->first_cluster_group_blkno)
>> -		first_bit = start;
>> -	else
>> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
>> -	last_bit = osb->bitmap_cpg;
>> -
>> -	trimmed = 0;
>> -	for (group = first_group; group <= last_group;) {
>> +	do {
>>   		if (first_bit + len >= osb->bitmap_cpg)
>>   			last_bit = osb->bitmap_cpg;
>>   		else
>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>   		else
>>   			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>> -	}
>> -	range->len = trimmed * sb->s_blocksize;
>> +	} while (0);
>>   
>> -	info.tf_trimlen = range->len;
>> -	info.tf_success = (ret ? 0 : 1);
>> -	pinfo = &info;
>> -out_trimunlock:
>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>   out_unlock:
>>   	ocfs2_inode_unlock(main_bm_inode, 0);
>>   	brelse(main_bm_bh);
>> +	main_bm_bh = NULL;
>>   out_mutex:
>>   	inode_unlock(main_bm_inode);
>>   	iput(main_bm_inode);
>> +
>> +	/*
>> +	 * If all the groups trim are not done or failed, but we should release
>> +	 * main_bm related locks for avoiding the current IO starve, then go to
>> +	 * trim the next group
>> +	 */
>> +	if (ret >= 0 && group <= last_group)
>> +		goto next_group;
>>   out:
>> +	range->len = trimmed * sb->s_blocksize;
>> +	return ret;
>> +}
>> +
>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>> +{
>> +	int ret;
>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>> +
>> +	ocfs2_trim_fs_lock_res_init(osb);
>> +
>> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
>> +
>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>> +	if (ret < 0) {
>> +		if (ret != -EAGAIN) {
>> +			mlog_errno(ret);
>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>> +			return ret;
>> +		}
>> +
>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> +		     "finish, which is running from another node.\n",
>> +		     osb->dev_str);
>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>> +		if (ret < 0) {
>> +			mlog_errno(ret);
>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>> +			return ret;
>> +		}
>> +
>> +		if (info.tf_valid && info.tf_success &&
>> +		    info.tf_start == range->start &&
>> +		    info.tf_len == range->len &&
>> +		    info.tf_minlen == range->minlen) {
>> +			/* Avoid sending duplicated trim to a shared device */
>> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
>> +			     "just done from node (%u), return.\n",
>> +			     osb->dev_str, info.tf_nodenum);
>> +			range->len = info.tf_trimlen;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	info.tf_nodenum = osb->node_num;
>> +	info.tf_start = range->start;
>> +	info.tf_len = range->len;
>> +	info.tf_minlen = range->minlen;
>> +
>> +	ret = ocfs2_trim_mainbm(sb, range);
>> +
>> +	info.tf_trimlen = range->len;
>> +	info.tf_success = (ret < 0 ? 0 : 1);
>> +	pinfo = &info;
>> +out:
>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>   	return ret;
>>   }
>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>> index 7c835824247e..af405586c5b1 100644
>> --- a/fs/ocfs2/dlmglue.c
>> +++ b/fs/ocfs2/dlmglue.c
>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>>   {
>>   	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>>   
>> +	/* Only one trimfs thread are allowed to work at the same time. */
>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>> +
> 
> Cluster lock of fstrim have a trylock behavior, will it be better if we 
> trylock here?
Here, I prefer to just serialize fstrim threads on the local node to simplify the code logic,
maybe the user want to do like that, although this behavior is not recommended.
You know, on one node, ideally, the user should call  fstrim command once regularly.
If he calls fstrim command more times in a very short time, 
the code will not make each fstrim command return to failure, just do the fstrim task one by one.

Thanks
Gang

> 
> Thanks,
> Changwei
> 
>>   	ocfs2_lock_res_init_once(lockres);
>>   	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>>   	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super 
> *osb)
>>   
>>   	ocfs2_simple_drop_lockres(osb, lockres);
>>   	ocfs2_lock_res_free(lockres);
>> +
>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>   }
>>   
>>   static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>> index 4f86ac0027b5..1f029fbe8b8d 100644
>> --- a/fs/ocfs2/ocfs2.h
>> +++ b/fs/ocfs2/ocfs2.h
>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>   	struct ocfs2_lock_res osb_rename_lockres;
>>   	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>   	struct ocfs2_lock_res osb_trim_fs_lockres;
>> +	struct mutex obs_trim_fs_mutex;
>>   	struct ocfs2_dlm_debug *osb_dlm_debug;
>>   
>>   	struct dentry *osb_debug_root;
>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>> index 2ee76a90ba8f..dc4bce1649c1 100644
>> --- a/fs/ocfs2/ocfs2_trace.h
>> +++ b/fs/ocfs2/ocfs2_trace.h
>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>   
>>   DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>   
>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>> +
>>   DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>   
>>   /* End of trace events for fs/ocfs2/alloc.c. */
>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>> index 3415e0b09398..96ae7cedd487 100644
>> --- a/fs/ocfs2/super.c
>> +++ b/fs/ocfs2/super.c
>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>>   	if (ocfs2_is_hard_readonly(osb))
>>   		goto leave;
>>   
>> +	mutex_init(&osb->obs_trim_fs_mutex);
>> +
>>   	status = ocfs2_dlm_init(osb);
>>   	if (status < 0) {
>>   		mlog_errno(status);
>>
Changwei Ge Jan. 15, 2019, 8 a.m. UTC | #4
On 2019/1/15 13:49, Gang He wrote:
> Hello Changewei,
> 
>>>> On 2019/1/15 at 11:50, in message
> <63ADC13FD55D6546B7DECE290D39E3730127825EFD@H3CMLB12-EX.srv.huawei-3com.com>,
> Changwei Ge <ge.changwei@h3c.com> wrote:
>> Hi Gang,
>>
>> Most parts of this patch look sane to me, just a tiny question...
>>
>> On 2019/1/11 17:01, Gang He wrote:
>>> The user reported this problem, the upper application IO was
>>> timeout when fstrim was running on this ocfs2 partition. the
>>> application monitoring resource agent considered that this
>>> application did not work, then this node was fenced by the cluster
>>> brain (e.g. pacemaker).
>>> The root cause is that fstrim thread always holds main_bm meta-file
>>> related locks until all the cluster groups are trimmed.
>>> This patch will make fstrim thread release main_bm meta-file
>>> related locks when each cluster group is trimmed, this will let
>>> the current application IO has a chance to claim the clusters from
>>> main_bm meta-file.
>>>
>>> Signed-off-by: Gang He <ghe@suse.com>
>>> ---
>>>    fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>>>    fs/ocfs2/dlmglue.c     |   5 ++
>>>    fs/ocfs2/ocfs2.h       |   1 +
>>>    fs/ocfs2/ocfs2_trace.h |   2 +
>>>    fs/ocfs2/super.c       |   2 +
>>>    5 files changed, 106 insertions(+), 63 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index d1cbb27808e2..6f0999015a44 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>>>    	return count;
>>>    }
>>>    
>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>> +static
>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>>>    {
>>>    	struct ocfs2_super *osb = OCFS2_SB(sb);
>>> -	u64 start, len, trimmed, first_group, last_group, group;
>>> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>>>    	int ret, cnt;
>>>    	u32 first_bit, last_bit, minlen;
>>>    	struct buffer_head *main_bm_bh = NULL;
>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>    	struct buffer_head *gd_bh = NULL;
>>>    	struct ocfs2_dinode *main_bm;
>>>    	struct ocfs2_group_desc *gd = NULL;
>>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>    
>>>    	start = range->start >> osb->s_clustersize_bits;
>>>    	len = range->len >> osb->s_clustersize_bits;
>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>    	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>>>    		return -EINVAL;
>>>    
>>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>>> +
>>> +next_group:
>>>    	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>    						    GLOBAL_BITMAP_SYSTEM_INODE,
>>>    						    OCFS2_INVALID_SLOT);
>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>    	}
>>>    	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>    
>>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> -		ret = -EINVAL;
>>> -		goto out_unlock;
>>> -	}
>>> -
>>> -	len = range->len >> osb->s_clustersize_bits;
>>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>>> -
>>> -	trace_ocfs2_trim_fs(start, len, minlen);
>>> -
>>> -	ocfs2_trim_fs_lock_res_init(osb);
>>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> -	if (ret < 0) {
>>> -		if (ret != -EAGAIN) {
>>> -			mlog_errno(ret);
>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +	/*
>>> +	 * Do some check before trim the first group.
>>> +	 */
>>> +	if (!group) {
>>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> +			ret = -EINVAL;
>>>    			goto out_unlock;
>>>    		}
>>>    
>>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> -		     "finish, which is running from another node.\n",
>>> -		     osb->dev_str);
>>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>> -		if (ret < 0) {
>>> -			mlog_errno(ret);
>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>> -			goto out_unlock;
>>> -		}
>>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>>    
>>> -		if (info.tf_valid && info.tf_success &&
>>> -		    info.tf_start == start && info.tf_len == len &&
>>> -		    info.tf_minlen == minlen) {
>>> -			/* Avoid sending duplicated trim to a shared device */
>>> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> -			     "just done from node (%u), return.\n",
>>> -			     osb->dev_str, info.tf_nodenum);
>>> -			range->len = info.tf_trimlen;
>>> -			goto out_trimunlock;
>>> -		}
>>> +		/*
>>> +		 * Determine first and last group to examine based on
>>> +		 * start and len
>>> +		 */
>>> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> +		if (first_group == osb->first_cluster_group_blkno)
>>> +			first_bit = start;
>>> +		else
>>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>>> +								first_group);
>>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>>> +						       start + len - 1);
>>> +		group = first_group;
>>>    	}
>>>    
>>> -	info.tf_nodenum = osb->node_num;
>>> -	info.tf_start = start;
>>> -	info.tf_len = len;
>>> -	info.tf_minlen = minlen;
>>> -
>>> -	/* Determine first and last group to examine based on start and len */
>>> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> -	if (first_group == osb->first_cluster_group_blkno)
>>> -		first_bit = start;
>>> -	else
>>> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
>>> -	last_bit = osb->bitmap_cpg;
>>> -
>>> -	trimmed = 0;
>>> -	for (group = first_group; group <= last_group;) {
>>> +	do {
>>>    		if (first_bit + len >= osb->bitmap_cpg)
>>>    			last_bit = osb->bitmap_cpg;
>>>    		else
>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>    			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>    		else
>>>    			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>> -	}
>>> -	range->len = trimmed * sb->s_blocksize;
>>> +	} while (0);
>>>    
>>> -	info.tf_trimlen = range->len;
>>> -	info.tf_success = (ret ? 0 : 1);
>>> -	pinfo = &info;
>>> -out_trimunlock:
>>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>>    out_unlock:
>>>    	ocfs2_inode_unlock(main_bm_inode, 0);
>>>    	brelse(main_bm_bh);
>>> +	main_bm_bh = NULL;
>>>    out_mutex:
>>>    	inode_unlock(main_bm_inode);
>>>    	iput(main_bm_inode);
>>> +
>>> +	/*
>>> +	 * If all the groups trim are not done or failed, but we should release
>>> +	 * main_bm related locks for avoiding the current IO starve, then go to
>>> +	 * trim the next group
>>> +	 */
>>> +	if (ret >= 0 && group <= last_group)
>>> +		goto next_group;
>>>    out:
>>> +	range->len = trimmed * sb->s_blocksize;
>>> +	return ret;
>>> +}
>>> +
>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>> +{
>>> +	int ret;
>>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>> +
>>> +	ocfs2_trim_fs_lock_res_init(osb);
>>> +
>>> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
>>> +
>>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> +	if (ret < 0) {
>>> +		if (ret != -EAGAIN) {
>>> +			mlog_errno(ret);
>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +			return ret;
>>> +		}
>>> +
>>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +		     "finish, which is running from another node.\n",
>>> +		     osb->dev_str);
>>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>> +		if (ret < 0) {
>>> +			mlog_errno(ret);
>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +			return ret;
>>> +		}
>>> +
>>> +		if (info.tf_valid && info.tf_success &&
>>> +		    info.tf_start == range->start &&
>>> +		    info.tf_len == range->len &&
>>> +		    info.tf_minlen == range->minlen) {
>>> +			/* Avoid sending duplicated trim to a shared device */
>>> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> +			     "just done from node (%u), return.\n",
>>> +			     osb->dev_str, info.tf_nodenum);
>>> +			range->len = info.tf_trimlen;
>>> +			goto out;
>>> +		}
>>> +	}
>>> +
>>> +	info.tf_nodenum = osb->node_num;
>>> +	info.tf_start = range->start;
>>> +	info.tf_len = range->len;
>>> +	info.tf_minlen = range->minlen;
>>> +
>>> +	ret = ocfs2_trim_mainbm(sb, range);
>>> +
>>> +	info.tf_trimlen = range->len;
>>> +	info.tf_success = (ret < 0 ? 0 : 1);
>>> +	pinfo = &info;
>>> +out:
>>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>>    	return ret;
>>>    }
>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>> index 7c835824247e..af405586c5b1 100644
>>> --- a/fs/ocfs2/dlmglue.c
>>> +++ b/fs/ocfs2/dlmglue.c
>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>>>    {
>>>    	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>>>    
>>> +	/* Only one trimfs thread are allowed to work at the same time. */
>>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>>> +
>>
>> Cluster lock of fstrim have a trylock behavior, will it be better if we
>> trylock here?
> Here, I prefer to just serialize fstrim threads on the local node to simplify the code logic,
> maybe the user want to do like that, although this behavior is not recommended.
> You know, on one node, ideally, the user should call  fstrim command once regularly.
> If he calls fstrim command more times in a very short time,
> the code will not make each fstrim command return to failure, just do the fstrim task one by one.

I have a thought having nothing to do with your patch.
Do you think it's possible for us to implement *discard on unlink file* like ext4 does.
So the application doesn't have to invoke fstrim periodically.

Thanks,
Changwei

> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Changwei
>>
>>>    	ocfs2_lock_res_init_once(lockres);
>>>    	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>>>    	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super
>> *osb)
>>>    
>>>    	ocfs2_simple_drop_lockres(osb, lockres);
>>>    	ocfs2_lock_res_free(lockres);
>>> +
>>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>>    }
>>>    
>>>    static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>> --- a/fs/ocfs2/ocfs2.h
>>> +++ b/fs/ocfs2/ocfs2.h
>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>    	struct ocfs2_lock_res osb_rename_lockres;
>>>    	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>    	struct ocfs2_lock_res osb_trim_fs_lockres;
>>> +	struct mutex obs_trim_fs_mutex;
>>>    	struct ocfs2_dlm_debug *osb_dlm_debug;
>>>    
>>>    	struct dentry *osb_debug_root;
>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>> --- a/fs/ocfs2/ocfs2_trace.h
>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>    
>>>    DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>    
>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>> +
>>>    DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>    
>>>    /* End of trace events for fs/ocfs2/alloc.c. */
>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>> index 3415e0b09398..96ae7cedd487 100644
>>> --- a/fs/ocfs2/super.c
>>> +++ b/fs/ocfs2/super.c
>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>>>    	if (ocfs2_is_hard_readonly(osb))
>>>    		goto leave;
>>>    
>>> +	mutex_init(&osb->obs_trim_fs_mutex);
>>> +
>>>    	status = ocfs2_dlm_init(osb);
>>>    	if (status < 0) {
>>>    		mlog_errno(status);
>>>
>
Gang He Jan. 15, 2019, 8:22 a.m. UTC | #5
Hello ChangWei,

>>> On 2019/1/15 at 16:00, in message
<63ADC13FD55D6546B7DECE290D39E37301278265F3@H3CMLB12-EX.srv.huawei-3com.com>,
Changwei Ge <ge.changwei@h3c.com> wrote:
> On 2019/1/15 13:49, Gang He wrote:
>> Hello Changewei,
>> 
>>>>> On 2019/1/15 at 11:50, in message
>> <63ADC13FD55D6546B7DECE290D39E3730127825EFD@H3CMLB12-EX.srv.huawei-3com.com>,
>> Changwei Ge <ge.changwei@h3c.com> wrote:
>>> Hi Gang,
>>>
>>> Most parts of this patch look sane to me, just a tiny question...
>>>
>>> On 2019/1/11 17:01, Gang He wrote:
>>>> The user reported this problem, the upper application IO was
>>>> timeout when fstrim was running on this ocfs2 partition. the
>>>> application monitoring resource agent considered that this
>>>> application did not work, then this node was fenced by the cluster
>>>> brain (e.g. pacemaker).
>>>> The root cause is that fstrim thread always holds main_bm meta-file
>>>> related locks until all the cluster groups are trimmed.
>>>> This patch will make fstrim thread release main_bm meta-file
>>>> related locks when each cluster group is trimmed, this will let
>>>> the current application IO has a chance to claim the clusters from
>>>> main_bm meta-file.
>>>>
>>>> Signed-off-by: Gang He <ghe@suse.com>
>>>> ---
>>>>    fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>>>>    fs/ocfs2/dlmglue.c     |   5 ++
>>>>    fs/ocfs2/ocfs2.h       |   1 +
>>>>    fs/ocfs2/ocfs2_trace.h |   2 +
>>>>    fs/ocfs2/super.c       |   2 +
>>>>    5 files changed, 106 insertions(+), 63 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>> index d1cbb27808e2..6f0999015a44 100644
>>>> --- a/fs/ocfs2/alloc.c
>>>> +++ b/fs/ocfs2/alloc.c
>>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>>>>    	return count;
>>>>    }
>>>>    
>>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>> +static
>>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>>>>    {
>>>>    	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> -	u64 start, len, trimmed, first_group, last_group, group;
>>>> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>>>>    	int ret, cnt;
>>>>    	u32 first_bit, last_bit, minlen;
>>>>    	struct buffer_head *main_bm_bh = NULL;
>>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
>>>>    	struct buffer_head *gd_bh = NULL;
>>>>    	struct ocfs2_dinode *main_bm;
>>>>    	struct ocfs2_group_desc *gd = NULL;
>>>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>    
>>>>    	start = range->start >> osb->s_clustersize_bits;
>>>>    	len = range->len >> osb->s_clustersize_bits;
>>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
>>>>    	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>>>>    		return -EINVAL;
>>>>    
>>>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>>>> +
>>>> +next_group:
>>>>    	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>>    						    GLOBAL_BITMAP_SYSTEM_INODE,
>>>>    						    OCFS2_INVALID_SLOT);
>>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
>>>>    	}
>>>>    	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>>    
>>>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>> -		ret = -EINVAL;
>>>> -		goto out_unlock;
>>>> -	}
>>>> -
>>>> -	len = range->len >> osb->s_clustersize_bits;
>>>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>>>> -
>>>> -	trace_ocfs2_trim_fs(start, len, minlen);
>>>> -
>>>> -	ocfs2_trim_fs_lock_res_init(osb);
>>>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>> -	if (ret < 0) {
>>>> -		if (ret != -EAGAIN) {
>>>> -			mlog_errno(ret);
>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +	/*
>>>> +	 * Do some check before trim the first group.
>>>> +	 */
>>>> +	if (!group) {
>>>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>> +			ret = -EINVAL;
>>>>    			goto out_unlock;
>>>>    		}
>>>>    
>>>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>> -		     "finish, which is running from another node.\n",
>>>> -		     osb->dev_str);
>>>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>> -		if (ret < 0) {
>>>> -			mlog_errno(ret);
>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> -			goto out_unlock;
>>>> -		}
>>>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>    
>>>> -		if (info.tf_valid && info.tf_success &&
>>>> -		    info.tf_start == start && info.tf_len == len &&
>>>> -		    info.tf_minlen == minlen) {
>>>> -			/* Avoid sending duplicated trim to a shared device */
>>>> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>> -			     "just done from node (%u), return.\n",
>>>> -			     osb->dev_str, info.tf_nodenum);
>>>> -			range->len = info.tf_trimlen;
>>>> -			goto out_trimunlock;
>>>> -		}
>>>> +		/*
>>>> +		 * Determine first and last group to examine based on
>>>> +		 * start and len
>>>> +		 */
>>>> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>> +		if (first_group == osb->first_cluster_group_blkno)
>>>> +			first_bit = start;
>>>> +		else
>>>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>>>> +								first_group);
>>>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>>>> +						       start + len - 1);
>>>> +		group = first_group;
>>>>    	}
>>>>    
>>>> -	info.tf_nodenum = osb->node_num;
>>>> -	info.tf_start = start;
>>>> -	info.tf_len = len;
>>>> -	info.tf_minlen = minlen;
>>>> -
>>>> -	/* Determine first and last group to examine based on start and len */
>>>> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>> -	if (first_group == osb->first_cluster_group_blkno)
>>>> -		first_bit = start;
>>>> -	else
>>>> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
>>>> -	last_bit = osb->bitmap_cpg;
>>>> -
>>>> -	trimmed = 0;
>>>> -	for (group = first_group; group <= last_group;) {
>>>> +	do {
>>>>    		if (first_bit + len >= osb->bitmap_cpg)
>>>>    			last_bit = osb->bitmap_cpg;
>>>>    		else
>>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
>>>>    			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>>    		else
>>>>    			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>> -	}
>>>> -	range->len = trimmed * sb->s_blocksize;
>>>> +	} while (0);
>>>>    
>>>> -	info.tf_trimlen = range->len;
>>>> -	info.tf_success = (ret ? 0 : 1);
>>>> -	pinfo = &info;
>>>> -out_trimunlock:
>>>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>>>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>    out_unlock:
>>>>    	ocfs2_inode_unlock(main_bm_inode, 0);
>>>>    	brelse(main_bm_bh);
>>>> +	main_bm_bh = NULL;
>>>>    out_mutex:
>>>>    	inode_unlock(main_bm_inode);
>>>>    	iput(main_bm_inode);
>>>> +
>>>> +	/*
>>>> +	 * If all the groups trim are not done or failed, but we should release
>>>> +	 * main_bm related locks for avoiding the current IO starve, then go to
>>>> +	 * trim the next group
>>>> +	 */
>>>> +	if (ret >= 0 && group <= last_group)
>>>> +		goto next_group;
>>>>    out:
>>>> +	range->len = trimmed * sb->s_blocksize;
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>> +{
>>>> +	int ret;
>>>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>> +
>>>> +	ocfs2_trim_fs_lock_res_init(osb);
>>>> +
>>>> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
>>>> +
>>>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>> +	if (ret < 0) {
>>>> +		if (ret != -EAGAIN) {
>>>> +			mlog_errno(ret);
>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +			return ret;
>>>> +		}
>>>> +
>>>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>> +		     "finish, which is running from another node.\n",
>>>> +		     osb->dev_str);
>>>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>> +		if (ret < 0) {
>>>> +			mlog_errno(ret);
>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +			return ret;
>>>> +		}
>>>> +
>>>> +		if (info.tf_valid && info.tf_success &&
>>>> +		    info.tf_start == range->start &&
>>>> +		    info.tf_len == range->len &&
>>>> +		    info.tf_minlen == range->minlen) {
>>>> +			/* Avoid sending duplicated trim to a shared device */
>>>> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>> +			     "just done from node (%u), return.\n",
>>>> +			     osb->dev_str, info.tf_nodenum);
>>>> +			range->len = info.tf_trimlen;
>>>> +			goto out;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	info.tf_nodenum = osb->node_num;
>>>> +	info.tf_start = range->start;
>>>> +	info.tf_len = range->len;
>>>> +	info.tf_minlen = range->minlen;
>>>> +
>>>> +	ret = ocfs2_trim_mainbm(sb, range);
>>>> +
>>>> +	info.tf_trimlen = range->len;
>>>> +	info.tf_success = (ret < 0 ? 0 : 1);
>>>> +	pinfo = &info;
>>>> +out:
>>>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>>>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>    	return ret;
>>>>    }
>>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>>> index 7c835824247e..af405586c5b1 100644
>>>> --- a/fs/ocfs2/dlmglue.c
>>>> +++ b/fs/ocfs2/dlmglue.c
>>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>>>>    {
>>>>    	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>>>>    
>>>> +	/* Only one trimfs thread are allowed to work at the same time. */
>>>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>>>> +
>>>
>>> Cluster lock of fstrim have a trylock behavior, will it be better if we
>>> trylock here?
>> Here, I prefer to just serialize fstrim threads on the local node to 
> simplify the code logic,
>> maybe the user want to do like that, although this behavior is not 
> recommended.
>> You know, on one node, ideally, the user should call  fstrim command once 
> regularly.
>> If he calls fstrim command more times in a very short time,
>> the code will not make each fstrim command return to failure, just do the 
> fstrim task one by one.
> 
> I have a thought having nothing to do with your patch.
> Do you think it's possible for us to implement *discard on unlink file* like 
> ext4 does.
> So the application doesn't have to invoke fstrim periodically.
Yes, we can do some investigation for adding a mount option "discard" to support discard a file when it is deleted.
This can be considered as another feature, since its discard occasion and granularity is different with fstrim.
The scheduled fstrim command can considered as a traditional file system level trim.
If the file system supports the on-demand discard when some blocks were released, that's better.
Of course. these two features can coexist.

Thanks
Gang

> 
> Thanks,
> Changwei
> 
>> 
>> Thanks
>> Gang
>> 
>>>
>>> Thanks,
>>> Changwei
>>>
>>>>    	ocfs2_lock_res_init_once(lockres);
>>>>    	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>>>>    	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
>>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super
>>> *osb)
>>>>    
>>>>    	ocfs2_simple_drop_lockres(osb, lockres);
>>>>    	ocfs2_lock_res_free(lockres);
>>>> +
>>>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>>>    }
>>>>    
>>>>    static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>>> --- a/fs/ocfs2/ocfs2.h
>>>> +++ b/fs/ocfs2/ocfs2.h
>>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>>    	struct ocfs2_lock_res osb_rename_lockres;
>>>>    	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>>    	struct ocfs2_lock_res osb_trim_fs_lockres;
>>>> +	struct mutex obs_trim_fs_mutex;
>>>>    	struct ocfs2_dlm_debug *osb_dlm_debug;
>>>>    
>>>>    	struct dentry *osb_debug_root;
>>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>>> --- a/fs/ocfs2/ocfs2_trace.h
>>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>>    
>>>>    DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>>    
>>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>>> +
>>>>    DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>>    
>>>>    /* End of trace events for fs/ocfs2/alloc.c. */
>>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>>> index 3415e0b09398..96ae7cedd487 100644
>>>> --- a/fs/ocfs2/super.c
>>>> +++ b/fs/ocfs2/super.c
>>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>>>>    	if (ocfs2_is_hard_readonly(osb))
>>>>    		goto leave;
>>>>    
>>>> +	mutex_init(&osb->obs_trim_fs_mutex);
>>>> +
>>>>    	status = ocfs2_dlm_init(osb);
>>>>    	if (status < 0) {
>>>>    		mlog_errno(status);
>>>>
>>
Changwei Ge Jan. 15, 2019, 9:33 a.m. UTC | #6
Hi Gang,

It looks good to me.

On 2019/1/15 16:23, Gang He wrote:
> Hello ChangWei,
> 
>>>> On 2019/1/15 at 16:00, in message
> <63ADC13FD55D6546B7DECE290D39E37301278265F3@H3CMLB12-EX.srv.huawei-3com.com>,
> Changwei Ge <ge.changwei@h3c.com> wrote:
>> On 2019/1/15 13:49, Gang He wrote:
>>> Hello Changewei,
>>>
>>>>>> On 2019/1/15 at 11:50, in message
>>> <63ADC13FD55D6546B7DECE290D39E3730127825EFD@H3CMLB12-EX.srv.huawei-3com.com>,
>>> Changwei Ge <ge.changwei@h3c.com> wrote:
>>>> Hi Gang,
>>>>
>>>> Most parts of this patch look sane to me, just a tiny question...
>>>>
>>>> On 2019/1/11 17:01, Gang He wrote:
>>>>> The user reported this problem, the upper application IO was
>>>>> timeout when fstrim was running on this ocfs2 partition. the
>>>>> application monitoring resource agent considered that this
>>>>> application did not work, then this node was fenced by the cluster
>>>>> brain (e.g. pacemaker).
>>>>> The root cause is that fstrim thread always holds main_bm meta-file
>>>>> related locks until all the cluster groups are trimmed.
>>>>> This patch will make fstrim thread release main_bm meta-file
>>>>> related locks when each cluster group is trimmed, this will let
>>>>> the current application IO has a chance to claim the clusters from
>>>>> main_bm meta-file.
>>>>>
>>>>> Signed-off-by: Gang He <ghe@suse.com>

Reviewed-by: Changwei Ge <ge.changwei@h3c.com>

>>>>> ---
>>>>>     fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>>>>>     fs/ocfs2/dlmglue.c     |   5 ++
>>>>>     fs/ocfs2/ocfs2.h       |   1 +
>>>>>     fs/ocfs2/ocfs2_trace.h |   2 +
>>>>>     fs/ocfs2/super.c       |   2 +
>>>>>     5 files changed, 106 insertions(+), 63 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>> index d1cbb27808e2..6f0999015a44 100644
>>>>> --- a/fs/ocfs2/alloc.c
>>>>> +++ b/fs/ocfs2/alloc.c
>>>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>>>>>     	return count;
>>>>>     }
>>>>>     
>>>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +static
>>>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>>>>>     {
>>>>>     	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> -	u64 start, len, trimmed, first_group, last_group, group;
>>>>> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>>>>>     	int ret, cnt;
>>>>>     	u32 first_bit, last_bit, minlen;
>>>>>     	struct buffer_head *main_bm_bh = NULL;
>>>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>     	struct buffer_head *gd_bh = NULL;
>>>>>     	struct ocfs2_dinode *main_bm;
>>>>>     	struct ocfs2_group_desc *gd = NULL;
>>>>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>>     
>>>>>     	start = range->start >> osb->s_clustersize_bits;
>>>>>     	len = range->len >> osb->s_clustersize_bits;
>>>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>     	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>>>>>     		return -EINVAL;
>>>>>     
>>>>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>>>>> +
>>>>> +next_group:
>>>>>     	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>>>     						    GLOBAL_BITMAP_SYSTEM_INODE,
>>>>>     						    OCFS2_INVALID_SLOT);
>>>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>     	}
>>>>>     	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>>>     
>>>>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>>> -		ret = -EINVAL;
>>>>> -		goto out_unlock;
>>>>> -	}
>>>>> -
>>>>> -	len = range->len >> osb->s_clustersize_bits;
>>>>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>> -
>>>>> -	trace_ocfs2_trim_fs(start, len, minlen);
>>>>> -
>>>>> -	ocfs2_trim_fs_lock_res_init(osb);
>>>>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>>> -	if (ret < 0) {
>>>>> -		if (ret != -EAGAIN) {
>>>>> -			mlog_errno(ret);
>>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> +	/*
>>>>> +	 * Do some check before trim the first group.
>>>>> +	 */
>>>>> +	if (!group) {
>>>>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>>> +			ret = -EINVAL;
>>>>>     			goto out_unlock;
>>>>>     		}
>>>>>     
>>>>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>>> -		     "finish, which is running from another node.\n",
>>>>> -		     osb->dev_str);
>>>>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>>> -		if (ret < 0) {
>>>>> -			mlog_errno(ret);
>>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> -			goto out_unlock;
>>>>> -		}
>>>>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>>     
>>>>> -		if (info.tf_valid && info.tf_success &&
>>>>> -		    info.tf_start == start && info.tf_len == len &&
>>>>> -		    info.tf_minlen == minlen) {
>>>>> -			/* Avoid sending duplicated trim to a shared device */
>>>>> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>>> -			     "just done from node (%u), return.\n",
>>>>> -			     osb->dev_str, info.tf_nodenum);
>>>>> -			range->len = info.tf_trimlen;
>>>>> -			goto out_trimunlock;
>>>>> -		}
>>>>> +		/*
>>>>> +		 * Determine first and last group to examine based on
>>>>> +		 * start and len
>>>>> +		 */
>>>>> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>>> +		if (first_group == osb->first_cluster_group_blkno)
>>>>> +			first_bit = start;
>>>>> +		else
>>>>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>>>>> +								first_group);
>>>>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>>>>> +						       start + len - 1);
>>>>> +		group = first_group;
>>>>>     	}
>>>>>     
>>>>> -	info.tf_nodenum = osb->node_num;
>>>>> -	info.tf_start = start;
>>>>> -	info.tf_len = len;
>>>>> -	info.tf_minlen = minlen;
>>>>> -
>>>>> -	/* Determine first and last group to examine based on start and len */
>>>>> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>>> -	if (first_group == osb->first_cluster_group_blkno)
>>>>> -		first_bit = start;
>>>>> -	else
>>>>> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>>>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
>>>>> -	last_bit = osb->bitmap_cpg;
>>>>> -
>>>>> -	trimmed = 0;
>>>>> -	for (group = first_group; group <= last_group;) {
>>>>> +	do {
>>>>>     		if (first_bit + len >= osb->bitmap_cpg)
>>>>>     			last_bit = osb->bitmap_cpg;
>>>>>     		else
>>>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>     			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>>>     		else
>>>>>     			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>>> -	}
>>>>> -	range->len = trimmed * sb->s_blocksize;
>>>>> +	} while (0);
>>>>>     
>>>>> -	info.tf_trimlen = range->len;
>>>>> -	info.tf_success = (ret ? 0 : 1);
>>>>> -	pinfo = &info;
>>>>> -out_trimunlock:
>>>>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>>>>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>>     out_unlock:
>>>>>     	ocfs2_inode_unlock(main_bm_inode, 0);
>>>>>     	brelse(main_bm_bh);
>>>>> +	main_bm_bh = NULL;
>>>>>     out_mutex:
>>>>>     	inode_unlock(main_bm_inode);
>>>>>     	iput(main_bm_inode);
>>>>> +
>>>>> +	/*
>>>>> +	 * If all the groups trim are not done or failed, but we should release
>>>>> +	 * main_bm related locks for avoiding the current IO starve, then go to
>>>>> +	 * trim the next group
>>>>> +	 */
>>>>> +	if (ret >= 0 && group <= last_group)
>>>>> +		goto next_group;
>>>>>     out:
>>>>> +	range->len = trimmed * sb->s_blocksize;
>>>>> +	return ret;
>>>>> +}
>>>>> +
>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +{
>>>>> +	int ret;
>>>>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>> +
>>>>> +	ocfs2_trim_fs_lock_res_init(osb);
>>>>> +
>>>>> +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
>>>>> +
>>>>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>>> +	if (ret < 0) {
>>>>> +		if (ret != -EAGAIN) {
>>>>> +			mlog_errno(ret);
>>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> +			return ret;
>>>>> +		}
>>>>> +
>>>>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>>> +		     "finish, which is running from another node.\n",
>>>>> +		     osb->dev_str);
>>>>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>>> +		if (ret < 0) {
>>>>> +			mlog_errno(ret);
>>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> +			return ret;
>>>>> +		}
>>>>> +
>>>>> +		if (info.tf_valid && info.tf_success &&
>>>>> +		    info.tf_start == range->start &&
>>>>> +		    info.tf_len == range->len &&
>>>>> +		    info.tf_minlen == range->minlen) {
>>>>> +			/* Avoid sending duplicated trim to a shared device */
>>>>> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>>> +			     "just done from node (%u), return.\n",
>>>>> +			     osb->dev_str, info.tf_nodenum);
>>>>> +			range->len = info.tf_trimlen;
>>>>> +			goto out;
>>>>> +		}
>>>>> +	}
>>>>> +
>>>>> +	info.tf_nodenum = osb->node_num;
>>>>> +	info.tf_start = range->start;
>>>>> +	info.tf_len = range->len;
>>>>> +	info.tf_minlen = range->minlen;
>>>>> +
>>>>> +	ret = ocfs2_trim_mainbm(sb, range);
>>>>> +
>>>>> +	info.tf_trimlen = range->len;
>>>>> +	info.tf_success = (ret < 0 ? 0 : 1);
>>>>> +	pinfo = &info;
>>>>> +out:
>>>>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>>>>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>>     	return ret;
>>>>>     }
>>>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>>>> index 7c835824247e..af405586c5b1 100644
>>>>> --- a/fs/ocfs2/dlmglue.c
>>>>> +++ b/fs/ocfs2/dlmglue.c
>>>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>>>>>     {
>>>>>     	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>>>>>     
>>>>> +	/* Only one trimfs thread are allowed to work at the same time. */
>>>>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>>>>> +
>>>>
>>>> Cluster lock of fstrim have a trylock behavior, will it be better if we
>>>> trylock here?
>>> Here, I prefer to just serialize fstrim threads on the local node to
>> simplify the code logic,
>>> maybe the user want to do like that, although this behavior is not
>> recommended.
>>> You know, on one node, ideally, the user should call  fstrim command once
>> regularly.
>>> If he calls fstrim command more times in a very short time,
>>> the code will not make each fstrim command return to failure, just do the
>> fstrim task one by one.
>>
>> I have a thought having nothing to do with your patch.
>> Do you think it's possible for us to implement *discard on unlink file* like
>> ext4 does.
>> So the application doesn't have to invoke fstrim periodically.
> Yes, we can do some investigation for adding a mount option "discard" to support discard a file when it is deleted.
> This can be considered as another feature, since its discard occasion and granularity is different with fstrim.
> The scheduled fstrim command can considered as a traditional file system level trim.
> If the file system supports the on-demand discard when some blocks were released, that's better.
> Of course. these two features can coexist.

Good to hear this. :-)

> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Changwei
>>
>>>
>>> Thanks
>>> Gang
>>>
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>>     	ocfs2_lock_res_init_once(lockres);
>>>>>     	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>>>>>     	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
>>>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super
>>>> *osb)
>>>>>     
>>>>>     	ocfs2_simple_drop_lockres(osb, lockres);
>>>>>     	ocfs2_lock_res_free(lockres);
>>>>> +
>>>>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>>>>     }
>>>>>     
>>>>>     static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
>>>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>>>> --- a/fs/ocfs2/ocfs2.h
>>>>> +++ b/fs/ocfs2/ocfs2.h
>>>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>>>     	struct ocfs2_lock_res osb_rename_lockres;
>>>>>     	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>>>     	struct ocfs2_lock_res osb_trim_fs_lockres;
>>>>> +	struct mutex obs_trim_fs_mutex;
>>>>>     	struct ocfs2_dlm_debug *osb_dlm_debug;
>>>>>     
>>>>>     	struct dentry *osb_debug_root;
>>>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>>>> --- a/fs/ocfs2/ocfs2_trace.h
>>>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>>>     
>>>>>     DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>>>     
>>>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>>>> +
>>>>>     DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>>>     
>>>>>     /* End of trace events for fs/ocfs2/alloc.c. */
>>>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>>>> index 3415e0b09398..96ae7cedd487 100644
>>>>> --- a/fs/ocfs2/super.c
>>>>> +++ b/fs/ocfs2/super.c
>>>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>>>>>     	if (ocfs2_is_hard_readonly(osb))
>>>>>     		goto leave;
>>>>>     
>>>>> +	mutex_init(&osb->obs_trim_fs_mutex);
>>>>> +
>>>>>     	status = ocfs2_dlm_init(osb);
>>>>>     	if (status < 0) {
>>>>>     		mlog_errno(status);
>>>>>
>>>
>

Patch
diff mbox series

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@  static int ocfs2_trim_group(struct super_block *sb,
 	return count;
 }
 
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
 {
 	struct ocfs2_super *osb = OCFS2_SB(sb);
-	u64 start, len, trimmed, first_group, last_group, group;
+	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
 	int ret, cnt;
 	u32 first_bit, last_bit, minlen;
 	struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@  int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	struct buffer_head *gd_bh = NULL;
 	struct ocfs2_dinode *main_bm;
 	struct ocfs2_group_desc *gd = NULL;
-	struct ocfs2_trim_fs_info info, *pinfo = NULL;
 
 	start = range->start >> osb->s_clustersize_bits;
 	len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@  int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
 		return -EINVAL;
 
+	trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@  int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	}
 	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
-	if (start >= le32_to_cpu(main_bm->i_clusters)) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	len = range->len >> osb->s_clustersize_bits;
-	if (start + len > le32_to_cpu(main_bm->i_clusters))
-		len = le32_to_cpu(main_bm->i_clusters) - start;
-
-	trace_ocfs2_trim_fs(start, len, minlen);
-
-	ocfs2_trim_fs_lock_res_init(osb);
-	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
-	if (ret < 0) {
-		if (ret != -EAGAIN) {
-			mlog_errno(ret);
-			ocfs2_trim_fs_lock_res_uninit(osb);
+	/*
+	 * Do some check before trim the first group.
+	 */
+	if (!group) {
+		if (start >= le32_to_cpu(main_bm->i_clusters)) {
+			ret = -EINVAL;
 			goto out_unlock;
 		}
 
-		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
-		     "finish, which is running from another node.\n",
-		     osb->dev_str);
-		ret = ocfs2_trim_fs_lock(osb, &info, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			ocfs2_trim_fs_lock_res_uninit(osb);
-			goto out_unlock;
-		}
+		if (start + len > le32_to_cpu(main_bm->i_clusters))
+			len = le32_to_cpu(main_bm->i_clusters) - start;
 
-		if (info.tf_valid && info.tf_success &&
-		    info.tf_start == start && info.tf_len == len &&
-		    info.tf_minlen == minlen) {
-			/* Avoid sending duplicated trim to a shared device */
-			mlog(ML_NOTICE, "The same trim on device (%s) was "
-			     "just done from node (%u), return.\n",
-			     osb->dev_str, info.tf_nodenum);
-			range->len = info.tf_trimlen;
-			goto out_trimunlock;
-		}
+		/*
+		 * Determine first and last group to examine based on
+		 * start and len
+		 */
+		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+		if (first_group == osb->first_cluster_group_blkno)
+			first_bit = start;
+		else
+			first_bit = start - ocfs2_blocks_to_clusters(sb,
+								first_group);
+		last_group = ocfs2_which_cluster_group(main_bm_inode,
+						       start + len - 1);
+		group = first_group;
 	}
 
-	info.tf_nodenum = osb->node_num;
-	info.tf_start = start;
-	info.tf_len = len;
-	info.tf_minlen = minlen;
-
-	/* Determine first and last group to examine based on start and len */
-	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
-	if (first_group == osb->first_cluster_group_blkno)
-		first_bit = start;
-	else
-		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
-	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
-	last_bit = osb->bitmap_cpg;
-
-	trimmed = 0;
-	for (group = first_group; group <= last_group;) {
+	do {
 		if (first_bit + len >= osb->bitmap_cpg)
 			last_bit = osb->bitmap_cpg;
 		else
@@ -7659,21 +7632,81 @@  int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
 		else
 			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
-	}
-	range->len = trimmed * sb->s_blocksize;
+	} while (0);
 
-	info.tf_trimlen = range->len;
-	info.tf_success = (ret ? 0 : 1);
-	pinfo = &info;
-out_trimunlock:
-	ocfs2_trim_fs_unlock(osb, pinfo);
-	ocfs2_trim_fs_lock_res_uninit(osb);
 out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 0);
 	brelse(main_bm_bh);
+	main_bm_bh = NULL;
 out_mutex:
 	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
+
+	/*
+	 * If all the groups trim are not done or failed, but we should release
+	 * main_bm related locks for avoiding the current IO starve, then go to
+	 * trim the next group
+	 */
+	if (ret >= 0 && group <= last_group)
+		goto next_group;
 out:
+	range->len = trimmed * sb->s_blocksize;
+	return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+	ocfs2_trim_fs_lock_res_init(osb);
+
+	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+	if (ret < 0) {
+		if (ret != -EAGAIN) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			return ret;
+		}
+
+		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+		     "finish, which is running from another node.\n",
+		     osb->dev_str);
+		ret = ocfs2_trim_fs_lock(osb, &info, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			return ret;
+		}
+
+		if (info.tf_valid && info.tf_success &&
+		    info.tf_start == range->start &&
+		    info.tf_len == range->len &&
+		    info.tf_minlen == range->minlen) {
+			/* Avoid sending duplicated trim to a shared device */
+			mlog(ML_NOTICE, "The same trim on device (%s) was "
+			     "just done from node (%u), return.\n",
+			     osb->dev_str, info.tf_nodenum);
+			range->len = info.tf_trimlen;
+			goto out;
+		}
+	}
+
+	info.tf_nodenum = osb->node_num;
+	info.tf_start = range->start;
+	info.tf_len = range->len;
+	info.tf_minlen = range->minlen;
+
+	ret = ocfs2_trim_mainbm(sb, range);
+
+	info.tf_trimlen = range->len;
+	info.tf_success = (ret < 0 ? 0 : 1);
+	pinfo = &info;
+out:
+	ocfs2_trim_fs_unlock(osb, pinfo);
+	ocfs2_trim_fs_lock_res_uninit(osb);
 	return ret;
 }
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@  void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
 
+	/* Only one trimfs thread are allowed to work at the same time. */
+	mutex_lock(&osb->obs_trim_fs_mutex);
+
 	ocfs2_lock_res_init_once(lockres);
 	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
 	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@  void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
 
 	ocfs2_simple_drop_lockres(osb, lockres);
 	ocfs2_lock_res_free(lockres);
+
+	mutex_unlock(&osb->obs_trim_fs_mutex);
 }
 
 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@  struct ocfs2_super
 	struct ocfs2_lock_res osb_rename_lockres;
 	struct ocfs2_lock_res osb_nfs_sync_lockres;
 	struct ocfs2_lock_res osb_trim_fs_lockres;
+	struct mutex obs_trim_fs_mutex;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@  TRACE_EVENT(ocfs2_trim_extent,
 
 DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
 
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
 DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
 
 /* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@  static int ocfs2_mount_volume(struct super_block *sb)
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
+	mutex_init(&osb->obs_trim_fs_mutex);
+
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);