[v6,10/19] gfs2: Introduce flag for glock holder auto-demotion

Message ID 20210819194102.1491495-11-agruenba@redhat.com (mailing list archive)
State New, archived
Series gfs2: Fix mmap + page fault deadlocks

Commit Message

Andreas Gruenbacher Aug. 19, 2021, 7:40 p.m. UTC
From: Bob Peterson <rpeterso@redhat.com>

This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure that
will allow glocks to be demoted automatically on locking conflicts.
When a locking request comes in that isn't compatible with the locking
state of a holder and that holder has the HIF_MAY_DEMOTE flag set, the
holder will be demoted automatically before the incoming locking request
is granted.

Processes that allow a glock holder to be taken away indicate this by
calling gfs2_holder_allow_demote().  When they need the glock again,
they call gfs2_holder_disallow_demote() and then they check if the
holder is still queued: if it is, they're still holding the glock; if it
isn't, they need to re-acquire the glock.

This allows processes to hang on to locks that could become part of a
cyclic locking dependency.  The locks will be given up when a (rare)
conflicting locking request occurs, and don't need to be given up
prematurely.
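
For illustration, a user of this mechanism roughly follows this pattern
(sketch only; gfs2_holder_queued() is the existing helper for checking
whether a holder is still queued):

	gfs2_holder_allow_demote(gh);
	/* ... do work during which the glock may be taken away from us,
	 * for example faulting in user pages ... */
	gfs2_holder_disallow_demote(gh);
	if (!gfs2_holder_queued(gh)) {
		/* The holder was demoted and dequeued behind our back:
		 * either re-acquire the glock or return a short result. */
		error = gfs2_glock_nq(gh);
		if (error)
			goto out;
	}
	/* Still (or again) holding the glock at this point. */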

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c  | 221 +++++++++++++++++++++++++++++++++++++++--------
 fs/gfs2/glock.h  |  20 +++++
 fs/gfs2/incore.h |   1 +
 3 files changed, 206 insertions(+), 36 deletions(-)

Comments

Steven Whitehouse Aug. 20, 2021, 9:35 a.m. UTC | #1
Hi,

On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
> From: Bob Peterson <rpeterso@redhat.com>
> 
> This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure
> that
> will allow glocks to be demoted automatically on locking conflicts.
> When a locking request comes in that isn't compatible with the
> locking
> state of a holder and that holder has the HIF_MAY_DEMOTE flag set,
> the
> holder will be demoted automatically before the incoming locking
> request
> is granted.
> 
I'm not sure I understand what is going on here. When there are locking
conflicts we generate call backs and those result in glock demotion.
There is no need for a flag to indicate that I think, since it is the
default behaviour anyway. Or perhaps the explanation is just a bit
confusing...

> Processes that allow a glock holder to be taken away indicate this by
> calling gfs2_holder_allow_demote().  When they need the glock again,
> they call gfs2_holder_disallow_demote() and then they check if the
> holder is still queued: if it is, they're still holding the glock; if
> it
> isn't, they need to re-acquire the glock.
> 
> This allows processes to hang on to locks that could become part of a
> cyclic locking dependency.  The locks will be given up when a (rare)
> conflicting locking request occurs, and don't need to be given up
> prematurely.
This seems backwards to me. We already have the glock layer cache the
locks until they are required by another node. We also have the min
hold time to make sure that we don't bounce locks too much. So what is
the problem that you are trying to solve here I wonder?

> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
>  fs/gfs2/glock.c  | 221 +++++++++++++++++++++++++++++++++++++++----
> ----
>  fs/gfs2/glock.h  |  20 +++++
>  fs/gfs2/incore.h |   1 +
>  3 files changed, 206 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index f24db2ececfb..d1b06a09ce2f 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -58,6 +58,7 @@ struct gfs2_glock_iter {
>  typedef void (*glock_examiner) (struct gfs2_glock * gl);
>  
>  static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh,
> unsigned int target);
> +static void __gfs2_glock_dq(struct gfs2_holder *gh);
>  
>  static struct dentry *gfs2_root;
>  static struct workqueue_struct *glock_workqueue;
> @@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock
> *gl)
>  
>  	if (gl->gl_state == LM_ST_UNLOCKED)
>  		return 0;
> +	/*
> +	 * Note that demote_ok is used for the lru process of disposing
> of
> +	 * glocks. For this purpose, we don't care if the glock's
> holders
> +	 * have the HIF_MAY_DEMOTE flag set or not. If someone is using
> +	 * them, don't demote.
> +	 */
>  	if (!list_empty(&gl->gl_holders))
>  		return 0;
>  	if (glops->go_demote_ok)
> @@ -379,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const
> int ret)
>  	struct gfs2_holder *gh, *tmp;
>  
>  	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
> -		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
> +		if (!test_bit(HIF_WAIT, &gh->gh_iflags))
>  			continue;
>  		if (ret & LM_OUT_ERROR)
>  			gh->gh_error = -EIO;
> @@ -393,6 +400,40 @@ static void do_error(struct gfs2_glock *gl,
> const int ret)
>  	}
>  }
>  
> +/**
> + * demote_incompat_holders - demote incompatible demoteable holders
> + * @gl: the glock we want to promote
> + * @new_gh: the new holder to be promoted
> + */
> +static void demote_incompat_holders(struct gfs2_glock *gl,
> +				    struct gfs2_holder *new_gh)
> +{
> +	struct gfs2_holder *gh;
> +
> +	/*
> +	 * Demote incompatible holders before we make ourselves
> eligible.
> +	 * (This holder may or may not allow auto-demoting, but we
> don't want
> +	 * to demote the new holder before it's even granted.)
> +	 */
> +	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
> +		/*
> +		 * Since holders are at the front of the list, we stop
> when we
> +		 * find the first non-holder.
> +		 */
> +		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
> +			return;
> +		if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
> +		    !may_grant(gl, new_gh, gh)) {
> +			/*
> +			 * We should not recurse into do_promote
> because
> +			 * __gfs2_glock_dq only calls handle_callback,
> +			 * gfs2_glock_add_to_lru and
> __gfs2_glock_queue_work.
> +			 */
> +			__gfs2_glock_dq(gh);
> +		}
> +	}
> +}
> +
>  /**
>   * find_first_holder - find the first "holder" gh
>   * @gl: the glock
> @@ -411,6 +452,26 @@ static inline struct gfs2_holder
> *find_first_holder(const struct gfs2_glock *gl)
>  	return NULL;
>  }
>  
> +/**
> + * find_first_strong_holder - find the first non-demoteable holder
> + * @gl: the glock
> + *
> + * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag
> set.
> + */
> +static inline struct gfs2_holder
> +*find_first_strong_holder(struct gfs2_glock *gl)
> +{
> +	struct gfs2_holder *gh;
> +
> +	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
> +		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
> +			return NULL;
> +		if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
> +			return gh;
> +	}
> +	return NULL;
> +}
> +
>  /**
>   * do_promote - promote as many requests as possible on the current
> queue
>   * @gl: The glock
> @@ -425,15 +486,27 @@ __acquires(&gl->gl_lockref.lock)
>  {
>  	const struct gfs2_glock_operations *glops = gl->gl_ops;
>  	struct gfs2_holder *gh, *tmp, *first_gh;
> +	bool incompat_holders_demoted = false;
>  	int ret;
>  
> -	first_gh = find_first_holder(gl);
> +	first_gh = find_first_strong_holder(gl);
>  
>  restart:
>  	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
> -		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
> +		if (!test_bit(HIF_WAIT, &gh->gh_iflags))
>  			continue;
>  		if (may_grant(gl, first_gh, gh)) {
> +			if (!incompat_holders_demoted) {
> +				demote_incompat_holders(gl, first_gh);
> +				incompat_holders_demoted = true;
> +				first_gh = gh;
> +			}
> +			/*
> +			 * The first holder (and only the first holder)
> on the
> +			 * list to be promoted needs to call the
> go_lock
> +			 * function. This does things like
> inode_refresh
> +			 * to read an inode from disk.
> +			 */
>  			if (gh->gh_list.prev == &gl->gl_holders &&
>  			    glops->go_lock) {
>  				spin_unlock(&gl->gl_lockref.lock);
> @@ -459,6 +532,11 @@ __acquires(&gl->gl_lockref.lock)
>  			gfs2_holder_wake(gh);
>  			continue;
>  		}
> +		/*
> +		 * If we get here, it means we may not grant this
> holder for
> +		 * some reason. If this holder is the head of the list,
> it
> +		 * means we have a blocked holder at the head, so
> return 1.
> +		 */
>  		if (gh->gh_list.prev == &gl->gl_holders)
>  			return 1;
>  		do_error(gl, 0);
> @@ -1373,7 +1451,7 @@ __acquires(&gl->gl_lockref.lock)
>  		if (test_bit(GLF_LOCK, &gl->gl_flags)) {
>  			struct gfs2_holder *first_gh;
>  
> -			first_gh = find_first_holder(gl);
> +			first_gh = find_first_strong_holder(gl);
>  			try_futile = !may_grant(gl, first_gh, gh);
>  		}
>  		if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl-
> >gl_flags))
> @@ -1382,7 +1460,8 @@ __acquires(&gl->gl_lockref.lock)
>  
>  	list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
>  		if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
> -		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
> +		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
> +		    !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
>  			goto trap_recursive;
>  		if (try_futile &&
>  		    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
> {
> @@ -1478,51 +1557,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
>  	return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
>  }
>  
> -/**
> - * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock
> (release a glock)
> - * @gh: the glock holder
> - *
> - */
> +static inline bool needs_demote(struct gfs2_glock *gl)
> +{
> +	return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
> +		test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
> +}
>  
> -void gfs2_glock_dq(struct gfs2_holder *gh)
> +static void __gfs2_glock_dq(struct gfs2_holder *gh)
>  {
>  	struct gfs2_glock *gl = gh->gh_gl;
>  	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
>  	unsigned delay = 0;
>  	int fast_path = 0;
>  
> -	spin_lock(&gl->gl_lockref.lock);
>  	/*
> -	 * If we're in the process of file system withdraw, we cannot
> just
> -	 * dequeue any glocks until our journal is recovered, lest we
> -	 * introduce file system corruption. We need two exceptions to
> this
> -	 * rule: We need to allow unlocking of nondisk glocks and the
> glock
> -	 * for our own journal that needs recovery.
> +	 * This while loop is similar to function
> demote_incompat_holders:
> +	 * If the glock is due to be demoted (which may be from another
> node
> +	 * or even if this holder is GL_NOCACHE), the weak holders are
> +	 * demoted as well, allowing the glock to be demoted.
>  	 */
> -	if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
> -	    glock_blocked_by_withdraw(gl) &&
> -	    gh->gh_gl != sdp->sd_jinode_gl) {
> -		sdp->sd_glock_dqs_held++;
> -		spin_unlock(&gl->gl_lockref.lock);
> -		might_sleep();
> -		wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
> -			    TASK_UNINTERRUPTIBLE);
> -		spin_lock(&gl->gl_lockref.lock);
> -	}
> -	if (gh->gh_flags & GL_NOCACHE)
> -		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
> +	while (gh) {
> +		/*
> +		 * If we're in the process of file system withdraw, we
> cannot
> +		 * just dequeue any glocks until our journal is
> recovered, lest
> +		 * we introduce file system corruption. We need two
> exceptions
> +		 * to this rule: We need to allow unlocking of nondisk
> glocks
> +		 * and the glock for our own journal that needs
> recovery.
> +		 */
> +		if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
> +		    glock_blocked_by_withdraw(gl) &&
> +		    gh->gh_gl != sdp->sd_jinode_gl) {
> +			sdp->sd_glock_dqs_held++;
> +			spin_unlock(&gl->gl_lockref.lock);
> +			might_sleep();
> +			wait_on_bit(&sdp->sd_flags,
> SDF_WITHDRAW_RECOVERY,
> +				    TASK_UNINTERRUPTIBLE);
> +			spin_lock(&gl->gl_lockref.lock);
> +		}
> +
> +		/*
> +		 * This holder should not be cached, so mark it for
> demote.
> +		 * Note: this should be done before the check for
> needs_demote
> +		 * below.
> +		 */
> +		if (gh->gh_flags & GL_NOCACHE)
> +			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
>  
> -	list_del_init(&gh->gh_list);
> -	clear_bit(HIF_HOLDER, &gh->gh_iflags);
> -	if (list_empty(&gl->gl_holders) &&
> -	    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
> -	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
> -		fast_path = 1;
> +		list_del_init(&gh->gh_list);
> +		clear_bit(HIF_HOLDER, &gh->gh_iflags);
> +		trace_gfs2_glock_queue(gh, 0);
> +
> +		/*
> +		 * If there hasn't been a demote request we are done.
> +		 * (Let the remaining holders, if any, keep holding
> it.)
> +		 */
> +		if (!needs_demote(gl)) {
> +			if (list_empty(&gl->gl_holders))
> +				fast_path = 1;
> +			break;
> +		}
> +		/*
> +		 * If we have another strong holder (we cannot auto-
> demote)
> +		 * we are done. It keeps holding it until it is done.
> +		 */
> +		if (find_first_strong_holder(gl))
> +			break;
> +
> +		/*
> +		 * If we have a weak holder at the head of the list, it
> +		 * (and all others like it) must be auto-demoted. If
> there
> +		 * are no more weak holders, we exit the while loop.
> +		 */
> +		gh = find_first_holder(gl);
> +	}
>  
>  	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
>  		gfs2_glock_add_to_lru(gl);
>  
> -	trace_gfs2_glock_queue(gh, 0);
>  	if (unlikely(!fast_path)) {
>  		gl->gl_lockref.count++;
>  		if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
> @@ -1531,6 +1642,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
>  			delay = gl->gl_hold_time;
>  		__gfs2_glock_queue_work(gl, delay);
>  	}
> +}
> +
> +/**
> + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock
> (release a glock)
> + * @gh: the glock holder
> + *
> + */
> +void gfs2_glock_dq(struct gfs2_holder *gh)
> +{
> +	struct gfs2_glock *gl = gh->gh_gl;
> +
> +	spin_lock(&gl->gl_lockref.lock);
> +	__gfs2_glock_dq(gh);
>  	spin_unlock(&gl->gl_lockref.lock);
>  }
>  
> @@ -1693,6 +1817,7 @@ void gfs2_glock_dq_m(unsigned int num_gh,
> struct gfs2_holder *ghs)
>  
>  void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
>  {
> +	struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state,
> };
>  	unsigned long delay = 0;
>  	unsigned long holdtime;
>  	unsigned long now = jiffies;
> @@ -1707,6 +1832,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl,
> unsigned int state)
>  		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
>  			delay = gl->gl_hold_time;
>  	}
> +	/*
> +	 * Note 1: We cannot call demote_incompat_holders from
> handle_callback
> +	 * or gfs2_set_demote due to recursion problems like:
> gfs2_glock_dq ->
> +	 * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
> +	 * Plus, we only want to demote the holders if the request
> comes from
> +	 * a remote cluster node because local holder conflicts are
> resolved
> +	 * elsewhere.
> +	 *
> +	 * Note 2: if a remote node wants this glock in EX mode,
> lock_dlm will
> +	 * request that we set our state to UNLOCKED. Here we mock up a
> holder
> +	 * to make it look like someone wants the lock EX locally. Any
> SH
> +	 * and DF requests should be able to share the lock without
> demoting.
> +	 *
> +	 * Note 3: We only want to demote the demoteable holders when
> there
> +	 * are no more strong holders. The demoteable holders might as
> well
> +	 * keep the glock until the last strong holder is done with it.
> +	 */
> +	if (!find_first_strong_holder(gl)) {
> +		if (state == LM_ST_UNLOCKED)
> +			mock_gh.gh_state = LM_ST_EXCLUSIVE;
> +		demote_incompat_holders(gl, &mock_gh);
> +	}
>  	handle_callback(gl, state, delay, true);
>  	__gfs2_glock_queue_work(gl, delay);
>  	spin_unlock(&gl->gl_lockref.lock);
> @@ -2096,6 +2243,8 @@ static const char *hflags2str(char *buf, u16
> flags, unsigned long iflags)
>  		*p++ = 'H';
>  	if (test_bit(HIF_WAIT, &iflags))
>  		*p++ = 'W';
> +	if (test_bit(HIF_MAY_DEMOTE, &iflags))
> +		*p++ = 'D';
>  	*p = 0;
>  	return buf;
>  }
> diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
> index 31a8f2f649b5..9012487da4c6 100644
> --- a/fs/gfs2/glock.h
> +++ b/fs/gfs2/glock.h
> @@ -150,6 +150,8 @@ static inline struct gfs2_holder
> *gfs2_glock_is_locked_by_me(struct gfs2_glock *
>  	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
>  		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
>  			break;
> +		if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
> +			continue;
>  		if (gh->gh_owner_pid == pid)
>  			goto out;
>  	}
> @@ -325,6 +327,24 @@ static inline void glock_clear_object(struct
> gfs2_glock *gl, void *object)
>  	spin_unlock(&gl->gl_lockref.lock);
>  }
>  
> +static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
> +{
> +	struct gfs2_glock *gl = gh->gh_gl;
> +
> +	spin_lock(&gl->gl_lockref.lock);
> +	set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
> +	spin_unlock(&gl->gl_lockref.lock);
> +}
> +
> +static inline void gfs2_holder_disallow_demote(struct gfs2_holder
> *gh)
> +{
> +	struct gfs2_glock *gl = gh->gh_gl;
> +
> +	spin_lock(&gl->gl_lockref.lock);
> +	clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
> +	spin_unlock(&gl->gl_lockref.lock);
> +}
> +
This looks a bit strange... bit operations are atomic anyway, so why do
we need that spinlock here?

Steve.

>  extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64
> generation);
>  extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64
> generation);
>  
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 5c6b985254aa..e73a81db0714 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -252,6 +252,7 @@ struct gfs2_lkstats {
>  
>  enum {
>  	/* States */
> +	HIF_MAY_DEMOTE		= 1,
>  	HIF_HOLDER		= 6,  /* Set for gh that "holds" the
> glock */
>  	HIF_WAIT		= 10,
>  };
Bob Peterson Aug. 20, 2021, 1:11 p.m. UTC | #2
On 8/20/21 4:35 AM, Steven Whitehouse wrote:
> Hi,
> 
> On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
>> From: Bob Peterson <rpeterso@redhat.com>
>>
>> This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure
>> that
>> will allow glocks to be demoted automatically on locking conflicts.
>> When a locking request comes in that isn't compatible with the
>> locking
>> state of a holder and that holder has the HIF_MAY_DEMOTE flag set,
>> the
>> holder will be demoted automatically before the incoming locking
>> request
>> is granted.
>>
> I'm not sure I understand what is going on here. When there are locking
> conflicts we generate call backs and those result in glock demotion.
> There is no need for a flag to indicate that I think, since it is the
> default behaviour anyway. Or perhaps the explanation is just a bit
> confusing...

I agree that the whole concept and explanation are confusing. Andreas 
and I went through several heated arguments about the semantics,
comments, patch descriptions, etc. We played around with many different 
flag name ideas, etc. We did not agree on the best way to describe the 
whole concept. He didn't like my explanation and I didn't like his. So 
yes, it is confusing.

My preferred terminology was "DOD" or "Dequeue On Demand", which makes
the concept more understandable to me. So basically a process can say
"I need to hold this glock for an unknown and possibly lengthy period
of time, but please feel free to dequeue it if it's in your way."
And bear in mind that several processes may do the same, simultaneously.

You can almost think of this as a performance enhancement. This concept 
allows a process to hold a glock for much longer periods of time, at a 
lower priority, for example, when gfs2_file_read_iter needs to hold the 
glock for very long-running iterative reads.

The process requesting a holder with "Dequeue On Demand" must then
determine whether its holder has been stolen away (dequeued on demand)
after its lengthy operation and, if so, pick up the pieces of where it
left off.

Meanwhile, another process may need to hold the glock. If its requested 
mode is compatible, say SH and SH, the lock is simply granted with no 
further delay. If the mode is incompatible, regardless of whether it's 
on the local node or a different node in the cluster, these 
longer-term/lower-priority holders may be dequeued or preempted by
another request to hold the glock. Note that although these holders are 
dequeued-on-demand, they are never "uninitted" as part of the process. 
Nor must they ever be, since they may be on another process's heap.

This differs from the normal glock demote process, in which the demote
bit is set (a request that the glock be demoted) but the demote still
has to wait until the holder does its actual dequeue.

>> Processes that allow a glock holder to be taken away indicate this by
>> calling gfs2_holder_allow_demote().  When they need the glock again,
>> they call gfs2_holder_disallow_demote() and then they check if the
>> holder is still queued: if it is, they're still holding the glock; if
>> it
>> isn't, they need to re-acquire the glock.
>>
>> This allows processes to hang on to locks that could become part of a
>> cyclic locking dependency.  The locks will be given up when a (rare)
>> conflicting locking request occurs, and don't need to be given up
>> prematurely.
> This seems backwards to me. We already have the glock layer cache the
> locks until they are required by another node. We also have the min
> hold time to make sure that we don't bounce locks too much. So what is
> the problem that you are trying to solve here I wonder?

Again, this simply allows preemption of lengthy/low-priority holders,
whereas the normal demote process will only demote when the glock is
dequeued, after this potentially very long period of time.

The minimum hold time solves a different problem, and Andreas and I 
talked just yesterday about possibly revisiting how that all works. The 
problem with minimum hold time is that in many cases the glock state 
machine does not want to grant new holders if the demote bit is on, so 
it ends up wasting more time than solving the actual problem.
But that's another problem for another day.

Regards,

Bob Peterson
Andreas Gruenbacher Aug. 20, 2021, 1:17 p.m. UTC | #3
On Fri, Aug 20, 2021 at 11:35 AM Steven Whitehouse <swhiteho@redhat.com> wrote:
> On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
> > From: Bob Peterson <rpeterso@redhat.com>
> >
> > This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure
> > that will allow glocks to be demoted automatically on locking conflicts.
> > When a locking request comes in that isn't compatible with the locking
> > state of a holder and that holder has the HIF_MAY_DEMOTE flag set, the
> > holder will be demoted automatically before the incoming locking request
> > is granted.
>
> I'm not sure I understand what is going on here. When there are locking
> conflicts we generate call backs and those result in glock demotion.
> There is no need for a flag to indicate that I think, since it is the
> default behaviour anyway. Or perhaps the explanation is just a bit
> confusing...

When a glock has active holders (with the HIF_HOLDER flag set), the
glock won't be demoted to a state incompatible with any of those
holders.

> > Processes that allow a glock holder to be taken away indicate this by
> > calling gfs2_holder_allow_demote().  When they need the glock again,
> > they call gfs2_holder_disallow_demote() and then they check if the
> > holder is still queued: if it is, they're still holding the glock; if
> > it isn't, they need to re-acquire the glock.
> >
> > This allows processes to hang on to locks that could become part of a
> > cyclic locking dependency.  The locks will be given up when a (rare)
> > conflicting locking request occurs, and don't need to be given up
> > prematurely.
>
> This seems backwards to me. We already have the glock layer cache the
> locks until they are required by another node. We also have the min
> hold time to make sure that we don't bounce locks too much. So what is
> the problem that you are trying to solve here I wonder?

This solves the problem of faulting in pages during read and write
operations: on the one hand, we want to hold the inode glock across
those operations. On the other hand, those operations may fault in
pages, which may require taking the same or other inode glocks,
directly or indirectly, which can deadlock.

So before we fault in pages, we indicate with
gfs2_holder_allow_demote(gh) that we can cope if the glock is taken
away from us. After faulting in the pages, we indicate with
gfs2_holder_disallow_demote(gh) that we now actually need the glock
again. At that point, we either still have the glock (i.e., the holder
is still queued and it has the HIF_HOLDER flag set), or we don't.

The different kinds of read and write operations differ in how they
handle the latter case:

 * When a buffered read or write loses the inode glock, it returns a short
   result. This prevents torn writes and reading things that have never
   existed on disk in that form.

 * When a direct read or write loses the inode glock, it re-acquires it
   before resuming the operation. Direct I/O is not expected to return
   partial results and doesn't provide any kind of synchronization among
   processes.
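
In code terms, both cases perform the same check after faulting in pages;
only the reaction to a lost glock differs (rough sketch with simplified
names; the actual changes are in the other patches of this series):

	gfs2_holder_disallow_demote(gh);
	if (!gfs2_holder_queued(gh)) {
		/* Buffered I/O: stop here and return a short result. */
		if (!direct)
			return copied;
		/* Direct I/O: re-acquire the glock and resume. */
		ret = gfs2_glock_nq(gh);
		if (ret)
			return ret;
	}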

We could solve this kind of problem in other ways, for example, by
keeping a glock generation number, dropping the glock before faulting
in pages, re-acquiring it afterwards, and checking if the generation
number has changed. This would still be an additional piece of glock
infrastructure, but more heavyweight than the HIF_MAY_DEMOTE flag
which uses the existing glock holder infrastructure.

> > [snip]
> >
> > +static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
> > +{
> > +     struct gfs2_glock *gl = gh->gh_gl;
> > +
> > +     spin_lock(&gl->gl_lockref.lock);
> > +     set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
> > +     spin_unlock(&gl->gl_lockref.lock);
> > +}
> > +
> > +static inline void gfs2_holder_disallow_demote(struct gfs2_holder
> > *gh)
> > +{
> > +     struct gfs2_glock *gl = gh->gh_gl;
> > +
> > +     spin_lock(&gl->gl_lockref.lock);
> > +     clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
> > +     spin_unlock(&gl->gl_lockref.lock);
> > +}
> > +
>
> This looks a bit strange... bit operations are atomic anyway, so why do
> we need that spinlock here?

This is about making sure that the glock state engine will make
consistent decisions. Currently, those decisions are made under that
spin lock. We could set the HIF_MAY_DEMOTE flag followed by a memory
barrier and the glock state engine would *probably* still make the
right decisions most of the time, but that's not easy to ensure
anymore.

We surely want to prevent the glock state engine from making changes
while clearing the flag, though.

> Steve.
>
> >  extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64
> > generation);
> >  extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64
> > generation);
> >
> > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> > index 5c6b985254aa..e73a81db0714 100644
> > --- a/fs/gfs2/incore.h
> > +++ b/fs/gfs2/incore.h
> > @@ -252,6 +252,7 @@ struct gfs2_lkstats {
> >
> >  enum {
> >       /* States */
> > +     HIF_MAY_DEMOTE          = 1,
> >       HIF_HOLDER              = 6,  /* Set for gh that "holds" the
> > glock */
> >       HIF_WAIT                = 10,
> >  };
>

Thanks,
Andreas
Steven Whitehouse Aug. 20, 2021, 1:41 p.m. UTC | #4
Hi,

On Fri, 2021-08-20 at 08:11 -0500, Bob Peterson wrote:
> On 8/20/21 4:35 AM, Steven Whitehouse wrote:
> > Hi,
> > 
> > On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
> > > From: Bob Peterson <rpeterso@redhat.com>
> > > 
> > > This patch introduces a new HIF_MAY_DEMOTE flag and
> > > infrastructure
> > > that
> > > will allow glocks to be demoted automatically on locking
> > > conflicts.
> > > When a locking request comes in that isn't compatible with the
> > > locking
> > > state of a holder and that holder has the HIF_MAY_DEMOTE flag
> > > set,
> > > the
> > > holder will be demoted automatically before the incoming locking
> > > request
> > > is granted.
> > > 
> > I'm not sure I understand what is going on here. When there are
> > locking
> > conflicts we generate call backs and those result in glock
> > demotion.
> > There is no need for a flag to indicate that I think, since it is
> > the
> > default behaviour anyway. Or perhaps the explanation is just a bit
> > confusing...
> 
> I agree that the whole concept and explanation are confusing.
> Andreas 
> and I went through several heated arguments about the symantics, 
> comments, patch descriptions, etc. We played around with many
> different 
> flag name ideas, etc. We did not agree on the best way to describe
> the 
> whole concept. He didn't like my explanation and I didn't like his.
> So 
> yes, it is confusing.
> 
That seems to be a good reason to take a step back and look at this a
bit more closely. If we are finding this confusing, then someone else
looking at it at a future date, who may not be steeped in GFS2
knowledge, is likely to find it almost impossible.

So at least the description needs some work here, I think, to make it
much clearer what the overall aim is. It would be good to start with a
statement of the problem that it is trying to solve, which Andreas has
hinted at in his reply just now,

Steve.
Steven Whitehouse Aug. 20, 2021, 1:47 p.m. UTC | #5
Hi,

On Fri, 2021-08-20 at 15:17 +0200, Andreas Gruenbacher wrote:
> On Fri, Aug 20, 2021 at 11:35 AM Steven Whitehouse <
> swhiteho@redhat.com> wrote:
> > On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
> > > From: Bob Peterson <rpeterso@redhat.com>
> > > 
> > > This patch introduces a new HIF_MAY_DEMOTE flag and
> > > infrastructure
> > > that will allow glocks to be demoted automatically on locking
> > > conflicts.
> > > When a locking request comes in that isn't compatible with the
> > > locking
> > > state of a holder and that holder has the HIF_MAY_DEMOTE flag
> > > set, the
> > > holder will be demoted automatically before the incoming locking
> > > request
> > > is granted.
> > 
> > I'm not sure I understand what is going on here. When there are
> > locking
> > conflicts we generate call backs and those result in glock
> > demotion.
> > There is no need for a flag to indicate that I think, since it is
> > the
> > default behaviour anyway. Or perhaps the explanation is just a bit
> > confusing...
> 
> When a glock has active holders (with the HIF_HOLDER flag set), the
> glock won't be demoted to a state incompatible with any of those
> holders.
> 
Ok, that is a much clearer explanation of what the patch does. Active
holders have always prevented demotions previously.

> > > Processes that allow a glock holder to be taken away indicate
> > > this by
> > > calling gfs2_holder_allow_demote().  When they need the glock
> > > again,
> > > they call gfs2_holder_disallow_demote() and then they check if
> > > the
> > > holder is still queued: if it is, they're still holding the
> > > glock; if
> > > it isn't, they need to re-acquire the glock.
> > > 
> > > This allows processes to hang on to locks that could become part
> > > of a
> > > cyclic locking dependency.  The locks will be given up when a
> > > (rare)
> > > conflicting locking request occurs, and don't need to be given up
> > > prematurely.
> > 
> > This seems backwards to me. We already have the glock layer cache
> > the
> > locks until they are required by another node. We also have the min
> > hold time to make sure that we don't bounce locks too much. So what
> > is
> > the problem that you are trying to solve here I wonder?
> 
> This solves the problem of faulting in pages during read and write
> operations: on the one hand, we want to hold the inode glock across
> those operations. On the other hand, those operations may fault in
> pages, which may require taking the same or other inode glocks,
> directly or indirectly, which can deadlock.
> 
> So before we fault in pages, we indicate with
> gfs2_holder_allow_demote(gh) that we can cope if the glock is taken
> away from us. After faulting in the pages, we indicate with
> gfs2_holder_disallow_demote(gh) that we now actually need the glock
> again. At that point, we either still have the glock (i.e., the
> holder
> is still queued and it has the HIF_HOLDER flag set), or we don't.
> 
> The different kinds of read and write operations differ in how they
> handle the latter case:
> 
>  * When a buffered read or write loses the inode glock, it returns a
>    short result. This prevents torn writes and reading things that have
>    never existed on disk in that form.
>
>  * When a direct read or write loses the inode glock, it re-acquires it
>    before resuming the operation. Direct I/O is not expected to return
>    partial results and doesn't provide any kind of synchronization among
>    processes.
> 
> We could solve this kind of problem in other ways, for example, by
> keeping a glock generation number, dropping the glock before faulting
> in pages, re-acquiring it afterwards, and checking if the generation
> number has changed. This would still be an additional piece of glock
> infrastructure, but more heavyweight than the HIF_MAY_DEMOTE flag
> which uses the existing glock holder infrastructure.

This is working towards the "why" but could probably be summarised a
bit more. We always used to manage to avoid holding fs locks when
copying to/from userspace to avoid these complications. If that is no
longer possible then it would be good to document what the new
expectations are somewhere suitable in Documentation/filesystems/...
just so we make sure it is clear what the new system is, and everyone
will be on the same page,

Steve.
Andreas Grünbacher Aug. 20, 2021, 2:43 p.m. UTC | #6
On Fri, Aug 20, 2021 at 15:49, Steven Whitehouse
<swhiteho@redhat.com> wrote:
> We always used to manage to avoid holding fs locks when copying to/from userspace
> to avoid these complications.

I realize the intent, but that goal has never actually been achieved.
Direct I/O has *always* been calling get_user_pages() while holding
the inode glock in deferred mode.

The situation is slightly different for buffered I/O, but we have the
same problem there at least since switching to iomap. (And it's a
fundamental property of iomap that we want to hold the inode glock
across multi-page mappings, not an implementation deficit.)

Here's a slightly outdated version of a test case that makes all
versions of gfs2 prior to this patch queue unhappy:
https://lore.kernel.org/fstests/20210531152604.240462-1-agruenba@redhat.com/

Thanks,
Andreas
Andreas Gruenbacher Aug. 20, 2021, 3:22 p.m. UTC | #7
On Fri, Aug 20, 2021 at 3:11 PM Bob Peterson <rpeterso@redhat.com> wrote:
> On 8/20/21 4:35 AM, Steven Whitehouse wrote:
> > Hi,
> >
> > On Thu, 2021-08-19 at 21:40 +0200, Andreas Gruenbacher wrote:
> >> From: Bob Peterson <rpeterso@redhat.com>
> >>
> >> This patch introduces a new HIF_MAY_DEMOTE flag and infrastructure
> >> that
> >> will allow glocks to be demoted automatically on locking conflicts.
> >> When a locking request comes in that isn't compatible with the
> >> locking
> >> state of a holder and that holder has the HIF_MAY_DEMOTE flag set,
> >> the
> >> holder will be demoted automatically before the incoming locking
> >> request
> >> is granted.
> >>
> > I'm not sure I understand what is going on here. When there are locking
> > conflicts we generate call backs and those result in glock demotion.
> > There is no need for a flag to indicate that I think, since it is the
> > default behaviour anyway. Or perhaps the explanation is just a bit
> > confusing...
>
> I agree that the whole concept and explanation are confusing. Andreas
> and I went through several heated arguments about the semantics,
> comments, patch descriptions, etc. We played around with many different
> flag name ideas, etc. We did not agree on the best way to describe the
> whole concept. He didn't like my explanation and I didn't like his. So
> yes, it is confusing.
>
> My preferred terminology was "DOD" or "Dequeue On Demand"

... which is useless because it adds no clarity as to whose demand
we're talking about.

> which makes
> the concept more understandable to me. So basically a process can say
> "I need to hold this glock, but for an unknown and possibly lengthy
> period of time, but please feel free to dequeue it if it's in your way."
> And bear in mind that several processes may do the same, simultaneously.
>
> You can almost think of this as a performance enhancement. This concept
> allows a process to hold a glock for much longer periods of time, at a
> lower priority, for example, when gfs2_file_read_iter needs to hold the
> glock for very long-running iterative reads.

Consider a process that allocates a somewhat large buffer and reads
into it in chunks that are not page aligned. The buffer initially
won't be faulted in, so we fault in the first chunk and write into it.
Then, when reading the second chunk, we find that the first page of
the second chunk is already present. We fill it, set the
HIF_MAY_DEMOTE flag, fault in more pages, and clear the HIF_MAY_DEMOTE
flag. If we then still have the glock (which is very likely), we
resume the read. Otherwise, we return a short result.

Thanks,
Andreas
Steven Whitehouse Aug. 23, 2021, 8:14 a.m. UTC | #8
On Fri, 2021-08-20 at 17:22 +0200, Andreas Gruenbacher wrote:
> On Fri, Aug 20, 2021 at 3:11 PM Bob Peterson <rpeterso@redhat.com>
> wrote:
> > 
[snip]
> > 
> > You can almost think of this as a performance enhancement. This
> > concept
> > allows a process to hold a glock for much longer periods of time,
> > at a
> > lower priority, for example, when gfs2_file_read_iter needs to hold
> > the
> > glock for very long-running iterative reads.
> 
> Consider a process that allocates a somewhat large buffer and reads
> into it in chunks that are not page aligned. The buffer initially
> won't be faulted in, so we fault in the first chunk and write into
> it.
> Then, when reading the second chunk, we find that the first page of
> the second chunk is already present. We fill it, set the
> HIF_MAY_DEMOTE flag, fault in more pages, and clear the
> HIF_MAY_DEMOTE
> flag. If we then still have the glock (which is very likely), we
> resume the read. Otherwise, we return a short result.
> 
> Thanks,
> Andreas
> 

If the goal here is just to allow the glock to be held for a longer
period of time, but with occasional interruptions to prevent
starvation, then we have a potential model for this. There is
cond_resched_lock() which does this for spin locks. So perhaps we might
do something similar:

/**
 * gfs2_glock_cond_regain - Conditionally drop and regain glock
 * @gl: The glock
 * @gh: A granted holder for the glock
 *
 * If there is a pending demote request for this glock, drop and 
 * requeue a lock request for this glock. If there is no pending
 * demote request, this is a no-op. In either case the glock is
 * held on both entry and exit.
 *
 * Returns: 0 if no pending demote, 1 if lock dropped and regained
 */
int gfs2_glock_cond_regain(struct gfs2_glock *gl, struct gfs2_holder *gh);

That seems more easily understood, and clearly documents places where
the lock may be dropped and regained. I think that the implementation
should be simpler and cleaner, compared with the current proposed
patch. There are only two bit flags related to pending demotes, for
example, so the check should be trivial.
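
For instance, a rough sketch of the body (assuming the existing
GLF_DEMOTE / GLF_PENDING_DEMOTE flags and gfs2_glock_dq() /
gfs2_glock_nq()) might look like:

int gfs2_glock_cond_regain(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
	int error;

	if (!test_bit(GLF_DEMOTE, &gl->gl_flags) &&
	    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
		return 0;

	/* Drop the glock so that the pending demote can be serviced,
	 * then queue a fresh request for it. */
	gfs2_glock_dq(gh);
	error = gfs2_glock_nq(gh);
	return error ? error : 1;
}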

It may need a few changes depending on the exact circumstances, but
hopefully that illustrates the concept,

Steve.
Andreas Gruenbacher Aug. 23, 2021, 3:18 p.m. UTC | #9
On Mon, Aug 23, 2021 at 10:14 AM Steven Whitehouse <swhiteho@redhat.com> wrote:
> On Fri, 2021-08-20 at 17:22 +0200, Andreas Gruenbacher wrote:
> > On Fri, Aug 20, 2021 at 3:11 PM Bob Peterson <rpeterso@redhat.com>
> > wrote:
> > >
> [snip]
> > >
> > > You can almost think of this as a performance enhancement. This
> > > concept
> > > allows a process to hold a glock for much longer periods of time,
> > > at a
> > > lower priority, for example, when gfs2_file_read_iter needs to hold
> > > the
> > > glock for very long-running iterative reads.
> >
> > Consider a process that allocates a somewhat large buffer and reads
> > into it in chunks that are not page aligned. The buffer initially
> > won't be faulted in, so we fault in the first chunk and write into
> > it.
> > Then, when reading the second chunk, we find that the first page of
> > the second chunk is already present. We fill it, set the
> > HIF_MAY_DEMOTE flag, fault in more pages, and clear the
> > HIF_MAY_DEMOTE
> > flag. If we then still have the glock (which is very likely), we
> > resume the read. Otherwise, we return a short result.
> >
> > Thanks,
> > Andreas
> >
>
> If the goal here is just to allow the glock to be held for a longer
> period of time, but with occasional interruptions to prevent
> starvation, then we have a potential model for this. There is
> cond_resched_lock() which does this for spin locks.

This isn't an appropriate model for what I'm trying to achieve here.
In the cond_resched case, we know at the time of the cond_resched call
whether or not we want to schedule. If we do, we want to drop the spin
lock, schedule, and then re-acquire the spin lock. In the case we're
looking at here, we want to fault in user pages. There is no way of
knowing beforehand if the glock we're currently holding will have to
be dropped to achieve that. In fact, it will almost never have to be
dropped. But if it does, we need to drop it straight away to allow the
conflicting locking request to succeed.

Have a look at how the patch queue uses gfs2_holder_allow_demote() and
gfs2_holder_disallow_demote():

https://listman.redhat.com/archives/cluster-devel/2021-August/msg00128.html
https://listman.redhat.com/archives/cluster-devel/2021-August/msg00134.html
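
In rough terms, the pattern used there looks like this (a condensed,
simplified sketch; the actual patches also handle the iov_iter state,
partial reads, and error paths):

	struct gfs2_holder gh;
	int error;

	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
retry:
	error = gfs2_glock_nq(&gh);
	if (error)
		goto out;

	/* ... copy out as much as possible without faulting ... */

	gfs2_holder_allow_demote(&gh);
	/* ... fault in the next part of the user buffer ... */
	gfs2_holder_disallow_demote(&gh);

	if (!gfs2_holder_queued(&gh)) {
		/*
		 * A conflicting request came in while we were faulting
		 * and the holder was dequeued: re-acquire the glock or
		 * return a short result.
		 */
		goto retry;
	}
	/* still holding the glock: resume the read, then gfs2_glock_dq(&gh) */
out:
	gfs2_holder_uninit(&gh);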

Thanks,
Andreas
Matthew Wilcox Aug. 23, 2021, 4:05 p.m. UTC | #10
On Mon, Aug 23, 2021 at 05:18:12PM +0200, Andreas Gruenbacher wrote:
> On Mon, Aug 23, 2021 at 10:14 AM Steven Whitehouse <swhiteho@redhat.com> wrote:
> > If the goal here is just to allow the glock to be held for a longer
> > period of time, but with occasional interruptions to prevent
> > starvation, then we have a potential model for this. There is
> > cond_resched_lock() which does this for spin locks.
> 
> This isn't an appropriate model for what I'm trying to achieve here.
> In the cond_resched case, we know at the time of the cond_resched call
> whether or not we want to schedule. If we do, we want to drop the spin
> lock, schedule, and then re-acquire the spin lock. In the case we're
> looking at here, we want to fault in user pages. There is no way of
> knowing beforehand if the glock we're currently holding will have to
> be dropped to achieve that. In fact, it will almost never have to be
> dropped. But if it does, we need to drop it straight away to allow the
> conflicting locking request to succeed.

It occurs to me that this is similar to the wound/wait mutexes
(include/linux/ww_mutex.h & Documentation/locking/ww-mutex-design.rst).
You want to mark the glock as woundable before faulting, and then discover
if it was wounded after faulting.  Maybe sharing this terminology will
aid in understanding?
Bob Peterson Aug. 23, 2021, 4:36 p.m. UTC | #11
On 8/23/21 11:05 AM, Matthew Wilcox wrote:
> On Mon, Aug 23, 2021 at 05:18:12PM +0200, Andreas Gruenbacher wrote:
>> On Mon, Aug 23, 2021 at 10:14 AM Steven Whitehouse <swhiteho@redhat.com> wrote:
>>> If the goal here is just to allow the glock to be held for a longer
>>> period of time, but with occasional interruptions to prevent
>>> starvation, then we have a potential model for this. There is
>>> cond_resched_lock() which does this for spin locks.
>>
>> This isn't an appropriate model for what I'm trying to achieve here.
>> In the cond_resched case, we know at the time of the cond_resched call
>> whether or not we want to schedule. If we do, we want to drop the spin
>> lock, schedule, and then re-acquire the spin lock. In the case we're
>> looking at here, we want to fault in user pages. There is no way of
>> knowing beforehand if the glock we're currently holding will have to
>> be dropped to achieve that. In fact, it will almost never have to be
>> dropped. But if it does, we need to drop it straight away to allow the
>> conflicting locking request to succeed.
> 
> It occurs to me that this is similar to the wound/wait mutexes
> (include/linux/ww_mutex.h & Documentation/locking/ww-mutex-design.rst).
> You want to mark the glock as woundable before faulting, and then discover
> if it was wounded after faulting.  Maybe sharing this terminology will
> aid in understanding?
> 
Hmm. Woundable. I like it.
Andreas and I argued about the terminology but we never found a
middle ground. Perhaps this is it. Thanks, Matthew.

Regards,

Bob Peterson
Andreas Gruenbacher Aug. 23, 2021, 7:12 p.m. UTC | #12
On Mon, Aug 23, 2021 at 6:06 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Mon, Aug 23, 2021 at 05:18:12PM +0200, Andreas Gruenbacher wrote:
> > On Mon, Aug 23, 2021 at 10:14 AM Steven Whitehouse <swhiteho@redhat.com> wrote:
> > > If the goal here is just to allow the glock to be held for a longer
> > > period of time, but with occasional interruptions to prevent
> > > starvation, then we have a potential model for this. There is
> > > cond_resched_lock() which does this for spin locks.
> >
> > This isn't an appropriate model for what I'm trying to achieve here.
> > In the cond_resched case, we know at the time of the cond_resched call
> > whether or not we want to schedule. If we do, we want to drop the spin
> > lock, schedule, and then re-acquire the spin lock. In the case we're
> > looking at here, we want to fault in user pages. There is no way of
> > knowing beforehand if the glock we're currently holding will have to
> > be dropped to achieve that. In fact, it will almost never have to be
> > dropped. But if it does, we need to drop it straight away to allow the
> > conflicting locking request to succeed.
>
> It occurs to me that this is similar to the wound/wait mutexes
> (include/linux/ww_mutex.h & Documentation/locking/ww-mutex-design.rst).
> You want to mark the glock as woundable before faulting, and then discover
> if it was wounded after faulting.  Maybe sharing this terminology will
> aid in understanding?

I've looked at the ww_mutex documentation. A "transaction" wounds
another "transaction" and that other transaction then "dies", or it
"heals" and restarts. In the glock case, a process sets and clears the
HIF_MAY_DEMOTE flag on one of its own glock holder contexts. After
clearing the flag, it either still holds the glock or it doesn't;
nothing needs to be done to "die" or to "heal". So I'm not sure we
want to conflate the two concepts.

One of the earlier terms we've used was "stealing", with a
HIF_MAY_STEAL flag. That works, but it's slightly less obvious what
happens to a glock holder when the glock is stolen from it. (The
holder gets dequeued, __gfs2_glock_dq.) The glock code already uses
the terms promote/demote, acquire/release, enqueue/dequeue, and
_nq/_dq for various forms of acquiring and releasing a glock, so we're
not short of names right now, apparently.

Thanks,
Andreas
Steven Whitehouse Aug. 24, 2021, 7:59 a.m. UTC | #13
Hi,

On Mon, 2021-08-23 at 17:18 +0200, Andreas Gruenbacher wrote:
> On Mon, Aug 23, 2021 at 10:14 AM Steven Whitehouse <
> swhiteho@redhat.com> wrote:
> > On Fri, 2021-08-20 at 17:22 +0200, Andreas Gruenbacher wrote:
> > > On Fri, Aug 20, 2021 at 3:11 PM Bob Peterson <rpeterso@redhat.com
> > > >
> > > wrote:
> > [snip]
> > > > You can almost think of this as a performance enhancement. This
> > > > concept
> > > > allows a process to hold a glock for much longer periods of
> > > > time,
> > > > at a
> > > > lower priority, for example, when gfs2_file_read_iter needs to
> > > > hold
> > > > the
> > > > glock for very long-running iterative reads.
> > > 
> > > Consider a process that allocates a somewhat large buffer and
> > > reads
> > > into it in chunks that are not page aligned. The buffer initially
> > > won't be faulted in, so we fault in the first chunk and write
> > > into
> > > it.
> > > Then, when reading the second chunk, we find that the first page
> > > of
> > > the second chunk is already present. We fill it, set the
> > > HIF_MAY_DEMOTE flag, fault in more pages, and clear the
> > > HIF_MAY_DEMOTE
> > > flag. If we then still have the glock (which is very likely), we
> > > resume the read. Otherwise, we return a short result.
> > > 
> > > Thanks,
> > > Andreas
> > > 
> > 
> > If the goal here is just to allow the glock to be held for a longer
> > period of time, but with occasional interruptions to prevent
> > starvation, then we have a potential model for this. There is
> > cond_resched_lock() which does this for spin locks.
> 
> This isn't an appropriate model for what I'm trying to achieve here.
> In the cond_resched case, we know at the time of the cond_resched
> call
> whether or not we want to schedule. If we do, we want to drop the
> spin
> lock, schedule, and then re-acquire the spin lock. In the case we're
> looking at here, we want to fault in user pages. There is no way of
> knowing beforehand if the glock we're currently holding will have to
> be dropped to achieve that. In fact, it will almost never have to be
> dropped. But if it does, we need to drop it straight away to allow
> the
> conflicting locking request to succeed.
> 
> Have a look at how the patch queue uses gfs2_holder_allow_demote()
> and
> gfs2_holder_disallow_demote():
> 
> https://listman.redhat.com/archives/cluster-devel/2021-August/msg00128.html
> https://listman.redhat.com/archives/cluster-devel/2021-August/msg00134.html
> 
> Thanks,
> Andreas
> 

Ah, now I see! Apologies if I misunderstood the intent here initially.
It is complicated and not so obvious - at least to me!

You've added a lot of context to this patch in your various replies on
this thread. The original patch description explains how the feature is
implemented, but doesn't really touch on why - that is left to the
other patches that you pointed to above. A short paragraph or two on
the "why" side of things added to the patch description would be
helpful, I think.

Your message on Friday (20 Aug 2021 15:17:41 +0200 (20/08/21 14:17:41))
has a good explanation in its second part, which, together with what
you've said above (or similar), would be a good basis.

Apologies again for not understanding what is going on,

Steve.
diff mbox series

Patch

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f24db2ececfb..d1b06a09ce2f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -58,6 +58,7 @@  struct gfs2_glock_iter {
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
+static void __gfs2_glock_dq(struct gfs2_holder *gh);
 
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
@@ -197,6 +198,12 @@  static int demote_ok(const struct gfs2_glock *gl)
 
 	if (gl->gl_state == LM_ST_UNLOCKED)
 		return 0;
+	/*
+	 * Note that demote_ok is used for the lru process of disposing of
+	 * glocks. For this purpose, we don't care if the glock's holders
+	 * have the HIF_MAY_DEMOTE flag set or not. If someone is using
+	 * them, don't demote.
+	 */
 	if (!list_empty(&gl->gl_holders))
 		return 0;
 	if (glops->go_demote_ok)
@@ -379,7 +386,7 @@  static void do_error(struct gfs2_glock *gl, const int ret)
 	struct gfs2_holder *gh, *tmp;
 
 	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
-		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+		if (!test_bit(HIF_WAIT, &gh->gh_iflags))
 			continue;
 		if (ret & LM_OUT_ERROR)
 			gh->gh_error = -EIO;
@@ -393,6 +400,40 @@  static void do_error(struct gfs2_glock *gl, const int ret)
 	}
 }
 
+/**
+ * demote_incompat_holders - demote incompatible demoteable holders
+ * @gl: the glock we want to promote
+ * @new_gh: the new holder to be promoted
+ */
+static void demote_incompat_holders(struct gfs2_glock *gl,
+				    struct gfs2_holder *new_gh)
+{
+	struct gfs2_holder *gh;
+
+	/*
+	 * Demote incompatible holders before we make ourselves eligible.
+	 * (This holder may or may not allow auto-demoting, but we don't want
+	 * to demote the new holder before it's even granted.)
+	 */
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		/*
+		 * Since holders are at the front of the list, we stop when we
+		 * find the first non-holder.
+		 */
+		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+			return;
+		if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
+		    !may_grant(gl, new_gh, gh)) {
+			/*
+			 * We should not recurse into do_promote because
+			 * __gfs2_glock_dq only calls handle_callback,
+			 * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
+			 */
+			__gfs2_glock_dq(gh);
+		}
+	}
+}
+
 /**
  * find_first_holder - find the first "holder" gh
  * @gl: the glock
@@ -411,6 +452,26 @@  static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
 	return NULL;
 }
 
+/**
+ * find_first_strong_holder - find the first non-demoteable holder
+ * @gl: the glock
+ *
+ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
+ */
+static inline struct gfs2_holder
+*find_first_strong_holder(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+			return NULL;
+		if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+			return gh;
+	}
+	return NULL;
+}
+
 /**
  * do_promote - promote as many requests as possible on the current queue
  * @gl: The glock
@@ -425,15 +486,27 @@  __acquires(&gl->gl_lockref.lock)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh, *tmp, *first_gh;
+	bool incompat_holders_demoted = false;
 	int ret;
 
-	first_gh = find_first_holder(gl);
+	first_gh = find_first_strong_holder(gl);
 
 restart:
 	list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
-		if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+		if (!test_bit(HIF_WAIT, &gh->gh_iflags))
 			continue;
 		if (may_grant(gl, first_gh, gh)) {
+			if (!incompat_holders_demoted) {
+				demote_incompat_holders(gl, first_gh);
+				incompat_holders_demoted = true;
+				first_gh = gh;
+			}
+			/*
+			 * The first holder (and only the first holder) on the
+			 * list to be promoted needs to call the go_lock
+			 * function. This does things like inode_refresh
+			 * to read an inode from disk.
+			 */
 			if (gh->gh_list.prev == &gl->gl_holders &&
 			    glops->go_lock) {
 				spin_unlock(&gl->gl_lockref.lock);
@@ -459,6 +532,11 @@  __acquires(&gl->gl_lockref.lock)
 			gfs2_holder_wake(gh);
 			continue;
 		}
+		/*
+		 * If we get here, it means we may not grant this holder for
+		 * some reason. If this holder is the head of the list, it
+		 * means we have a blocked holder at the head, so return 1.
+		 */
 		if (gh->gh_list.prev == &gl->gl_holders)
 			return 1;
 		do_error(gl, 0);
@@ -1373,7 +1451,7 @@  __acquires(&gl->gl_lockref.lock)
 		if (test_bit(GLF_LOCK, &gl->gl_flags)) {
 			struct gfs2_holder *first_gh;
 
-			first_gh = find_first_holder(gl);
+			first_gh = find_first_strong_holder(gl);
 			try_futile = !may_grant(gl, first_gh, gh);
 		}
 		if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -1382,7 +1460,8 @@  __acquires(&gl->gl_lockref.lock)
 
 	list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
 		if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
-		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+		    (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
+		    !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
 			goto trap_recursive;
 		if (try_futile &&
 		    !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
@@ -1478,51 +1557,83 @@  int gfs2_glock_poll(struct gfs2_holder *gh)
 	return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
 }
 
-/**
- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
- * @gh: the glock holder
- *
- */
+static inline bool needs_demote(struct gfs2_glock *gl)
+{
+	return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
+		test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
+}
 
-void gfs2_glock_dq(struct gfs2_holder *gh)
+static void __gfs2_glock_dq(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	unsigned delay = 0;
 	int fast_path = 0;
 
-	spin_lock(&gl->gl_lockref.lock);
 	/*
-	 * If we're in the process of file system withdraw, we cannot just
-	 * dequeue any glocks until our journal is recovered, lest we
-	 * introduce file system corruption. We need two exceptions to this
-	 * rule: We need to allow unlocking of nondisk glocks and the glock
-	 * for our own journal that needs recovery.
+	 * This while loop is similar to function demote_incompat_holders:
+	 * If the glock is due to be demoted (which may be from another node
+	 * or even if this holder is GL_NOCACHE), the weak holders are
+	 * demoted as well, allowing the glock to be demoted.
 	 */
-	if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
-	    glock_blocked_by_withdraw(gl) &&
-	    gh->gh_gl != sdp->sd_jinode_gl) {
-		sdp->sd_glock_dqs_held++;
-		spin_unlock(&gl->gl_lockref.lock);
-		might_sleep();
-		wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
-			    TASK_UNINTERRUPTIBLE);
-		spin_lock(&gl->gl_lockref.lock);
-	}
-	if (gh->gh_flags & GL_NOCACHE)
-		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+	while (gh) {
+		/*
+		 * If we're in the process of file system withdraw, we cannot
+		 * just dequeue any glocks until our journal is recovered, lest
+		 * we introduce file system corruption. We need two exceptions
+		 * to this rule: We need to allow unlocking of nondisk glocks
+		 * and the glock for our own journal that needs recovery.
+		 */
+		if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+		    glock_blocked_by_withdraw(gl) &&
+		    gh->gh_gl != sdp->sd_jinode_gl) {
+			sdp->sd_glock_dqs_held++;
+			spin_unlock(&gl->gl_lockref.lock);
+			might_sleep();
+			wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+				    TASK_UNINTERRUPTIBLE);
+			spin_lock(&gl->gl_lockref.lock);
+		}
+
+		/*
+		 * This holder should not be cached, so mark it for demote.
+		 * Note: this should be done before the check for needs_demote
+		 * below.
+		 */
+		if (gh->gh_flags & GL_NOCACHE)
+			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 
-	list_del_init(&gh->gh_list);
-	clear_bit(HIF_HOLDER, &gh->gh_iflags);
-	if (list_empty(&gl->gl_holders) &&
-	    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
-	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
-		fast_path = 1;
+		list_del_init(&gh->gh_list);
+		clear_bit(HIF_HOLDER, &gh->gh_iflags);
+		trace_gfs2_glock_queue(gh, 0);
+
+		/*
+		 * If there hasn't been a demote request we are done.
+		 * (Let the remaining holders, if any, keep holding it.)
+		 */
+		if (!needs_demote(gl)) {
+			if (list_empty(&gl->gl_holders))
+				fast_path = 1;
+			break;
+		}
+		/*
+		 * If we have another strong holder (we cannot auto-demote)
+		 * we are done. It keeps holding it until it is done.
+		 */
+		if (find_first_strong_holder(gl))
+			break;
+
+		/*
+		 * If we have a weak holder at the head of the list, it
+		 * (and all others like it) must be auto-demoted. If there
+		 * are no more weak holders, we exit the while loop.
+		 */
+		gh = find_first_holder(gl);
+	}
 
 	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
 		gfs2_glock_add_to_lru(gl);
 
-	trace_gfs2_glock_queue(gh, 0);
 	if (unlikely(!fast_path)) {
 		gl->gl_lockref.count++;
 		if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1531,6 +1642,19 @@  void gfs2_glock_dq(struct gfs2_holder *gh)
 			delay = gl->gl_hold_time;
 		__gfs2_glock_queue_work(gl, delay);
 	}
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	spin_lock(&gl->gl_lockref.lock);
+	__gfs2_glock_dq(gh);
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
@@ -1693,6 +1817,7 @@  void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 
 void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 {
+	struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
 	unsigned long delay = 0;
 	unsigned long holdtime;
 	unsigned long now = jiffies;
@@ -1707,6 +1832,28 @@  void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
 			delay = gl->gl_hold_time;
 	}
+	/*
+	 * Note 1: We cannot call demote_incompat_holders from handle_callback
+	 * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
+	 * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
+	 * Plus, we only want to demote the holders if the request comes from
+	 * a remote cluster node because local holder conflicts are resolved
+	 * elsewhere.
+	 *
+	 * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
+	 * request that we set our state to UNLOCKED. Here we mock up a holder
+	 * to make it look like someone wants the lock EX locally. Any SH
+	 * and DF requests should be able to share the lock without demoting.
+	 *
+	 * Note 3: We only want to demote the demoteable holders when there
+	 * are no more strong holders. The demoteable holders might as well
+	 * keep the glock until the last strong holder is done with it.
+	 */
+	if (!find_first_strong_holder(gl)) {
+		if (state == LM_ST_UNLOCKED)
+			mock_gh.gh_state = LM_ST_EXCLUSIVE;
+		demote_incompat_holders(gl, &mock_gh);
+	}
 	handle_callback(gl, state, delay, true);
 	__gfs2_glock_queue_work(gl, delay);
 	spin_unlock(&gl->gl_lockref.lock);
@@ -2096,6 +2243,8 @@  static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
 		*p++ = 'H';
 	if (test_bit(HIF_WAIT, &iflags))
 		*p++ = 'W';
+	if (test_bit(HIF_MAY_DEMOTE, &iflags))
+		*p++ = 'D';
 	*p = 0;
 	return buf;
 }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 31a8f2f649b5..9012487da4c6 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -150,6 +150,8 @@  static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 		if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
 			break;
+		if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
+			continue;
 		if (gh->gh_owner_pid == pid)
 			goto out;
 	}
@@ -325,6 +327,24 @@  static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
+static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	spin_lock(&gl->gl_lockref.lock);
+	set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+	spin_unlock(&gl->gl_lockref.lock);
+}
+
+static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	spin_lock(&gl->gl_lockref.lock);
+	clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
+	spin_unlock(&gl->gl_lockref.lock);
+}
+
 extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
 extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 5c6b985254aa..e73a81db0714 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -252,6 +252,7 @@  struct gfs2_lkstats {
 
 enum {
 	/* States */
+	HIF_MAY_DEMOTE		= 1,
 	HIF_HOLDER		= 6,  /* Set for gh that "holds" the glock */
 	HIF_WAIT		= 10,
 };