[4/8] ceph: allow remounting aborted mount
diff mbox series

Message ID 20190617125529.6230-5-zyan@redhat.com
State New
Headers show
Series
  • ceph: remount aborted mount
Related show

Commit Message

Yan, Zheng June 17, 2019, 12:55 p.m. UTC
When remounting aborted mount, also reset client's entity addr.
'umount -f /ceph; mount -o remount /ceph' can be used for recovering
from blacklist.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/mds_client.c | 16 +++++++++++++---
 fs/ceph/super.c      | 23 +++++++++++++++++++++--
 2 files changed, 34 insertions(+), 5 deletions(-)

Comments

Jeff Layton June 17, 2019, 5:30 p.m. UTC | #1
On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> When remounting aborted mount, also reset client's entity addr.
> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> from blacklist.
> 

Why do I need to umount here? Once the filesystem is unmounted, then the
'-o remount' becomes superfluous, no? In fact, I get an error back when
I try to remount an unmounted filesystem:

    $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
    mount: /mnt/cephfs: mount point not mounted or bad option.

My client isn't blacklisted above, so I guess you're counting on the
umount returning without having actually unmounted the filesystem?

I think this ought to not need a umount first. From a UI standpoint,
just doing a "mount -o remount" ought to be sufficient to clear this.

Also, how would an admin know that this is something they ought to try?
Is there a way for them to know that their client has been blacklisted?

> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/mds_client.c | 16 +++++++++++++---
>  fs/ceph/super.c      | 23 +++++++++++++++++++++--
>  2 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 19c62cf7d5b8..188c33709d9a 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>  		struct ceph_cap_flush *cf;
>  		struct ceph_mds_client *mdsc = fsc->mdsc;
>  
> -		if (ci->i_wrbuffer_ref > 0 &&
> -		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> -			invalidate = true;
> +		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> +			if (inode->i_data.nrpages > 0)
> +				invalidate = true;
> +			if (ci->i_wrbuffer_ref > 0)
> +				mapping_set_error(&inode->i_data, -EIO);
> +		}
>  
>  		while (!list_empty(&ci->i_cap_flush_list)) {
>  			cf = list_first_entry(&ci->i_cap_flush_list,
> @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>  		session = __ceph_lookup_mds_session(mdsc, mds);
>  		if (!session)
>  			continue;
> +
> +		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> +			__unregister_session(mdsc, session);
> +		__wake_requests(mdsc, &session->s_waiting);
>  		mutex_unlock(&mdsc->mutex);
> +
>  		mutex_lock(&session->s_mutex);
>  		__close_session(mdsc, session);
>  		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>  		}
>  		mutex_unlock(&session->s_mutex);
>  		ceph_put_mds_session(session);
> +
>  		mutex_lock(&mdsc->mutex);
>  		kick_requests(mdsc, mds);
>  	}
> +
>  	__wake_requests(mdsc, &mdsc->waiting_for_map);
>  	mutex_unlock(&mdsc->mutex);
>  }
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 67eb9d592ab7..a6a3c065f697 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
>  
>  static int ceph_remount(struct super_block *sb, int *flags, char *data)
>  {
> -	sync_filesystem(sb);
> -	return 0;
> +	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> +
> +	if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> +		sync_filesystem(sb);
> +		return 0;
> +	}
> +
> +	/* Make sure all page caches get invalidated.
> +	 * see remove_session_caps_cb() */
> +	flush_workqueue(fsc->inode_wq);
> +	/* In case that we were blacklisted. This also reset
> +	 * all mon/osd connections */
> +	ceph_reset_client_addr(fsc->client);
> +
> +	ceph_osdc_clear_abort_err(&fsc->client->osdc);
> +	fsc->mount_state = 0;
> +
> +	if (!sb->s_root)
> +		return 0;
> +	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> +				 CEPH_STAT_CAP_INODE, true);
>  }
>  
>  static const struct super_operations ceph_super_ops = {
Huang Zhiteng June 17, 2019, 5:44 p.m. UTC | #2
On Tue, Jun 18, 2019 at 1:30 AM Jeff Layton <jlayton@redhat.com> wrote:
>
> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > When remounting aborted mount, also reset client's entity addr.
> > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > from blacklist.
> >
>
> Why do I need to umount here? Once the filesystem is unmounted, then the
> '-o remount' becomes superfluous, no? In fact, I get an error back when
> I try to remount an unmounted filesystem:
>
>     $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>     mount: /mnt/cephfs: mount point not mounted or bad option.
>
> My client isn't blacklisted above, so I guess you're counting on the
> umount returning without having actually unmounted the filesystem?
>
> I think this ought to not need a umount first. From a UI standpoint,
> just doing a "mount -o remount" ought to be sufficient to clear this.
>
> Also, how would an admin know that this is something they ought to try?
> Is there a way for them to know that their client has been blacklisted?
In our deployment, we actually capture the blacklist event and convert
that to a customer-facing event to let them know their client(s) have
been blacklisted.  Upon receiving such a notification, they can
reconnect clients to the MDS and minimize the downtime.
>
> > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > ---
> >  fs/ceph/mds_client.c | 16 +++++++++++++---
> >  fs/ceph/super.c      | 23 +++++++++++++++++++++--
> >  2 files changed, 34 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 19c62cf7d5b8..188c33709d9a 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> >               struct ceph_cap_flush *cf;
> >               struct ceph_mds_client *mdsc = fsc->mdsc;
> >
> > -             if (ci->i_wrbuffer_ref > 0 &&
> > -                 READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> > -                     invalidate = true;
> > +             if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > +                     if (inode->i_data.nrpages > 0)
> > +                             invalidate = true;
> > +                     if (ci->i_wrbuffer_ref > 0)
> > +                             mapping_set_error(&inode->i_data, -EIO);
> > +             }
> >
> >               while (!list_empty(&ci->i_cap_flush_list)) {
> >                       cf = list_first_entry(&ci->i_cap_flush_list,
> > @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> >               session = __ceph_lookup_mds_session(mdsc, mds);
> >               if (!session)
> >                       continue;
> > +
> > +             if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> > +                     __unregister_session(mdsc, session);
> > +             __wake_requests(mdsc, &session->s_waiting);
> >               mutex_unlock(&mdsc->mutex);
> > +
> >               mutex_lock(&session->s_mutex);
> >               __close_session(mdsc, session);
> >               if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> > @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> >               }
> >               mutex_unlock(&session->s_mutex);
> >               ceph_put_mds_session(session);
> > +
> >               mutex_lock(&mdsc->mutex);
> >               kick_requests(mdsc, mds);
> >       }
> > +
> >       __wake_requests(mdsc, &mdsc->waiting_for_map);
> >       mutex_unlock(&mdsc->mutex);
> >  }
> > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > index 67eb9d592ab7..a6a3c065f697 100644
> > --- a/fs/ceph/super.c
> > +++ b/fs/ceph/super.c
> > @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
> >
> >  static int ceph_remount(struct super_block *sb, int *flags, char *data)
> >  {
> > -     sync_filesystem(sb);
> > -     return 0;
> > +     struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> > +
> > +     if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> > +             sync_filesystem(sb);
> > +             return 0;
> > +     }
> > +
> > +     /* Make sure all page caches get invalidated.
> > +      * see remove_session_caps_cb() */
> > +     flush_workqueue(fsc->inode_wq);
> > +     /* In case that we were blacklisted. This also reset
> > +      * all mon/osd connections */
> > +     ceph_reset_client_addr(fsc->client);
> > +
> > +     ceph_osdc_clear_abort_err(&fsc->client->osdc);
> > +     fsc->mount_state = 0;
> > +
> > +     if (!sb->s_root)
> > +             return 0;
> > +     return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> > +                              CEPH_STAT_CAP_INODE, true);
> >  }
> >
> >  static const struct super_operations ceph_super_ops = {
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Yan, Zheng June 18, 2019, 1:42 a.m. UTC | #3
On 6/18/19 1:30 AM, Jeff Layton wrote:
> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
>> When remounting aborted mount, also reset client's entity addr.
>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
>> from blacklist.
>>
> 
> Why do I need to umount here? Once the filesystem is unmounted, then the
> '-o remount' becomes superfluous, no? In fact, I get an error back when
> I try to remount an unmounted filesystem:
> 
>      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>      mount: /mnt/cephfs: mount point not mounted or bad option.
> 
> My client isn't blacklisted above, so I guess you're counting on the
> umount returning without having actually unmounted the filesystem?
> 
> I think this ought to not need a umount first. From a UI standpoint,
> just doing a "mount -o remount" ought to be sufficient to clear this.
> 
This series is mainly for the case where the mount point is not unmountable.
If the mount point is unmountable, the user should use 'umount -f /ceph; mount 
/ceph'. That avoids all the trouble of error handling.

> Also, how would an admin know that this is something they ought to try?
> Is there a way for them to know that their client has been blacklisted?

An admin can use the 'ceph osd blacklist ls' command.  It's a little difficult 
to make the kernel client aware that it has been blacklisted, because the 
osdmap is complex; fully decoding it requires a lot of code.


> 
>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>> ---
>>   fs/ceph/mds_client.c | 16 +++++++++++++---
>>   fs/ceph/super.c      | 23 +++++++++++++++++++++--
>>   2 files changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 19c62cf7d5b8..188c33709d9a 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>>   		struct ceph_cap_flush *cf;
>>   		struct ceph_mds_client *mdsc = fsc->mdsc;
>>   
>> -		if (ci->i_wrbuffer_ref > 0 &&
>> -		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
>> -			invalidate = true;
>> +		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
>> +			if (inode->i_data.nrpages > 0)
>> +				invalidate = true;
>> +			if (ci->i_wrbuffer_ref > 0)
>> +				mapping_set_error(&inode->i_data, -EIO);
>> +		}
>>   
>>   		while (!list_empty(&ci->i_cap_flush_list)) {
>>   			cf = list_first_entry(&ci->i_cap_flush_list,
>> @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>   		session = __ceph_lookup_mds_session(mdsc, mds);
>>   		if (!session)
>>   			continue;
>> +
>> +		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
>> +			__unregister_session(mdsc, session);
>> +		__wake_requests(mdsc, &session->s_waiting);
>>   		mutex_unlock(&mdsc->mutex);
>> +
>>   		mutex_lock(&session->s_mutex);
>>   		__close_session(mdsc, session);
>>   		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
>> @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>   		}
>>   		mutex_unlock(&session->s_mutex);
>>   		ceph_put_mds_session(session);
>> +
>>   		mutex_lock(&mdsc->mutex);
>>   		kick_requests(mdsc, mds);
>>   	}
>> +
>>   	__wake_requests(mdsc, &mdsc->waiting_for_map);
>>   	mutex_unlock(&mdsc->mutex);
>>   }
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index 67eb9d592ab7..a6a3c065f697 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
>>   
>>   static int ceph_remount(struct super_block *sb, int *flags, char *data)
>>   {
>> -	sync_filesystem(sb);
>> -	return 0;
>> +	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
>> +
>> +	if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
>> +		sync_filesystem(sb);
>> +		return 0;
>> +	}
>> +
>> +	/* Make sure all page caches get invalidated.
>> +	 * see remove_session_caps_cb() */
>> +	flush_workqueue(fsc->inode_wq);
>> +	/* In case that we were blacklisted. This also reset
>> +	 * all mon/osd connections */
>> +	ceph_reset_client_addr(fsc->client);
>> +
>> +	ceph_osdc_clear_abort_err(&fsc->client->osdc);
>> +	fsc->mount_state = 0;
>> +
>> +	if (!sb->s_root)
>> +		return 0;
>> +	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
>> +				 CEPH_STAT_CAP_INODE, true);
>>   }
>>   
>>   static const struct super_operations ceph_super_ops = {
>
Yan, Zheng June 18, 2019, 6:25 a.m. UTC | #4
On 6/18/19 1:30 AM, Jeff Layton wrote:
> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
>> When remounting aborted mount, also reset client's entity addr.
>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
>> from blacklist.
>>
> 
> Why do I need to umount here? Once the filesystem is unmounted, then the
> '-o remount' becomes superfluous, no? In fact, I get an error back when
> I try to remount an unmounted filesystem:
> 
>      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>      mount: /mnt/cephfs: mount point not mounted or bad option.
> 
> My client isn't blacklisted above, so I guess you're counting on the
> umount returning without having actually unmounted the filesystem?
> 
> I think this ought to not need a umount first. From a UI standpoint,
> just doing a "mount -o remount" ought to be sufficient to clear this.
> 

If we just do "mount -o remount", the user will expect that no 
data/metadata gets lost.  The 'umount -f' explicitly tells the user that 
this operation may lose data/metadata.


> Also, how would an admin know that this is something they ought to try?
> Is there a way for them to know that their client has been blacklisted?
> 
>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>> ---
>>   fs/ceph/mds_client.c | 16 +++++++++++++---
>>   fs/ceph/super.c      | 23 +++++++++++++++++++++--
>>   2 files changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 19c62cf7d5b8..188c33709d9a 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>>   		struct ceph_cap_flush *cf;
>>   		struct ceph_mds_client *mdsc = fsc->mdsc;
>>   
>> -		if (ci->i_wrbuffer_ref > 0 &&
>> -		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
>> -			invalidate = true;
>> +		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
>> +			if (inode->i_data.nrpages > 0)
>> +				invalidate = true;
>> +			if (ci->i_wrbuffer_ref > 0)
>> +				mapping_set_error(&inode->i_data, -EIO);
>> +		}
>>   
>>   		while (!list_empty(&ci->i_cap_flush_list)) {
>>   			cf = list_first_entry(&ci->i_cap_flush_list,
>> @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>   		session = __ceph_lookup_mds_session(mdsc, mds);
>>   		if (!session)
>>   			continue;
>> +
>> +		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
>> +			__unregister_session(mdsc, session);
>> +		__wake_requests(mdsc, &session->s_waiting);
>>   		mutex_unlock(&mdsc->mutex);
>> +
>>   		mutex_lock(&session->s_mutex);
>>   		__close_session(mdsc, session);
>>   		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
>> @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>   		}
>>   		mutex_unlock(&session->s_mutex);
>>   		ceph_put_mds_session(session);
>> +
>>   		mutex_lock(&mdsc->mutex);
>>   		kick_requests(mdsc, mds);
>>   	}
>> +
>>   	__wake_requests(mdsc, &mdsc->waiting_for_map);
>>   	mutex_unlock(&mdsc->mutex);
>>   }
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index 67eb9d592ab7..a6a3c065f697 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
>>   
>>   static int ceph_remount(struct super_block *sb, int *flags, char *data)
>>   {
>> -	sync_filesystem(sb);
>> -	return 0;
>> +	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
>> +
>> +	if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
>> +		sync_filesystem(sb);
>> +		return 0;
>> +	}
>> +
>> +	/* Make sure all page caches get invalidated.
>> +	 * see remove_session_caps_cb() */
>> +	flush_workqueue(fsc->inode_wq);
>> +	/* In case that we were blacklisted. This also reset
>> +	 * all mon/osd connections */
>> +	ceph_reset_client_addr(fsc->client);
>> +
>> +	ceph_osdc_clear_abort_err(&fsc->client->osdc);
>> +	fsc->mount_state = 0;
>> +
>> +	if (!sb->s_root)
>> +		return 0;
>> +	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
>> +				 CEPH_STAT_CAP_INODE, true);
>>   }
>>   
>>   static const struct super_operations ceph_super_ops = {
>
Jeff Layton June 18, 2019, 10:37 a.m. UTC | #5
On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> On 6/18/19 1:30 AM, Jeff Layton wrote:
> > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > When remounting aborted mount, also reset client's entity addr.
> > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > from blacklist.
> > > 
> > 
> > Why do I need to umount here? Once the filesystem is unmounted, then the
> > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > I try to remount an unmounted filesystem:
> > 
> >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > 
> > My client isn't blacklisted above, so I guess you're counting on the
> > umount returning without having actually unmounted the filesystem?
> > 
> > I think this ought to not need a umount first. From a UI standpoint,
> > just doing a "mount -o remount" ought to be sufficient to clear this.
> > 

> This series is mainly for the case that mount point is not umountable.
> If mount point is umountable, user should use 'umount -f /ceph; mount 
> /ceph'. This avoids all trouble of error handling.
> 

...

> 
> If just doing "mount -o remount", user will expect there is no 
> data/metadata get lost.  The 'mount -f' explicitly tell user this 
> operation may lose data/metadata.
> 
> 

I don't think they'd expect that and even if they did, that's why we'd
return errors on certain operations until they are cleared. But, I think
all of this points out the main issue I have with this patchset, which
is that it's not clear what problem this is solving.

So: client gets blacklisted and we want to allow it to come back in some
fashion. Do we expect applications that happened to be accessing that
mount to be able to continue running, or will they need to be restarted?
If they need to be restarted why not just expect the admin to kill them
all off, unmount and remount and then start them back up again?

> > Also, how would an admin know that this is something they ought to try?
> > Is there a way for them to know that their client has been blacklisted?
> > 
> > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > ---
> > >   fs/ceph/mds_client.c | 16 +++++++++++++---
> > >   fs/ceph/super.c      | 23 +++++++++++++++++++++--
> > >   2 files changed, 34 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 19c62cf7d5b8..188c33709d9a 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> > >   		struct ceph_cap_flush *cf;
> > >   		struct ceph_mds_client *mdsc = fsc->mdsc;
> > >   
> > > -		if (ci->i_wrbuffer_ref > 0 &&
> > > -		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> > > -			invalidate = true;
> > > +		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > > +			if (inode->i_data.nrpages > 0)
> > > +				invalidate = true;
> > > +			if (ci->i_wrbuffer_ref > 0)
> > > +				mapping_set_error(&inode->i_data, -EIO);
> > > +		}
> > >   
> > >   		while (!list_empty(&ci->i_cap_flush_list)) {
> > >   			cf = list_first_entry(&ci->i_cap_flush_list,
> > > @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > >   		session = __ceph_lookup_mds_session(mdsc, mds);
> > >   		if (!session)
> > >   			continue;
> > > +
> > > +		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> > > +			__unregister_session(mdsc, session);
> > > +		__wake_requests(mdsc, &session->s_waiting);
> > >   		mutex_unlock(&mdsc->mutex);
> > > +
> > >   		mutex_lock(&session->s_mutex);
> > >   		__close_session(mdsc, session);
> > >   		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> > > @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > >   		}
> > >   		mutex_unlock(&session->s_mutex);
> > >   		ceph_put_mds_session(session);
> > > +
> > >   		mutex_lock(&mdsc->mutex);
> > >   		kick_requests(mdsc, mds);
> > >   	}
> > > +
> > >   	__wake_requests(mdsc, &mdsc->waiting_for_map);
> > >   	mutex_unlock(&mdsc->mutex);
> > >   }
> > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > index 67eb9d592ab7..a6a3c065f697 100644
> > > --- a/fs/ceph/super.c
> > > +++ b/fs/ceph/super.c
> > > @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
> > >   
> > >   static int ceph_remount(struct super_block *sb, int *flags, char *data)
> > >   {
> > > -	sync_filesystem(sb);
> > > -	return 0;
> > > +	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> > > +
> > > +	if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> > > +		sync_filesystem(sb);
> > > +		return 0;
> > > +	}
> > > +
> > > +	/* Make sure all page caches get invalidated.
> > > +	 * see remove_session_caps_cb() */
> > > +	flush_workqueue(fsc->inode_wq);
> > > +	/* In case that we were blacklisted. This also reset
> > > +	 * all mon/osd connections */
> > > +	ceph_reset_client_addr(fsc->client);
> > > +
> > > +	ceph_osdc_clear_abort_err(&fsc->client->osdc);
> > > +	fsc->mount_state = 0;
> > > +
> > > +	if (!sb->s_root)
> > > +		return 0;
> > > +	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> > > +				 CEPH_STAT_CAP_INODE, true);
> > >   }
> > >   
> > >   static const struct super_operations ceph_super_ops = {
Yan, Zheng June 19, 2019, 12:24 a.m. UTC | #6
On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
>
> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > When remounting aborted mount, also reset client's entity addr.
> > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > from blacklist.
> > > >
> > >
> > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > I try to remount an unmounted filesystem:
> > >
> > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > >
> > > My client isn't blacklisted above, so I guess you're counting on the
> > > umount returning without having actually unmounted the filesystem?
> > >
> > > I think this ought to not need a umount first. From a UI standpoint,
> > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > >
>
> > This series is mainly for the case that mount point is not umountable.
> > If mount point is umountable, user should use 'umount -f /ceph; mount
> > /ceph'. This avoids all trouble of error handling.
> >
>
> ...
>
> >
> > If just doing "mount -o remount", user will expect there is no
> > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > operation may lose data/metadata.
> >
> >
>
> I don't think they'd expect that and even if they did, that's why we'd
> return errors on certain operations until they are cleared. But, I think
> all of this points out the main issue I have with this patchset, which
> is that it's not clear what problem this is solving.
>
> So: client gets blacklisted and we want to allow it to come back in some
> fashion. Do we expect applications that happened to be accessing that
> mount to be able to continue running, or will they need to be restarted?
> If they need to be restarted why not just expect the admin to kill them
> all off, unmount and remount and then start them back up again?
>

The point is to let users decide what to do. Some users value
availability over consistency. It's inconvenient to kill all
applications that use the mount and then do the umount.

Regards
Yan, Zheng



> > > Also, how would an admin know that this is something they ought to try?
> > > Is there a way for them to know that their client has been blacklisted?
> > >
> > > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > > ---
> > > >   fs/ceph/mds_client.c | 16 +++++++++++++---
> > > >   fs/ceph/super.c      | 23 +++++++++++++++++++++--
> > > >   2 files changed, 34 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index 19c62cf7d5b8..188c33709d9a 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> > > >                   struct ceph_cap_flush *cf;
> > > >                   struct ceph_mds_client *mdsc = fsc->mdsc;
> > > >
> > > > -         if (ci->i_wrbuffer_ref > 0 &&
> > > > -             READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> > > > -                 invalidate = true;
> > > > +         if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > > > +                 if (inode->i_data.nrpages > 0)
> > > > +                         invalidate = true;
> > > > +                 if (ci->i_wrbuffer_ref > 0)
> > > > +                         mapping_set_error(&inode->i_data, -EIO);
> > > > +         }
> > > >
> > > >                   while (!list_empty(&ci->i_cap_flush_list)) {
> > > >                           cf = list_first_entry(&ci->i_cap_flush_list,
> > > > @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > >                   session = __ceph_lookup_mds_session(mdsc, mds);
> > > >                   if (!session)
> > > >                           continue;
> > > > +
> > > > +         if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> > > > +                 __unregister_session(mdsc, session);
> > > > +         __wake_requests(mdsc, &session->s_waiting);
> > > >                   mutex_unlock(&mdsc->mutex);
> > > > +
> > > >                   mutex_lock(&session->s_mutex);
> > > >                   __close_session(mdsc, session);
> > > >                   if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> > > > @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > >                   }
> > > >                   mutex_unlock(&session->s_mutex);
> > > >                   ceph_put_mds_session(session);
> > > > +
> > > >                   mutex_lock(&mdsc->mutex);
> > > >                   kick_requests(mdsc, mds);
> > > >           }
> > > > +
> > > >           __wake_requests(mdsc, &mdsc->waiting_for_map);
> > > >           mutex_unlock(&mdsc->mutex);
> > > >   }
> > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > index 67eb9d592ab7..a6a3c065f697 100644
> > > > --- a/fs/ceph/super.c
> > > > +++ b/fs/ceph/super.c
> > > > @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
> > > >
> > > >   static int ceph_remount(struct super_block *sb, int *flags, char *data)
> > > >   {
> > > > - sync_filesystem(sb);
> > > > - return 0;
> > > > + struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> > > > +
> > > > + if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> > > > +         sync_filesystem(sb);
> > > > +         return 0;
> > > > + }
> > > > +
> > > > + /* Make sure all page caches get invalidated.
> > > > +  * see remove_session_caps_cb() */
> > > > + flush_workqueue(fsc->inode_wq);
> > > > + /* In case that we were blacklisted. This also reset
> > > > +  * all mon/osd connections */
> > > > + ceph_reset_client_addr(fsc->client);
> > > > +
> > > > + ceph_osdc_clear_abort_err(&fsc->client->osdc);
> > > > + fsc->mount_state = 0;
> > > > +
> > > > + if (!sb->s_root)
> > > > +         return 0;
> > > > + return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> > > > +                          CEPH_STAT_CAP_INODE, true);
> > > >   }
> > > >
> > > >   static const struct super_operations ceph_super_ops = {
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Jeff Layton June 20, 2019, 3:33 p.m. UTC | #7
On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > from blacklist.
> > > > > 
> > > > 
> > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > I try to remount an unmounted filesystem:
> > > > 
> > > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > 
> > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > umount returning without having actually unmounted the filesystem?
> > > > 
> > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > 
> > > This series is mainly for the case that mount point is not umountable.
> > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > /ceph'. This avoids all trouble of error handling.
> > > 
> > 
> > ...
> > 
> > > If just doing "mount -o remount", user will expect there is no
> > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > operation may lose data/metadata.
> > > 
> > > 
> > 
> > I don't think they'd expect that and even if they did, that's why we'd
> > return errors on certain operations until they are cleared. But, I think
> > all of this points out the main issue I have with this patchset, which
> > is that it's not clear what problem this is solving.
> > 
> > So: client gets blacklisted and we want to allow it to come back in some
> > fashion. Do we expect applications that happened to be accessing that
> > mount to be able to continue running, or will they need to be restarted?
> > If they need to be restarted why not just expect the admin to kill them
> > all off, unmount and remount and then start them back up again?
> > 
> 
> The point is let users decide what to do. Some user values
> availability over consistency. It's inconvenient to kill all
> applications that use the mount, then do umount.
> 
> 

I think I have a couple of issues with this patchset. Maybe you can
convince me though:

1) The interface is really weird.

You suggested that we needed to do:

    # umount -f /mnt/foo ; mount -o remount /mnt/foo

...but what if I'm not really blacklisted? Didn't I just kill off all
the calls in-flight with the umount -f? What if that umount actually
succeeds? Then the subsequent remount call will fail.

ISTM, that this interface (should we choose to accept it) should just
be:

    # mount -o remount /mnt/foo

...and if the client figures out that it has been blacklisted, then it
does the right thing during the remount (whatever that right thing is).

2) It's not clear to me who we expect to use this.

Are you targeting applications that do not use file locking? Any that do
use file locking will probably need some special handling, but those
that don't might be able to get by unscathed as long as they can deal
with -EIO on fsync by replaying writes since the last fsync.

The catch here is that not many applications do that. Most just fall
over once fsync hits an error. That is a bit of a chicken and egg
problem though, so that's not necessarily an argument against doing
this.

> 
> > > > Also, how would an admin know that this is something they ought to try?
> > > > Is there a way for them to know that their client has been blacklisted?
> > > > 
> > > > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > > > ---
> > > > >   fs/ceph/mds_client.c | 16 +++++++++++++---
> > > > >   fs/ceph/super.c      | 23 +++++++++++++++++++++--
> > > > >   2 files changed, 34 insertions(+), 5 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index 19c62cf7d5b8..188c33709d9a 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> > > > >                   struct ceph_cap_flush *cf;
> > > > >                   struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > 
> > > > > -         if (ci->i_wrbuffer_ref > 0 &&
> > > > > -             READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> > > > > -                 invalidate = true;
> > > > > +         if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > > > > +                 if (inode->i_data.nrpages > 0)
> > > > > +                         invalidate = true;
> > > > > +                 if (ci->i_wrbuffer_ref > 0)
> > > > > +                         mapping_set_error(&inode->i_data, -EIO);
> > > > > +         }
> > > > > 
> > > > >                   while (!list_empty(&ci->i_cap_flush_list)) {
> > > > >                           cf = list_first_entry(&ci->i_cap_flush_list,
> > > > > @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > > >                   session = __ceph_lookup_mds_session(mdsc, mds);
> > > > >                   if (!session)
> > > > >                           continue;
> > > > > +
> > > > > +         if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> > > > > +                 __unregister_session(mdsc, session);
> > > > > +         __wake_requests(mdsc, &session->s_waiting);
> > > > >                   mutex_unlock(&mdsc->mutex);
> > > > > +
> > > > >                   mutex_lock(&session->s_mutex);
> > > > >                   __close_session(mdsc, session);
> > > > >                   if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> > > > > @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > > >                   }
> > > > >                   mutex_unlock(&session->s_mutex);
> > > > >                   ceph_put_mds_session(session);
> > > > > +
> > > > >                   mutex_lock(&mdsc->mutex);
> > > > >                   kick_requests(mdsc, mds);
> > > > >           }
> > > > > +
> > > > >           __wake_requests(mdsc, &mdsc->waiting_for_map);
> > > > >           mutex_unlock(&mdsc->mutex);
> > > > >   }
> > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > index 67eb9d592ab7..a6a3c065f697 100644
> > > > > --- a/fs/ceph/super.c
> > > > > +++ b/fs/ceph/super.c
> > > > > @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
> > > > > 
> > > > >   static int ceph_remount(struct super_block *sb, int *flags, char *data)
> > > > >   {
> > > > > - sync_filesystem(sb);
> > > > > - return 0;
> > > > > + struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> > > > > +
> > > > > + if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> > > > > +         sync_filesystem(sb);
> > > > > +         return 0;
> > > > > + }
> > > > > +
> > > > > + /* Make sure all page caches get invalidated.
> > > > > +  * see remove_session_caps_cb() */
> > > > > + flush_workqueue(fsc->inode_wq);
> > > > > + /* In case that we were blacklisted. This also reset
> > > > > +  * all mon/osd connections */
> > > > > + ceph_reset_client_addr(fsc->client);
> > > > > +
> > > > > + ceph_osdc_clear_abort_err(&fsc->client->osdc);
> > > > > + fsc->mount_state = 0;
> > > > > +
> > > > > + if (!sb->s_root)
> > > > > +         return 0;
> > > > > + return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> > > > > +                          CEPH_STAT_CAP_INODE, true);
> > > > >   }
> > > > > 
> > > > >   static const struct super_operations ceph_super_ops = {
> > 
> > --
> > Jeff Layton <jlayton@redhat.com>
> >
Gregory Farnum June 20, 2019, 4:45 p.m. UTC | #8
On Thu, Jun 20, 2019 at 8:34 AM Jeff Layton <jlayton@redhat.com> wrote:
>
> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > from blacklist.
> > > > > >
> > > > >
> > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > I try to remount an unmounted filesystem:
> > > > >
> > > > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > >
> > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > umount returning without having actually unmounted the filesystem?
> > > > >
> > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > >
> > > > This series is mainly for the case that mount point is not umountable.
> > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > /ceph'. This avoids all trouble of error handling.
> > > >
> > >
> > > ...
> > >
> > > > If just doing "mount -o remount", user will expect there is no
> > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > operation may lose data/metadata.
> > > >
> > > >
> > >
> > > I don't think they'd expect that and even if they did, that's why we'd
> > > return errors on certain operations until they are cleared. But, I think
> > > all of this points out the main issue I have with this patchset, which
> > > is that it's not clear what problem this is solving.
> > >
> > > So: client gets blacklisted and we want to allow it to come back in some
> > > fashion. Do we expect applications that happened to be accessing that
> > > mount to be able to continue running, or will they need to be restarted?
> > > If they need to be restarted why not just expect the admin to kill them
> > > all off, unmount and remount and then start them back up again?
> > >
> >
> > The point is let users decide what to do. Some user values
> > availability over consistency. It's inconvenient to kill all
> > applications that use the mount, then do umount.
> >
> >
>
> I think I have a couple of issues with this patchset. Maybe you can
> convince me though:
>
> 1) The interface is really weird.
>
> You suggested that we needed to do:
>
>     # umount -f /mnt/foo ; mount -o remount /mnt/foo
>
> ...but what if I'm not really blacklisted? Didn't I just kill off all
> the calls in-flight with the umount -f? What if that umount actually
> succeeds? Then the subsequent remount call will fail.
>
> ISTM, that this interface (should we choose to accept it) should just
> be:
>
>     # mount -o remount /mnt/foo
>
> ...and if the client figures out that it has been blacklisted, then it
> does the right thing during the remount (whatever that right thing is).
>
> 2) It's not clear to me who we expect to use this.
>
> Are you targeting applications that do not use file locking? Any that do
> use file locking will probably need some special handling, but those
> that don't might be able to get by unscathed as long as they can deal
> with -EIO on fsync by replaying writes since the last fsync.
>
> The catch here is that not many applications do that. Most just fall
> over once fsync hits an error. That is a bit of a chicken and egg
> problem though, so that's not necessarily an argument against doing
> this.

It seems like a lot of users/admins are okay with their applications
falling over and getting restarted by the init system, but requiring
them to manually reboot a system is a bridge too far (especially if
the reboot hangs due to dirty data they can't force a flush on). So
doing *something* in response that they can trigger easily or that we
do automatically would be nice.

But I also agree with you that the something needs to be carefully
designed to maintain the consistency semantics we promise elsewhere
and be understandable to users, and these recent patch series make me
nervous in that regard so far...
-Greg
Patrick Donnelly June 20, 2019, 5:11 p.m. UTC | #9
On Thu, Jun 20, 2019 at 8:34 AM Jeff Layton <jlayton@redhat.com> wrote:
>
> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > from blacklist.
> > > > > >
> > > > >
> > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > I try to remount an unmounted filesystem:
> > > > >
> > > > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > >
> > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > umount returning without having actually unmounted the filesystem?
> > > > >
> > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > >
> > > > This series is mainly for the case that mount point is not umountable.
> > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > /ceph'. This avoids all trouble of error handling.
> > > >
> > >
> > > ...
> > >
> > > > If just doing "mount -o remount", user will expect there is no
> > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > operation may lose data/metadata.
> > > >
> > > >
> > >
> > > I don't think they'd expect that and even if they did, that's why we'd
> > > return errors on certain operations until they are cleared. But, I think
> > > all of this points out the main issue I have with this patchset, which
> > > is that it's not clear what problem this is solving.
> > >
> > > So: client gets blacklisted and we want to allow it to come back in some
> > > fashion. Do we expect applications that happened to be accessing that
> > > mount to be able to continue running, or will they need to be restarted?
> > > If they need to be restarted why not just expect the admin to kill them
> > > all off, unmount and remount and then start them back up again?
> > >
> >
> > The point is let users decide what to do. Some user values
> > availability over consistency. It's inconvenient to kill all
> > applications that use the mount, then do umount.
> >
> >
>
> I think I have a couple of issues with this patchset. Maybe you can
> convince me though:
>
> 1) The interface is really weird.
>
> You suggested that we needed to do:
>
>     # umount -f /mnt/foo ; mount -o remount /mnt/foo
>
> ...but what if I'm not really blacklisted? Didn't I just kill off all
> the calls in-flight with the umount -f? What if that umount actually
> succeeds? Then the subsequent remount call will fail.
>
> ISTM, that this interface (should we choose to accept it) should just
> be:
>
>     # mount -o remount /mnt/foo
>
> ...and if the client figures out that it has been blacklisted, then it
> does the right thing during the remount (whatever that right thing is).

This looks reasonable to me. It's not clear to me (without poring over
the code which I lack time to do rigorously) why the umount should be
necessary at all.

Furthermore, I don't like that this is requiring operator intervention
(i.e. remount) of any kind to recover the mount. If undesirable
consistency/cache coherency concerns are what's stopping us from
automatic recovery, then I propose we make client recovery
configurable with mount options and sensible defaults. For example,
the default might be to cause all open file handles to return EIO for
any operation. (Would such a thing even be easily doable within the
Linux kernel? It's not clear to me if Linux file system drivers can
somehow invalidate application file handles in this way.)

Another config option would be to allow file open for reading to
continue functioning but disable files open for writing (and drop all
buffered dirty pages).

Finally: another config to require operator to remount the file system.
Yan, Zheng June 21, 2019, 2:59 a.m. UTC | #10
On Fri, Jun 21, 2019 at 1:11 AM Patrick Donnelly <pdonnell@redhat.com> wrote:
>
> On Thu, Jun 20, 2019 at 8:34 AM Jeff Layton <jlayton@redhat.com> wrote:
> >
> > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > from blacklist.
> > > > > > >
> > > > > >
> > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > I try to remount an unmounted filesystem:
> > > > > >
> > > > > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > >
> > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > umount returning without having actually unmounted the filesystem?
> > > > > >
> > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > >
> > > > > This series is mainly for the case that mount point is not umountable.
> > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > /ceph'. This avoids all trouble of error handling.
> > > > >
> > > >
> > > > ...
> > > >
> > > > > If just doing "mount -o remount", user will expect there is no
> > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > operation may lose data/metadata.
> > > > >
> > > > >
> > > >
> > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > return errors on certain operations until they are cleared. But, I think
> > > > all of this points out the main issue I have with this patchset, which
> > > > is that it's not clear what problem this is solving.
> > > >
> > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > fashion. Do we expect applications that happened to be accessing that
> > > > mount to be able to continue running, or will they need to be restarted?
> > > > If they need to be restarted why not just expect the admin to kill them
> > > > all off, unmount and remount and then start them back up again?
> > > >
> > >
> > > The point is let users decide what to do. Some user values
> > > availability over consistency. It's inconvenient to kill all
> > > applications that use the mount, then do umount.
> > >
> > >
> >
> > I think I have a couple of issues with this patchset. Maybe you can
> > convince me though:
> >
> > 1) The interface is really weird.
> >
> > You suggested that we needed to do:
> >
> >     # umount -f /mnt/foo ; mount -o remount /mnt/foo
> >
> > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > the calls in-flight with the umount -f? What if that umount actually
> > succeeds? Then the subsequent remount call will fail.
> >
> > ISTM, that this interface (should we choose to accept it) should just
> > be:
> >
> >     # mount -o remount /mnt/foo
> >
> > ...and if the client figures out that it has been blacklisted, then it
> > does the right thing during the remount (whatever that right thing is).
>
> This looks reasonable to me. It's not clear to me (without poring over
> the code which I lack time to do rigorously) why the umount should be
> necessary at all.
>
> Furthermore, I don't like that this is requiring operator intervention
> (i.e. remount) of any kind to recover the mount. If undesirable
> consistency/cache coherency concerns are what's stopping us from
> automatic recovery,

If the admin wants clients to auto-recover, why enable blacklisting on
eviction in the first place?

> then I propose we make client recovery
> configurable with mount options and sensible defaults. For example,
> the default might be to cause all open file handles to return EIO for
> any operation. (Would such a thing even be easily doable within the
> Linux kernel? It's not clear to me if Linux file system drivers can
> somehow invalidate application file handles in this way.)
>

> Another config option would be to allow file open for reading to
> continue functioning but disable files open for writing (and drop all
> buffered dirty pages).
>
> Finally: another config to require operator to remount the file system.
>
> --
> Patrick Donnelly, Ph.D.
> He / Him / His
> Senior Software Engineer
> Red Hat Sunnyvale, CA
> GPG: 19F28A586F808C2402351B93C3301A3E258DD79D
Yan, Zheng June 21, 2019, 8:10 a.m. UTC | #11
On 6/20/19 11:33 PM, Jeff Layton wrote:
> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
>> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
>>> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
>>>> On 6/18/19 1:30 AM, Jeff Layton wrote:
>>>>> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
>>>>>> When remounting aborted mount, also reset client's entity addr.
>>>>>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
>>>>>> from blacklist.
>>>>>>
>>>>>
>>>>> Why do I need to umount here? Once the filesystem is unmounted, then the
>>>>> '-o remount' becomes superfluous, no? In fact, I get an error back when
>>>>> I try to remount an unmounted filesystem:
>>>>>
>>>>>       $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>>>>>       mount: /mnt/cephfs: mount point not mounted or bad option.
>>>>>
>>>>> My client isn't blacklisted above, so I guess you're counting on the
>>>>> umount returning without having actually unmounted the filesystem?
>>>>>
>>>>> I think this ought to not need a umount first. From a UI standpoint,
>>>>> just doing a "mount -o remount" ought to be sufficient to clear this.
>>>>>
>>>> This series is mainly for the case that mount point is not umountable.
>>>> If mount point is umountable, user should use 'umount -f /ceph; mount
>>>> /ceph'. This avoids all trouble of error handling.
>>>>
>>>
>>> ...
>>>
>>>> If just doing "mount -o remount", user will expect there is no
>>>> data/metadata get lost.  The 'mount -f' explicitly tell user this
>>>> operation may lose data/metadata.
>>>>
>>>>
>>>
>>> I don't think they'd expect that and even if they did, that's why we'd
>>> return errors on certain operations until they are cleared. But, I think
>>> all of this points out the main issue I have with this patchset, which
>>> is that it's not clear what problem this is solving.
>>>
>>> So: client gets blacklisted and we want to allow it to come back in some
>>> fashion. Do we expect applications that happened to be accessing that
>>> mount to be able to continue running, or will they need to be restarted?
>>> If they need to be restarted why not just expect the admin to kill them
>>> all off, unmount and remount and then start them back up again?
>>>
>>
>> The point is let users decide what to do. Some user values
>> availability over consistency. It's inconvenient to kill all
>> applications that use the mount, then do umount.
>>
>>
> 
> I think I have a couple of issues with this patchset. Maybe you can
> convince me though:
> 
> 1) The interface is really weird.
> 
> You suggested that we needed to do:
> 
>      # umount -f /mnt/foo ; mount -o remount /mnt/foo
> 
> ...but what if I'm not really blacklisted? Didn't I just kill off all
> the calls in-flight with the umount -f? What if that umount actually
> succeeds? Then the subsequent remount call will fail.
> 
> ISTM, that this interface (should we choose to accept it) should just
> be:
> 
>      # mount -o remount /mnt/foo
> 

I have a patch that does

mount -o remount,force_reconnect /mnt/ceph


> ...and if the client figures out that it has been blacklisted, then it
> does the right thing during the remount (whatever that right thing is).
> 
> 2) It's not clear to me who we expect to use this.
> 
> Are you targeting applications that do not use file locking? Any that do
> use file locking will probably need some special handling, but those
> that don't might be able to get by unscathed as long as they can deal
> with -EIO on fsync by replaying writes since the last fsync.
> 

Several users said they value availability over consistency. For example:
for ImageNet training, cephfs is used for storing image files.



> The catch here is that not many applications do that. Most just fall
> over once fsync hits an error. That is a bit of a chicken and egg
> problem though, so that's not necessarily an argument against doing
> this.
> 




>>
>>>>> Also, how would an admin know that this is something they ought to try?
>>>>> Is there a way for them to know that their client has been blacklisted?
>>>>>
>>>>>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>>>>>> ---
>>>>>>    fs/ceph/mds_client.c | 16 +++++++++++++---
>>>>>>    fs/ceph/super.c      | 23 +++++++++++++++++++++--
>>>>>>    2 files changed, 34 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>> index 19c62cf7d5b8..188c33709d9a 100644
>>>>>> --- a/fs/ceph/mds_client.c
>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>> @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>>>>>>                    struct ceph_cap_flush *cf;
>>>>>>                    struct ceph_mds_client *mdsc = fsc->mdsc;
>>>>>>
>>>>>> -         if (ci->i_wrbuffer_ref > 0 &&
>>>>>> -             READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
>>>>>> -                 invalidate = true;
>>>>>> +         if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
>>>>>> +                 if (inode->i_data.nrpages > 0)
>>>>>> +                         invalidate = true;
>>>>>> +                 if (ci->i_wrbuffer_ref > 0)
>>>>>> +                         mapping_set_error(&inode->i_data, -EIO);
>>>>>> +         }
>>>>>>
>>>>>>                    while (!list_empty(&ci->i_cap_flush_list)) {
>>>>>>                            cf = list_first_entry(&ci->i_cap_flush_list,
>>>>>> @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>>>>>                    session = __ceph_lookup_mds_session(mdsc, mds);
>>>>>>                    if (!session)
>>>>>>                            continue;
>>>>>> +
>>>>>> +         if (session->s_state == CEPH_MDS_SESSION_REJECTED)
>>>>>> +                 __unregister_session(mdsc, session);
>>>>>> +         __wake_requests(mdsc, &session->s_waiting);
>>>>>>                    mutex_unlock(&mdsc->mutex);
>>>>>> +
>>>>>>                    mutex_lock(&session->s_mutex);
>>>>>>                    __close_session(mdsc, session);
>>>>>>                    if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
>>>>>> @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>>>>>                    }
>>>>>>                    mutex_unlock(&session->s_mutex);
>>>>>>                    ceph_put_mds_session(session);
>>>>>> +
>>>>>>                    mutex_lock(&mdsc->mutex);
>>>>>>                    kick_requests(mdsc, mds);
>>>>>>            }
>>>>>> +
>>>>>>            __wake_requests(mdsc, &mdsc->waiting_for_map);
>>>>>>            mutex_unlock(&mdsc->mutex);
>>>>>>    }
>>>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>>>> index 67eb9d592ab7..a6a3c065f697 100644
>>>>>> --- a/fs/ceph/super.c
>>>>>> +++ b/fs/ceph/super.c
>>>>>> @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
>>>>>>
>>>>>>    static int ceph_remount(struct super_block *sb, int *flags, char *data)
>>>>>>    {
>>>>>> - sync_filesystem(sb);
>>>>>> - return 0;
>>>>>> + struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
>>>>>> +
>>>>>> + if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
>>>>>> +         sync_filesystem(sb);
>>>>>> +         return 0;
>>>>>> + }
>>>>>> +
>>>>>> + /* Make sure all page caches get invalidated.
>>>>>> +  * see remove_session_caps_cb() */
>>>>>> + flush_workqueue(fsc->inode_wq);
>>>>>> + /* In case that we were blacklisted. This also reset
>>>>>> +  * all mon/osd connections */
>>>>>> + ceph_reset_client_addr(fsc->client);
>>>>>> +
>>>>>> + ceph_osdc_clear_abort_err(&fsc->client->osdc);
>>>>>> + fsc->mount_state = 0;
>>>>>> +
>>>>>> + if (!sb->s_root)
>>>>>> +         return 0;
>>>>>> + return __ceph_do_getattr(d_inode(sb->s_root), NULL,
>>>>>> +                          CEPH_STAT_CAP_INODE, true);
>>>>>>    }
>>>>>>
>>>>>>    static const struct super_operations ceph_super_ops = {
>>>
>>> --
>>> Jeff Layton <jlayton@redhat.com>
>>>
>
Jeff Layton June 21, 2019, 4:48 p.m. UTC | #12
On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
> On 6/20/19 11:33 PM, Jeff Layton wrote:
> > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > from blacklist.
> > > > > > > 
> > > > > > 
> > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > I try to remount an unmounted filesystem:
> > > > > > 
> > > > > >       $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > >       mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > > 
> > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > umount returning without having actually unmounted the filesystem?
> > > > > > 
> > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > > 
> > > > > This series is mainly for the case that mount point is not umountable.
> > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > /ceph'. This avoids all trouble of error handling.
> > > > > 
> > > > 
> > > > ...
> > > > 
> > > > > If just doing "mount -o remount", user will expect there is no
> > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > operation may lose data/metadata.
> > > > > 
> > > > > 
> > > > 
> > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > return errors on certain operations until they are cleared. But, I think
> > > > all of this points out the main issue I have with this patchset, which
> > > > is that it's not clear what problem this is solving.
> > > > 
> > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > fashion. Do we expect applications that happened to be accessing that
> > > > mount to be able to continue running, or will they need to be restarted?
> > > > If they need to be restarted why not just expect the admin to kill them
> > > > all off, unmount and remount and then start them back up again?
> > > > 
> > > 
> > > The point is let users decide what to do. Some user values
> > > availability over consistency. It's inconvenient to kill all
> > > applications that use the mount, then do umount.
> > > 
> > > 
> > 
> > I think I have a couple of issues with this patchset. Maybe you can
> > convince me though:
> > 
> > 1) The interface is really weird.
> > 
> > You suggested that we needed to do:
> > 
> >      # umount -f /mnt/foo ; mount -o remount /mnt/foo
> > 
> > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > the calls in-flight with the umount -f? What if that umount actually
> > succeeds? Then the subsequent remount call will fail.
> > 
> > ISTM, that this interface (should we choose to accept it) should just
> > be:
> > 
> >      # mount -o remount /mnt/foo
> > 
> 
> I have patch that does
> 
> mount -o remount,force_reconnect /mnt/ceph
> 
> 

That seems clearer.

> > ...and if the client figures out that it has been blacklisted, then it
> > does the right thing during the remount (whatever that right thing is).
> > 
> > 2) It's not clear to me who we expect to use this.
> > 
> > Are you targeting applications that do not use file locking? Any that do
> > use file locking will probably need some special handling, but those
> > that don't might be able to get by unscathed as long as they can deal
> > with -EIO on fsync by replaying writes since the last fsync.
> > 
> 
> Several users said they availability over consistency. For example: 
> ImageNet training, cephfs is used for storing image files.
> 
> 

Which sounds reasonable on its face...but why bother with remounting at
that point? Why not just have the client reattempt connections until it
succeeds (or you forcibly unmount).

For that matter, why not just redirty the pages after the writes fail in
that case instead of forcing those users to rewrite their data? If they
don't care about consistency that much, then that would seem to be a
nicer way to deal with this.

I also find all of this a little difficult to reconcile with Patrick's
desire to forcibly terminate any application that had a file lock at the
time of the blacklisting. That doesn't seem to be something that will
enhance availability.

Again, I think I'm missing some piece of the bigger picture here, which
is why I'm asking about how, specifically, we expect this to be used.
I'd like to understand the actual use-cases here so we can ensure we're
addressing them in the best possible fashion.

> 
> > The catch here is that not many applications do that. Most just fall
> > over once fsync hits an error. That is a bit of a chicken and egg
> > problem though, so that's not necessarily an argument against doing
> > this.
> > 
> 
> 
> 
> > > > > > Also, how would an admin know that this is something they ought to try?
> > > > > > Is there a way for them to know that their client has been blacklisted?
> > > > > > 
> > > > > > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > > > > > ---
> > > > > > >    fs/ceph/mds_client.c | 16 +++++++++++++---
> > > > > > >    fs/ceph/super.c      | 23 +++++++++++++++++++++--
> > > > > > >    2 files changed, 34 insertions(+), 5 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index 19c62cf7d5b8..188c33709d9a 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
> > > > > > >                    struct ceph_cap_flush *cf;
> > > > > > >                    struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > > 
> > > > > > > -         if (ci->i_wrbuffer_ref > 0 &&
> > > > > > > -             READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
> > > > > > > -                 invalidate = true;
> > > > > > > +         if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
> > > > > > > +                 if (inode->i_data.nrpages > 0)
> > > > > > > +                         invalidate = true;
> > > > > > > +                 if (ci->i_wrbuffer_ref > 0)
> > > > > > > +                         mapping_set_error(&inode->i_data, -EIO);
> > > > > > > +         }
> > > > > > > 
> > > > > > >                    while (!list_empty(&ci->i_cap_flush_list)) {
> > > > > > >                            cf = list_first_entry(&ci->i_cap_flush_list,
> > > > > > > @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > > > > >                    session = __ceph_lookup_mds_session(mdsc, mds);
> > > > > > >                    if (!session)
> > > > > > >                            continue;
> > > > > > > +
> > > > > > > +         if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> > > > > > > +                 __unregister_session(mdsc, session);
> > > > > > > +         __wake_requests(mdsc, &session->s_waiting);
> > > > > > >                    mutex_unlock(&mdsc->mutex);
> > > > > > > +
> > > > > > >                    mutex_lock(&session->s_mutex);
> > > > > > >                    __close_session(mdsc, session);
> > > > > > >                    if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
> > > > > > > @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> > > > > > >                    }
> > > > > > >                    mutex_unlock(&session->s_mutex);
> > > > > > >                    ceph_put_mds_session(session);
> > > > > > > +
> > > > > > >                    mutex_lock(&mdsc->mutex);
> > > > > > >                    kick_requests(mdsc, mds);
> > > > > > >            }
> > > > > > > +
> > > > > > >            __wake_requests(mdsc, &mdsc->waiting_for_map);
> > > > > > >            mutex_unlock(&mdsc->mutex);
> > > > > > >    }
> > > > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > > > index 67eb9d592ab7..a6a3c065f697 100644
> > > > > > > --- a/fs/ceph/super.c
> > > > > > > +++ b/fs/ceph/super.c
> > > > > > > @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
> > > > > > > 
> > > > > > >    static int ceph_remount(struct super_block *sb, int *flags, char *data)
> > > > > > >    {
> > > > > > > - sync_filesystem(sb);
> > > > > > > - return 0;
> > > > > > > + struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
> > > > > > > +
> > > > > > > + if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
> > > > > > > +         sync_filesystem(sb);
> > > > > > > +         return 0;
> > > > > > > + }
> > > > > > > +
> > > > > > > + /* Make sure all page caches get invalidated.
> > > > > > > +  * see remove_session_caps_cb() */
> > > > > > > + flush_workqueue(fsc->inode_wq);
> > > > > > > + /* In case that we were blacklisted. This also reset
> > > > > > > +  * all mon/osd connections */
> > > > > > > + ceph_reset_client_addr(fsc->client);
> > > > > > > +
> > > > > > > + ceph_osdc_clear_abort_err(&fsc->client->osdc);
> > > > > > > + fsc->mount_state = 0;
> > > > > > > +
> > > > > > > + if (!sb->s_root)
> > > > > > > +         return 0;
> > > > > > > + return __ceph_do_getattr(d_inode(sb->s_root), NULL,
> > > > > > > +                          CEPH_STAT_CAP_INODE, true);
> > > > > > >    }
> > > > > > > 
> > > > > > >    static const struct super_operations ceph_super_ops = {
> > > > 
> > > > --
> > > > Jeff Layton <jlayton@redhat.com>
> > > >
Patrick Donnelly June 22, 2019, 3:20 a.m. UTC | #13
On Thu, Jun 20, 2019 at 7:59 PM Yan, Zheng <ukernel@gmail.com> wrote:
>
> On Fri, Jun 21, 2019 at 1:11 AM Patrick Donnelly <pdonnell@redhat.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 8:34 AM Jeff Layton <jlayton@redhat.com> wrote:
> > >
> > > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > > from blacklist.
> > > > > > > >
> > > > > > >
> > > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > > I try to remount an unmounted filesystem:
> > > > > > >
> > > > > > >      $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > > >      mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > > >
> > > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > > umount returning without having actually unmounted the filesystem?
> > > > > > >
> > > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > > >
> > > > > > This series is mainly for the case that mount point is not umountable.
> > > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > > /ceph'. This avoids all trouble of error handling.
> > > > > >
> > > > >
> > > > > ...
> > > > >
> > > > > > If just doing "mount -o remount", user will expect there is no
> > > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > > operation may lose data/metadata.
> > > > > >
> > > > > >
> > > > >
> > > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > > return errors on certain operations until they are cleared. But, I think
> > > > > all of this points out the main issue I have with this patchset, which
> > > > > is that it's not clear what problem this is solving.
> > > > >
> > > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > > fashion. Do we expect applications that happened to be accessing that
> > > > > mount to be able to continue running, or will they need to be restarted?
> > > > > If they need to be restarted why not just expect the admin to kill them
> > > > > all off, unmount and remount and then start them back up again?
> > > > >
> > > >
> > > > The point is to let users decide what to do. Some users value
> > > > availability over consistency. It's inconvenient to kill all
> > > > applications that use the mount and then do the umount.
> > > >
> > > >
> > >
> > > I think I have a couple of issues with this patchset. Maybe you can
> > > convince me though:
> > >
> > > 1) The interface is really weird.
> > >
> > > You suggested that we needed to do:
> > >
> > >     # umount -f /mnt/foo ; mount -o remount /mnt/foo
> > >
> > > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > > the calls in-flight with the umount -f? What if that umount actually
> > > succeeds? Then the subsequent remount call will fail.
> > >
> > > ISTM, that this interface (should we choose to accept it) should just
> > > be:
> > >
> > >     # mount -o remount /mnt/foo
> > >
> > > ...and if the client figures out that it has been blacklisted, then it
> > > does the right thing during the remount (whatever that right thing is).
> >
> > This looks reasonable to me. It's not clear to me (without poring over
> > the code which I lack time to do rigorously) why the umount should be
> > necessary at all.
> >
> > Furthermore, I don't like that this is requiring operator intervention
> > (i.e. remount) of any kind to recover the mount. If undesirable
> > consistency/cache coherency concerns are what's stopping us from
> > automatic recovery,
>
> if admins want clients to auto-recover, why enable blacklisting on eviction
> in the first place?

Because the client may not even be aware that it's acting in a rogue
fashion, e.g. by writing to some file. The only correct action for the
MDS is to kill the session and blacklist the client from talking to
the OSDs.

Now, wanting the client to recover from this by re-establishing a
session with the MDS and cluster is a reasonable next step. But the
first step must be to blacklist the client because we have no idea
what it's doing or what kind of network partitions have occurred.
Patrick Donnelly June 22, 2019, 3:21 a.m. UTC | #14
On Fri, Jun 21, 2019 at 1:11 AM Yan, Zheng <zyan@redhat.com> wrote:
>
> On 6/20/19 11:33 PM, Jeff Layton wrote:
> > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> >> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> >>> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> >>>> On 6/18/19 1:30 AM, Jeff Layton wrote:
> >>>>> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> >>>>>> When remounting aborted mount, also reset client's entity addr.
> >>>>>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> >>>>>> from blacklist.
> >>>>>>
> >>>>>
> >>>>> Why do I need to umount here? Once the filesystem is unmounted, then the
> >>>>> '-o remount' becomes superfluous, no? In fact, I get an error back when
> >>>>> I try to remount an unmounted filesystem:
> >>>>>
> >>>>>       $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> >>>>>       mount: /mnt/cephfs: mount point not mounted or bad option.
> >>>>>
> >>>>> My client isn't blacklisted above, so I guess you're counting on the
> >>>>> umount returning without having actually unmounted the filesystem?
> >>>>>
> >>>>> I think this ought to not need a umount first. From a UI standpoint,
> >>>>> just doing a "mount -o remount" ought to be sufficient to clear this.
> >>>>>
> >>>> This series is mainly for the case that mount point is not umountable.
> >>>> If mount point is umountable, user should use 'umount -f /ceph; mount
> >>>> /ceph'. This avoids all trouble of error handling.
> >>>>
> >>>
> >>> ...
> >>>
> >>>> If just doing "mount -o remount", user will expect there is no
> >>>> data/metadata get lost.  The 'mount -f' explicitly tell user this
> >>>> operation may lose data/metadata.
> >>>>
> >>>>
> >>>
> >>> I don't think they'd expect that and even if they did, that's why we'd
> >>> return errors on certain operations until they are cleared. But, I think
> >>> all of this points out the main issue I have with this patchset, which
> >>> is that it's not clear what problem this is solving.
> >>>
> >>> So: client gets blacklisted and we want to allow it to come back in some
> >>> fashion. Do we expect applications that happened to be accessing that
> >>> mount to be able to continue running, or will they need to be restarted?
> >>> If they need to be restarted why not just expect the admin to kill them
> >>> all off, unmount and remount and then start them back up again?
> >>>
> >>
> >> The point is let users decide what to do. Some user values
> >> availability over consistency. It's inconvenient to kill all
> >> applications that use the mount, then do umount.
> >>
> >>
> >
> > I think I have a couple of issues with this patchset. Maybe you can
> > convince me though:
> >
> > 1) The interface is really weird.
> >
> > You suggested that we needed to do:
> >
> >      # umount -f /mnt/foo ; mount -o remount /mnt/foo
> >
> > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > the calls in-flight with the umount -f? What if that umount actually
> > succeeds? Then the subsequent remount call will fail.
> >
> > ISTM, that this interface (should we choose to accept it) should just
> > be:
> >
> >      # mount -o remount /mnt/foo
> >
>
> I have a patch that does
>
> mount -o remount,force_reconnect /mnt/ceph

I think we can improve this with automatic recovery. It just requires
configuration of the behavior via mount options. What is your reason
for preferring an explicit remount operation?
Yan, Zheng June 24, 2019, 1:49 a.m. UTC | #15
On 6/22/19 12:48 AM, Jeff Layton wrote:
> On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
>> On 6/20/19 11:33 PM, Jeff Layton wrote:
>>> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
>>>> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
>>>>> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
>>>>>> On 6/18/19 1:30 AM, Jeff Layton wrote:
>>>>>>> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
>>>>>>>> When remounting aborted mount, also reset client's entity addr.
>>>>>>>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
>>>>>>>> from blacklist.
>>>>>>>>
>>>>>>>
>>>>>>> Why do I need to umount here? Once the filesystem is unmounted, then the
>>>>>>> '-o remount' becomes superfluous, no? In fact, I get an error back when
>>>>>>> I try to remount an unmounted filesystem:
>>>>>>>
>>>>>>>        $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>>>>>>>        mount: /mnt/cephfs: mount point not mounted or bad option.
>>>>>>>
>>>>>>> My client isn't blacklisted above, so I guess you're counting on the
>>>>>>> umount returning without having actually unmounted the filesystem?
>>>>>>>
>>>>>>> I think this ought to not need a umount first. From a UI standpoint,
>>>>>>> just doing a "mount -o remount" ought to be sufficient to clear this.
>>>>>>>
>>>>>> This series is mainly for the case that mount point is not umountable.
>>>>>> If mount point is umountable, user should use 'umount -f /ceph; mount
>>>>>> /ceph'. This avoids all trouble of error handling.
>>>>>>
>>>>>
>>>>> ...
>>>>>
>>>>>> If just doing "mount -o remount", user will expect there is no
>>>>>> data/metadata get lost.  The 'mount -f' explicitly tell user this
>>>>>> operation may lose data/metadata.
>>>>>>
>>>>>>
>>>>>
>>>>> I don't think they'd expect that and even if they did, that's why we'd
>>>>> return errors on certain operations until they are cleared. But, I think
>>>>> all of this points out the main issue I have with this patchset, which
>>>>> is that it's not clear what problem this is solving.
>>>>>
>>>>> So: client gets blacklisted and we want to allow it to come back in some
>>>>> fashion. Do we expect applications that happened to be accessing that
>>>>> mount to be able to continue running, or will they need to be restarted?
>>>>> If they need to be restarted why not just expect the admin to kill them
>>>>> all off, unmount and remount and then start them back up again?
>>>>>
>>>>
>>>> The point is let users decide what to do. Some user values
>>>> availability over consistency. It's inconvenient to kill all
>>>> applications that use the mount, then do umount.
>>>>
>>>>
>>>
>>> I think I have a couple of issues with this patchset. Maybe you can
>>> convince me though:
>>>
>>> 1) The interface is really weird.
>>>
>>> You suggested that we needed to do:
>>>
>>>       # umount -f /mnt/foo ; mount -o remount /mnt/foo
>>>
>>> ...but what if I'm not really blacklisted? Didn't I just kill off all
>>> the calls in-flight with the umount -f? What if that umount actually
>>> succeeds? Then the subsequent remount call will fail.
>>>
>>> ISTM, that this interface (should we choose to accept it) should just
>>> be:
>>>
>>>       # mount -o remount /mnt/foo
>>>
>>
>> I have patch that does
>>
>> mount -o remount,force_reconnect /mnt/ceph
>>
>>
> 
> That seems clearer.
> 
>>> ...and if the client figures out that it has been blacklisted, then it
>>> does the right thing during the remount (whatever that right thing is).
>>>
>>> 2) It's not clear to me who we expect to use this.
>>>
>>> Are you targeting applications that do not use file locking? Any that do
>>> use file locking will probably need some special handling, but those
>>> that don't might be able to get by unscathed as long as they can deal
>>> with -EIO on fsync by replaying writes since the last fsync.
>>>
>>
>> Several users said they availability over consistency. For example:
>> ImageNet training, cephfs is used for storing image files.
>>
>>
> 
> Which sounds reasonable on its face...but why bother with remounting at
> that point? Why not just have the client reattempt connections until it
> succeeds (or you forcibly unmount).
> 
> For that matter, why not just redirty the pages after the writes fail in
> that case instead of forcing those users to rewrite their data? If they
> don't care about consistency that much, then that would seem to be a
> nicer way to deal with this.
> 

I'm not clear about this either.

Patrick said in another email:

Because the client may not even be aware that it's acting in a rogue
fashion, e.g. by writing to some file. The only correct action for the
MDS is to kill the session and blacklist the client from talking to
the OSDs.

Now, wanting the client to recover from this by re-establishing a
session with the MDS and cluster is a reasonable next step. But the
first step must be to blacklist the client because we have no idea
what it's doing or what kind of network partitions have occurred.



> I also find all of this a little difficult to reconcile with Patrick's
> desire to forcibly terminate any application that had a file lock at the
> time of the blacklisting. That doesn't seem to be something that will
> enhance availability.
> 
> Again, I think I'm missing some piece of the bigger picture here, which
> is why I'm asking about how, specifically, we expect this to be used.
> I'd like to understand the actual use-cases here so we can ensure we're
> addressing them in the best possible fashion.
> 
>>
>>> The catch here is that not many applications do that. Most just fall
>>> over once fsync hits an error. That is a bit of a chicken and egg
>>> problem though, so that's not necessarily an argument against doing
>>> this.
>>>
>>
>>
>>
>>>>>>> Also, how would an admin know that this is something they ought to try?
>>>>>>> Is there a way for them to know that their client has been blacklisted?
>>>>>>>
>>>>>>>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>>>>>>>> ---
>>>>>>>>     fs/ceph/mds_client.c | 16 +++++++++++++---
>>>>>>>>     fs/ceph/super.c      | 23 +++++++++++++++++++++--
>>>>>>>>     2 files changed, 34 insertions(+), 5 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>>> index 19c62cf7d5b8..188c33709d9a 100644
>>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>>> @@ -1378,9 +1378,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
>>>>>>>>                     struct ceph_cap_flush *cf;
>>>>>>>>                     struct ceph_mds_client *mdsc = fsc->mdsc;
>>>>>>>>
>>>>>>>> -         if (ci->i_wrbuffer_ref > 0 &&
>>>>>>>> -             READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
>>>>>>>> -                 invalidate = true;
>>>>>>>> +         if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
>>>>>>>> +                 if (inode->i_data.nrpages > 0)
>>>>>>>> +                         invalidate = true;
>>>>>>>> +                 if (ci->i_wrbuffer_ref > 0)
>>>>>>>> +                         mapping_set_error(&inode->i_data, -EIO);
>>>>>>>> +         }
>>>>>>>>
>>>>>>>>                     while (!list_empty(&ci->i_cap_flush_list)) {
>>>>>>>>                             cf = list_first_entry(&ci->i_cap_flush_list,
>>>>>>>> @@ -4350,7 +4353,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>>>>>>>                     session = __ceph_lookup_mds_session(mdsc, mds);
>>>>>>>>                     if (!session)
>>>>>>>>                             continue;
>>>>>>>> +
>>>>>>>> +         if (session->s_state == CEPH_MDS_SESSION_REJECTED)
>>>>>>>> +                 __unregister_session(mdsc, session);
>>>>>>>> +         __wake_requests(mdsc, &session->s_waiting);
>>>>>>>>                     mutex_unlock(&mdsc->mutex);
>>>>>>>> +
>>>>>>>>                     mutex_lock(&session->s_mutex);
>>>>>>>>                     __close_session(mdsc, session);
>>>>>>>>                     if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
>>>>>>>> @@ -4359,9 +4367,11 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>>>>>>>                     }
>>>>>>>>                     mutex_unlock(&session->s_mutex);
>>>>>>>>                     ceph_put_mds_session(session);
>>>>>>>> +
>>>>>>>>                     mutex_lock(&mdsc->mutex);
>>>>>>>>                     kick_requests(mdsc, mds);
>>>>>>>>             }
>>>>>>>> +
>>>>>>>>             __wake_requests(mdsc, &mdsc->waiting_for_map);
>>>>>>>>             mutex_unlock(&mdsc->mutex);
>>>>>>>>     }
>>>>>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>>>>>> index 67eb9d592ab7..a6a3c065f697 100644
>>>>>>>> --- a/fs/ceph/super.c
>>>>>>>> +++ b/fs/ceph/super.c
>>>>>>>> @@ -833,8 +833,27 @@ static void ceph_umount_begin(struct super_block *sb)
>>>>>>>>
>>>>>>>>     static int ceph_remount(struct super_block *sb, int *flags, char *data)
>>>>>>>>     {
>>>>>>>> - sync_filesystem(sb);
>>>>>>>> - return 0;
>>>>>>>> + struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
>>>>>>>> +
>>>>>>>> + if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
>>>>>>>> +         sync_filesystem(sb);
>>>>>>>> +         return 0;
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> + /* Make sure all page caches get invalidated.
>>>>>>>> +  * see remove_session_caps_cb() */
>>>>>>>> + flush_workqueue(fsc->inode_wq);
>>>>>>>> + /* In case that we were blacklisted. This also reset
>>>>>>>> +  * all mon/osd connections */
>>>>>>>> + ceph_reset_client_addr(fsc->client);
>>>>>>>> +
>>>>>>>> + ceph_osdc_clear_abort_err(&fsc->client->osdc);
>>>>>>>> + fsc->mount_state = 0;
>>>>>>>> +
>>>>>>>> + if (!sb->s_root)
>>>>>>>> +         return 0;
>>>>>>>> + return __ceph_do_getattr(d_inode(sb->s_root), NULL,
>>>>>>>> +                          CEPH_STAT_CAP_INODE, true);
>>>>>>>>     }
>>>>>>>>
>>>>>>>>     static const struct super_operations ceph_super_ops = {
>>>>>
>>>>> --
>>>>> Jeff Layton <jlayton@redhat.com>
>>>>>
>
Patrick Donnelly June 24, 2019, 3:20 a.m. UTC | #16
On Sun, Jun 23, 2019 at 6:50 PM Yan, Zheng <zyan@redhat.com> wrote:
>
> On 6/22/19 12:48 AM, Jeff Layton wrote:
> > On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
> >> On 6/20/19 11:33 PM, Jeff Layton wrote:
> >>> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> >>>> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> >>>>> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> >>>>>> On 6/18/19 1:30 AM, Jeff Layton wrote:
> >>>>>>> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> >>>>>>>> When remounting aborted mount, also reset client's entity addr.
> >>>>>>>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> >>>>>>>> from blacklist.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Why do I need to umount here? Once the filesystem is unmounted, then the
> >>>>>>> '-o remount' becomes superfluous, no? In fact, I get an error back when
> >>>>>>> I try to remount an unmounted filesystem:
> >>>>>>>
> >>>>>>>        $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> >>>>>>>        mount: /mnt/cephfs: mount point not mounted or bad option.
> >>>>>>>
> >>>>>>> My client isn't blacklisted above, so I guess you're counting on the
> >>>>>>> umount returning without having actually unmounted the filesystem?
> >>>>>>>
> >>>>>>> I think this ought to not need a umount first. From a UI standpoint,
> >>>>>>> just doing a "mount -o remount" ought to be sufficient to clear this.
> >>>>>>>
> >>>>>> This series is mainly for the case that mount point is not umountable.
> >>>>>> If mount point is umountable, user should use 'umount -f /ceph; mount
> >>>>>> /ceph'. This avoids all trouble of error handling.
> >>>>>>
> >>>>>
> >>>>> ...
> >>>>>
> >>>>>> If just doing "mount -o remount", user will expect there is no
> >>>>>> data/metadata get lost.  The 'mount -f' explicitly tell user this
> >>>>>> operation may lose data/metadata.
> >>>>>>
> >>>>>>
> >>>>>
> >>>>> I don't think they'd expect that and even if they did, that's why we'd
> >>>>> return errors on certain operations until they are cleared. But, I think
> >>>>> all of this points out the main issue I have with this patchset, which
> >>>>> is that it's not clear what problem this is solving.
> >>>>>
> >>>>> So: client gets blacklisted and we want to allow it to come back in some
> >>>>> fashion. Do we expect applications that happened to be accessing that
> >>>>> mount to be able to continue running, or will they need to be restarted?
> >>>>> If they need to be restarted why not just expect the admin to kill them
> >>>>> all off, unmount and remount and then start them back up again?
> >>>>>
> >>>>
> >>>> The point is let users decide what to do. Some user values
> >>>> availability over consistency. It's inconvenient to kill all
> >>>> applications that use the mount, then do umount.
> >>>>
> >>>>
> >>>
> >>> I think I have a couple of issues with this patchset. Maybe you can
> >>> convince me though:
> >>>
> >>> 1) The interface is really weird.
> >>>
> >>> You suggested that we needed to do:
> >>>
> >>>       # umount -f /mnt/foo ; mount -o remount /mnt/foo
> >>>
> >>> ...but what if I'm not really blacklisted? Didn't I just kill off all
> >>> the calls in-flight with the umount -f? What if that umount actually
> >>> succeeds? Then the subsequent remount call will fail.
> >>>
> >>> ISTM, that this interface (should we choose to accept it) should just
> >>> be:
> >>>
> >>>       # mount -o remount /mnt/foo
> >>>
> >>
> >> I have patch that does
> >>
> >> mount -o remount,force_reconnect /mnt/ceph
> >>
> >>
> >
> > That seems clearer.
> >
> >>> ...and if the client figures out that it has been blacklisted, then it
> >>> does the right thing during the remount (whatever that right thing is).
> >>>
> >>> 2) It's not clear to me who we expect to use this.
> >>>
> >>> Are you targeting applications that do not use file locking? Any that do
> >>> use file locking will probably need some special handling, but those
> >>> that don't might be able to get by unscathed as long as they can deal
> >>> with -EIO on fsync by replaying writes since the last fsync.
> >>>
> >>
> >> Several users said they availability over consistency. For example:
> >> ImageNet training, cephfs is used for storing image files.
> >>
> >>
> >
> > Which sounds reasonable on its face...but why bother with remounting at
> > that point? Why not just have the client reattempt connections until it
> > succeeds (or you forcibly unmount).
> >
> > For that matter, why not just redirty the pages after the writes fail in
> > that case instead of forcing those users to rewrite their data? If they
> > don't care about consistency that much, then that would seem to be a
> > nicer way to deal with this.
> >
>
> I'm not clear about this either

As I've said elsewhere: **how** the client recovers from the lost
session and blacklist event is configurable. There should be a range
of mount options which control the behavior: such as a _hypothetical_
"recover_session=<mode>", where mode may be:

- "brute": re-acquire capabilities and flush all dirty data. All open
file handles continue to work normally. Dangerous and definitely not
the default. (How should file locks be handled?)

- "clean": re-acquire read capabilities and drop dirty write buffers.
Writable file handles return -EIO. Locks are lost and the lock owners
are sent SIGIO, si_code=SI_LOST, si_fd=lockedfd (default is
termination!). Read-only handles continue to work and caches are
dropped if necessary. This should probably be the default.

- "fresh": like "clean" but read-only handles also return -EIO. Not
sure if this one is useful but not difficult to add.

No "-o remount" mount commands necessary.

Now, these details are open for change. I'm just trying to suggest a
way forward. I'm not well versed in how difficult this proposal is to
implement in the kernel. There are probably details or challenges I'm
not considering. I recommend that before Zheng writes new code that he
and Jeff work out what the right semantics and configurations should
be and make a proposal to ceph-devel/dev@ceph.io for user feedback.
Jeff Layton June 24, 2019, 5:53 p.m. UTC | #17
On Sun, 2019-06-23 at 20:20 -0700, Patrick Donnelly wrote:
> On Sun, Jun 23, 2019 at 6:50 PM Yan, Zheng <zyan@redhat.com> wrote:
> > On 6/22/19 12:48 AM, Jeff Layton wrote:
> > > On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
> > > > On 6/20/19 11:33 PM, Jeff Layton wrote:
> > > > > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > > > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > > > > from blacklist.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > > > > I try to remount an unmounted filesystem:
> > > > > > > > > 
> > > > > > > > >        $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > > > > >        mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > > > > > 
> > > > > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > > > > umount returning without having actually unmounted the filesystem?
> > > > > > > > > 
> > > > > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > > > > > 
> > > > > > > > This series is mainly for the case that mount point is not umountable.
> > > > > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > > > > /ceph'. This avoids all trouble of error handling.
> > > > > > > > 
> > > > > > > 
> > > > > > > ...
> > > > > > > 
> > > > > > > > If just doing "mount -o remount", user will expect there is no
> > > > > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > > > > operation may lose data/metadata.
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > > > > return errors on certain operations until they are cleared. But, I think
> > > > > > > all of this points out the main issue I have with this patchset, which
> > > > > > > is that it's not clear what problem this is solving.
> > > > > > > 
> > > > > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > > > > fashion. Do we expect applications that happened to be accessing that
> > > > > > > mount to be able to continue running, or will they need to be restarted?
> > > > > > > If they need to be restarted why not just expect the admin to kill them
> > > > > > > all off, unmount and remount and then start them back up again?
> > > > > > > 
> > > > > > 
> > > > > > The point is let users decide what to do. Some user values
> > > > > > availability over consistency. It's inconvenient to kill all
> > > > > > applications that use the mount, then do umount.
> > > > > > 
> > > > > > 
> > > > > 
> > > > > I think I have a couple of issues with this patchset. Maybe you can
> > > > > convince me though:
> > > > > 
> > > > > 1) The interface is really weird.
> > > > > 
> > > > > You suggested that we needed to do:
> > > > > 
> > > > >       # umount -f /mnt/foo ; mount -o remount /mnt/foo
> > > > > 
> > > > > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > > > > the calls in-flight with the umount -f? What if that umount actually
> > > > > succeeds? Then the subsequent remount call will fail.
> > > > > 
> > > > > ISTM, that this interface (should we choose to accept it) should just
> > > > > be:
> > > > > 
> > > > >       # mount -o remount /mnt/foo
> > > > > 
> > > > 
> > > > I have patch that does
> > > > 
> > > > mount -o remount,force_reconnect /mnt/ceph
> > > > 
> > > > 
> > > 
> > > That seems clearer.
> > > 
> > > > > ...and if the client figures out that it has been blacklisted, then it
> > > > > does the right thing during the remount (whatever that right thing is).
> > > > > 
> > > > > 2) It's not clear to me who we expect to use this.
> > > > > 
> > > > > Are you targeting applications that do not use file locking? Any that do
> > > > > use file locking will probably need some special handling, but those
> > > > > that don't might be able to get by unscathed as long as they can deal
> > > > > with -EIO on fsync by replaying writes since the last fsync.
> > > > > 
> > > > 
> > > > Several users said they availability over consistency. For example:
> > > > ImageNet training, cephfs is used for storing image files.
> > > > 
> > > > 
> > > 
> > > Which sounds reasonable on its face...but why bother with remounting at
> > > that point? Why not just have the client reattempt connections until it
> > > succeeds (or you forcibly unmount).
> > > 
> > > For that matter, why not just redirty the pages after the writes fail in
> > > that case instead of forcing those users to rewrite their data? If they
> > > don't care about consistency that much, then that would seem to be a
> > > nicer way to deal with this.
> > > 
> > 
> > I'm not clear about this either
> 
> As I've said elsewhere: **how** the client recovers from the lost
> session and blacklist event is configurable. There should be a range
> of mount options which control the behavior: such as a _hypothetical_
> "recover_session=<mode>", where mode may be:
> 
> - "brute": re-acquire capabilities and flush all dirty data. All open
> file handles continue to work normally. Dangerous and definitely not
> the default. (How should file locks be handled?)
> 

IMO, just reacquire them as if nothing happened for this mode. I see
this as conceptually similar to recover_lost_locks module parameter in
nfs.ko. That said, we will need to consider what to do if the lock can't
be reacquired in this mode.

> - "clean": re-acquire read capabilities and drop dirty write buffers.
> Writable file handles return -EIO. Locks are lost and the lock owners
> are sent SIGIO, si_code=SI_LOST, si_fd=lockedfd (default is
> termination!). Read-only handles continue to work and caches are
> dropped if necessary. This should probably be the default.
> 

Sounds good, except for maybe modulo SIGLOST handling for reasons I
outlined in another thread.

> - "fresh": like "clean" but read-only handles also return -EIO. Not
> sure if this one is useful but not difficult to add.
> 

Meh, maybe. If we don't clearly need it then let's not add it. I'd want
to know that someone has an actual use for this option. Adding
interfaces just because we can, just makes trouble later as the code
ages.

> No "-o remount" mount commands necessary.
> 
> Now, these details are open for change. I'm just trying to suggest a
> way forward. I'm not well versed in how difficult this proposal is to
> implement in the kernel. There are probably details or challenges I'm
> not considering. I recommend that before Zheng writes new code that he
> and Jeff work out what the right semantics and configurations should
> be and make a proposal to ceph-devel/dev@ceph.io for user feedback.
> 

That sounds a bit more reasonable. I'd prefer not having to wait for
admin intervention in order to get things moving again if the goal is
making things more available.

That said, whenever we're doing something like this, it's easy for all
of us to make subtle assumptions and end up talking at cross-purposes to
one another. The first step here is to clearly identify the problem
we're trying to solve. From earlier emails I'd suggest this as a
starting point:

"Clients can end up blacklisted due to various connectivity issues, and
we'd like to offer admins a way to configure the mount to reconnect
after blacklisting/unblacklisting, and continue working. Preferably,
with no disruption to the application other than the client hanging
while blacklisted."

Does this sound about right?

If so, then I think we ought to aim for something closer to what Patrick
is suggesting; a mount option or something that causes the cephfs client
to aggressively attempt to recover after being unblacklisted.

Assuming that is correct, who is asking for this? We should run this
proposal by them (and the wider ceph community) to make sure this
proposal will address their problem.
Yan, Zheng June 24, 2019, 10:31 p.m. UTC | #18
On Tue, Jun 25, 2019 at 5:18 AM Jeff Layton <jlayton@redhat.com> wrote:
>
> On Sun, 2019-06-23 at 20:20 -0700, Patrick Donnelly wrote:
> > On Sun, Jun 23, 2019 at 6:50 PM Yan, Zheng <zyan@redhat.com> wrote:
> > > On 6/22/19 12:48 AM, Jeff Layton wrote:
> > > > On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
> > > > > On 6/20/19 11:33 PM, Jeff Layton wrote:
> > > > > > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > > > > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > > > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > > > > > from blacklist.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > > > > > I try to remount an unmounted filesystem:
> > > > > > > > > >
> > > > > > > > > >        $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > > > > > >        mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > > > > > >
> > > > > > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > > > > > umount returning without having actually unmounted the filesystem?
> > > > > > > > > >
> > > > > > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > > > > > >
> > > > > > > > > This series is mainly for the case that mount point is not umountable.
> > > > > > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > > > > > /ceph'. This avoids all trouble of error handling.
> > > > > > > > >
> > > > > > > >
> > > > > > > > ...
> > > > > > > >
> > > > > > > > > If just doing "mount -o remount", user will expect there is no
> > > > > > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > > > > > operation may lose data/metadata.
> > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > > > > > return errors on certain operations until they are cleared. But, I think
> > > > > > > > all of this points out the main issue I have with this patchset, which
> > > > > > > > is that it's not clear what problem this is solving.
> > > > > > > >
> > > > > > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > > > > > fashion. Do we expect applications that happened to be accessing that
> > > > > > > > mount to be able to continue running, or will they need to be restarted?
> > > > > > > > If they need to be restarted why not just expect the admin to kill them
> > > > > > > > all off, unmount and remount and then start them back up again?
> > > > > > > >
> > > > > > >
> > > > > > > The point is let users decide what to do. Some user values
> > > > > > > availability over consistency. It's inconvenient to kill all
> > > > > > > applications that use the mount, then do umount.
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > I think I have a couple of issues with this patchset. Maybe you can
> > > > > > convince me though:
> > > > > >
> > > > > > 1) The interface is really weird.
> > > > > >
> > > > > > You suggested that we needed to do:
> > > > > >
> > > > > >       # umount -f /mnt/foo ; mount -o remount /mnt/foo
> > > > > >
> > > > > > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > > > > > the calls in-flight with the umount -f? What if that umount actually
> > > > > > succeeds? Then the subsequent remount call will fail.
> > > > > >
> > > > > > ISTM, that this interface (should we choose to accept it) should just
> > > > > > be:
> > > > > >
> > > > > >       # mount -o remount /mnt/foo
> > > > > >
> > > > >
> > > > > I have patch that does
> > > > >
> > > > > mount -o remount,force_reconnect /mnt/ceph
> > > > >
> > > > >
> > > >
> > > > That seems clearer.
> > > >
> > > > > > ...and if the client figures out that it has been blacklisted, then it
> > > > > > does the right thing during the remount (whatever that right thing is).
> > > > > >
> > > > > > 2) It's not clear to me who we expect to use this.
> > > > > >
> > > > > > Are you targeting applications that do not use file locking? Any that do
> > > > > > use file locking will probably need some special handling, but those
> > > > > > that don't might be able to get by unscathed as long as they can deal
> > > > > > with -EIO on fsync by replaying writes since the last fsync.
> > > > > >
> > > > >
> > > > > Several users said they availability over consistency. For example:
> > > > > ImageNet training, cephfs is used for storing image files.
> > > > >
> > > > >
> > > >
> > > > Which sounds reasonable on its face...but why bother with remounting at
> > > > that point? Why not just have the client reattempt connections until it
> > > > succeeds (or you forcibly unmount).
> > > >
> > > > For that matter, why not just redirty the pages after the writes fail in
> > > > that case instead of forcing those users to rewrite their data? If they
> > > > don't care about consistency that much, then that would seem to be a
> > > > nicer way to deal with this.
> > > >
> > >
> > > I'm not clear about this either
> >
> > As I've said elsewhere: **how** the client recovers from the lost
> > session and blacklist event is configurable. There should be a range
> > of mount options which control the behavior: such as a _hypothetical_
> > "recover_session=<mode>", where mode may be:
> >
> > - "brute": re-acquire capabilities and flush all dirty data. All open
> > file handles continue to work normally. Dangerous and definitely not
> > the default. (How should file locks be handled?)
> >
>
> IMO, just reacquire them as if nothing happened for this mode. I see
> this as conceptually similar to recover_lost_locks module parameter in
> nfs.ko. That said, we will need to consider what to do if the lock can't
> be reacquired in this mode.
>
> > - "clean": re-acquire read capabilities and drop dirty write buffers.
> > Writable file handles return -EIO. Locks are lost and the lock owners
> > are sent SIGIO, si_code=SI_LOST, si_fd=lockedfd (default is
> > termination!). Read-only handles continue to work and caches are
> > dropped if necessary. This should probably be the default.
> >
>
> Sounds good, except for maybe modulo SIGLOST handling for reasons I
> outlined in another thread.
>
> > - "fresh": like "clean" but read-only handles also return -EIO. Not
> > sure if this one is useful but not difficult to add.
> >
>
> Meh, maybe. If we don't clearly need it then let's not add it. I'd want
> to know that someone has an actual use for this option. Adding
> interfaces just because we can, just makes trouble later as the code
> ages.
>
> > No "-o remount" mount commands necessary.
> >
> > Now, these details are open for change. I'm just trying to suggest a
> > way forward. I'm not well versed in how difficult this proposal is to
> > implement in the kernel. There are probably details or challenges I'm
> > not considering. I recommend that before Zheng writes new code that he
> > and Jeff work out what the right semantics and configurations should
> > be and make a proposal to ceph-devel/dev@ceph.io for user feedback.
> >
>
> That sounds a bit more reasonable. I'd prefer not having to wait for
> admin intervention in order to get things moving again if the goal is
> making things more available.
>
> That said, whenever we're doing something like this, it's easy for all
> of us to make subtle assumptions and end up talking at cross-purposes to
> one another. The first step here is to clearly identify the problem
> we're trying to solve. From earlier emails I'd suggest this as a
> starting point:
>
> "Clients can end up blacklisted due to various connectivity issues, and
> we'd like to offer admins a way to configure the mount to reconnect
> after blacklisting/unblacklisting, and continue working. Preferably,
> with no disruption to the application other than the client hanging
> while blacklisted."
>
> Does this sound about right?
>
> If so, then I think we ought to aim for something closer to what Patrick
> is suggesting; a mount option or something that causes the cephfs client
> to aggressively attempt to recover after being unblacklisted.
>

Clients shouldn't be too aggressive in this case. Otherwise they can
easily create too many blacklist entries in the osdmap.

> Assuming that is correct, who is asking for this? We should run this
> proposal by them (and the wider ceph community) to make sure this
> proposal will address their problem.
> --
> Jeff Layton <jlayton@redhat.com>
>
Jeff Layton June 25, 2019, 1:39 p.m. UTC | #19
On Tue, 2019-06-25 at 06:31 +0800, Yan, Zheng wrote:
> On Tue, Jun 25, 2019 at 5:18 AM Jeff Layton <jlayton@redhat.com> wrote:
> > On Sun, 2019-06-23 at 20:20 -0700, Patrick Donnelly wrote:
> > > On Sun, Jun 23, 2019 at 6:50 PM Yan, Zheng <zyan@redhat.com> wrote:
> > > > On 6/22/19 12:48 AM, Jeff Layton wrote:
> > > > > On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
> > > > > > On 6/20/19 11:33 PM, Jeff Layton wrote:
> > > > > > > On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
> > > > > > > > On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
> > > > > > > > > On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
> > > > > > > > > > On 6/18/19 1:30 AM, Jeff Layton wrote:
> > > > > > > > > > > On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
> > > > > > > > > > > > When remounting aborted mount, also reset client's entity addr.
> > > > > > > > > > > > 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
> > > > > > > > > > > > from blacklist.
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Why do I need to umount here? Once the filesystem is unmounted, then the
> > > > > > > > > > > '-o remount' becomes superfluous, no? In fact, I get an error back when
> > > > > > > > > > > I try to remount an unmounted filesystem:
> > > > > > > > > > > 
> > > > > > > > > > >        $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
> > > > > > > > > > >        mount: /mnt/cephfs: mount point not mounted or bad option.
> > > > > > > > > > > 
> > > > > > > > > > > My client isn't blacklisted above, so I guess you're counting on the
> > > > > > > > > > > umount returning without having actually unmounted the filesystem?
> > > > > > > > > > > 
> > > > > > > > > > > I think this ought to not need a umount first. From a UI standpoint,
> > > > > > > > > > > just doing a "mount -o remount" ought to be sufficient to clear this.
> > > > > > > > > > > 
> > > > > > > > > > This series is mainly for the case that mount point is not umountable.
> > > > > > > > > > If mount point is umountable, user should use 'umount -f /ceph; mount
> > > > > > > > > > /ceph'. This avoids all trouble of error handling.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > ...
> > > > > > > > > 
> > > > > > > > > > If just doing "mount -o remount", user will expect there is no
> > > > > > > > > > data/metadata get lost.  The 'mount -f' explicitly tell user this
> > > > > > > > > > operation may lose data/metadata.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > I don't think they'd expect that and even if they did, that's why we'd
> > > > > > > > > return errors on certain operations until they are cleared. But, I think
> > > > > > > > > all of this points out the main issue I have with this patchset, which
> > > > > > > > > is that it's not clear what problem this is solving.
> > > > > > > > > 
> > > > > > > > > So: client gets blacklisted and we want to allow it to come back in some
> > > > > > > > > fashion. Do we expect applications that happened to be accessing that
> > > > > > > > > mount to be able to continue running, or will they need to be restarted?
> > > > > > > > > If they need to be restarted why not just expect the admin to kill them
> > > > > > > > > all off, unmount and remount and then start them back up again?
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > The point is let users decide what to do. Some user values
> > > > > > > > availability over consistency. It's inconvenient to kill all
> > > > > > > > applications that use the mount, then do umount.
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > I think I have a couple of issues with this patchset. Maybe you can
> > > > > > > convince me though:
> > > > > > > 
> > > > > > > 1) The interface is really weird.
> > > > > > > 
> > > > > > > You suggested that we needed to do:
> > > > > > > 
> > > > > > >       # umount -f /mnt/foo ; mount -o remount /mnt/foo
> > > > > > > 
> > > > > > > ...but what if I'm not really blacklisted? Didn't I just kill off all
> > > > > > > the calls in-flight with the umount -f? What if that umount actually
> > > > > > > succeeds? Then the subsequent remount call will fail.
> > > > > > > 
> > > > > > > ISTM, that this interface (should we choose to accept it) should just
> > > > > > > be:
> > > > > > > 
> > > > > > >       # mount -o remount /mnt/foo
> > > > > > > 
> > > > > > 
> > > > > > I have patch that does
> > > > > > 
> > > > > > mount -o remount,force_reconnect /mnt/ceph
> > > > > > 
> > > > > > 
> > > > > 
> > > > > That seems clearer.
> > > > > 
> > > > > > > ...and if the client figures out that it has been blacklisted, then it
> > > > > > > does the right thing during the remount (whatever that right thing is).
> > > > > > > 
> > > > > > > 2) It's not clear to me who we expect to use this.
> > > > > > > 
> > > > > > > Are you targeting applications that do not use file locking? Any that do
> > > > > > > use file locking will probably need some special handling, but those
> > > > > > > that don't might be able to get by unscathed as long as they can deal
> > > > > > > with -EIO on fsync by replaying writes since the last fsync.
> > > > > > > 
> > > > > > 
> > > > > > Several users said they availability over consistency. For example:
> > > > > > ImageNet training, cephfs is used for storing image files.
> > > > > > 
> > > > > > 
> > > > > 
> > > > > Which sounds reasonable on its face...but why bother with remounting at
> > > > > that point? Why not just have the client reattempt connections until it
> > > > > succeeds (or you forcibly unmount).
> > > > > 
> > > > > For that matter, why not just redirty the pages after the writes fail in
> > > > > that case instead of forcing those users to rewrite their data? If they
> > > > > don't care about consistency that much, then that would seem to be a
> > > > > nicer way to deal with this.
> > > > > 
> > > > 
> > > > I'm not clear about this either
> > > 
> > > As I've said elsewhere: **how** the client recovers from the lost
> > > session and blacklist event is configurable. There should be a range
> > > of mount options which control the behavior: such as a _hypothetical_
> > > "recover_session=<mode>", where mode may be:
> > > 
> > > - "brute": re-acquire capabilities and flush all dirty data. All open
> > > file handles continue to work normally. Dangerous and definitely not
> > > the default. (How should file locks be handled?)
> > > 
> > 
> > IMO, just reacquire them as if nothing happened for this mode. I see
> > this as conceptually similar to recover_lost_locks module parameter in
> > nfs.ko. That said, we will need to consider what to do if the lock can't
> > be reacquired in this mode.
> > 
> > > - "clean": re-acquire read capabilities and drop dirty write buffers.
> > > Writable file handles return -EIO. Locks are lost and the lock owners
> > > are sent SIGIO, si_code=SI_LOST, si_fd=lockedfd (default is
> > > termination!). Read-only handles continue to work and caches are
> > > dropped if necessary. This should probably be the default.
> > > 
> > 
> > Sounds good, except for maybe modulo SIGLOST handling for reasons I
> > outlined in another thread.
> > 
> > > - "fresh": like "clean" but read-only handles also return -EIO. Not
> > > sure if this one is useful but not difficult to add.
> > > 
> > 
> > Meh, maybe. If we don't clearly need it then let's not add it. I'd want
> > to know that someone has an actual use for this option. Adding
> > interfaces just because we can, just makes trouble later as the code
> > ages.
> > 
> > > No "-o remount" mount commands necessary.
> > > 
> > > Now, these details are open for change. I'm just trying to suggest a
> > > way forward. I'm not well versed in how difficult this proposal is to
> > > implement in the kernel. There are probably details or challenges I'm
> > > not considering. I recommend that before Zheng writes new code that he
> > > and Jeff work out what the right semantics and configurations should
> > > be and make a proposal to ceph-devel/dev@ceph.io for user feedback.
> > > 
> > 
> > That sounds a bit more reasonable. I'd prefer not having to wait for
> > admin intervention in order to get things moving again if the goal is
> > making things more available.
> > 
> > That said, whenever we're doing something like this, it's easy for all
> > of us to make subtle assumptions and end up talking at cross-purposes to
> > one another. The first step here is to clearly identify the problem
> > we're trying to solve. From earlier emails I'd suggest this as a
> > starting point:
> > 
> > "Clients can end up blacklisted due to various connectivity issues, and
> > we'd like to offer admins a way to configure the mount to reconnect
> > after blacklisting/unblacklisting, and continue working. Preferably,
> > with no disruption to the application other than the client hanging
> > while blacklisted."
> > 
> > Does this sound about right?
> > 
> > If so, then I think we ought to aim for something closer to what Patrick
> > is suggesting; a mount option or something that causes the cephfs client
> > to aggressively attempt to recover after being unblacklisted.
> > 
> 
> Clients shouldn't be too aggressively in this case. Otherwise they can
> easily create too many blacklist entries in osdmap.
> 

When I said "aggressively" I meant on the order of once a minute or so,
though that interval could be tunable.

Can blacklisted clients still request osd maps from the monitors? IOW,
is there a way for the client to determine whether it has been
blacklisted? If so, then when the client suspects that it has been
blacklisted it could just wait until the new OSD map shows otherwise.

In any case, I thought blacklisting mostly occurred when clients fail to
give up their MDS caps. Why would repeated polling create more blacklist
entries?
Yan, Zheng June 26, 2019, 5:16 a.m. UTC | #20
On 6/25/19 9:39 PM, Jeff Layton wrote:
> On Tue, 2019-06-25 at 06:31 +0800, Yan, Zheng wrote:
>> On Tue, Jun 25, 2019 at 5:18 AM Jeff Layton <jlayton@redhat.com> wrote:
>>> On Sun, 2019-06-23 at 20:20 -0700, Patrick Donnelly wrote:
>>>> On Sun, Jun 23, 2019 at 6:50 PM Yan, Zheng <zyan@redhat.com> wrote:
>>>>> On 6/22/19 12:48 AM, Jeff Layton wrote:
>>>>>> On Fri, 2019-06-21 at 16:10 +0800, Yan, Zheng wrote:
>>>>>>> On 6/20/19 11:33 PM, Jeff Layton wrote:
>>>>>>>> On Wed, 2019-06-19 at 08:24 +0800, Yan, Zheng wrote:
>>>>>>>>> On Tue, Jun 18, 2019 at 6:39 PM Jeff Layton <jlayton@redhat.com> wrote:
>>>>>>>>>> On Tue, 2019-06-18 at 14:25 +0800, Yan, Zheng wrote:
>>>>>>>>>>> On 6/18/19 1:30 AM, Jeff Layton wrote:
>>>>>>>>>>>> On Mon, 2019-06-17 at 20:55 +0800, Yan, Zheng wrote:
>>>>>>>>>>>>> When remounting aborted mount, also reset client's entity addr.
>>>>>>>>>>>>> 'umount -f /ceph; mount -o remount /ceph' can be used for recovering
>>>>>>>>>>>>> from blacklist.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> Why do I need to umount here? Once the filesystem is unmounted, then the
>>>>>>>>>>>> '-o remount' becomes superfluous, no? In fact, I get an error back when
>>>>>>>>>>>> I try to remount an unmounted filesystem:
>>>>>>>>>>>>
>>>>>>>>>>>>         $ sudo umount -f /mnt/cephfs ; sudo mount -o remount /mnt/cephfs
>>>>>>>>>>>>         mount: /mnt/cephfs: mount point not mounted or bad option.
>>>>>>>>>>>>
>>>>>>>>>>>> My client isn't blacklisted above, so I guess you're counting on the
>>>>>>>>>>>> umount returning without having actually unmounted the filesystem?
>>>>>>>>>>>>
>>>>>>>>>>>> I think this ought to not need a umount first. From a UI standpoint,
>>>>>>>>>>>> just doing a "mount -o remount" ought to be sufficient to clear this.
>>>>>>>>>>>>
>>>>>>>>>>> This series is mainly for the case that mount point is not umountable.
>>>>>>>>>>> If mount point is umountable, user should use 'umount -f /ceph; mount
>>>>>>>>>>> /ceph'. This avoids all trouble of error handling.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> ...
>>>>>>>>>>
>>>>>>>>>>> If just doing "mount -o remount", user will expect there is no
>>>>>>>>>>> data/metadata get lost.  The 'mount -f' explicitly tell user this
>>>>>>>>>>> operation may lose data/metadata.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I don't think they'd expect that and even if they did, that's why we'd
>>>>>>>>>> return errors on certain operations until they are cleared. But, I think
>>>>>>>>>> all of this points out the main issue I have with this patchset, which
>>>>>>>>>> is that it's not clear what problem this is solving.
>>>>>>>>>>
>>>>>>>>>> So: client gets blacklisted and we want to allow it to come back in some
>>>>>>>>>> fashion. Do we expect applications that happened to be accessing that
>>>>>>>>>> mount to be able to continue running, or will they need to be restarted?
>>>>>>>>>> If they need to be restarted why not just expect the admin to kill them
>>>>>>>>>> all off, unmount and remount and then start them back up again?
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> The point is let users decide what to do. Some user values
>>>>>>>>> availability over consistency. It's inconvenient to kill all
>>>>>>>>> applications that use the mount, then do umount.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>> I think I have a couple of issues with this patchset. Maybe you can
>>>>>>>> convince me though:
>>>>>>>>
>>>>>>>> 1) The interface is really weird.
>>>>>>>>
>>>>>>>> You suggested that we needed to do:
>>>>>>>>
>>>>>>>>        # umount -f /mnt/foo ; mount -o remount /mnt/foo
>>>>>>>>
>>>>>>>> ...but what if I'm not really blacklisted? Didn't I just kill off all
>>>>>>>> the calls in-flight with the umount -f? What if that umount actually
>>>>>>>> succeeds? Then the subsequent remount call will fail.
>>>>>>>>
>>>>>>>> ISTM, that this interface (should we choose to accept it) should just
>>>>>>>> be:
>>>>>>>>
>>>>>>>>        # mount -o remount /mnt/foo
>>>>>>>>
>>>>>>>
>>>>>>> I have patch that does
>>>>>>>
>>>>>>> mount -o remount,force_reconnect /mnt/ceph
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> That seems clearer.
>>>>>>
>>>>>>>> ...and if the client figures out that it has been blacklisted, then it
>>>>>>>> does the right thing during the remount (whatever that right thing is).
>>>>>>>>
>>>>>>>> 2) It's not clear to me who we expect to use this.
>>>>>>>>
>>>>>>>> Are you targeting applications that do not use file locking? Any that do
>>>>>>>> use file locking will probably need some special handling, but those
>>>>>>>> that don't might be able to get by unscathed as long as they can deal
>>>>>>>> with -EIO on fsync by replaying writes since the last fsync.
>>>>>>>>
>>>>>>>
>>>>>>> Several users said they availability over consistency. For example:
>>>>>>> ImageNet training, cephfs is used for storing image files.
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Which sounds reasonable on its face...but why bother with remounting at
>>>>>> that point? Why not just have the client reattempt connections until it
>>>>>> succeeds (or you forcibly unmount).
>>>>>>
>>>>>> For that matter, why not just redirty the pages after the writes fail in
>>>>>> that case instead of forcing those users to rewrite their data? If they
>>>>>> don't care about consistency that much, then that would seem to be a
>>>>>> nicer way to deal with this.
>>>>>>
>>>>>
>>>>> I'm not clear about this either
>>>>
>>>> As I've said elsewhere: **how** the client recovers from the lost
>>>> session and blacklist event is configurable. There should be a range
>>>> of mount options which control the behavior: such as a _hypothetical_
>>>> "recover_session=<mode>", where mode may be:
>>>>
>>>> - "brute": re-acquire capabilities and flush all dirty data. All open
>>>> file handles continue to work normally. Dangerous and definitely not
>>>> the default. (How should file locks be handled?)
>>>>
>>>
>>> IMO, just reacquire them as if nothing happened for this mode. I see
>>> this as conceptually similar to recover_lost_locks module parameter in
>>> nfs.ko. That said, we will need to consider what to do if the lock can't
>>> be reacquired in this mode.
>>>
>>>> - "clean": re-acquire read capabilities and drop dirty write buffers.
>>>> Writable file handles return -EIO. Locks are lost and the lock owners
>>>> are sent SIGIO, si_code=SI_LOST, si_fd=lockedfd (default is
>>>> termination!). Read-only handles continue to work and caches are
>>>> dropped if necessary. This should probably be the default.
>>>>
>>>
>>> Sounds good, except for maybe modulo SIGLOST handling for reasons I
>>> outlined in another thread.
>>>
>>>> - "fresh": like "clean" but read-only handles also return -EIO. Not
>>>> sure if this one is useful but not difficult to add.
>>>>
>>>
>>> Meh, maybe. If we don't clearly need it then let's not add it. I'd want
>>> to know that someone has an actual use for this option. Adding
>>> interfaces just because we can, just makes trouble later as the code
>>> ages.
>>>
>>>> No "-o remount" mount commands necessary.
>>>>
>>>> Now, these details are open for change. I'm just trying to suggest a
>>>> way forward. I'm not well versed in how difficult this proposal is to
>>>> implement in the kernel. There are probably details or challenges I'm
>>>> not considering. I recommend that before Zheng writes new code that he
>>>> and Jeff work out what the right semantics and configurations should
>>>> be and make a proposal to ceph-devel/dev@ceph.io for user feedback.
>>>>
>>>
>>> That sounds a bit more reasonable. I'd prefer not having to wait for
>>> admin intervention in order to get things moving again if the goal is
>>> making things more available.
>>>
>>> That said, whenever we're doing something like this, it's easy for all
>>> of us to make subtle assumptions and end up talking at cross-purposes to
>>> one another. The first step here is to clearly identify the problem
>>> we're trying to solve. From earlier emails I'd suggest this as a
>>> starting point:
>>>
>>> "Clients can end up blacklisted due to various connectivity issues, and
>>> we'd like to offer admins a way to configure the mount to reconnect
>>> after blacklisting/unblacklisting, and continue working. Preferably,
>>> with no disruption to the application other than the client hanging
>>> while blacklisted."
>>>
>>> Does this sound about right?
>>>
>>> If so, then I think we ought to aim for something closer to what Patrick
>>> is suggesting; a mount option or something that causes the cephfs client
>>> to aggressively attempt to recover after being unblacklisted.
>>>
>>
>> Clients shouldn't be too aggressively in this case. Otherwise they can
>> easily create too many blacklist entries in osdmap.
>>
> 
> When I said "aggressively" I meant on the order of once a minute or so,
> though that interval could be tunable.
> 
> Can blacklisted clients still request osd maps from the monitors? IOW,
> is there a way for the client to determine whether it has been
> blacklisted? If so, then when the client suspects that it has been
> blacklisted it could just wait until the new OSD map shows otherwise.

A blacklisted client can still request the osdmap. But for the kernel client,
the osdmap is too complex to fully decode.

I don't understand what "wait until the new OSD map shows otherwise" means.

> 
> In any case, I thought blacklisting mostly occurred when clients fail to
> give up their MDS caps. Why would repeated polling create more blacklist
> entries?
> 

The MDS blacklists a client when the client is unresponsive. The blacklist
entry stays in the osdmap for a day. If clients auto-reconnect, a laggy
client can keep reconnecting and getting blacklisted again.
Patrick Donnelly June 26, 2019, 5:33 a.m. UTC | #21
On Tue, Jun 25, 2019 at 10:16 PM Yan, Zheng <zyan@redhat.com> wrote:
> On 6/25/19 9:39 PM, Jeff Layton wrote:
> > In any case, I thought blacklisting mostly occurred when clients fail to
> > give up their MDS caps. Why would repeated polling create more blacklist
> > entries?
> >
>
> mds blacklist client when client is unresponsive. The blacklist entry
> stays in osdmap for a day. If client auto reconnect, a laggy client can
> keep reconnecting and getting blacklisted.

We can address this problem in two directions:

1) MDS simply rejects a client session from a certain IP address if
the MDS has been repeatedly blacklisting clients from that IP. If the
client never has a new session, there is no need add a new blacklist
entry.

2) The client can be smarter and not retry connections if it has been
blacklisted ~3 times in the last hour. (Just be sure to log that it is
deferring reconnect until later when it thinks the session will be
successful.)

Patch
diff mbox series

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 19c62cf7d5b8..188c33709d9a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1378,9 +1378,12 @@  static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc = fsc->mdsc;
 
-		if (ci->i_wrbuffer_ref > 0 &&
-		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
-			invalidate = true;
+		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			if (inode->i_data.nrpages > 0)
+				invalidate = true;
+			if (ci->i_wrbuffer_ref > 0)
+				mapping_set_error(&inode->i_data, -EIO);
+		}
 
 		while (!list_empty(&ci->i_cap_flush_list)) {
 			cf = list_first_entry(&ci->i_cap_flush_list,
@@ -4350,7 +4353,12 @@  void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
 		session = __ceph_lookup_mds_session(mdsc, mds);
 		if (!session)
 			continue;
+
+		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+			__unregister_session(mdsc, session);
+		__wake_requests(mdsc, &session->s_waiting);
 		mutex_unlock(&mdsc->mutex);
+
 		mutex_lock(&session->s_mutex);
 		__close_session(mdsc, session);
 		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4359,9 +4367,11 @@  void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
 		}
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
+
 		mutex_lock(&mdsc->mutex);
 		kick_requests(mdsc, mds);
 	}
+
 	__wake_requests(mdsc, &mdsc->waiting_for_map);
 	mutex_unlock(&mdsc->mutex);
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 67eb9d592ab7..a6a3c065f697 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -833,8 +833,27 @@  static void ceph_umount_begin(struct super_block *sb)
 
 static int ceph_remount(struct super_block *sb, int *flags, char *data)
 {
-	sync_filesystem(sb);
-	return 0;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+	if (fsc->mount_state != CEPH_MOUNT_SHUTDOWN) {
+		sync_filesystem(sb);
+		return 0;
+	}
+
+	/* Make sure all page caches get invalidated.
+	 * see remove_session_caps_cb() */
+	flush_workqueue(fsc->inode_wq);
+	/* In case that we were blacklisted. This also reset
+	 * all mon/osd connections */
+	ceph_reset_client_addr(fsc->client);
+
+	ceph_osdc_clear_abort_err(&fsc->client->osdc);
+	fsc->mount_state = 0;
+
+	if (!sb->s_root)
+		return 0;
+	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
+				 CEPH_STAT_CAP_INODE, true);
 }
 
 static const struct super_operations ceph_super_ops = {