ceph: add halt mount option support
diff mbox series

Message ID 20200216064945.61726-1-xiubli@redhat.com
State New
Headers show
Series
  • ceph: add halt mount option support
Related show

Commit Message

Xiubo Li Feb. 16, 2020, 6:49 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

This will simulate pulling the power cable situation, which will
do:

- abort all the inflight osd/mds requests and fail them with -EIO.
- reject any new coming osd/mds requests with -EIO.
- close all the mds connections directly without doing any clean up
  and disable mds sessions recovery routine.
- close all the osd connections directly without doing any clean up.
- set the msgr as stopped.

URL: https://tracker.ceph.com/issues/44044
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c            | 12 ++++++++++--
 fs/ceph/mds_client.h            |  3 ++-
 fs/ceph/super.c                 | 33 ++++++++++++++++++++++++++++-----
 fs/ceph/super.h                 |  1 +
 include/linux/ceph/libceph.h    |  1 +
 include/linux/ceph/mon_client.h |  2 ++
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/ceph_common.c          | 14 ++++++++++++++
 net/ceph/mon_client.c           | 16 ++++++++++++++--
 net/ceph/osd_client.c           | 11 +++++++++++
 10 files changed, 84 insertions(+), 10 deletions(-)

Comments

Jeff Layton Feb. 17, 2020, 1:04 p.m. UTC | #1
On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> This will simulate pulling the power cable situation, which will
> do:
> 
> - abort all the inflight osd/mds requests and fail them with -EIO.
> - reject any new coming osd/mds requests with -EIO.
> - close all the mds connections directly without doing any clean up
>   and disable mds sessions recovery routine.
> - close all the osd connections directly without doing any clean up.
> - set the msgr as stopped.
> 
> URL: https://tracker.ceph.com/issues/44044
> Signed-off-by: Xiubo Li <xiubli@redhat.com>

There is no explanation of how to actually _use_ this feature? I assume
you have to remount the fs with "-o remount,halt" ? Is it possible to
reenable the mount as well?  If not, why keep the mount around? Maybe we
should consider wiring this in to a new umount2() flag instead?

This needs much better documentation.

In the past, I've generally done this using iptables. Granted that that
is difficult with a clustered fs like ceph (given that you potentially
have to set rules for a lot of addresses), but I wonder whether a scheme
like that might be more viable in the long run.

Note too that this may have interesting effects when superblocks end up
being shared between vfsmounts.

> ---
>  fs/ceph/mds_client.c            | 12 ++++++++++--
>  fs/ceph/mds_client.h            |  3 ++-
>  fs/ceph/super.c                 | 33 ++++++++++++++++++++++++++++-----
>  fs/ceph/super.h                 |  1 +
>  include/linux/ceph/libceph.h    |  1 +
>  include/linux/ceph/mon_client.h |  2 ++
>  include/linux/ceph/osd_client.h |  1 +
>  net/ceph/ceph_common.c          | 14 ++++++++++++++
>  net/ceph/mon_client.c           | 16 ++++++++++++++--
>  net/ceph/osd_client.c           | 11 +++++++++++
>  10 files changed, 84 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index b0f34251ad28..b6aa357f7c61 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4110,6 +4110,9 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>  {
>  	struct ceph_fs_client *fsc = mdsc->fsc;
>  
> +	if (ceph_test_mount_opt(fsc, HALT))
> +		return;
> +
>  	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
>  		return;
>  
> @@ -4735,7 +4738,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>  	dout("stopped\n");
>  }
>  
> -void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
> +void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc, bool halt)
>  {
>  	struct ceph_mds_session *session;
>  	int mds;
> @@ -4748,7 +4751,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>  		if (!session)
>  			continue;
>  
> -		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
> +		/*
> +		 * when halting the superblock, it will simulate pulling
> +		 * the power cable, so here close the connection before
> +		 * doing any cleanup.
> +		 */
> +		if (halt || (session->s_state == CEPH_MDS_SESSION_REJECTED))
>  			__unregister_session(mdsc, session);

Note that this is not exactly like pulling the power cable. The
connection will be closed, which will send a FIN to the peer.

>  		__wake_requests(mdsc, &session->s_waiting);
>  		mutex_unlock(&mdsc->mutex);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index c13910da07c4..b66eea830ae1 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -478,7 +478,8 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
>  
>  extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
>  extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
> -extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
> +extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc,
> +				   bool halt);
>  extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
>  
>  extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 8b52bea13273..2a6fd5d2fffa 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -155,6 +155,7 @@ enum {
>  	Opt_acl,
>  	Opt_quotadf,
>  	Opt_copyfrom,
> +	Opt_halt,
>  };
>  
>  enum ceph_recover_session_mode {
> @@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = {
>  	fsparam_string	("snapdirname",			Opt_snapdirname),
>  	fsparam_string	("source",			Opt_source),
>  	fsparam_u32	("wsize",			Opt_wsize),
> +	fsparam_flag	("halt",			Opt_halt),
>  	{}
>  };
>  
> @@ -435,6 +437,9 @@ static int ceph_parse_mount_param(struct fs_context *fc,
>  			fc->sb_flags &= ~SB_POSIXACL;
>  		}
>  		break;
> +	case Opt_halt:
> +		fsopt->flags |= CEPH_MOUNT_OPT_HALT;
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -601,6 +606,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
>  	if (m->count == pos)
>  		m->count--;
>  
> +	if (fsopt->flags & CEPH_MOUNT_OPT_HALT)
> +		seq_puts(m, ",halt");
>  	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
>  		seq_puts(m, ",dirstat");
>  	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
> @@ -877,22 +884,28 @@ static void destroy_caches(void)
>  }
>  
>  /*
> - * ceph_umount_begin - initiate forced umount.  Tear down down the
> - * mount, skipping steps that may hang while waiting for server(s).
> + * ceph_umount_begin - initiate forced umount.  Tear down the mount,
> + * skipping steps that may hang while waiting for server(s).
>   */
> -static void ceph_umount_begin(struct super_block *sb)
> +static void __ceph_umount_begin(struct super_block *sb, bool halt)
>  {
>  	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
>  
> -	dout("ceph_umount_begin - starting forced umount\n");
>  	if (!fsc)
>  		return;
>  	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
>  	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
> -	ceph_mdsc_force_umount(fsc->mdsc);
> +	ceph_mdsc_force_umount(fsc->mdsc, halt);
>  	fsc->filp_gen++; // invalidate open files
>  }
>  
> +static void ceph_umount_begin(struct super_block *sb)
> +{
> +	dout("%s - starting forced umount\n", __func__);
> +
> +	__ceph_umount_begin(sb, false);
> +}
> +
>  static const struct super_operations ceph_super_ops = {
>  	.alloc_inode	= ceph_alloc_inode,
>  	.free_inode	= ceph_free_inode,
> @@ -1193,6 +1206,16 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
>  	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
>  	struct ceph_mount_options *new_fsopt = pctx->opts;
>  
> +	/* halt the mount point, will ignore other options */
> +	if (new_fsopt->flags & CEPH_MOUNT_OPT_HALT) {
> +		dout("halt the mount point\n");
> +		fsopt->flags |= CEPH_MOUNT_OPT_HALT;
> +		__ceph_umount_begin(sb, true);
> +		ceph_halt_client(fsc->client);
> +
> +		return 0;
> +	}
> +
>  	sync_filesystem(sb);
>  
>  	if (strcmp_null(new_fsopt->snapdir_name, fsopt->snapdir_name))
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c40e86ad016..64f16083b216 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -43,6 +43,7 @@
>  #define CEPH_MOUNT_OPT_MOUNTWAIT       (1<<12) /* mount waits if no mds is up */
>  #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
>  #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
> +#define CEPH_MOUNT_OPT_HALT            (1<<15) /* halt the mount point */
>  
>  #define CEPH_MOUNT_OPT_DEFAULT			\
>  	(CEPH_MOUNT_OPT_DCACHE |		\
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 8fe9b80e80a5..12e9f0cc8501 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -295,6 +295,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
>  struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
>  u64 ceph_client_gid(struct ceph_client *client);
>  extern void ceph_destroy_client(struct ceph_client *client);
> +void ceph_halt_client(struct ceph_client *client);
>  extern void ceph_reset_client_addr(struct ceph_client *client);
>  extern int __ceph_open_session(struct ceph_client *client,
>  			       unsigned long started);
> diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
> index dbb8a6959a73..7718a2e65d07 100644
> --- a/include/linux/ceph/mon_client.h
> +++ b/include/linux/ceph/mon_client.h
> @@ -78,6 +78,7 @@ struct ceph_mon_client {
>  	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
>  	int pending_auth;
>  
> +	bool halt;
>  	bool hunting;
>  	int cur_mon;                       /* last monitor i contacted */
>  	unsigned long sub_renew_after;
> @@ -109,6 +110,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
>  
>  extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
>  extern void ceph_monc_stop(struct ceph_mon_client *monc);
> +void ceph_monc_halt(struct ceph_mon_client *monc);
>  extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
>  
>  enum {
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 02ff3a302d26..4b9143f7d989 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -382,6 +382,7 @@ extern void ceph_osdc_cleanup(void);
>  extern int ceph_osdc_init(struct ceph_osd_client *osdc,
>  			  struct ceph_client *client);
>  extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
> +extern void ceph_osdc_halt(struct ceph_osd_client *osdc);
>  extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
>  
>  extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index a9d6c97b5b0d..c47578ed0546 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -652,6 +652,20 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
>  }
>  EXPORT_SYMBOL(ceph_create_client);
>  
> +void ceph_halt_client(struct ceph_client *client)
> +{
> +	dout("halt_client %p\n", client);
> +
> +	atomic_set(&client->msgr.stopping, 1);
> +
> +	/* unmount */
> +	ceph_osdc_halt(&client->osdc);
> +	ceph_monc_halt(&client->monc);
> +
> +	dout("halt_client %p done\n", client);
> +}
> +EXPORT_SYMBOL(ceph_halt_client);
> +
>  void ceph_destroy_client(struct ceph_client *client)
>  {
>  	dout("destroy_client %p\n", client);
> diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> index 9d9e4e4ea600..5819a02af7fe 100644
> --- a/net/ceph/mon_client.c
> +++ b/net/ceph/mon_client.c
> @@ -979,14 +979,16 @@ static void delayed_work(struct work_struct *work)
>  	mutex_lock(&monc->mutex);
>  	if (monc->hunting) {
>  		dout("%s continuing hunt\n", __func__);
> -		reopen_session(monc);
> +		if (!monc->halt)
> +			reopen_session(monc);
>  	} else {
>  		int is_auth = ceph_auth_is_authenticated(monc->auth);
>  		if (ceph_con_keepalive_expired(&monc->con,
>  					       CEPH_MONC_PING_TIMEOUT)) {
>  			dout("monc keepalive timeout\n");
>  			is_auth = 0;
> -			reopen_session(monc);
> +			if (!monc->halt)
> +				reopen_session(monc);
>  		}
>  
>  		if (!monc->hunting) {
> @@ -1115,6 +1117,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
>  }
>  EXPORT_SYMBOL(ceph_monc_init);
>  
> +void ceph_monc_halt(struct ceph_mon_client *monc)
> +{
> +	dout("monc halt\n");
> +
> +	mutex_lock(&monc->mutex);
> +	monc->halt = true;
> +	ceph_con_close(&monc->con);
> +	mutex_unlock(&monc->mutex);
> +}
> +

The changelog doesn't mention shutting down connections to the mons.

>  void ceph_monc_stop(struct ceph_mon_client *monc)
>  {
>  	dout("stop\n");
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 108c9457d629..161daf35d7f1 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -5202,6 +5202,17 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
>  	return err;
>  }
>  
> +void ceph_osdc_halt(struct ceph_osd_client *osdc)
> +{
> +	down_write(&osdc->lock);
> +	while (!RB_EMPTY_ROOT(&osdc->osds)) {
> +		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
> +						struct ceph_osd, o_node);
> +		close_osd(osd);
> +	}
> +	up_write(&osdc->lock);
> +}
> +
>  void ceph_osdc_stop(struct ceph_osd_client *osdc)
>  {
>  	destroy_workqueue(osdc->completion_wq);
Xiubo Li Feb. 17, 2020, 3:45 p.m. UTC | #2
On 2020/2/17 21:04, Jeff Layton wrote:
> On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> This will simulate pulling the power cable situation, which will
>> do:
>>
>> - abort all the inflight osd/mds requests and fail them with -EIO.
>> - reject any new coming osd/mds requests with -EIO.
>> - close all the mds connections directly without doing any clean up
>>    and disable mds sessions recovery routine.
>> - close all the osd connections directly without doing any clean up.
>> - set the msgr as stopped.
>>
>> URL: https://tracker.ceph.com/issues/44044
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> There is no explanation of how to actually _use_ this feature?

 From the tracker's description I am assuming it will just be used for 
things like testing certain cephfs features by simulating that the 
connections to the client node are lost for some reason, such as a power loss.

>   I assume
> you have to remount the fs with "-o remount,halt" ?

Yeah, right.


> Is it possible to
> reenable the mount as well?

For the "halt", no.

>    If not, why keep the mount around?

I would let the subsequent umount continue to do the cleanup.


>   Maybe we
> should consider wiring this in to a new umount2() flag instead?

For the umount2(), it seems to conflict with MNT_FORCE, though it is a 
little different.


>
> This needs much better documentation.
>
> In the past, I've generally done this using iptables. Granted that that
> is difficult with a clustered fs like ceph (given that you potentially
> have to set rules for a lot of addresses), but I wonder whether a scheme
> like that might be more viable in the long run.
>
> Note too that this may have interesting effects when superblocks end up
> being shared between vfsmounts.

Yeah, this is based on the superblock, so for shared vfsmounts, they 
will all be halted at the same time.


>> @@ -4748,7 +4751,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
>>   		if (!session)
>>   			continue;
>>   
>> -		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
>> +		/*
>> +		 * when halting the superblock, it will simulate pulling
>> +		 * the power cable, so here close the connection before
>> +		 * doing any cleanup.
>> +		 */
>> +		if (halt || (session->s_state == CEPH_MDS_SESSION_REJECTED))
>>   			__unregister_session(mdsc, session);
> Note that this is not exactly like pulling the power cable. The
> connection will be closed, which will send a FIN to the peer.

Yeah, it is.

I was thinking about the fuse client: if we send it a KILL signal, won't 
the kernel also close the socket fds for us and send a FIN to the peer?

If that works for the fuse client in this case, it will work here as well.

>> @@ -1115,6 +1117,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
>>   }
>>   EXPORT_SYMBOL(ceph_monc_init);
>>   
>> +void ceph_monc_halt(struct ceph_mon_client *monc)
>> +{
>> +	dout("monc halt\n");
>> +
>> +	mutex_lock(&monc->mutex);
>> +	monc->halt = true;
>> +	ceph_con_close(&monc->con);
>> +	mutex_unlock(&monc->mutex);
>> +}
>> +
> The changelog doesn't mention shutting down connections to the mons.

Yeah, I missed it.

Thanks,

BRs
Xiubo Li Feb. 18, 2020, 7:19 a.m. UTC | #3
On 2020/2/17 21:04, Jeff Layton wrote:
> On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> This will simulate pulling the power cable situation, which will
>> do:
>>
>> - abort all the inflight osd/mds requests and fail them with -EIO.
>> - reject any new coming osd/mds requests with -EIO.
>> - close all the mds connections directly without doing any clean up
>>    and disable mds sessions recovery routine.
>> - close all the osd connections directly without doing any clean up.
>> - set the msgr as stopped.
>>
>> URL: https://tracker.ceph.com/issues/44044
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> There is no explanation of how to actually _use_ this feature? I assume
> you have to remount the fs with "-o remount,halt" ? Is it possible to
> reenable the mount as well?  If not, why keep the mount around? Maybe we
> should consider wiring this in to a new umount2() flag instead?
>
> This needs much better documentation.
>
> In the past, I've generally done this using iptables. Granted that that
> is difficult with a clustered fs like ceph (given that you potentially
> have to set rules for a lot of addresses), but I wonder whether a scheme
> like that might be more viable in the long run.
>
How about fulfilling the DROP iptable rules in libceph ? Could you 
foresee any problem ? This seems the one approach could simulate pulling 
the power cable.

Any idea ?

Thanks

BRs
Jeff Layton Feb. 18, 2020, 12:01 p.m. UTC | #4
On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
> On 2020/2/17 21:04, Jeff Layton wrote:
> > On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > This will simulate pulling the power cable situation, which will
> > > do:
> > > 
> > > - abort all the inflight osd/mds requests and fail them with -EIO.
> > > - reject any new coming osd/mds requests with -EIO.
> > > - close all the mds connections directly without doing any clean up
> > >    and disable mds sessions recovery routine.
> > > - close all the osd connections directly without doing any clean up.
> > > - set the msgr as stopped.
> > > 
> > > URL: https://tracker.ceph.com/issues/44044
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > There is no explanation of how to actually _use_ this feature? I assume
> > you have to remount the fs with "-o remount,halt" ? Is it possible to
> > reenable the mount as well?  If not, why keep the mount around? Maybe we
> > should consider wiring this in to a new umount2() flag instead?
> > 
> > This needs much better documentation.
> > 
> > In the past, I've generally done this using iptables. Granted that that
> > is difficult with a clustered fs like ceph (given that you potentially
> > have to set rules for a lot of addresses), but I wonder whether a scheme
> > like that might be more viable in the long run.
> > 
> How about fulfilling the DROP iptable rules in libceph ? Could you 
> foresee any problem ? This seems the one approach could simulate pulling 
> the power cable.
> 

Yeah, I've mostly done this using DROP rules when I needed to test things.
But, I think I was probably just guilty of speculating out loud here.

I think doing this by just closing down the sockets is probably fine. I
wouldn't pursue anything relating to to iptables here, unless we have
some larger reason to go that route.
Xiubo Li Feb. 18, 2020, 2:32 p.m. UTC | #5
On 2020/2/18 20:01, Jeff Layton wrote:
> On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
>> On 2020/2/17 21:04, Jeff Layton wrote:
>>> On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> This will simulate pulling the power cable situation, which will
>>>> do:
>>>>
>>>> - abort all the inflight osd/mds requests and fail them with -EIO.
>>>> - reject any new coming osd/mds requests with -EIO.
>>>> - close all the mds connections directly without doing any clean up
>>>>     and disable mds sessions recovery routine.
>>>> - close all the osd connections directly without doing any clean up.
>>>> - set the msgr as stopped.
>>>>
>>>> URL: https://tracker.ceph.com/issues/44044
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>> There is no explanation of how to actually _use_ this feature? I assume
>>> you have to remount the fs with "-o remount,halt" ? Is it possible to
>>> reenable the mount as well?  If not, why keep the mount around? Maybe we
>>> should consider wiring this in to a new umount2() flag instead?
>>>
>>> This needs much better documentation.
>>>
>>> In the past, I've generally done this using iptables. Granted that that
>>> is difficult with a clustered fs like ceph (given that you potentially
>>> have to set rules for a lot of addresses), but I wonder whether a scheme
>>> like that might be more viable in the long run.
>>>
>> How about fulfilling the DROP iptable rules in libceph ? Could you
>> foresee any problem ? This seems the one approach could simulate pulling
>> the power cable.
>>
> Yeah, I've mostly done this using DROP rules when I needed to test things.
> But, I think I was probably just guilty of speculating out loud here.
>
> I think doing this by just closing down the sockets is probably fine. I
> wouldn't pursue anything relating to to iptables here, unless we have
> some larger reason to go that route.

I have dug into the socket and netfilter code today; doing this in 
kernel space is fairly easy.

Let's test it first; if closing the sockets directly does not work well 
or as expected, we can try that approach then.

Thanks.

BRs
Ilya Dryomov Feb. 18, 2020, 2:59 p.m. UTC | #6
On Tue, Feb 18, 2020 at 1:01 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
> > On 2020/2/17 21:04, Jeff Layton wrote:
> > > On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
> > > > From: Xiubo Li <xiubli@redhat.com>
> > > >
> > > > This will simulate pulling the power cable situation, which will
> > > > do:
> > > >
> > > > - abort all the inflight osd/mds requests and fail them with -EIO.
> > > > - reject any new coming osd/mds requests with -EIO.
> > > > - close all the mds connections directly without doing any clean up
> > > >    and disable mds sessions recovery routine.
> > > > - close all the osd connections directly without doing any clean up.
> > > > - set the msgr as stopped.
> > > >
> > > > URL: https://tracker.ceph.com/issues/44044
> > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > There is no explanation of how to actually _use_ this feature? I assume
> > > you have to remount the fs with "-o remount,halt" ? Is it possible to
> > > reenable the mount as well?  If not, why keep the mount around? Maybe we
> > > should consider wiring this in to a new umount2() flag instead?
> > >
> > > This needs much better documentation.
> > >
> > > In the past, I've generally done this using iptables. Granted that that
> > > is difficult with a clustered fs like ceph (given that you potentially
> > > have to set rules for a lot of addresses), but I wonder whether a scheme
> > > like that might be more viable in the long run.
> > >
> > How about fulfilling the DROP iptable rules in libceph ? Could you
> > foresee any problem ? This seems the one approach could simulate pulling
> > the power cable.
> >
>
> Yeah, I've mostly done this using DROP rules when I needed to test things.
> But, I think I was probably just guilty of speculating out loud here.

I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
in libceph, but I will say that any kind of iptables manipulation from
within libceph is probably out of the question.

>
> I think doing this by just closing down the sockets is probably fine. I
> wouldn't pursue anything relating to to iptables here, unless we have
> some larger reason to go that route.

IMO investing into a set of iptables and tc helpers for teuthology
makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
but it's probably the next best thing.  First, it will be external to
the system under test.  Second, it can be made selective -- you can
cut a single session or all of them, simulate packet loss and latency
issues, etc.  Third, it can be used for recovery and failover/fencing
testing -- what happens when these packets get delivered two minutes
later?  None of this is possible with something that just attempts to
wedge the mount and acts as a point of no return.

Thanks,

                Ilya
Jeff Layton Feb. 18, 2020, 3:25 p.m. UTC | #7
On Tue, 2020-02-18 at 15:59 +0100, Ilya Dryomov wrote:
> On Tue, Feb 18, 2020 at 1:01 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
> > > On 2020/2/17 21:04, Jeff Layton wrote:
> > > > On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
> > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > 
> > > > > This will simulate pulling the power cable situation, which will
> > > > > do:
> > > > > 
> > > > > - abort all the inflight osd/mds requests and fail them with -EIO.
> > > > > - reject any new coming osd/mds requests with -EIO.
> > > > > - close all the mds connections directly without doing any clean up
> > > > >    and disable mds sessions recovery routine.
> > > > > - close all the osd connections directly without doing any clean up.
> > > > > - set the msgr as stopped.
> > > > > 
> > > > > URL: https://tracker.ceph.com/issues/44044
> > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > There is no explanation of how to actually _use_ this feature? I assume
> > > > you have to remount the fs with "-o remount,halt" ? Is it possible to
> > > > reenable the mount as well?  If not, why keep the mount around? Maybe we
> > > > should consider wiring this in to a new umount2() flag instead?
> > > > 
> > > > This needs much better documentation.
> > > > 
> > > > In the past, I've generally done this using iptables. Granted that that
> > > > is difficult with a clustered fs like ceph (given that you potentially
> > > > have to set rules for a lot of addresses), but I wonder whether a scheme
> > > > like that might be more viable in the long run.
> > > > 
> > > How about fulfilling the DROP iptable rules in libceph ? Could you
> > > foresee any problem ? This seems the one approach could simulate pulling
> > > the power cable.
> > > 
> > 
> > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > But, I think I was probably just guilty of speculating out loud here.
> 
> I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> in libceph, but I will say that any kind of iptables manipulation from
> within libceph is probably out of the question.
> 
> > I think doing this by just closing down the sockets is probably fine. I
> > wouldn't pursue anything relating to to iptables here, unless we have
> > some larger reason to go that route.
> 
> IMO investing into a set of iptables and tc helpers for teuthology
> makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> but it's probably the next best thing.  First, it will be external to
> the system under test.  Second, it can be made selective -- you can
> cut a single session or all of them, simulate packet loss and latency
> issues, etc.  Third, it can be used for recovery and failover/fencing
> testing -- what happens when these packets get delivered two minutes
> later?  None of this is possible with something that just attempts to
> wedge the mount and acts as a point of no return.
> 

That's a great point and does sound tremendously more useful than just
"halting" a mount like this.

That said, one of the stated goals in the tracker bug is:

"It'd be better if we had a way to shutdown the cephfs mount without any
kind of cleanup. This would allow us to have kernel clients all on the
same node and selectively "kill" them."

That latter point sounds rather hard to fulfill with iptables rules.
Ilya Dryomov Feb. 18, 2020, 4:14 p.m. UTC | #8
On Tue, Feb 18, 2020 at 4:25 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Tue, 2020-02-18 at 15:59 +0100, Ilya Dryomov wrote:
> > On Tue, Feb 18, 2020 at 1:01 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
> > > > On 2020/2/17 21:04, Jeff Layton wrote:
> > > > > On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
> > > > > > From: Xiubo Li <xiubli@redhat.com>
> > > > > >
> > > > > > This will simulate pulling the power cable situation, which will
> > > > > > do:
> > > > > >
> > > > > > - abort all the inflight osd/mds requests and fail them with -EIO.
> > > > > > - reject any new coming osd/mds requests with -EIO.
> > > > > > - close all the mds connections directly without doing any clean up
> > > > > >    and disable mds sessions recovery routine.
> > > > > > - close all the osd connections directly without doing any clean up.
> > > > > > - set the msgr as stopped.
> > > > > >
> > > > > > URL: https://tracker.ceph.com/issues/44044
> > > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > > > There is no explanation of how to actually _use_ this feature? I assume
> > > > > you have to remount the fs with "-o remount,halt" ? Is it possible to
> > > > > reenable the mount as well?  If not, why keep the mount around? Maybe we
> > > > > should consider wiring this in to a new umount2() flag instead?
> > > > >
> > > > > This needs much better documentation.
> > > > >
> > > > > In the past, I've generally done this using iptables. Granted that that
> > > > > is difficult with a clustered fs like ceph (given that you potentially
> > > > > have to set rules for a lot of addresses), but I wonder whether a scheme
> > > > > like that might be more viable in the long run.
> > > > >
> > > > How about fulfilling the DROP iptable rules in libceph ? Could you
> > > > foresee any problem ? This seems the one approach could simulate pulling
> > > > the power cable.
> > > >
> > >
> > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > But, I think I was probably just guilty of speculating out loud here.
> >
> > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > in libceph, but I will say that any kind of iptables manipulation from
> > within libceph is probably out of the question.
> >
> > > I think doing this by just closing down the sockets is probably fine. I
> > > wouldn't pursue anything relating to to iptables here, unless we have
> > > some larger reason to go that route.
> >
> > IMO investing into a set of iptables and tc helpers for teuthology
> > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > but it's probably the next best thing.  First, it will be external to
> > the system under test.  Second, it can be made selective -- you can
> > cut a single session or all of them, simulate packet loss and latency
> > issues, etc.  Third, it can be used for recovery and failover/fencing
> > testing -- what happens when these packets get delivered two minutes
> > later?  None of this is possible with something that just attempts to
> > wedge the mount and acts as a point of no return.
> >
>
> That's a great point and does sound tremendously more useful than just
> "halting" a mount like this.
>
> That said, one of the stated goals in the tracker bug is:
>
> "It'd be better if we had a way to shutdown the cephfs mount without any
> kind of cleanup. This would allow us to have kernel clients all on the
> same node and selectively "kill" them."
>
> That latter point sounds rather hard to fulfill with iptables rules.

I think it should be doable, either with IP aliases (harder on the
iptables side since it doesn't recognize them as interfaces for -i/-o),
or with one of the virtual interfaces (easier on the iptables side
since they show up as actual interfaces).

Thanks,

                Ilya
Xiubo Li Feb. 19, 2020, 3:25 a.m. UTC | #9
On 2020/2/18 22:59, Ilya Dryomov wrote:
> On Tue, Feb 18, 2020 at 1:01 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Tue, 2020-02-18 at 15:19 +0800, Xiubo Li wrote:
>>> On 2020/2/17 21:04, Jeff Layton wrote:
>>>> On Sun, 2020-02-16 at 01:49 -0500, xiubli@redhat.com wrote:
>>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>>
>>>>> This will simulate pulling the power cable situation, which will
>>>>> do:
>>>>>
>>>>> - abort all the inflight osd/mds requests and fail them with -EIO.
>>>>> - reject any new coming osd/mds requests with -EIO.
>>>>> - close all the mds connections directly without doing any clean up
>>>>>     and disable mds sessions recovery routine.
>>>>> - close all the osd connections directly without doing any clean up.
>>>>> - set the msgr as stopped.
>>>>>
>>>>> URL: https://tracker.ceph.com/issues/44044
>>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> There is no explanation of how to actually _use_ this feature? I assume
>>>> you have to remount the fs with "-o remount,halt" ? Is it possible to
>>>> reenable the mount as well?  If not, why keep the mount around? Maybe we
>>>> should consider wiring this in to a new umount2() flag instead?
>>>>
>>>> This needs much better documentation.
>>>>
>>>> In the past, I've generally done this using iptables. Granted that that
>>>> is difficult with a clustered fs like ceph (given that you potentially
>>>> have to set rules for a lot of addresses), but I wonder whether a scheme
>>>> like that might be more viable in the long run.
>>>>
>>> How about fulfilling the DROP iptable rules in libceph ? Could you
>>> foresee any problem ? This seems the one approach could simulate pulling
>>> the power cable.
>>>
>> Yeah, I've mostly done this using DROP rules when I needed to test things.
>> But, I think I was probably just guilty of speculating out loud here.
> I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> in libceph, but I will say that any kind of iptables manipulation from
> within libceph is probably out of the question.

Sorry for confusing here.

I meant adding some helpers in libceph to enable/disable dropping any 
newly arriving packets on the floor — without sending any response back to 
the ceph cluster — for a specified session.


>
>> I think doing this by just closing down the sockets is probably fine. I
>> wouldn't pursue anything relating to iptables here, unless we have
>> some larger reason to go that route.
> IMO investing into a set of iptables and tc helpers for teuthology
> makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> but it's probably the next best thing.  First, it will be external to
> the system under test.  Second, it can be made selective -- you can
> cut a single session or all of them, simulate packet loss and latency
> issues, etc.  Third, it can be used for recovery and failover/fencing
> testing -- what happens when these packets get delivered two minutes
> later?  None of this is possible with something that just attempts to
> wedge the mount and acts as a point of no return.

Yeah, cool, and this is what tracker #44044 intends to achieve.

Thanks
BRs
Xiubo
> Thanks,
>
>                  Ilya
>
Patrick Donnelly Feb. 19, 2020, 7:22 p.m. UTC | #10
On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > But, I think I was probably just guilty of speculating out loud here.
>
> I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> in libceph, but I will say that any kind of iptables manipulation from
> within libceph is probably out of the question.

I think we're getting confused about two thoughts on iptables: (1) to
use iptables to effectively partition the mount instead of this new
halt option; (2) use iptables in concert with halt to prevent FIN
packets from being sent when the sockets are closed. I think we all
agree (2) is not going to happen.

> > I think doing this by just closing down the sockets is probably fine. I
> > wouldn't pursue anything relating to iptables here, unless we have
> > some larger reason to go that route.
>
> IMO investing into a set of iptables and tc helpers for teuthology
> makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> but it's probably the next best thing.  First, it will be external to
> the system under test.  Second, it can be made selective -- you can
> cut a single session or all of them, simulate packet loss and latency
> issues, etc.  Third, it can be used for recovery and failover/fencing
> testing -- what happens when these packets get delivered two minutes
> later?  None of this is possible with something that just attempts to
> wedge the mount and acts as a point of no return.

This sounds attractive but it does require each mount to have its own
IP address? Or are there options? Maybe the kernel driver could mark
the connection with a mount ID we could do filtering on it? From a
quick Google, maybe [1] could be used for this purpose. I wonder
however if the kernel driver would have to do that marking of the
connection... and then we have iptables dependencies in the driver
again which we don't want to do.

From my perspective, this halt patch looks pretty simple and doesn't
appear to be a huge maintenance burden. Is it really so objectionable?

[1] https://home.regit.org/netfilter-en/netfilter-connmark/
Ilya Dryomov Feb. 19, 2020, 8:42 p.m. UTC | #11
On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
>
> On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > But, I think I was probably just guilty of speculating out loud here.
> >
> > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > in libceph, but I will say that any kind of iptables manipulation from
> > within libceph is probably out of the question.
>
> I think we're getting confused about two thoughts on iptables: (1) to
> use iptables to effectively partition the mount instead of this new
> halt option; (2) use iptables in concert with halt to prevent FIN
> packets from being sent when the sockets are closed. I think we all
> agree (2) is not going to happen.

Right.

>
> > > I think doing this by just closing down the sockets is probably fine. I
> > > wouldn't pursue anything relating to iptables here, unless we have
> > > some larger reason to go that route.
> >
> > IMO investing into a set of iptables and tc helpers for teuthology
> > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > but it's probably the next best thing.  First, it will be external to
> > the system under test.  Second, it can be made selective -- you can
> > cut a single session or all of them, simulate packet loss and latency
> > issues, etc.  Third, it can be used for recovery and failover/fencing
> > testing -- what happens when these packets get delivered two minutes
> > later?  None of this is possible with something that just attempts to
> > wedge the mount and acts as a point of no return.
>
> This sounds attractive but it does require each mount to have its own
> IP address? Or are there options? Maybe the kernel driver could mark
> the connection with a mount ID we could do filtering on it? From a
> quick Google, maybe [1] could be used for this purpose. I wonder
> however if the kernel driver would have to do that marking of the
> connection... and then we have iptables dependencies in the driver
> again which we don't want to do.

As I said yesterday, I think it should be doable with no kernel
changes -- either with IP aliases or with the help of some virtual
interface.  Exactly how, I'm not sure because I use VMs for my tests
and haven't had to touch iptables in a while, but I would be surprised
to learn otherwise given the myriad of options out there.

>
> From my perspective, this halt patch looks pretty simple and doesn't
> appear to be a huge maintenance burden. Is it really so objectionable?

Well, this patch is simple only because it isn't even remotely
equivalent to a cable pull.  I mean, it aborts in-flight requests
with EIO, closes sockets, etc.  Has it been tested against the test
cases that currently cold reset the node through the BMC?

If it has been tested and the current semantics are sufficient,
are you sure they will remain so in the future?  What happens when
a new test gets added that needs a harder shutdown?  We won't be
able to reuse existing "umount -f" infrastructure anymore...  What
if a new test needs to _actually_ kill the client?

And then a debugging knob that permanently wedges the client sure
can't be a mount option for all the obvious reasons.  This bit is easy
to fix, but the fact that it is submitted as a mount option makes me
suspect that the whole thing hasn't been thought through very well.

Thanks,

                Ilya
Jeff Layton Feb. 19, 2020, 9:21 p.m. UTC | #12
On Wed, 2020-02-19 at 21:42 +0100, Ilya Dryomov wrote:
> On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > > But, I think I was probably just guilty of speculating out loud here.
> > > 
> > > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > > in libceph, but I will say that any kind of iptables manipulation from
> > > within libceph is probably out of the question.
> > 
> > I think we're getting confused about two thoughts on iptables: (1) to
> > use iptables to effectively partition the mount instead of this new
> > halt option; (2) use iptables in concert with halt to prevent FIN
> > packets from being sent when the sockets are closed. I think we all
> > agree (2) is not going to happen.
> 
> Right.
> 
> > > > I think doing this by just closing down the sockets is probably fine. I
> > > > wouldn't pursue anything relating to iptables here, unless we have
> > > > some larger reason to go that route.
> > > 
> > > IMO investing into a set of iptables and tc helpers for teuthology
> > > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > > but it's probably the next best thing.  First, it will be external to
> > > the system under test.  Second, it can be made selective -- you can
> > > cut a single session or all of them, simulate packet loss and latency
> > > issues, etc.  Third, it can be used for recovery and failover/fencing
> > > testing -- what happens when these packets get delivered two minutes
> > > later?  None of this is possible with something that just attempts to
> > > wedge the mount and acts as a point of no return.
> > 
> > This sounds attractive but it does require each mount to have its own
> > IP address? Or are there options? Maybe the kernel driver could mark
> > the connection with a mount ID we could do filtering on it? From a
> > quick Google, maybe [1] could be used for this purpose. I wonder
> > however if the kernel driver would have to do that marking of the
> > connection... and then we have iptables dependencies in the driver
> > again which we don't want to do.
> 
> As I said yesterday, I think it should be doable with no kernel
> changes -- either with IP aliases or with the help of some virtual
> interface.  Exactly how, I'm not sure because I use VMs for my tests
> and haven't had to touch iptables in a while, but I would be surprised
> to learn otherwise given the myriad of options out there.
> 

...and really, doing this sort of testing with the kernel client outside
of a vm is sort of a mess anyway, IMO.

That said, I think we might need a way to match up a superblock with the
sockets associated with it -- so mon, osd and mds socket info,
basically. That could be a very simple thing in debugfs though, in the
existing directory hierarchy there. With that info, you could reasonably
do something with iptables like we're suggesting.

> > From my perspective, this halt patch looks pretty simple and doesn't
> > appear to be a huge maintenance burden. Is it really so objectionable?
> 
> Well, this patch is simple only because it isn't even remotely
> equivalent to a cable pull.  I mean, it aborts in-flight requests
> with EIO, closes sockets, etc.  Has it been tested against the test
> cases that currently cold reset the node through the BMC?
> 
> If it has been tested and the current semantics are sufficient,
> are you sure they will remain so in the future?  What happens when
> a new test gets added that needs a harder shutdown?  We won't be
> able to reuse existing "umount -f" infrastructure anymore...  What
> if a new test needs to _actually_ kill the client?
> 
> And then a debugging knob that permanently wedges the client sure
> can't be a mount option for all the obvious reasons.  This bit is easy
> to fix, but the fact that it is submitted as a mount option makes me
> suspect that the whole thing hasn't been thought through very well.

Agreed on all points. This sort of fault injection is really best done
via other means. Otherwise, it's really hard to know whether it'll
behave the way you expect in other situations.

I'll add too that I think experience shows that these sorts of
interfaces end up bitrotted because they're too specialized to use
outside of anything but very specific environments. We need to think
larger than just teuthology's needs here.
Patrick Donnelly Feb. 19, 2020, 10:49 p.m. UTC | #13
Responding to you and Ilya both:

On Wed, Feb 19, 2020 at 1:21 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, 2020-02-19 at 21:42 +0100, Ilya Dryomov wrote:
> > On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > > On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > > > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > > > But, I think I was probably just guilty of speculating out loud here.
> > > >
> > > > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > > > in libceph, but I will say that any kind of iptables manipulation from
> > > > within libceph is probably out of the question.
> > >
> > > I think we're getting confused about two thoughts on iptables: (1) to
> > > use iptables to effectively partition the mount instead of this new
> > > halt option; (2) use iptables in concert with halt to prevent FIN
> > > packets from being sent when the sockets are closed. I think we all
> > > agree (2) is not going to happen.
> >
> > Right.
> >
> > > > > I think doing this by just closing down the sockets is probably fine. I
> > > > > wouldn't pursue anything relating to iptables here, unless we have
> > > > > some larger reason to go that route.
> > > >
> > > > IMO investing into a set of iptables and tc helpers for teuthology
> > > > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > > > but it's probably the next best thing.  First, it will be external to
> > > > the system under test.  Second, it can be made selective -- you can
> > > > cut a single session or all of them, simulate packet loss and latency
> > > > issues, etc.  Third, it can be used for recovery and failover/fencing
> > > > testing -- what happens when these packets get delivered two minutes
> > > > later?  None of this is possible with something that just attempts to
> > > > wedge the mount and acts as a point of no return.
> > >
> > > This sounds attractive but it does require each mount to have its own
> > > IP address? Or are there options? Maybe the kernel driver could mark
> > > the connection with a mount ID we could do filtering on it? From a
> > > quick Google, maybe [1] could be used for this purpose. I wonder
> > > however if the kernel driver would have to do that marking of the
> > > connection... and then we have iptables dependencies in the driver
> > > again which we don't want to do.
> >
> > As I said yesterday, I think it should be doable with no kernel
> > changes -- either with IP aliases or with the help of some virtual
> > interface.  Exactly how, I'm not sure because I use VMs for my tests
> > and haven't had to touch iptables in a while, but I would be surprised
> > to learn otherwise given the myriad of options out there.
> >
>
> ...and really, doing this sort of testing with the kernel client outside
> of a vm is sort of a mess anyway, IMO.

Testing often involves making a mess :) I disagree in principle that
having a mechanism for stopping a netfs mount without pulling the plug
(virtually or otherwise) is unnecessary.

> That said, I think we might need a way to match up a superblock with the
> sockets associated with it -- so mon, osd and mds socket info,
> basically. That could be a very simple thing in debugfs though, in the
> existing directory hierarchy there. With that info, you could reasonably
> do something with iptables like we're suggesting.

That's certainly useful information to expose but I don't see how that
would help with constructing iptable rules. The kernel may reconnect
to any Ceph service at any time, especially during potential network
disruption (like an iptables rule dropping packets). Any rules you
construct for those connections would no longer apply. You cannot
construct rules that broadly apply to e.g. the entire ceph cluster as
a destination because it would interfere with other kernel client
mounts. I believe this is why Ilya is suggesting the use of virtual ip
addresses as a unique source address for each mount.

> > > From my perspective, this halt patch looks pretty simple and doesn't
> > > appear to be a huge maintenance burden. Is it really so objectionable?
> >
> > Well, this patch is simple only because it isn't even remotely
> > equivalent to a cable pull.  I mean, it aborts in-flight requests
> > with EIO, closes sockets, etc.  Has it been tested against the test
> > cases that currently cold reset the node through the BMC?

Of course not, this is the initial work soliciting feedback on the concept.

> > If it has been tested and the current semantics are sufficient,
> > are you sure they will remain so in the future?  What happens when
> > a new test gets added that needs a harder shutdown?  We won't be
> > able to reuse existing "umount -f" infrastructure anymore...  What
> > if a new test needs to _actually_ kill the client?
> >
> > And then a debugging knob that permanently wedges the client sure
> > can't be a mount option for all the obvious reasons.  This bit is easy
> > to fix, but the fact that it is submitted as a mount option makes me
> > suspect that the whole thing hasn't been thought through very well.

Or, Xiubo needs advice on a better way to do it. In the tracker ticket
I suggested a sysfs control file. Would that be appropriate?

> Agreed on all points. This sort of fault injection is really best done
> via other means. Otherwise, it's really hard to know whether it'll
> behave the way you expect in other situations.
>
> I'll add too that I think experience shows that these sorts of
> interfaces end up bitrotted because they're too specialized to use
> outside of anything but very specific environments. We need to think
> larger than just teuthology's needs here.

I doubt they'd become bitrotted with regular use in teuthology.

I get that you both see VMs or virtual interfaces would obviate this
PR. VMs are not an option in teuthology. We can try to spend some time
on seeing if something like a bridged virtual network will work. Will
the kernel driver operate in the network namespace of the container
that mounts the volume?
Jeff Layton Feb. 19, 2020, 11:49 p.m. UTC | #14
On Wed, 2020-02-19 at 14:49 -0800, Patrick Donnelly wrote:
> Responding to you and Ilya both:
> 
> On Wed, Feb 19, 2020 at 1:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Wed, 2020-02-19 at 21:42 +0100, Ilya Dryomov wrote:
> > > On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > > > On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > > > > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > > > > But, I think I was probably just guilty of speculating out loud here.
> > > > > 
> > > > > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > > > > in libceph, but I will say that any kind of iptables manipulation from
> > > > > within libceph is probably out of the question.
> > > > 
> > > > I think we're getting confused about two thoughts on iptables: (1) to
> > > > use iptables to effectively partition the mount instead of this new
> > > > halt option; (2) use iptables in concert with halt to prevent FIN
> > > > packets from being sent when the sockets are closed. I think we all
> > > > agree (2) is not going to happen.
> > > 
> > > Right.
> > > 
> > > > > > I think doing this by just closing down the sockets is probably fine. I
> > > > > > wouldn't pursue anything relating to iptables here, unless we have
> > > > > > some larger reason to go that route.
> > > > > 
> > > > > IMO investing into a set of iptables and tc helpers for teuthology
> > > > > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > > > > but it's probably the next best thing.  First, it will be external to
> > > > > the system under test.  Second, it can be made selective -- you can
> > > > > cut a single session or all of them, simulate packet loss and latency
> > > > > issues, etc.  Third, it can be used for recovery and failover/fencing
> > > > > testing -- what happens when these packets get delivered two minutes
> > > > > later?  None of this is possible with something that just attempts to
> > > > > wedge the mount and acts as a point of no return.
> > > > 
> > > > This sounds attractive but it does require each mount to have its own
> > > > IP address? Or are there options? Maybe the kernel driver could mark
> > > > the connection with a mount ID we could do filtering on it? From a
> > > > quick Google, maybe [1] could be used for this purpose. I wonder
> > > > however if the kernel driver would have to do that marking of the
> > > > connection... and then we have iptables dependencies in the driver
> > > > again which we don't want to do.
> > > 
> > > As I said yesterday, I think it should be doable with no kernel
> > > changes -- either with IP aliases or with the help of some virtual
> > > interface.  Exactly how, I'm not sure because I use VMs for my tests
> > > and haven't had to touch iptables in a while, but I would be surprised
> > > to learn otherwise given the myriad of options out there.
> > > 
> > 
> > ...and really, doing this sort of testing with the kernel client outside
> > of a vm is sort of a mess anyway, IMO.
> 
> Testing often involves making a mess :) I disagree in principle that
> having a mechanism for stopping a netfs mount without pulling the plug
> (virtually or otherwise) is unnecessary.
> 

Ok, here are some more concerns:

I'm not clear on what value this new mount option really adds. Once you
do this, the client is hosed, so this is really only useful for testing
the MDS. If your goal is to test the MDS with dying clients, then why
not use a synthetic userland client to take state and do whatever you
want?

It could be I'm missing some value in using a kclient for this. If you
did want to do this after all, then why are you keeping the mount around
at all? It's useless after the remount, so you might as well just umount
it.

If you really want to make it just shut down the sockets, then you could
add a new flag to umount2/sys_umount (UMOUNT_KILL or something) that
would kill off the mount w/o talking to the MDS. That seems like a much
cleaner interface than doing this.

> > That said, I think we might need a way to match up a superblock with the
> > sockets associated with it -- so mon, osd and mds socket info,
> > basically. That could be a very simple thing in debugfs though, in the
> > existing directory hierarchy there. With that info, you could reasonably
> > do something with iptables like we're suggesting.
> 
> That's certainly useful information to expose but I don't see how that
> would help with constructing iptable rules. The kernel may reconnect
> to any Ceph service at any time, especially during potential network
> disruption (like an iptables rule dropping packets). Any rules you
> construct for those connections would no longer apply. You cannot
> construct rules that broadly apply to e.g. the entire ceph cluster as
> a destination because it would interfere with other kernel client
> mounts. I believe this is why Ilya is suggesting the use of virtual ip
> addresses as a unique source address for each mount.
> 

Sorry, braino -- sunrpc clients keep their source ports in most cases
(for legacy reasons). I don't think libceph msgr does though. You're
right that a debugfs info file won't really help.

You could roll some sort of deep packet inspection to discern this but
that's more difficult. I wonder if you could do it with BPF these days
though...

> > > > From my perspective, this halt patch looks pretty simple and doesn't
> > > > appear to be a huge maintenance burden. Is it really so objectionable?
> > > 
> > > Well, this patch is simple only because it isn't even remotely
> > > equivalent to a cable pull.  I mean, it aborts in-flight requests
> > > with EIO, closes sockets, etc.  Has it been tested against the test
> > > cases that currently cold reset the node through the BMC?
> 
> Of course not, this is the initial work soliciting feedback on the concept.
> 

Yep. Don't get discouraged, I think we can do something to better
accommodate testing, but I don't think this is the correct direction for
it.

> > > If it has been tested and the current semantics are sufficient,
> > > are you sure they will remain so in the future?  What happens when
> > > a new test gets added that needs a harder shutdown?  We won't be
> > > able to reuse existing "umount -f" infrastructure anymore...  What
> > > if a new test needs to _actually_ kill the client?
> > > 
> > > And then a debugging knob that permanently wedges the client sure
> > > can't be a mount option for all the obvious reasons.  This bit is easy
> > > to fix, but the fact that it is submitted as a mount option makes me
> > > suspect that the whole thing hasn't been thought through very well.
> 
> Or, Xiubo needs advice on a better way to do it. In the tracker ticket
> I suggested a sysfs control file. Would that be appropriate?
> 

I'm not a fan of adding fault injection code to the client. I'd prefer
doing this via some other mechanism. If you really do want something
like this in the kernel, then you may want to consider something like
BPF.

> > Agreed on all points. This sort of fault injection is really best done
> > via other means. Otherwise, it's really hard to know whether it'll
> > behave the way you expect in other situations.
> > 
> > I'll add too that I think experience shows that these sorts of
> > interfaces end up bitrotted because they're too specialized to use
> > outside of anything but very specific environments. We need to think
> > larger than just teuthology's needs here.
> 
> I doubt they'd become bitrotted with regular use in teuthology.
> 

Well, certainly some uses of them might not, but interfaces like this
need to be generically useful across a range of environments. I'm not
terribly interested in plumbing something in that is _only_ used for
teuthology, even as important as that use-case is.

> I get that you both see VMs or virtual interfaces would obviate this
> PR. VMs are not an option in teuthology. We can try to spend some time
> on seeing if something like a bridged virtual network will work. Will
> the kernel driver operate in the network namespace of the container
> that mounts the volume?
> 

That, I'm not sure about. I'm not sure if the sockets end up inheriting
the net namespace of the mounting process. It'd be good to investigate
this. You may be able to just get crafty with the unshare command to
test it out.

In any case, I'd rather see work on adding support for namespace
awareness to enable stuff like this than adding fault injection knobs. I
think that's something that could be generically useful.
Ilya Dryomov Feb. 20, 2020, 3:43 a.m. UTC | #15
On Thu, Feb 20, 2020 at 12:49 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, 2020-02-19 at 14:49 -0800, Patrick Donnelly wrote:
> > Responding to you and Ilya both:
> >
> > On Wed, Feb 19, 2020 at 1:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Wed, 2020-02-19 at 21:42 +0100, Ilya Dryomov wrote:
> > > > On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
> > > > > On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
> > > > > > > Yeah, I've mostly done this using DROP rules when I needed to test things.
> > > > > > > But, I think I was probably just guilty of speculating out loud here.
> > > > > >
> > > > > > I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
> > > > > > in libceph, but I will say that any kind of iptables manipulation from
> > > > > > within libceph is probably out of the question.
> > > > >
> > > > > I think we're getting confused about two thoughts on iptables: (1) to
> > > > > use iptables to effectively partition the mount instead of this new
> > > > > halt option; (2) use iptables in concert with halt to prevent FIN
> > > > > packets from being sent when the sockets are closed. I think we all
> > > > > agree (2) is not going to happen.
> > > >
> > > > Right.
> > > >
> > > > > > > I think doing this by just closing down the sockets is probably fine. I
> > > > > > > wouldn't pursue anything relating to iptables here, unless we have
> > > > > > > some larger reason to go that route.
> > > > > >
> > > > > > IMO investing into a set of iptables and tc helpers for teuthology
> > > > > > makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
> > > > > > but it's probably the next best thing.  First, it will be external to
> > > > > > the system under test.  Second, it can be made selective -- you can
> > > > > > cut a single session or all of them, simulate packet loss and latency
> > > > > > issues, etc.  Third, it can be used for recovery and failover/fencing
> > > > > > testing -- what happens when these packets get delivered two minutes
> > > > > > later?  None of this is possible with something that just attempts to
> > > > > > wedge the mount and acts as a point of no return.
> > > > >
> > > > > This sounds attractive but it does require each mount to have its own
> > > > > IP address? Or are there options? Maybe the kernel driver could mark
> > > > > the connection with a mount ID we could do filtering on it? From a
> > > > > quick Google, maybe [1] could be used for this purpose. I wonder
> > > > > however if the kernel driver would have to do that marking of the
> > > > > connection... and then we have iptables dependencies in the driver
> > > > > again which we don't want to do.
> > > >
> > > > As I said yesterday, I think it should be doable with no kernel
> > > > changes -- either with IP aliases or with the help of some virtual
> > > > interface.  Exactly how, I'm not sure because I use VMs for my tests
> > > > and haven't had to touch iptables in a while, but I would be surprised
> > > > to learn otherwise given the myriad of options out there.
> > > >
> > >
> > > ...and really, doing this sort of testing with the kernel client outside
> > > of a vm is sort of a mess anyway, IMO.
> >
> > Testing often involves making a mess :) I disagree in principle that
> > having a mechanism for stopping a netfs mount without pulling the plug
> > (virtually or otherwise) is unnecessary.
> >
>
> Ok, here are some more concerns:
>
> I'm not clear on what value this new mount option really adds. Once you
> do this, the client is hosed, so this is really only useful for testing
> the MDS. If your goal is to test the MDS with dying clients, then why
> not use a synthetic userland client to take state and do whatever you
> want?
>
> It could be I'm missing some value in using a kclient for this. If you
> did want to do this after all, then why are you keeping the mount around
> at all? It's useless after the remount, so you might as well just umount
> it.
>
> If you really want to make it just shut down the sockets, then you could
> add a new flag to umount2/sys_umount (UMOUNT_KILL or something) that
> would kill off the mount w/o talking to the MDS. That seems like a much
> cleaner interface than doing this.
>
> > > That said, I think we might need a way to match up a superblock with the
> > > sockets associated with it -- so mon, osd and mds socket info,
> > > basically. That could be a very simple thing in debugfs though, in the
> > > existing directory hierarchy there. With that info, you could reasonably
> > > do something with iptables like we're suggesting.
> >
> > That's certainly useful information to expose but I don't see how that
> > would help with constructing iptable rules. The kernel may reconnect
> > to any Ceph service at any time, especially during potential network
> > disruption (like an iptables rule dropping packets). Any rules you
> > construct for those connections would no longer apply. You cannot
> > construct rules that broadly apply to e.g. the entire ceph cluster as
> > a destination because it would interfere with other kernel client
> > mounts. I believe this is why Ilya is suggesting the use of virtual ip
> > addresses as a unique source address for each mount.
> >
>
> Sorry, braino -- sunrpc clients keep their source ports in most cases
> (for legacy reasons). I don't think libceph msgr does though. You're
> right that a debugfs info file won't really help.
>
> You could roll some sort of deep packet inspection to discern this but
> that's more difficult. I wonder if you could do it with BPF these days
> though...
>
> > > > > From my perspective, this halt patch looks pretty simple and doesn't
> > > > > appear to be a huge maintenance burden. Is it really so objectionable?
> > > >
> > > > Well, this patch is simple only because it isn't even remotely
> > > > equivalent to a cable pull.  I mean, it aborts in-flight requests
> > > > with EIO, closes sockets, etc.  Has it been tested against the test
> > > > cases that currently cold reset the node through the BMC?
> >
> > Of course not, this is the initial work soliciting feedback on the concept.
> >
>
> Yep. Don't get discouraged, I think we can do something to better
> accommodate testing, but I don't think this is the correct direction for
> it.
>
> > > > If it has been tested and the current semantics are sufficient,
> > > > are you sure they will remain so in the future?  What happens when
> > > > a new test gets added that needs a harder shutdown?  We won't be
> > > > able to reuse existing "umount -f" infrastructure anymore...  What
> > > > if a new test needs to _actually_ kill the client?
> > > >
> > > > And then a debugging knob that permanently wedges the client sure
> > > > can't be a mount option for all the obvious reasons.  This bit is easy
> > > > to fix, but the fact that it is submitted as a mount option makes me
> > > > suspect that the whole thing hasn't been thought through very well.
> >
> > Or, Xiubo needs advice on a better way to do it. In the tracker ticket
> > I suggested a sysfs control file. Would that be appropriate?
> >
>
> I'm not a fan of adding fault injection code to the client. I'd prefer
> doing this via some other mechanism. If you really do want something
> like this in the kernel, then you may want to consider something like
> BPF.
>
> > > Agreed on all points. This sort of fault injection is really best done
> > > via other means. Otherwise, it's really hard to know whether it'll
> > > behave the way you expect in other situations.
> > >
> > > I'll add too that I think experience shows that these sorts of
> > > interfaces end up bitrotted because they're too specialized to use
> > > outside of anything but very specific environments. We need to think
> > > larger than just teuthology's needs here.
> >
> > I doubt they'd become bitrotted with regular use in teuthology.
> >
>
> Well, certainly some uses of them might not, but interfaces like this
> need to be generically useful across a range of environments. I'm not
> terribly interested in plumbing something in that is _only_ used for
> teuthology, even as important as that use-case is.
>
> > I get that you both see VMs or virtual interfaces would obviate this
> > PR. VMs are not an option in teuthology. We can try to spend some time
> > on seeing if something like a bridged virtual network will work. Will
> > the kernel driver operate in the network namespace of the container
> > that mounts the volume?
> >
>
> That, I'm not sure about. I'm not sure if the sockets end up inheriting
> the net namespace of the mounting process. It'd be good to investigate
> this. You may be able to just get crafty with the unshare command to
> test it out.

It will -- I added that several years ago when docker started gaining
popularity.

This is why I keep mentioning virtual interfaces.  One thing that
will most likely work without any hiccups is a veth pair with one
interface in the namespace and one in the host plus a simple iptables
masquerading rule to NAT between the veth network and the world.
For cutting all sessions, you won't even need to touch iptables any
further: just down either end of the veth pair.

Doing it from the container would obviously work too, but further
iptables manipulation might be trickier because of more parts involved:
additional interfaces, bridge, iptables rules installed by the
container runtime, etc.

Thanks,

                Ilya
Xiubo Li Feb. 27, 2020, 1:19 p.m. UTC | #16
On 2020/2/20 11:43, Ilya Dryomov wrote:
> On Thu, Feb 20, 2020 at 12:49 AM Jeff Layton <jlayton@kernel.org> wrote:
>> On Wed, 2020-02-19 at 14:49 -0800, Patrick Donnelly wrote:
>>> Responding to you and Ilya both:
>>>
>>> On Wed, Feb 19, 2020 at 1:21 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>> On Wed, 2020-02-19 at 21:42 +0100, Ilya Dryomov wrote:
>>>>> On Wed, Feb 19, 2020 at 8:22 PM Patrick Donnelly <pdonnell@redhat.com> wrote:
>>>>>> On Tue, Feb 18, 2020 at 6:59 AM Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>> Yeah, I've mostly done this using DROP rules when I needed to test things.
>>>>>>>> But, I think I was probably just guilty of speculating out loud here.
>>>>>>> I'm not sure what exactly Xiubo meant by "fulfilling" iptables rules
>>>>>>> in libceph, but I will say that any kind of iptables manipulation from
>>>>>>> within libceph is probably out of the question.
>>>>>> I think we're getting confused about two thoughts on iptables: (1) to
>>>>>> use iptables to effectively partition the mount instead of this new
>>>>>> halt option; (2) use iptables in concert with halt to prevent FIN
>>>>>> packets from being sent when the sockets are closed. I think we all
>>>>>> agree (2) is not going to happen.
>>>>> Right.
>>>>>
>>>>>>>> I think doing this by just closing down the sockets is probably fine. I
>>>>>>>> wouldn't pursue anything relating to iptables here, unless we have
>>>>>>>> some larger reason to go that route.
>>>>>>> IMO investing into a set of iptables and tc helpers for teuthology
>>>>>>> makes a _lot_ of sense.  It isn't exactly the same as a cable pull,
>>>>>>> but it's probably the next best thing.  First, it will be external to
>>>>>>> the system under test.  Second, it can be made selective -- you can
>>>>>>> cut a single session or all of them, simulate packet loss and latency
>>>>>>> issues, etc.  Third, it can be used for recovery and failover/fencing
>>>>>>> testing -- what happens when these packets get delivered two minutes
>>>>>>> later?  None of this is possible with something that just attempts to
>>>>>>> wedge the mount and acts as a point of no return.
>>>>>> This sounds attractive but it does require each mount to have its own
>>>>>> IP address? Or are there options? Maybe the kernel driver could mark
>>>>>> the connection with a mount ID we could do filtering on it? From a
>>>>>> quick Google, maybe [1] could be used for this purpose. I wonder
>>>>>> however if the kernel driver would have to do that marking of the
>>>>>> connection... and then we have iptables dependencies in the driver
>>>>>> again which we don't want to do.
>>>>> As I said yesterday, I think it should be doable with no kernel
>>>>> changes -- either with IP aliases or with the help of some virtual
>>>>> interface.  Exactly how, I'm not sure because I use VMs for my tests
>>>>> and haven't had to touch iptables in a while, but I would be surprised
>>>>> to learn otherwise given the myriad of options out there.
>>>>>
>>>> ...and really, doing this sort of testing with the kernel client outside
>>>> of a vm is sort of a mess anyway, IMO.
>>> Testing often involves making a mess :) I disagree in principle that
>>> having a mechanism for stopping a netfs mount without pulling the plug
>>> (virtually or otherwise) is unnecessary.
>>>
>> Ok, here are some more concerns:
>>
>> I'm not clear on what value this new mount option really adds. Once you
>> do this, the client is hosed, so this is really only useful for testing
>> the MDS. If your goal is to test the MDS with dying clients, then why
>> not use a synthetic userland client to take state and do whatever you
>> want?
>>
>> It could be I'm missing some value in using a kclient for this. If you
>> did want to do this after all, then why are you keeping the mount around
>> at all? It's useless after the remount, so you might as well just umount
>> it.
>>
>> If you really want to make it just shut down the sockets, then you could
>> add a new flag to umount2/sys_umount (UMOUNT_KILL or something) that
>> would kill off the mount w/o talking to the MDS. That seems like a much
>> cleaner interface than doing this.
>>
>>>> That said, I think we might need a way to match up a superblock with the
>>>> sockets associated with it -- so mon, osd and mds socket info,
>>>> basically. That could be a very simple thing in debugfs though, in the
>>>> existing directory hierarchy there. With that info, you could reasonably
>>>> do something with iptables like we're suggesting.
>>> That's certainly useful information to expose but I don't see how that
>>> would help with constructing iptable rules. The kernel may reconnect
>>> to any Ceph service at any time, especially during potential network
>>> disruption (like an iptables rule dropping packets). Any rules you
>>> construct for those connections would no longer apply. You cannot
>>> construct rules that broadly apply to e.g. the entire ceph cluster as
>>> a destination because it would interfere with other kernel client
>>> mounts. I believe this is why Ilya is suggesting the use of virtual ip
>>> addresses as a unique source address for each mount.
>>>
>> Sorry, braino -- sunrpc clients keep their source ports in most cases
>> (for legacy reasons). I don't think libceph msgr does though. You're
>> right that a debugfs info file won't really help.
>>
>> You could roll some sort of deep packet inspection to discern this but
>> that's more difficult. I wonder if you could do it with BPF these days
>> though...
>>
>>>>>>  From my perspective, this halt patch looks pretty simple and doesn't
>>>>>> appear to be a huge maintenance burden. Is it really so objectionable?
>>>>> Well, this patch is simple only because it isn't even remotely
>>>>> equivalent to a cable pull.  I mean, it aborts in-flight requests
>>>>> with EIO, closes sockets, etc.  Has it been tested against the test
>>>>> cases that currently cold reset the node through the BMC?
>>> Of course not, this is the initial work soliciting feedback on the concept.
>>>
>> Yep. Don't get discouraged, I think we can do something to better
>> accommodate testing, but I don't think this is the correct direction for
>> it.
>>
>>>>> If it has been tested and the current semantics are sufficient,
>>>>> are you sure they will remain so in the future?  What happens when
>>>>> a new test gets added that needs a harder shutdown?  We won't be
>>>>> able to reuse existing "umount -f" infrastructure anymore...  What
>>>>> if a new test needs to _actually_ kill the client?
>>>>>
>>>>> And then a debugging knob that permanently wedges the client sure
>>>>> can't be a mount option for all the obvious reasons.  This bit is easy
>>>>> to fix, but the fact that it is submitted as a mount option makes me
>>>>> suspect that the whole thing hasn't been thought through very well.
>>> Or, Xiubo needs advice on a better way to do it. In the tracker ticket
>>> I suggested a sysfs control file. Would that be appropriate?
>>>
>> I'm not a fan of adding fault injection code to the client. I'd prefer
>> doing this via some other mechanism. If you really do want something
>> like this in the kernel, then you may want to consider something like
>> BPF.
>>
>>>> Agreed on all points. This sort of fault injection is really best done
>>>> via other means. Otherwise, it's really hard to know whether it'll
>>>> behave the way you expect in other situations.
>>>>
>>>> I'll add too that I think experience shows that these sorts of
>>>> interfaces end up bitrotted because they're too specialized to use
>>>> outside of anything but very specific environments. We need to think
>>>> larger than just teuthology's needs here.
>>> I doubt they'd become bitrotted with regular use in teuthology.
>>>
>> Well, certainly some uses of them might not, but interfaces like this
>> need to be generically useful across a range of environments. I'm not
>> terribly interested in plumbing something in that is _only_ used for
>> teuthology, even as important as that use-case is.
>>
>>> I get that you both see VMs or virtual interfaces would obviate this
>>> PR. VMs are not an option in teuthology. We can try to spend some time
>>> on seeing if something like a bridged virtual network will work. Will
>>> the kernel driver operate in the network namespace of the container
>>> that mounts the volume?
>>>
>> That, I'm not sure about. I'm not sure if the sockets end up inheriting
>> the net namespace of the mounting process. It'd be good to investigate
>> this. You may be able to just get crafty with the unshare command to
>> test it out.
> It will -- I added that several years ago when docker started gaining
> popularity.
>
> This is why I keep mentioning virtual interfaces.  One thing that
> will most likely work without any hiccups is a veth pair with one
> interface in the namespace and one in the host plus a simple iptables
> masquerading rule to NAT between the veth network and the world.
> For cutting all sessions, you won't even need to touch iptables any
> further: just down either end of the veth pair.
>
> Doing it from the container would obviously work too, but further
> iptables manipulation might be trickier because of more parts involved:
> additional interfaces, bridge, iptables rules installed by the
> container runtime, etc.

Hi Ilya, Jeff, Patrick

Thanks for your advice and great idea of this.

I started with ceph-fuse, and the patch is ready; please see 
https://github.com/ceph/ceph/pull/33576.

Thanks
BRs
Xiubo


> Thanks,
>
>                  Ilya
>

Patch
diff mbox series

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b0f34251ad28..b6aa357f7c61 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4110,6 +4110,9 @@  static void maybe_recover_session(struct ceph_mds_client *mdsc)
 {
 	struct ceph_fs_client *fsc = mdsc->fsc;
 
+	if (ceph_test_mount_opt(fsc, HALT))
+		return;
+
 	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
 		return;
 
@@ -4735,7 +4738,7 @@  void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	dout("stopped\n");
 }
 
-void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc, bool halt)
 {
 	struct ceph_mds_session *session;
 	int mds;
@@ -4748,7 +4751,12 @@  void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
 		if (!session)
 			continue;
 
-		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+		/*
+		 * when halting the superblock, it will simulate pulling
+		 * the power cable, so here close the connection before
+		 * doing any cleanup.
+		 */
+		if (halt || (session->s_state == CEPH_MDS_SESSION_REJECTED))
 			__unregister_session(mdsc, session);
 		__wake_requests(mdsc, &session->s_waiting);
 		mutex_unlock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c13910da07c4..b66eea830ae1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -478,7 +478,8 @@  extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc,
+				   bool halt);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8b52bea13273..2a6fd5d2fffa 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -155,6 +155,7 @@  enum {
 	Opt_acl,
 	Opt_quotadf,
 	Opt_copyfrom,
+	Opt_halt,
 };
 
 enum ceph_recover_session_mode {
@@ -194,6 +195,7 @@  static const struct fs_parameter_spec ceph_mount_param_specs[] = {
 	fsparam_string	("snapdirname",			Opt_snapdirname),
 	fsparam_string	("source",			Opt_source),
 	fsparam_u32	("wsize",			Opt_wsize),
+	fsparam_flag	("halt",			Opt_halt),
 	{}
 };
 
@@ -435,6 +437,9 @@  static int ceph_parse_mount_param(struct fs_context *fc,
 			fc->sb_flags &= ~SB_POSIXACL;
 		}
 		break;
+	case Opt_halt:
+		fsopt->flags |= CEPH_MOUNT_OPT_HALT;
+		break;
 	default:
 		BUG();
 	}
@@ -601,6 +606,8 @@  static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (m->count == pos)
 		m->count--;
 
+	if (fsopt->flags & CEPH_MOUNT_OPT_HALT)
+		seq_puts(m, ",halt");
 	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
 		seq_puts(m, ",dirstat");
 	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
@@ -877,22 +884,28 @@  static void destroy_caches(void)
 }
 
 /*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
+ * ceph_umount_begin - initiate forced umount.  Tear down the mount,
+ * skipping steps that may hang while waiting for server(s).
  */
-static void ceph_umount_begin(struct super_block *sb)
+static void __ceph_umount_begin(struct super_block *sb, bool halt)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 
-	dout("ceph_umount_begin - starting forced umount\n");
 	if (!fsc)
 		return;
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
-	ceph_mdsc_force_umount(fsc->mdsc);
+	ceph_mdsc_force_umount(fsc->mdsc, halt);
 	fsc->filp_gen++; // invalidate open files
 }
 
+static void ceph_umount_begin(struct super_block *sb)
+{
+	dout("%s - starting forced umount\n", __func__);
+
+	__ceph_umount_begin(sb, false);
+}
+
 static const struct super_operations ceph_super_ops = {
 	.alloc_inode	= ceph_alloc_inode,
 	.free_inode	= ceph_free_inode,
@@ -1193,6 +1206,16 @@  static int ceph_reconfigure_fc(struct fs_context *fc)
 	struct ceph_parse_opts_ctx *pctx = fc->fs_private;
 	struct ceph_mount_options *new_fsopt = pctx->opts;
 
+	/* halt the mount point, will ignore other options */
+	if (new_fsopt->flags & CEPH_MOUNT_OPT_HALT) {
+		dout("halt the mount point\n");
+		fsopt->flags |= CEPH_MOUNT_OPT_HALT;
+		__ceph_umount_begin(sb, true);
+		ceph_halt_client(fsc->client);
+
+		return 0;
+	}
+
 	sync_filesystem(sb);
 
 	if (strcmp_null(new_fsopt->snapdir_name, fsopt->snapdir_name))
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c40e86ad016..64f16083b216 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -43,6 +43,7 @@ 
 #define CEPH_MOUNT_OPT_MOUNTWAIT       (1<<12) /* mount waits if no mds is up */
 #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
 #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_HALT            (1<<15) /* halt the mount point */
 
 #define CEPH_MOUNT_OPT_DEFAULT			\
 	(CEPH_MOUNT_OPT_DCACHE |		\
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 8fe9b80e80a5..12e9f0cc8501 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -295,6 +295,7 @@  struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
 struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
 u64 ceph_client_gid(struct ceph_client *client);
 extern void ceph_destroy_client(struct ceph_client *client);
+void ceph_halt_client(struct ceph_client *client);
 extern void ceph_reset_client_addr(struct ceph_client *client);
 extern int __ceph_open_session(struct ceph_client *client,
 			       unsigned long started);
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index dbb8a6959a73..7718a2e65d07 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -78,6 +78,7 @@  struct ceph_mon_client {
 	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
 	int pending_auth;
 
+	bool halt;
 	bool hunting;
 	int cur_mon;                       /* last monitor i contacted */
 	unsigned long sub_renew_after;
@@ -109,6 +110,7 @@  extern int ceph_monmap_contains(struct ceph_monmap *m,
 
 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
 extern void ceph_monc_stop(struct ceph_mon_client *monc);
+void ceph_monc_halt(struct ceph_mon_client *monc);
 extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
 
 enum {
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 02ff3a302d26..4b9143f7d989 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -382,6 +382,7 @@  extern void ceph_osdc_cleanup(void);
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
 			  struct ceph_client *client);
 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+extern void ceph_osdc_halt(struct ceph_osd_client *osdc);
 extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a9d6c97b5b0d..c47578ed0546 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -652,6 +652,20 @@  struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
 }
 EXPORT_SYMBOL(ceph_create_client);
 
+void ceph_halt_client(struct ceph_client *client)
+{
+	dout("halt_client %p\n", client);
+
+	atomic_set(&client->msgr.stopping, 1);
+
+	/* unmount */
+	ceph_osdc_halt(&client->osdc);
+	ceph_monc_halt(&client->monc);
+
+	dout("halt_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_halt_client);
+
 void ceph_destroy_client(struct ceph_client *client)
 {
 	dout("destroy_client %p\n", client);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d9e4e4ea600..5819a02af7fe 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -979,14 +979,16 @@  static void delayed_work(struct work_struct *work)
 	mutex_lock(&monc->mutex);
 	if (monc->hunting) {
 		dout("%s continuing hunt\n", __func__);
-		reopen_session(monc);
+		if (!monc->halt)
+			reopen_session(monc);
 	} else {
 		int is_auth = ceph_auth_is_authenticated(monc->auth);
 		if (ceph_con_keepalive_expired(&monc->con,
 					       CEPH_MONC_PING_TIMEOUT)) {
 			dout("monc keepalive timeout\n");
 			is_auth = 0;
-			reopen_session(monc);
+			if (!monc->halt)
+				reopen_session(monc);
 		}
 
 		if (!monc->hunting) {
@@ -1115,6 +1117,16 @@  int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 }
 EXPORT_SYMBOL(ceph_monc_init);
 
+void ceph_monc_halt(struct ceph_mon_client *monc)
+{
+	dout("monc halt\n");
+
+	mutex_lock(&monc->mutex);
+	monc->halt = true;
+	ceph_con_close(&monc->con);
+	mutex_unlock(&monc->mutex);
+}
+
 void ceph_monc_stop(struct ceph_mon_client *monc)
 {
 	dout("stop\n");
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 108c9457d629..161daf35d7f1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5202,6 +5202,17 @@  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	return err;
 }
 
+void ceph_osdc_halt(struct ceph_osd_client *osdc)
+{
+	down_write(&osdc->lock);
+	while (!RB_EMPTY_ROOT(&osdc->osds)) {
+		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+						struct ceph_osd, o_node);
+		close_osd(osd);
+	}
+	up_write(&osdc->lock);
+}
+
 void ceph_osdc_stop(struct ceph_osd_client *osdc)
 {
 	destroy_workqueue(osdc->completion_wq);