Message ID | 20130927170748.GA11716@shrek.lan (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: > This is done primarily for backward compatibility. I hope we do > away with this sooner than later ;) > --- > fs/ocfs2/stack_user.c | 70 ++++++++++++++++++++++++++++++++------------------- > 1 file changed, 44 insertions(+), 26 deletions(-) > > diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c > index 2d4503b..38c69c8 100644 > --- a/fs/ocfs2/stack_user.c > +++ b/fs/ocfs2/stack_user.c > @@ -103,6 +103,11 @@ > #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 > #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 > > +enum ocfs2_connection_type { > + NO_CONTROLD, > + WITH_CONTROLD > +}; > + > /* > * ocfs2_live_connection is refcounted because the filesystem and > * miscdevice sides can detach in different order. Let's just be safe. > @@ -110,6 +115,7 @@ > struct ocfs2_live_connection { > struct list_head oc_list; > struct ocfs2_cluster_connection *oc_conn; > + enum ocfs2_connection_type oc_type; > }; > > struct ocfs2_control_private { > @@ -199,7 +205,8 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) > * fill_super(), we can't get dupes here. > */ > static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, > - struct ocfs2_live_connection **c_ret) > + struct ocfs2_live_connection **c_ret, > + enum ocfs2_connection_type type) > { > int rc = 0; > struct ocfs2_live_connection *c; > @@ -210,8 +217,9 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, > > mutex_lock(&ocfs2_control_lock); > c->oc_conn = conn; > + c->oc_type = type; > > - if (atomic_read(&ocfs2_control_opened)) > + if ((type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) > list_add(&c->oc_list, &ocfs2_live_connection_list); > else { > printk(KERN_ERR > @@ -833,6 +841,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) > dlm_lockspace_t *fsdlm; > struct ocfs2_live_connection *uninitialized_var(control); > int rc = 0, ops_rv; > + enum ocfs2_connection_type type = NO_CONTROLD; > > BUG_ON(conn == NULL); > > @@ -843,38 +852,47 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) > if (rc) > goto out; > > - if (!ops_rv) { > - /* DLM lockspace creation with newer new_lockspace successful */ > - goto out; > - } else if (ops_rv != -EOPNOTSUPP) { > + if (ops_rv == -EOPNOTSUPP) { > + type = WITH_CONTROLD; > + printk(KERN_NOTICE "ocfs2: You seem to be using an older " > + "version of dlm_controld and/or ocfs2-tools." > + " Please consider upgrading.\n"); > + } else if (ops_rv) { > rc = ops_rv; > goto out; > } > - > conn->cc_lockspace = fsdlm; > > - printk(KERN_NOTICE "ocfs2: You seem to be using an older version " > - "of dlm_controld and/or ocfs2-tools. Please consider " > - "upgrading.\n"); > - > - rc = ocfs2_live_connection_new(conn, &control); > + rc = ocfs2_live_connection_new(conn, &control, type); > if (rc) > goto out; > > - /* > - * running_proto must have been set before we allowed any mounts > - * to proceed. > - */ > - if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > - printk(KERN_ERR > - "Unable to mount with fs locking protocol version " > - "%u.%u because the userspace control daemon has " > - "negotiated %u.%u\n", > - conn->cc_version.pv_major, conn->cc_version.pv_minor, > - running_proto.pv_major, running_proto.pv_minor); > - rc = -EPROTO; > - user_cluster_disconnect(conn); > - goto out; > + if (type == WITH_CONTROLD) { > + /* > + * running_proto must have been set before we allowed any mounts > + * to proceed. > + */ > + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { You need to find a way to compare the fs locking protocol in the new style. Otherwise the two ocfs2 versions can't be sure they are using the same locks in the same way. Joel > + printk(KERN_ERR > + "Unable to mount with fs locking protocol" > + " version %u.%u because the userspace control " > + "daemon has negotiated %u.%u\n", > + conn->cc_version.pv_major, > + conn->cc_version.pv_minor, > + running_proto.pv_major, > + running_proto.pv_minor); > + rc = -EPROTO; > + user_cluster_disconnect(conn); > + goto out; > + } > + > + rc = dlm_new_lockspace(conn->cc_name, NULL, > + DLM_LSFL_FS, DLM_LVB_LEN, > + NULL, NULL, NULL, &fsdlm); > + if (rc) { > + ocfs2_live_connection_drop(control); > + goto out; > + } > } > > conn->cc_private = control; > -- > 1.8.1.4 > > > -- > Goldwyn > > _______________________________________________ > Ocfs2-devel mailing list > Ocfs2-devel@oss.oracle.com > https://oss.oracle.com/mailman/listinfo/ocfs2-devel
On 09/27/2013 02:02 PM, Joel Becker wrote: > On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: >> This is done primarily for backward compatibility. I hope we do >> away with this sooner than later ;) >> --- >> fs/ocfs2/stack_user.c | 70 ++++++++++++++++++++++++++++++++------------------- >> 1 file changed, 44 insertions(+), 26 deletions(-) >> >> diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c >> index 2d4503b..38c69c8 100644 >> --- a/fs/ocfs2/stack_user.c >> +++ b/fs/ocfs2/stack_user.c >> @@ -103,6 +103,11 @@ >> #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 >> #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 >> >> +enum ocfs2_connection_type { >> + NO_CONTROLD, >> + WITH_CONTROLD >> +}; >> + >> /* >> * ocfs2_live_connection is refcounted because the filesystem and >> * miscdevice sides can detach in different order. Let's just be safe. >> @@ -110,6 +115,7 @@ >> struct ocfs2_live_connection { >> struct list_head oc_list; >> struct ocfs2_cluster_connection *oc_conn; >> + enum ocfs2_connection_type oc_type; >> }; >> >> struct ocfs2_control_private { >> @@ -199,7 +205,8 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) >> * fill_super(), we can't get dupes here. >> */ >> static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, >> - struct ocfs2_live_connection **c_ret) >> + struct ocfs2_live_connection **c_ret, >> + enum ocfs2_connection_type type) >> { >> int rc = 0; >> struct ocfs2_live_connection *c; >> @@ -210,8 +217,9 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, >> >> mutex_lock(&ocfs2_control_lock); >> c->oc_conn = conn; >> + c->oc_type = type; >> >> - if (atomic_read(&ocfs2_control_opened)) >> + if ((type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) >> list_add(&c->oc_list, &ocfs2_live_connection_list); >> else { >> printk(KERN_ERR >> @@ -833,6 +841,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) >> dlm_lockspace_t *fsdlm; >> struct ocfs2_live_connection *uninitialized_var(control); >> int rc = 0, ops_rv; >> + enum ocfs2_connection_type type = NO_CONTROLD; >> >> BUG_ON(conn == NULL); >> >> @@ -843,38 +852,47 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) >> if (rc) >> goto out; >> >> - if (!ops_rv) { >> - /* DLM lockspace creation with newer new_lockspace successful */ >> - goto out; >> - } else if (ops_rv != -EOPNOTSUPP) { >> + if (ops_rv == -EOPNOTSUPP) { >> + type = WITH_CONTROLD; >> + printk(KERN_NOTICE "ocfs2: You seem to be using an older " >> + "version of dlm_controld and/or ocfs2-tools." >> + " Please consider upgrading.\n"); >> + } else if (ops_rv) { >> rc = ops_rv; >> goto out; >> } >> - >> conn->cc_lockspace = fsdlm; >> >> - printk(KERN_NOTICE "ocfs2: You seem to be using an older version " >> - "of dlm_controld and/or ocfs2-tools. Please consider " >> - "upgrading.\n"); >> - >> - rc = ocfs2_live_connection_new(conn, &control); >> + rc = ocfs2_live_connection_new(conn, &control, type); >> if (rc) >> goto out; >> >> - /* >> - * running_proto must have been set before we allowed any mounts >> - * to proceed. >> - */ >> - if (fs_protocol_compare(&running_proto, &conn->cc_version)) { >> - printk(KERN_ERR >> - "Unable to mount with fs locking protocol version " >> - "%u.%u because the userspace control daemon has " >> - "negotiated %u.%u\n", >> - conn->cc_version.pv_major, conn->cc_version.pv_minor, >> - running_proto.pv_major, running_proto.pv_minor); >> - rc = -EPROTO; >> - user_cluster_disconnect(conn); >> - goto out; >> + if (type == WITH_CONTROLD) { >> + /* >> + * running_proto must have been set before we allowed any mounts >> + * to proceed. >> + */ >> + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > > You need to find a way to compare the fs locking protocol in the new > style. Otherwise the two ocfs2 versions can't be sure they are using > the same locks in the same way. > What locking protocol is it safeguarding? Is it something to do specifically with the OCFS2 fs, or with respect to controld set versioning only? The advantage of eliminating controld is that all inter-node communication is handled by fs/dlm. This includes protocol negotiation. Unfortunately, dlm with fs-controld (v3) is not compatible dlm without fs-controld (v4). I think they call different numbers for protocol versioning (v5/v6). So, all nodes will have to be upgraded anyways with cluster downtime.
On 2013-09-27T12:07:53, Goldwyn Rodrigues <rgoldwyn@suse.de> wrote: > This is done primarily for backward compatibility. I hope we do > away with this sooner than later ;) I'd actually be quite happy if we could do away with this directly. Users that want to remain on an older user-space code base can always revert this or stick to a stable long-term kernel. That may be an unpopular opinion ;-) The relevant GFS2 commit seems to be http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/fs/gfs2?id=e0c2a9aa1e68455dc3439e95d85cabcaff073666 Regards, Lars
On 10/02/2013 04:49 AM, Lars Marowsky-Bree wrote: > On 2013-09-27T12:07:53, Goldwyn Rodrigues <rgoldwyn@suse.de> wrote: > >> This is done primarily for backward compatibility. I hope we do >> away with this sooner than later ;) > > I'd actually be quite happy if we could do away with this directly. > > Users that want to remain on an older user-space code base can always > revert this or stick to a stable long-term kernel. That may be an > unpopular opinion ;-) > > The relevant GFS2 commit seems to be > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/fs/gfs2?id=e0c2a9aa1e68455dc3439e95d85cabcaff073666 > > Just to be complete, the DLM recovery callbacks were introduced in this commit http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/fs/gfs2?id=60f98d1839376d30e13f3e452dce2433fad3060e GFS2 also supports the older controld using DFL_NO_DLM_OPS flag set in the lockspace structure. While I would also like the controld to be done away with: from a maintenance perspective.
On Sat, Sep 28, 2013 at 09:39:42AM -0500, Goldwyn Rodrigues wrote: > On 09/27/2013 02:02 PM, Joel Becker wrote: > >On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: > >>- /* > >>- * running_proto must have been set before we allowed any mounts > >>- * to proceed. > >>- */ > >>- if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > >>- printk(KERN_ERR > >>- "Unable to mount with fs locking protocol version " > >>- "%u.%u because the userspace control daemon has " > >>- "negotiated %u.%u\n", > >>- conn->cc_version.pv_major, conn->cc_version.pv_minor, > >>- running_proto.pv_major, running_proto.pv_minor); > >>- rc = -EPROTO; > >>- user_cluster_disconnect(conn); > >>- goto out; > >>+ if (type == WITH_CONTROLD) { > >>+ /* > >>+ * running_proto must have been set before we allowed any mounts > >>+ * to proceed. > >>+ */ > >>+ if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > > > >You need to find a way to compare the fs locking protocol in the new > >style. Otherwise the two ocfs2 versions can't be sure they are using > >the same locks in the same way. > > > > What locking protocol is it safeguarding? Is it something to do > specifically with the OCFS2 fs, or with respect to controld set > versioning only? Specific to ocfs2. Think about it this way. Both nodes might have the exact same version of fs/dlm, but node1 has an ocfs2 version using EX locks for an operation, while node2 has a new version of ocfs2 that can use PR locks for the same thing. The two cannot interact safely. By checking the protocol, the newer version knows to use the EX lock. > The advantage of eliminating controld is that all inter-node > communication is handled by fs/dlm. This includes protocol > negotiation. This isn't about controld here, it's about the way the filesystem uses locking. Similarly, o2cb/o2dlm has a version for the network/dlm protocol that is distinct from the fs protocol. A new mechanism without controld will need another way to communicate the fs protocol version. This will require a new major version for that as well. > Unfortunately, dlm with fs-controld (v3) is not compatible dlm > without fs-controld (v4). I think they call different numbers for > protocol versioning (v5/v6). So, all nodes will have to be upgraded > anyways with cluster downtime. Sure, but once you're on the new controld, you should still be able to upgrade your ocfs2 software without downtime. Joel
Hi Joel, Thanks for the comments. On 10/07/2013 07:00 PM, Joel Becker wrote: > On Sat, Sep 28, 2013 at 09:39:42AM -0500, Goldwyn Rodrigues wrote: >> On 09/27/2013 02:02 PM, Joel Becker wrote: >>> On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: >>>> - /* >>>> - * running_proto must have been set before we allowed any mounts >>>> - * to proceed. >>>> - */ >>>> - if (fs_protocol_compare(&running_proto, &conn->cc_version)) { >>>> - printk(KERN_ERR >>>> - "Unable to mount with fs locking protocol version " >>>> - "%u.%u because the userspace control daemon has " >>>> - "negotiated %u.%u\n", >>>> - conn->cc_version.pv_major, conn->cc_version.pv_minor, >>>> - running_proto.pv_major, running_proto.pv_minor); >>>> - rc = -EPROTO; >>>> - user_cluster_disconnect(conn); >>>> - goto out; >>>> + if (type == WITH_CONTROLD) { >>>> + /* >>>> + * running_proto must have been set before we allowed any mounts >>>> + * to proceed. >>>> + */ >>>> + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { >>> >>> You need to find a way to compare the fs locking protocol in the new >>> style. Otherwise the two ocfs2 versions can't be sure they are using >>> the same locks in the same way. >>> >> >> What locking protocol is it safeguarding? Is it something to do >> specifically with the OCFS2 fs, or with respect to controld set >> versioning only? > > Specific to ocfs2. Think about it this way. Both nodes might have the > exact same version of fs/dlm, but node1 has an ocfs2 version using EX > locks for an operation, while node2 has a new version of ocfs2 that can > use PR locks for the same thing. The two cannot interact safely. By > checking the protocol, the newer version knows to use the EX lock. What happens if a lower version ocfs2 node has mounted the ocfs2 partition and the higher version node attempts to mount the partition? though it's obvious, I would like to know the vice-versa case as well. I am thinking in terms of keeping the ocfs2 lock version on disk as a system file with each node PR locking and reading the file. The first mount writes it with an EX lock. Of course, we cannot afford to change this part of the locking in the future. Would that be a feasible solution? This may require version upgrade. > >> The advantage of eliminating controld is that all inter-node >> communication is handled by fs/dlm. This includes protocol >> negotiation. > > This isn't about controld here, it's about the way the > filesystem uses locking. Similarly, o2cb/o2dlm has a version for the > network/dlm protocol that is distinct from the fs protocol. > A new mechanism without controld will need another way to > communicate the fs protocol version. This will require a new major > version for that as well. > >> Unfortunately, dlm with fs-controld (v3) is not compatible dlm >> without fs-controld (v4). I think they call different numbers for >> protocol versioning (v5/v6). So, all nodes will have to be upgraded >> anyways with cluster downtime. > > Sure, but once you're on the new controld, you should still be able to > upgrade your ocfs2 software without downtime. > > Joel >
On Mon, Oct 07, 2013 at 07:17:46PM -0500, Goldwyn Rodrigues wrote: > On 10/07/2013 07:00 PM, Joel Becker wrote: > >On Sat, Sep 28, 2013 at 09:39:42AM -0500, Goldwyn Rodrigues wrote: > >>On 09/27/2013 02:02 PM, Joel Becker wrote: > >>>On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: > >>>>- /* > >>>>- * running_proto must have been set before we allowed any mounts > >>>>- * to proceed. > >>>>- */ > >>>>- if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > >>>>- printk(KERN_ERR > >>>>- "Unable to mount with fs locking protocol version " > >>>>- "%u.%u because the userspace control daemon has " > >>>>- "negotiated %u.%u\n", > >>>>- conn->cc_version.pv_major, conn->cc_version.pv_minor, > >>>>- running_proto.pv_major, running_proto.pv_minor); > >>>>- rc = -EPROTO; > >>>>- user_cluster_disconnect(conn); > >>>>- goto out; > >>>>+ if (type == WITH_CONTROLD) { > >>>>+ /* > >>>>+ * running_proto must have been set before we allowed any mounts > >>>>+ * to proceed. > >>>>+ */ > >>>>+ if (fs_protocol_compare(&running_proto, &conn->cc_version)) { > >>> > >>>You need to find a way to compare the fs locking protocol in the new > >>>style. Otherwise the two ocfs2 versions can't be sure they are using > >>>the same locks in the same way. > >>> > >> > >>What locking protocol is it safeguarding? Is it something to do > >>specifically with the OCFS2 fs, or with respect to controld set > >>versioning only? > > > >Specific to ocfs2. Think about it this way. Both nodes might have the > >exact same version of fs/dlm, but node1 has an ocfs2 version using EX > >locks for an operation, while node2 has a new version of ocfs2 that can > >use PR locks for the same thing. The two cannot interact safely. By > >checking the protocol, the newer version knows to use the EX lock. > > What happens if a lower version ocfs2 node has mounted the ocfs2 > partition and the higher version node attempts to mount the > partition? though it's obvious, I would like to know the vice-versa > case as well. This is explicitly documented in the version comparison code (fs_protocol_compare()): 1. If the major numbers are different, they are incompatible. 2. If the current minor is greater than the request, they are incompatible. 3. If the current minor is less than or equal to the request, they are compatible, and the requester should run at the current minor version. Specific examples: - If a node is the first node in the cluster, it will set the running version to its major.minor. - If a node joins a cluster already running at 1.2, and the new node has a version of 2.0, it will fail to mount (incompatible major version). - If a node joins a cluster already running at 1.2, and the new node has a version of 1.1, it will fail to mount (incompatible minor version). - If a node joins a cluster already running at 1.2, and the new node has a version of 1.3, it will mount at version 1.2 (matching the running minor version). > I am thinking in terms of keeping the ocfs2 lock version on disk as > a system file with each node PR locking and reading the file. The > first mount writes it with an EX lock. Of course, we cannot afford > to change this part of the locking in the future. Would that be a > feasible solution? This may require version upgrade. No. It should not be on disk, and it must not be permanent. Consider a cluster running at version 1.2. One by one, each node is upgraded to a new version of ocfs2 that supports the 1.3 protocol. Each node will still reconnect to the cluster at 1.2 due to the third rule above. But when the entire cluster is taken down for maintenance, they will start back up at 1.3. In the future, we may even support online update to the new version when every node has it. A far more reasonable solution would be to create a special lock in the DLM that has the version number in the LVB. You will, of course, have to handle LVB recovery. Joel
On 10/07/2013 07:43 PM, Joel Becker wrote: > On Mon, Oct 07, 2013 at 07:17:46PM -0500, Goldwyn Rodrigues wrote: >> On 10/07/2013 07:00 PM, Joel Becker wrote: >>> On Sat, Sep 28, 2013 at 09:39:42AM -0500, Goldwyn Rodrigues wrote: >>>> On 09/27/2013 02:02 PM, Joel Becker wrote: >>>>> On Fri, Sep 27, 2013 at 12:07:53PM -0500, Goldwyn Rodrigues wrote: >>>>>> - /* >>>>>> - * running_proto must have been set before we allowed any mounts >>>>>> - * to proceed. >>>>>> - */ >>>>>> - if (fs_protocol_compare(&running_proto, &conn->cc_version)) { >>>>>> - printk(KERN_ERR >>>>>> - "Unable to mount with fs locking protocol version " >>>>>> - "%u.%u because the userspace control daemon has " >>>>>> - "negotiated %u.%u\n", >>>>>> - conn->cc_version.pv_major, conn->cc_version.pv_minor, >>>>>> - running_proto.pv_major, running_proto.pv_minor); >>>>>> - rc = -EPROTO; >>>>>> - user_cluster_disconnect(conn); >>>>>> - goto out; >>>>>> + if (type == WITH_CONTROLD) { >>>>>> + /* >>>>>> + * running_proto must have been set before we allowed any mounts >>>>>> + * to proceed. >>>>>> + */ >>>>>> + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { >>>>> >>>>> You need to find a way to compare the fs locking protocol in the new >>>>> style. Otherwise the two ocfs2 versions can't be sure they are using >>>>> the same locks in the same way. >>>>> >>>> >>>> What locking protocol is it safeguarding? Is it something to do >>>> specifically with the OCFS2 fs, or with respect to controld set >>>> versioning only? >>> >>> Specific to ocfs2. Think about it this way. Both nodes might have the >>> exact same version of fs/dlm, but node1 has an ocfs2 version using EX >>> locks for an operation, while node2 has a new version of ocfs2 that can >>> use PR locks for the same thing. The two cannot interact safely. By >>> checking the protocol, the newer version knows to use the EX lock. >> >> What happens if a lower version ocfs2 node has mounted the ocfs2 >> partition and the higher version node attempts to mount the >> partition? though it's obvious, I would like to know the vice-versa >> case as well. > > This is explicitly documented in the version comparison code > (fs_protocol_compare()): > > 1. If the major numbers are different, they are incompatible. > 2. If the current minor is greater than the request, they are > incompatible. > 3. If the current minor is less than or equal to the request, they are > compatible, and the requester should run at the current minor > version. > > Specific examples: > > - If a node is the first node in the cluster, it will set the running > version to its major.minor. > - If a node joins a cluster already running at 1.2, and the new node has > a version of 2.0, it will fail to mount (incompatible major version). > - If a node joins a cluster already running at 1.2, and the new node has > a version of 1.1, it will fail to mount (incompatible minor version). > - If a node joins a cluster already running at 1.2, and the new node has > a version of 1.3, it will mount at version 1.2 (matching the running > minor version). > >> I am thinking in terms of keeping the ocfs2 lock version on disk as >> a system file with each node PR locking and reading the file. The >> first mount writes it with an EX lock. Of course, we cannot afford >> to change this part of the locking in the future. Would that be a >> feasible solution? This may require version upgrade. > > No. It should not be on disk, and it must not be permanent. Consider a > cluster running at version 1.2. One by one, each node is upgraded to a > new version of ocfs2 that supports the 1.3 protocol. Each node will > still reconnect to the cluster at 1.2 due to the third rule above. But > when the entire cluster is taken down for maintenance, they will start > back up at 1.3. In the future, we may even support online update to the > new version when every node has it. Yes, the method I proposed works with what you mentioned and it is not permanent. Let me elaborate on what I said. A node on mount after setting up DLM would: Requests a non-blocking EX lock on the protocol version file. If it fails, it takes a PR lock on the version file. If it succeeds, it writes it's own version info *overwriting* what was before and downconverts to PR lock. ie, even if there is a higher version in the file before. This could be done with existing inode locks and no other locking infrastructure needs to be added. This way if the first node is 1.2, the whole cluster will be 1.2 even if a node with 1.3 joins. The first node decides what the entire cluster will be. Later, if all nodes have upgraded to 1.3, and the whole cluster restarts after a total cluster shutdown, the whole cluster will start with 1.3 The reason I proposed a file is because this is ocfs2 specific, and ideally should not be mixed with dlm stuff. > > A far more reasonable solution would be to create a special lock in the > DLM that has the version number in the LVB. You will, of course, have > to handle LVB recovery. > If the above proposal works, we don't need to bother with recovery. It becomes DLM's responsibility. We could extend it to perform online locking version update, but it requires much more work from the locking POV.
On Tue, Oct 08, 2013 at 09:46:14AM -0500, Goldwyn Rodrigues wrote: > On 10/07/2013 07:43 PM, Joel Becker wrote: > >No. It should not be on disk, and it must not be permanent. Consider a > >cluster running at version 1.2. One by one, each node is upgraded to a > >new version of ocfs2 that supports the 1.3 protocol. Each node will > >still reconnect to the cluster at 1.2 due to the third rule above. But > >when the entire cluster is taken down for maintenance, they will start > >back up at 1.3. In the future, we may even support online update to the > >new version when every node has it. > > Yes, the method I proposed works with what you mentioned and it is > not permanent. Let me elaborate on what I said. A node on mount > after setting up DLM would: > > Requests a non-blocking EX lock on the protocol version file. > If it fails, it takes a PR lock on the version file. > If it succeeds, it writes it's own version info *overwriting* what > was before and downconverts to PR lock. ie, even if there is a > higher version in the file before. > > This could be done with existing inode locks and no other locking > infrastructure needs to be added. > > This way if the first node is 1.2, the whole cluster will be 1.2 > even if a node with 1.3 joins. The first node decides what the > entire cluster will be. Later, if all nodes have upgraded to 1.3, > and the whole cluster restarts after a total cluster shutdown, the > whole cluster will start with 1.3 > > The reason I proposed a file is because this is ocfs2 specific, and > ideally should not be mixed with dlm stuff. I understood what you said, and the entire point of using the network-based DLM is to stay away from disk structures for locking. There is nothing wrong with using the DLM for DLM things. Joel
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 2d4503b..38c69c8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -103,6 +103,11 @@ #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +enum ocfs2_connection_type { + NO_CONTROLD, + WITH_CONTROLD +}; + /* * ocfs2_live_connection is refcounted because the filesystem and * miscdevice sides can detach in different order. Let's just be safe. @@ -110,6 +115,7 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; }; struct ocfs2_control_private { @@ -199,7 +205,8 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * fill_super(), we can't get dupes here. */ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) + struct ocfs2_live_connection **c_ret, + enum ocfs2_connection_type type) { int rc = 0; struct ocfs2_live_connection *c; @@ -210,8 +217,9 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; + c->oc_type = type; - if (atomic_read(&ocfs2_control_opened)) + if ((type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -833,6 +841,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) dlm_lockspace_t *fsdlm; struct ocfs2_live_connection *uninitialized_var(control); int rc = 0, ops_rv; + enum ocfs2_connection_type type = NO_CONTROLD; BUG_ON(conn == NULL); @@ -843,38 +852,47 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (rc) goto out; - if (!ops_rv) { - /* DLM lockspace creation with newer new_lockspace successful */ - goto out; - } else if (ops_rv != -EOPNOTSUPP) { + if (ops_rv == -EOPNOTSUPP) { + type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { rc = ops_rv; goto out; } - conn->cc_lockspace = fsdlm; - printk(KERN_NOTICE "ocfs2: You seem to be using an older version " - "of dlm_controld and/or ocfs2-tools. Please consider " - "upgrading.\n"); - - rc = ocfs2_live_connection_new(conn, &control); + rc = ocfs2_live_connection_new(conn, &control, type); if (rc) goto out; - /* - * running_proto must have been set before we allowed any mounts - * to proceed. - */ - if (fs_protocol_compare(&running_proto, &conn->cc_version)) { - printk(KERN_ERR - "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", - conn->cc_version.pv_major, conn->cc_version.pv_minor, - running_proto.pv_major, running_proto.pv_minor); - rc = -EPROTO; - user_cluster_disconnect(conn); - goto out; + if (type == WITH_CONTROLD) { + /* + * running_proto must have been set before we allowed any mounts + * to proceed. + */ + if (fs_protocol_compare(&running_proto, &conn->cc_version)) { + printk(KERN_ERR + "Unable to mount with fs locking protocol" + " version %u.%u because the userspace control " + "daemon has negotiated %u.%u\n", + conn->cc_version.pv_major, + conn->cc_version.pv_minor, + running_proto.pv_major, + running_proto.pv_minor); + rc = -EPROTO; + user_cluster_disconnect(conn); + goto out; + } + + rc = dlm_new_lockspace(conn->cc_name, NULL, + DLM_LSFL_FS, DLM_LVB_LEN, + NULL, NULL, NULL, &fsdlm); + if (rc) { + ocfs2_live_connection_drop(control); + goto out; + } } conn->cc_private = control;