[v3,5/7] nfsdcltrack: update schema to v2

Message ID 1410193821-25109-6-git-send-email-jlayton@primarydata.com (mailing list archive)
State New, archived

Commit Message

Jeff Layton Sept. 8, 2014, 4:30 p.m. UTC
From: Jeff Layton <jlayton@poochiereds.net>

In order to allow knfsd's lock manager to lift its grace period early,
we need to figure out whether all clients have finished reclaiming
their state or not. Unfortunately, the current code doesn't allow us to
ascertain this. All we track for each client is a timestamp that tells
us when the last "check" or "create" operation came in.

We need to track the two timestamps separately. Add a new
"reclaim_complete" column to the database that tells us when the last
"create" operation came in. For now, we just insert "0" in that column
but a later patch will make it so that we insert a real timestamp for
v4.1+ client records.
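
For reference, the "clients" table before and after this change (this
just restates the CREATE TABLE statements from the patch below):

    -- v1 schema
    CREATE TABLE clients (id BLOB PRIMARY KEY, time INTEGER);

    -- v2 schema: adds the reclaim_complete timestamp
    CREATE TABLE clients (id BLOB PRIMARY KEY, time INTEGER,
                          reclaim_complete INTEGER);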

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
---
 utils/nfsdcltrack/sqlite.c | 102 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 8 deletions(-)

Comments

J. Bruce Fields Sept. 11, 2014, 7:55 p.m. UTC | #1
On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
> From: Jeff Layton <jlayton@poochiereds.net>
> 
> In order to allow knfsd's lock manager to lift its grace period early,
> we need to figure out whether all clients have finished reclaiming
> their state or not. Unfortunately, the current code doesn't allow us to
> ascertain this. All we track for each client is a timestamp that tells
> us when the last "check" or "create" operation came in.
> 
> We need to track the two timestamps separately. Add a new
> "reclaim_complete" column to the database that tells us when the last
> "create" operation came in. For now, we just insert "0" in that column
> but a later patch will make it so that we insert a real timestamp for
> v4.1+ client records.

If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
be counting a 4.1 client as allowed to reclaim on the next boot until we
get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
reclaim if all we got the previous boot was a reclaim open (a "check"
operation).

--b.

> 
> Signed-off-by: Jeff Layton <jlayton@primarydata.com>
> ---
>  utils/nfsdcltrack/sqlite.c | 102 +++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 94 insertions(+), 8 deletions(-)
> 
> diff --git a/utils/nfsdcltrack/sqlite.c b/utils/nfsdcltrack/sqlite.c
> index dde2f7ff8621..e260e81b1722 100644
> --- a/utils/nfsdcltrack/sqlite.c
> +++ b/utils/nfsdcltrack/sqlite.c
> @@ -29,7 +29,10 @@
>   *
>   * clients: an "id" column containing a BLOB with the long-form clientid as
>   * 	    sent by the client, a "time" column containing a timestamp (in
> - * 	    epoch seconds) of when the record was last updated.
> + * 	    epoch seconds) of when the record was last updated, and a
> + * 	    "reclaim_complete" column containing a timestamp (in epoch seconds)
> + * 	    of when the last "create" operation came in for v4.1+ clients.
> + * 	    v4.0 clients should always have this set to 0.
>   */
>  
>  #ifdef HAVE_CONFIG_H
> @@ -50,7 +53,7 @@
>  
>  #include "xlog.h"
>  
> -#define CLTRACK_SQLITE_LATEST_SCHEMA_VERSION 1
> +#define CLTRACK_SQLITE_LATEST_SCHEMA_VERSION 2
>  
>  /* in milliseconds */
>  #define CLTRACK_SQLITE_BUSY_TIMEOUT 10000
> @@ -120,6 +123,81 @@ out:
>  	return ret;
>  }
>  
> +static int
> +sqlite_maindb_update_v1_to_v2(void)
> +{
> +	int ret, ret2;
> +	char *err;
> +
> +	/* begin transaction */
> +	ret = sqlite3_exec(dbh, "BEGIN EXCLUSIVE TRANSACTION;", NULL, NULL,
> +				&err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to begin transaction: %s", err);
> +		goto rollback;
> +	}
> +
> +	/*
> +	 * Check schema version again. This time, under an exclusive
> +	 * transaction to guard against racing DB setup attempts
> +	 */
> +	ret = sqlite_query_schema_version();
> +	switch (ret) {
> +	case 1:
> +		/* Still at v1 -- do conversion */
> +		break;
> +	case CLTRACK_SQLITE_LATEST_SCHEMA_VERSION:
> +		/* Someone else raced in and set it up */
> +		ret = 0;
> +		goto rollback;
> +	default:
> +		/* Something went wrong -- fail! */
> +		ret = -EINVAL;
> +		goto rollback;
> +	}
> +
> +	/* create v2 clients table */
> +	ret = sqlite3_exec(dbh, "ALTER TABLE clients ADD COLUMN "
> +				"reclaim_complete INTEGER;",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to update clients table: %s", err);
> +		goto rollback;
> +	}
> +
> +	ret = snprintf(buf, sizeof(buf), "UPDATE parameters SET value = %d "
> +			"WHERE key = \"version\";",
> +			CLTRACK_SQLITE_LATEST_SCHEMA_VERSION);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		goto rollback;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		ret = -EINVAL;
> +		goto rollback;
> +	}
> +
> +	ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to update schema version: %s", err);
> +		goto rollback;
> +	}
> +
> +	ret = sqlite3_exec(dbh, "COMMIT TRANSACTION;", NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to commit transaction: %s", err);
> +		goto rollback;
> +	}
> +out:
> +	sqlite3_free(err);
> +	return ret;
> +rollback:
> +	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
> +	if (ret2 != SQLITE_OK)
> +		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
> +	goto out;
> +}
> +
>  /*
>   * Start an exclusive transaction and recheck the DB schema version. If it's
>   * still zero (indicating a new database) then set it up. If that all works,
> @@ -127,9 +205,9 @@ out:
>   * transaction. On any error, rollback the transaction.
>   */
>  int
> -sqlite_maindb_init_v1(void)
> +sqlite_maindb_init_v2(void)
>  {
> -	int ret;
> +	int ret, ret2;
>  	char *err = NULL;
>  
>  	/* Start a transaction */
> @@ -169,7 +247,7 @@ sqlite_maindb_init_v1(void)
>  
>  	/* create the "clients" table */
>  	ret = sqlite3_exec(dbh, "CREATE TABLE clients (id BLOB PRIMARY KEY, "
> -				"time INTEGER);",
> +				"time INTEGER, reclaim_complete INTEGER);",
>  				NULL, NULL, &err);
>  	if (ret != SQLITE_OK) {
>  		xlog(L_ERROR, "Unable to create clients table: %s", err);
> @@ -207,7 +285,9 @@ out:
>  
>  rollback:
>  	/* Attempt to rollback the transaction */
> -	sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
> +	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
> +	if (ret2 != SQLITE_OK)
> +		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
>  	goto out;
>  }
>  
> @@ -255,9 +335,15 @@ sqlite_prepare_dbh(const char *topdir)
>  		/* DB is already set up. Do nothing */
>  		ret = 0;
>  		break;
> +	case 1:
> +		/* Old DB -- update to new schema */
> +		ret = sqlite_maindb_update_v1_to_v2();
> +		if (ret)
> +			goto out_close;
> +		break;
>  	case 0:
>  		/* Query failed -- try to set up new DB */
> -		ret = sqlite_maindb_init_v1();
> +		ret = sqlite_maindb_init_v2();
>  		if (ret)
>  			goto out_close;
>  		break;
> @@ -289,7 +375,7 @@ sqlite_insert_client(const unsigned char *clname, const size_t namelen)
>  	sqlite3_stmt *stmt = NULL;
>  
>  	ret = sqlite3_prepare_v2(dbh, "INSERT OR REPLACE INTO clients VALUES "
> -				      "(?, strftime('%s', 'now'));", -1,
> +					"(?, strftime('%s', 'now'), 0);", -1,
>  					&stmt, NULL);
>  	if (ret != SQLITE_OK) {
>  		xlog(L_ERROR, "%s: insert statement prepare failed: %s",
> -- 
> 1.9.3
> 
Jeff Layton Sept. 11, 2014, 8:28 p.m. UTC | #2
On Thu, 11 Sep 2014 15:55:47 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
> > From: Jeff Layton <jlayton@poochiereds.net>
> > 
> > In order to allow knfsd's lock manager to lift its grace period early,
> > we need to figure out whether all clients have finished reclaiming
> > their state or not. Unfortunately, the current code doesn't allow us to
> > ascertain this. All we track for each client is a timestamp that tells
> > us when the last "check" or "create" operation came in.
> > 
> > We need to track the two timestamps separately. Add a new
> > "reclaim_complete" column to the database that tells us when the last
> > "create" operation came in. For now, we just insert "0" in that column
> > but a later patch will make it so that we insert a real timestamp for
> > v4.1+ client records.
> 
> If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
> be counting a 4.1 client as allowed to reclaim on the next boot until we
> get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
> reclaim if all we got the previous boot was a reclaim open (a "check"
> operation).
> 
> --b.
> 

Yeah, I guess so, with a bit of a clarification I think...

We don't want to allow a v4.1 client to reclaim if it didn't send a
RECLAIM_COMPLETE prior to the last reboot *and* the grace period ended
prior to the last reboot.

IOW, in the case where the reboot occurs before the grace period ends,
we don't want to clean out the DB and deny reclaims. FWIW, the legacy
client tracker got this very wrong -- if you did a couple of rapid
reboots in succession you couldn't reclaim once everything was back up.

I'll have to ponder how best to fix that. Given that the logic required
is quite different between v4.0 and v4.1 clients, we may have to add yet
another column to the DB to track what sort of client this is.
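
A rough sketch of what that conversion might look like, if we go that
route (the "minorversion" column name is purely illustrative -- nothing
is settled yet):

    /* hypothetical v2->v3 step, modeled on
     * sqlite_maindb_update_v1_to_v2() from this patch */
    ret = sqlite3_exec(dbh, "ALTER TABLE clients ADD COLUMN "
                            "minorversion INTEGER;",
                            NULL, NULL, &err);
    if (ret != SQLITE_OK) {
            xlog(L_ERROR, "Unable to update clients table: %s", err);
            goto rollback;
    }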

Jeff Layton Sept. 12, 2014, 1:36 p.m. UTC | #3
On Thu, 11 Sep 2014 16:28:36 -0400
Jeff Layton <jeff.layton@primarydata.com> wrote:

> On Thu, 11 Sep 2014 15:55:47 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
> > > From: Jeff Layton <jlayton@poochiereds.net>
> > > 
> > > In order to allow knfsd's lock manager to lift its grace period early,
> > > we need to figure out whether all clients have finished reclaiming
> > > their state or not. Unfortunately, the current code doesn't allow us to
> > > ascertain this. All we track for each client is a timestamp that tells
> > > us when the last "check" or "create" operation came in.
> > > 
> > > We need to track the two timestamps separately. Add a new
> > > "reclaim_complete" column to the database that tells us when the last
> > > "create" operation came in. For now, we just insert "0" in that column
> > > but a later patch will make it so that we insert a real timestamp for
> > > v4.1+ client records.
> > 
> > If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
> > be counting a 4.1 client as allowed to reclaim on the next boot until we
> > get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
> > reclaim if all we got the previous boot was a reclaim open (a "check"
> > operation).
> > 
> > --b.
> > 
> 
> Yeah, I guess so, with a bit of a clarification I think...
> 
> We don't want to allow a v4.1 client to reclaim if it didn't send a
> RECLAIM_COMPLETE prior to the last reboot *and* the grace period ended
> prior to the last reboot.
> 
> IOW, in the case where the reboot occurs before the grace period ends,
> we don't want to clean out the DB and deny reclaims. FWIW, the legacy
> client tracker got this very wrong -- if you did a couple of rapid
> reboots in succession you couldn't reclaim once everything was back up.
> 
> I'll have to ponder how best to fix that. Given that the logic required
> is quite different between v4.0 and v4.1 clients, we may have to add yet
> another column to the DB to track what sort of client this is.
> 

This new requirement complicates things quite a bit. I'll have to
respin both patchsets.

I think we can fix this by ensuring that we clean out any v4.1+ clients
that have not done a "create" since the start of the grace period
during a "grace_done" upcall. For v4.0 clients, we can't do that of
course since a v4.0 client may reclaim opens but never do a new one
(and so may never send a "create" at all).

That means that we'll also need to send something in the "check" upcall
that indicates the client's minorversion. The good news is that we
won't need a new column in the DB since the only timestamp that matters
for v4.1+ clients is the "create" time. We can just avoid setting the
time field for v4.1+ clients on the "check" upcall.
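
Putting that together, the "grace_done" cleanup might look something
like this (just a sketch -- it assumes the kernel hands the grace period
start time down with the upcall):

    /* sketch: drop records not refreshed since the grace period began.
     * With "check" no longer bumping the time field for v4.1+ clients,
     * their records survive only if a "create" came in; v4.0 records
     * survive on either a "check" or a "create". */
    ret = snprintf(buf, sizeof(buf),
                   "DELETE FROM clients WHERE time < %ld;",
                   (long)grace_start);
    if (ret < 0 || (size_t)ret >= sizeof(buf))
            return -EINVAL;
    ret = sqlite3_exec(dbh, buf, NULL, NULL, &err);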

Now that we need to send info about the minorversion in a "check", I
may go back to sending an actual minorversion in the upcall's
environment vars. It doesn't make sense to me to send a boolean about
RECLAIM_COMPLETE when the client hasn't actually sent one.
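
Consuming that on the nfsdcltrack side would be trivial; a sketch, with
a made-up variable name since the interface isn't settled:

    /* hypothetical: parse the client's minorversion out of the
     * environment that the kernel sets up for the upcall */
    long minorversion = 0;
    char *mv = getenv("NFSDCLTRACK_CLIENT_MINORVERSION");

    if (mv)
            minorversion = strtol(mv, NULL, 10);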

I'll get started on reworking this but I have no idea on an ETA just
yet. Hopefully I can have something that works by next week sometime.
Jeff Layton Sept. 12, 2014, 2:21 p.m. UTC | #4
On Fri, 12 Sep 2014 09:36:00 -0400
Jeff Layton <jlayton@primarydata.com> wrote:

> On Thu, 11 Sep 2014 16:28:36 -0400
> Jeff Layton <jeff.layton@primarydata.com> wrote:
> 
> > On Thu, 11 Sep 2014 15:55:47 -0400
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
> > > > From: Jeff Layton <jlayton@poochiereds.net>
> > > > 
> > > > In order to allow knfsd's lock manager to lift its grace period early,
> > > > we need to figure out whether all clients have finished reclaiming
> > > > their state or not. Unfortunately, the current code doesn't allow us to
> > > > ascertain this. All we track for each client is a timestamp that tells
> > > > us when the last "check" or "create" operation came in.
> > > > 
> > > > We need to track the two timestamps separately. Add a new
> > > > "reclaim_complete" column to the database that tells us when the last
> > > > "create" operation came in. For now, we just insert "0" in that column
> > > > but a later patch will make it so that we insert a real timestamp for
> > > > v4.1+ client records.
> > > 
> > > If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
> > > be counting a 4.1 client as allowed to reclaim on the next boot until we
> > > get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
> > > reclaim if all we got the previous boot was a reclaim open (a "check"
> > > operation).
> > > 
> > > --b.
> > > 
> > 
> > Yeah, I guess so, with a bit of a clarification I think...
> > 
> > We don't want to allow a v4.1 client to reclaim if it didn't send a
> > RECLAIM_COMPLETE prior to the last reboot *and* the grace period ended
> > prior to the last reboot.
> > 
> > IOW, in the case where the reboot occurs before the grace period ends,
> > we don't want to clean out the DB and deny reclaims. FWIW, the legacy
> > client tracker got this very wrong -- if you did a couple of rapid
> > reboots in succession you couldn't reclaim once everything was back up.
> > 
> > I'll have to ponder how best to fix that. Given that the logic required
> > is quite different between v4.0 and v4.1 clients, we may have to add yet
> > another column to the DB to track what sort of client this is.
> > 
> 
> This new requirement complicates things quite a bit. I'll have to
> respin both patchsets.
> 
> I think we can fix this by ensuring that we clean out any v4.1+ clients
> that have not done a "create" since the start of the grace period
> during a "grace_done" upcall. For v4.0 clients, we can't do that of
> course since a v4.0 client may reclaim opens but never do a new one
> (and so may never send a "create" at all).
> 
> That means that we'll also need to send something in the "check" upcall
> that indicates the client's minorversion. The good news is that we
> won't need a new column in the DB since the only timestamp that matters
> for v4.1+ clients is the "create" time. We can just avoid setting the
> time field for v4.1+ clients on the "check" upcall.
> 
> Now that we need to send info about the minorversion in a "check", I
> may go back to sending an actual minorversion in the upcall's
> environment vars. It doesn't make sense to me to send a boolean about
> RECLAIM_COMPLETE when the client hasn't actually sent one.
> 
> I'll get started on reworking this but I have no idea on an ETA just
> yet. Hopefully I can have something that works by next week sometime.
>  

This is actually a much larger can of worms than it looks at first.
Consider this:

Server reboots and v4.1+ client reclaims a few records but never sends
a RECLAIM_COMPLETE (client bug or maybe some bad timing?). Grace period
eventually ends, and its record is purged from the DB.

Now we have a client that has reclaimed some files but that has no
record on stable storage.

One possibility is to prematurely expire v4.1+ clients that have not
sent a RECLAIM_COMPLETE when the grace period ends.

That seems problematic though -- what about clients that just happen to
do an EXCHANGE_ID just before the grace period is going to end, and
that get expired before they can issue their RECLAIM_COMPLETE. Will
that be a problem for them?

Thoughts?
J. Bruce Fields Sept. 12, 2014, 2:36 p.m. UTC | #5
On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
> On Fri, 12 Sep 2014 09:36:00 -0400
> Jeff Layton <jlayton@primarydata.com> wrote:
> 
> > On Thu, 11 Sep 2014 16:28:36 -0400
> > Jeff Layton <jeff.layton@primarydata.com> wrote:
> > 
> > > On Thu, 11 Sep 2014 15:55:47 -0400
> > > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > > 
> > > > On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
> > > > > From: Jeff Layton <jlayton@poochiereds.net>
> > > > > 
> > > > > In order to allow knfsd's lock manager to lift its grace period early,
> > > > > we need to figure out whether all clients have finished reclaiming
> > > > > their state or not. Unfortunately, the current code doesn't allow us to
> > > > > ascertain this. All we track for each client is a timestamp that tells
> > > > > us when the last "check" or "create" operation came in.
> > > > > 
> > > > > We need to track the two timestamps separately. Add a new
> > > > > "reclaim_complete" column to the database that tells us when the last
> > > > > "create" operation came in. For now, we just insert "0" in that column
> > > > > but a later patch will make it so that we insert a real timestamp for
> > > > > v4.1+ client records.
> > > > 
> > > > If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
> > > > be counting a 4.1 client as allowed to reclaim on the next boot until we
> > > > get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
> > > > reclaim if all we got the previous boot was a reclaim open (a "check"
> > > > operation).
> > > > 
> > > > --b.
> > > > 
> > > 
> > > Yeah, I guess so, with a bit of a clarification I think...
> > > 
> > > We don't want to allow a v4.1 client to reclaim if it didn't send a
> > > RECLAIM_COMPLETE prior to the last reboot *and* the grace period ended
> > > prior to the last reboot.
> > > 
> > > IOW, in the case where the reboot occurs before the grace period ends,
> > > we don't want to clean out the DB and deny reclaims. FWIW, the legacy
> > > client tracker got this very wrong -- if you did a couple of rapid
> > > reboots in succession you couldn't reclaim once everything was back up.
> > > 
> > > I'll have to ponder how best to fix that. Given that the logic required
> > > is quite different between v4.0 and v4.1 clients, we may have to add yet
> > > another column to the DB to track what sort of client this is.
> > > 
> > 
> > This new requirement complicates things quite a bit. I'll have to
> > respin both patchsets.
> > 
> > I think we can fix this by ensuring that we clean out any v4.1+ clients
> > that have not done a "create" since the start of the grace period
> > during a "grace_done" upcall. For v4.0 clients, we can't do that of
> > course since a v4.0 client may reclaim opens but never do a new one
> > (and so may never send a "create" at all).
> > 
> > That means that we'll also need to send something in the "check" upcall
> > that indicates the client's minorversion. The good news is that we
> > won't need a new column in the DB since the only timestamp that matters
> > for v4.1+ clients is the "create" time. We can just avoid setting the
> > time field for v4.1+ clients on the "check" upcall.
> > 
> > Now that we need to send info about the minorversion in a "check", I
> > may go back to sending an actual minorversion in the upcall's
> > environment vars. It doesn't make sense to me to send a boolean about
> > RECLAIM_COMPLETE when the client hasn't actually sent one.
> > 
> > I'll get started on reworking this but I have no idea on an ETA just
> > yet. Hopefully I can have something that works by next week sometime.
> >  
> 
> This is actually a much larger can of worms than it looks at first.
> Consider this:
> 
> Server reboots and v4.1+ client reclaims a few records but never sends
> a RECLAIM_COMPLETE (client bug or maybe some bad timing?).

The client must send a RECLAIM_COMPLETE.  It's not permitted to do any
regular opens, for example, till it does.

So either the client is buggy (too bad), or it's lost touch with the
server (in which case it will eventually expire normally).

I don't see any work to do here.

> Grace period
> eventually ends, and its record is purged from the DB.
> 
> Now we have a client that has reclaimed some files but that has no
> record on stable storage.
> 
> One possibility is to prematurely expire v4.1+ clients that have not
> sent a RECLAIM_COMPLETE when the grace period ends.
> 
> That seems problematic though -- what about clients that just happen to
> do an EXCHANGE_ID just before the grace period is going to end, and
> that get expired before they can issue their RECLAIM_COMPLETE. Will
> that be a problem for them?

In that case a client will send a reclaim, get back a NO_GRACE error,
mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
and continue normally.  (To the extent it can--signalling affected
processes or EIOing further attempts to use the unreclaimed state, or
whatever.)

--b.
J. Bruce Fields Sept. 12, 2014, 3:21 p.m. UTC | #6
On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
> > Grace period
> > eventually ends, and its record is purged from the DB.
> > 
> > Now we have a client that has reclaimed some files but that has no
> > record on stable storage.
> > 
> > One possibility is to prematurely expire v4.1+ clients that have not
> > sent a RECLAIM_COMPLETE when the grace period ends.
> > 
> > That seems problematic though -- what about clients that just happen to
> > do an EXCHANGE_ID just before the grace period is going to end, and
> > that get expired before they can issue their RECLAIM_COMPLETE. Will
> > that be a problem for them?
> 
> In that case a client will send a reclaim, get back a NO_GRACE error,
> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
> and continue normally.  (To the extent it can--signalling affected
> processes or EIOing further attempts to use the unreclaimed state, or
> whatever.)

The one thing the server *could* do in this sort of case is extend the
grace period by a little--I seem to recall the spec giving some leeway
for this kind of thing.

So for example the server could have a heuristic like: extend the grace
period by another second each time we notice there's been an EXCHANGE_ID
or reclaim in the previous second, up to some maximum.  And I suppose it
could also delay the grace period until someone actually attempts a
non-reclaim open.
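
Roughly (a sketch only -- all of these names are invented, this isn't
existing knfsd code):

    /* hypothetical: stretch the grace period while reclaims trickle in */
    if (saw_reclaim_or_exchange_id_since(now - 1) &&
        grace_end - grace_start < MAX_GRACE_EXTENSION)
            grace_end += 1;    /* one more second, up to the cap */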

In isolation a single client slipping in at the end like that sounds like a
freak event, but if there's a ton of state to reclaim perhaps it could
become more likely.

I don't think that's a priority, we might just want to make sure we know
how to do that in the future.

But now that I think about it I don't see the existing or proposed
nfsdcltrack stuff tying our hands in any way here.  It just gives the
kernel some extra information, and the kernel still has discretion about
when exactly it wants to end the grace period.

--b.
Trond Myklebust Sept. 12, 2014, 3:42 p.m. UTC | #7
On Fri, Sep 12, 2014 at 10:21 AM, Jeff Layton
<jeff.layton@primarydata.com> wrote:
> On Fri, 12 Sep 2014 09:36:00 -0400
> Jeff Layton <jlayton@primarydata.com> wrote:
>
>> On Thu, 11 Sep 2014 16:28:36 -0400
>> Jeff Layton <jeff.layton@primarydata.com> wrote:
>>
>> > On Thu, 11 Sep 2014 15:55:47 -0400
>> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
>> >
>> > > On Mon, Sep 08, 2014 at 12:30:19PM -0400, Jeff Layton wrote:
>> > > > From: Jeff Layton <jlayton@poochiereds.net>
>> > > >
>> > > > In order to allow knfsd's lock manager to lift its grace period early,
>> > > > we need to figure out whether all clients have finished reclaiming
>> > > > their state or not. Unfortunately, the current code doesn't allow us to
>> > > > ascertain this. All we track for each client is a timestamp that tells
>> > > > us when the last "check" or "create" operation came in.
>> > > >
>> > > > We need to track the two timestamps separately. Add a new
>> > > > "reclaim_complete" column to the database that tells us when the last
>> > > > "create" operation came in. For now, we just insert "0" in that column
>> > > > but a later patch will make it so that we insert a real timestamp for
>> > > > v4.1+ client records.
>> > >
>> > > If I understand correctly, then nfsdcltrack has a bug here: we shouldn't
>> > > be counting a 4.1 client as allowed to reclaim on the next boot until we
>> > > get the RECLAIM_COMPLETE, but nfsdcltrack is allowing a 4.1 client to
>> > > reclaim if all we got the previous boot was a reclaim open (a "check"
>> > > operation).
>> > >
>> > > --b.
>> > >
>> >
>> > Yeah, I guess so, with a bit of a clarification I think...
>> >
>> > We don't want to allow a v4.1 client to reclaim if it didn't send a
>> > RECLAIM_COMPLETE prior to the last reboot *and* the grace period ended
>> > prior to the last reboot.
>> >
>> > IOW, in the case where the reboot occurs before the grace period ends,
>> > we don't want to clean out the DB and deny reclaims. FWIW, the legacy
>> > client tracker got this very wrong -- if you did a couple of rapid
>> > reboots in succession you couldn't reclaim once everything was back up.
>> >
>> > I'll have to ponder how best to fix that. Given that the logic required
>> > is quite different between v4.0 and v4.1 clients, we may have to add yet
>> > another column to the DB to track what sort of client this is.
>> >
>>
>> This new requirement complicates things quite a bit. I'll have to
>> respin both patchsets.
>>
>> I think we can fix this by ensuring that we clean out any v4.1+ clients
>> that have not done a "create" since the start of the grace period
>> during a "grace_done" upcall. For v4.0 clients, we can't do that of
>> course since a v4.0 client may reclaim opens but never do a new one
>> (and so may never send a "create" at all).
>>
>> That means that we'll also need to send something in the "check" upcall
>> that indicates the client's minorversion. The good news is that we
>> won't need a new column in the DB since the only timestamp that matters
>> for v4.1+ clients is the "create" time. We can just avoid setting the
>> time field for v4.1+ clients on the "check" upcall.
>>
>> Now that we need to send info about the minorversion in a "check", I
>> may go back to sending an actual minorversion in the upcall's
>> environment vars. It doesn't make sense to me to send a boolean about
>> RECLAIM_COMPLETE when the client hasn't actually sent one.
>>
>> I'll get started on reworking this but I have no idea on an ETA just
>> yet. Hopefully I can have something that works by next week sometime.
>>
>
> This is actually a much larger can of worms than it looks at first.
> Consider this:
>
> Server reboots and v4.1+ client reclaims a few records but never sends
> a RECLAIM_COMPLETE (client bug or maybe some bad timing?). Grace period
> eventually ends, and its record is purged from the DB.
>
> Now we have a client that has reclaimed some files but that has no
> record on stable storage.
>
> One possibility is to prematurely expire v4.1+ clients that have not
> sent a RECLAIM_COMPLETE when the grace period ends.
>
> That seems problematic though -- what about clients that just happen to
> do an EXCHANGE_ID just before the grace period is going to end, and
> that get expired before they can issue their RECLAIM_COMPLETE. Will
> that be a problem for them?
>
> Thoughts?

See RFC5661 section 8.4.3, which describes those edge conditions, and
how to deal with them.
Trond Myklebust Sept. 12, 2014, 3:54 p.m. UTC | #8
On Fri, Sep 12, 2014 at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
>> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
>> > Grace period
>> > eventually ends, and its record is purged from the DB.
>> >
>> > Now we have a client that has reclaimed some files but that has no
>> > record on stable storage.
>> >
>> > One possibility is to prematurely expire v4.1+ clients that have not
>> > sent a RECLAIM_COMPLETE when the grace period ends.
>> >
>> > That seems problematic though -- what about clients that just happen to
>> > do an EXCHANGE_ID just before the grace period is going to end, and
>> > that get expired before they can issue their RECLAIM_COMPLETE. Will
>> > that be a problem for them?
>>
>> In that case a client will send a reclaim, get back a NO_GRACE error,
>> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
>> and continue normally.  (To the extent it can--signalling affected
>> processes or EIOing further attempts to use the unreclaimed state, or
>> whatever.)
>
> The one thing the server *could* do in this sort of case is extend the
> grace period by a little--I seem to recall the spec giving some leeway
> for this kind of thing.


Section 8.4.2.1.

> So for example the server could have a heuristic like: extend the grace
> period by another second each time we notice there's been an EXCHANGE_ID
> or reclaim in the previous second, up to some maximum.  And I suppose it
> could also delay the grace period until someone actually attempts a
> non-reclaim open.
>
> In isolation a single client slipping in at the end like that sounds like a
> freak event, but if there's a ton of state to reclaim perhaps it could
> become more likely.
>
> I don't think that's a priority, we might just want to make sure we know
> how to do that in the future.
>
> But now that I think about it I don't see the existing or proposed
> nfsdcltrack stuff tying our hands in any way here.  It just gives the
> kernel some extra information, and the kernel still has discretion about
> when exactly it wants to end the grace period.
>

It is even allowed to grant reclaim lock attempts after the grace
period has ended _if_ and only if it can guarantee that no conflicting
locks were issued.

However note that the NFSv4.1 client is not actually allowed to issue
non-reclaim lock requests before it has issued a RECLAIM_COMPLETE. I
dunno how religiously we stick to that in Linux (I think we do), but
the point is that the server can and should rely on the client
_always_ sending a RECLAIM_COMPLETE if it is going to establish new
locks.
J. Bruce Fields Sept. 12, 2014, 4:05 p.m. UTC | #9
On Fri, Sep 12, 2014 at 11:54:17AM -0400, Trond Myklebust wrote:
> On Fri, Sep 12, 2014 at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
> >> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
> >> > Grace period
> >> > eventually ends, and its record is purged from the DB.
> >> >
> >> > Now we have a client that has reclaimed some files but that has no
> >> > record on stable storage.
> >> >
> >> > One possibility is to prematurely expire v4.1+ clients that have not
> >> > sent a RECLAIM_COMPLETE when the grace period ends.
> >> >
> >> > That seems problematic though -- what about clients that just happen to
> >> > do an EXCHANGE_ID just before the grace period is going to end, and
> >> > that get expired before they can issue their RECLAIM_COMPLETE. Will
> >> > that be a problem for them?
> >>
> >> In that case a client will send a reclaim, get back a NO_GRACE error,
> >> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
> >> and continue normally.  (To the extent it can--signalling affected
> >> processes or EIOing further attempts to use the unreclaimed state, or
> >> whatever.)
> >
> > The one thing the server *could* do in this sort of case is extend the
> > grace period by a little--I seem to recall the spec giving some leeway
> > for this kind of thing.
> 
> 
> Section 8.4.2.1.

Thanks!

	http://tools.ietf.org/html/rfc5661#section-8.4.2.1

	"Some additional time in order to allow a client to establish a
	new client ID and session and to effect lock reclaims may be
	added to the lease time."

I thought there was something else but I actually must have been
remembering the "diligently flushing" language describing delegation
recalls.  Anyway.

> 
> > So for example the server could have a heuristic like: extend the grace
> > period by another second each time we notice there's been an EXCHANGE_ID
> > or reclaim in the previous second, up to some maximum.  And I suppose it
> > could also delay the grace period until someone actually attempts a
> > non-reclaim open.
> >
> > In isolation a single client slipping in at the end like that sounds like a
> > freak event, but if there's a ton of state to reclaim perhaps it could
> > become more likely.
> >
> > I don't think that's a priority, we might just want to make sure we know
> > how to do that in the future.
> >
> > But now that I think about it I don't see the existing or proposed
> > nfsdcltrack stuff tying our hands in any way here.  It just gives the
> > kernel some extra information, and the kernel still has discretion about
> > when exactly it wants to end the grace period.
> >
> 
> It is even allowed to grant reclaim lock attempts after the grace
> period has ended _if_ and only if it can guarantee that no conflicting
> locks were issued.
> 
> However note that the NFSv4.1 client is not actually allowed to issue
> non-reclaim lock requests before it has issued a RECLAIM_COMPLETE. I
> dunno how religiously we stick to that in Linux (I think we do),

Yes, the server strictly enforces this so we'd know if the client
skipped the RECLAIM_COMPLETE.  (Any open would fail over 4.1.)

> but the point is that the server can and should rely on the client
> _always_ sending a RECLAIM_COMPLETE if it is going to establish new
> locks.

Yes.

--b.
Jeff Layton Sept. 12, 2014, 4:07 p.m. UTC | #10
On Fri, 12 Sep 2014 11:54:17 -0400
Trond Myklebust <trondmy@gmail.com> wrote:

> On Fri, Sep 12, 2014 at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
> >> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
> >> > Grace period
> >> > eventually ends, and its record is purged from the DB.
> >> >
> >> > Now we have a client that has reclaimed some files but that has no
> >> > record on stable storage.
> >> >
> >> > One possibility is to prematurely expire v4.1+ clients that have not
> >> > sent a RECLAIM_COMPLETE when the grace period ends.
> >> >
> >> > That seems problematic though -- what about clients that just happen to
> >> > do an EXCHANGE_ID just before the grace period is going to end, and
> >> > that get expired before they can issue their RECLAIM_COMPLETE. Will
> >> > that be a problem for them?
> >>
> >> In that case a client will send a reclaim, get back a NO_GRACE error,
> >> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
> >> and continue normally.  (To the extent it can--signalling affected
> >> processes or EIOing further attempts to use the unreclaimed state, or
> >> whatever.)
> >
> > The one thing the server *could* do in this sort of case is extend the
> > grace period by a little--I seem to recall the spec giving some leeway
> > for this kind of thing.
> 
> 
> Section 8.4.2.1.
> 
> > So for example the server could have a heuristic like: extend the grace
> > period by another second each time we notice there's been an EXCHANGE_ID
> > or reclaim in the previous second, up to some maximum.  And I suppose it
> > could also delay the grace period until someone actually attempts a
> > non-reclaim open.
> >
> > In isolation a single client slipping in at the end like that sounds like a
> > freak event, but if there's a ton of state to reclaim perhaps it could
> > become more likely.
> >
> > I don't think that's a priority, we might just want to make sure we know
> > how to do that in the future.
> >
> > But now that I think about it I don't see the existing or proposed
> > nfsdcltrack stuff tying our hands in any way here.  It just gives the
> > kernel some extra information, and the kernel still has discretion about
> > when exactly it wants to end the grace period.
> >
> 
> It is even allowed to grant reclaim lock attempts after the grace
> period has ended _if_ and only if it can guarantee that no conflicting
> locks were issued.
> 
> However note that the NFSv4.1 client is not actually allowed to issue
> non-reclaim lock requests before it has issued a RECLAIM_COMPLETE. I
> dunno how religiously we stick to that in Linux (I think we do), but
> the point is that the server can and should rely on the client
> _always_ sending a RECLAIM_COMPLETE if it is going to establish new
> locks.

Yeah, I'm pretty sure that bit is enforced. The problem situation that
I think Bruce was referring to is this:

Server reboots. Client1 reclaims some of its locks (but not all) and
never sends a RECLAIM_COMPLETE. Grace period ends and then server
hands out a lock to client2 that was previously held by client1 but
that didn't get reclaimed.

Server reboots again, prior to the client1 expiring (so its record is
still in the DB). Now client1 comes back and starts reclaiming again.
This time it reclaims all of its locks and we have a conflict between
it and client2.

It's a solvable problem, but I'll need to work through how best to do
so.
Trond Myklebust Sept. 12, 2014, 4:29 p.m. UTC | #11
On Fri, Sep 12, 2014 at 12:07 PM, Jeff Layton
<jeff.layton@primarydata.com> wrote:
> On Fri, 12 Sep 2014 11:54:17 -0400
> Trond Myklebust <trondmy@gmail.com> wrote:
>
>> On Fri, Sep 12, 2014 at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
>> > On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
>> >> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
>> >> > Grace period
>> >> > eventually ends, and its record is purged from the DB.
>> >> >
>> >> > Now we have a client that has reclaimed some files but that has no
>> >> > record on stable storage.
>> >> >
>> >> > One possibility is to prematurely expire v4.1+ clients that have not
>> >> > sent a RECLAIM_COMPLETE when the grace period ends.
>> >> >
>> >> > That seems problematic though -- what about clients that just happen to
>> >> > do an EXCHANGE_ID just before the grace period is going to end, and
>> >> > that get expired before they can issue their RECLAIM_COMPLETE. Will
>> >> > that be a problem for them?
>> >>
>> >> In that case a client will send a reclaim, get back a NO_GRACE error,
>> >> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
>> >> and continue normally.  (To the extent it can--signalling affected
>> >> processes or EIOing further attempts to use the unreclaimed state, or
>> >> whatever.)
>> >
>> > The one thing the server *could* do in this sort of case is extend the
>> > grace period by a little--I seem to recall the spec giving some leeway
>> > for this kind of thing.
>>
>>
>> Section 8.4.2.1.
>>
> >> > So for example the server could have a heuristic like: extend the grace
>> > period by another second each time we notice there's been an EXCHANGE_ID
>> > or reclaim in the previous second, up to some maximum.  And I suppose it
>> > could also delay the grace period until someone actually attempts a
>> > non-reclaim open.
>> >
> >> > In isolation a single client slipping in at the end like that sounds like a
>> > freak event, but if there's a ton of state to reclaim perhaps it could
>> > become more likely.
>> >
>> > I don't think that's a priority, we might just want to make sure we know
>> > how to do that in the future.
>> >
>> > But now that I think about it I don't see the existing or proposed
>> > nfsdcltrack stuff tying our hands in any way here.  It just gives the
>> > kernel some extra information, and the kernel still has discretion about
>> > when exactly it wants to end the grace period.
>> >
>>
>> It is even allowed to grant reclaim lock attempts after the grace
>> period has ended _if_ and only if it can guarantee that no conflicting
>> locks were issued.
>>
>> However note that the NFSv4.1 client is not actually allowed to issue
>> non-reclaim lock requests before it has issued a RECLAIM_COMPLETE. I
>> dunno how religiously we stick to that in Linux (I think we do), but
>> the point is that the server can and should rely on the client
>> _always_ sending a RECLAIM_COMPLETE if it is going to establish new
>> locks.
>
> Yeah, I'm pretty sure that bit is enforced. The problem situation that
> I think Bruce was referring to is this:
>
> Server reboots. Client1 reclaims some of its locks (but not all) and
> never sends a RECLAIM_COMPLETE. Grace period ends and then server
> hands out a lock to client2 that was previously held by client1 but
> that didn't get reclaimed.
>
> Server reboots again, prior to the client1 expiring (so its record is
> still in the DB). Now client1 comes back and starts reclaiming again.
> This time it reclaims all of its locks and we have a conflict between
> it and client2.
>
> It's a solvable problem, but I'll need to work through how best to do
> so.
>
> --

That's the first edge condition described in section 8.4.3.
Jeff Layton Sept. 12, 2014, 4:33 p.m. UTC | #12
On Fri, 12 Sep 2014 12:29:01 -0400
Trond Myklebust <trond.myklebust@primarydata.com> wrote:

> On Fri, Sep 12, 2014 at 12:07 PM, Jeff Layton
> <jeff.layton@primarydata.com> wrote:
> > On Fri, 12 Sep 2014 11:54:17 -0400
> > Trond Myklebust <trondmy@gmail.com> wrote:
> >
> >> On Fri, Sep 12, 2014 at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> >> > On Fri, Sep 12, 2014 at 10:36:21AM -0400, J. Bruce Fields wrote:
> >> >> On Fri, Sep 12, 2014 at 10:21:53AM -0400, Jeff Layton wrote:
> >> >> > Grace period
> >> >> > eventually ends, and its record is purged from the DB.
> >> >> >
> >> >> > Now we have a client that has reclaimed some files but that has no
> >> >> > record on stable storage.
> >> >> >
> >> >> > One possibility is to prematurely expire v4.1+ clients that have not
> >> >> > sent a RECLAIM_COMPLETE when the grace period ends.
> >> >> >
> >> >> > That seems problematic though -- what about clients that just happen to
> >> >> > do an EXCHANGE_ID just before the grace period is going to end, and
> >> >> > that get expired before they can issue their RECLAIM_COMPLETE. Will
> >> >> > that be a problem for them?
> >> >>
> >> >> In that case a client will send a reclaim, get back a NO_GRACE error,
> >> >> mark the rest of its state as unrecoverable, send the RECLAIM_COMPLETE,
> >> >> and continue normally.  (To the extent it can--signalling affected
> >> >> processes or EIOing further attempts to use the unreclaimed state, or
> >> >> whatever.)
> >> >
> >> > The one thing the server *could* do in this sort of case is extend the
> >> > grace period by a little--I seem to recall the spec giving some leeway
> >> > for this kind of thing.
> >>
> >>
> >> Section 8.4.2.1.
> >>
> >> > So for example the server could have a heuristic like: extend the grace
> >> > period by another second each time we notice there's been an EXCHANGE_ID
> >> > or reclaim in the previous second, up to some maximum.  And I suppose it
> >> > could also delay the grace period until someone actually attempts a
> >> > non-reclaim open.
> >> >
> >> > In isolation a single client slipping in at the end like that sounds like a
> >> > freak event, but if there's a ton of state to reclaim perhaps it could
> >> > become more likely.
> >> >
> >> > I don't think that's a priority, we might just want to make sure we know
> >> > how to do that in the future.
> >> >
> >> > But now that I think about it I don't see the existing or proposed
> >> > nfsdcltrack stuff tying our hands in any way here.  It just gives the
> >> > kernel some extra information, and the kernel still has discretion about
> >> > when exactly it wants to end the grace period.
> >> >
> >>
> >> It is even allowed to grant reclaim lock attempts after the grace
> >> period has ended _if_ and only if it can guarantee that no conflicting
> >> locks were issued.
> >>
> >> However note that the NFSv4.1 client is not actually allowed to issue
> >> non-reclaim lock requests before it has issued a RECLAIM_COMPLETE. I
> >> dunno how religiously we stick to that in Linux (I think we do), but
> >> the point is that the server can and should rely on the client
> >> _always_ sending a RECLAIM_COMPLETE if it is going to establish new
> >> locks.
> >
> > Yeah, I'm pretty sure that bit is enforced. The problem situation that
> > I think Bruce was referring to is this:
> >
> > Server reboots. Client1 reclaims some of its locks (but not all) and
> > never sends a RECLAIM_COMPLETE. Grace period ends and then server
> > hands out a lock to client2 that was previously held by client1 but
> > that didn't get reclaimed.
> >
> > Server reboots again, prior to the client1 expiring (so its record is
> > still in the DB). Now client1 comes back and starts reclaiming again.
> > This time it reclaims all of its locks and we have a conflict between
> > it and client2.
> >
> > It's a solvable problem, but I'll need to work through how best to do
> > so.
> >
> > --
> 
> That's the first edge condition described in section 8.4.3.
> 

Actually, it's case #2 I think...

Patch

diff --git a/utils/nfsdcltrack/sqlite.c b/utils/nfsdcltrack/sqlite.c
index dde2f7ff8621..e260e81b1722 100644
--- a/utils/nfsdcltrack/sqlite.c
+++ b/utils/nfsdcltrack/sqlite.c
@@ -29,7 +29,10 @@ 
  *
  * clients: an "id" column containing a BLOB with the long-form clientid as
  * 	    sent by the client, a "time" column containing a timestamp (in
- * 	    epoch seconds) of when the record was last updated.
+ * 	    epoch seconds) of when the record was last updated, and a
+ * 	    "reclaim_complete" column containing a timestamp (in epoch seconds)
+ * 	    of when the last "create" operation came in for v4.1+ clients.
+ * 	    v4.0 clients should always have this set to 0.
  */
 
 #ifdef HAVE_CONFIG_H
@@ -50,7 +53,7 @@ 
 
 #include "xlog.h"
 
-#define CLTRACK_SQLITE_LATEST_SCHEMA_VERSION 1
+#define CLTRACK_SQLITE_LATEST_SCHEMA_VERSION 2
 
 /* in milliseconds */
 #define CLTRACK_SQLITE_BUSY_TIMEOUT 10000
@@ -120,6 +123,81 @@  out:
 	return ret;
 }
 
+static int
+sqlite_maindb_update_v1_to_v2(void)
+{
+	int ret, ret2;
+	char *err;
+
+	/* begin transaction */
+	ret = sqlite3_exec(dbh, "BEGIN EXCLUSIVE TRANSACTION;", NULL, NULL,
+				&err);
+	if (ret != SQLITE_OK) {
+		xlog(L_ERROR, "Unable to begin transaction: %s", err);
+		goto rollback;
+	}
+
+	/*
+	 * Check schema version again. This time, under an exclusive
+	 * transaction to guard against racing DB setup attempts
+	 */
+	ret = sqlite_query_schema_version();
+	switch (ret) {
+	case 1:
+		/* Still at v1 -- do conversion */
+		break;
+	case CLTRACK_SQLITE_LATEST_SCHEMA_VERSION:
+		/* Someone else raced in and set it up */
+		ret = 0;
+		goto rollback;
+	default:
+		/* Something went wrong -- fail! */
+		ret = -EINVAL;
+		goto rollback;
+	}
+
+	/* create v2 clients table */
+	ret = sqlite3_exec(dbh, "ALTER TABLE clients ADD COLUMN "
+				"reclaim_complete INTEGER;",
+				NULL, NULL, &err);
+	if (ret != SQLITE_OK) {
+		xlog(L_ERROR, "Unable to update clients table: %s", err);
+		goto rollback;
+	}
+
+	ret = snprintf(buf, sizeof(buf), "UPDATE parameters SET value = %d "
+			"WHERE key = \"version\";",
+			CLTRACK_SQLITE_LATEST_SCHEMA_VERSION);
+	if (ret < 0) {
+		xlog(L_ERROR, "sprintf failed!");
+		goto rollback;
+	} else if ((size_t)ret >= sizeof(buf)) {
+		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
+		ret = -EINVAL;
+		goto rollback;
+	}
+
+	ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
+	if (ret != SQLITE_OK) {
+		xlog(L_ERROR, "Unable to update schema version: %s", err);
+		goto rollback;
+	}
+
+	ret = sqlite3_exec(dbh, "COMMIT TRANSACTION;", NULL, NULL, &err);
+	if (ret != SQLITE_OK) {
+		xlog(L_ERROR, "Unable to commit transaction: %s", err);
+		goto rollback;
+	}
+out:
+	sqlite3_free(err);
+	return ret;
+rollback:
+	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
+	if (ret2 != SQLITE_OK)
+		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
+	goto out;
+}
+
 /*
  * Start an exclusive transaction and recheck the DB schema version. If it's
  * still zero (indicating a new database) then set it up. If that all works,
@@ -127,9 +205,9 @@  out:
  * transaction. On any error, rollback the transaction.
  */
 int
-sqlite_maindb_init_v1(void)
+sqlite_maindb_init_v2(void)
 {
-	int ret;
+	int ret, ret2;
 	char *err = NULL;
 
 	/* Start a transaction */
@@ -169,7 +247,7 @@  sqlite_maindb_init_v1(void)
 
 	/* create the "clients" table */
 	ret = sqlite3_exec(dbh, "CREATE TABLE clients (id BLOB PRIMARY KEY, "
-				"time INTEGER);",
+				"time INTEGER, reclaim_complete INTEGER);",
 				NULL, NULL, &err);
 	if (ret != SQLITE_OK) {
 		xlog(L_ERROR, "Unable to create clients table: %s", err);
@@ -207,7 +285,9 @@  out:
 
 rollback:
 	/* Attempt to rollback the transaction */
-	sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
+	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
+	if (ret2 != SQLITE_OK)
+		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
 	goto out;
 }
 
@@ -255,9 +335,15 @@  sqlite_prepare_dbh(const char *topdir)
 		/* DB is already set up. Do nothing */
 		ret = 0;
 		break;
+	case 1:
+		/* Old DB -- update to new schema */
+		ret = sqlite_maindb_update_v1_to_v2();
+		if (ret)
+			goto out_close;
+		break;
 	case 0:
 		/* Query failed -- try to set up new DB */
-		ret = sqlite_maindb_init_v1();
+		ret = sqlite_maindb_init_v2();
 		if (ret)
 			goto out_close;
 		break;
@@ -289,7 +375,7 @@  sqlite_insert_client(const unsigned char *clname, const size_t namelen)
 	sqlite3_stmt *stmt = NULL;
 
 	ret = sqlite3_prepare_v2(dbh, "INSERT OR REPLACE INTO clients VALUES "
-				      "(?, strftime('%s', 'now'));", -1,
+					"(?, strftime('%s', 'now'), 0);", -1,
 					&stmt, NULL);
 	if (ret != SQLITE_OK) {
 		xlog(L_ERROR, "%s: insert statement prepare failed: %s",