diff mbox

[4/4] NFSD: Implement SEEK

Message ID 1382972247-1108-5-git-send-email-bjschuma@netapp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bryan Schumaker Oct. 28, 2013, 2:57 p.m. UTC
This patch adds in the SEEK operation used by clients doing an llseek on
a file to find either hole or data sections in a file.  I'm faking the
"allocated" status right now, since I haven't quite figured out how to
tell if a range is allocated on disk yet.

This patch is missing correctly determining the "allocated" status of
the HOLE / DATA range.  I expect I'll need to learn all about how fiemap
works before properly setting these values.

Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
---
 fs/nfsd/nfs4proc.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c    | 40 ++++++++++++++++++++++++++++++++++++++--
 fs/nfsd/xdr4.h       | 15 +++++++++++++++
 include/linux/nfs4.h |  2 +-
 4 files changed, 98 insertions(+), 3 deletions(-)

Comments

J. Bruce Fields Oct. 28, 2013, 9:51 p.m. UTC | #1
On Mon, Oct 28, 2013 at 10:57:26AM -0400, Anna Schumaker wrote:
> This patch adds in the SEEK operation used by clients doing an llseek on
> a file to find either hole or data sections in a file.  I'm faking the
> "allocated" status right now, since I haven't quite figured out how to
> tell if a range is allocated on disk yet.
> 
> This patch is missing correctly determining the "allocated" status of
> the HOLE / DATA range.  I expect I'll need to learn all about how fiemap
> works before properly setting these values.
> 
> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
> ---
>  fs/nfsd/nfs4proc.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c    | 40 ++++++++++++++++++++++++++++++++++++++--
>  fs/nfsd/xdr4.h       | 15 +++++++++++++++
>  include/linux/nfs4.h |  2 +-
>  4 files changed, 98 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 3210c6f..bc45ed2 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1064,6 +1064,45 @@ nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	return nfserr_union_notsupp;
>  }
>  
> +static __be32
> +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> +		struct nfsd4_seek *seek)
> +{
> +	struct file *file;
> +	loff_t pos, end_pos;
> +	__be32 status;
> +
> +	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
> +					    &seek->seek_stateid,
> +					    RD_STATE | WR_STATE, &file);

Note nfs4_preprocess_stateid_op requires the caller to hold the state
lock.  Andyou want to either hold it till you're done using "file" or
take a reference on file before dropping it.

--b.

> +	if (status != nfs_ok)
> +		return status;
> +
> +	if (seek->seek_whence == NFS4_CONTENT_DATA) {
> +		pos = vfs_llseek(file, seek->seek_offset, SEEK_DATA);
> +		end_pos = vfs_llseek(file, pos, SEEK_HOLE);
> +		seek->seek_allocated = true;
> +	} else {
> +		pos = vfs_llseek(file, seek->seek_offset, SEEK_HOLE);
> +		end_pos = vfs_llseek(file, pos, SEEK_DATA);
> +		seek->seek_allocated = false;
> +	}
> +
> +	if (pos < 0)
> +		return nfserrno(pos);
> +	else if (end_pos == -ENXIO) {
> +		seek->seek_length = 0;
> +		seek->seek_eof = true;
> +	} else if (end_pos < 0)
> +		return nfserrno(end_pos);
> +	else
> +		seek->seek_length = end_pos - pos;
> +
> +	seek->seek_offset = pos;
> +
> +	return nfs_ok;
> +}
> +
>  /* This routine never returns NFS_OK!  If there are no other errors, it
>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
> @@ -1885,6 +1924,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
>  	},
> +
> +	[OP_SEEK] = {
> +		.op_func = (nfsd4op_func)nfsd4_seek,
> +		.op_name = "OP_SEEK",
> +	},
>  };
>  
>  #ifdef NFSD_DEBUG
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e4e9af..8434740 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1511,6 +1511,22 @@ nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
>  }
>  
>  static __be32
> +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
> +{
> +	DECODE_HEAD;
> +
> +	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
> +	if (status)
> +		return status;
> +
> +	READ_BUF(12);
> +	READ64(seek->seek_offset);
> +	READ32(seek->seek_whence);
> +
> +	DECODE_TAIL;
> +}
> +
> +static __be32
>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
>  {
>  	return nfs_ok;
> @@ -1693,7 +1709,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
> -	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
> +	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
>  };
>  
> @@ -3650,6 +3666,26 @@ nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>  }
>  
>  static __be32
> +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
> +		  struct nfsd4_seek *seek)
> +{
> +	__be32 *p;
> +
> +	if (nfserr)
> +		return nfserr;
> +
> +	RESERVE_SPACE(28);
> +	WRITE32(seek->seek_eof);
> +	WRITE32(seek->seek_whence);
> +	WRITE64(seek->seek_offset);
> +	WRITE64(seek->seek_length);
> +	WRITE32(seek->seek_allocated);
> +	ADJUST_ARGS();
> +
> +	return nfserr;
> +}
> +
> +static __be32
>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
>  {
>  	return nfserr;
> @@ -3730,7 +3766,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
> -	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
> +	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
>  };
>  
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index aaef9ab..ae9debc 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -451,6 +451,20 @@ struct nfsd4_write_plus {
>  	struct nfsd42_write_res	wp_res;
>  };
>  
> +struct nfsd4_seek {
> +	/* request */
> +	stateid_t	seek_stateid;
> +
> +	/* Shared between request and response */
> +	u64		seek_offset;
> +	u32		seek_whence;
> +
> +	/* response */
> +	u32		seek_eof;
> +	u64		seek_length;
> +	u32		seek_allocated;
> +};
> +
>  struct nfsd4_op {
>  	int					opnum;
>  	__be32					status;
> @@ -499,6 +513,7 @@ struct nfsd4_op {
>  
>  		/* NFSv4.2 */
>  		struct nfsd4_write_plus		write_plus;
> +		struct nfsd4_seek		seek;
>  	} u;
>  	struct nfs4_replay *			replay;
>  };
> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> index 55ed991..81d6b09 100644
> --- a/include/linux/nfs4.h
> +++ b/include/linux/nfs4.h
> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
>  Needs to be updated if more operations are defined in future.*/
>  
>  #define FIRST_NFS4_OP	OP_ACCESS
> -#define LAST_NFS4_OP 	OP_WRITE_PLUS
> +#define LAST_NFS4_OP 	OP_SEEK
>  
>  enum nfsstat4 {
>  	NFS4_OK = 0,
> -- 
> 1.8.4.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 29, 2013, 7:35 a.m. UTC | #2
On Mon, Oct 28, 2013 at 10:57:26AM -0400, Anna Schumaker wrote:
> This patch adds in the SEEK operation used by clients doing an llseek on
> a file to find either hole or data sections in a file.  I'm faking the
> "allocated" status right now, since I haven't quite figured out how to
> tell if a range is allocated on disk yet.
> 
> This patch is missing correctly determining the "allocated" status of
> the HOLE / DATA range.  I expect I'll need to learn all about how fiemap
> works before properly setting these values.

What is the definition of allocated in this context?  Specificly how
does it different from meaning of allocated as used by SEEK_DATA?

Going out to fiemap is something we should absolutely avoid.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 12:53 p.m. UTC | #3
On Mon 28 Oct 2013 05:51:26 PM EDT, J. Bruce Fields wrote:
> On Mon, Oct 28, 2013 at 10:57:26AM -0400, Anna Schumaker wrote:
>> This patch adds in the SEEK operation used by clients doing an llseek on
>> a file to find either hole or data sections in a file.  I'm faking the
>> "allocated" status right now, since I haven't quite figured out how to
>> tell if a range is allocated on disk yet.
>>
>> This patch is missing correctly determining the "allocated" status of
>> the HOLE / DATA range.  I expect I'll need to learn all about how fiemap
>> works before properly setting these values.
>>
>> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
>> ---
>>  fs/nfsd/nfs4proc.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs4xdr.c    | 40 ++++++++++++++++++++++++++++++++++++++--
>>  fs/nfsd/xdr4.h       | 15 +++++++++++++++
>>  include/linux/nfs4.h |  2 +-
>>  4 files changed, 98 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 3210c6f..bc45ed2 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -1064,6 +1064,45 @@ nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>  	return nfserr_union_notsupp;
>>  }
>>
>> +static __be32
>> +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>> +		struct nfsd4_seek *seek)
>> +{
>> +	struct file *file;
>> +	loff_t pos, end_pos;
>> +	__be32 status;
>> +
>> +	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
>> +					    &seek->seek_stateid,
>> +					    RD_STATE | WR_STATE, &file);
>
> Note nfs4_preprocess_stateid_op requires the caller to hold the state
> lock.  Andyou want to either hold it till you're done using "file" or
> take a reference on file before dropping it.

That's useful to know.  I'll fix it up here, and anywhere else I call 
nfs4_preprocess_stateid_op()!
>
> --b.
>
>> +	if (status != nfs_ok)
>> +		return status;
>> +
>> +	if (seek->seek_whence == NFS4_CONTENT_DATA) {
>> +		pos = vfs_llseek(file, seek->seek_offset, SEEK_DATA);
>> +		end_pos = vfs_llseek(file, pos, SEEK_HOLE);
>> +		seek->seek_allocated = true;
>> +	} else {
>> +		pos = vfs_llseek(file, seek->seek_offset, SEEK_HOLE);
>> +		end_pos = vfs_llseek(file, pos, SEEK_DATA);
>> +		seek->seek_allocated = false;
>> +	}
>> +
>> +	if (pos < 0)
>> +		return nfserrno(pos);
>> +	else if (end_pos == -ENXIO) {
>> +		seek->seek_length = 0;
>> +		seek->seek_eof = true;
>> +	} else if (end_pos < 0)
>> +		return nfserrno(end_pos);
>> +	else
>> +		seek->seek_length = end_pos - pos;
>> +
>> +	seek->seek_offset = pos;
>> +
>> +	return nfs_ok;
>> +}
>> +
>>  /* This routine never returns NFS_OK!  If there are no other errors, it
>>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
>>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
>> @@ -1885,6 +1924,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
>>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
>>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
>>  	},
>> +
>> +	[OP_SEEK] = {
>> +		.op_func = (nfsd4op_func)nfsd4_seek,
>> +		.op_name = "OP_SEEK",
>> +	},
>>  };
>>
>>  #ifdef NFSD_DEBUG
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 1e4e9af..8434740 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -1511,6 +1511,22 @@ nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
>>  }
>>
>>  static __be32
>> +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
>> +{
>> +	DECODE_HEAD;
>> +
>> +	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
>> +	if (status)
>> +		return status;
>> +
>> +	READ_BUF(12);
>> +	READ64(seek->seek_offset);
>> +	READ32(seek->seek_whence);
>> +
>> +	DECODE_TAIL;
>> +}
>> +
>> +static __be32
>>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
>>  {
>>  	return nfs_ok;
>> @@ -1693,7 +1709,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> -	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> +	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
>>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
>>  };
>>
>> @@ -3650,6 +3666,26 @@ nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>>  }
>>
>>  static __be32
>> +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
>> +		  struct nfsd4_seek *seek)
>> +{
>> +	__be32 *p;
>> +
>> +	if (nfserr)
>> +		return nfserr;
>> +
>> +	RESERVE_SPACE(28);
>> +	WRITE32(seek->seek_eof);
>> +	WRITE32(seek->seek_whence);
>> +	WRITE64(seek->seek_offset);
>> +	WRITE64(seek->seek_length);
>> +	WRITE32(seek->seek_allocated);
>> +	ADJUST_ARGS();
>> +
>> +	return nfserr;
>> +}
>> +
>> +static __be32
>>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
>>  {
>>  	return nfserr;
>> @@ -3730,7 +3766,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>> -	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
>> +	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
>>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
>>  };
>>
>> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
>> index aaef9ab..ae9debc 100644
>> --- a/fs/nfsd/xdr4.h
>> +++ b/fs/nfsd/xdr4.h
>> @@ -451,6 +451,20 @@ struct nfsd4_write_plus {
>>  	struct nfsd42_write_res	wp_res;
>>  };
>>
>> +struct nfsd4_seek {
>> +	/* request */
>> +	stateid_t	seek_stateid;
>> +
>> +	/* Shared between request and response */
>> +	u64		seek_offset;
>> +	u32		seek_whence;
>> +
>> +	/* response */
>> +	u32		seek_eof;
>> +	u64		seek_length;
>> +	u32		seek_allocated;
>> +};
>> +
>>  struct nfsd4_op {
>>  	int					opnum;
>>  	__be32					status;
>> @@ -499,6 +513,7 @@ struct nfsd4_op {
>>
>>  		/* NFSv4.2 */
>>  		struct nfsd4_write_plus		write_plus;
>> +		struct nfsd4_seek		seek;
>>  	} u;
>>  	struct nfs4_replay *			replay;
>>  };
>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>> index 55ed991..81d6b09 100644
>> --- a/include/linux/nfs4.h
>> +++ b/include/linux/nfs4.h
>> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
>>  Needs to be updated if more operations are defined in future.*/
>>
>>  #define FIRST_NFS4_OP	OP_ACCESS
>> -#define LAST_NFS4_OP 	OP_WRITE_PLUS
>> +#define LAST_NFS4_OP 	OP_SEEK
>>
>>  enum nfsstat4 {
>>  	NFS4_OK = 0,
>> --
>> 1.8.4.1
>>


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 1 p.m. UTC | #4
On Tue 29 Oct 2013 03:35:58 AM EDT, Christoph Hellwig wrote:
> On Mon, Oct 28, 2013 at 10:57:26AM -0400, Anna Schumaker wrote:
>> This patch adds in the SEEK operation used by clients doing an llseek on
>> a file to find either hole or data sections in a file.  I'm faking the
>> "allocated" status right now, since I haven't quite figured out how to
>> tell if a range is allocated on disk yet.
>>
>> This patch is missing correctly determining the "allocated" status of
>> the HOLE / DATA range.  I expect I'll need to learn all about how fiemap
>> works before properly setting these values.
>
> What is the definition of allocated in this context?  Specificly how
> does it different from meaning of allocated as used by SEEK_DATA?

 From what I can tell, I think the allocated flag will just tell the 
clients if all the blocks exist on disk or not.  Is there a way to have 
a hole with allocated blocks?  Or maybe it's supposed to represent 
partially allocated blocks?  I checked the draft, and it doesn't 
actually say what they expect allocated to represent...

>
> Going out to fiemap is something we should absolutely avoid.
>


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 29, 2013, 1:07 p.m. UTC | #5
On Tue, Oct 29, 2013 at 09:00:48AM -0400, Anna Schumaker wrote:
>  From what I can tell, I think the allocated flag will just tell the 
> clients if all the blocks exist on disk or not.  Is there a way to have 
> a hole with allocated blocks?  Or maybe it's supposed to represent 
> partially allocated blocks?  I checked the draft, and it doesn't 
> actually say what they expect allocated to represent...

You can have preallocated space, which behaves like a hole in that
reads return zeroes.  As far as SEEK_DATA/SEEK_HOLE is concerned we
tend to treat them as holes when we can as that's what the users expect.

Note that at least the pnfs block rfc has a special state representing
this kind of preallocated space.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Oct. 29, 2013, 1:30 p.m. UTC | #6
On Tue, Oct 29, 2013 at 06:07:21AM -0700, Christoph Hellwig wrote:
> On Tue, Oct 29, 2013 at 09:00:48AM -0400, Anna Schumaker wrote:
> >  From what I can tell, I think the allocated flag will just tell the 
> > clients if all the blocks exist on disk or not.  Is there a way to have 
> > a hole with allocated blocks?  Or maybe it's supposed to represent 
> > partially allocated blocks?  I checked the draft, and it doesn't 
> > actually say what they expect allocated to represent...
> 
> You can have preallocated space, which behaves like a hole in that
> reads return zeroes.  As far as SEEK_DATA/SEEK_HOLE is concerned we
> tend to treat them as holes when we can as that's what the users expect.
> 
> Note that at least the pnfs block rfc has a special state representing
> this kind of preallocated space.

http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-20#section-5.2

E.g. "Hole:  A byte range within a Sparse file that contains regions of
all zeroes.  For block-based file systems, this could also be an
unallocated region of the file."

So it sounds like a hole can either be allocated or not, which seems to
agree with what you expect?

I haven't read these parts of the spec yet at all....

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 31, 2013, 4:04 p.m. UTC | #7
Btw, I just noticed that current nfs mainline fails xfstests 286 which
test SEEK_DATA/SEEK_HOLE.   That sounds like a useful test case for your
patches.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 31, 2013, 4:07 p.m. UTC | #8
On 10/31/2013 12:04 PM, Christoph Hellwig wrote:
> Btw, I just noticed that current nfs mainline fails xfstests 286 which
> test SEEK_DATA/SEEK_HOLE.   That sounds like a useful test case for your
> patches.
> 

Yeah, I've been using it to test this.  I'm having trouble getting the last few parts of both test 11 and 12 to pass, which is better than what's happening now!

Anna
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 31, 2013, 4:17 p.m. UTC | #9
On 10/31/2013 12:04 PM, Christoph Hellwig wrote:
> Btw, I just noticed that current nfs mainline fails xfstests 286 which
> test SEEK_DATA/SEEK_HOLE.   That sounds like a useful test case for your
> patches.
> 

I read too quickly.  I've been using 285 to test against, but I'll see what 286 gets me!  Thanks for the tip :)
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Oct. 31, 2013, 5:13 p.m. UTC | #10
On Thu, Oct 31, 2013 at 12:17:30PM -0400, Anna Schumaker wrote:
> On 10/31/2013 12:04 PM, Christoph Hellwig wrote:
> > Btw, I just noticed that current nfs mainline fails xfstests 286 which
> > test SEEK_DATA/SEEK_HOLE.   That sounds like a useful test case for your
> > patches.
> > 
> 
> I read too quickly.  I've been using 285 to test against, but I'll see what 286 gets me!  Thanks for the tip :)

I actually meant 285.  286 tests SEEK_HOLE as well but passes already
with the dumb default behaviour.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 1:48 p.m. UTC | #11
On Tue, Oct 29, 2013 at 09:30:06AM -0400, J. Bruce Fields wrote:
> I haven't read these parts of the spec yet at all....

Btw, is there a way do to something like a git blame for the spec?  I'd
really love to confront people personally with some of the odd bits they
came up with.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Nov. 2, 2013, 2:37 p.m. UTC | #12
On Sat, Nov 02, 2013 at 06:48:37AM -0700, Christoph Hellwig wrote:
> On Tue, Oct 29, 2013 at 09:30:06AM -0400, J. Bruce Fields wrote:
> > I haven't read these parts of the spec yet at all....
> 
> Btw, is there a way do to something like a git blame for the spec?  I'd
> really love to confront people personally with some of the odd bits they
> came up with.

It *is* in git, so you can literally run git blame:

	https://github.com/loghyr/NFSv4.2

But it'll probably credit almost everything to Tom and he's typically
just the messenger....

Just direct gripes to nfsv4@ietf.org.  They also take patches.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 2:41 p.m. UTC | #13
On Sat, Nov 02, 2013 at 10:37:29AM -0400, J. Bruce Fields wrote:
> On Sat, Nov 02, 2013 at 06:48:37AM -0700, Christoph Hellwig wrote:
> > On Tue, Oct 29, 2013 at 09:30:06AM -0400, J. Bruce Fields wrote:
> > > I haven't read these parts of the spec yet at all....
> > 
> > Btw, is there a way do to something like a git blame for the spec?  I'd
> > really love to confront people personally with some of the odd bits they
> > came up with.
> 
> It *is* in git, so you can literally run git blame:
> 
> 	https://github.com/loghyr/NFSv4.2
> 
> But it'll probably credit almost everything to Tom and he's typically
> just the messenger....

Indeed, a quick try attributes everything to him.  No good way to figure
out who is to blame for the whole WRITE PLUS idiocy.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Nov. 4, 2013, 4:46 p.m. UTC | #14
On Sat, Nov 02, 2013 at 07:41:07AM -0700, Christoph Hellwig wrote:
> On Sat, Nov 02, 2013 at 10:37:29AM -0400, J. Bruce Fields wrote:
> > On Sat, Nov 02, 2013 at 06:48:37AM -0700, Christoph Hellwig wrote:
> > > On Tue, Oct 29, 2013 at 09:30:06AM -0400, J. Bruce Fields wrote:
> > > > I haven't read these parts of the spec yet at all....
> > > 
> > > Btw, is there a way do to something like a git blame for the spec?  I'd
> > > really love to confront people personally with some of the odd bits they
> > > came up with.
> > 
> > It *is* in git, so you can literally run git blame:
> > 
> > 	https://github.com/loghyr/NFSv4.2
> > 
> > But it'll probably credit almost everything to Tom and he's typically
> > just the messenger....
> 
> Indeed, a quick try attributes everything to him.  No good way to figure
> out who is to blame for the whole WRITE PLUS idiocy.

Imagine it was me, if it helps.

The important thing is that we understand what should be fixed.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 5:05 p.m. UTC | #15
On Mon, Nov 04, 2013 at 11:46:58AM -0500, J. Bruce Fields wrote:
> Imagine it was me, if it helps.
> 
> The important thing is that we understand what should be fixed.

The important part is how someone got the idea for the pattern writing
and the weird ADBs from and why they believe they should overload a
single command for all of them.  And apparently reqular data writes as
well, even if I haven't found that part of the spec yet.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Nov. 4, 2013, 5:22 p.m. UTC | #16
On Nov 4, 2013, at 9:05 AM, Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Nov 04, 2013 at 11:46:58AM -0500, J. Bruce Fields wrote:
>> Imagine it was me, if it helps.
>> 
>> The important thing is that we understand what should be fixed.
> 
> The important part is how someone got the idea for the pattern writing
> and the weird ADBs from

The original proposal is

  http://tools.ietf.org/html/draft-eisler-nfsv4-enterprise-apps-01

and has been somewhat modified in its expression in NFSv4.2.

> and why they believe they should overload a
> single command for all of them.

I believe that file initialization is closer in semantics to COPY_OFFLOAD than it is to WRITE, and I find it awkward to use the WRITE_PLUS operation for initialization.  But I could never make a strong technical argument why they should not be joined, other than "Clutter!".

> And apparently reqular data writes as
> well, even if I haven't found that part of the spec yet.

  http://www.ietf.org/id/draft-ietf-nfsv4-minorversion2-20.txt

is the latest version I can find.

The usual process is to explain what needs to be fixed (ie, make a problem statement) as Bruce said.  That is often accompanied by an alternate design proposal that addresses your issues.

It is appropriate to address your comments to the group, and not to a single person.  As in most online communities, ad hominem and confrontation are not appreciated.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Haynes Nov. 4, 2013, 9:56 p.m. UTC | #17
On Nov 4, 2013, at 8:46 AM, J. Bruce Fields <bfields@fieldses.org> wrote:

> On Sat, Nov 02, 2013 at 07:41:07AM -0700, Christoph Hellwig wrote:
>> On Sat, Nov 02, 2013 at 10:37:29AM -0400, J. Bruce Fields wrote:
>>> On Sat, Nov 02, 2013 at 06:48:37AM -0700, Christoph Hellwig wrote:
>>>> On Tue, Oct 29, 2013 at 09:30:06AM -0400, J. Bruce Fields wrote:
>>>>> I haven't read these parts of the spec yet at all....
>>>> 
>>>> Btw, is there a way do to something like a git blame for the spec?  I'd
>>>> really love to confront people personally with some of the odd bits they
>>>> came up with.
>>> 
>>> It *is* in git, so you can literally run git blame:
>>> 
>>> 	https://github.com/loghyr/NFSv4.2
>>> 
>>> But it'll probably credit almost everything to Tom and he's typically
>>> just the messenger....
>> 
>> Indeed, a quick try attributes everything to him.  No good way to figure
>> out who is to blame for the whole WRITE PLUS idiocy.
> 
> Imagine it was me, if it helps.
> 

Nah, most likely me, no imagination needed.

We were also looking at bringing larger WRITEs during 4.3 and I wanted
to have WRITE_PLUS already in place for that.


> The important thing is that we understand what should be fixed.
> 
> --b.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 5, 2013, 1:03 a.m. UTC | #18
On Mon, Nov 04, 2013 at 01:56:39PM -0800, Thomas Haynes wrote:
> Nah, most likely me, no imagination needed.
> 
> We were also looking at bringing larger WRITEs during 4.3 and I wanted
> to have WRITE_PLUS already in place for that.

What exactly does a WRITE PLUS with various non-related overloads in 4.2
buy you for introducing large writes in 4.3?
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haynes, Tom Nov. 5, 2013, 2:07 a.m. UTC | #19
On Nov 4, 2013, at 5:03 PM, Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Nov 04, 2013 at 01:56:39PM -0800, Thomas Haynes wrote:
>> Nah, most likely me, no imagination needed.
>> 
>> We were also looking at bringing larger WRITEs during 4.3 and I wanted
>> to have WRITE_PLUS already in place for that.
> 
> What exactly does a WRITE PLUS with various non-related overloads in 4.2
> buy you for introducing large writes in 4.3?

In 4.3, READ_PLUS would be extended for large reads. We would at that
time need a WRITE_PLUS to be extended for large writes.

The relationship falls from having WRITE_PLUS handle all of the variants
handled by READ_PLUS. It needs to accept all of them because it
does not know what will be returned.

We did not want many new operators and also wanted the operators to
be extensible. With this approach, you can define a new arm of the
discriminated union, not have to implement it, and not burn an
operator.

Some of the history is captured here:

http://www.ietf.org/mail-archive/web/nfsv4/current/msg11235.html
http://www.ietf.org/mail-archive/web/nfsv4/current/msg11470.html
http://www.ietf.org/proceedings/84/slides/slides-84-nfsv4-1.pdf (slide 6)

It doesn't capture the intent of NFS4ERR_UNION_NOTSUPP in
this decision.

11.1.1.1.  NFS4ERR_UNION_NOTSUPP (Error Code 10090)

   One of the arguments to the operation is a discriminated union and
   while the server supports the given operation, it does not support
   the selected arm of the discriminated union.  For an example, see
   READ_PLUS (Section 14.10).
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 5, 2013, 8:23 a.m. UTC | #20
On Tue, Nov 05, 2013 at 02:07:15AM +0000, Haynes, Tom wrote:
> We did not want many new operators and also wanted the operators to
> be extensible. With this approach, you can define a new arm of the
> discriminated union, not have to implement it, and not burn an
> operator.

That seems very much like a non-argument.  If saving operators was a
good argument the NFS operations should be

MULTIPLEX1 with many sub opcodes, followed by MULTIPLEX2 once it fills
up.

> Some of the history is captured here:
> 
> http://www.ietf.org/mail-archive/web/nfsv4/current/msg11235.html

That mail seems to draw the wrong conclusion that a hole punching
or a preallocation are equivalent to a server side copy from /dev/zero.

Treating a pattern write as a server side copy is fine and I'd fully
support that.  Hole punching and preallocation on the other hand are
primarily metadata operations that reserve or free space.  They only
happen to zero out the range as zero is the most convenient well known
pattern to avoid stale data exposure.

> http://www.ietf.org/mail-archive/web/nfsv4/current/msg11470.html

I think Chuck's reply summarizes very well why a pattern initialization
should not be mixed with an actual data write.  

> http://www.ietf.org/proceedings/84/slides/slides-84-nfsv4-1.pdf (slide 6)

That side seems to inadvertently sum up a lot of what's wrong with merging
hole punching and preallocations into some form of super write:

 - COMMIT doesn't really apply to pure metadata operations like a hole
   punch and preallocation, so fitting it into a WRITE that expects
   COMMIT causes all kinds of problems (as we saw in the thread about
   Annas implementation).
 - requiring the server to be able to handle offloads for these
   operations does not make any sense, because they are again very
   quick metadata operation, and not long running operation like
   a pattern initialization on the server.

Note that the arbitrary pattern initialization over multiple blocks is
a very different operation from a space allocation even if the latter
happens to also zero the range as a side effect.


> 
> It doesn't capture the intent of NFS4ERR_UNION_NOTSUPP in
> this decision.
> 
> 11.1.1.1.  NFS4ERR_UNION_NOTSUPP (Error Code 10090)
> 
>    One of the arguments to the operation is a discriminated union and
>    while the server supports the given operation, it does not support
>    the selected arm of the discriminated union.  For an example, see
>    READ_PLUS (Section 14.10).

Btw, there is an odd use of this error in 14.7.3.:

 "WRITE_PLUS has to support all of the errors which are returned by
  WRITE plus NFS4ERR_UNION_NOTSUPP.  If the client asks for a hole and
  the server does not support that arm of the discriminated union, but
  does support one or more additional arms, it can signal to the
  client that it supports the operation, but not the arm with
  NFS4ERR_UNION_NOTSUPP."

This does not specicly writes but appears to assume hole punching
is the only optional arm.  On the other hand to me it appears the only
interesting arm, with the data arm buying nothing over WRITE in 4.2 and
thus being entirely superflous, and ADHs being a complicated map to Unix
filesystems on the backend.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 5, 2013, 12:33 p.m. UTC | #21
On Mon, Nov 04, 2013 at 09:22:11AM -0800, Chuck Lever wrote:
> The original proposal is
> 
>   http://tools.ietf.org/html/draft-eisler-nfsv4-enterprise-apps-01
> 
> and has been somewhat modified in its expression in NFSv4.2.

The INITIALIZE operation in that proposal look like a sane translation of
the SCSI WRITE SAME commands to NFS, but I have no idea
how that eveloved into the monster of ADHs in the NFS4.2 draft.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 3210c6f..bc45ed2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1064,6 +1064,45 @@  nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return nfserr_union_notsupp;
 }
 
+static __be32
+nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_seek *seek)
+{
+	struct file *file;
+	loff_t pos, end_pos;
+	__be32 status;
+
+	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+					    &seek->seek_stateid,
+					    RD_STATE | WR_STATE, &file);
+	if (status != nfs_ok)
+		return status;
+
+	if (seek->seek_whence == NFS4_CONTENT_DATA) {
+		pos = vfs_llseek(file, seek->seek_offset, SEEK_DATA);
+		end_pos = vfs_llseek(file, pos, SEEK_HOLE);
+		seek->seek_allocated = true;
+	} else {
+		pos = vfs_llseek(file, seek->seek_offset, SEEK_HOLE);
+		end_pos = vfs_llseek(file, pos, SEEK_DATA);
+		seek->seek_allocated = false;
+	}
+
+	if (pos < 0)
+		return nfserrno(pos);
+	else if (end_pos == -ENXIO) {
+		seek->seek_length = 0;
+		seek->seek_eof = true;
+	} else if (end_pos < 0)
+		return nfserrno(end_pos);
+	else
+		seek->seek_length = end_pos - pos;
+
+	seek->seek_offset = pos;
+
+	return nfs_ok;
+}
+
 /* This routine never returns NFS_OK!  If there are no other errors, it
  * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
  * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
@@ -1885,6 +1924,11 @@  static struct nfsd4_operation nfsd4_ops[] = {
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
 		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
 	},
+
+	[OP_SEEK] = {
+		.op_func = (nfsd4op_func)nfsd4_seek,
+		.op_name = "OP_SEEK",
+	},
 };
 
 #ifdef NFSD_DEBUG
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e4e9af..8434740 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1511,6 +1511,22 @@  nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
 }
 
 static __be32
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(12);
+	READ64(seek->seek_offset);
+	READ32(seek->seek_whence);
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
 	return nfs_ok;
@@ -1693,7 +1709,7 @@  static nfsd4_dec nfsd42_dec_ops[] = {
 	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
 	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
 	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
 };
 
@@ -3650,6 +3666,26 @@  nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
 }
 
 static __be32
+nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_seek *seek)
+{
+	__be32 *p;
+
+	if (nfserr)
+		return nfserr;
+
+	RESERVE_SPACE(28);
+	WRITE32(seek->seek_eof);
+	WRITE32(seek->seek_whence);
+	WRITE64(seek->seek_offset);
+	WRITE64(seek->seek_length);
+	WRITE32(seek->seek_allocated);
+	ADJUST_ARGS();
+
+	return nfserr;
+}
+
+static __be32
 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 {
 	return nfserr;
@@ -3730,7 +3766,7 @@  static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
 	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
 	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
 };
 
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index aaef9ab..ae9debc 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -451,6 +451,20 @@  struct nfsd4_write_plus {
 	struct nfsd42_write_res	wp_res;
 };
 
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+
+	/* Shared between request and response */
+	u64		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	u64		seek_length;
+	u32		seek_allocated;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -499,6 +513,7 @@  struct nfsd4_op {
 
 		/* NFSv4.2 */
 		struct nfsd4_write_plus		write_plus;
+		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
 };
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 55ed991..81d6b09 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -128,7 +128,7 @@  enum nfs_opnum4 {
 Needs to be updated if more operations are defined in future.*/
 
 #define FIRST_NFS4_OP	OP_ACCESS
-#define LAST_NFS4_OP 	OP_WRITE_PLUS
+#define LAST_NFS4_OP 	OP_SEEK
 
 enum nfsstat4 {
 	NFS4_OK = 0,