diff mbox

[3/4] NFSD: Add WRITE_PLUS support for hole punches

Message ID 1382972247-1108-4-git-send-email-bjschuma@netapp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bryan Schumaker Oct. 28, 2013, 2:57 p.m. UTC
This patch only implements DATA_CONTENT_HOLE support, but I tried to
write it so other arms of the discriminated union can be added easily
later.

This patch is missing support for decoding multiple requests on the same
file.  The way it's written now only the last range provided by the
client will be written to.

Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
---
 fs/nfsd/nfs4proc.c   | 45 ++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/nfsd/vfs.c        | 14 ++++++++++++
 fs/nfsd/vfs.h        |  1 +
 fs/nfsd/xdr4.h       | 24 ++++++++++++++++++++
 fs/open.c            |  1 +
 include/linux/nfs4.h |  8 ++++++-
 7 files changed, 152 insertions(+), 3 deletions(-)

Comments

J. Bruce Fields Oct. 28, 2013, 9:40 p.m. UTC | #1
On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
> This patch only implements DATA_CONTENT_HOLE support, but I tried to
> write it so other arms of the discriminated union can be added easily
> later.

And there's an NFS4ERR_UNION_NOTSUPP to handle that case, OK.  (Though
the spec language is a little ambiguous:

	If the client asks for a hole and the server does not support
	that arm of the discriminated union, but does support one or
	more additional arms, it can signal to the client that it
	supports the operation, but not the arm with
	NFS4ERR_UNION_NOTSUPP.

So it's unclear whether we can return this in the case the client is
*not* asking for a hole.  Are we sure the NFS4_CONTENT_DATA case is
optional?  The language in 5.3 ("Instead a client should utlize
READ_PLUS and WRITE_PLUS") also suggests the CONTENT_DATA case might be
meant to be mandatory.)

> This patch is missing support for decoding multiple requests on the same
> file.  The way it's written now only the last range provided by the
> client will be written to.

But that doesn't seem to be a limitation anticipated by the spec, so I
guess we need to fix that.

(Why is there an array, anyway?  Wouldn't it work just as well to send
multiple RPC's, or a single RPC with multiple WRITE_PLUS ops?)

> 
> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
> ---
>  fs/nfsd/nfs4proc.c   | 45 ++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/nfsd/vfs.c        | 14 ++++++++++++
>  fs/nfsd/vfs.h        |  1 +
>  fs/nfsd/xdr4.h       | 24 ++++++++++++++++++++
>  fs/open.c            |  1 +
>  include/linux/nfs4.h |  8 ++++++-
>  7 files changed, 152 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 419572f..3210c6f 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	return status;
>  }
>  
> +static __be32
> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
> +		      struct net *net)
> +{
> +	__be32 status;
> +
> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
> +				writeplus->wp_offset, writeplus->wp_length);
> +	if (status == nfs_ok) {
> +		writeplus->wp_res.wr_stid = NULL;
> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;

Do we need to sync?

> +		gen_boot_verifier(&writeplus->wp_res.wr_verifier, net);
> +	}
> +
> +	return status;

Nit, but I like the usual idiom better in most cases:

	if (status)
		return status;
	normal case here...
	return 0;

--b.

> +}
> +
> +static __be32
> +nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> +		struct nfsd4_write_plus *writeplus)
> +{
> +	struct file *file;
> +	__be32 status;
> +	struct net *net = SVC_NET(rqstp);
> +
> +	status = nfs4_preprocess_stateid_op(net, cstate, &writeplus->wp_stateid,
> +					    WR_STATE, &file);
> +	if (status)
> +		return status;
> +
> +	if (writeplus->wp_data_content == NFS4_CONTENT_HOLE)
> +		return nfsd4_write_plus_hole(file, writeplus, net);
> +	return nfserr_union_notsupp;
> +}
> +
>  /* This routine never returns NFS_OK!  If there are no other errors, it
>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
> @@ -1840,6 +1876,15 @@ static struct nfsd4_operation nfsd4_ops[] = {
>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
>  	},
> +
> +	/* NFSv4.2 operations */
> +	[OP_WRITE_PLUS] = {
> +		.op_func = (nfsd4op_func)nfsd4_write_plus,
> +		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
> +		.op_name = "OP_WRITE_PLUS",
> +		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
> +		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
> +	},
>  };
>  
>  #ifdef NFSD_DEBUG
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 60f5a1f..1e4e9af 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1485,6 +1485,32 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
>  }
>  
>  static __be32
> +nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
> +			struct nfsd4_write_plus *writeplus)
> +{
> +	DECODE_HEAD;
> +	unsigned int i;
> +
> +	status = nfsd4_decode_stateid(argp, &writeplus->wp_stateid);
> +	if (status)
> +		return status;
> +
> +	READ_BUF(8);
> +	READ32(writeplus->wp_stable_how);
> +	READ32(writeplus->wp_data_length);
> +
> +	for (i = 0; i < writeplus->wp_data_length; i++) {
> +		READ_BUF(24);
> +		READ32(writeplus->wp_data_content);
> +		READ64(writeplus->wp_offset);
> +		READ64(writeplus->wp_length);
> +		READ32(writeplus->wp_allocated);
> +	}
> +
> +	DECODE_TAIL;
> +}
> +
> +static __be32
>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
>  {
>  	return nfs_ok;
> @@ -1665,7 +1691,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
>  	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
> -	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
> +	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
> @@ -3591,6 +3617,38 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
>  	return nfserr;
>  }
>  
> +static void
> +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
> +{
> +	__be32 *p;
> +
> +	RESERVE_SPACE(4);
> +
> +	if (write->wr_stid == NULL) {
> +		WRITE32(0);
> +		ADJUST_ARGS();
> +	} else {
> +		WRITE32(1);
> +		ADJUST_ARGS();
> +		nfsd4_encode_stateid(resp, &write->wr_stid->sc_stateid);
> +	}
> +
> +	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
> +	WRITE64(write->wr_bytes_written);
> +	WRITE32(write->wr_stable_how);
> +	WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
> +	ADJUST_ARGS();
> +}
> +
> +static __be32
> +nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> +		        struct nfsd4_write_plus *writeplus)
> +{
> +	if (!nfserr)
> +		nfsd42_encode_write_res(resp, &writeplus->wp_res);
> +	return nfserr;
> +}
> +
>  static __be32
>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
>  {
> @@ -3670,7 +3728,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
>  	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
> -	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
> +	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index c827acb..7fec087 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -16,6 +16,7 @@
>  #include <linux/fs.h>
>  #include <linux/file.h>
>  #include <linux/splice.h>
> +#include <linux/falloc.h>
>  #include <linux/fcntl.h>
>  #include <linux/namei.h>
>  #include <linux/delay.h>
> @@ -649,6 +650,19 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  }
>  #endif
>  
> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
> +{
> +	int error, mode = 0;
> +
> +	if (allocated == false)
> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> +
> +	error = do_fallocate(file, mode, offset, len);
> +	if (error == 0)
> +		error = vfs_fsync_range(file, offset, offset + len, 0);
> +	return nfserrno(error);
> +}
> +
>  #endif /* defined(CONFIG_NFSD_V4) */
>  
>  #ifdef CONFIG_NFSD_V3
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index a4be2e3..187eb95 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -57,6 +57,7 @@ __be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
>  int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
>  __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
>  		    struct xdr_netobj *);
> +__be32		nfsd4_vfs_fallocate(struct file *, bool, loff_t, loff_t);
>  #endif /* CONFIG_NFSD_V4 */
>  __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
>  				char *name, int len, struct iattr *attrs,
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index b3ed644..aaef9ab 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -430,6 +430,27 @@ struct nfsd4_reclaim_complete {
>  	u32 rca_one_fs;
>  };
>  
> +struct nfsd42_write_res {
> +	struct nfs4_stid	*wr_stid;
> +	u64			wr_bytes_written;
> +	u32			wr_stable_how;
> +	nfs4_verifier		wr_verifier;
> +};
> +
> +struct nfsd4_write_plus {
> +	/* request */
> +	stateid_t	wp_stateid;
> +	u32		wp_stable_how;
> +	u32		wp_data_length;
> +	u32		wp_data_content;
> +	u64		wp_offset;
> +	u64		wp_length;
> +	u32		wp_allocated;
> +
> +	/* response */
> +	struct nfsd42_write_res	wp_res;
> +};
> +
>  struct nfsd4_op {
>  	int					opnum;
>  	__be32					status;
> @@ -475,6 +496,9 @@ struct nfsd4_op {
>  		struct nfsd4_reclaim_complete	reclaim_complete;
>  		struct nfsd4_test_stateid	test_stateid;
>  		struct nfsd4_free_stateid	free_stateid;
> +
> +		/* NFSv4.2 */
> +		struct nfsd4_write_plus		write_plus;
>  	} u;
>  	struct nfs4_replay *			replay;
>  };
> diff --git a/fs/open.c b/fs/open.c
> index d420331..09db2d5 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -278,6 +278,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>  	sb_end_write(inode->i_sb);
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(do_fallocate);
>  
>  SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
>  {
> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> index 2bc5217..55ed991 100644
> --- a/include/linux/nfs4.h
> +++ b/include/linux/nfs4.h
> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
>  Needs to be updated if more operations are defined in future.*/
>  
>  #define FIRST_NFS4_OP	OP_ACCESS
> -#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
> +#define LAST_NFS4_OP 	OP_WRITE_PLUS
>  
>  enum nfsstat4 {
>  	NFS4_OK = 0,
> @@ -552,4 +552,10 @@ struct nfs4_deviceid {
>  	char data[NFS4_DEVICEID4_SIZE];
>  };
>  
> +enum data_content4 {
> +	NFS4_CONTENT_DATA		= 0,
> +	NFS4_CONTENT_APP_DATA_HOLE	= 1,
> +	NFS4_CONTENT_HOLE		= 2,
> +};
> +
>  #endif
> -- 
> 1.8.4.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 12:49 p.m. UTC | #2
On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
>> This patch only implements DATA_CONTENT_HOLE support, but I tried to
>> write it so other arms of the discriminated union can be added easily
>> later.
>
> And there's an NFS4ERR_UNION_NOTSUPP to handle that case, OK.  (Though
> the spec language is a little ambiguous:
>
> 	If the client asks for a hole and the server does not support
> 	that arm of the discriminated union, but does support one or
> 	more additional arms, it can signal to the client that it
> 	supports the operation, but not the arm with
> 	NFS4ERR_UNION_NOTSUPP.
>
> So it's unclear whether we can return this in the case the client is
> *not* asking for a hole.  Are we sure the NFS4_CONTENT_DATA case is
> optional?  The language in 5.3 ("Instead a client should utlize
> READ_PLUS and WRITE_PLUS") also suggests the CONTENT_DATA case might be
> meant to be mandatory.)

Fair enough.  I wasn't planning on implementing the NFS4_CONTENT_DATA 
arm on the client right now, but I might be able to hack something 
together to test this on the server!

>
>> This patch is missing support for decoding multiple requests on the same
>> file.  The way it's written now only the last range provided by the
>> client will be written to.
>
> But that doesn't seem to be a limitation anticipated by the spec, so I
> guess we need to fix that.
>
> (Why is there an array, anyway?  Wouldn't it work just as well to send
> multiple RPC's, or a single RPC with multiple WRITE_PLUS ops?)

Yeah, I'll fix that up.  The Linux client will send requests one at a 
time, so that's what I tested against.

>
>>
>> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
>> ---
>>  fs/nfsd/nfs4proc.c   | 45 ++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs4xdr.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  fs/nfsd/vfs.c        | 14 ++++++++++++
>>  fs/nfsd/vfs.h        |  1 +
>>  fs/nfsd/xdr4.h       | 24 ++++++++++++++++++++
>>  fs/open.c            |  1 +
>>  include/linux/nfs4.h |  8 ++++++-
>>  7 files changed, 152 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 419572f..3210c6f 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>  	return status;
>>  }
>>
>> +static __be32
>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
>> +		      struct net *net)
>> +{
>> +	__be32 status;
>> +
>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
>> +				writeplus->wp_offset, writeplus->wp_length);
>> +	if (status == nfs_ok) {
>> +		writeplus->wp_res.wr_stid = NULL;
>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
>
> Do we need to sync?
>
>> +		gen_boot_verifier(&writeplus->wp_res.wr_verifier, net);
>> +	}
>> +
>> +	return status;
>
> Nit, but I like the usual idiom better in most cases:
>
> 	if (status)
> 		return status;
> 	normal case here...
> 	return 0;
>
> --b.
>
>> +}
>> +
>> +static __be32
>> +nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>> +		struct nfsd4_write_plus *writeplus)
>> +{
>> +	struct file *file;
>> +	__be32 status;
>> +	struct net *net = SVC_NET(rqstp);
>> +
>> +	status = nfs4_preprocess_stateid_op(net, cstate, &writeplus->wp_stateid,
>> +					    WR_STATE, &file);
>> +	if (status)
>> +		return status;
>> +
>> +	if (writeplus->wp_data_content == NFS4_CONTENT_HOLE)
>> +		return nfsd4_write_plus_hole(file, writeplus, net);
>> +	return nfserr_union_notsupp;
>> +}
>> +
>>  /* This routine never returns NFS_OK!  If there are no other errors, it
>>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
>>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
>> @@ -1840,6 +1876,15 @@ static struct nfsd4_operation nfsd4_ops[] = {
>>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
>>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
>>  	},
>> +
>> +	/* NFSv4.2 operations */
>> +	[OP_WRITE_PLUS] = {
>> +		.op_func = (nfsd4op_func)nfsd4_write_plus,
>> +		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
>> +		.op_name = "OP_WRITE_PLUS",
>> +		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
>> +		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
>> +	},
>>  };
>>
>>  #ifdef NFSD_DEBUG
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 60f5a1f..1e4e9af 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -1485,6 +1485,32 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
>>  }
>>
>>  static __be32
>> +nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
>> +			struct nfsd4_write_plus *writeplus)
>> +{
>> +	DECODE_HEAD;
>> +	unsigned int i;
>> +
>> +	status = nfsd4_decode_stateid(argp, &writeplus->wp_stateid);
>> +	if (status)
>> +		return status;
>> +
>> +	READ_BUF(8);
>> +	READ32(writeplus->wp_stable_how);
>> +	READ32(writeplus->wp_data_length);
>> +
>> +	for (i = 0; i < writeplus->wp_data_length; i++) {
>> +		READ_BUF(24);
>> +		READ32(writeplus->wp_data_content);
>> +		READ64(writeplus->wp_offset);
>> +		READ64(writeplus->wp_length);
>> +		READ32(writeplus->wp_allocated);
>> +	}
>> +
>> +	DECODE_TAIL;
>> +}
>> +
>> +static __be32
>>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
>>  {
>>  	return nfs_ok;
>> @@ -1665,7 +1691,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
>>  	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
>> -	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> +	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> @@ -3591,6 +3617,38 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
>>  	return nfserr;
>>  }
>>
>> +static void
>> +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
>> +{
>> +	__be32 *p;
>> +
>> +	RESERVE_SPACE(4);
>> +
>> +	if (write->wr_stid == NULL) {
>> +		WRITE32(0);
>> +		ADJUST_ARGS();
>> +	} else {
>> +		WRITE32(1);
>> +		ADJUST_ARGS();
>> +		nfsd4_encode_stateid(resp, &write->wr_stid->sc_stateid);
>> +	}
>> +
>> +	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
>> +	WRITE64(write->wr_bytes_written);
>> +	WRITE32(write->wr_stable_how);
>> +	WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
>> +	ADJUST_ARGS();
>> +}
>> +
>> +static __be32
>> +nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>> +		        struct nfsd4_write_plus *writeplus)
>> +{
>> +	if (!nfserr)
>> +		nfsd42_encode_write_res(resp, &writeplus->wp_res);
>> +	return nfserr;
>> +}
>> +
>>  static __be32
>>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
>>  {
>> @@ -3670,7 +3728,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
>>  	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
>> -	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>> +	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index c827acb..7fec087 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/fs.h>
>>  #include <linux/file.h>
>>  #include <linux/splice.h>
>> +#include <linux/falloc.h>
>>  #include <linux/fcntl.h>
>>  #include <linux/namei.h>
>>  #include <linux/delay.h>
>> @@ -649,6 +650,19 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  }
>>  #endif
>>
>> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
>> +{
>> +	int error, mode = 0;
>> +
>> +	if (allocated == false)
>> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
>> +
>> +	error = do_fallocate(file, mode, offset, len);
>> +	if (error == 0)
>> +		error = vfs_fsync_range(file, offset, offset + len, 0);
>> +	return nfserrno(error);
>> +}
>> +
>>  #endif /* defined(CONFIG_NFSD_V4) */
>>
>>  #ifdef CONFIG_NFSD_V3
>> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
>> index a4be2e3..187eb95 100644
>> --- a/fs/nfsd/vfs.h
>> +++ b/fs/nfsd/vfs.h
>> @@ -57,6 +57,7 @@ __be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
>>  int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
>>  __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
>>  		    struct xdr_netobj *);
>> +__be32		nfsd4_vfs_fallocate(struct file *, bool, loff_t, loff_t);
>>  #endif /* CONFIG_NFSD_V4 */
>>  __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
>>  				char *name, int len, struct iattr *attrs,
>> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
>> index b3ed644..aaef9ab 100644
>> --- a/fs/nfsd/xdr4.h
>> +++ b/fs/nfsd/xdr4.h
>> @@ -430,6 +430,27 @@ struct nfsd4_reclaim_complete {
>>  	u32 rca_one_fs;
>>  };
>>
>> +struct nfsd42_write_res {
>> +	struct nfs4_stid	*wr_stid;
>> +	u64			wr_bytes_written;
>> +	u32			wr_stable_how;
>> +	nfs4_verifier		wr_verifier;
>> +};
>> +
>> +struct nfsd4_write_plus {
>> +	/* request */
>> +	stateid_t	wp_stateid;
>> +	u32		wp_stable_how;
>> +	u32		wp_data_length;
>> +	u32		wp_data_content;
>> +	u64		wp_offset;
>> +	u64		wp_length;
>> +	u32		wp_allocated;
>> +
>> +	/* response */
>> +	struct nfsd42_write_res	wp_res;
>> +};
>> +
>>  struct nfsd4_op {
>>  	int					opnum;
>>  	__be32					status;
>> @@ -475,6 +496,9 @@ struct nfsd4_op {
>>  		struct nfsd4_reclaim_complete	reclaim_complete;
>>  		struct nfsd4_test_stateid	test_stateid;
>>  		struct nfsd4_free_stateid	free_stateid;
>> +
>> +		/* NFSv4.2 */
>> +		struct nfsd4_write_plus		write_plus;
>>  	} u;
>>  	struct nfs4_replay *			replay;
>>  };
>> diff --git a/fs/open.c b/fs/open.c
>> index d420331..09db2d5 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -278,6 +278,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>  	sb_end_write(inode->i_sb);
>>  	return ret;
>>  }
>> +EXPORT_SYMBOL_GPL(do_fallocate);
>>
>>  SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
>>  {
>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>> index 2bc5217..55ed991 100644
>> --- a/include/linux/nfs4.h
>> +++ b/include/linux/nfs4.h
>> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
>>  Needs to be updated if more operations are defined in future.*/
>>
>>  #define FIRST_NFS4_OP	OP_ACCESS
>> -#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
>> +#define LAST_NFS4_OP 	OP_WRITE_PLUS
>>
>>  enum nfsstat4 {
>>  	NFS4_OK = 0,
>> @@ -552,4 +552,10 @@ struct nfs4_deviceid {
>>  	char data[NFS4_DEVICEID4_SIZE];
>>  };
>>
>> +enum data_content4 {
>> +	NFS4_CONTENT_DATA		= 0,
>> +	NFS4_CONTENT_APP_DATA_HOLE	= 1,
>> +	NFS4_CONTENT_HOLE		= 2,
>> +};
>> +
>>  #endif
>> --
>> 1.8.4.1
>>


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 12:50 p.m. UTC | #3
On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
>> This patch only implements DATA_CONTENT_HOLE support, but I tried to
>> write it so other arms of the discriminated union can be added easily
>> later.
>
> And there's an NFS4ERR_UNION_NOTSUPP to handle that case, OK.  (Though
> the spec language is a little ambiguous:
>
> 	If the client asks for a hole and the server does not support
> 	that arm of the discriminated union, but does support one or
> 	more additional arms, it can signal to the client that it
> 	supports the operation, but not the arm with
> 	NFS4ERR_UNION_NOTSUPP.
>
> So it's unclear whether we can return this in the case the client is
> *not* asking for a hole.  Are we sure the NFS4_CONTENT_DATA case is
> optional?  The language in 5.3 ("Instead a client should utlize
> READ_PLUS and WRITE_PLUS") also suggests the CONTENT_DATA case might be
> meant to be mandatory.)
>
>> This patch is missing support for decoding multiple requests on the same
>> file.  The way it's written now only the last range provided by the
>> client will be written to.
>
> But that doesn't seem to be a limitation anticipated by the spec, so I
> guess we need to fix that.
>
> (Why is there an array, anyway?  Wouldn't it work just as well to send
> multiple RPC's, or a single RPC with multiple WRITE_PLUS ops?)
>
>>
>> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
>> ---
>>  fs/nfsd/nfs4proc.c   | 45 ++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs4xdr.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  fs/nfsd/vfs.c        | 14 ++++++++++++
>>  fs/nfsd/vfs.h        |  1 +
>>  fs/nfsd/xdr4.h       | 24 ++++++++++++++++++++
>>  fs/open.c            |  1 +
>>  include/linux/nfs4.h |  8 ++++++-
>>  7 files changed, 152 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 419572f..3210c6f 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>  	return status;
>>  }
>>
>> +static __be32
>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
>> +		      struct net *net)
>> +{
>> +	__be32 status;
>> +
>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
>> +				writeplus->wp_offset, writeplus->wp_length);
>> +	if (status == nfs_ok) {
>> +		writeplus->wp_res.wr_stid = NULL;
>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
>
> Do we need to sync?

I did the sync in nfsd4_vfs_fallocate (below), but I can move it if 
that would make more sense.
>
>> +		gen_boot_verifier(&writeplus->wp_res.wr_verifier, net);
>> +	}
>> +
>> +	return status;
>
> Nit, but I like the usual idiom better in most cases:
>
> 	if (status)
> 		return status;
> 	normal case here...
> 	return 0;

Sure, I'll change that.

>
> --b.
>
>> +}
>> +
>> +static __be32
>> +nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>> +		struct nfsd4_write_plus *writeplus)
>> +{
>> +	struct file *file;
>> +	__be32 status;
>> +	struct net *net = SVC_NET(rqstp);
>> +
>> +	status = nfs4_preprocess_stateid_op(net, cstate, &writeplus->wp_stateid,
>> +					    WR_STATE, &file);
>> +	if (status)
>> +		return status;
>> +
>> +	if (writeplus->wp_data_content == NFS4_CONTENT_HOLE)
>> +		return nfsd4_write_plus_hole(file, writeplus, net);
>> +	return nfserr_union_notsupp;
>> +}
>> +
>>  /* This routine never returns NFS_OK!  If there are no other errors, it
>>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
>>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
>> @@ -1840,6 +1876,15 @@ static struct nfsd4_operation nfsd4_ops[] = {
>>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
>>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
>>  	},
>> +
>> +	/* NFSv4.2 operations */
>> +	[OP_WRITE_PLUS] = {
>> +		.op_func = (nfsd4op_func)nfsd4_write_plus,
>> +		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
>> +		.op_name = "OP_WRITE_PLUS",
>> +		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
>> +		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
>> +	},
>>  };
>>
>>  #ifdef NFSD_DEBUG
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 60f5a1f..1e4e9af 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -1485,6 +1485,32 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
>>  }
>>
>>  static __be32
>> +nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
>> +			struct nfsd4_write_plus *writeplus)
>> +{
>> +	DECODE_HEAD;
>> +	unsigned int i;
>> +
>> +	status = nfsd4_decode_stateid(argp, &writeplus->wp_stateid);
>> +	if (status)
>> +		return status;
>> +
>> +	READ_BUF(8);
>> +	READ32(writeplus->wp_stable_how);
>> +	READ32(writeplus->wp_data_length);
>> +
>> +	for (i = 0; i < writeplus->wp_data_length; i++) {
>> +		READ_BUF(24);
>> +		READ32(writeplus->wp_data_content);
>> +		READ64(writeplus->wp_offset);
>> +		READ64(writeplus->wp_length);
>> +		READ32(writeplus->wp_allocated);
>> +	}
>> +
>> +	DECODE_TAIL;
>> +}
>> +
>> +static __be32
>>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
>>  {
>>  	return nfs_ok;
>> @@ -1665,7 +1691,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
>>  	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
>> -	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> +	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
>>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
>> @@ -3591,6 +3617,38 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
>>  	return nfserr;
>>  }
>>
>> +static void
>> +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
>> +{
>> +	__be32 *p;
>> +
>> +	RESERVE_SPACE(4);
>> +
>> +	if (write->wr_stid == NULL) {
>> +		WRITE32(0);
>> +		ADJUST_ARGS();
>> +	} else {
>> +		WRITE32(1);
>> +		ADJUST_ARGS();
>> +		nfsd4_encode_stateid(resp, &write->wr_stid->sc_stateid);
>> +	}
>> +
>> +	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
>> +	WRITE64(write->wr_bytes_written);
>> +	WRITE32(write->wr_stable_how);
>> +	WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
>> +	ADJUST_ARGS();
>> +}
>> +
>> +static __be32
>> +nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>> +		        struct nfsd4_write_plus *writeplus)
>> +{
>> +	if (!nfserr)
>> +		nfsd42_encode_write_res(resp, &writeplus->wp_res);
>> +	return nfserr;
>> +}
>> +
>>  static __be32
>>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
>>  {
>> @@ -3670,7 +3728,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
>>  	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
>> -	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>> +	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
>>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
>>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index c827acb..7fec087 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/fs.h>
>>  #include <linux/file.h>
>>  #include <linux/splice.h>
>> +#include <linux/falloc.h>
>>  #include <linux/fcntl.h>
>>  #include <linux/namei.h>
>>  #include <linux/delay.h>
>> @@ -649,6 +650,19 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  }
>>  #endif
>>
>> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
>> +{
>> +	int error, mode = 0;
>> +
>> +	if (allocated == false)
>> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
>> +
>> +	error = do_fallocate(file, mode, offset, len);
>> +	if (error == 0)
>> +		error = vfs_fsync_range(file, offset, offset + len, 0);
>> +	return nfserrno(error);
>> +}
>> +
>>  #endif /* defined(CONFIG_NFSD_V4) */
>>
>>  #ifdef CONFIG_NFSD_V3
>> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
>> index a4be2e3..187eb95 100644
>> --- a/fs/nfsd/vfs.h
>> +++ b/fs/nfsd/vfs.h
>> @@ -57,6 +57,7 @@ __be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
>>  int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
>>  __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
>>  		    struct xdr_netobj *);
>> +__be32		nfsd4_vfs_fallocate(struct file *, bool, loff_t, loff_t);
>>  #endif /* CONFIG_NFSD_V4 */
>>  __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
>>  				char *name, int len, struct iattr *attrs,
>> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
>> index b3ed644..aaef9ab 100644
>> --- a/fs/nfsd/xdr4.h
>> +++ b/fs/nfsd/xdr4.h
>> @@ -430,6 +430,27 @@ struct nfsd4_reclaim_complete {
>>  	u32 rca_one_fs;
>>  };
>>
>> +struct nfsd42_write_res {
>> +	struct nfs4_stid	*wr_stid;
>> +	u64			wr_bytes_written;
>> +	u32			wr_stable_how;
>> +	nfs4_verifier		wr_verifier;
>> +};
>> +
>> +struct nfsd4_write_plus {
>> +	/* request */
>> +	stateid_t	wp_stateid;
>> +	u32		wp_stable_how;
>> +	u32		wp_data_length;
>> +	u32		wp_data_content;
>> +	u64		wp_offset;
>> +	u64		wp_length;
>> +	u32		wp_allocated;
>> +
>> +	/* response */
>> +	struct nfsd42_write_res	wp_res;
>> +};
>> +
>>  struct nfsd4_op {
>>  	int					opnum;
>>  	__be32					status;
>> @@ -475,6 +496,9 @@ struct nfsd4_op {
>>  		struct nfsd4_reclaim_complete	reclaim_complete;
>>  		struct nfsd4_test_stateid	test_stateid;
>>  		struct nfsd4_free_stateid	free_stateid;
>> +
>> +		/* NFSv4.2 */
>> +		struct nfsd4_write_plus		write_plus;
>>  	} u;
>>  	struct nfs4_replay *			replay;
>>  };
>> diff --git a/fs/open.c b/fs/open.c
>> index d420331..09db2d5 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -278,6 +278,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
>>  	sb_end_write(inode->i_sb);
>>  	return ret;
>>  }
>> +EXPORT_SYMBOL_GPL(do_fallocate);
>>
>>  SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
>>  {
>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>> index 2bc5217..55ed991 100644
>> --- a/include/linux/nfs4.h
>> +++ b/include/linux/nfs4.h
>> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
>>  Needs to be updated if more operations are defined in future.*/
>>
>>  #define FIRST_NFS4_OP	OP_ACCESS
>> -#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
>> +#define LAST_NFS4_OP 	OP_WRITE_PLUS
>>
>>  enum nfsstat4 {
>>  	NFS4_OK = 0,
>> @@ -552,4 +552,10 @@ struct nfs4_deviceid {
>>  	char data[NFS4_DEVICEID4_SIZE];
>>  };
>>
>> +enum data_content4 {
>> +	NFS4_CONTENT_DATA		= 0,
>> +	NFS4_CONTENT_APP_DATA_HOLE	= 1,
>> +	NFS4_CONTENT_HOLE		= 2,
>> +};
>> +
>>  #endif
>> --
>> 1.8.4.1
>>


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Oct. 29, 2013, 1:06 p.m. UTC | #4
On Tue, Oct 29, 2013 at 08:50:56AM -0400, Anna Schumaker wrote:
> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
> > On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
> >> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> >> index 419572f..3210c6f 100644
> >> --- a/fs/nfsd/nfs4proc.c
> >> +++ b/fs/nfsd/nfs4proc.c
> >> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >>  	return status;
> >>  }
> >>
> >> +static __be32
> >> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
> >> +		      struct net *net)
> >> +{
> >> +	__be32 status;
> >> +
> >> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
> >> +				writeplus->wp_offset, writeplus->wp_length);
> >> +	if (status == nfs_ok) {
> >> +		writeplus->wp_res.wr_stid = NULL;
> >> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
> >> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
> >
> > Do we need to sync?
> 
> I did the sync in nfsd4_vfs_fallocate (below), but I can move it if 
> that would make more sense.

What I meant was--why are we doing a sync at all, instead of returning
NFS_UNSTABLE and making the client commit?

Honest question, I haven't thought about which is best.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 1:11 p.m. UTC | #5
On Tue 29 Oct 2013 09:06:49 AM EDT, J. Bruce Fields wrote:
> On Tue, Oct 29, 2013 at 08:50:56AM -0400, Anna Schumaker wrote:
>> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
>>> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
>>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>>> index 419572f..3210c6f 100644
>>>> --- a/fs/nfsd/nfs4proc.c
>>>> +++ b/fs/nfsd/nfs4proc.c
>>>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>>  	return status;
>>>>  }
>>>>
>>>> +static __be32
>>>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
>>>> +		      struct net *net)
>>>> +{
>>>> +	__be32 status;
>>>> +
>>>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
>>>> +				writeplus->wp_offset, writeplus->wp_length);
>>>> +	if (status == nfs_ok) {
>>>> +		writeplus->wp_res.wr_stid = NULL;
>>>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
>>>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
>>>
>>> Do we need to sync?
>>
>> I did the sync in nfsd4_vfs_fallocate (below), but I can move it if
>> that would make more sense.
>
> What I meant was--why are we doing a sync at all, instead of returning
> NFS_UNSTABLE and making the client commit?
>
> Honest question, I haven't thought about which is best.

No reason other than it seemed easier for my initial implementation.

>
> --b.


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust Oct. 29, 2013, 1:23 p.m. UTC | #6
On Oct 29, 2013, at 9:06 AM, J. Bruce Fields <bfields@fieldses.org> wrote:

> On Tue, Oct 29, 2013 at 08:50:56AM -0400, Anna Schumaker wrote:
>> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
>>> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
>>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>>> index 419572f..3210c6f 100644
>>>> --- a/fs/nfsd/nfs4proc.c
>>>> +++ b/fs/nfsd/nfs4proc.c
>>>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>> 	return status;
>>>> }
>>>> 
>>>> +static __be32
>>>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
>>>> +		      struct net *net)
>>>> +{
>>>> +	__be32 status;
>>>> +
>>>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
>>>> +				writeplus->wp_offset, writeplus->wp_length);
>>>> +	if (status == nfs_ok) {
>>>> +		writeplus->wp_res.wr_stid = NULL;
>>>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
>>>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
>>> 
>>> Do we need to sync?
>> 
>> I did the sync in nfsd4_vfs_fallocate (below), but I can move it if 
>> that would make more sense.
> 
> What I meant was--why are we doing a sync at all, instead of returning
> NFS_UNSTABLE and making the client commit?

What if the client specifies FILE_SYNC? The spec doesn't allow the server to return UNSTABLE in that situation.

Trond--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Oct. 29, 2013, 1:28 p.m. UTC | #7
On Tue 29 Oct 2013 09:23:19 AM EDT, Myklebust, Trond wrote:
>
> On Oct 29, 2013, at 9:06 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
>
>> On Tue, Oct 29, 2013 at 08:50:56AM -0400, Anna Schumaker wrote:
>>> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
>>>> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
>>>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>>>> index 419572f..3210c6f 100644
>>>>> --- a/fs/nfsd/nfs4proc.c
>>>>> +++ b/fs/nfsd/nfs4proc.c
>>>>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>>> 	return status;
>>>>> }
>>>>>
>>>>> +static __be32
>>>>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
>>>>> +		      struct net *net)
>>>>> +{
>>>>> +	__be32 status;
>>>>> +
>>>>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
>>>>> +				writeplus->wp_offset, writeplus->wp_length);
>>>>> +	if (status == nfs_ok) {
>>>>> +		writeplus->wp_res.wr_stid = NULL;
>>>>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
>>>>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
>>>>
>>>> Do we need to sync?
>>>
>>> I did the sync in nfsd4_vfs_fallocate (below), but I can move it if
>>> that would make more sense.
>>
>> What I meant was--why are we doing a sync at all, instead of returning
>> NFS_UNSTABLE and making the client commit?
>
> What if the client specifies FILE_SYNC? The spec doesn't allow the server to return UNSTABLE in that situation.

I don't have that check in there either, but I do think it would be 
useful if the client were prepared to handle an UNSTABLE reply from the 
server...

>
> Trond


--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Oct. 29, 2013, 1:32 p.m. UTC | #8
On Tue, Oct 29, 2013 at 01:23:19PM +0000, Myklebust, Trond wrote:
> 
> On Oct 29, 2013, at 9:06 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> 
> > On Tue, Oct 29, 2013 at 08:50:56AM -0400, Anna Schumaker wrote:
> >> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
> >>> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
> >>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> >>>> index 419572f..3210c6f 100644
> >>>> --- a/fs/nfsd/nfs4proc.c
> >>>> +++ b/fs/nfsd/nfs4proc.c
> >>>> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >>>> 	return status;
> >>>> }
> >>>> 
> >>>> +static __be32
> >>>> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
> >>>> +		      struct net *net)
> >>>> +{
> >>>> +	__be32 status;
> >>>> +
> >>>> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
> >>>> +				writeplus->wp_offset, writeplus->wp_length);
> >>>> +	if (status == nfs_ok) {
> >>>> +		writeplus->wp_res.wr_stid = NULL;
> >>>> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
> >>>> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
> >>> 
> >>> Do we need to sync?
> >> 
> >> I did the sync in nfsd4_vfs_fallocate (below), but I can move it if 
> >> that would make more sense.
> > 
> > What I meant was--why are we doing a sync at all, instead of returning
> > NFS_UNSTABLE and making the client commit?
> 
> What if the client specifies FILE_SYNC? The spec doesn't allow the server to return UNSTABLE in that situation.

Of course, sorry for being unclear.  I was just wondering if there's
some particular reason we're not using NFS_UNSTABLE when the client
allows it.

But "because it's easier" is a fine answer.  We can always add that
later if it turns out to be useful.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Oct. 29, 2013, 1:34 p.m. UTC | #9
On Tue, Oct 29, 2013 at 08:49:06AM -0400, Anna Schumaker wrote:
> On Mon 28 Oct 2013 05:40:30 PM EDT, J. Bruce Fields wrote:
> > On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
> >> This patch only implements DATA_CONTENT_HOLE support, but I tried to
> >> write it so other arms of the discriminated union can be added easily
> >> later.
> >
> > And there's an NFS4ERR_UNION_NOTSUPP to handle that case, OK.  (Though
> > the spec language is a little ambiguous:
> >
> > 	If the client asks for a hole and the server does not support
> > 	that arm of the discriminated union, but does support one or
> > 	more additional arms, it can signal to the client that it
> > 	supports the operation, but not the arm with
> > 	NFS4ERR_UNION_NOTSUPP.
> >
> > So it's unclear whether we can return this in the case the client is
> > *not* asking for a hole.  Are we sure the NFS4_CONTENT_DATA case is
> > optional?  The language in 5.3 ("Instead a client should utlize
> > READ_PLUS and WRITE_PLUS") also suggests the CONTENT_DATA case might be
> > meant to be mandatory.)
> 
> Fair enough.  I wasn't planning on implementing the NFS4_CONTENT_DATA 
> arm on the client right now, but I might be able to hack something 
> together to test this on the server!

OK.  Either way, could you also submit a patch to the spec to make it
explicit whether CONTENT_DATA is mandatory or not?

--b.

> 
> >
> >> This patch is missing support for decoding multiple requests on the same
> >> file.  The way it's written now only the last range provided by the
> >> client will be written to.
> >
> > But that doesn't seem to be a limitation anticipated by the spec, so I
> > guess we need to fix that.
> >
> > (Why is there an array, anyway?  Wouldn't it work just as well to send
> > multiple RPC's, or a single RPC with multiple WRITE_PLUS ops?)
> 
> Yeah, I'll fix that up.  The Linux client will send requests one at a 
> time, so that's what I tested against.
> 
> >
> >>
> >> Signed-off-by: Anna Schumaker <bjschuma@netapp.com>
> >> ---
> >>  fs/nfsd/nfs4proc.c   | 45 ++++++++++++++++++++++++++++++++++++++
> >>  fs/nfsd/nfs4xdr.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> >>  fs/nfsd/vfs.c        | 14 ++++++++++++
> >>  fs/nfsd/vfs.h        |  1 +
> >>  fs/nfsd/xdr4.h       | 24 ++++++++++++++++++++
> >>  fs/open.c            |  1 +
> >>  include/linux/nfs4.h |  8 ++++++-
> >>  7 files changed, 152 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> >> index 419572f..3210c6f 100644
> >> --- a/fs/nfsd/nfs4proc.c
> >> +++ b/fs/nfsd/nfs4proc.c
> >> @@ -1028,6 +1028,42 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >>  	return status;
> >>  }
> >>
> >> +static __be32
> >> +nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
> >> +		      struct net *net)
> >> +{
> >> +	__be32 status;
> >> +
> >> +	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
> >> +				writeplus->wp_offset, writeplus->wp_length);
> >> +	if (status == nfs_ok) {
> >> +		writeplus->wp_res.wr_stid = NULL;
> >> +		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
> >> +		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
> >
> > Do we need to sync?
> >
> >> +		gen_boot_verifier(&writeplus->wp_res.wr_verifier, net);
> >> +	}
> >> +
> >> +	return status;
> >
> > Nit, but I like the usual idiom better in most cases:
> >
> > 	if (status)
> > 		return status;
> > 	normal case here...
> > 	return 0;
> >
> > --b.
> >
> >> +}
> >> +
> >> +static __be32
> >> +nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >> +		struct nfsd4_write_plus *writeplus)
> >> +{
> >> +	struct file *file;
> >> +	__be32 status;
> >> +	struct net *net = SVC_NET(rqstp);
> >> +
> >> +	status = nfs4_preprocess_stateid_op(net, cstate, &writeplus->wp_stateid,
> >> +					    WR_STATE, &file);
> >> +	if (status)
> >> +		return status;
> >> +
> >> +	if (writeplus->wp_data_content == NFS4_CONTENT_HOLE)
> >> +		return nfsd4_write_plus_hole(file, writeplus, net);
> >> +	return nfserr_union_notsupp;
> >> +}
> >> +
> >>  /* This routine never returns NFS_OK!  If there are no other errors, it
> >>   * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
> >>   * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
> >> @@ -1840,6 +1876,15 @@ static struct nfsd4_operation nfsd4_ops[] = {
> >>  		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
> >>  		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
> >>  	},
> >> +
> >> +	/* NFSv4.2 operations */
> >> +	[OP_WRITE_PLUS] = {
> >> +		.op_func = (nfsd4op_func)nfsd4_write_plus,
> >> +		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
> >> +		.op_name = "OP_WRITE_PLUS",
> >> +		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
> >> +		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
> >> +	},
> >>  };
> >>
> >>  #ifdef NFSD_DEBUG
> >> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> >> index 60f5a1f..1e4e9af 100644
> >> --- a/fs/nfsd/nfs4xdr.c
> >> +++ b/fs/nfsd/nfs4xdr.c
> >> @@ -1485,6 +1485,32 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
> >>  }
> >>
> >>  static __be32
> >> +nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
> >> +			struct nfsd4_write_plus *writeplus)
> >> +{
> >> +	DECODE_HEAD;
> >> +	unsigned int i;
> >> +
> >> +	status = nfsd4_decode_stateid(argp, &writeplus->wp_stateid);
> >> +	if (status)
> >> +		return status;
> >> +
> >> +	READ_BUF(8);
> >> +	READ32(writeplus->wp_stable_how);
> >> +	READ32(writeplus->wp_data_length);
> >> +
> >> +	for (i = 0; i < writeplus->wp_data_length; i++) {
> >> +		READ_BUF(24);
> >> +		READ32(writeplus->wp_data_content);
> >> +		READ64(writeplus->wp_offset);
> >> +		READ64(writeplus->wp_length);
> >> +		READ32(writeplus->wp_allocated);
> >> +	}
> >> +
> >> +	DECODE_TAIL;
> >> +}
> >> +
> >> +static __be32
> >>  nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
> >>  {
> >>  	return nfs_ok;
> >> @@ -1665,7 +1691,7 @@ static nfsd4_dec nfsd42_dec_ops[] = {
> >>  	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
> >>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_dec)nfsd4_decode_notsupp,
> >>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
> >> -	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
> >> +	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
> >>  	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
> >>  	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
> >>  	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
> >> @@ -3591,6 +3617,38 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
> >>  	return nfserr;
> >>  }
> >>
> >> +static void
> >> +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
> >> +{
> >> +	__be32 *p;
> >> +
> >> +	RESERVE_SPACE(4);
> >> +
> >> +	if (write->wr_stid == NULL) {
> >> +		WRITE32(0);
> >> +		ADJUST_ARGS();
> >> +	} else {
> >> +		WRITE32(1);
> >> +		ADJUST_ARGS();
> >> +		nfsd4_encode_stateid(resp, &write->wr_stid->sc_stateid);
> >> +	}
> >> +
> >> +	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
> >> +	WRITE64(write->wr_bytes_written);
> >> +	WRITE32(write->wr_stable_how);
> >> +	WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
> >> +	ADJUST_ARGS();
> >> +}
> >> +
> >> +static __be32
> >> +nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> >> +		        struct nfsd4_write_plus *writeplus)
> >> +{
> >> +	if (!nfserr)
> >> +		nfsd42_encode_write_res(resp, &writeplus->wp_res);
> >> +	return nfserr;
> >> +}
> >> +
> >>  static __be32
> >>  nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
> >>  {
> >> @@ -3670,7 +3728,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
> >>  	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
> >>  	[OP_OFFLOAD_REVOKE]	= (nfsd4_enc)nfsd4_encode_noop,
> >>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
> >> -	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
> >> +	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
> >>  	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
> >>  	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
> >>  	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index c827acb..7fec087 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -16,6 +16,7 @@
> >>  #include <linux/fs.h>
> >>  #include <linux/file.h>
> >>  #include <linux/splice.h>
> >> +#include <linux/falloc.h>
> >>  #include <linux/fcntl.h>
> >>  #include <linux/namei.h>
> >>  #include <linux/delay.h>
> >> @@ -649,6 +650,19 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>  }
> >>  #endif
> >>
> >> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
> >> +{
> >> +	int error, mode = 0;
> >> +
> >> +	if (allocated == false)
> >> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> >> +
> >> +	error = do_fallocate(file, mode, offset, len);
> >> +	if (error == 0)
> >> +		error = vfs_fsync_range(file, offset, offset + len, 0);
> >> +	return nfserrno(error);
> >> +}
> >> +
> >>  #endif /* defined(CONFIG_NFSD_V4) */
> >>
> >>  #ifdef CONFIG_NFSD_V3
> >> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> >> index a4be2e3..187eb95 100644
> >> --- a/fs/nfsd/vfs.h
> >> +++ b/fs/nfsd/vfs.h
> >> @@ -57,6 +57,7 @@ __be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
> >>  int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
> >>  __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
> >>  		    struct xdr_netobj *);
> >> +__be32		nfsd4_vfs_fallocate(struct file *, bool, loff_t, loff_t);
> >>  #endif /* CONFIG_NFSD_V4 */
> >>  __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
> >>  				char *name, int len, struct iattr *attrs,
> >> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> >> index b3ed644..aaef9ab 100644
> >> --- a/fs/nfsd/xdr4.h
> >> +++ b/fs/nfsd/xdr4.h
> >> @@ -430,6 +430,27 @@ struct nfsd4_reclaim_complete {
> >>  	u32 rca_one_fs;
> >>  };
> >>
> >> +struct nfsd42_write_res {
> >> +	struct nfs4_stid	*wr_stid;
> >> +	u64			wr_bytes_written;
> >> +	u32			wr_stable_how;
> >> +	nfs4_verifier		wr_verifier;
> >> +};
> >> +
> >> +struct nfsd4_write_plus {
> >> +	/* request */
> >> +	stateid_t	wp_stateid;
> >> +	u32		wp_stable_how;
> >> +	u32		wp_data_length;
> >> +	u32		wp_data_content;
> >> +	u64		wp_offset;
> >> +	u64		wp_length;
> >> +	u32		wp_allocated;
> >> +
> >> +	/* response */
> >> +	struct nfsd42_write_res	wp_res;
> >> +};
> >> +
> >>  struct nfsd4_op {
> >>  	int					opnum;
> >>  	__be32					status;
> >> @@ -475,6 +496,9 @@ struct nfsd4_op {
> >>  		struct nfsd4_reclaim_complete	reclaim_complete;
> >>  		struct nfsd4_test_stateid	test_stateid;
> >>  		struct nfsd4_free_stateid	free_stateid;
> >> +
> >> +		/* NFSv4.2 */
> >> +		struct nfsd4_write_plus		write_plus;
> >>  	} u;
> >>  	struct nfs4_replay *			replay;
> >>  };
> >> diff --git a/fs/open.c b/fs/open.c
> >> index d420331..09db2d5 100644
> >> --- a/fs/open.c
> >> +++ b/fs/open.c
> >> @@ -278,6 +278,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> >>  	sb_end_write(inode->i_sb);
> >>  	return ret;
> >>  }
> >> +EXPORT_SYMBOL_GPL(do_fallocate);
> >>
> >>  SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
> >>  {
> >> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> >> index 2bc5217..55ed991 100644
> >> --- a/include/linux/nfs4.h
> >> +++ b/include/linux/nfs4.h
> >> @@ -128,7 +128,7 @@ enum nfs_opnum4 {
> >>  Needs to be updated if more operations are defined in future.*/
> >>
> >>  #define FIRST_NFS4_OP	OP_ACCESS
> >> -#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
> >> +#define LAST_NFS4_OP 	OP_WRITE_PLUS
> >>
> >>  enum nfsstat4 {
> >>  	NFS4_OK = 0,
> >> @@ -552,4 +552,10 @@ struct nfs4_deviceid {
> >>  	char data[NFS4_DEVICEID4_SIZE];
> >>  };
> >>
> >> +enum data_content4 {
> >> +	NFS4_CONTENT_DATA		= 0,
> >> +	NFS4_CONTENT_APP_DATA_HOLE	= 1,
> >> +	NFS4_CONTENT_HOLE		= 2,
> >> +};
> >> +
> >>  #endif
> >> --
> >> 1.8.4.1
> >>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 1:52 p.m. UTC | #10
> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
> +{
> +	int error, mode = 0;
> +
> +	if (allocated == false)
> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> +
> +	error = do_fallocate(file, mode, offset, len);
> +	if (error == 0)
> +		error = vfs_fsync_range(file, offset, offset + len, 0);

What are you trying to do with the fsync_range here?  If it's just to
make sure the metadata operation is stable on disk please use commit_metadata()
from fs/nfsd/vfs.c.  As there's no data involved here I can't really see
what else it would be for.


Also the allocated flag doesn't make any sense for people not taking the
same drugs as the spec authors, please add a comment what it means.

Btw, how did anyone come up with the name WRITE PLUS for something that
doesn't actually involve any writes?

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 1:54 p.m. UTC | #11
On Tue, Oct 29, 2013 at 09:06:49AM -0400, J. Bruce Fields wrote:
> What I meant was--why are we doing a sync at all, instead of returning
> NFS_UNSTABLE and making the client commit?

Did NFSv4.2 introduce a concept of unstable metadata operations?

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Nov. 2, 2013, 2:44 p.m. UTC | #12
On Sat, Nov 02, 2013 at 06:54:09AM -0700, Christoph Hellwig wrote:
> On Tue, Oct 29, 2013 at 09:06:49AM -0400, J. Bruce Fields wrote:
> > What I meant was--why are we doing a sync at all, instead of returning
> > NFS_UNSTABLE and making the client commit?
> 
> Did NFSv4.2 introduce a concept of unstable metadata operations?

No, but I think WRITE_PLUS does have a stable/unstable bit so I think we
could choose not to do the sync if that'd make sense.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 2:51 p.m. UTC | #13
On Sat, Nov 02, 2013 at 10:44:31AM -0400, J. Bruce Fields wrote:
> On Sat, Nov 02, 2013 at 06:54:09AM -0700, Christoph Hellwig wrote:
> > On Tue, Oct 29, 2013 at 09:06:49AM -0400, J. Bruce Fields wrote:
> > > What I meant was--why are we doing a sync at all, instead of returning
> > > NFS_UNSTABLE and making the client commit?
> > 
> > Did NFSv4.2 introduce a concept of unstable metadata operations?
> 
> No, but I think WRITE_PLUS does have a stable/unstable bit so I think we
> could choose not to do the sync if that'd make sense.

Both operations are idempotent, so supporting it shouldn't be a major
obstactle.  But suddenly having some metadata operations that can be
unstable seems like a major wart in the spec.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust Nov. 2, 2013, 3:02 p.m. UTC | #14
On Nov 2, 2013, at 10:51, Christoph Hellwig <hch@infradead.org> wrote:

> On Sat, Nov 02, 2013 at 10:44:31AM -0400, J. Bruce Fields wrote:
>> On Sat, Nov 02, 2013 at 06:54:09AM -0700, Christoph Hellwig wrote:
>>> On Tue, Oct 29, 2013 at 09:06:49AM -0400, J. Bruce Fields wrote:
>>>> What I meant was--why are we doing a sync at all, instead of returning
>>>> NFS_UNSTABLE and making the client commit?
>>> 
>>> Did NFSv4.2 introduce a concept of unstable metadata operations?
>> 
>> No, but I think WRITE_PLUS does have a stable/unstable bit so I think we
>> could choose not to do the sync if that'd make sense.
> 
> Both operations are idempotent, so supporting it shouldn't be a major
> obstactle.  But suddenly having some metadata operations that can be
> unstable seems like a major wart in the spec.

COMMIT has always had a metadata part to it, though. It guarantees stability of the ctime/mime and change attributes in addition to the file size and data.

IOW: it really is more akin to fsync() than to fdatasync().

Cheers
  Trond--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 3:07 p.m. UTC | #15
On Sat, Nov 02, 2013 at 03:02:29PM +0000, Myklebust, Trond wrote:
> COMMIT has always had a metadata part to it, though. It guarantees stability of the ctime/mime and change attributes in addition to the file size and data.
> 
> IOW: it really is more akin to fsync() than to fdatasync().

That's fine, but so far it wasn't used for any purely metadata
operation.  E.g. if we allow it for a preallocation we should also
allow it for updating the size put the current file size.

And diverting from the spec to your implementation: we'd need a lot more
generic infrastructure to deal with unstable random non-data operations.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 2, 2013, 3:20 p.m. UTC | #16
On Mon, Oct 28, 2013 at 05:40:30PM -0400, J. Bruce Fields wrote:
> On Mon, Oct 28, 2013 at 10:57:25AM -0400, Anna Schumaker wrote:
> > This patch only implements DATA_CONTENT_HOLE support, but I tried to
> > write it so other arms of the discriminated union can be added easily
> > later.
> 
> And there's an NFS4ERR_UNION_NOTSUPP to handle that case, OK.  (Though
> the spec language is a little ambiguous:
> 
> 	If the client asks for a hole and the server does not support
> 	that arm of the discriminated union, but does support one or
> 	more additional arms, it can signal to the client that it
> 	supports the operation, but not the arm with
> 	NFS4ERR_UNION_NOTSUPP.

Where is this language coming from?  Seems like the latest document in
NFSv4.2/ repo still calls WRITE_PLUS INITIALIZE and doesn't have
anything content related, just a weird "ADB" concept.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Nov. 4, 2013, 4:41 p.m. UTC | #17
On 11/02/2013 09:52 AM, Christoph Hellwig wrote:
>> +__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
>> +{
>> +	int error, mode = 0;
>> +
>> +	if (allocated == false)
>> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
>> +
>> +	error = do_fallocate(file, mode, offset, len);
>> +	if (error == 0)
>> +		error = vfs_fsync_range(file, offset, offset + len, 0);
> 
> What are you trying to do with the fsync_range here?  If it's just to
> make sure the metadata operation is stable on disk please use commit_metadata()
> from fs/nfsd/vfs.c.  As there's no data involved here I can't really see
> what else it would be for.

My understanding is that if I call fallocate with a mode of "0" it will write a range of 0s to the file.  I'll move the fsync() call here and only do it if the client doesn't ask for NFS_UNSTABLE.

> 
> 
> Also the allocated flag doesn't make any sense for people not taking the
> same drugs as the spec authors, please add a comment what it means.

Sure

> 
> Btw, how did anyone come up with the name WRITE PLUS for something that
> doesn't actually involve any writes?
> 

Write plus does write more than just holes, I just didn't implement that part of the spec.  I'm implementing the NFS4_CONTENT_HOLE arm, but NFS4_CONTENT_DATA is intended to eventually replace the WRITE operation.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 5:03 p.m. UTC | #18
On Mon, Nov 04, 2013 at 11:41:07AM -0500, Anna Schumaker wrote:
> > what else it would be for.
> 
> My understanding is that if I call fallocate with a mode of "0" it will write a range of 0s to the file.

While writing zeroes to the file would be a correct implementation,
that's now how the operation is defined or usually implemented.  The
defintion is that the blocks in the range will be allocated, and reads
from it will return zero (and the file size will be updated if needed).

The usual way to implement it is to created extents in an "unwritten
state" that will return zeroes when read, even if those zeroes didn't
make it to disk.

>I'll move the fsync() call here and only do it if the client doesn't ask for NFS_UNSTABLE.

As said the right way to handle it would be to use commit_metadata, as
the cache flushes a fsync does would be unessecary and too expensive for
many workloads typically using fallocate.

> > 
> > Btw, how did anyone come up with the name WRITE PLUS for something that
> > doesn't actually involve any writes?
> > 
> 
> Write plus does write more than just holes, I just didn't implement that part of the spec.  I'm implementing the NFS4_CONTENT_HOLE arm, but NFS4_CONTENT_DATA is intended to eventually replace the WRITE operation.

Where is WRITE PLUS defined?  I the various
draft-ietf-nfsv4-minorversion2-*.txt I can only find what appears to be
the predecessor, but that doesn't have a data arm.

Also what does the data arm buy us over good old WRITE?

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Nov. 4, 2013, 5:23 p.m. UTC | #19
On 11/04/2013 12:03 PM, Christoph Hellwig wrote:
> On Mon, Nov 04, 2013 at 11:41:07AM -0500, Anna Schumaker wrote:
>>> what else it would be for.
>>
>> My understanding is that if I call fallocate with a mode of "0" it will write a range of 0s to the file.
> 
> While writing zeroes to the file would be a correct implementation,
> that's now how the operation is defined or usually implemented.  The
> defintion is that the blocks in the range will be allocated, and reads
> from it will return zero (and the file size will be updated if needed).
> 
> The usual way to implement it is to created extents in an "unwritten
> state" that will return zeroes when read, even if those zeroes didn't
> make it to disk.

And that's all done through metadata?  Doing the commit_metadata() call makes a bit more sense now.

> 
>> I'll move the fsync() call here and only do it if the client doesn't ask for NFS_UNSTABLE.
> 
> As said the right way to handle it would be to use commit_metadata, as
> the cache flushes a fsync does would be unessecary and too expensive for
> many workloads typically using fallocate.

Okay, sorry for me not understanding earlier!

> 
>>>
>>> Btw, how did anyone come up with the name WRITE PLUS for something that
>>> doesn't actually involve any writes?
>>>
>>
>> Write plus does write more than just holes, I just didn't implement that part of the spec.  I'm implementing the NFS4_CONTENT_HOLE arm, but NFS4_CONTENT_DATA is intended to eventually replace the WRITE operation.
> 
> Where is WRITE PLUS defined?  I the various
> draft-ietf-nfsv4-minorversion2-*.txt I can only find what appears to be
> the predecessor, but that doesn't have a data arm.
> 
> Also what does the data arm buy us over good old WRITE?
> 

I've been working off of draft #21, the most recent commit is 2b3ab1740b1ea843faa59566fb4213a42d8c724a from Aug 19.  The eventual goal is to phase out WRITE and replace it with WRITE_PLUS, but that won't happen until at least 4.3.  I don't know why it's being done this way instead of just adding an FALLOCATE operation.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 6:53 p.m. UTC | #20
On Mon, Nov 04, 2013 at 12:23:27PM -0500, Anna Schumaker wrote:
> > state" that will return zeroes when read, even if those zeroes didn't
> > make it to disk.
> 
> And that's all done through metadata?  Doing the commit_metadata() call makes a bit more sense now.

Yes.  The only filesystem in tree that actually seems to write zeroes
is gfs2, but even then it doesn't do so through the usual mechanisms.

> > Also what does the data arm buy us over good old WRITE?
> > 
> 
> I've been working off of draft #21, the most recent commit is 2b3ab1740b1ea843faa59566fb4213a42d8c724a from Aug 19.  The eventual goal is to phase out WRITE and replace it with WRITE_PLUS, but that won't happen until at least 4.3.  I don't know why it's being done this way instead of just adding an FALLOCATE operation.

Where can I find draft 21?  The newest document in the git repo I was
pointed to earlier is draft-ietf-nfsv4-minorversion2-13.txt, and the
newest I could find on tools.ietf.org is draft 20.

I still don't understand why anyone would phase out WRITE in favour of
something that doesn't actually add any value for the write case.

I'll try to take it up directly with the working group, but my post to
the list yesterday in your SEEK thread didn't seem to have made it
trough.   I also tried to research how they came up with this idiotic
design, but the mailing list archives tell very little.  Could it be
that most of these decisions are actually made in a smokey backroom and
not on the list?
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Nov. 4, 2013, 6:57 p.m. UTC | #21
On 11/04/2013 01:53 PM, Christoph Hellwig wrote:
> On Mon, Nov 04, 2013 at 12:23:27PM -0500, Anna Schumaker wrote:
>>> state" that will return zeroes when read, even if those zeroes didn't
>>> make it to disk.
>>
>> And that's all done through metadata?  Doing the commit_metadata() call makes a bit more sense now.
> 
> Yes.  The only filesystem in tree that actually seems to write zeroes
> is gfs2, but even then it doesn't do so through the usual mechanisms.
> 
>>> Also what does the data arm buy us over good old WRITE?
>>>
>>
>> I've been working off of draft #21, the most recent commit is 2b3ab1740b1ea843faa59566fb4213a42d8c724a from Aug 19.  The eventual goal is to phase out WRITE and replace it with WRITE_PLUS, but that won't happen until at least 4.3.  I don't know why it's being done this way instead of just adding an FALLOCATE operation.
> 
> Where can I find draft 21?  The newest document in the git repo I was
> pointed to earlier is draft-ietf-nfsv4-minorversion2-13.txt, and the
> newest I could find on tools.ietf.org is draft 20.

You may need to run `make` in the NFSv4.2 directory.

> 
> I still don't understand why anyone would phase out WRITE in favour of
> something that doesn't actually add any value for the write case.
> 
> I'll try to take it up directly with the working group, but my post to
> the list yesterday in your SEEK thread didn't seem to have made it
> trough.   I also tried to research how they came up with this idiotic
> design, but the mailing list archives tell very little.  Could it be
> that most of these decisions are actually made in a smokey backroom and
> not on the list?
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Nov. 4, 2013, 7:16 p.m. UTC | #22
On Nov 4, 2013, at 10:53 AM, Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Nov 04, 2013 at 12:23:27PM -0500, Anna Schumaker wrote:
>>> state" that will return zeroes when read, even if those zeroes didn't
>>> make it to disk.
>> 
>> And that's all done through metadata?  Doing the commit_metadata() call makes a bit more sense now.
> 
> Yes.  The only filesystem in tree that actually seems to write zeroes
> is gfs2, but even then it doesn't do so through the usual mechanisms.
> 
>>> Also what does the data arm buy us over good old WRITE?
>>> 
>> 
>> I've been working off of draft #21, the most recent commit is 2b3ab1740b1ea843faa59566fb4213a42d8c724a from Aug 19.  The eventual goal is to phase out WRITE and replace it with WRITE_PLUS, but that won't happen until at least 4.3.  I don't know why it's being done this way instead of just adding an FALLOCATE operation.
> 
> Where can I find draft 21?  The newest document in the git repo I was
> pointed to earlier is draft-ietf-nfsv4-minorversion2-13.txt, and the
> newest I could find on tools.ietf.org is draft 20.
> 
> I still don't understand why anyone would phase out WRITE in favour of
> something that doesn't actually add any value for the write case.

Protocol extensibility.  WRITE_PLUS adds a discriminated union of data types that can be extended easily to include initialization patterns, integrity metadata, and other things like holes.  It's entirely another question as to whether any particular extension is tasteful.

> I'll try to take it up directly with the working group, but my post to
> the list yesterday in your SEEK thread didn't seem to have made it
> trough.   I also tried to research how they came up with this idiotic
> design, but the mailing list archives tell very little.  Could it be
> that most of these decisions are actually made in a smokey backroom and
> not on the list?

The documents themselves count as public proposals.  That's why they are referred to as "Requests For Comments".

The mailing list is not the only place where they are discussed.  We also use conference calls, and there are face-to-face meetings too.  Not all of that content is distilled into a public record.  Very much like Linux kernel development.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 7:19 p.m. UTC | #23
On Mon, Nov 04, 2013 at 11:16:13AM -0800, Chuck Lever wrote:
> > I still don't understand why anyone would phase out WRITE in favour of
> > something that doesn't actually add any value for the write case.
> 
> Protocol extensibility.  WRITE_PLUS adds a discriminated union of data types that can be extended easily to include initialization patterns, integrity metadata, and other things like holes.  It's entirely another question as to whether any particular extension is tasteful.

Maybe I'm missing something, but what does multiplexing entirely
different operation actually buy you?  It is different operation after
all.  It's not like adding new operations to NFS is all that hard.

> The mailing list is not the only place where they are discussed.  We also use conference calls, and there are face-to-face meetings too.  Not all of that content is distilled into a public record.  Very much like Linux kernel development.

Linux development strives very hard documenting rationales, something I
haven't found on the NFS list or in the repository.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chuck Lever Nov. 4, 2013, 7:50 p.m. UTC | #24
On Nov 4, 2013, at 11:19 AM, Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Nov 04, 2013 at 11:16:13AM -0800, Chuck Lever wrote:
>>> I still don't understand why anyone would phase out WRITE in favour of
>>> something that doesn't actually add any value for the write case.
>> 
>> Protocol extensibility.  WRITE_PLUS adds a discriminated union of data types that can be extended easily to include initialization patterns, integrity metadata, and other things like holes.  It's entirely another question as to whether any particular extension is tasteful.
> 
> Maybe I'm missing something, but what does multiplexing entirely
> different operation actually buy you?  It is different operation after
> all.

I think a successful argument could be made that initialization or hole punching may not belong in a WRITE_PLUS operation.  For example, using a COMMIT for a long-running server-performed file initialization doesn't make sense to me.  The infrastructure we introduced for COPY_OFFLOAD seems better suited for file initialization.

> It's not like adding new operations to NFS is all that hard.

Actually, it is harder than you think.  For various reasons, it takes five or more years to create a new NFS protocol specification, something we hope to address.

>> The mailing list is not the only place where they are discussed.  We also use conference calls, and there are face-to-face meetings too.  Not all of that content is distilled into a public record.  Very much like Linux kernel development.
> 
> Linux development strives very hard documenting rationales, something I
> haven't found on the NFS list or in the repository.

There are plenty of times where I find the kernel git log or e-mail archives entirely unhelpful in understanding the why's and wherefor's.  It's a human endeavor, and like all such things, is imperfect, and is often a matter of knowing who to ask rather than where to look.

By the way, e-mail from non-members to nfsv4@ietf.org is moderated.  The moderator may be traveling or otherwise unavailable, as IETF 88 is happening this week.  If you subscribe to the list, you should be able to post without restriction.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 7:54 p.m. UTC | #25
On Mon, Nov 04, 2013 at 11:50:51AM -0800, Chuck Lever wrote:
> I think a successful argument could be made that initialization or hole punching may not belong in a WRITE_PLUS operation.  For example, using a COMMIT for a long-running server-performed file initialization doesn't make sense to me.  The infrastructure we introduced for COPY_OFFLOAD seems better suited for file initialization.

I've been going through the latest draft and will submit a counter
proposal in a few days time.

> > It's not like adding new operations to NFS is all that hard.
> 
> Actually, it is harder than you think.  For various reasons, it takes five or more years to create a new NFS protocol specification, something we hope to address.

I see the pain in getting new minor versions out.  But that pain is
shared between adding a new operation and a new arm to an union.

> By the way, e-mail from non-members to nfsv4@ietf.org is moderated.  The moderator may be traveling or otherwise unavailable, as IETF 88 is happening this week.  If you subscribe to the list, you should be able to post without restriction.

I have been subscribed longer than I've been posting to it.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields Nov. 4, 2013, 7:55 p.m. UTC | #26
On Mon, Nov 04, 2013 at 11:50:51AM -0800, Chuck Lever wrote:
> 
> On Nov 4, 2013, at 11:19 AM, Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Mon, Nov 04, 2013 at 11:16:13AM -0800, Chuck Lever wrote:
> >>> I still don't understand why anyone would phase out WRITE in favour of
> >>> something that doesn't actually add any value for the write case.
> >> 
> >> Protocol extensibility.  WRITE_PLUS adds a discriminated union of data types that can be extended easily to include initialization patterns, integrity metadata, and other things like holes.  It's entirely another question as to whether any particular extension is tasteful.
> > 
> > Maybe I'm missing something, but what does multiplexing entirely
> > different operation actually buy you?  It is different operation after
> > all.
> 
> I think a successful argument could be made that initialization or hole punching may not belong in a WRITE_PLUS operation.  For example, using a COMMIT for a long-running server-performed file initialization doesn't make sense to me.  The infrastructure we introduced for COPY_OFFLOAD seems better suited for file initialization.
> 
> > It's not like adding new operations to NFS is all that hard.
> 
> Actually, it is harder than you think.  For various reasons, it takes five or more years to create a new NFS protocol specification, something we hope to address.

Well, but that's not really a function of the number of operations.
It's not as though implementing hole punching will take more or less
time depending on whether we implement something as a compound op or a
union.

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haynes, Tom Nov. 4, 2013, 8:03 p.m. UTC | #27
On Nov 4, 2013, at 11:50 AM, Chuck Lever <chuck.lever@oracle.com> wrote:

> 
> On Nov 4, 2013, at 11:19 AM, Christoph Hellwig <hch@infradead.org> wrote:
> 
>> On Mon, Nov 04, 2013 at 11:16:13AM -0800, Chuck Lever wrote:
>>>> I still don't understand why anyone would phase out WRITE in favour of
>>>> something that doesn't actually add any value for the write case.
>>> 
>>> Protocol extensibility.  WRITE_PLUS adds a discriminated union of data types that can be extended easily to include initialization patterns, integrity metadata, and other things like holes.  It's entirely another question as to whether any particular extension is tasteful.
>> 
>> Maybe I'm missing something, but what does multiplexing entirely
>> different operation actually buy you?  It is different operation after
>> all.


Do you have an issue with READ_PLUS?

I.e., are you okay with that operation because the answer is not known beforehand?


> 
> I think a successful argument could be made that initialization or hole punching may not belong in a WRITE_PLUS operation.  For example, using a COMMIT for a long-running server-performed file initialization doesn't make sense to me.  The infrastructure we introduced for COPY_OFFLOAD seems better suited for file initialization.

WRITE_PLUS does utilize COPY_OFFLOAD for such actions.


> 
>> It's not like adding new operations to NFS is all that hard.
> 
> Actually, it is harder than you think.  For various reasons, it takes five or more years to create a new NFS protocol specification, something we hope to address.
> 
>>> The mailing list is not the only place where they are discussed.  We also use conference calls, and there are face-to-face meetings too.  Not all of that content is distilled into a public record.  Very much like Linux kernel development.
>> 
>> Linux development strives very hard documenting rationales, something I
>> haven't found on the NFS list or in the repository.


There was a strong consensus in the biweekly calls to merge all of the operations together into WRITE_PLUS.
I think we also voted on this at one of the meetings.

But in any event, the document is still open and I'm glad Anna is funneling implementation experience into
it. 



--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 4, 2013, 8:13 p.m. UTC | #28
On Mon, Nov 04, 2013 at 08:03:58PM +0000, Haynes, Tom wrote:
> Do you have an issue with READ_PLUS?
> 
> I.e., are you okay with that operation because the answer is not known beforehand?

The hole vs data part of it looks reasonable.  The whole ADH complex
looks a little fishy to me, but I will have to dig deeper into before
writing down a detailed critism.

But one very obvious question already comes up:  what data is supposed
to be returned when a client that doesn't suport 4.2 at all are doesn't
support ADHs reads a file that contains them.  From a quick glance it
seems a mapping from ADHs to flat blocks is possible, but such a mapping
should be specified in the standard.

> There was a strong consensus in the biweekly calls to merge all of the operations together into WRITE_PLUS.

Do you remember any arguments for it?  Also any rational for adding the
plain old data write to it?

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 5, 2013, 9:40 a.m. UTC | #29
> +{
> +	int error, mode = 0;
> +
> +	if (allocated == false)
> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> +
> +	error = do_fallocate(file, mode, offset, len);
> +	if (error == 0)
> +		error = vfs_fsync_range(file, offset, offset + len, 0);
> +	return nfserrno(error);

Same problem with the falloc vs WRITE_PLUS mismatch here.  For XFS you
could make the allocated case all XFS_IOC_ZERO_RANGE, but the proposed
VFS equivalent never seemed to have made it in.

Also no need for the FALLOC_FL_KEEP_SIZE in the hole punch case, it is
ignored.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bryan Schumaker Nov. 5, 2013, 2:23 p.m. UTC | #30
On 11/05/2013 04:40 AM, Christoph Hellwig wrote:
>> +{
>> +	int error, mode = 0;
>> +
>> +	if (allocated == false)
>> +		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
>> +
>> +	error = do_fallocate(file, mode, offset, len);
>> +	if (error == 0)
>> +		error = vfs_fsync_range(file, offset, offset + len, 0);
>> +	return nfserrno(error);
> 
> Same problem with the falloc vs WRITE_PLUS mismatch here.  For XFS you
> could make the allocated case all XFS_IOC_ZERO_RANGE, but the proposed
> VFS equivalent never seemed to have made it in.
> 
> Also no need for the FALLOC_FL_KEEP_SIZE in the hole punch case, it is
> ignored.
> 

It's not ignored in do_fallocate() (fs/open.c) where it returns -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set without FALLOC_FL_KEEP_SIZE :)
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Nov. 5, 2013, 3:11 p.m. UTC | #31
On Tue, Nov 05, 2013 at 09:23:59AM -0500, Anna Schumaker wrote:
> It's not ignored in do_fallocate() (fs/open.c) where it returns -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set without FALLOC_FL_KEEP_SIZE :)

Oops, looks like we decided to enforce it, despite hole punches never
having an effect on file size.  I stand corrected.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 419572f..3210c6f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1028,6 +1028,42 @@  nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status;
 }
 
+static __be32
+nfsd4_write_plus_hole(struct file *file, struct nfsd4_write_plus *writeplus,
+		      struct net *net)
+{
+	__be32 status;
+
+	status = nfsd4_vfs_fallocate(file, writeplus->wp_allocated,
+				writeplus->wp_offset, writeplus->wp_length);
+	if (status == nfs_ok) {
+		writeplus->wp_res.wr_stid = NULL;
+		writeplus->wp_res.wr_bytes_written = writeplus->wp_length;
+		writeplus->wp_res.wr_stable_how = NFS_FILE_SYNC;
+		gen_boot_verifier(&writeplus->wp_res.wr_verifier, net);
+	}
+
+	return status;
+}
+
+static __be32
+nfsd4_write_plus(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_write_plus *writeplus)
+{
+	struct file *file;
+	__be32 status;
+	struct net *net = SVC_NET(rqstp);
+
+	status = nfs4_preprocess_stateid_op(net, cstate, &writeplus->wp_stateid,
+					    WR_STATE, &file);
+	if (status)
+		return status;
+
+	if (writeplus->wp_data_content == NFS4_CONTENT_HOLE)
+		return nfsd4_write_plus_hole(file, writeplus, net);
+	return nfserr_union_notsupp;
+}
+
 /* This routine never returns NFS_OK!  If there are no other errors, it
  * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
  * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
@@ -1840,6 +1876,15 @@  static struct nfsd4_operation nfsd4_ops[] = {
 		.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+
+	/* NFSv4.2 operations */
+	[OP_WRITE_PLUS] = {
+		.op_func = (nfsd4op_func)nfsd4_write_plus,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_WRITE_PLUS",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+		.op_get_currentstateid = (stateid_getter)nfsd4_get_writestateid,
+	},
 };
 
 #ifdef NFSD_DEBUG
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 60f5a1f..1e4e9af 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1485,6 +1485,32 @@  static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
 }
 
 static __be32
+nfsd4_decode_write_plus(struct nfsd4_compoundargs *argp,
+			struct nfsd4_write_plus *writeplus)
+{
+	DECODE_HEAD;
+	unsigned int i;
+
+	status = nfsd4_decode_stateid(argp, &writeplus->wp_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8);
+	READ32(writeplus->wp_stable_how);
+	READ32(writeplus->wp_data_length);
+
+	for (i = 0; i < writeplus->wp_data_length; i++) {
+		READ_BUF(24);
+		READ32(writeplus->wp_data_content);
+		READ64(writeplus->wp_offset);
+		READ64(writeplus->wp_length);
+		READ32(writeplus->wp_allocated);
+	}
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
 	return nfs_ok;
@@ -1665,7 +1691,7 @@  static nfsd4_dec nfsd42_dec_ops[] = {
 	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_OFFLOAD_REVOKE]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_WRITE_PLUS]		= (nfsd4_dec)nfsd4_decode_write_plus,
 	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -3591,6 +3617,38 @@  nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfserr;
 }
 
+static void
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+{
+	__be32 *p;
+
+	RESERVE_SPACE(4);
+
+	if (write->wr_stid == NULL) {
+		WRITE32(0);
+		ADJUST_ARGS();
+	} else {
+		WRITE32(1);
+		ADJUST_ARGS();
+		nfsd4_encode_stateid(resp, &write->wr_stid->sc_stateid);
+	}
+
+	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
+	WRITE64(write->wr_bytes_written);
+	WRITE32(write->wr_stable_how);
+	WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
+	ADJUST_ARGS();
+}
+
+static __be32
+nfsd4_encode_write_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+		        struct nfsd4_write_plus *writeplus)
+{
+	if (!nfserr)
+		nfsd42_encode_write_res(resp, &writeplus->wp_res);
+	return nfserr;
+}
+
 static __be32
 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 {
@@ -3670,7 +3728,7 @@  static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OFFLOAD_REVOKE]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_WRITE_PLUS]		= (nfsd4_enc)nfsd4_encode_write_plus,
 	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c827acb..7fec087 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -16,6 +16,7 @@ 
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/splice.h>
+#include <linux/falloc.h>
 #include <linux/fcntl.h>
 #include <linux/namei.h>
 #include <linux/delay.h>
@@ -649,6 +650,19 @@  __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 #endif
 
+__be32 nfsd4_vfs_fallocate(struct file *file, bool allocated, loff_t offset, loff_t len)
+{
+	int error, mode = 0;
+
+	if (allocated == false)
+		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+	error = do_fallocate(file, mode, offset, len);
+	if (error == 0)
+		error = vfs_fsync_range(file, offset, offset + len, 0);
+	return nfserrno(error);
+}
+
 #endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a4be2e3..187eb95 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -57,6 +57,7 @@  __be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
 int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
 __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
 		    struct xdr_netobj *);
+__be32		nfsd4_vfs_fallocate(struct file *, bool, loff_t, loff_t);
 #endif /* CONFIG_NFSD_V4 */
 __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b3ed644..aaef9ab 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -430,6 +430,27 @@  struct nfsd4_reclaim_complete {
 	u32 rca_one_fs;
 };
 
+struct nfsd42_write_res {
+	struct nfs4_stid	*wr_stid;
+	u64			wr_bytes_written;
+	u32			wr_stable_how;
+	nfs4_verifier		wr_verifier;
+};
+
+struct nfsd4_write_plus {
+	/* request */
+	stateid_t	wp_stateid;
+	u32		wp_stable_how;
+	u32		wp_data_length;
+	u32		wp_data_content;
+	u64		wp_offset;
+	u64		wp_length;
+	u32		wp_allocated;
+
+	/* response */
+	struct nfsd42_write_res	wp_res;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -475,6 +496,9 @@  struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+
+		/* NFSv4.2 */
+		struct nfsd4_write_plus		write_plus;
 	} u;
 	struct nfs4_replay *			replay;
 };
diff --git a/fs/open.c b/fs/open.c
index d420331..09db2d5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -278,6 +278,7 @@  int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	sb_end_write(inode->i_sb);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(do_fallocate);
 
 SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
 {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 2bc5217..55ed991 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -128,7 +128,7 @@  enum nfs_opnum4 {
 Needs to be updated if more operations are defined in future.*/
 
 #define FIRST_NFS4_OP	OP_ACCESS
-#define LAST_NFS4_OP 	OP_RECLAIM_COMPLETE
+#define LAST_NFS4_OP 	OP_WRITE_PLUS
 
 enum nfsstat4 {
 	NFS4_OK = 0,
@@ -552,4 +552,10 @@  struct nfs4_deviceid {
 	char data[NFS4_DEVICEID4_SIZE];
 };
 
+enum data_content4 {
+	NFS4_CONTENT_DATA		= 0,
+	NFS4_CONTENT_APP_DATA_HOLE	= 1,
+	NFS4_CONTENT_HOLE		= 2,
+};
+
 #endif