diff mbox series

[v2,4/6] NFS: Add READ_PLUS data segment support

Message ID 20200214211227.407836-5-Anna.Schumaker@Netapp.com (mailing list archive)
State New, archived
Headers show
Series NFS: Add support for the v4.2 READ_PLUS operation | expand

Commit Message

Anna Schumaker Feb. 14, 2020, 9:12 p.m. UTC
From: Anna Schumaker <Anna.Schumaker@Netapp.com>

This patch adds client support for decoding a single NFS4_CONTENT_DATA
segment returned by the server. This is the simplest implementation
possible, since it does not account for any hole segments in the reply.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c         |  43 +++++++++++-
 fs/nfs/nfs4xdr.c          |   1 +
 include/linux/nfs4.h      |   2 +-
 include/linux/nfs_fs_sb.h |   1 +
 include/linux/nfs_xdr.h   |   2 +-
 6 files changed, 182 insertions(+), 5 deletions(-)

Comments

Chuck Lever III Feb. 14, 2020, 10:28 p.m. UTC | #1
> On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
> 
> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> 
> This patch adds client support for decoding a single NFS4_CONTENT_DATA
> segment returned by the server. This is the simplest implementation
> possible, since it does not account for any hole segments in the reply.
> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
> fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
> fs/nfs/nfs4proc.c         |  43 +++++++++++-
> fs/nfs/nfs4xdr.c          |   1 +
> include/linux/nfs4.h      |   2 +-
> include/linux/nfs_fs_sb.h |   1 +
> include/linux/nfs_xdr.h   |   2 +-
> 6 files changed, 182 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
> index c03f3246d6c5..bf118ecabe2c 100644
> --- a/fs/nfs/nfs42xdr.c
> +++ b/fs/nfs/nfs42xdr.c
> @@ -45,6 +45,15 @@
> #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
> 					 encode_fallocate_maxsz)
> #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
> +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
> +					 encode_stateid_maxsz + 3)
> +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
> +					 2 /* data_info4.di_offset */ + \
> +					 2 /* data_info4.di_length */)
> +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
> +					 1 /* rpr_eof */ + \
> +					 1 /* rpr_contents count */ + \
> +					 NFS42_READ_PLUS_SEGMENT_SIZE)
> #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
> 					 encode_stateid_maxsz + \
> 					 2 /* offset */ + \
> @@ -128,6 +137,14 @@
> 					 decode_putfh_maxsz + \
> 					 decode_deallocate_maxsz + \
> 					 decode_getattr_maxsz)
> +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
> +					 encode_sequence_maxsz + \
> +					 encode_putfh_maxsz + \
> +					 encode_read_plus_maxsz)
> +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
> +					 decode_sequence_maxsz + \
> +					 decode_putfh_maxsz + \
> +					 decode_read_plus_maxsz)
> #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
> 					 encode_sequence_maxsz + \
> 					 encode_putfh_maxsz + \
> @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream *xdr,
> 	encode_fallocate(xdr, args);
> }
> 
> +static void encode_read_plus(struct xdr_stream *xdr,
> +			     const struct nfs_pgio_args *args,
> +			     struct compound_hdr *hdr)
> +{
> +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
> +	encode_nfs4_stateid(xdr, &args->stateid);
> +	encode_uint64(xdr, args->offset);
> +	encode_uint32(xdr, args->count);
> +}
> +
> static void encode_seek(struct xdr_stream *xdr,
> 			const struct nfs42_seek_args *args,
> 			struct compound_hdr *hdr)
> @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
> 	encode_nops(&hdr);
> }
> 
> +/*
> + * Encode READ_PLUS request
> + */
> +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
> +				   struct xdr_stream *xdr,
> +				   const void *data)
> +{
> +	const struct nfs_pgio_args *args = data;
> +	struct compound_hdr hdr = {
> +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
> +	};
> +
> +	encode_compound_hdr(xdr, req, &hdr);
> +	encode_sequence(xdr, &args->seq_args, &hdr);
> +	encode_putfh(xdr, args->fh, &hdr);
> +	encode_read_plus(xdr, args, &hdr);
> +
> +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
> +				args->count, hdr.replen);
> +	req->rq_rcv_buf.flags |= XDRBUF_READ;

IMO this line is incorrect.

RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
as DDP-eligible. There's no way for a client to know how to set up
Write chunks, unless it knows exactly where the file's holes are in
advance. Even then... racy.

Just curious, have you tried READ_PLUS with proto=rdma ?


> +	encode_nops(&hdr);
> +}
> +
> /*
>  * Encode SEEK request
>  */
> @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
> 	return decode_op_hdr(xdr, OP_DEALLOCATE);
> }
> 
> +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
> +				      uint32_t *eof)
> +{
> +	__be32 *p;
> +	uint32_t count, recvd;
> +	uint64_t offset;
> +
> +	p = xdr_inline_decode(xdr, 8 + 4);
> +	if (unlikely(!p))
> +		return -EIO;
> +
> +	p = xdr_decode_hyper(p, &offset);
> +	count = be32_to_cpup(p);
> +	if (count == 0)
> +		return 0;
> +
> +	recvd = xdr_read_pages(xdr, count);
> +	if (count > recvd) {
> +		dprintk("NFS: server cheating in read reply: "
> +				"count %u > recvd %u\n", count, recvd);
> +		count = recvd;
> +		*eof = 0;
> +	}
> +
> +	return count;
> +}
> +
> +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
> +{
> +	__be32 *p;
> +	uint32_t count, eof, segments, type;
> +	int status;
> +
> +	status = decode_op_hdr(xdr, OP_READ_PLUS);
> +	if (status)
> +		return status;
> +
> +	p = xdr_inline_decode(xdr, 4 + 4);
> +	if (unlikely(!p))
> +		return -EIO;
> +
> +	eof = be32_to_cpup(p++);
> +	segments = be32_to_cpup(p++);
> +	if (segments == 0)
> +		return 0;
> +
> +	p = xdr_inline_decode(xdr, 4);
> +	if (unlikely(!p))
> +		return -EIO;
> +
> +	type = be32_to_cpup(p++);
> +	if (type == NFS4_CONTENT_DATA)
> +		count = decode_read_plus_data(xdr, res, &eof);
> +	else
> +		return -EINVAL;
> +
> +	res->eof = eof;
> +	res->count = count;
> +	return 0;
> +}
> +
> static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
> {
> 	int status;
> @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
> 	return status;
> }
> 
> +/*
> + * Decode READ_PLUS request
> + */
> +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
> +				  struct xdr_stream *xdr,
> +				  void *data)
> +{
> +	struct nfs_pgio_res *res = data;
> +	struct compound_hdr hdr;
> +	int status;
> +
> +	status = decode_compound_hdr(xdr, &hdr);
> +	if (status)
> +		goto out;
> +	status = decode_sequence(xdr, &res->seq_res, rqstp);
> +	if (status)
> +		goto out;
> +	status = decode_putfh(xdr);
> +	if (status)
> +		goto out;
> +	status = decode_read_plus(xdr, res);
> +	if (!status)
> +		status = res->count;
> +out:
> +	return status;
> +}
> +
> /*
>  * Decode SEEK request
>  */
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 95d07a3dc5d1..ed3ec8c36273 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -69,6 +69,10 @@
> 
> #include "nfs4trace.h"
> 
> +#ifdef CONFIG_NFS_V4_2
> +#include "nfs42.h"
> +#endif /* CONFIG_NFS_V4_2 */
> +
> #define NFSDBG_FACILITY		NFSDBG_PROC
> 
> #define NFS4_BITMASK_SZ		3
> @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
> 	return true;
> }
> 
> +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
> +					 struct nfs_pgio_header *hdr)
> +{
> +	struct nfs_server *server = NFS_SERVER(hdr->inode);
> +	struct rpc_message *msg = &task->tk_msg;
> +
> +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
> +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
> +		server->caps &= ~NFS_CAP_READ_PLUS;
> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> +		rpc_restart_call_prepare(task);
> +		return true;
> +	}
> +	return false;
> +}
> +
> static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
> {
> -
> 	dprintk("--> %s\n", __func__);
> 
> 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
> 		return -EAGAIN;
> 	if (nfs4_read_stateid_changed(task, &hdr->args))
> 		return -EAGAIN;
> +	if (nfs4_read_plus_not_supported(task, hdr))
> +		return -EAGAIN;
> 	if (task->tk_status > 0)
> 		nfs_invalidate_atime(hdr->inode);
> 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
> 				    nfs4_read_done_cb(task, hdr);
> }
> 
> +#ifdef CONFIG_NFS_V4_2
> +static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
> +{
> +	if (server->caps & NFS_CAP_READ_PLUS)
> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
> +	else
> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> +}
> +#else
> +static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
> +{
> +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> +}
> +#endif /* CONFIG_NFS_V4_2 */
> +
> static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
> 				 struct rpc_message *msg)
> {
> 	hdr->timestamp   = jiffies;
> 	if (!hdr->pgio_done_cb)
> 		hdr->pgio_done_cb = nfs4_read_done_cb;
> -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
> 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
> }
> 
> @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
> 		| NFS_CAP_SEEK
> 		| NFS_CAP_LAYOUTSTATS
> 		| NFS_CAP_CLONE
> -		| NFS_CAP_LAYOUTERROR,
> +		| NFS_CAP_LAYOUTERROR
> +		| NFS_CAP_READ_PLUS,
> 	.init_client = nfs41_init_client,
> 	.shutdown_client = nfs41_shutdown_client,
> 	.match_stateid = nfs41_match_stateid,
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 47817ef0aadb..68b2917d0537 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
> 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify),
> 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
> 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
> +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
> };
> 
> static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> index 82d8fb422092..c1eeef52545c 100644
> --- a/include/linux/nfs4.h
> +++ b/include/linux/nfs4.h
> @@ -540,8 +540,8 @@ enum {
> 
> 	NFSPROC4_CLNT_LOOKUPP,
> 	NFSPROC4_CLNT_LAYOUTERROR,
> -
> 	NFSPROC4_CLNT_COPY_NOTIFY,
> +	NFSPROC4_CLNT_READ_PLUS,
> };
> 
> /* nfs41 types */
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 465fa98258a3..11248c5a7b24 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -281,5 +281,6 @@ struct nfs_server {
> #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
> #define NFS_CAP_LAYOUTERROR	(1U << 26)
> #define NFS_CAP_COPY_NOTIFY	(1U << 27)
> +#define NFS_CAP_READ_PLUS	(1U << 28)
> 
> #endif
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 94c77ed55ce1..8efbf3d8b263 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -655,7 +655,7 @@ struct nfs_pgio_args {
> struct nfs_pgio_res {
> 	struct nfs4_sequence_res	seq_res;
> 	struct nfs_fattr *	fattr;
> -	__u32			count;
> +	__u64			count;
> 	__u32			op_status;
> 	union {
> 		struct {
> -- 
> 2.25.0
> 

--
Chuck Lever
Anna Schumaker Feb. 20, 2020, 2:42 p.m. UTC | #2
On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote:
> > On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
> > 
> > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > 
> > This patch adds client support for decoding a single NFS4_CONTENT_DATA
> > segment returned by the server. This is the simplest implementation
> > possible, since it does not account for any hole segments in the reply.
> > 
> > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > ---
> > fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
> > fs/nfs/nfs4proc.c         |  43 +++++++++++-
> > fs/nfs/nfs4xdr.c          |   1 +
> > include/linux/nfs4.h      |   2 +-
> > include/linux/nfs_fs_sb.h |   1 +
> > include/linux/nfs_xdr.h   |   2 +-
> > 6 files changed, 182 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
> > index c03f3246d6c5..bf118ecabe2c 100644
> > --- a/fs/nfs/nfs42xdr.c
> > +++ b/fs/nfs/nfs42xdr.c
> > @@ -45,6 +45,15 @@
> > #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
> > 					 encode_fallocate_maxsz)
> > #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
> > +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
> > +					 encode_stateid_maxsz + 3)
> > +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
> > +					 2 /* data_info4.di_offset */ + \
> > +					 2 /* data_info4.di_length */)
> > +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
> > +					 1 /* rpr_eof */ + \
> > +					 1 /* rpr_contents count */ + \
> > +					 NFS42_READ_PLUS_SEGMENT_SIZE)
> > #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
> > 					 encode_stateid_maxsz + \
> > 					 2 /* offset */ + \
> > @@ -128,6 +137,14 @@
> > 					 decode_putfh_maxsz + \
> > 					 decode_deallocate_maxsz + \
> > 					 decode_getattr_maxsz)
> > +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
> > +					 encode_sequence_maxsz + \
> > +					 encode_putfh_maxsz + \
> > +					 encode_read_plus_maxsz)
> > +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
> > +					 decode_sequence_maxsz + \
> > +					 decode_putfh_maxsz + \
> > +					 decode_read_plus_maxsz)
> > #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
> > 					 encode_sequence_maxsz + \
> > 					 encode_putfh_maxsz + \
> > @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream *xdr,
> > 	encode_fallocate(xdr, args);
> > }
> > 
> > +static void encode_read_plus(struct xdr_stream *xdr,
> > +			     const struct nfs_pgio_args *args,
> > +			     struct compound_hdr *hdr)
> > +{
> > +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
> > +	encode_nfs4_stateid(xdr, &args->stateid);
> > +	encode_uint64(xdr, args->offset);
> > +	encode_uint32(xdr, args->count);
> > +}
> > +
> > static void encode_seek(struct xdr_stream *xdr,
> > 			const struct nfs42_seek_args *args,
> > 			struct compound_hdr *hdr)
> > @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst
> > *req,
> > 	encode_nops(&hdr);
> > }
> > 
> > +/*
> > + * Encode READ_PLUS request
> > + */
> > +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
> > +				   struct xdr_stream *xdr,
> > +				   const void *data)
> > +{
> > +	const struct nfs_pgio_args *args = data;
> > +	struct compound_hdr hdr = {
> > +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
> > +	};
> > +
> > +	encode_compound_hdr(xdr, req, &hdr);
> > +	encode_sequence(xdr, &args->seq_args, &hdr);
> > +	encode_putfh(xdr, args->fh, &hdr);
> > +	encode_read_plus(xdr, args, &hdr);
> > +
> > +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
> > +				args->count, hdr.replen);
> > +	req->rq_rcv_buf.flags |= XDRBUF_READ;
> 
> IMO this line is incorrect.

You're right, this line causes problems for RDMA with READ_PLUS. I added it to
match how the other xdr read encoders were set up
> 
> RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
> as DDP-eligible. There's no way for a client to know how to set up
> Write chunks, unless it knows exactly where the file's holes are in
> advance. Even then... racy.
> 
> Just curious, have you tried READ_PLUS with proto=rdma ?

I haven't done in-depth performance testing, but I have been able to run it.

Anna

> 
> 
> > +	encode_nops(&hdr);
> > +}
> > +
> > /*
> >  * Encode SEEK request
> >  */
> > @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream *xdr,
> > struct nfs42_falloc_res *re
> > 	return decode_op_hdr(xdr, OP_DEALLOCATE);
> > }
> > 
> > +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct
> > nfs_pgio_res *res,
> > +				      uint32_t *eof)
> > +{
> > +	__be32 *p;
> > +	uint32_t count, recvd;
> > +	uint64_t offset;
> > +
> > +	p = xdr_inline_decode(xdr, 8 + 4);
> > +	if (unlikely(!p))
> > +		return -EIO;
> > +
> > +	p = xdr_decode_hyper(p, &offset);
> > +	count = be32_to_cpup(p);
> > +	if (count == 0)
> > +		return 0;
> > +
> > +	recvd = xdr_read_pages(xdr, count);
> > +	if (count > recvd) {
> > +		dprintk("NFS: server cheating in read reply: "
> > +				"count %u > recvd %u\n", count, recvd);
> > +		count = recvd;
> > +		*eof = 0;
> > +	}
> > +
> > +	return count;
> > +}
> > +
> > +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res
> > *res)
> > +{
> > +	__be32 *p;
> > +	uint32_t count, eof, segments, type;
> > +	int status;
> > +
> > +	status = decode_op_hdr(xdr, OP_READ_PLUS);
> > +	if (status)
> > +		return status;
> > +
> > +	p = xdr_inline_decode(xdr, 4 + 4);
> > +	if (unlikely(!p))
> > +		return -EIO;
> > +
> > +	eof = be32_to_cpup(p++);
> > +	segments = be32_to_cpup(p++);
> > +	if (segments == 0)
> > +		return 0;
> > +
> > +	p = xdr_inline_decode(xdr, 4);
> > +	if (unlikely(!p))
> > +		return -EIO;
> > +
> > +	type = be32_to_cpup(p++);
> > +	if (type == NFS4_CONTENT_DATA)
> > +		count = decode_read_plus_data(xdr, res, &eof);
> > +	else
> > +		return -EINVAL;
> > +
> > +	res->eof = eof;
> > +	res->count = count;
> > +	return 0;
> > +}
> > +
> > static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
> > {
> > 	int status;
> > @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst
> > *rqstp,
> > 	return status;
> > }
> > 
> > +/*
> > + * Decode READ_PLUS request
> > + */
> > +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
> > +				  struct xdr_stream *xdr,
> > +				  void *data)
> > +{
> > +	struct nfs_pgio_res *res = data;
> > +	struct compound_hdr hdr;
> > +	int status;
> > +
> > +	status = decode_compound_hdr(xdr, &hdr);
> > +	if (status)
> > +		goto out;
> > +	status = decode_sequence(xdr, &res->seq_res, rqstp);
> > +	if (status)
> > +		goto out;
> > +	status = decode_putfh(xdr);
> > +	if (status)
> > +		goto out;
> > +	status = decode_read_plus(xdr, res);
> > +	if (!status)
> > +		status = res->count;
> > +out:
> > +	return status;
> > +}
> > +
> > /*
> >  * Decode SEEK request
> >  */
> > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> > index 95d07a3dc5d1..ed3ec8c36273 100644
> > --- a/fs/nfs/nfs4proc.c
> > +++ b/fs/nfs/nfs4proc.c
> > @@ -69,6 +69,10 @@
> > 
> > #include "nfs4trace.h"
> > 
> > +#ifdef CONFIG_NFS_V4_2
> > +#include "nfs42.h"
> > +#endif /* CONFIG_NFS_V4_2 */
> > +
> > #define NFSDBG_FACILITY		NFSDBG_PROC
> > 
> > #define NFS4_BITMASK_SZ		3
> > @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct
> > rpc_task *task,
> > 	return true;
> > }
> > 
> > +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
> > +					 struct nfs_pgio_header *hdr)
> > +{
> > +	struct nfs_server *server = NFS_SERVER(hdr->inode);
> > +	struct rpc_message *msg = &task->tk_msg;
> > +
> > +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
> > +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
> > +		server->caps &= ~NFS_CAP_READ_PLUS;
> > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > +		rpc_restart_call_prepare(task);
> > +		return true;
> > +	}
> > +	return false;
> > +}
> > +
> > static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header
> > *hdr)
> > {
> > -
> > 	dprintk("--> %s\n", __func__);
> > 
> > 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
> > 		return -EAGAIN;
> > 	if (nfs4_read_stateid_changed(task, &hdr->args))
> > 		return -EAGAIN;
> > +	if (nfs4_read_plus_not_supported(task, hdr))
> > +		return -EAGAIN;
> > 	if (task->tk_status > 0)
> > 		nfs_invalidate_atime(hdr->inode);
> > 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
> > 				    nfs4_read_done_cb(task, hdr);
> > }
> > 
> > +#ifdef CONFIG_NFS_V4_2
> > +static void nfs42_read_plus_support(struct nfs_server *server, struct
> > rpc_message *msg)
> > +{
> > +	if (server->caps & NFS_CAP_READ_PLUS)
> > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
> > +	else
> > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > +}
> > +#else
> > +static void nfs42_read_plus_support(struct nfs_server *server, struct
> > rpc_message *msg)
> > +{
> > +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > +}
> > +#endif /* CONFIG_NFS_V4_2 */
> > +
> > static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
> > 				 struct rpc_message *msg)
> > {
> > 	hdr->timestamp   = jiffies;
> > 	if (!hdr->pgio_done_cb)
> > 		hdr->pgio_done_cb = nfs4_read_done_cb;
> > -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
> > 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
> > }
> > 
> > @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops
> > nfs_v4_2_minor_ops = {
> > 		| NFS_CAP_SEEK
> > 		| NFS_CAP_LAYOUTSTATS
> > 		| NFS_CAP_CLONE
> > -		| NFS_CAP_LAYOUTERROR,
> > +		| NFS_CAP_LAYOUTERROR
> > +		| NFS_CAP_READ_PLUS,
> > 	.init_client = nfs41_init_client,
> > 	.shutdown_client = nfs41_shutdown_client,
> > 	.match_stateid = nfs41_match_stateid,
> > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > index 47817ef0aadb..68b2917d0537 100644
> > --- a/fs/nfs/nfs4xdr.c
> > +++ b/fs/nfs/nfs4xdr.c
> > @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
> > 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify),
> > 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
> > 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
> > +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
> > };
> > 
> > static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
> > diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> > index 82d8fb422092..c1eeef52545c 100644
> > --- a/include/linux/nfs4.h
> > +++ b/include/linux/nfs4.h
> > @@ -540,8 +540,8 @@ enum {
> > 
> > 	NFSPROC4_CLNT_LOOKUPP,
> > 	NFSPROC4_CLNT_LAYOUTERROR,
> > -
> > 	NFSPROC4_CLNT_COPY_NOTIFY,
> > +	NFSPROC4_CLNT_READ_PLUS,
> > };
> > 
> > /* nfs41 types */
> > diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> > index 465fa98258a3..11248c5a7b24 100644
> > --- a/include/linux/nfs_fs_sb.h
> > +++ b/include/linux/nfs_fs_sb.h
> > @@ -281,5 +281,6 @@ struct nfs_server {
> > #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
> > #define NFS_CAP_LAYOUTERROR	(1U << 26)
> > #define NFS_CAP_COPY_NOTIFY	(1U << 27)
> > +#define NFS_CAP_READ_PLUS	(1U << 28)
> > 
> > #endif
> > diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> > index 94c77ed55ce1..8efbf3d8b263 100644
> > --- a/include/linux/nfs_xdr.h
> > +++ b/include/linux/nfs_xdr.h
> > @@ -655,7 +655,7 @@ struct nfs_pgio_args {
> > struct nfs_pgio_res {
> > 	struct nfs4_sequence_res	seq_res;
> > 	struct nfs_fattr *	fattr;
> > -	__u32			count;
> > +	__u64			count;
> > 	__u32			op_status;
> > 	union {
> > 		struct {
> > -- 
> > 2.25.0
> > 
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Feb. 20, 2020, 2:55 p.m. UTC | #3
> On Feb 20, 2020, at 9:42 AM, Anna Schumaker <schumaker.anna@gmail.com> wrote:
> 
> On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote:
>>> On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
>>> 
>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> 
>>> This patch adds client support for decoding a single NFS4_CONTENT_DATA
>>> segment returned by the server. This is the simplest implementation
>>> possible, since it does not account for any hole segments in the reply.
>>> 
>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>> ---
>>> fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
>>> fs/nfs/nfs4proc.c         |  43 +++++++++++-
>>> fs/nfs/nfs4xdr.c          |   1 +
>>> include/linux/nfs4.h      |   2 +-
>>> include/linux/nfs_fs_sb.h |   1 +
>>> include/linux/nfs_xdr.h   |   2 +-
>>> 6 files changed, 182 insertions(+), 5 deletions(-)
>>> 
>>> diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
>>> index c03f3246d6c5..bf118ecabe2c 100644
>>> --- a/fs/nfs/nfs42xdr.c
>>> +++ b/fs/nfs/nfs42xdr.c
>>> @@ -45,6 +45,15 @@
>>> #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
>>> 					encode_fallocate_maxsz)
>>> #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
>>> +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
>>> +					encode_stateid_maxsz + 3)
>>> +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
>>> +					 2 /* data_info4.di_offset */ + \
>>> +					 2 /* data_info4.di_length */)
>>> +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
>>> +					 1 /* rpr_eof */ + \
>>> +					 1 /* rpr_contents count */ + \
>>> +					NFS42_READ_PLUS_SEGMENT_SIZE)
>>> #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
>>> 					encode_stateid_maxsz + \
>>> 					 2 /* offset */ + \
>>> @@ -128,6 +137,14 @@
>>> 					decode_putfh_maxsz + \
>>> 					decode_deallocate_maxsz + \
>>> 					decode_getattr_maxsz)
>>> +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
>>> +					encode_sequence_maxsz + \
>>> +					encode_putfh_maxsz + \
>>> +					encode_read_plus_maxsz)
>>> +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
>>> +					decode_sequence_maxsz + \
>>> +					decode_putfh_maxsz + \
>>> +					decode_read_plus_maxsz)
>>> #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
>>> 					encode_sequence_maxsz + \
>>> 					encode_putfh_maxsz + \
>>> @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream *xdr,
>>> 	encode_fallocate(xdr, args);
>>> }
>>> 
>>> +static void encode_read_plus(struct xdr_stream *xdr,
>>> +			     const struct nfs_pgio_args *args,
>>> +			     struct compound_hdr *hdr)
>>> +{
>>> +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
>>> +	encode_nfs4_stateid(xdr, &args->stateid);
>>> +	encode_uint64(xdr, args->offset);
>>> +	encode_uint32(xdr, args->count);
>>> +}
>>> +
>>> static void encode_seek(struct xdr_stream *xdr,
>>> 			const struct nfs42_seek_args *args,
>>> 			struct compound_hdr *hdr)
>>> @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst
>>> *req,
>>> 	encode_nops(&hdr);
>>> }
>>> 
>>> +/*
>>> + * Encode READ_PLUS request
>>> + */
>>> +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
>>> +				   struct xdr_stream *xdr,
>>> +				   const void *data)
>>> +{
>>> +	const struct nfs_pgio_args *args = data;
>>> +	struct compound_hdr hdr = {
>>> +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
>>> +	};
>>> +
>>> +	encode_compound_hdr(xdr, req, &hdr);
>>> +	encode_sequence(xdr, &args->seq_args, &hdr);
>>> +	encode_putfh(xdr, args->fh, &hdr);
>>> +	encode_read_plus(xdr, args, &hdr);
>>> +
>>> +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
>>> +				args->count, hdr.replen);
>>> +	req->rq_rcv_buf.flags |= XDRBUF_READ;
>> 
>> IMO this line is incorrect.
> 
> You're right, this line causes problems for RDMA with READ_PLUS. I added it to
> match how the other xdr read encoders were set up

Ja, I think just removing that line should be sufficient.
Better would be replacing it with a comment explaining
why this encoder does not set XDRBUF_READ. :-)


>> RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
>> as DDP-eligible. There's no way for a client to know how to set up
>> Write chunks, unless it knows exactly where the file's holes are in
>> advance. Even then... racy.
>> 
>> Just curious, have you tried READ_PLUS with proto=rdma ?
> 
> I haven't done in-depth performance testing, but I have been able to run it.

We should figure out whether that will have a regressive
impact on NFS/RDMA workloads. I expect that it will, but
the client can always set up the Reply chunk so that the
READ payload fits precisely in an RDMA segment that lines
up with page cache pages. That mitigates some impact.

If your patch set already changes NFSv4.2 mounts to always
use READ_PLUS in place of READ, it might be prudent for the
"proto=rdma" mount option to also set "noreadplus", at least
for the time being.

The down-side here is that would make NFSv4.2 on RDMA
unable to recognize holes in files the same way as it
does on TCP, and that's a pretty significant variation
in behavior. Does "noreadplus" even deal with that?


> Anna
> 
>> 
>> 
>>> +	encode_nops(&hdr);
>>> +}
>>> +
>>> /*
>>> * Encode SEEK request
>>> */
>>> @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream *xdr,
>>> struct nfs42_falloc_res *re
>>> 	return decode_op_hdr(xdr, OP_DEALLOCATE);
>>> }
>>> 
>>> +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct
>>> nfs_pgio_res *res,
>>> +				      uint32_t *eof)
>>> +{
>>> +	__be32 *p;
>>> +	uint32_t count, recvd;
>>> +	uint64_t offset;
>>> +
>>> +	p = xdr_inline_decode(xdr, 8 + 4);
>>> +	if (unlikely(!p))
>>> +		return -EIO;
>>> +
>>> +	p = xdr_decode_hyper(p, &offset);
>>> +	count = be32_to_cpup(p);
>>> +	if (count == 0)
>>> +		return 0;
>>> +
>>> +	recvd = xdr_read_pages(xdr, count);
>>> +	if (count > recvd) {
>>> +		dprintk("NFS: server cheating in read reply: "
>>> +				"count %u > recvd %u\n", count, recvd);
>>> +		count = recvd;
>>> +		*eof = 0;
>>> +	}
>>> +
>>> +	return count;
>>> +}
>>> +
>>> +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res
>>> *res)
>>> +{
>>> +	__be32 *p;
>>> +	uint32_t count, eof, segments, type;
>>> +	int status;
>>> +
>>> +	status = decode_op_hdr(xdr, OP_READ_PLUS);
>>> +	if (status)
>>> +		return status;
>>> +
>>> +	p = xdr_inline_decode(xdr, 4 + 4);
>>> +	if (unlikely(!p))
>>> +		return -EIO;
>>> +
>>> +	eof = be32_to_cpup(p++);
>>> +	segments = be32_to_cpup(p++);
>>> +	if (segments == 0)
>>> +		return 0;
>>> +
>>> +	p = xdr_inline_decode(xdr, 4);
>>> +	if (unlikely(!p))
>>> +		return -EIO;
>>> +
>>> +	type = be32_to_cpup(p++);
>>> +	if (type == NFS4_CONTENT_DATA)
>>> +		count = decode_read_plus_data(xdr, res, &eof);
>>> +	else
>>> +		return -EINVAL;
>>> +
>>> +	res->eof = eof;
>>> +	res->count = count;
>>> +	return 0;
>>> +}
>>> +
>>> static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
>>> {
>>> 	int status;
>>> @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst
>>> *rqstp,
>>> 	return status;
>>> }
>>> 
>>> +/*
>>> + * Decode READ_PLUS request
>>> + */
>>> +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
>>> +				  struct xdr_stream *xdr,
>>> +				  void *data)
>>> +{
>>> +	struct nfs_pgio_res *res = data;
>>> +	struct compound_hdr hdr;
>>> +	int status;
>>> +
>>> +	status = decode_compound_hdr(xdr, &hdr);
>>> +	if (status)
>>> +		goto out;
>>> +	status = decode_sequence(xdr, &res->seq_res, rqstp);
>>> +	if (status)
>>> +		goto out;
>>> +	status = decode_putfh(xdr);
>>> +	if (status)
>>> +		goto out;
>>> +	status = decode_read_plus(xdr, res);
>>> +	if (!status)
>>> +		status = res->count;
>>> +out:
>>> +	return status;
>>> +}
>>> +
>>> /*
>>> * Decode SEEK request
>>> */
>>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>>> index 95d07a3dc5d1..ed3ec8c36273 100644
>>> --- a/fs/nfs/nfs4proc.c
>>> +++ b/fs/nfs/nfs4proc.c
>>> @@ -69,6 +69,10 @@
>>> 
>>> #include "nfs4trace.h"
>>> 
>>> +#ifdef CONFIG_NFS_V4_2
>>> +#include "nfs42.h"
>>> +#endif /* CONFIG_NFS_V4_2 */
>>> +
>>> #define NFSDBG_FACILITY		NFSDBG_PROC
>>> 
>>> #define NFS4_BITMASK_SZ		3
>>> @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct
>>> rpc_task *task,
>>> 	return true;
>>> }
>>> 
>>> +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
>>> +					struct nfs_pgio_header *hdr)
>>> +{
>>> +	struct nfs_server *server = NFS_SERVER(hdr->inode);
>>> +	struct rpc_message *msg = &task->tk_msg;
>>> +
>>> +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
>>> +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
>>> +		server->caps &= ~NFS_CAP_READ_PLUS;
>>> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>> +		rpc_restart_call_prepare(task);
>>> +		return true;
>>> +	}
>>> +	return false;
>>> +}
>>> +
>>> static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header
>>> *hdr)
>>> {
>>> -
>>> 	dprintk("--> %s\n", __func__);
>>> 
>>> 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
>>> 		return -EAGAIN;
>>> 	if (nfs4_read_stateid_changed(task, &hdr->args))
>>> 		return -EAGAIN;
>>> +	if (nfs4_read_plus_not_supported(task, hdr))
>>> +		return -EAGAIN;
>>> 	if (task->tk_status > 0)
>>> 		nfs_invalidate_atime(hdr->inode);
>>> 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
>>> 				    nfs4_read_done_cb(task, hdr);
>>> }
>>> 
>>> +#ifdef CONFIG_NFS_V4_2
>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct
>>> rpc_message *msg)
>>> +{
>>> +	if (server->caps & NFS_CAP_READ_PLUS)
>>> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
>>> +	else
>>> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>> +}
>>> +#else
>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct
>>> rpc_message *msg)
>>> +{
>>> +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>> +}
>>> +#endif /* CONFIG_NFS_V4_2 */
>>> +
>>> static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
>>> 				 struct rpc_message *msg)
>>> {
>>> 	hdr->timestamp   = jiffies;
>>> 	if (!hdr->pgio_done_cb)
>>> 		hdr->pgio_done_cb = nfs4_read_done_cb;
>>> -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>> +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
>>> 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
>>> }
>>> 
>>> @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops
>>> nfs_v4_2_minor_ops = {
>>> 		| NFS_CAP_SEEK
>>> 		| NFS_CAP_LAYOUTSTATS
>>> 		| NFS_CAP_CLONE
>>> -		| NFS_CAP_LAYOUTERROR,
>>> +		| NFS_CAP_LAYOUTERROR
>>> +		| NFS_CAP_READ_PLUS,
>>> 	.init_client = nfs41_init_client,
>>> 	.shutdown_client = nfs41_shutdown_client,
>>> 	.match_stateid = nfs41_match_stateid,
>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>> index 47817ef0aadb..68b2917d0537 100644
>>> --- a/fs/nfs/nfs4xdr.c
>>> +++ b/fs/nfs/nfs4xdr.c
>>> @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
>>> 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify),
>>> 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
>>> 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
>>> +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
>>> };
>>> 
>>> static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
>>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>>> index 82d8fb422092..c1eeef52545c 100644
>>> --- a/include/linux/nfs4.h
>>> +++ b/include/linux/nfs4.h
>>> @@ -540,8 +540,8 @@ enum {
>>> 
>>> 	NFSPROC4_CLNT_LOOKUPP,
>>> 	NFSPROC4_CLNT_LAYOUTERROR,
>>> -
>>> 	NFSPROC4_CLNT_COPY_NOTIFY,
>>> +	NFSPROC4_CLNT_READ_PLUS,
>>> };
>>> 
>>> /* nfs41 types */
>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>> index 465fa98258a3..11248c5a7b24 100644
>>> --- a/include/linux/nfs_fs_sb.h
>>> +++ b/include/linux/nfs_fs_sb.h
>>> @@ -281,5 +281,6 @@ struct nfs_server {
>>> #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
>>> #define NFS_CAP_LAYOUTERROR	(1U << 26)
>>> #define NFS_CAP_COPY_NOTIFY	(1U << 27)
>>> +#define NFS_CAP_READ_PLUS	(1U << 28)
>>> 
>>> #endif
>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>>> index 94c77ed55ce1..8efbf3d8b263 100644
>>> --- a/include/linux/nfs_xdr.h
>>> +++ b/include/linux/nfs_xdr.h
>>> @@ -655,7 +655,7 @@ struct nfs_pgio_args {
>>> struct nfs_pgio_res {
>>> 	struct nfs4_sequence_res	seq_res;
>>> 	struct nfs_fattr *	fattr;
>>> -	__u32			count;
>>> +	__u64			count;
>>> 	__u32			op_status;
>>> 	union {
>>> 		struct {
>>> -- 
>>> 2.25.0
>>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Anna Schumaker Feb. 20, 2020, 6:28 p.m. UTC | #4
On Thu, 2020-02-20 at 09:55 -0500, Chuck Lever wrote:
> > On Feb 20, 2020, at 9:42 AM, Anna Schumaker <schumaker.anna@gmail.com>
> > wrote:
> > 
> > On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote:
> > > > On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
> > > > 
> > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > 
> > > > This patch adds client support for decoding a single NFS4_CONTENT_DATA
> > > > segment returned by the server. This is the simplest implementation
> > > > possible, since it does not account for any hole segments in the reply.
> > > > 
> > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > ---
> > > > fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
> > > > fs/nfs/nfs4proc.c         |  43 +++++++++++-
> > > > fs/nfs/nfs4xdr.c          |   1 +
> > > > include/linux/nfs4.h      |   2 +-
> > > > include/linux/nfs_fs_sb.h |   1 +
> > > > include/linux/nfs_xdr.h   |   2 +-
> > > > 6 files changed, 182 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
> > > > index c03f3246d6c5..bf118ecabe2c 100644
> > > > --- a/fs/nfs/nfs42xdr.c
> > > > +++ b/fs/nfs/nfs42xdr.c
> > > > @@ -45,6 +45,15 @@
> > > > #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
> > > > 					encode_fallocate_maxsz)
> > > > #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
> > > > +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
> > > > +					encode_stateid_maxsz + 3)
> > > > +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
> > > > +					 2 /* data_info4.di_offset */ +
> > > > \
> > > > +					 2 /* data_info4.di_length */)
> > > > +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
> > > > +					 1 /* rpr_eof */ + \
> > > > +					 1 /* rpr_contents count */ + \
> > > > +					NFS42_READ_PLUS_SEGMENT_SIZE)
> > > > #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
> > > > 					encode_stateid_maxsz + \
> > > > 					 2 /* offset */ + \
> > > > @@ -128,6 +137,14 @@
> > > > 					decode_putfh_maxsz + \
> > > > 					decode_deallocate_maxsz + \
> > > > 					decode_getattr_maxsz)
> > > > +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
> > > > +					encode_sequence_maxsz + \
> > > > +					encode_putfh_maxsz + \
> > > > +					encode_read_plus_maxsz)
> > > > +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
> > > > +					decode_sequence_maxsz + \
> > > > +					decode_putfh_maxsz + \
> > > > +					decode_read_plus_maxsz)
> > > > #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
> > > > 					encode_sequence_maxsz + \
> > > > 					encode_putfh_maxsz + \
> > > > @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream
> > > > *xdr,
> > > > 	encode_fallocate(xdr, args);
> > > > }
> > > > 
> > > > +static void encode_read_plus(struct xdr_stream *xdr,
> > > > +			     const struct nfs_pgio_args *args,
> > > > +			     struct compound_hdr *hdr)
> > > > +{
> > > > +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
> > > > +	encode_nfs4_stateid(xdr, &args->stateid);
> > > > +	encode_uint64(xdr, args->offset);
> > > > +	encode_uint32(xdr, args->count);
> > > > +}
> > > > +
> > > > static void encode_seek(struct xdr_stream *xdr,
> > > > 			const struct nfs42_seek_args *args,
> > > > 			struct compound_hdr *hdr)
> > > > @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst
> > > > *req,
> > > > 	encode_nops(&hdr);
> > > > }
> > > > 
> > > > +/*
> > > > + * Encode READ_PLUS request
> > > > + */
> > > > +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
> > > > +				   struct xdr_stream *xdr,
> > > > +				   const void *data)
> > > > +{
> > > > +	const struct nfs_pgio_args *args = data;
> > > > +	struct compound_hdr hdr = {
> > > > +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
> > > > +	};
> > > > +
> > > > +	encode_compound_hdr(xdr, req, &hdr);
> > > > +	encode_sequence(xdr, &args->seq_args, &hdr);
> > > > +	encode_putfh(xdr, args->fh, &hdr);
> > > > +	encode_read_plus(xdr, args, &hdr);
> > > > +
> > > > +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
> > > > +				args->count, hdr.replen);
> > > > +	req->rq_rcv_buf.flags |= XDRBUF_READ;
> > > 
> > > IMO this line is incorrect.
> > 
> > You're right, this line causes problems for RDMA with READ_PLUS. I added it
> > to
> > match how the other xdr read encoders were set up
> 
> Ja, I think just removing that line should be sufficient.
> Better would be replacing it with a comment explaining
> why this encoder does not set XDRBUF_READ. :-)
> 
> 
> > > RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
> > > as DDP-eligible. There's no way for a client to know how to set up
> > > Write chunks, unless it knows exactly where the file's holes are in
> > > advance. Even then... racy.
> > > 
> > > Just curious, have you tried READ_PLUS with proto=rdma ?
> > 
> > I haven't done in-depth performance testing, but I have been able to run it.
> 
> We should figure out whether that will have a regressive
> impact on NFS/RDMA workloads. I expect that it will, but
> the client can always set up the Reply chunk so that the
> READ payload fits precisely in an RDMA segment that lines
> up with page cache pages. That mitigates some impact.
> 
> If your patch set already changes NFSv4.2 mounts to always
> use READ_PLUS in place of READ, it might be prudent for the
> "proto=rdma" mount option to also set "noreadplus", at least
> for the time being.

I can make this change.

> 
> The down-side here is that would make NFSv4.2 on RDMA
> unable to recognize holes in files the same way as it
> does on TCP, and that's a pretty significant variation
> in behavior. Does "noreadplus" even deal with that?

Setting "noreadplus" just causes the client to use the READ operation instead,
so there should be no difference between v4.1 and v4.2 if the option is set.

Anna

> 
> 
> > Anna
> > 
> > > 
> > > > +	encode_nops(&hdr);
> > > > +}
> > > > +
> > > > /*
> > > > * Encode SEEK request
> > > > */
> > > > @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream
> > > > *xdr,
> > > > struct nfs42_falloc_res *re
> > > > 	return decode_op_hdr(xdr, OP_DEALLOCATE);
> > > > }
> > > > 
> > > > +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct
> > > > nfs_pgio_res *res,
> > > > +				      uint32_t *eof)
> > > > +{
> > > > +	__be32 *p;
> > > > +	uint32_t count, recvd;
> > > > +	uint64_t offset;
> > > > +
> > > > +	p = xdr_inline_decode(xdr, 8 + 4);
> > > > +	if (unlikely(!p))
> > > > +		return -EIO;
> > > > +
> > > > +	p = xdr_decode_hyper(p, &offset);
> > > > +	count = be32_to_cpup(p);
> > > > +	if (count == 0)
> > > > +		return 0;
> > > > +
> > > > +	recvd = xdr_read_pages(xdr, count);
> > > > +	if (count > recvd) {
> > > > +		dprintk("NFS: server cheating in read reply: "
> > > > +				"count %u > recvd %u\n", count, recvd);
> > > > +		count = recvd;
> > > > +		*eof = 0;
> > > > +	}
> > > > +
> > > > +	return count;
> > > > +}
> > > > +
> > > > +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res
> > > > *res)
> > > > +{
> > > > +	__be32 *p;
> > > > +	uint32_t count, eof, segments, type;
> > > > +	int status;
> > > > +
> > > > +	status = decode_op_hdr(xdr, OP_READ_PLUS);
> > > > +	if (status)
> > > > +		return status;
> > > > +
> > > > +	p = xdr_inline_decode(xdr, 4 + 4);
> > > > +	if (unlikely(!p))
> > > > +		return -EIO;
> > > > +
> > > > +	eof = be32_to_cpup(p++);
> > > > +	segments = be32_to_cpup(p++);
> > > > +	if (segments == 0)
> > > > +		return 0;
> > > > +
> > > > +	p = xdr_inline_decode(xdr, 4);
> > > > +	if (unlikely(!p))
> > > > +		return -EIO;
> > > > +
> > > > +	type = be32_to_cpup(p++);
> > > > +	if (type == NFS4_CONTENT_DATA)
> > > > +		count = decode_read_plus_data(xdr, res, &eof);
> > > > +	else
> > > > +		return -EINVAL;
> > > > +
> > > > +	res->eof = eof;
> > > > +	res->count = count;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res
> > > > *res)
> > > > {
> > > > 	int status;
> > > > @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst
> > > > *rqstp,
> > > > 	return status;
> > > > }
> > > > 
> > > > +/*
> > > > + * Decode READ_PLUS request
> > > > + */
> > > > +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
> > > > +				  struct xdr_stream *xdr,
> > > > +				  void *data)
> > > > +{
> > > > +	struct nfs_pgio_res *res = data;
> > > > +	struct compound_hdr hdr;
> > > > +	int status;
> > > > +
> > > > +	status = decode_compound_hdr(xdr, &hdr);
> > > > +	if (status)
> > > > +		goto out;
> > > > +	status = decode_sequence(xdr, &res->seq_res, rqstp);
> > > > +	if (status)
> > > > +		goto out;
> > > > +	status = decode_putfh(xdr);
> > > > +	if (status)
> > > > +		goto out;
> > > > +	status = decode_read_plus(xdr, res);
> > > > +	if (!status)
> > > > +		status = res->count;
> > > > +out:
> > > > +	return status;
> > > > +}
> > > > +
> > > > /*
> > > > * Decode SEEK request
> > > > */
> > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> > > > index 95d07a3dc5d1..ed3ec8c36273 100644
> > > > --- a/fs/nfs/nfs4proc.c
> > > > +++ b/fs/nfs/nfs4proc.c
> > > > @@ -69,6 +69,10 @@
> > > > 
> > > > #include "nfs4trace.h"
> > > > 
> > > > +#ifdef CONFIG_NFS_V4_2
> > > > +#include "nfs42.h"
> > > > +#endif /* CONFIG_NFS_V4_2 */
> > > > +
> > > > #define NFSDBG_FACILITY		NFSDBG_PROC
> > > > 
> > > > #define NFS4_BITMASK_SZ		3
> > > > @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct
> > > > rpc_task *task,
> > > > 	return true;
> > > > }
> > > > 
> > > > +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
> > > > +					struct nfs_pgio_header *hdr)
> > > > +{
> > > > +	struct nfs_server *server = NFS_SERVER(hdr->inode);
> > > > +	struct rpc_message *msg = &task->tk_msg;
> > > > +
> > > > +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]
> > > > &&
> > > > +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status ==
> > > > -ENOTSUPP) {
> > > > +		server->caps &= ~NFS_CAP_READ_PLUS;
> > > > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > +		rpc_restart_call_prepare(task);
> > > > +		return true;
> > > > +	}
> > > > +	return false;
> > > > +}
> > > > +
> > > > static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header
> > > > *hdr)
> > > > {
> > > > -
> > > > 	dprintk("--> %s\n", __func__);
> > > > 
> > > > 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
> > > > 		return -EAGAIN;
> > > > 	if (nfs4_read_stateid_changed(task, &hdr->args))
> > > > 		return -EAGAIN;
> > > > +	if (nfs4_read_plus_not_supported(task, hdr))
> > > > +		return -EAGAIN;
> > > > 	if (task->tk_status > 0)
> > > > 		nfs_invalidate_atime(hdr->inode);
> > > > 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
> > > > 				    nfs4_read_done_cb(task, hdr);
> > > > }
> > > > 
> > > > +#ifdef CONFIG_NFS_V4_2
> > > > +static void nfs42_read_plus_support(struct nfs_server *server, struct
> > > > rpc_message *msg)
> > > > +{
> > > > +	if (server->caps & NFS_CAP_READ_PLUS)
> > > > +		msg->rpc_proc =
> > > > &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
> > > > +	else
> > > > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > +}
> > > > +#else
> > > > +static void nfs42_read_plus_support(struct nfs_server *server, struct
> > > > rpc_message *msg)
> > > > +{
> > > > +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > +}
> > > > +#endif /* CONFIG_NFS_V4_2 */
> > > > +
> > > > static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
> > > > 				 struct rpc_message *msg)
> > > > {
> > > > 	hdr->timestamp   = jiffies;
> > > > 	if (!hdr->pgio_done_cb)
> > > > 		hdr->pgio_done_cb = nfs4_read_done_cb;
> > > > -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
> > > > 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0,
> > > > 0);
> > > > }
> > > > 
> > > > @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops
> > > > nfs_v4_2_minor_ops = {
> > > > 		| NFS_CAP_SEEK
> > > > 		| NFS_CAP_LAYOUTSTATS
> > > > 		| NFS_CAP_CLONE
> > > > -		| NFS_CAP_LAYOUTERROR,
> > > > +		| NFS_CAP_LAYOUTERROR
> > > > +		| NFS_CAP_READ_PLUS,
> > > > 	.init_client = nfs41_init_client,
> > > > 	.shutdown_client = nfs41_shutdown_client,
> > > > 	.match_stateid = nfs41_match_stateid,
> > > > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > > > index 47817ef0aadb..68b2917d0537 100644
> > > > --- a/fs/nfs/nfs4xdr.c
> > > > +++ b/fs/nfs/nfs4xdr.c
> > > > @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
> > > > 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify)
> > > > ,
> > > > 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
> > > > 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror)
> > > > ,
> > > > +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
> > > > };
> > > > 
> > > > static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
> > > > diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> > > > index 82d8fb422092..c1eeef52545c 100644
> > > > --- a/include/linux/nfs4.h
> > > > +++ b/include/linux/nfs4.h
> > > > @@ -540,8 +540,8 @@ enum {
> > > > 
> > > > 	NFSPROC4_CLNT_LOOKUPP,
> > > > 	NFSPROC4_CLNT_LAYOUTERROR,
> > > > -
> > > > 	NFSPROC4_CLNT_COPY_NOTIFY,
> > > > +	NFSPROC4_CLNT_READ_PLUS,
> > > > };
> > > > 
> > > > /* nfs41 types */
> > > > diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> > > > index 465fa98258a3..11248c5a7b24 100644
> > > > --- a/include/linux/nfs_fs_sb.h
> > > > +++ b/include/linux/nfs_fs_sb.h
> > > > @@ -281,5 +281,6 @@ struct nfs_server {
> > > > #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
> > > > #define NFS_CAP_LAYOUTERROR	(1U << 26)
> > > > #define NFS_CAP_COPY_NOTIFY	(1U << 27)
> > > > +#define NFS_CAP_READ_PLUS	(1U << 28)
> > > > 
> > > > #endif
> > > > diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> > > > index 94c77ed55ce1..8efbf3d8b263 100644
> > > > --- a/include/linux/nfs_xdr.h
> > > > +++ b/include/linux/nfs_xdr.h
> > > > @@ -655,7 +655,7 @@ struct nfs_pgio_args {
> > > > struct nfs_pgio_res {
> > > > 	struct nfs4_sequence_res	seq_res;
> > > > 	struct nfs_fattr *	fattr;
> > > > -	__u32			count;
> > > > +	__u64			count;
> > > > 	__u32			op_status;
> > > > 	union {
> > > > 		struct {
> > > > -- 
> > > > 2.25.0
> > > > 
> > > 
> > > --
> > > Chuck Lever
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Feb. 20, 2020, 6:30 p.m. UTC | #5
> On Feb 20, 2020, at 1:28 PM, Anna Schumaker <schumaker.anna@gmail.com> wrote:
> 
> On Thu, 2020-02-20 at 09:55 -0500, Chuck Lever wrote:
>>> On Feb 20, 2020, at 9:42 AM, Anna Schumaker <schumaker.anna@gmail.com>
>>> wrote:
>>> 
>>> On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote:
>>>>> On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
>>>>> 
>>>>> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>> 
>>>>> This patch adds client support for decoding a single NFS4_CONTENT_DATA
>>>>> segment returned by the server. This is the simplest implementation
>>>>> possible, since it does not account for any hole segments in the reply.
>>>>> 
>>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
>>>>> ---
>>>>> fs/nfs/nfs42xdr.c         | 138 ++++++++++++++++++++++++++++++++++++++
>>>>> fs/nfs/nfs4proc.c         |  43 +++++++++++-
>>>>> fs/nfs/nfs4xdr.c          |   1 +
>>>>> include/linux/nfs4.h      |   2 +-
>>>>> include/linux/nfs_fs_sb.h |   1 +
>>>>> include/linux/nfs_xdr.h   |   2 +-
>>>>> 6 files changed, 182 insertions(+), 5 deletions(-)
>>>>> 
>>>>> diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
>>>>> index c03f3246d6c5..bf118ecabe2c 100644
>>>>> --- a/fs/nfs/nfs42xdr.c
>>>>> +++ b/fs/nfs/nfs42xdr.c
>>>>> @@ -45,6 +45,15 @@
>>>>> #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
>>>>> 					encode_fallocate_maxsz)
>>>>> #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
>>>>> +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
>>>>> +					encode_stateid_maxsz + 3)
>>>>> +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
>>>>> +					 2 /* data_info4.di_offset */ +
>>>>> \
>>>>> +					 2 /* data_info4.di_length */)
>>>>> +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
>>>>> +					 1 /* rpr_eof */ + \
>>>>> +					 1 /* rpr_contents count */ + \
>>>>> +					NFS42_READ_PLUS_SEGMENT_SIZE)
>>>>> #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
>>>>> 					encode_stateid_maxsz + \
>>>>> 					 2 /* offset */ + \
>>>>> @@ -128,6 +137,14 @@
>>>>> 					decode_putfh_maxsz + \
>>>>> 					decode_deallocate_maxsz + \
>>>>> 					decode_getattr_maxsz)
>>>>> +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
>>>>> +					encode_sequence_maxsz + \
>>>>> +					encode_putfh_maxsz + \
>>>>> +					encode_read_plus_maxsz)
>>>>> +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
>>>>> +					decode_sequence_maxsz + \
>>>>> +					decode_putfh_maxsz + \
>>>>> +					decode_read_plus_maxsz)
>>>>> #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
>>>>> 					encode_sequence_maxsz + \
>>>>> 					encode_putfh_maxsz + \
>>>>> @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream
>>>>> *xdr,
>>>>> 	encode_fallocate(xdr, args);
>>>>> }
>>>>> 
>>>>> +static void encode_read_plus(struct xdr_stream *xdr,
>>>>> +			     const struct nfs_pgio_args *args,
>>>>> +			     struct compound_hdr *hdr)
>>>>> +{
>>>>> +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
>>>>> +	encode_nfs4_stateid(xdr, &args->stateid);
>>>>> +	encode_uint64(xdr, args->offset);
>>>>> +	encode_uint32(xdr, args->count);
>>>>> +}
>>>>> +
>>>>> static void encode_seek(struct xdr_stream *xdr,
>>>>> 			const struct nfs42_seek_args *args,
>>>>> 			struct compound_hdr *hdr)
>>>>> @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst
>>>>> *req,
>>>>> 	encode_nops(&hdr);
>>>>> }
>>>>> 
>>>>> +/*
>>>>> + * Encode READ_PLUS request
>>>>> + */
>>>>> +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
>>>>> +				   struct xdr_stream *xdr,
>>>>> +				   const void *data)
>>>>> +{
>>>>> +	const struct nfs_pgio_args *args = data;
>>>>> +	struct compound_hdr hdr = {
>>>>> +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
>>>>> +	};
>>>>> +
>>>>> +	encode_compound_hdr(xdr, req, &hdr);
>>>>> +	encode_sequence(xdr, &args->seq_args, &hdr);
>>>>> +	encode_putfh(xdr, args->fh, &hdr);
>>>>> +	encode_read_plus(xdr, args, &hdr);
>>>>> +
>>>>> +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
>>>>> +				args->count, hdr.replen);
>>>>> +	req->rq_rcv_buf.flags |= XDRBUF_READ;
>>>> 
>>>> IMO this line is incorrect.
>>> 
>>> You're right, this line causes problems for RDMA with READ_PLUS. I added it
>>> to
>>> match how the other xdr read encoders were set up
>> 
>> Ja, I think just removing that line should be sufficient.
>> Better would be replacing it with a comment explaining
>> why this encoder does not set XDRBUF_READ. :-)
>> 
>> 
>>>> RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
>>>> as DDP-eligible. There's no way for a client to know how to set up
>>>> Write chunks, unless it knows exactly where the file's holes are in
>>>> advance. Even then... racy.
>>>> 
>>>> Just curious, have you tried READ_PLUS with proto=rdma ?
>>> 
>>> I haven't done in-depth performance testing, but I have been able to run it.
>> 
>> We should figure out whether that will have a regressive
>> impact on NFS/RDMA workloads. I expect that it will, but
>> the client can always set up the Reply chunk so that the
>> READ payload fits precisely in an RDMA segment that lines
>> up with page cache pages. That mitigates some impact.
>> 
>> If your patch set already changes NFSv4.2 mounts to always
>> use READ_PLUS in place of READ, it might be prudent for the
>> "proto=rdma" mount option to also set "noreadplus", at least
>> for the time being.
> 
> I can make this change.
> 
>> 
>> The down-side here is that would make NFSv4.2 on RDMA
>> unable to recognize holes in files the same way as it
>> does on TCP, and that's a pretty significant variation
>> in behavior. Does "noreadplus" even deal with that?
> 
> Setting "noreadplus" just causes the client to use the READ operation instead,
> so there should be no difference between v4.1 and v4.2 if the option is set.

My concern is the difference between NFSv4.2 with noreadplus
and NFSv4.2 with readplus. The former is not able to detect
holes in files on the server, but the latter is.

Is that worth mentioning in the man page, or in release notes
when NFSv4.2 becomes the default?


> Anna
> 
>> 
>> 
>>> Anna
>>> 
>>>> 
>>>>> +	encode_nops(&hdr);
>>>>> +}
>>>>> +
>>>>> /*
>>>>> * Encode SEEK request
>>>>> */
>>>>> @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream
>>>>> *xdr,
>>>>> struct nfs42_falloc_res *re
>>>>> 	return decode_op_hdr(xdr, OP_DEALLOCATE);
>>>>> }
>>>>> 
>>>>> +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct
>>>>> nfs_pgio_res *res,
>>>>> +				      uint32_t *eof)
>>>>> +{
>>>>> +	__be32 *p;
>>>>> +	uint32_t count, recvd;
>>>>> +	uint64_t offset;
>>>>> +
>>>>> +	p = xdr_inline_decode(xdr, 8 + 4);
>>>>> +	if (unlikely(!p))
>>>>> +		return -EIO;
>>>>> +
>>>>> +	p = xdr_decode_hyper(p, &offset);
>>>>> +	count = be32_to_cpup(p);
>>>>> +	if (count == 0)
>>>>> +		return 0;
>>>>> +
>>>>> +	recvd = xdr_read_pages(xdr, count);
>>>>> +	if (count > recvd) {
>>>>> +		dprintk("NFS: server cheating in read reply: "
>>>>> +				"count %u > recvd %u\n", count, recvd);
>>>>> +		count = recvd;
>>>>> +		*eof = 0;
>>>>> +	}
>>>>> +
>>>>> +	return count;
>>>>> +}
>>>>> +
>>>>> +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res
>>>>> *res)
>>>>> +{
>>>>> +	__be32 *p;
>>>>> +	uint32_t count, eof, segments, type;
>>>>> +	int status;
>>>>> +
>>>>> +	status = decode_op_hdr(xdr, OP_READ_PLUS);
>>>>> +	if (status)
>>>>> +		return status;
>>>>> +
>>>>> +	p = xdr_inline_decode(xdr, 4 + 4);
>>>>> +	if (unlikely(!p))
>>>>> +		return -EIO;
>>>>> +
>>>>> +	eof = be32_to_cpup(p++);
>>>>> +	segments = be32_to_cpup(p++);
>>>>> +	if (segments == 0)
>>>>> +		return 0;
>>>>> +
>>>>> +	p = xdr_inline_decode(xdr, 4);
>>>>> +	if (unlikely(!p))
>>>>> +		return -EIO;
>>>>> +
>>>>> +	type = be32_to_cpup(p++);
>>>>> +	if (type == NFS4_CONTENT_DATA)
>>>>> +		count = decode_read_plus_data(xdr, res, &eof);
>>>>> +	else
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	res->eof = eof;
>>>>> +	res->count = count;
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res
>>>>> *res)
>>>>> {
>>>>> 	int status;
>>>>> @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst
>>>>> *rqstp,
>>>>> 	return status;
>>>>> }
>>>>> 
>>>>> +/*
>>>>> + * Decode READ_PLUS request
>>>>> + */
>>>>> +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
>>>>> +				  struct xdr_stream *xdr,
>>>>> +				  void *data)
>>>>> +{
>>>>> +	struct nfs_pgio_res *res = data;
>>>>> +	struct compound_hdr hdr;
>>>>> +	int status;
>>>>> +
>>>>> +	status = decode_compound_hdr(xdr, &hdr);
>>>>> +	if (status)
>>>>> +		goto out;
>>>>> +	status = decode_sequence(xdr, &res->seq_res, rqstp);
>>>>> +	if (status)
>>>>> +		goto out;
>>>>> +	status = decode_putfh(xdr);
>>>>> +	if (status)
>>>>> +		goto out;
>>>>> +	status = decode_read_plus(xdr, res);
>>>>> +	if (!status)
>>>>> +		status = res->count;
>>>>> +out:
>>>>> +	return status;
>>>>> +}
>>>>> +
>>>>> /*
>>>>> * Decode SEEK request
>>>>> */
>>>>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>>>>> index 95d07a3dc5d1..ed3ec8c36273 100644
>>>>> --- a/fs/nfs/nfs4proc.c
>>>>> +++ b/fs/nfs/nfs4proc.c
>>>>> @@ -69,6 +69,10 @@
>>>>> 
>>>>> #include "nfs4trace.h"
>>>>> 
>>>>> +#ifdef CONFIG_NFS_V4_2
>>>>> +#include "nfs42.h"
>>>>> +#endif /* CONFIG_NFS_V4_2 */
>>>>> +
>>>>> #define NFSDBG_FACILITY		NFSDBG_PROC
>>>>> 
>>>>> #define NFS4_BITMASK_SZ		3
>>>>> @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct
>>>>> rpc_task *task,
>>>>> 	return true;
>>>>> }
>>>>> 
>>>>> +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
>>>>> +					struct nfs_pgio_header *hdr)
>>>>> +{
>>>>> +	struct nfs_server *server = NFS_SERVER(hdr->inode);
>>>>> +	struct rpc_message *msg = &task->tk_msg;
>>>>> +
>>>>> +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]
>>>>> &&
>>>>> +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status ==
>>>>> -ENOTSUPP) {
>>>>> +		server->caps &= ~NFS_CAP_READ_PLUS;
>>>>> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>>>> +		rpc_restart_call_prepare(task);
>>>>> +		return true;
>>>>> +	}
>>>>> +	return false;
>>>>> +}
>>>>> +
>>>>> static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header
>>>>> *hdr)
>>>>> {
>>>>> -
>>>>> 	dprintk("--> %s\n", __func__);
>>>>> 
>>>>> 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
>>>>> 		return -EAGAIN;
>>>>> 	if (nfs4_read_stateid_changed(task, &hdr->args))
>>>>> 		return -EAGAIN;
>>>>> +	if (nfs4_read_plus_not_supported(task, hdr))
>>>>> +		return -EAGAIN;
>>>>> 	if (task->tk_status > 0)
>>>>> 		nfs_invalidate_atime(hdr->inode);
>>>>> 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
>>>>> 				    nfs4_read_done_cb(task, hdr);
>>>>> }
>>>>> 
>>>>> +#ifdef CONFIG_NFS_V4_2
>>>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct
>>>>> rpc_message *msg)
>>>>> +{
>>>>> +	if (server->caps & NFS_CAP_READ_PLUS)
>>>>> +		msg->rpc_proc =
>>>>> &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
>>>>> +	else
>>>>> +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>>>> +}
>>>>> +#else
>>>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct
>>>>> rpc_message *msg)
>>>>> +{
>>>>> +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>>>> +}
>>>>> +#endif /* CONFIG_NFS_V4_2 */
>>>>> +
>>>>> static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
>>>>> 				 struct rpc_message *msg)
>>>>> {
>>>>> 	hdr->timestamp   = jiffies;
>>>>> 	if (!hdr->pgio_done_cb)
>>>>> 		hdr->pgio_done_cb = nfs4_read_done_cb;
>>>>> -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
>>>>> +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
>>>>> 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0,
>>>>> 0);
>>>>> }
>>>>> 
>>>>> @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops
>>>>> nfs_v4_2_minor_ops = {
>>>>> 		| NFS_CAP_SEEK
>>>>> 		| NFS_CAP_LAYOUTSTATS
>>>>> 		| NFS_CAP_CLONE
>>>>> -		| NFS_CAP_LAYOUTERROR,
>>>>> +		| NFS_CAP_LAYOUTERROR
>>>>> +		| NFS_CAP_READ_PLUS,
>>>>> 	.init_client = nfs41_init_client,
>>>>> 	.shutdown_client = nfs41_shutdown_client,
>>>>> 	.match_stateid = nfs41_match_stateid,
>>>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>>>>> index 47817ef0aadb..68b2917d0537 100644
>>>>> --- a/fs/nfs/nfs4xdr.c
>>>>> +++ b/fs/nfs/nfs4xdr.c
>>>>> @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
>>>>> 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify)
>>>>> ,
>>>>> 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
>>>>> 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror)
>>>>> ,
>>>>> +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
>>>>> };
>>>>> 
>>>>> static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
>>>>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>>>>> index 82d8fb422092..c1eeef52545c 100644
>>>>> --- a/include/linux/nfs4.h
>>>>> +++ b/include/linux/nfs4.h
>>>>> @@ -540,8 +540,8 @@ enum {
>>>>> 
>>>>> 	NFSPROC4_CLNT_LOOKUPP,
>>>>> 	NFSPROC4_CLNT_LAYOUTERROR,
>>>>> -
>>>>> 	NFSPROC4_CLNT_COPY_NOTIFY,
>>>>> +	NFSPROC4_CLNT_READ_PLUS,
>>>>> };
>>>>> 
>>>>> /* nfs41 types */
>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>> index 465fa98258a3..11248c5a7b24 100644
>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>> @@ -281,5 +281,6 @@ struct nfs_server {
>>>>> #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
>>>>> #define NFS_CAP_LAYOUTERROR	(1U << 26)
>>>>> #define NFS_CAP_COPY_NOTIFY	(1U << 27)
>>>>> +#define NFS_CAP_READ_PLUS	(1U << 28)
>>>>> 
>>>>> #endif
>>>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>>>>> index 94c77ed55ce1..8efbf3d8b263 100644
>>>>> --- a/include/linux/nfs_xdr.h
>>>>> +++ b/include/linux/nfs_xdr.h
>>>>> @@ -655,7 +655,7 @@ struct nfs_pgio_args {
>>>>> struct nfs_pgio_res {
>>>>> 	struct nfs4_sequence_res	seq_res;
>>>>> 	struct nfs_fattr *	fattr;
>>>>> -	__u32			count;
>>>>> +	__u64			count;
>>>>> 	__u32			op_status;
>>>>> 	union {
>>>>> 		struct {
>>>>> -- 
>>>>> 2.25.0
>>>>> 
>>>> 
>>>> --
>>>> Chuck Lever
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Anna Schumaker Feb. 20, 2020, 6:35 p.m. UTC | #6
On Thu, 2020-02-20 at 13:30 -0500, Chuck Lever wrote:
> > On Feb 20, 2020, at 1:28 PM, Anna Schumaker <schumaker.anna@gmail.com>
> > wrote:
> > 
> > On Thu, 2020-02-20 at 09:55 -0500, Chuck Lever wrote:
> > > > On Feb 20, 2020, at 9:42 AM, Anna Schumaker <schumaker.anna@gmail.com>
> > > > wrote:
> > > > 
> > > > On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote:
> > > > > > On Feb 14, 2020, at 4:12 PM, schumaker.anna@gmail.com wrote:
> > > > > > 
> > > > > > From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > 
> > > > > > This patch adds client support for decoding a single
> > > > > > NFS4_CONTENT_DATA
> > > > > > segment returned by the server. This is the simplest implementation
> > > > > > possible, since it does not account for any hole segments in the
> > > > > > reply.
> > > > > > 
> > > > > > Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> > > > > > ---
> > > > > > fs/nfs/nfs42xdr.c         | 138
> > > > > > ++++++++++++++++++++++++++++++++++++++
> > > > > > fs/nfs/nfs4proc.c         |  43 +++++++++++-
> > > > > > fs/nfs/nfs4xdr.c          |   1 +
> > > > > > include/linux/nfs4.h      |   2 +-
> > > > > > include/linux/nfs_fs_sb.h |   1 +
> > > > > > include/linux/nfs_xdr.h   |   2 +-
> > > > > > 6 files changed, 182 insertions(+), 5 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
> > > > > > index c03f3246d6c5..bf118ecabe2c 100644
> > > > > > --- a/fs/nfs/nfs42xdr.c
> > > > > > +++ b/fs/nfs/nfs42xdr.c
> > > > > > @@ -45,6 +45,15 @@
> > > > > > #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
> > > > > > 					encode_fallocate_maxsz)
> > > > > > #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
> > > > > > +#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
> > > > > > +					encode_stateid_maxsz + 3)
> > > > > > +#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ +
> > > > > > \
> > > > > > +					 2 /* data_info4.di_offset */ +
> > > > > > \
> > > > > > +					 2 /* data_info4.di_length */)
> > > > > > +#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
> > > > > > +					 1 /* rpr_eof */ + \
> > > > > > +					 1 /* rpr_contents count */ + \
> > > > > > +					NFS42_READ_PLUS_SEGMENT_SIZE)
> > > > > > #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
> > > > > > 					encode_stateid_maxsz + \
> > > > > > 					 2 /* offset */ + \
> > > > > > @@ -128,6 +137,14 @@
> > > > > > 					decode_putfh_maxsz + \
> > > > > > 					decode_deallocate_maxsz + \
> > > > > > 					decode_getattr_maxsz)
> > > > > > +#define NFS4_enc_read_plus_sz		(compound_encode_hdr_max
> > > > > > sz + \
> > > > > > +					encode_sequence_maxsz + \
> > > > > > +					encode_putfh_maxsz + \
> > > > > > +					encode_read_plus_maxsz)
> > > > > > +#define NFS4_dec_read_plus_sz		(compound_decode_hdr_max
> > > > > > sz + \
> > > > > > +					decode_sequence_maxsz + \
> > > > > > +					decode_putfh_maxsz + \
> > > > > > +					decode_read_plus_maxsz)
> > > > > > #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
> > > > > > 					encode_sequence_maxsz + \
> > > > > > 					encode_putfh_maxsz + \
> > > > > > @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream
> > > > > > *xdr,
> > > > > > 	encode_fallocate(xdr, args);
> > > > > > }
> > > > > > 
> > > > > > +static void encode_read_plus(struct xdr_stream *xdr,
> > > > > > +			     const struct nfs_pgio_args *args,
> > > > > > +			     struct compound_hdr *hdr)
> > > > > > +{
> > > > > > +	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
> > > > > > +	encode_nfs4_stateid(xdr, &args->stateid);
> > > > > > +	encode_uint64(xdr, args->offset);
> > > > > > +	encode_uint32(xdr, args->count);
> > > > > > +}
> > > > > > +
> > > > > > static void encode_seek(struct xdr_stream *xdr,
> > > > > > 			const struct nfs42_seek_args *args,
> > > > > > 			struct compound_hdr *hdr)
> > > > > > @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct
> > > > > > rpc_rqst
> > > > > > *req,
> > > > > > 	encode_nops(&hdr);
> > > > > > }
> > > > > > 
> > > > > > +/*
> > > > > > + * Encode READ_PLUS request
> > > > > > + */
> > > > > > +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
> > > > > > +				   struct xdr_stream *xdr,
> > > > > > +				   const void *data)
> > > > > > +{
> > > > > > +	const struct nfs_pgio_args *args = data;
> > > > > > +	struct compound_hdr hdr = {
> > > > > > +		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
> > > > > > +	};
> > > > > > +
> > > > > > +	encode_compound_hdr(xdr, req, &hdr);
> > > > > > +	encode_sequence(xdr, &args->seq_args, &hdr);
> > > > > > +	encode_putfh(xdr, args->fh, &hdr);
> > > > > > +	encode_read_plus(xdr, args, &hdr);
> > > > > > +
> > > > > > +	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
> > > > > > +				args->count, hdr.replen);
> > > > > > +	req->rq_rcv_buf.flags |= XDRBUF_READ;
> > > > > 
> > > > > IMO this line is incorrect.
> > > > 
> > > > You're right, this line causes problems for RDMA with READ_PLUS. I added
> > > > it
> > > > to
> > > > match how the other xdr read encoders were set up
> > > 
> > > Ja, I think just removing that line should be sufficient.
> > > Better would be replacing it with a comment explaining
> > > why this encoder does not set XDRBUF_READ. :-)
> > > 
> > > 
> > > > > RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
> > > > > as DDP-eligible. There's no way for a client to know how to set up
> > > > > Write chunks, unless it knows exactly where the file's holes are in
> > > > > advance. Even then... racy.
> > > > > 
> > > > > Just curious, have you tried READ_PLUS with proto=rdma ?
> > > > 
> > > > I haven't done in-depth performance testing, but I have been able to run
> > > > it.
> > > 
> > > We should figure out whether that will have a regressive
> > > impact on NFS/RDMA workloads. I expect that it will, but
> > > the client can always set up the Reply chunk so that the
> > > READ payload fits precisely in an RDMA segment that lines
> > > up with page cache pages. That mitigates some impact.
> > > 
> > > If your patch set already changes NFSv4.2 mounts to always
> > > use READ_PLUS in place of READ, it might be prudent for the
> > > "proto=rdma" mount option to also set "noreadplus", at least
> > > for the time being.
> > 
> > I can make this change.
> > 
> > > The down-side here is that would make NFSv4.2 on RDMA
> > > unable to recognize holes in files the same way as it
> > > does on TCP, and that's a pretty significant variation
> > > in behavior. Does "noreadplus" even deal with that?
> > 
> > Setting "noreadplus" just causes the client to use the READ operation
> > instead,
> > so there should be no difference between v4.1 and v4.2 if the option is set.
> 
> My concern is the difference between NFSv4.2 with noreadplus
> and NFSv4.2 with readplus. The former is not able to detect
> holes in files on the server, but the latter is.

The client could still use lseek to detect holes. The client throws away the
hole information after xdr decoding, and zeroes out the corresponding pages for
the page cache.

> 
> Is that worth mentioning in the man page, or in release notes
> when NFSv4.2 becomes the default?
> 
> 
> > Anna
> > 
> > > 
> > > > Anna
> > > > 
> > > > > > +	encode_nops(&hdr);
> > > > > > +}
> > > > > > +
> > > > > > /*
> > > > > > * Encode SEEK request
> > > > > > */
> > > > > > @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream
> > > > > > *xdr,
> > > > > > struct nfs42_falloc_res *re
> > > > > > 	return decode_op_hdr(xdr, OP_DEALLOCATE);
> > > > > > }
> > > > > > 
> > > > > > +static uint32_t decode_read_plus_data(struct xdr_stream *xdr,
> > > > > > struct
> > > > > > nfs_pgio_res *res,
> > > > > > +				      uint32_t *eof)
> > > > > > +{
> > > > > > +	__be32 *p;
> > > > > > +	uint32_t count, recvd;
> > > > > > +	uint64_t offset;
> > > > > > +
> > > > > > +	p = xdr_inline_decode(xdr, 8 + 4);
> > > > > > +	if (unlikely(!p))
> > > > > > +		return -EIO;
> > > > > > +
> > > > > > +	p = xdr_decode_hyper(p, &offset);
> > > > > > +	count = be32_to_cpup(p);
> > > > > > +	if (count == 0)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	recvd = xdr_read_pages(xdr, count);
> > > > > > +	if (count > recvd) {
> > > > > > +		dprintk("NFS: server cheating in read reply: "
> > > > > > +				"count %u > recvd %u\n", count, recvd);
> > > > > > +		count = recvd;
> > > > > > +		*eof = 0;
> > > > > > +	}
> > > > > > +
> > > > > > +	return count;
> > > > > > +}
> > > > > > +
> > > > > > +static int decode_read_plus(struct xdr_stream *xdr, struct
> > > > > > nfs_pgio_res
> > > > > > *res)
> > > > > > +{
> > > > > > +	__be32 *p;
> > > > > > +	uint32_t count, eof, segments, type;
> > > > > > +	int status;
> > > > > > +
> > > > > > +	status = decode_op_hdr(xdr, OP_READ_PLUS);
> > > > > > +	if (status)
> > > > > > +		return status;
> > > > > > +
> > > > > > +	p = xdr_inline_decode(xdr, 4 + 4);
> > > > > > +	if (unlikely(!p))
> > > > > > +		return -EIO;
> > > > > > +
> > > > > > +	eof = be32_to_cpup(p++);
> > > > > > +	segments = be32_to_cpup(p++);
> > > > > > +	if (segments == 0)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	p = xdr_inline_decode(xdr, 4);
> > > > > > +	if (unlikely(!p))
> > > > > > +		return -EIO;
> > > > > > +
> > > > > > +	type = be32_to_cpup(p++);
> > > > > > +	if (type == NFS4_CONTENT_DATA)
> > > > > > +		count = decode_read_plus_data(xdr, res, &eof);
> > > > > > +	else
> > > > > > +		return -EINVAL;
> > > > > > +
> > > > > > +	res->eof = eof;
> > > > > > +	res->count = count;
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res
> > > > > > *res)
> > > > > > {
> > > > > > 	int status;
> > > > > > @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct
> > > > > > rpc_rqst
> > > > > > *rqstp,
> > > > > > 	return status;
> > > > > > }
> > > > > > 
> > > > > > +/*
> > > > > > + * Decode READ_PLUS request
> > > > > > + */
> > > > > > +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
> > > > > > +				  struct xdr_stream *xdr,
> > > > > > +				  void *data)
> > > > > > +{
> > > > > > +	struct nfs_pgio_res *res = data;
> > > > > > +	struct compound_hdr hdr;
> > > > > > +	int status;
> > > > > > +
> > > > > > +	status = decode_compound_hdr(xdr, &hdr);
> > > > > > +	if (status)
> > > > > > +		goto out;
> > > > > > +	status = decode_sequence(xdr, &res->seq_res, rqstp);
> > > > > > +	if (status)
> > > > > > +		goto out;
> > > > > > +	status = decode_putfh(xdr);
> > > > > > +	if (status)
> > > > > > +		goto out;
> > > > > > +	status = decode_read_plus(xdr, res);
> > > > > > +	if (!status)
> > > > > > +		status = res->count;
> > > > > > +out:
> > > > > > +	return status;
> > > > > > +}
> > > > > > +
> > > > > > /*
> > > > > > * Decode SEEK request
> > > > > > */
> > > > > > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> > > > > > index 95d07a3dc5d1..ed3ec8c36273 100644
> > > > > > --- a/fs/nfs/nfs4proc.c
> > > > > > +++ b/fs/nfs/nfs4proc.c
> > > > > > @@ -69,6 +69,10 @@
> > > > > > 
> > > > > > #include "nfs4trace.h"
> > > > > > 
> > > > > > +#ifdef CONFIG_NFS_V4_2
> > > > > > +#include "nfs42.h"
> > > > > > +#endif /* CONFIG_NFS_V4_2 */
> > > > > > +
> > > > > > #define NFSDBG_FACILITY		NFSDBG_PROC
> > > > > > 
> > > > > > #define NFS4_BITMASK_SZ		3
> > > > > > @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct
> > > > > > rpc_task *task,
> > > > > > 	return true;
> > > > > > }
> > > > > > 
> > > > > > +static bool nfs4_read_plus_not_supported(struct rpc_task *task,
> > > > > > +					struct nfs_pgio_header *hdr)
> > > > > > +{
> > > > > > +	struct nfs_server *server = NFS_SERVER(hdr->inode);
> > > > > > +	struct rpc_message *msg = &task->tk_msg;
> > > > > > +
> > > > > > +	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]
> > > > > > &&
> > > > > > +	    server->caps & NFS_CAP_READ_PLUS && task->tk_status ==
> > > > > > -ENOTSUPP) {
> > > > > > +		server->caps &= ~NFS_CAP_READ_PLUS;
> > > > > > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > > > +		rpc_restart_call_prepare(task);
> > > > > > +		return true;
> > > > > > +	}
> > > > > > +	return false;
> > > > > > +}
> > > > > > +
> > > > > > static int nfs4_read_done(struct rpc_task *task, struct
> > > > > > nfs_pgio_header
> > > > > > *hdr)
> > > > > > {
> > > > > > -
> > > > > > 	dprintk("--> %s\n", __func__);
> > > > > > 
> > > > > > 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
> > > > > > 		return -EAGAIN;
> > > > > > 	if (nfs4_read_stateid_changed(task, &hdr->args))
> > > > > > 		return -EAGAIN;
> > > > > > +	if (nfs4_read_plus_not_supported(task, hdr))
> > > > > > +		return -EAGAIN;
> > > > > > 	if (task->tk_status > 0)
> > > > > > 		nfs_invalidate_atime(hdr->inode);
> > > > > > 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
> > > > > > 				    nfs4_read_done_cb(task, hdr);
> > > > > > }
> > > > > > 
> > > > > > +#ifdef CONFIG_NFS_V4_2
> > > > > > +static void nfs42_read_plus_support(struct nfs_server *server,
> > > > > > struct
> > > > > > rpc_message *msg)
> > > > > > +{
> > > > > > +	if (server->caps & NFS_CAP_READ_PLUS)
> > > > > > +		msg->rpc_proc =
> > > > > > &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
> > > > > > +	else
> > > > > > +		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > > > +}
> > > > > > +#else
> > > > > > +static void nfs42_read_plus_support(struct nfs_server *server,
> > > > > > struct
> > > > > > rpc_message *msg)
> > > > > > +{
> > > > > > +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > > > +}
> > > > > > +#endif /* CONFIG_NFS_V4_2 */
> > > > > > +
> > > > > > static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
> > > > > > 				 struct rpc_message *msg)
> > > > > > {
> > > > > > 	hdr->timestamp   = jiffies;
> > > > > > 	if (!hdr->pgio_done_cb)
> > > > > > 		hdr->pgio_done_cb = nfs4_read_done_cb;
> > > > > > -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
> > > > > > +	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
> > > > > > 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0,
> > > > > > 0);
> > > > > > }
> > > > > > 
> > > > > > @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops
> > > > > > nfs_v4_2_minor_ops = {
> > > > > > 		| NFS_CAP_SEEK
> > > > > > 		| NFS_CAP_LAYOUTSTATS
> > > > > > 		| NFS_CAP_CLONE
> > > > > > -		| NFS_CAP_LAYOUTERROR,
> > > > > > +		| NFS_CAP_LAYOUTERROR
> > > > > > +		| NFS_CAP_READ_PLUS,
> > > > > > 	.init_client = nfs41_init_client,
> > > > > > 	.shutdown_client = nfs41_shutdown_client,
> > > > > > 	.match_stateid = nfs41_match_stateid,
> > > > > > diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> > > > > > index 47817ef0aadb..68b2917d0537 100644
> > > > > > --- a/fs/nfs/nfs4xdr.c
> > > > > > +++ b/fs/nfs/nfs4xdr.c
> > > > > > @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] =
> > > > > > {
> > > > > > 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify)
> > > > > > ,
> > > > > > 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp)
> > > > > > ,
> > > > > > 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror)
> > > > > > ,
> > > > > > +	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
> > > > > > };
> > > > > > 
> > > > > > static unsigned int
> > > > > > nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
> > > > > > diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> > > > > > index 82d8fb422092..c1eeef52545c 100644
> > > > > > --- a/include/linux/nfs4.h
> > > > > > +++ b/include/linux/nfs4.h
> > > > > > @@ -540,8 +540,8 @@ enum {
> > > > > > 
> > > > > > 	NFSPROC4_CLNT_LOOKUPP,
> > > > > > 	NFSPROC4_CLNT_LAYOUTERROR,
> > > > > > -
> > > > > > 	NFSPROC4_CLNT_COPY_NOTIFY,
> > > > > > +	NFSPROC4_CLNT_READ_PLUS,
> > > > > > };
> > > > > > 
> > > > > > /* nfs41 types */
> > > > > > diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> > > > > > index 465fa98258a3..11248c5a7b24 100644
> > > > > > --- a/include/linux/nfs_fs_sb.h
> > > > > > +++ b/include/linux/nfs_fs_sb.h
> > > > > > @@ -281,5 +281,6 @@ struct nfs_server {
> > > > > > #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
> > > > > > #define NFS_CAP_LAYOUTERROR	(1U << 26)
> > > > > > #define NFS_CAP_COPY_NOTIFY	(1U << 27)
> > > > > > +#define NFS_CAP_READ_PLUS	(1U << 28)
> > > > > > 
> > > > > > #endif
> > > > > > diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> > > > > > index 94c77ed55ce1..8efbf3d8b263 100644
> > > > > > --- a/include/linux/nfs_xdr.h
> > > > > > +++ b/include/linux/nfs_xdr.h
> > > > > > @@ -655,7 +655,7 @@ struct nfs_pgio_args {
> > > > > > struct nfs_pgio_res {
> > > > > > 	struct nfs4_sequence_res	seq_res;
> > > > > > 	struct nfs_fattr *	fattr;
> > > > > > -	__u32			count;
> > > > > > +	__u64			count;
> > > > > > 	__u32			op_status;
> > > > > > 	union {
> > > > > > 		struct {
> > > > > > -- 
> > > > > > 2.25.0
> > > > > > 
> > > > > 
> > > > > --
> > > > > Chuck Lever
> > > 
> > > --
> > > Chuck Lever
> 
> --
> Chuck Lever
> 
> 
>
Chuck Lever III Feb. 20, 2020, 6:54 p.m. UTC | #7
> On Feb 20, 2020, at 1:35 PM, Anna Schumaker <schumaker.anna@gmail.com> wrote:
> 
> On Thu, 2020-02-20 at 13:30 -0500, Chuck Lever wrote:
>>> On Feb 20, 2020, at 1:28 PM, Anna Schumaker <schumaker.anna@gmail.com>
>>> wrote:
>>> 
>>> On Thu, 2020-02-20 at 09:55 -0500, Chuck Lever wrote:
>>> 
>>>> The down-side here is that would make NFSv4.2 on RDMA
>>>> unable to recognize holes in files the same way as it
>>>> does on TCP, and that's a pretty significant variation
>>>> in behavior. Does "noreadplus" even deal with that?
>>> 
>>> Setting "noreadplus" just causes the client to use the READ operation
>>> instead,
>>> so there should be no difference between v4.1 and v4.2 if the option is set.
>> 
>> My concern is the difference between NFSv4.2 with noreadplus
>> and NFSv4.2 with readplus. The former is not able to detect
>> holes in files on the server, but the latter is.
> 
> The client could still use lseek to detect holes. The client throws away the
> hole information after xdr decoding, and zeroes out the corresponding pages for
> the page cache.

Then maybe that's an optimization for another day. The READ_PLUS
reply can have hole information. The client could save itself some
SEEK_HOLE operations if it cached that hole information somehow.

But if "readplus" and "noreadplus" result in exactly the same hole
retention behavior on our client, I guess there's no harm in using
"noreadplus" instead, except for possible performance regression.

An alternative to making "noreadplus" the default would be to
temporarily add "noreadplus" semantics to the "proto=rdma" mount
option, as a separate patch; maybe after some performance results
show that it is necessary. That would keep the mount interface a
little less complicated.


--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c03f3246d6c5..bf118ecabe2c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -45,6 +45,15 @@ 
 #define encode_deallocate_maxsz		(op_encode_hdr_maxsz + \
 					 encode_fallocate_maxsz)
 #define decode_deallocate_maxsz		(op_decode_hdr_maxsz)
+#define encode_read_plus_maxsz		(op_encode_hdr_maxsz + \
+					 encode_stateid_maxsz + 3)
+#define NFS42_READ_PLUS_SEGMENT_SIZE	(1 /* data_content4 */ + \
+					 2 /* data_info4.di_offset */ + \
+					 2 /* data_info4.di_length */)
+#define decode_read_plus_maxsz		(op_decode_hdr_maxsz + \
+					 1 /* rpr_eof */ + \
+					 1 /* rpr_contents count */ + \
+					 NFS42_READ_PLUS_SEGMENT_SIZE)
 #define encode_seek_maxsz		(op_encode_hdr_maxsz + \
 					 encode_stateid_maxsz + \
 					 2 /* offset */ + \
@@ -128,6 +137,14 @@ 
 					 decode_putfh_maxsz + \
 					 decode_deallocate_maxsz + \
 					 decode_getattr_maxsz)
+#define NFS4_enc_read_plus_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_read_plus_maxsz)
+#define NFS4_dec_read_plus_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_read_plus_maxsz)
 #define NFS4_enc_seek_sz		(compound_encode_hdr_maxsz + \
 					 encode_sequence_maxsz + \
 					 encode_putfh_maxsz + \
@@ -252,6 +269,16 @@  static void encode_deallocate(struct xdr_stream *xdr,
 	encode_fallocate(xdr, args);
 }
 
+static void encode_read_plus(struct xdr_stream *xdr,
+			     const struct nfs_pgio_args *args,
+			     struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	encode_uint64(xdr, args->offset);
+	encode_uint32(xdr, args->count);
+}
+
 static void encode_seek(struct xdr_stream *xdr,
 			const struct nfs42_seek_args *args,
 			struct compound_hdr *hdr)
@@ -446,6 +473,29 @@  static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode READ_PLUS request
+ */
+static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfs_pgio_args *args = data;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	encode_read_plus(xdr, args, &hdr);
+
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
+				args->count, hdr.replen);
+	req->rq_rcv_buf.flags |= XDRBUF_READ;
+	encode_nops(&hdr);
+}
+
 /*
  * Encode SEEK request
  */
@@ -694,6 +744,67 @@  static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
 	return decode_op_hdr(xdr, OP_DEALLOCATE);
 }
 
+static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
+				      uint32_t *eof)
+{
+	__be32 *p;
+	uint32_t count, recvd;
+	uint64_t offset;
+
+	p = xdr_inline_decode(xdr, 8 + 4);
+	if (unlikely(!p))
+		return -EIO;
+
+	p = xdr_decode_hyper(p, &offset);
+	count = be32_to_cpup(p);
+	if (count == 0)
+		return 0;
+
+	recvd = xdr_read_pages(xdr, count);
+	if (count > recvd) {
+		dprintk("NFS: server cheating in read reply: "
+				"count %u > recvd %u\n", count, recvd);
+		count = recvd;
+		*eof = 0;
+	}
+
+	return count;
+}
+
+static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
+{
+	__be32 *p;
+	uint32_t count, eof, segments, type;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_READ_PLUS);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4 + 4);
+	if (unlikely(!p))
+		return -EIO;
+
+	eof = be32_to_cpup(p++);
+	segments = be32_to_cpup(p++);
+	if (segments == 0)
+		return 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		return -EIO;
+
+	type = be32_to_cpup(p++);
+	if (type == NFS4_CONTENT_DATA)
+		count = decode_read_plus_data(xdr, res, &eof);
+	else
+		return -EINVAL;
+
+	res->eof = eof;
+	res->count = count;
+	return 0;
+}
+
 static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
 {
 	int status;
@@ -870,6 +981,33 @@  static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
 	return status;
 }
 
+/*
+ * Decode READ_PLUS request
+ */
+static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfs_pgio_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_read_plus(xdr, res);
+	if (!status)
+		status = res->count;
+out:
+	return status;
+}
+
 /*
  * Decode SEEK request
  */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95d07a3dc5d1..ed3ec8c36273 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -69,6 +69,10 @@ 
 
 #include "nfs4trace.h"
 
+#ifdef CONFIG_NFS_V4_2
+#include "nfs42.h"
+#endif /* CONFIG_NFS_V4_2 */
+
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
 #define NFS4_BITMASK_SZ		3
@@ -5199,28 +5203,60 @@  static bool nfs4_read_stateid_changed(struct rpc_task *task,
 	return true;
 }
 
+static bool nfs4_read_plus_not_supported(struct rpc_task *task,
+					 struct nfs_pgio_header *hdr)
+{
+	struct nfs_server *server = NFS_SERVER(hdr->inode);
+	struct rpc_message *msg = &task->tk_msg;
+
+	if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] &&
+	    server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) {
+		server->caps &= ~NFS_CAP_READ_PLUS;
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+		rpc_restart_call_prepare(task);
+		return true;
+	}
+	return false;
+}
+
 static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-
 	dprintk("--> %s\n", __func__);
 
 	if (!nfs4_sequence_done(task, &hdr->res.seq_res))
 		return -EAGAIN;
 	if (nfs4_read_stateid_changed(task, &hdr->args))
 		return -EAGAIN;
+	if (nfs4_read_plus_not_supported(task, hdr))
+		return -EAGAIN;
 	if (task->tk_status > 0)
 		nfs_invalidate_atime(hdr->inode);
 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
 				    nfs4_read_done_cb(task, hdr);
 }
 
+#ifdef CONFIG_NFS_V4_2
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+	if (server->caps & NFS_CAP_READ_PLUS)
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+	else
+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#else
+static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+{
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+}
+#endif /* CONFIG_NFS_V4_2 */
+
 static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
 				 struct rpc_message *msg)
 {
 	hdr->timestamp   = jiffies;
 	if (!hdr->pgio_done_cb)
 		hdr->pgio_done_cb = nfs4_read_done_cb;
-	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
 }
 
@@ -9970,7 +10006,8 @@  static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_SEEK
 		| NFS_CAP_LAYOUTSTATS
 		| NFS_CAP_CLONE
-		| NFS_CAP_LAYOUTERROR,
+		| NFS_CAP_LAYOUTERROR
+		| NFS_CAP_READ_PLUS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 47817ef0aadb..68b2917d0537 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7584,6 +7584,7 @@  const struct rpc_procinfo nfs4_procedures[] = {
 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify),
 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
+	PROC42(READ_PLUS,	enc_read_plus,		dec_read_plus),
 };
 
 static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 82d8fb422092..c1eeef52545c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -540,8 +540,8 @@  enum {
 
 	NFSPROC4_CLNT_LOOKUPP,
 	NFSPROC4_CLNT_LAYOUTERROR,
-
 	NFSPROC4_CLNT_COPY_NOTIFY,
+	NFSPROC4_CLNT_READ_PLUS,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 465fa98258a3..11248c5a7b24 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -281,5 +281,6 @@  struct nfs_server {
 #define NFS_CAP_OFFLOAD_CANCEL	(1U << 25)
 #define NFS_CAP_LAYOUTERROR	(1U << 26)
 #define NFS_CAP_COPY_NOTIFY	(1U << 27)
+#define NFS_CAP_READ_PLUS	(1U << 28)
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 94c77ed55ce1..8efbf3d8b263 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -655,7 +655,7 @@  struct nfs_pgio_args {
 struct nfs_pgio_res {
 	struct nfs4_sequence_res	seq_res;
 	struct nfs_fattr *	fattr;
-	__u32			count;
+	__u64			count;
 	__u32			op_status;
 	union {
 		struct {