[v4,2/5] NFSD: Add READ_PLUS data support

Message ID 20200817165310.354092-3-Anna.Schumaker@Netapp.com (mailing list archive)
State New, archived
Series NFSD: Add support for the v4.2 READ_PLUS operation

Commit Message

Anna Schumaker Aug. 17, 2020, 4:53 p.m. UTC
From: Anna Schumaker <Anna.Schumaker@Netapp.com>

This patch adds READ_PLUS support for returning a single
NFS4_CONTENT_DATA segment to the client. This is basically the same as
the READ operation, only with the extra information about data segments.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfsd/nfs4proc.c | 17 ++++++++++
 fs/nfsd/nfs4xdr.c  | 83 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 98 insertions(+), 2 deletions(-)

Comments

Bruce Fields Aug. 28, 2020, 9:25 p.m. UTC | #1
On Mon, Aug 17, 2020 at 12:53:07PM -0400, schumaker.anna@gmail.com wrote:
> From: Anna Schumaker <Anna.Schumaker@Netapp.com>
> 
> This patch adds READ_PLUS support for returning a single
> NFS4_CONTENT_DATA segment to the client. This is basically the same as
> the READ operation, only with the extra information about data segments.
> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
>  fs/nfsd/nfs4proc.c | 17 ++++++++++
>  fs/nfsd/nfs4xdr.c  | 83 ++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 98 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index a09c35f0f6f0..9630d33211f2 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -2523,6 +2523,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
>  	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
>  }
>  
> +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> +{
> +	u32 maxcount = svc_max_payload(rqstp);
> +	u32 rlen = min(op->u.read.rd_length, maxcount);
> +	/* enough extra xdr space for encoding either a hole or data segment. */
> +	u32 segments = 1 + 2 + 2;
> +
> +	return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);

I'm not sure I understand this calculation.

In the final code, there's no fixed limit on the number of segments
returned by a single READ_PLUS op, right?

--b.

> +}
> +
>  static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
>  {
>  	u32 maxcount = 0, rlen = 0;
> @@ -3059,6 +3069,13 @@ static const struct nfsd4_operation nfsd4_ops[] = {
>  		.op_name = "OP_COPY",
>  		.op_rsize_bop = nfsd4_copy_rsize,
>  	},
> +	[OP_READ_PLUS] = {
> +		.op_func = nfsd4_read,
> +		.op_release = nfsd4_read_release,
> +		.op_name = "OP_READ_PLUS",
> +		.op_rsize_bop = nfsd4_read_plus_rsize,
> +		.op_get_currentstateid = nfsd4_get_readstateid,
> +	},
>  	[OP_SEEK] = {
>  		.op_func = nfsd4_seek,
>  		.op_name = "OP_SEEK",
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 6a1c0a7fae05..9af92f538000 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1957,7 +1957,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
>  	[OP_LAYOUTSTATS]	= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_offload_status,
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_offload_status,
> -	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
> +	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_read,
>  	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
>  	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
>  	[OP_CLONE]		= (nfsd4_dec)nfsd4_decode_clone,
> @@ -4367,6 +4367,85 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
>  		return nfserr_resource;
>  	p = xdr_encode_hyper(p, os->count);
>  	*p++ = cpu_to_be32(0);
> +	return nfserr;
> +}
> +
> +static __be32
> +nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
> +			    struct nfsd4_read *read,
> +			    unsigned long maxcount,  u32 *eof)
> +{
> +	struct xdr_stream *xdr = &resp->xdr;
> +	struct file *file = read->rd_nf->nf_file;
> +	int starting_len = xdr->buf->len;
> +	__be32 nfserr;
> +	__be32 *p, tmp;
> +	__be64 tmp64;
> +
> +	/* Content type, offset, byte count */
> +	p = xdr_reserve_space(xdr, 4 + 8 + 4);
> +	if (!p)
> +		return nfserr_resource;
> +
> +	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
> +	if (read->rd_vlen < 0)
> +		return nfserr_resource;
> +
> +	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
> +			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount, eof);
> +	if (nfserr)
> +		return nfserr;
> +
> +	tmp = htonl(NFS4_CONTENT_DATA);
> +	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
> +	tmp64 = cpu_to_be64(read->rd_offset);
> +	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
> +	tmp = htonl(maxcount);
> +	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
> +	return nfs_ok;
> +}
> +
> +static __be32
> +nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
> +		       struct nfsd4_read *read)
> +{
> +	unsigned long maxcount;
> +	struct xdr_stream *xdr = &resp->xdr;
> +	struct file *file;
> +	int starting_len = xdr->buf->len;
> +	int segments = 0;
> +	__be32 *p, tmp;
> +	u32 eof;
> +
> +	if (nfserr)
> +		return nfserr;
> +	file = read->rd_nf->nf_file;
> +
> +	/* eof flag, segment count */
> +	p = xdr_reserve_space(xdr, 4 + 4);
> +	if (!p)
> +		return nfserr_resource;
> +	xdr_commit_encode(xdr);
> +
> +	maxcount = svc_max_payload(resp->rqstp);
> +	maxcount = min_t(unsigned long, maxcount,
> +			 (xdr->buf->buflen - xdr->buf->len));
> +	maxcount = min_t(unsigned long, maxcount, read->rd_length);
> +
> +	eof = read->rd_offset >= i_size_read(file_inode(file));
> +	if (!eof) {
> +		nfserr = nfsd4_encode_read_plus_data(resp, read, maxcount, &eof);
> +		segments++;
> +	}
> +
> +	if (nfserr)
> +		xdr_truncate_encode(xdr, starting_len);
> +	else {
> +		tmp = htonl(eof);
> +		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
> +		tmp = htonl(segments);
> +		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
> +	}
>  
>  	return nfserr;
>  }
> @@ -4509,7 +4588,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
>  	[OP_LAYOUTSTATS]	= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_OFFLOAD_CANCEL]	= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_offload_status,
> -	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
> +	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_read_plus,
>  	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
>  	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
>  	[OP_CLONE]		= (nfsd4_enc)nfsd4_encode_noop,
> -- 
> 2.28.0
>
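
For reference, the segment constant being questioned here lines up with the per-segment overhead of the READ_PLUS reply layout in RFC 7862. A breakdown in 4-byte XDR words (an illustration, not code from the patch; field names follow the RFC):

/*
 * READ_PLUS segment overhead, in 4-byte XDR words (RFC 7862):
 *
 *   data segment: content type (1) + d_offset (2)  + d_data length (1) = 4
 *   hole segment: content type (1) + di_offset (2) + di_length (2)     = 5
 *
 * so "1 + 2 + 2" reserves enough for one segment of either kind, the
 * hole case being the larger -- but only for one segment.
 */

That "only for one segment" is the point Bruce raises next.
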
Bruce Fields Aug. 28, 2020, 9:56 p.m. UTC | #2
On Fri, Aug 28, 2020 at 05:25:21PM -0400, J. Bruce Fields wrote:
> On Mon, Aug 17, 2020 at 12:53:07PM -0400, schumaker.anna@gmail.com wrote:
> > [...]
> > @@ -2523,6 +2523,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> >  	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
> >  }
> >  
> > +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > +{
> > +	u32 maxcount = svc_max_payload(rqstp);
> > +	u32 rlen = min(op->u.read.rd_length, maxcount);
> > +	/* enough extra xdr space for encoding either a hole or data segment. */
> > +	u32 segments = 1 + 2 + 2;
> > +
> > +	return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);
> 
> I'm not sure I understand this calculation.
> 
> In the final code, there's no fixed limit on the number of segments
> returned by a single READ_PLUS op, right?

I think the worst-case overhead to represent a hole is around 50 bytes. 

So as long as we don't encode any holes less than that, then we can just
use rlen as an upper bound.

We really don't want to bother encoding small holes.  I doubt
filesystems want to bother with them either.  Do they give us any
guarantees as to the minimum size of a hole?

--b.
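
A minimal sketch of the estimate that reasoning would permit (an illustration of the suggestion only, not code from this series; it assumes the server declines to encode any hole shorter than its own segment encoding):

static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp,
					struct nfsd4_op *op)
{
	u32 maxcount = svc_max_payload(rqstp);
	u32 rlen = min(op->u.read.rd_length, maxcount);

	/* Every encoded hole covers at least as many file bytes as its
	 * encoding consumes, so rlen bounds the segment payload however
	 * many segments there are; "+ 2" covers the eof flag and the
	 * segment count. */
	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}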

Anna Schumaker Aug. 31, 2020, 6:16 p.m. UTC | #3
On Fri, Aug 28, 2020 at 5:56 PM J. Bruce Fields <bfields@redhat.com> wrote:
>
> On Fri, Aug 28, 2020 at 05:25:21PM -0400, J. Bruce Fields wrote:
> > On Mon, Aug 17, 2020 at 12:53:07PM -0400, schumaker.anna@gmail.com wrote:
> > > [...]
> > > @@ -2523,6 +2523,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > >     return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
> > >  }
> > >
> > > +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > > +{
> > > +   u32 maxcount = svc_max_payload(rqstp);
> > > +   u32 rlen = min(op->u.read.rd_length, maxcount);
> > > +   /* enough extra xdr space for encoding either a hole or data segment. */
> > > +   u32 segments = 1 + 2 + 2;
> > > +
> > > +   return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);
> >
> > I'm not sure I understand this calculation.
> >
> > In the final code, there's no fixed limit on the number of segments
> > returned by a single READ_PLUS op, right?
>
> I think the worst-case overhead to represent a hole is around 50 bytes.
>
> So as long as we don't encode any holes less than that, then we can just
> use rlen as an upper bound.
>
> We really don't want to bother encoding small holes.  I doubt
> filesystems want to bother with them either.  Do they give us any
> guarantees as to the minimum size of a hole?

The minimum size seems to be PAGE_SIZE from everything I've seen.

J. Bruce Fields Sept. 1, 2020, 4:49 p.m. UTC | #4
On Mon, Aug 31, 2020 at 02:16:26PM -0400, Anna Schumaker wrote:
> On Fri, Aug 28, 2020 at 5:56 PM J. Bruce Fields <bfields@redhat.com> wrote:
> >
> > On Fri, Aug 28, 2020 at 05:25:21PM -0400, J. Bruce Fields wrote:
> > > On Mon, Aug 17, 2020 at 12:53:07PM -0400, schumaker.anna@gmail.com wrote:
> > > > [...]
> > > > @@ -2523,6 +2523,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > > >     return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
> > > >  }
> > > >
> > > > +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > > > +{
> > > > +   u32 maxcount = svc_max_payload(rqstp);
> > > > +   u32 rlen = min(op->u.read.rd_length, maxcount);
> > > > +   /* enough extra xdr space for encoding either a hole or data segment. */
> > > > +   u32 segments = 1 + 2 + 2;
> > > > +
> > > > +   return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);
> > >
> > > I'm not sure I understand this calculation.
> > >
> > > In the final code, there's no fixed limit on the number of segments
> > > returned by a single READ_PLUS op, right?
> >
> > I think the worst-case overhead to represent a hole is around 50 bytes.
> >
> > So as long as we don't encode any holes less than that, then we can just
> > use rlen as an upper bound.
> >
> > We really don't want to bother encoding small holes.  I doubt
> > filesystems want to bother with them either.  Do they give us any
> > guarantees as to the minimum size of a hole?
> 
> The minimum size seems to be PAGE_SIZE from everything I've seen.

OK, can we make that assumption explicit?  It'd simplify stuff like
this.

--b.
Anna Schumaker Sept. 1, 2020, 5:40 p.m. UTC | #5
On Tue, Sep 1, 2020 at 12:49 PM J. Bruce Fields <bfields@fieldses.org> wrote:
>
> On Mon, Aug 31, 2020 at 02:16:26PM -0400, Anna Schumaker wrote:
> > On Fri, Aug 28, 2020 at 5:56 PM J. Bruce Fields <bfields@redhat.com> wrote:
> > >
> > > On Fri, Aug 28, 2020 at 05:25:21PM -0400, J. Bruce Fields wrote:
> > > > On Mon, Aug 17, 2020 at 12:53:07PM -0400, schumaker.anna@gmail.com wrote:
> > > > > [...]
> > > > > @@ -2523,6 +2523,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > > > >     return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
> > > > >  }
> > > > >
> > > > > +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
> > > > > +{
> > > > > +   u32 maxcount = svc_max_payload(rqstp);
> > > > > +   u32 rlen = min(op->u.read.rd_length, maxcount);
> > > > > +   /* enough extra xdr space for encoding either a hole or data segment. */
> > > > > +   u32 segments = 1 + 2 + 2;
> > > > > +
> > > > > +   return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);
> > > >
> > > > I'm not sure I understand this calculation.
> > > >
> > > > In the final code, there's no fixed limit on the number of segments
> > > > returned by a single READ_PLUS op, right?
> > >
> > > I think the worst-case overhead to represent a hole is around 50 bytes.
> > >
> > > So as long as we don't encode any holes less than that, then we can just
> > > use rlen as an upper bound.
> > >
> > > We really don't want to bother encoding small holes.  I doubt
> > > filesystems want to bother with them either.  Do they give us any
> > > guarantees as to the minimum size of a hole?
> >
> > The minimum size seems to be PAGE_SIZE from everything I've seen.
>
> OK, can we make that assumption explicit?  It'd simplify stuff like
> this.

I'm okay with that, but it's technically up to the underlying filesystem.

J. Bruce Fields Sept. 1, 2020, 7:18 p.m. UTC | #6
On Tue, Sep 01, 2020 at 01:40:16PM -0400, Anna Schumaker wrote:
> On Tue, Sep 1, 2020 at 12:49 PM J. Bruce Fields <bfields@fieldses.org> wrote:
> >
> > On Mon, Aug 31, 2020 at 02:16:26PM -0400, Anna Schumaker wrote:
> > > On Fri, Aug 28, 2020 at 5:56 PM J. Bruce Fields <bfields@redhat.com> wrote:
> > > > We really don't want to bother encoding small holes.  I doubt
> > > > filesystems want to bother with them either.  Do they give us any
> > > > guarantees as to the minimum size of a hole?
> > >
> > > The minimum size seems to be PAGE_SIZE from everything I've seen.
> >
> > OK, can we make that assumption explicit?  It'd simplify stuff like
> > this.
> 
> I'm okay with that, but it's technically up to the underlying filesystem.

Maybe we should ask on linux-fsdevel.

Maybe minimum hole length isn't the right question: suppose at time 1 a
file has a single hole at bytes 100-200, then it's modified so at time 2
it has a hole at bytes 50-150.  If you lseek(fd, 0, SEEK_HOLE) at time
1, you'll get 100.  Then if you lseek(fd, 100, SEEK_DATA) at time 2,
you'll get 150.  So you'll encode a 50-byte hole in the READ_PLUS reply
even though the file never had a hole smaller than 100 bytes.

Minimum hole alignment might be the right idea.

If we can't get that: maybe just teach encode_read to stop when it
*either* returns maxcount worth of file data (and holes) *or* maxcount
of encoded xdr data, just to prevent a weird filesystem from triggering
a bug.

--b.
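
As a concrete illustration of the walk in question, here is a minimal user-space sketch of the SEEK_HOLE/SEEK_DATA loop a READ_PLUS encoder effectively performs (illustration only, not kernel code; because the two seeks are separate calls, the interval between them can shrink exactly as in the race above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc < 2)
		return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = 0;

	while (pos < end) {
		/* Two separate calls: the file can change in between. */
		off_t hole = lseek(fd, pos, SEEK_HOLE);  /* start of next hole */
		off_t data = lseek(fd, hole, SEEK_DATA); /* end of that hole */

		if (data < 0)	/* ENXIO: the hole runs to end of file */
			data = end;
		printf("data [%lld, %lld) hole [%lld, %lld)\n",
		       (long long)pos, (long long)hole,
		       (long long)hole, (long long)data);
		pos = data;
	}
	return 0;
}
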
J. Bruce Fields Sept. 4, 2020, 1:52 p.m. UTC | #7
On Tue, Sep 01, 2020 at 03:18:54PM -0400, J. Bruce Fields wrote:
> [...]
> 
> Maybe we should ask on linux-fsdevel.
> 
> Maybe minimum hole length isn't the right question: suppose at time 1 a
> file has a single hole at bytes 100-200, then it's modified so at time 2
> it has a hole at bytes 50-150.  If you lseek(fd, 0, SEEK_HOLE) at time
> 1, you'll get 100.  Then if you lseek(fd, 100, SEEK_DATA) at time 2,
> you'll get 150.  So you'll encode a 50-byte hole in the READ_PLUS reply
> even though the file never had a hole smaller than 100 bytes.
> 
> Minimum hole alignment might be the right idea.
> 
> If we can't get that: maybe just teach encode_read to stop when it
> *either* returns maxcount worth of file data (and holes) *or* maxcount
> of encoded xdr data, just to prevent a weird filesystem from triggering
> a bug.

Alternatively, if it's easier, we could enforce a minimum alignment by
rounding up the result of SEEK_HOLE to the nearest multiple of (say) 512
bytes, and rounding down the result of SEEK_DATA.

--b.
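
Rounding of that sort is easy to express; a sketch (the macro and helper names are invented for illustration, nothing like them is in the series):

#define READ_PLUS_HOLE_ALIGN 512	/* must be a power of two */

/* Round the SEEK_HOLE result up: the reported hole starts here. */
static long long hole_start_aligned(long long seek_hole_result)
{
	return (seek_hole_result + READ_PLUS_HOLE_ALIGN - 1) &
	       ~(long long)(READ_PLUS_HOLE_ALIGN - 1);
}

/* Round the SEEK_DATA result down: the reported hole ends here. */
static long long hole_end_aligned(long long seek_data_result)
{
	return seek_data_result & ~(long long)(READ_PLUS_HOLE_ALIGN - 1);
}

Shrinking both edges inward means any hole that survives (end > start) is at least 512 bytes long and 512-byte aligned; anything smaller is simply encoded as data.
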
Chuck Lever Sept. 4, 2020, 1:56 p.m. UTC | #8
> On Sep 4, 2020, at 9:52 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Tue, Sep 01, 2020 at 03:18:54PM -0400, J. Bruce Fields wrote:
>> [...]
>> 
>> Maybe we should ask on linux-fsdevel.
>> 
>> Maybe minimum hole length isn't the right question: suppose at time 1 a
>> file has a single hole at bytes 100-200, then it's modified so at time 2
>> it has a hole at bytes 50-150.  If you lseek(fd, 0, SEEK_HOLE) at time
>> 1, you'll get 100.  Then if you lseek(fd, 100, SEEK_DATA) at time 2,
>> you'll get 150.  So you'll encode a 50-byte hole in the READ_PLUS reply
>> even though the file never had a hole smaller than 100 bytes.
>> 
>> Minimum hole alignment might be the right idea.
>> 
>> If we can't get that: maybe just teach encode_read to stop when it
>> *either* returns maxcount worth of file data (and holes) *or* maxcount
>> of encoded xdr data, just to prevent a weird filesystem from triggering
>> a bug.
> 
> Alternatively, if it's easier, we could enforce a minimum alignment by
> rounding up the result of SEEK_HOLE to the nearest multiple of (say) 512
> bytes, and rounding down the result of SEEK_DATA.

Perhaps it goes without saying, but is there an effort to
ensure that the set of holes is represented in exactly the
same way when accessing a file via READ_PLUS and
SEEK_DATA/HOLE?


--
Chuck Lever
J. Bruce Fields Sept. 4, 2020, 2:03 p.m. UTC | #9
On Fri, Sep 04, 2020 at 09:56:19AM -0400, Chuck Lever wrote:
> 
> 
> > On Sep 4, 2020, at 9:52 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> > 
> > [...]
> > 
> > Alternatively, if it's easier, we could enforce a minimum alignment by
> > rounding up the result of SEEK_HOLE to the nearest multiple of (say) 512
> > bytes, and rounding down the result of SEEK_DATA.
> 
> Perhaps it goes without saying, but is there an effort to
> ensure that the set of holes is represented in exactly the
> same way when accessing a file via READ_PLUS and
> SEEK_DATA/HOLE ?

So you're thinking of something like a pynfs test that creates a file
with holes and then tries reading through it with READ_PLUS and SEEK and
comparing the results?

There are lots of legitimate reasons that test might "fail"--servers
aren't required to support holes at all, and have a lot of latitude
about how to report them.

But it might be a good idea to test anyway.

--b.
Chuck Lever Sept. 4, 2020, 2:07 p.m. UTC | #10
> On Sep 4, 2020, at 10:03 AM, Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Fri, Sep 04, 2020 at 09:56:19AM -0400, Chuck Lever wrote:
>> 
>> 
>>> [...]
>> 
>> Perhaps it goes without saying, but is there an effort to
>> ensure that the set of holes is represented in exactly the
>> same way when accessing a file via READ_PLUS and
>> SEEK_DATA/HOLE?
> 
> So you're thinking of something like a pynfs test that creates a file
> with holes and then tries reading through it with READ_PLUS and SEEK and
> comparing the results?

I hadn't considered a particular test platform, but yes, a regression
test like that would be appropriate.


> There are lots of legitimate reasons that test might "fail"--servers
> aren't required to support holes at all, and have a lot of latitude
> about how to report them.

Agreed that the test would need to account for server support for holes.


> But it might be a good idea to test anyway.

My primary concern is that the result of a file copy operation should
look the same on NFS/TCP (with READ_PLUS) and NFS/RDMA (with SEEK_DATA/HOLE).


--
Chuck Lever
J. Bruce Fields Sept. 4, 2020, 2:29 p.m. UTC | #11
On Fri, Sep 04, 2020 at 10:07:22AM -0400, Chuck Lever wrote:
> My primary concern is that the result of a file copy operation should
> look the same on NFS/TCP (with READ_PLUS) and NFS/RDMA (with SEEK_DATA/HOLE).

I'm not sure what you mean.

I don't see the spec providing any guarantee of consistency between
READ_PLUS and SEEK.  It also doesn't guarantee that the results tell you
anything about how the file is actually stored--a returned "hole" could
represent an unallocated segment, or a fully allocated segment that's
filled with zeroes, or some combination.

So, for example, if you implemented an optimized copy that used
ALLOCATE, DEALLOCATE, SEEK and/or READ_PLUS to avoid reading and writing
a lot of zeroes--there's no guarantee that the target file would end up
allocated in the same way as the source.

--b.
Chuck Lever Sept. 4, 2020, 2:36 p.m. UTC | #12
> On Sep 4, 2020, at 10:29 AM, Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Fri, Sep 04, 2020 at 10:07:22AM -0400, Chuck Lever wrote:
>> My primary concern is that the result of a file copy operation should
>> look the same on NFS/TCP (with READ_PLUS) and NFS/RDMA (with SEEK_DATA/HOLE).
> 
> I'm not sure what you mean.
> 
> I don't see the spec providing any guarantee of consistency between
> READ_PLUS and SEEK.

IMO this is an implementation quality issue, not a spec compliance
issue.


> It also doesn't guarantee that the results tell you
> anything about how the file is actually stored--a returned "hole" could
> represent an unallocated segment, or a fully allocated segment that's
> filled with zeroes, or some combination.

Understood, but the resulting copied file should look the same whether
it was read from the server using READ_PLUS or SEEK_DATA/HOLE.


> So, for example, if you implemented an optimized copy that used
> ALLOCATE, DEALLOCATE, SEEK and/or READ_PLUS to avoid reading and writing
> a lot of zeroes--there's no guarantee that the target file would end up
> allocated in the same way as the source.



--
Chuck Lever
Bruce Fields Sept. 4, 2020, 2:49 p.m. UTC | #13
On Fri, Sep 04, 2020 at 10:36:36AM -0400, Chuck Lever wrote:
> > On Sep 4, 2020, at 10:29 AM, Bruce Fields <bfields@fieldses.org> wrote:
> > It also doesn't guarantee that the results tell you
> > anything about how the file is actually stored--a returned "hole" could
> > represent an unallocated segment, or a fully allocated segment that's
> > filled with zeroes, or some combination.
> 
> Understood, but the resulting copied file should look the same whether
> it was read from the server using READ_PLUS or SEEK_DATA/HOLE.

I'm uncomfortable about promising that.  What do you think might go
wrong otherwise?

--b.
Chuck Lever Sept. 4, 2020, 2:58 p.m. UTC | #14
> On Sep 4, 2020, at 10:49 AM, J. Bruce Fields <bfields@redhat.com> wrote:
> 
> On Fri, Sep 04, 2020 at 10:36:36AM -0400, Chuck Lever wrote:
>>> On Sep 4, 2020, at 10:29 AM, Bruce Fields <bfields@fieldses.org> wrote:
>>> It also doesn't guarantee that the results tell you
>>> anything about how the file is actually stored--a returned "hole" could
>>> represent an unallocated segment, or a fully allocated segment that's
>>> filled with zeroes, or some combination.
>> 
>> Understood, but the resulting copied file should look the same whether
>> it was read from the server using READ_PLUS or SEEK_DATA/HOLE.
> 
> I'm uncomfortable about promising that.

The server should be able to represent a file with holes
in exactly the same way with both mechanisms, unless there
is a significant flaw in the protocols or implementation.

The client's behavior is also important here, so the
guarantee would have to be about how the server presents
the holes. A quality client implementation would be able
to use this guarantee to reconstruct the holes exactly.


> What do you think might go wrong otherwise?

I don't see a data corruption issue here, if that's what
you mean.

Suppose the server has a large file with a lot of holes,
and these holes are all unallocated. This might be
typical of a container image.

Suppose further the client is able to punch holes in a
destination file as a thin provisioning mechanism.

Now, suppose we copy the file via TCP/READ_PLUS, and
that preserves the holes.

Copy with RDMA/SEEK_HOLE and maybe it doesn't preserve
holes. The destination file is now significantly larger
and less efficiently stored.

Or maybe it's the other way around. Either way, one
mechanism is hole-preserving and one isn't.

A quality implementation would try to preserve holes as
much as possible so that the server can make smart storage
provisioning decisions.

--
Chuck Lever
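
(For concreteness, the client-side hole punching described above maps to fallocate(); a sketch, with the helper name invented here:)

#define _GNU_SOURCE
#include <fcntl.h>

/* Deallocate [offset, offset + len) in the destination while leaving
 * the file size unchanged -- how a copy tool would reproduce a hole
 * reported by READ_PLUS or SEEK_HOLE. */
static int punch_hole(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}
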
J. Bruce Fields Sept. 4, 2020, 3:24 p.m. UTC | #15
On Fri, Sep 04, 2020 at 10:58:43AM -0400, Chuck Lever wrote:
> > What do you think might go wrong otherwise?
> 
> I don't see a data corruption issue here, if that's what
> you mean.
> 
> Suppose the server has a large file with a lot of holes,
> and these holes are all unallocated. This might be
> typical of a container image.
> 
> Suppose further the client is able to punch holes in a
> destination file as a thin provisioning mechanism.
> 
> Now, suppose we copy the file via TCP/READ_PLUS, and
> that preserves the holes.
> 
> Copy with RDMA/SEEK_HOLE and maybe it doesn't preserve
> holes. The destination file is now significantly larger
> and less efficiently stored.
> 
> Or maybe it's the other way around. Either way, one
> mechanism is hole-preserving and one isn't.
> 
> A quality implementation would try to preserve holes as
> much as possible so that the server can make smart storage
> provisioning decisions.

OK, I can see that, thanks.

So, I was trying to make sure we handle cases where SEEK results are
weirdly aligned or segments returned are very small.  I don't think
that'll happen with any "normal" setup, I think it probably requires
strange FUSE filesystems or unusual races or malicious users or some
combination thereof.  So suboptimal handling is OK, I just don't want to
violate the protocol or crash or hang or something.

I'm not seeing the RDMA connection, by the way.  SEEK and READ_PLUS
should work the same over TCP and RDMA.

--b.
Chuck Lever Sept. 4, 2020, 4:17 p.m. UTC | #16
> On Sep 4, 2020, at 11:24 AM, Bruce Fields <bfields@fieldses.org> wrote:
> 
> I'm not seeing the RDMA connection, by the way.  SEEK and READ_PLUS
> should work the same over TCP and RDMA.

The READ_PLUS result is not DDP-eligible because there's no way for
the client to predict in advance whether there will be data (which
can be moved by RDMA), holes or patterns (which cannot), nor how
many of each there might be.

Therefore I've asked Anna to leave READ_PLUS disabled on RDMA mounts.


--
Chuck Lever
J. Bruce Fields Sept. 4, 2020, 4:26 p.m. UTC | #17
On Fri, Sep 04, 2020 at 12:17:29PM -0400, Chuck Lever wrote:
> 
> 
> > On Sep 4, 2020, at 11:24 AM, Bruce Fields <bfields@fieldses.org> wrote:
> > 
> > I'm not seeing the RDMA connection, by the way.  SEEK and READ_PLUS
> > should work the same over TCP and RDMA.
> 
> The READ_PLUS result is not DDP-eligible because there's no way for
> the client to predict in advance whether there will be data (which
> can be moved by RDMA), holes or patterns (which cannot), nor how
> many of each there might be.
> 
> Therefore I've asked Anna to leave READ_PLUS disabled on RDMA mounts.

Oh, of course, makes sense.

I think we should leave ourselves the options of handling some of these
unlikely corner cases differently in the two cases, but I wouldn't
expect that to cause a noticeable difference in any normal use.

--b.
Chuck Lever Sept. 4, 2020, 4:30 p.m. UTC | #18
> On Sep 4, 2020, at 12:17 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
> 
> 
> 
>> On Sep 4, 2020, at 11:24 AM, Bruce Fields <bfields@fieldses.org> wrote:
>> 
>> I'm not seeing the RDMA connection, by the way.  SEEK and READ_PLUS
>> should work the same over TCP and RDMA.
> 
> The READ_PLUS result is not DDP-eligible because there's no way for
> the client to predict in advance whether there will be data (which
> can be moved by RDMA), holes or patterns (which cannot), nor how
> many of each there might be.
> 
> Therefore I've asked Anna to leave READ_PLUS disabled on RDMA mounts.

Since this is a public forum, I should clarify that READ_PLUS does
work on NFS/RDMA mounts. It's just not as efficient as NFS READ with
RDMA, and that kind of defeats its purpose as a replacement for NFS
READ in everyday usage.


--
Chuck Lever

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a09c35f0f6f0..9630d33211f2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2523,6 +2523,16 @@  static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	u32 maxcount = svc_max_payload(rqstp);
+	u32 rlen = min(op->u.read.rd_length, maxcount);
+	/* enough extra xdr space for encoding either a hole or data segment. */
+	u32 segments = 1 + 2 + 2;
+
+	return (op_encode_hdr_size + 2 + segments + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
 static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
 	u32 maxcount = 0, rlen = 0;
@@ -3059,6 +3069,13 @@  static const struct nfsd4_operation nfsd4_ops[] = {
 		.op_name = "OP_COPY",
 		.op_rsize_bop = nfsd4_copy_rsize,
 	},
+	[OP_READ_PLUS] = {
+		.op_func = nfsd4_read,
+		.op_release = nfsd4_read_release,
+		.op_name = "OP_READ_PLUS",
+		.op_rsize_bop = nfsd4_read_plus_rsize,
+		.op_get_currentstateid = nfsd4_get_readstateid,
+	},
 	[OP_SEEK] = {
 		.op_func = nfsd4_seek,
 		.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6a1c0a7fae05..9af92f538000 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1957,7 +1957,7 @@  static const nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_LAYOUTSTATS]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_OFFLOAD_CANCEL]	= (nfsd4_dec)nfsd4_decode_offload_status,
 	[OP_OFFLOAD_STATUS]	= (nfsd4_dec)nfsd4_decode_offload_status,
-	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_read,
 	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_CLONE]		= (nfsd4_dec)nfsd4_decode_clone,
@@ -4367,6 +4367,85 @@  nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
 		return nfserr_resource;
 	p = xdr_encode_hyper(p, os->count);
 	*p++ = cpu_to_be32(0);
+	return nfserr;
+}
+
+static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
+			    struct nfsd4_read *read,
+			    unsigned long maxcount,  u32 *eof)
+{
+	struct xdr_stream *xdr = &resp->xdr;
+	struct file *file = read->rd_nf->nf_file;
+	int starting_len = xdr->buf->len;
+	__be32 nfserr;
+	__be32 *p, tmp;
+	__be64 tmp64;
+
+	/* Content type, offset, byte count */
+	p = xdr_reserve_space(xdr, 4 + 8 + 4);
+	if (!p)
+		return nfserr_resource;
+
+	read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
+	if (read->rd_vlen < 0)
+		return nfserr_resource;
+
+	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount, eof);
+	if (nfserr)
+		return nfserr;
+
+	tmp = htonl(NFS4_CONTENT_DATA);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len,      &tmp,   4);
+	tmp64 = cpu_to_be64(read->rd_offset);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len + 4,  &tmp64, 8);
+	tmp = htonl(maxcount);
+	write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp,   4);
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+		       struct nfsd4_read *read)
+{
+	unsigned long maxcount;
+	struct xdr_stream *xdr = &resp->xdr;
+	struct file *file;
+	int starting_len = xdr->buf->len;
+	int segments = 0;
+	__be32 *p, tmp;
+	u32 eof;
+
+	if (nfserr)
+		return nfserr;
+	file = read->rd_nf->nf_file;
+
+	/* eof flag, segment count */
+	p = xdr_reserve_space(xdr, 4 + 4);
+	if (!p)
+		return nfserr_resource;
+	xdr_commit_encode(xdr);
+
+	maxcount = svc_max_payload(resp->rqstp);
+	maxcount = min_t(unsigned long, maxcount,
+			 (xdr->buf->buflen - xdr->buf->len));
+	maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+	eof = read->rd_offset >= i_size_read(file_inode(file));
+	if (!eof) {
+		nfserr = nfsd4_encode_read_plus_data(resp, read, maxcount, &eof);
+		segments++;
+	}
+
+	if (nfserr)
+		xdr_truncate_encode(xdr, starting_len);
+	else {
+		tmp = htonl(eof);
+		write_bytes_to_xdr_buf(xdr->buf, starting_len,     &tmp, 4);
+		tmp = htonl(segments);
+		write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+	}
 
 	return nfserr;
 }
@@ -4509,7 +4588,7 @@  static const nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_LAYOUTSTATS]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OFFLOAD_CANCEL]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OFFLOAD_STATUS]	= (nfsd4_enc)nfsd4_encode_offload_status,
-	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_read_plus,
 	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_CLONE]		= (nfsd4_enc)nfsd4_encode_noop,