diff mbox

[v3,3/3] NFSD: Add support for encoding multiple segments

Message ID 1426540688-32095-4-git-send-email-Anna.Schumaker@Netapp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Schumaker, Anna March 16, 2015, 9:18 p.m. UTC
This patch implements sending an array of segments back to the client.
Clients should be prepared to handle multiple segment reads to make this
useful.  We try to splice the first data segment into the XDR result,
and remaining segments are encoded directly.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfsd/nfs4proc.c |  4 ++--
 fs/nfsd/nfs4xdr.c  | 35 ++++++++++++++++++++++++-----------
 2 files changed, 26 insertions(+), 13 deletions(-)
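
For readers skimming the thread: the per-segment wire format this encoder produces, as inferred from the nfsd4_encode_read_plus_* hunks below (a sketch, not quoted from the spec; the HOLE type constant is not visible in the hunks, but the 4 + 8 + 8 reservation implies a type word, an offset, and a length):

	DATA segment: NFS4_CONTENT_DATA (4 bytes), offset (8 bytes), count (4 bytes), then count bytes of file data
	HOLE segment: hole type word (4 bytes), offset (8 bytes), length (8 bytes)

Each reply also carries an eof flag and a count of the segments that were encoded (see the end of nfsd4_encode_read_plus in the last hunk).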

Comments

J. Bruce Fields March 17, 2015, 7:56 p.m. UTC | #1
On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> This patch implements sending an array of segments back to the client.
> Clients should be prepared to handle multiple segment reads to make this
> useful.  We try to splice the first data segment into the XDR result,
> and remaining segments are encoded directly.

I'm still interested in what would happen if we started with an
implementation like:

	- if the entire requested range falls within a hole, return that
	  single hole.
	- otherwise, just treat the thing as one big data segment.

That would provide a benefit in the case there are large-ish holes
with minimal impact otherwise.

(Though patches for full support are still useful even if only for
client-testing purposes.)

--b.
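
A rough sketch of that simpler policy, reusing the two helpers from this patch (the wrapper name below is invented, this is untested, and error handling is mostly elided):

static __be32
nfsd4_encode_read_plus_simple(struct nfsd4_compoundres *resp,
			      struct nfsd4_read *read, struct file *file)
{
	/* Where does the next data extent start at or after rd_offset? */
	loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);

	if (data_pos == -ENXIO ||
	    data_pos >= (loff_t)(read->rd_offset + read->rd_length))
		/* No data anywhere in the requested range: encode a
		 * single HOLE segment (the helper re-probes and runs
		 * the hole out to the next data or to EOF). */
		return nfsd4_encode_read_plus_hole(resp, read, file);

	/* Anything else, including an unexpected llseek error, falls
	 * back to one big DATA segment, much like a plain READ
	 * (hole_pos == 0 means "clamp to i_size" in this patch). */
	return nfsd4_encode_read_plus_data(resp, read, file, 0);
}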

> 
> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
> ---
>  fs/nfsd/nfs4proc.c |  4 ++--
>  fs/nfsd/nfs4xdr.c  | 35 ++++++++++++++++++++++++-----------
>  2 files changed, 26 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index e9f4d8f..6801973 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1862,8 +1862,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
>  {
>  	u32 maxcount = svc_max_payload(rqstp);
>  	u32 rlen = min(op->u.read.rd_length, maxcount);
> -	/* enough extra xdr space for encoding either a hole or data segment. */
> -	u32 xdr  = 5;
> +	/* Extra xdr padding for encoding multiple segments. */
> +	u32 xdr  = 20;
>  
>  	return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
>  }
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 799d52c..5eaecd2 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4117,7 +4117,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
>  
>  static __be32
>  nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
> -			    struct file *file)
> +			    struct file *file, loff_t hole_pos)
>  {
>  	__be32 *p, err;
>  	unsigned long maxcount;
> @@ -4128,20 +4128,26 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
>  		return nfserr_resource;
>  	xdr_commit_encode(xdr);
>  
> +	if (hole_pos <= read->rd_offset)
> +		hole_pos = i_size_read(file_inode(file));
> +
>  	maxcount = svc_max_payload(resp->rqstp);
>  	maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
>  	maxcount = min_t(unsigned long, maxcount, read->rd_length);
> +	maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
>  
>  	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
>  		err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
>  	else
>  		err = nfsd4_encode_readv(resp, read, file, &maxcount);
> +	clear_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
>  
>  	*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
>  	p = xdr_encode_hyper(p, read->rd_offset);
>  	*p++ = cpu_to_be32(maxcount);
>  
>  	read->rd_offset += maxcount;
> +	read->rd_length -= maxcount;
>  	return err;
>  }
>  
> @@ -4156,7 +4162,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
>  	if (data_pos == -ENXIO)
>  		data_pos = i_size_read(file_inode(file));
>  	if (data_pos <= read->rd_offset)
> -		return nfsd4_encode_read_plus_data(resp, read, file);
> +		return nfsd4_encode_read_plus_data(resp, read, file, 0);
>  
>  	maxcount = data_pos - read->rd_offset;
>  	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
> @@ -4165,6 +4171,10 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
>  	p = xdr_encode_hyper(p, maxcount);
>  
>  	read->rd_offset += maxcount;
> +	if (maxcount > read->rd_length)
> +		read->rd_length = 0;
> +	else
> +		read->rd_length -= maxcount;
>  	return nfs_ok;
>  }
>  
> @@ -4197,17 +4207,20 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
>  			goto err_truncate;
>  	}
>  
> -	hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> -	if (hole_pos == -ENXIO)
> -		goto out_encode;
> +	do {
> +		hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
> +		if (hole_pos == -ENXIO)
> +			break;
>  
> -	if (hole_pos == read->rd_offset)
> -		err = nfsd4_encode_read_plus_hole(resp, read, file);
> -	else
> -		err = nfsd4_encode_read_plus_data(resp, read, file);
> -	segments++;
> +		if (hole_pos == read->rd_offset)
> +			err = nfsd4_encode_read_plus_hole(resp, read, file);
> +		else
> +			err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
> +		if (err)
> +			break;
> +		segments++;
> +	} while (read->rd_length > 0);
>  
> -out_encode:
>  	eof = (read->rd_offset >= i_size_read(file_inode(file)));
>  	*p++ = cpu_to_be32(eof);
>  	*p++ = cpu_to_be32(segments);
> -- 
> 2.3.3
> 
J. Bruce Fields March 17, 2015, 8:07 p.m. UTC | #2
On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> > This patch implements sending an array of segments back to the client.
> > Clients should be prepared to handle multiple segment reads to make this
> > useful.  We try to splice the first data segment into the XDR result,
> > and remaining segments are encoded directly.
> 
> I'm still interested in what would happen if we started with an
> implementation like:
> 
> 	- if the entire requested range falls within a hole, return that
> 	  single hole.
> 	- otherwise, just treat the thing as one big data segment.
> 
> That would provide a benefit in the case there are large-ish holes
> with minimal impact otherwise.
> 
> (Though patches for full support are still useful even if only for
> client-testing purposes.)

Also, looks like

	xfs_io -c "fiemap -v" <file>

will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
that on a few of my test vm images shows a fair number of large
(hundreds of megs) files, which suggests identifying only >=rwsize holes
might still be useful.

--b.
J. Bruce Fields March 17, 2015, 9:36 p.m. UTC | #3
On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> > On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> > > This patch implements sending an array of segments back to the client.
> > > Clients should be prepared to handle multiple segment reads to make this
> > > useful.  We try to splice the first data segment into the XDR result,
> > > and remaining segments are encoded directly.
> > 
> > I'm still interested in what would happen if we started with an
> > implementation like:
> > 
> > 	- if the entire requested range falls within a hole, return that
> > 	  single hole.
> > 	- otherwise, just treat the thing as one big data segment.
> > 
> > That would provide a benefit in the case there are large-ish holes
> > with minimal impact otherwise.
> > 
> > (Though patches for full support are still useful even if only for
> > client-testing purposes.)
> 
> Also, looks like
> 
> 	xvs_io -c "fiemap -v" <file>
> 
> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
> that on a few of my test vm images shows a fair number of large
> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> might still be useful.

Just for fun.... I wrote the following test program and ran it on my
collection of testing vm's.  Some looked like this:

	f21-1.qcow2
	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
	total hole bytes:      8443252736 (98%)
	in aligned 1MB chunks: 8428453888 (98%)

So, basically, read_plus would save transferring most of the data even
when only handling 1MB holes.

But some looked like this:

	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
	total hole bytes:      8077516800 (94%)
	in aligned 1MB chunks: 0 (0%)

So the READ_PLUS that caught every hole might save a lot, the one that
only caught 1MB holes wouldn't help at all.

And there were lots of examples in between those two extremes.

(But, check my math, I haven't tested this carefully.)

--b.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h> 
#include <errno.h>
#include <err.h>

long round_up(long n, long b)
{
	return ((n + b - 1)/b) * b;
}

long round_down(long n, long b)
{
	return (n/b) * b;
}

long hbytes = 0;
long rplusbytes = 0;

void do_stats(off_t hole_start, off_t hole_end)
{
	off_t hole_start_up, hole_end_down;

	hole_start_up = round_up(hole_start, 1024*1024);
	hole_end_down = round_down(hole_end, 1024*1024);

	hbytes += hole_end - hole_start;
	if (hole_start_up < hole_end_down)
		rplusbytes += hole_end_down - hole_start_up;
}

int main(int argc, char *argv[])
{
	off_t hole_start, hole_end;
	int fd;
	char *name;

	/* Map out holes with SEEK_HOLE, SEEK_DATA */
	/* Useful statistics:
	 * 	- what percentage of file is in holes?
	 * 	- what percentage of file would be skipped if we read it
	 * 	  sequentially in 1MB chunks?
	 */

	if (argc != 2)
		errx(1, "usage: %s <filename>\n", argv[0]);
	name = argv[1];
	fd = open(name, O_RDONLY);
	if (fd == -1)
		err(1, "open");

	hole_end = 0;
	while (1) {
		hole_start = lseek(fd, hole_end, SEEK_HOLE);
		if (hole_start == -1)
			err(1, "lseek");
		hole_end = lseek(fd, hole_start, SEEK_DATA);
		if (hole_end == -1) {
			if (errno == ENXIO)
				break;
			err(1, "lseek");
		}
		do_stats(hole_start, hole_end);
	}
	hole_end = lseek(fd, 0, SEEK_END);
	do_stats(hole_start, hole_end);
	printf("total hole bytes:      %ld (%.0f%)\n", hbytes,
				100 * (float)hbytes/hole_end);
	printf("in aligned 1MB chunks: %ld (%.0f%)\n", rplusbytes,
				100 * (float)rplusbytes/hole_end);
}
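
(For reference, the program above takes a single file argument and builds with any C compiler, e.g. cc -o holestat holestat.c && ./holestat f21-1.qcow2; the source and binary names here are arbitrary.)
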
Schumaker, Anna March 18, 2015, 6:16 p.m. UTC | #4
On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>> This patch implements sending an array of segments back to the client.
>>>> Clients should be prepared to handle multiple segment reads to make this
>>>> useful.  We try to splice the first data segment into the XDR result,
>>>> and remaining segments are encoded directly.
>>>
>>> I'm still interested in what would happen if we started with an
>>> implementation like:
>>>
>>> 	- if the entire requested range falls within a hole, return that
>>> 	  single hole.
>>> 	- otherwise, just treat the thing as one big data segment.
>>>
>>> That would provide a benefit in the case there are large-ish holes
>>> with minimal impact otherwise.
>>>
>>> (Though patches for full support are still useful even if only for
>>> client-testing purposes.)
>>
>> Also, looks like
>>
>> 	xvs_io -c "fiemap -v" <file>
>>
>> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
>> that on a few of my test vm images shows a fair number of large
>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>> might still be useful.
> 
> Just for fun.... I wrote the following test program and ran it on my
> collection of testing vm's.  Some looked like this:
> 
> 	f21-1.qcow2
> 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> 	total hole bytes:      8443252736 (98%)
> 	in aligned 1MB chunks: 8428453888 (98%)
> 
> So, basically, read_plus would save transferring most of the data even
> when only handling 1MB holes.
> 
> But some looked like this:
> 
> 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
> 	total hole bytes:      8077516800 (94%)
> 	in aligned 1MB chunks: 0 (0%)
> 
> So the READ_PLUS that caught every hole might save a lot, the one that
> only caught 1MB holes wouldn't help at all.
> 
> And there were lots of examples in between those two extremes.

I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:

      |  v4.1  |  v4.2
------+--------+---------
data  | 0.685s |  0.714s
hole  | 0.485s | 15.547s
mixed | 1.283s |  0.448s

From what I can tell, the 100% hole case takes so long because of the SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to trick the function into thinking that the entire file was already a hole, and runtime dropped to the levels of v4.1 and v4.2.  I wonder if this is filesystem dependent?  My server is exporting ext4.

Anna
> 
> (But, check my math, I haven't tested this carefully.)
> 
> --b.
> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h> 
> #include <errno.h>
> #include <err.h>
> 
> long round_up(long n, long b)
> {
> 	return ((n + b - 1)/b) * b;
> }
> 
> long round_down(long n, long b)
> {
> 	return (n/b) * b;
> }
> 
> long hbytes = 0;
> long rplusbytes = 0;
> 
> do_stats(off_t hole_start, off_t hole_end)
> {
> 	off_t hole_start_up, hole_end_down;
> 
> 	hole_start_up = round_up(hole_start, 1024*1024);
> 	hole_end_down = round_down(hole_end, 1024*1024);
> 
> 	hbytes += hole_end - hole_start;
> 	if (hole_start_up < hole_end_down)
> 		rplusbytes += hole_end_down - hole_start_up;
> }
> 
> int main(int argc, char *argv[])
> {
> 	off_t hole_start, hole_end;
> 	int fd;
> 	char *name;
> 
> 	/* Map out holes with SEEK_HOLE, SEEK_DATA */
> 	/* Useful statistics:
> 	 * 	- what percentage of file is in holes?
> 	 * 	- what percentage of file would be skipped if we read it
> 	 * 	  sequentially in 1MB chunks?
> 	 */
> 
> 	if (argc != 2)
> 		errx(1, "usage: %s <filename>\n", argv[0]);
> 	name = argv[1];
> 	fd = open(name, O_RDONLY);
> 	if (fd == -1)
> 		err(1, "open");
> 
> 	hole_end = 0;
> 	while (1) {
> 		hole_start = lseek(fd, hole_end, SEEK_HOLE);
> 		if (hole_start == -1)
> 			err(1, "lseek");
> 		hole_end = lseek(fd, hole_start, SEEK_DATA);
> 		if (hole_end == -1) {
> 			if (errno == ENXIO)
> 				break;
> 			err(1, "lseek");
> 		}
> 		do_stats(hole_start, hole_end);
> 	}
> 	hole_end = lseek(fd, 0, SEEK_END);
> 	do_stats(hole_start, hole_end);
> 	printf("total hole bytes:      %ld (%.0f%)\n", hbytes,
> 				100 * (float)hbytes/hole_end);
> 	printf("in aligned 1MB chunks: %ld (%.0f%)\n", rplusbytes,
> 				100 * (float)rplusbytes/hole_end);
> }
> 

J. Bruce Fields March 18, 2015, 6:55 p.m. UTC | #5
On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>> This patch implements sending an array of segments back to the client.
> >>>> Clients should be prepared to handle multiple segment reads to make this
> >>>> useful.  We try to splice the first data segment into the XDR result,
> >>>> and remaining segments are encoded directly.
> >>>
> >>> I'm still interested in what would happen if we started with an
> >>> implementation like:
> >>>
> >>> 	- if the entire requested range falls within a hole, return that
> >>> 	  single hole.
> >>> 	- otherwise, just treat the thing as one big data segment.
> >>>
> >>> That would provide a benefit in the case there are large-ish holes
> >>> with minimal impact otherwise.
> >>>
> >>> (Though patches for full support are still useful even if only for
> >>> client-testing purposes.)
> >>
> >> Also, looks like
> >>
> >> 	xvs_io -c "fiemap -v" <file>
> >>
> >> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
> >> that on a few of my test vm images shows a fair number of large
> >> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >> might still be useful.
> > 
> > Just for fun.... I wrote the following test program and ran it on my
> > collection of testing vm's.  Some looked like this:
> > 
> > 	f21-1.qcow2
> > 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> > 	total hole bytes:      8443252736 (98%)
> > 	in aligned 1MB chunks: 8428453888 (98%)
> > 
> > So, basically, read_plus would save transferring most of the data even
> > when only handling 1MB holes.
> > 
> > But some looked like this:
> > 
> > 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
> > 	total hole bytes:      8077516800 (94%)
> > 	in aligned 1MB chunks: 0 (0%)
> > 
> > So the READ_PLUS that caught every hole might save a lot, the one that
> > only caught 1MB holes wouldn't help at all.
> > 
> > And there were lots of examples in between those two extremes.
> 
> I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:
> 
>       |  v4.1  |  v4.2
> -----------------------
> data  | 0.685s |  0.714s
> hole  | 0.485s | 15.547s
> mixed |	1.283s |  0.448
> 
> >From what I can tell, the 100% hole case takes so long because of the
> >SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to
> >trick the function into thinking that the entire file was already a
> >hole, and runtime dropped to the levels of v4.1 and v4.2.

Wait, that 15s is due to just one SEEK_DATA?

> I wonder
> >if this is filesystem dependent?  My server is exporting ext4.

Sounds like just a bug.  I've been doing lots of lseek(.,.,SEEK_DATA) on
both ext4 and xfs without seeing anything that weird.

I believe it does return -ENXIO in the case SEEK_DATA is called at an
offset beyond which there's no more data.  At least that's what I saw in
userspace.  So maybe your code just isn't handling that case correctly?

--b.
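
A minimal userspace sketch of that -ENXIO behavior (untested; "sparse.img" is a placeholder name):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <err.h>

int main(void)
{
	int fd = open("sparse.img", O_RDONLY);
	off_t end;

	if (fd == -1)
		err(1, "open");
	end = lseek(fd, 0, SEEK_END);

	/* SEEK_DATA with no data at or after the offset (here: EOF, or
	 * the start of a trailing hole) is expected to fail with
	 * errno == ENXIO rather than returning an offset. */
	if (lseek(fd, end, SEEK_DATA) == -1 && errno == ENXIO)
		printf("no data at or after offset %lld\n", (long long)end);
	close(fd);
	return 0;
}
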
Schumaker, Anna March 18, 2015, 8:39 p.m. UTC | #6
On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>>>> This patch implements sending an array of segments back to the client.
>>>>>> Clients should be prepared to handle multiple segment reads to make this
>>>>>> useful.  We try to splice the first data segment into the XDR result,
>>>>>> and remaining segments are encoded directly.
>>>>>
>>>>> I'm still interested in what would happen if we started with an
>>>>> implementation like:
>>>>>
>>>>> 	- if the entire requested range falls within a hole, return that
>>>>> 	  single hole.
>>>>> 	- otherwise, just treat the thing as one big data segment.
>>>>>
>>>>> That would provide a benefit in the case there are large-ish holes
>>>>> with minimal impact otherwise.
>>>>>
>>>>> (Though patches for full support are still useful even if only for
>>>>> client-testing purposes.)
>>>>
>>>> Also, looks like
>>>>
>>>> 	xvs_io -c "fiemap -v" <file>
>>>>
>>>> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
>>>> that on a few of my test vm images shows a fair number of large
>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>>>> might still be useful.
>>>
>>> Just for fun.... I wrote the following test program and ran it on my
>>> collection of testing vm's.  Some looked like this:
>>>
>>> 	f21-1.qcow2
>>> 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
>>> 	total hole bytes:      8443252736 (98%)
>>> 	in aligned 1MB chunks: 8428453888 (98%)
>>>
>>> So, basically, read_plus would save transferring most of the data even
>>> when only handling 1MB holes.
>>>
>>> But some looked like this:
>>>
>>> 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
>>> 	total hole bytes:      8077516800 (94%)
>>> 	in aligned 1MB chunks: 0 (0%)
>>>
>>> So the READ_PLUS that caught every hole might save a lot, the one that
>>> only caught 1MB holes wouldn't help at all.
>>>
>>> And there were lots of examples in between those two extremes.
>>
>> I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:
>>
>>       |  v4.1  |  v4.2
>> -----------------------
>> data  | 0.685s |  0.714s
>> hole  | 0.485s | 15.547s
>> mixed |	1.283s |  0.448
>>
>> >From what I can tell, the 100% hole case takes so long because of the
>>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to
>>> trick the function into thinking that the entire file was already a
>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> 
> Wait, that 15s is due to just one SEEK_DATA?

The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
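
To put rough numbers on that (purely for illustration, assuming a 1MB rsize): the 512MB all-hole file takes about 512 READ_PLUS calls, and each of them does one SEEK_HOLE plus at least one SEEK_DATA over the remaining extent in nfsd4_encode_read_plus_hole(), so on the order of a thousand llseeks for a file that contains no data at all; the observed ~15s would then work out to something like 30ms per SEEK_DATA on this ext4 export.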

> 
>> I wonder
>>> if this is filesystem dependent?  My server is exporting ext4.
> 
> Sounds like just a bug.  I've been doing lots of lseek(.,.,SEEK_DATA) on
> both ext4 and xfs without seeing anything that weird.

It looks like something weird on ext4.  I switched my exported filesystem to xfs:

      |  v4.1  |  v4.2
------+--------+-------
data  | 0.764s | 1.343s
hole  | 0.572s | 0.205s
mixed | 0.634s | 0.472s


I bumped up the test to 1G files:

      |  v4.1  |  v4.2
------+--------+-------
data  | 1.578s | 1.743s
hole  | 1.241s | 0.443s
mixed | 1.884s | 0.913s

Let me know if I should test anything larger!

Anna
> 
> I believe it does return -ENXIO in the case SEEK_DATA is called at an
> offset beyond which there's no more data.  At least that's what I saw in
> userspace.  So maybe your code just isn't handling that case correctly?
> 
> --b.
> 

J. Bruce Fields March 18, 2015, 8:55 p.m. UTC | #7
On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> >> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> >>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>>>> This patch implements sending an array of segments back to the client.
> >>>>>> Clients should be prepared to handle multiple segment reads to make this
> >>>>>> useful.  We try to splice the first data segment into the XDR result,
> >>>>>> and remaining segments are encoded directly.
> >>>>>
> >>>>> I'm still interested in what would happen if we started with an
> >>>>> implementation like:
> >>>>>
> >>>>> 	- if the entire requested range falls within a hole, return that
> >>>>> 	  single hole.
> >>>>> 	- otherwise, just treat the thing as one big data segment.
> >>>>>
> >>>>> That would provide a benefit in the case there are large-ish holes
> >>>>> with minimal impact otherwise.
> >>>>>
> >>>>> (Though patches for full support are still useful even if only for
> >>>>> client-testing purposes.)
> >>>>
> >>>> Also, looks like
> >>>>
> >>>> 	xvs_io -c "fiemap -v" <file>
> >>>>
> >>>> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
> >>>> that on a few of my test vm images shows a fair number of large
> >>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >>>> might still be useful.
> >>>
> >>> Just for fun.... I wrote the following test program and ran it on my
> >>> collection of testing vm's.  Some looked like this:
> >>>
> >>> 	f21-1.qcow2
> >>> 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> >>> 	total hole bytes:      8443252736 (98%)
> >>> 	in aligned 1MB chunks: 8428453888 (98%)
> >>>
> >>> So, basically, read_plus would save transferring most of the data even
> >>> when only handling 1MB holes.
> >>>
> >>> But some looked like this:
> >>>
> >>> 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
> >>> 	total hole bytes:      8077516800 (94%)
> >>> 	in aligned 1MB chunks: 0 (0%)
> >>>
> >>> So the READ_PLUS that caught every hole might save a lot, the one that
> >>> only caught 1MB holes wouldn't help at all.
> >>>
> >>> And there were lots of examples in between those two extremes.
> >>
> >> I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:
> >>
> >>       |  v4.1  |  v4.2
> >> -----------------------
> >> data  | 0.685s |  0.714s
> >> hole  | 0.485s | 15.547s
> >> mixed |	1.283s |  0.448
> >>
> >> >From what I can tell, the 100% hole case takes so long because of the
> >>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to
> >>> trick the function into thinking that the entire file was already a
> >>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > 
> > Wait, that 15s is due to just one SEEK_DATA?
> 
> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
> 
> > 
> >> I wonder
> >>> if this is filesystem dependent?  My server is exporting ext4.
> > 
> > Sounds like just a bug.  I've been doing lots of lseek(.,.,SEEK_DATA) on
> > both ext4 and xfs without seeing anything that weird.
> 
> It looks like something weird on ext4.  I switched my exported filesystem to xfs:

Huh.  Maybe we should report a bug....

> 
>       |  v4.1  |  v4.2
> ------+--------+-------
> data  | 0.764s | 1.343s

That's too bad.  Non-sparse files are surely still a common case and
we'd like to not see a slowdown there....  I wonder if we can figure out
where it's coming from?

> hole  | 0.572s | 0.205s
> mixed |	0.634s | 0.472s
> 
> 
> I bumped up the test to 1G files:
> 
>       |  v4.1  |  v4.2
> ------+--------+-------
> data  | 1.578s | 1.743s
> hole  | 1.241s | 0.443s
> mixed |	1.884s | 0.913s
> 
> Let me know if I should test anything larger!

The other thing I'd be interested in would be a "mixed" case that
alternates every 4k.  That will test the worst case where we we do a 1MB
read and get back only a 4k hole.  Aligned 1MB holes are somewhat of a
best case.

--b.
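
If it helps when that test gets run, here is a minimal sketch (untested; the file name and the 512MB size are arbitrary, and it assumes a 4k filesystem block size) for generating such an alternating-4k sparse file:

#define _GNU_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

int main(void)
{
	char buf[4096];
	off_t size = 512 * 1024 * 1024, off;
	int fd = open("mixed-4k", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd == -1)
		err(1, "open");
	memset(buf, 'a', sizeof(buf));
	/* Write 4k of data, leave the next 4k unwritten (a hole), repeat. */
	for (off = 0; off < size; off += 2 * 4096)
		if (pwrite(fd, buf, sizeof(buf), off) != sizeof(buf))
			err(1, "pwrite");
	/* Make sure the file ends with the final 4k hole. */
	if (ftruncate(fd, size) == -1)
		err(1, "ftruncate");
	return 0;
}
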
Schumaker, Anna March 18, 2015, 9:03 p.m. UTC | #8
On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
>> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
>>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
>>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
>>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
>>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
>>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
>>>>>>>> This patch implements sending an array of segments back to the client.
>>>>>>>> Clients should be prepared to handle multiple segment reads to make this
>>>>>>>> useful.  We try to splice the first data segment into the XDR result,
>>>>>>>> and remaining segments are encoded directly.
>>>>>>>
>>>>>>> I'm still interested in what would happen if we started with an
>>>>>>> implementation like:
>>>>>>>
>>>>>>> 	- if the entire requested range falls within a hole, return that
>>>>>>> 	  single hole.
>>>>>>> 	- otherwise, just treat the thing as one big data segment.
>>>>>>>
>>>>>>> That would provide a benefit in the case there are large-ish holes
>>>>>>> with minimal impact otherwise.
>>>>>>>
>>>>>>> (Though patches for full support are still useful even if only for
>>>>>>> client-testing purposes.)
>>>>>>
>>>>>> Also, looks like
>>>>>>
>>>>>> 	xvs_io -c "fiemap -v" <file>
>>>>>>
>>>>>> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
>>>>>> that on a few of my test vm images shows a fair number of large
>>>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
>>>>>> might still be useful.
>>>>>
>>>>> Just for fun.... I wrote the following test program and ran it on my
>>>>> collection of testing vm's.  Some looked like this:
>>>>>
>>>>> 	f21-1.qcow2
>>>>> 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
>>>>> 	total hole bytes:      8443252736 (98%)
>>>>> 	in aligned 1MB chunks: 8428453888 (98%)
>>>>>
>>>>> So, basically, read_plus would save transferring most of the data even
>>>>> when only handling 1MB holes.
>>>>>
>>>>> But some looked like this:
>>>>>
>>>>> 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
>>>>> 	total hole bytes:      8077516800 (94%)
>>>>> 	in aligned 1MB chunks: 0 (0%)
>>>>>
>>>>> So the READ_PLUS that caught every hole might save a lot, the one that
>>>>> only caught 1MB holes wouldn't help at all.
>>>>>
>>>>> And there were lots of examples in between those two extremes.
>>>>
>>>> I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:
>>>>
>>>>       |  v4.1  |  v4.2
>>>> -----------------------
>>>> data  | 0.685s |  0.714s
>>>> hole  | 0.485s | 15.547s
>>>> mixed |	1.283s |  0.448
>>>>
>>>> >From what I can tell, the 100% hole case takes so long because of the
>>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to
>>>>> trick the function into thinking that the entire file was already a
>>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
>>>
>>> Wait, that 15s is due to just one SEEK_DATA?
>>
>> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
>>
>>>
>>>> I wonder
>>>>> if this is filesystem dependent?  My server is exporting ext4.
>>>
>>> Sounds like just a bug.  I've been doing lots of lseek(.,.,SEEK_DATA) on
>>> both ext4 and xfs without seeing anything that weird.
>>
>> It looks like something weird on ext4.  I switched my exported filesystem to xfs:
> 
> Huh.  Maybe we should report a bug....
> 
>>
>>       |  v4.1  |  v4.2
>> ------+--------+-------
>> data  | 0.764s | 1.343s
> 
> That's too bad.  Non-sparse files are surely still a common case and
> we'd like to not see a slowdown there....  I wonder if we can figure out
> where it's coming from?

That's a good question, especially since the 1G file didn't double this time.  Maybe a VM quirk?


> 
>> hole  | 0.572s | 0.205s
>> mixed |	0.634s | 0.472s
>>
>>
>> I bumped up the test to 1G files:
>>
>>       |  v4.1  |  v4.2
>> ------+--------+-------
>> data  | 1.578s | 1.743s
>> hole  | 1.241s | 0.443s
>> mixed |	1.884s | 0.913s
>>
>> Let me know if I should test anything larger!
> 
> The other thing I'd be interested in would be a "mixed" case that
> alternates every 4k.  That will test the worst case where we we do a 1MB
> read and get back only a 4k hole.  Aligned 1MB holes are somewhat of a
> best case.

I probably won't get a chance to test this until I'm back from my vacation, but I'll keep the suggestion in mind!

Anna
> 
> --b.
> 

J. Bruce Fields March 18, 2015, 9:11 p.m. UTC | #9
On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields wrote:
> >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker wrote:
> >>>>>>>> This patch implements sending an array of segments back to the client.
> >>>>>>>> Clients should be prepared to handle multiple segment reads to make this
> >>>>>>>> useful.  We try to splice the first data segment into the XDR result,
> >>>>>>>> and remaining segments are encoded directly.
> >>>>>>>
> >>>>>>> I'm still interested in what would happen if we started with an
> >>>>>>> implementation like:
> >>>>>>>
> >>>>>>> 	- if the entire requested range falls within a hole, return that
> >>>>>>> 	  single hole.
> >>>>>>> 	- otherwise, just treat the thing as one big data segment.
> >>>>>>>
> >>>>>>> That would provide a benefit in the case there are large-ish holes
> >>>>>>> with minimal impact otherwise.
> >>>>>>>
> >>>>>>> (Though patches for full support are still useful even if only for
> >>>>>>> client-testing purposes.)
> >>>>>>
> >>>>>> Also, looks like
> >>>>>>
> >>>>>> 	xvs_io -c "fiemap -v" <file>
> >>>>>>
> >>>>>> will give hole sizes for a given <file>.  (Thanks, esandeen.)  Running
> >>>>>> that on a few of my test vm images shows a fair number of large
> >>>>>> (hundreds of megs) files, which suggests identifying only >=rwsize holes
> >>>>>> might still be useful.
> >>>>>
> >>>>> Just for fun.... I wrote the following test program and ran it on my
> >>>>> collection of testing vm's.  Some looked like this:
> >>>>>
> >>>>> 	f21-1.qcow2
> >>>>> 	144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 f21-1.qcow2
> >>>>> 	total hole bytes:      8443252736 (98%)
> >>>>> 	in aligned 1MB chunks: 8428453888 (98%)
> >>>>>
> >>>>> So, basically, read_plus would save transferring most of the data even
> >>>>> when only handling 1MB holes.
> >>>>>
> >>>>> But some looked like this:
> >>>>>
> >>>>> 	501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 rhel6-1-1.img
> >>>>> 	total hole bytes:      8077516800 (94%)
> >>>>> 	in aligned 1MB chunks: 0 (0%)
> >>>>>
> >>>>> So the READ_PLUS that caught every hole might save a lot, the one that
> >>>>> only caught 1MB holes wouldn't help at all.
> >>>>>
> >>>>> And there were lots of examples in between those two extremes.
> >>>>
> >>>> I tested with three different 512 MB files:  100% data, 100% hole, and alternating every megabyte.  The results were surprising:
> >>>>
> >>>>       |  v4.1  |  v4.2
> >>>> -----------------------
> >>>> data  | 0.685s |  0.714s
> >>>> hole  | 0.485s | 15.547s
> >>>> mixed |	1.283s |  0.448
> >>>>
> >>>> >From what I can tell, the 100% hole case takes so long because of the
> >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this out to
> >>>>> trick the function into thinking that the entire file was already a
> >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> >>>
> >>> Wait, that 15s is due to just one SEEK_DATA?
> >>
> >> The server is returning a larger hole than the client can read at once, so there are several SEEK_DATA calls made to verify that there are no data segments before the end of the file.
> >>
> >>>
> >>>> I wonder
> >>>>> if this is filesystem dependent?  My server is exporting ext4.
> >>>
> >>> Sounds like just a bug.  I've been doing lots of lseek(.,.,SEEK_DATA) on
> >>> both ext4 and xfs without seeing anything that weird.
> >>
> >> It looks like something weird on ext4.  I switched my exported filesystem to xfs:
> > 
> > Huh.  Maybe we should report a bug....
> > 
> >>
> >>       |  v4.1  |  v4.2
> >> ------+--------+-------
> >> data  | 0.764s | 1.343s
> > 
> > That's too bad.  Non-sparse files are surely still a common case and
> > we'd like to not see a slowdown there....  I wonder if we can figure out
> > where it's coming from?
> 
> That's a good question, especially since the 1G file didn't double this time.  Maybe a VM quirk?

We definitely need to figure it out, I think.  If we can't make
READ_PLUS perform as well as READ (or very close to it) in the
non-sparse case then I don't think we'll want it, and as Trond suggested
we may want to consider something more fiemap-like instead.

I don't know, maybe the client could try to be clever and only use
READ_PLUS if the space_used/size ratio is lower than some threshold,
but it could get a little complicated to tune.

It's annoying that asking "does this range contain zeroes" is actually
taking longer than just reading the whole range....

--b.
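
A hedged sketch of what that client-side heuristic could look like (the helper name and the 90% threshold are invented; a real client would presumably feed it the space_used and size attributes it already caches):

/* Only bother with READ_PLUS if less than ~90% of the file's logical
 * size is actually allocated; otherwise fall back to plain READ. */
static bool nfs_worth_read_plus(u64 space_used, u64 size)
{
	if (size == 0)
		return false;
	return space_used < size - size / 10;
}
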
J. Bruce Fields March 19, 2015, 3:36 p.m. UTC | #10
On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> linux-nfs-owner@vger.kernel.org wrote on 03/18/2015 02:11:44 PM:
> 
> > From: "J. Bruce Fields" <bfields@fieldses.org>
> > To: Anna Schumaker <Anna.Schumaker@netapp.com>
> > Cc: linux-nfs@vger.kernel.org
> > Date: 03/18/2015 02:14 PM
> > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple 
> segments
> > Sent by: linux-nfs-owner@vger.kernel.org
> > 
> > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > > >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker wrote:
> > > >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > > >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields wrote:
> > > >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields 
> wrote:
> > > >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker 
> wrote:
> > > >>>>>>>> This patch implements sending an array of segments back 
> > to the client.
> > > >>>>>>>> Clients should be prepared to handle multiple segment 
> > reads to make this
> > > >>>>>>>> useful.  We try to splice the first data segment into the
> > XDR result,
> > > >>>>>>>> and remaining segments are encoded directly.
> > > >>>>>>>
> > > >>>>>>> I'm still interested in what would happen if we started with 
> an
> > > >>>>>>> implementation like:
> > > >>>>>>>
> > > >>>>>>>    - if the entire requested range falls within a hole, return 
> that
> > > >>>>>>>      single hole.
> > > >>>>>>>    - otherwise, just treat the thing as one big data segment.
> > > >>>>>>>
> > > >>>>>>> That would provide a benefit in the case there are large-ish 
> holes
> > > >>>>>>> with minimal impact otherwise.
> > > >>>>>>>
> > > >>>>>>> (Though patches for full support are still useful even if only 
> for
> > > >>>>>>> client-testing purposes.)
> > > >>>>>>
> > > >>>>>> Also, looks like
> > > >>>>>>
> > > >>>>>>    xvs_io -c "fiemap -v" <file>
> > > >>>>>>
> > > >>>>>> will give hole sizes for a given <file>.  (Thanks, 
> > esandeen.)  Running
> > > >>>>>> that on a few of my test vm images shows a fair number of large
> > > >>>>>> (hundreds of megs) files, which suggests identifying only 
> > >=rwsize holes
> > > >>>>>> might still be useful.
> > > >>>>>
> > > >>>>> Just for fun.... I wrote the following test program and ran it 
> on my
> > > >>>>> collection of testing vm's.  Some looked like this:
> > > >>>>>
> > > >>>>>    f21-1.qcow2
> > > >>>>>    144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 
> f21-1.qcow2
> > > >>>>>    total hole bytes:      8443252736 (98%)
> > > >>>>>    in aligned 1MB chunks: 8428453888 (98%)
> > > >>>>>
> > > >>>>> So, basically, read_plus would save transferring most of thedata 
> even
> > > >>>>> when only handling 1MB holes.
> > > >>>>>
> > > >>>>> But some looked like this:
> > > >>>>>
> > > >>>>>    501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 
> > rhel6-1-1.img
> > > >>>>>    total hole bytes:      8077516800 (94%)
> > > >>>>>    in aligned 1MB chunks: 0 (0%)
> > > >>>>>
> > > >>>>> So the READ_PLUS that caught every hole might save a lot, the 
> one that
> > > >>>>> only caught 1MB holes wouldn't help at all.
> > > >>>>>
> > > >>>>> And there were lots of examples in between those two extremes.
> > > >>>>
> > > >>>> I tested with three different 512 MB files:  100% data, 100% 
> > hole, and alternating every megabyte.  The results were surprising:
> > > >>>>
> > > >>>>       |  v4.1  |  v4.2
> > > >>>> -----------------------
> > > >>>> data  | 0.685s |  0.714s
> > > >>>> hole  | 0.485s | 15.547s
> > > >>>> mixed |   1.283s |  0.448
> > > >>>>
> > > >>>> >From what I can tell, the 100% hole case takes so long because 
> of the
> > > >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took this 
> out to
> > > >>>>> trick the function into thinking that the entire file was 
> already a
> > > >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > > >>>
> > > >>> Wait, that 15s is due to just one SEEK_DATA?
> > > >>
> > > >> The server is returning a larger hole than the client can read 
> > at once, so there are several SEEK_DATA calls made to verify that 
> > there are no data segments before the end of the file.
> > > >>
> > > >>>
> > > >>>> I wonder
> > > >>>>> if this is filesystem dependent?  My server is exporting ext4.
> > > >>>
> > > >>> Sounds like just a bug.  I've been doing lots of 
> lseek(.,.,SEEK_DATA) on
> > > >>> both ext4 and xfs without seeing anything that weird.
> > > >>
> > > >> It looks like something weird on ext4.  I switched my exported 
> > filesystem to xfs:
> > > > 
> > > > Huh.  Maybe we should report a bug....
> > > > 
> > > >>
> > > >>       |  v4.1  |  v4.2
> > > >> ------+--------+-------
> > > >> data  | 0.764s | 1.343s
> > > > 
> > > > That's too bad.  Non-sparse files are surely still a common case and
> > > > we'd like to not see a slowdown there....  I wonder if we can figure 
> out
> > > > where it's coming from?
> > > 
> > > That's a good question, especially since the 1G file didn't double
> > this time.  Maybe a VM quirk?
> > 
> > We definitely need to figure it out, I think.  If we can't make
> > READ_PLUS perform as well as READ (or very close to it) in the
> > non-sparse case then I don't think we'll want it, and as Trond suggested
> > we may want to consider something more fiemap-like instead.
> 
> Testing Anna's NFS client with the Ganesha NFS server and GPFS file system 
> shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a 
> data file. Using sparse files READ_PLUS is 5 times faster than READ.

Thanks!  Is it possible to report the exact numbers?

Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA?  If so
then maybe the difference is the filesystem.  Might be interesting to
run the same sort of test with ganesha exporting xfs and/or knfsd
exporting GPFS.

--b.
Marc Eshel March 19, 2015, 4:28 p.m. UTC | #11
linux-nfs-owner@vger.kernel.org wrote on 03/19/2015 08:36:27 AM:

> From: "J. Bruce Fields" <bfields@fieldses.org>
> To: Marc Eshel/Almaden/IBM@IBMUS
> Cc: Anna Schumaker <Anna.Schumaker@netapp.com>, linux-
> nfs@vger.kernel.org, linux-nfs-owner@vger.kernel.org
> Date: 03/19/2015 08:36 AM
> Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple 
segments
> Sent by: linux-nfs-owner@vger.kernel.org
> 
> On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> > linux-nfs-owner@vger.kernel.org wrote on 03/18/2015 02:11:44 PM:
> > 
> > > From: "J. Bruce Fields" <bfields@fieldses.org>
> > > To: Anna Schumaker <Anna.Schumaker@netapp.com>
> > > Cc: linux-nfs@vger.kernel.org
> > > Date: 03/18/2015 02:14 PM
> > > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple 
> > segments
> > > Sent by: linux-nfs-owner@vger.kernel.org
> > > 
> > > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > > >> On 03/18/2015 02:55 PM, J. Bruce Fields wrote:
> > > > >>> On Wed, Mar 18, 2015 at 02:16:29PM -0400, Anna Schumaker 
wrote:
> > > > >>>> On 03/17/2015 05:36 PM, J. Bruce Fields wrote:
> > > > >>>>> On Tue, Mar 17, 2015 at 04:07:38PM -0400, J. Bruce Fields 
wrote:
> > > > >>>>>> On Tue, Mar 17, 2015 at 03:56:33PM -0400, J. Bruce Fields 
> > wrote:
> > > > >>>>>>> On Mon, Mar 16, 2015 at 05:18:08PM -0400, Anna Schumaker 
> > wrote:
> > > > >>>>>>>> This patch implements sending an array of segments back 
> > > to the client.
> > > > >>>>>>>> Clients should be prepared to handle multiple segment 
> > > reads to make this
> > > > >>>>>>>> useful.  We try to splice the first data segment into the
> > > XDR result,
> > > > >>>>>>>> and remaining segments are encoded directly.
> > > > >>>>>>>
> > > > >>>>>>> I'm still interested in what would happen if we started 
with 
> > an
> > > > >>>>>>> implementation like:
> > > > >>>>>>>
> > > > >>>>>>>    - if the entire requested range falls within a hole, 
return 
> > that
> > > > >>>>>>>      single hole.
> > > > >>>>>>>    - otherwise, just treat the thing as one big data 
segment.
> > > > >>>>>>>
> > > > >>>>>>> That would provide a benefit in the case there are 
large-ish 
> > holes
> > > > >>>>>>> with minimal impact otherwise.
> > > > >>>>>>>
> > > > >>>>>>> (Though patches for full support are still useful even if 
only 
> > for
> > > > >>>>>>> client-testing purposes.)
> > > > >>>>>>
> > > > >>>>>> Also, looks like
> > > > >>>>>>
> > > > >>>>>>    xvs_io -c "fiemap -v" <file>
> > > > >>>>>>
> > > > >>>>>> will give hole sizes for a given <file>.  (Thanks, 
> > > esandeen.)  Running
> > > > >>>>>> that on a few of my test vm images shows a fair number of 
large
> > > > >>>>>> (hundreds of megs) files, which suggests identifying only 
> > > >=rwsize holes
> > > > >>>>>> might still be useful.
> > > > >>>>>
> > > > >>>>> Just for fun.... I wrote the following test program and ran 
it 
> > on my
> > > > >>>>> collection of testing vm's.  Some looked like this:
> > > > >>>>>
> > > > >>>>>    f21-1.qcow2
> > > > >>>>>    144784 -rw-------. 1 qemu qemu 8591507456 Mar 16 10:13 
> > f21-1.qcow2
> > > > >>>>>    total hole bytes:      8443252736 (98%)
> > > > >>>>>    in aligned 1MB chunks: 8428453888 (98%)
> > > > >>>>>
> > > > >>>>> So, basically, read_plus would save transferring most of 
thedata 
> > even
> > > > >>>>> when only handling 1MB holes.
> > > > >>>>>
> > > > >>>>> But some looked like this:
> > > > >>>>>
> > > > >>>>>    501524 -rw-------. 1 qemu qemu 8589934592 May 20  2014 
> > > rhel6-1-1.img
> > > > >>>>>    total hole bytes:      8077516800 (94%)
> > > > >>>>>    in aligned 1MB chunks: 0 (0%)
> > > > >>>>>
> > > > >>>>> So the READ_PLUS that caught every hole might save a lot, 
the 
> > one that
> > > > >>>>> only caught 1MB holes wouldn't help at all.
> > > > >>>>>
> > > > >>>>> And there were lots of examples in between those two 
extremes.
> > > > >>>>
> > > > >>>> I tested with three different 512 MB files:  100% data, 100% 
> > > hole, and alternating every megabyte.  The results were surprising:
> > > > >>>>
> > > > >>>>       |  v4.1  |  v4.2
> > > > >>>> -----------------------
> > > > >>>> data  | 0.685s |  0.714s
> > > > >>>> hole  | 0.485s | 15.547s
> > > > >>>> mixed |   1.283s |  0.448
> > > > >>>>
> > > > >>>> >From what I can tell, the 100% hole case takes so long 
because 
> > of the
> > > > >>>>> SEEK_DATA call in nfsd4_encode_read_plus_hole().  I took 
this 
> > out to
> > > > >>>>> trick the function into thinking that the entire file was 
> > already a
> > > > >>>>> hole, and runtime dropped to the levels of v4.1 and v4.2.
> > > > >>>
> > > > >>> Wait, that 15s is due to just one SEEK_DATA?
> > > > >>
> > > > >> The server is returning a larger hole than the client can read 
> > > at once, so there are several SEEK_DATA calls made to verify that 
> > > there are no data segments before the end of the file.
> > > > >>
> > > > >>>
> > > > >>>> I wonder
> > > > >>>>> if this is filesystem dependent?  My server is exporting 
ext4.
> > > > >>>
> > > > >>> Sounds like just a bug.  I've been doing lots of 
> > lseek(.,.,SEEK_DATA) on
> > > > >>> both ext4 and xfs without seeing anything that weird.
> > > > >>
> > > > >> It looks like something weird on ext4.  I switched my exported 
> > > filesystem to xfs:
> > > > > 
> > > > > Huh.  Maybe we should report a bug....
> > > > > 
> > > > >>
> > > > >>       |  v4.1  |  v4.2
> > > > >> ------+--------+-------
> > > > >> data  | 0.764s | 1.343s
> > > > > 
> > > > > That's too bad.  Non-sparse files are surely still a common case 
and
> > > > > we'd like to not see a slowdown there....  I wonder if we can 
figure 
> > out
> > > > > where it's coming from?
> > > > 
> > > > That's a good question, especially since the 1G file didn't double
> > > this time.  Maybe a VM quirk?
> > > 
> > > We definitely need to figure it out, I think.  If we can't make
> > > READ_PLUS perform as well as READ (or very close to it) in the
> > > non-sparse case then I don't think we'll want it, and as Trond 
suggested
> > > we may want to consider something more fiemap-like instead.
> > 
> > Testing Anna's NFS client with the Ganesha NFS server and GPFS file 
system 
> > shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a 

> > data file. Using sparse files READ_PLUS is 5 times faster than READ.
> 
> Thanks!  Is it possible to report the exact numbers?

This is a copy of a 100M file. 

[root@fin16 ~]# umount /mnt
[root@fin16 ~]# mount -t nfs4 -o minorversion=1 9.1.74.120:/gpfsA /mnt
[root@fin16 ~]# time cp /mnt/100M /dev/null

real    0m1.597s
user    0m0.000s
sys     0m0.062s
[root@fin16 ~]# umount /mnt
[root@fin16 ~]# mount -t nfs4 -o minorversion=2 9.1.74.120:/gpfsA /mnt
[root@fin16 ~]# time cp /mnt/100M /dev/null

real    0m1.595s
user    0m0.002s
sys     0m0.057s

> 
> Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA?  If so
> then maybe the difference is the filesystem.  Might be interesting to
> run the same sort of test with ganesha exporting xfs and/or knfsd
> exporting GPFS.

GPFS did not implement it using SEEK; it just calls the fs read, and if 
there is no data the fs returns an ENODATA return code. It is not yet 
implemented for other FSALs.
 
> 
> --b.
J. Bruce Fields March 20, 2015, 3:17 p.m. UTC | #12
Maybe this is a question for xfs developers.

So, we have a new READ_PLUS call that's basically just a version of READ
optimized for sparse files:

	http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10

It allows an NFS server to return either file data (like a normal READ
call) or, at the server's discretion, records saying "this range of the
data is all zeroes".

Anna tried implementing READ_PLUS for knfsd using
vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
determines we're not at a hole.

(Very) preliminary results suggest that's slower than a plain READ for
an xfs file with no holes.  (And *much* slower in the ext4 case for some
reason.)

Is that expected, and should we be doing this some other way instead?

--b.
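
For anyone who wants to time that pattern outside of NFS, here is a userspace sketch of the per-request sequence in question (SEEK_HOLE, then either skip the hole or do an ordinary read), which could be compared against a plain read() loop on the same non-sparse file (untested; the 1MB chunk size is arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

#define CHUNK (1024 * 1024)

int main(int argc, char *argv[])
{
	static char buf[CHUNK];
	off_t off = 0, hole;
	ssize_t n;
	int fd;

	if (argc != 2)
		errx(1, "usage: %s <file>", argv[0]);
	fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		err(1, "open");

	for (;;) {
		/* Where does the next hole start at or after off? */
		hole = lseek(fd, off, SEEK_HOLE);
		if (hole == -1)
			break;		/* ENXIO past EOF, or error */
		if (hole == off) {
			/* Sitting on a hole: skip ahead to the next data. */
			off = lseek(fd, off, SEEK_DATA);
			if (off == -1)
				break;
			continue;
		}
		/* Range starts with data: do an ordinary bounded read. */
		n = pread(fd, buf, CHUNK, off);
		if (n <= 0)
			break;
		off += n;
	}
	return 0;
}
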

On Thu, Mar 19, 2015 at 09:28:09AM -0700, Marc Eshel wrote:
> linux-nfs-owner@vger.kernel.org wrote on 03/19/2015 08:36:27 AM:
> 
> > From: "J. Bruce Fields" <bfields@fieldses.org>
> > To: Marc Eshel/Almaden/IBM@IBMUS
> > Cc: Anna Schumaker <Anna.Schumaker@netapp.com>, linux-
> > nfs@vger.kernel.org, linux-nfs-owner@vger.kernel.org
> > Date: 03/19/2015 08:36 AM
> > Subject: Re: [PATCH v3 3/3] NFSD: Add support for encoding multiple 
> segments
> > Sent by: linux-nfs-owner@vger.kernel.org
> > 
> > On Thu, Mar 19, 2015 at 08:00:05AM -0700, Marc Eshel wrote:
> > > linux-nfs-owner@vger.kernel.org wrote on 03/18/2015 02:11:44 PM:
> > > > From: "J. Bruce Fields" <bfields@fieldses.org>
> > > > On Wed, Mar 18, 2015 at 05:03:32PM -0400, Anna Schumaker wrote:
> > > > > On 03/18/2015 04:55 PM, J. Bruce Fields wrote:
> > > > > > On Wed, Mar 18, 2015 at 04:39:24PM -0400, Anna Schumaker wrote:
> > > > > >>       |  v4.1  |  v4.2
> > > > > >> ------+--------+-------
> > > > > >> data  | 0.764s | 1.343s
> > > > > > 
> > > > > > That's too bad.  Non-sparse files are surely still a common case 
> and
> > > > > > we'd like to not see a slowdown there....  I wonder if we can 
> figure 
> > > out
> > > > > > where it's coming from?
> > > > > 
> > > > > That's a good question, especially since the 1G file didn't double
> > > > this time.  Maybe a VM quirk?
> > > > 
> > > > We definitely need to figure it out, I think.  If we can't make
> > > > READ_PLUS perform as well as READ (or very close to it) in the
> > > > non-sparse case then I don't think we'll want it, and as Trond 
> suggested
> > > > we may want to consider something more fiemap-like instead.
> > > 
> > > Testing Anna's NFS client with the Ganesha NFS server and GPFS file 
> system 
> > > shows the same numbers for READ with v4.1 and READ_PLUS with v4.2 of a 
> 
> > > data file. Using sparse files READ_PLUS is 5 times faster than READ.
> > 
> > Thanks!  Is it possible to report the exact numbers?
> 
> This is a copy of a 100M file. 
> 
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=1 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
> 
> real    0m1.597s
> user    0m0.000s
> sys     0m0.062s
> [root@fin16 ~]# umount /mnt
> [root@fin16 ~]# mount -t nfs4 -o minorversion=2 9.1.74.120:/gpfsA /mnt
> [root@fin16 ~]# time cp /mnt/100M /dev/null
> 
> real    0m1.595s
> user    0m0.002s
> sys     0m0.057s
> 
> > 
> > Is Ganesha also implementing READ_PLUS with SEEK_HOLE/SEEK_DATA?  If so
> > then maybe the difference is the filesystem.  Might be interesting to
> > run the same sort of test with ganesha exporting xfs and/or knfsd
> > exporting GPFS.
> 
> GPFS did not implement it using SEEK it just calls the fs read and if 
> there is no data the fs returns ENODATA return code. It is not yet 
> implemented on other FSLAs
>  
> > 
> > --b.

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig March 20, 2015, 4:23 p.m. UTC | #13
On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
> Maybe this is a question for xfs developers.
> 
> So, we have a new READ_PLUS call that's basically just a version of READ
> optimized for sparse files:
> 
> 	http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
> 
> It allows an NFS server to return either file data (like a normal READ
> call) or, at the server's discretion, records saying "this range of the
> data is all zeroes".
> 
> Anna tried implementing READ_PLUS for knfsd using
> vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
> determines we're not at a hole.
> 
> (Very) preliminary results suggest that's slower than a plain READ for
> an xfs file with no holes.  (And *much* slower in the ext4 case for some
> reason.)

It should be a fairly cheap operation, and does extent tree operations
that are pretty similar to an (uncached) read.  Do you have profiles?

> Is that expected, and should we be doing this some other way instead?

Are the reads cached or uncached?  If they are from pagecache, just
copying the zeroes is pretty much unbeatable compared to extent
tree lookups, so we'd need a new page flag (difficult..) to see
that a page is a hole (and then it would only work for the whole page),
but for uncached reads an optimization would be to tell a read that it's
an NFS READ_PLUS so that it could just read until it reaches a hole,
and then we'd need some way to communicate the hole size (or just fall
back to SEEK_HOLE for that case).
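
For anyone who wants to poke at this locally: the probe under discussion
is just lseek(2) with SEEK_HOLE/SEEK_DATA.  Here is a minimal userspace
sketch (not the nfsd code path itself; the file and offset are whatever
you pass in) that reports whether a given offset sits in a hole or in
data:

#define _FILE_OFFSET_BITS 64
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int fd;
    off_t offset, hole, data;

    if (argc != 3) {
        fprintf(stderr, "usage: %s <file> <offset>\n", argv[0]);
        return 1;
    }

    fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    offset = atoll(argv[2]);

    /* Nearest hole at or after offset; equal to offset means we are in a hole. */
    hole = lseek(fd, offset, SEEK_HOLE);
    /* Nearest data at or after offset; fails with ENXIO past the last data. */
    data = lseek(fd, offset, SEEK_DATA);

    if (hole == (off_t)-1)
        perror("SEEK_HOLE");
    else if (hole == offset && data == (off_t)-1)
        printf("hole from %lld to end of file\n", (long long)offset);
    else if (hole == offset)
        printf("hole at %lld, next data at %lld\n",
               (long long)offset, (long long)data);
    else
        printf("data at %lld, next hole at %lld\n",
               (long long)offset, (long long)hole);

    close(fd);
    return 0;
}
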
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields March 20, 2015, 6:26 p.m. UTC | #14
On Fri, Mar 20, 2015 at 09:23:03AM -0700, Christoph Hellwig wrote:
> On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
> > Maybe this is a question for xfs developers.
> > 
> > So, we have a new READ_PLUS call that's basically just a version of READ
> > optimized for sparse files:
> > 
> > 	http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
> > 
> > It allows an NFS server to return either file data (like a normal READ
> > call) or, at the server's discretion, records saying "this range of the
> > data is all zeroes".
> > 
> > Anna tried implementing READ_PLUS for knfsd using
> > vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
> > determines we're not at a hole.
> > 
> > (Very) preliminary results suggest that's slower than a plain READ for
> > an xfs file with no holes.  (And *much* slower in the ext4 case for some
> > reason.)
> 
> It should be a fairly cheap operastion, and does extent tree operations
> that are pretty similar to an (uncached) read.  Do you have profiles?
> 
> > Is that expected, and should we be doing this some other way instead?
> 
> Are the read cached or uncached?

I don't know, and don't have profiles.  I'll either try to reproduce or
wait till Anna's back from vacation.

> If they are from pagecache just copying the zeroes is pretty much
> unbeatable compared to extent tree lookups, so we'd need a new page
> flag (difficult..) to see that a page is a hole (and then it would
> only work for the whole page), but for uncached reads an optimization
> would be to tell a read that it's an NFS READ_PLUS so that it could
> just read until it reach a hole, and then we'd need some way to
> communicate the hole size (or just fall back to SEEK_HOLE for that
> case).

Ugh, OK.  We'll do some more tests before coming back to ask about
that....

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 24, 2015, 12:43 p.m. UTC | #15
On Fri, Mar 20, 2015 at 2:26 PM, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Fri, Mar 20, 2015 at 09:23:03AM -0700, Christoph Hellwig wrote:
>> On Fri, Mar 20, 2015 at 11:17:18AM -0400, J. Bruce Fields wrote:
>> > Maybe this is a question for xfs developers.
>> >
>> > So, we have a new READ_PLUS call that's basically just a version of READ
>> > optimized for sparse files:
>> >
>> >     http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion2-33#section-15.10
>> >
>> > It allows an NFS server to return either file data (like a normal READ
>> > call) or, at the server's discretion, records saying "this range of the
>> > data is all zeroes".
>> >
>> > Anna tried implementing READ_PLUS for knfsd using
>> > vfs_llseek(.,.,SEEK_HOLE) followed by an ordinary read if that
>> > determines we're not at a hole.
>> >
>> > (Very) preliminary results suggest that's slower than a plain READ for
>> > an xfs file with no holes.  (And *much* slower in the ext4 case for some
>> > reason.)
>>
>> It should be a fairly cheap operastion, and does extent tree operations
>> that are pretty similar to an (uncached) read.  Do you have profiles?
>>
>> > Is that expected, and should we be doing this some other way instead?
>>
>> Are the read cached or uncached?
>
> I don't know, and don't have profiles.  I'll either try to reproduce or
> wait till Anna's back from vacation.

I'm using whatever functions NFSD already uses for reading files,
which I expect go through the VFS.  Is there a flag that controls
cache behavior?

>
>> If they are from pagecache just copying the zeroes is pretty much
>> unbeatable compared to extent tree lookups, so we'd need a new page
>> flag (difficult..) to see that a page is a hole (and then it would
>> only work for the whole page), but for uncached reads an optimization
>> would be to tell a read that it's an NFS READ_PLUS so that it could
>> just read until it reach a hole, and then we'd need some way to
>> communicate the hole size (or just fall back to SEEK_HOLE for that
>> case).
>
> Ugh, OK.  We'll do some more tests before coming back to ask about
> that....

I only had time for the one run, so I'll do more trials and see if
that one read is always so long.  I'm still hoping it was something in
the way my VM was scheduling its tasks!

Anna

>
> --b.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig March 24, 2015, 5:49 p.m. UTC | #16
On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
> > I don't know, and don't have profiles.  I'll either try to reproduce or
> > wait till Anna's back from vacation.
> 
> I'm using whatever functions NFSD already uses for reading files,
> which I expect go through the VFS.  Is there a flag that controls
> cache behavior?

There's the O_DIRECT flag, but that's not what I mean.  If you just
wrote to it, it's a cached read; if you unmounted the filesystem after
writing, or did an echo to /proc/sys/vm/drop_caches, you get uncached
read behavior.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 25, 2015, 5:15 p.m. UTC | #17
On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>> I don't know, and don't have profiles.  I'll either try to reproduce or
>>> wait till Anna's back from vacation.
>>
>> I'm using whatever functions NFSD already uses for reading files,
>> which I expect go through the VFS.  Is there a flag that controls
>> cache behavior?
> 
> There's the O_DIRECT flag, but that's not what I mean.  If you just
> wrote to it it's a cached read, if you did unmount the filesystem after
> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
> read behavior.

Oh, I'm doing uncached reads for my tests.  I'm collecting updated numbers now!

Anna
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 26, 2015, 3:21 p.m. UTC | #18
Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:

##########################
#                        #
#   Without READ_PLUS    #
#                        #
##########################


NFS v4.1:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
|    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
|   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.2:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
|    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
|   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
|---------|---------|---------|---------|---------|---------|---------|





#######################
#                     #
#   With READ_PLUS    #
#                     #
#######################


NFS v4.1:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
|    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
|   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.2:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
|    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
|   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
|---------|---------|---------|---------|---------|---------|---------|
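
In case it helps with reproducing: one way to generate files shaped like
these (the file names, the fill byte, and the pwrite/ftruncate approach
below are just my choices here, not necessarily how the originals were
made):

#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CHUNK 4096

/* Write the file in 4K chunks; optionally skip every other chunk so the
 * filesystem keeps it as a hole.  Slow but simple. */
static int fill(const char *name, long long size, int every_other)
{
    char buf[CHUNK];
    long long off;
    int fd = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0644);

    if (fd < 0) {
        perror(name);
        return -1;
    }
    memset(buf, 'a', sizeof(buf));

    for (off = 0; off < size; off += CHUNK) {
        if (every_other && (off / CHUNK) % 2)
            continue;           /* leave this chunk unwritten (a hole) */
        if (pwrite(fd, buf, CHUNK, off) != CHUNK) {
            perror("pwrite");
            close(fd);
            return -1;
        }
    }
    /* Make sure the file reaches full size even if it ends in a hole. */
    if (ftruncate(fd, size) < 0)
        perror("ftruncate");
    close(fd);
    return 0;
}

int main(void)
{
    long long size = 5LL * 1024 * 1024 * 1024;  /* 5G, as above */
    int fd;

    fill("data", size, 0);      /* 100% data */
    fill("mixed", size, 1);     /* alternating 4K data / 4K hole */

    /* 100% hole: just truncate out to the full size. */
    fd = open("hole", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd >= 0) {
        if (ftruncate(fd, size) < 0)
            perror("ftruncate");
        close(fd);
    }
    return 0;
}
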



On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>> I don't know, and don't have profiles.  I'll either try to reproduce or
>>> wait till Anna's back from vacation.
>>
>> I'm using whatever functions NFSD already uses for reading files,
>> which I expect go through the VFS.  Is there a flag that controls
>> cache behavior?
> 
> There's the O_DIRECT flag, but that's not what I mean.  If you just
> wrote to it it's a cached read, if you did unmount the filesystem after
> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
> read behavior.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust March 26, 2015, 3:32 p.m. UTC | #19
On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
<Anna.Schumaker@netapp.com> wrote:
> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>
> ##########################
> #                        #
> #   Without READ_PLUS    #
> #                        #
> ##########################
>
>
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
> #######################
> #                     #
> #   With READ_PLUS    #
> #                     #
> #######################
>
>
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
> |---------|---------|---------|---------|---------|---------|---------|
>

So there is a clear win in the 100% hole case here, but otherwise the
statistical fluctuations are dominating the numbers. Can you get us a
little more stats and then perhaps run the results through nfsometer?

>
>
> On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
>> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>>> I don't know, and don't have profiles.  I'll either try to reproduce or
>>>> wait till Anna's back from vacation.
>>>
>>> I'm using whatever functions NFSD already uses for reading files,
>>> which I expect go through the VFS.  Is there a flag that controls
>>> cache behavior?
>>
>> There's the O_DIRECT flag, but that's not what I mean.  If you just
>> wrote to it it's a cached read, if you did unmount the filesystem after
>> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
>> read behavior.
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 26, 2015, 3:36 p.m. UTC | #20
On 03/26/2015 11:32 AM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>
>> ##########################
>> #                        #
>> #   Without READ_PLUS    #
>> #                        #
>> ##########################
>>
>>
>> NFS v4.1:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.2:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>>
>> #######################
>> #                     #
>> #   With READ_PLUS    #
>> #                     #
>> #######################
>>
>>
>> NFS v4.1:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.2:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
> 
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

Sure!  Do you want any information besides runtime?

Anna

> 
>>
>>
>> On 03/24/2015 01:49 PM, Christoph Hellwig wrote:
>>> On Tue, Mar 24, 2015 at 08:43:31AM -0400, Anna Schumaker wrote:
>>>>> I don't know, and don't have profiles.  I'll either try to reproduce or
>>>>> wait till Anna's back from vacation.
>>>>
>>>> I'm using whatever functions NFSD already uses for reading files,
>>>> which I expect go through the VFS.  Is there a flag that controls
>>>> cache behavior?
>>>
>>> There's the O_DIRECT flag, but that's not what I mean.  If you just
>>> wrote to it it's a cached read, if you did unmount the filesystem after
>>> writing, or did an echo to /proc/sys/vm/drop_caches you get uncached
>>> read behavior.
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields March 26, 2015, 3:38 p.m. UTC | #21
On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
> > Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >
> > ##########################
> > #                        #
> > #   Without READ_PLUS    #
> > #                        #
> > ##########################
> >
> >
> > NFS v4.1:
> >                             Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > |         |    1    |    2    |    3    |    4    |    5    | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
> > |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
> > |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> > NFS v4.2:
> >                             Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > |         |    1    |    2    |    3    |    4    |    5    | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
> > |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
> > |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> >
> > #######################
> > #                     #
> > #   With READ_PLUS    #
> > #                     #
> > #######################
> >
> >
> > NFS v4.1:
> >                             Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > |         |    1    |    2    |    3    |    4    |    5    | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
> > |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
> > |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> >
> >
> >
> > NFS v4.2:
> >                             Trial
> > |---------|---------|---------|---------|---------|---------|---------|
> > |         |    1    |    2    |    3    |    4    |    5    | Average |
> > |---------|---------|---------|---------|---------|---------|---------|
> > |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
> > |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
> > |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
> > |---------|---------|---------|---------|---------|---------|---------|
> >
> 
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

Also, could you describe the setup (are these still kvm's), and how
you're clearing the cache between runs?

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 26, 2015, 3:47 p.m. UTC | #22
On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>> <Anna.Schumaker@netapp.com> wrote:
>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>
>>> ##########################
>>> #                        #
>>> #   Without READ_PLUS    #
>>> #                        #
>>> ##########################
>>>
>>>
>>> NFS v4.1:
>>>                             Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>> NFS v4.2:
>>>                             Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>>
>>> #######################
>>> #                     #
>>> #   With READ_PLUS    #
>>> #                     #
>>> #######################
>>>
>>>
>>> NFS v4.1:
>>>                             Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>>
>>>
>>>
>>> NFS v4.2:
>>>                             Trial
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>> |---------|---------|---------|---------|---------|---------|---------|
>>>
>>
>> So there is a clear win in the 100% hole case here, but otherwise the
>> statistical fluctuations are dominating the numbers. Can you get us a
>> little more stats and then perhaps run the results through nfsometer?
> 
> Also, could you describe the setup (are these still kvm's), and how
> you're clearing the cache between runs?

These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
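
Roughly, one pass of the test looks like the following (the hostname, export
path, mount point, and file names below are placeholders):

    # drop the server's caches before every read
    ssh server 'echo 3 > /proc/sys/vm/drop_caches'
    time cat /mnt/data > /dev/null
    ssh server 'echo 3 > /proc/sys/vm/drop_caches'
    time cat /mnt/hole > /dev/null
    ssh server 'echo 3 > /proc/sys/vm/drop_caches'
    time cat /mnt/mixed > /dev/null

    # remount the client before the next set of three
    umount /mnt
    mount -t nfs4 -o minorversion=2 server:/export /mnt
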

> 
> --b.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust March 26, 2015, 4:06 p.m. UTC | #23
On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
<Anna.Schumaker@netapp.com> wrote:
> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>> <Anna.Schumaker@netapp.com> wrote:
>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>
>>>> ##########################
>>>> #                        #
>>>> #   Without READ_PLUS    #
>>>> #                        #
>>>> ##########################
>>>>
>>>>
>>>> NFS v4.1:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.2:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> #######################
>>>> #                     #
>>>> #   With READ_PLUS    #
>>>> #                     #
>>>> #######################
>>>>
>>>>
>>>> NFS v4.1:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.2:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>
>>> So there is a clear win in the 100% hole case here, but otherwise the
>>> statistical fluctuations are dominating the numbers. Can you get us a
>>> little more stats and then perhaps run the results through nfsometer?
>>
>> Also, could you describe the setup (are these still kvm's), and how
>> you're clearing the cache between runs?
>
> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.

I agree that you have to use the 'drop_caches' interface on the
server, but why not just use O_DIRECT on the clients?
Schumaker, Anna March 26, 2015, 4:11 p.m. UTC | #24
On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>
>>>>> ##########################
>>>>> #                        #
>>>>> #   Without READ_PLUS    #
>>>>> #                        #
>>>>> ##########################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> #######################
>>>>> #                     #
>>>>> #   With READ_PLUS    #
>>>>> #                     #
>>>>> #######################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>
>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>> little more stats and then perhaps run the results through nfsometer?
>>>
>>> Also, could you describe the setup (are these still kvm's), and how
>>> you're clearing the cache between runs?
>>
>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> 
> I agree that you have to use the 'drop_caches' interface on the
> server, but why not just use O_DIRECT on the clients?

I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
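
Something like this is what I had in mind, just a sketch (the 1M buffer
size and 4K alignment are guesses at reasonable values):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BUFSIZE (1024 * 1024)

int main(int argc, char **argv)
{
    void *buf;
    ssize_t n;
    long long total = 0;
    int fd;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }

    /* O_DIRECT bypasses the client's page cache... */
    fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* ...but requires an aligned buffer and block-multiple read sizes. */
    if (posix_memalign(&buf, 4096, BUFSIZE)) {
        fprintf(stderr, "posix_memalign failed\n");
        return 1;
    }

    while ((n = read(fd, buf, BUFSIZE)) > 0)
        total += n;
    if (n < 0)
        perror("read");

    printf("read %lld bytes\n", total);
    free(buf);
    close(fd);
    return 0;
}
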

Anna

> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields March 26, 2015, 4:11 p.m. UTC | #25
On Thu, Mar 26, 2015 at 11:47:03AM -0400, Anna Schumaker wrote:
> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> > On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >> <Anna.Schumaker@netapp.com> wrote:
> >>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>
> >>> ##########################
> >>> #                        #
> >>> #   Without READ_PLUS    #
> >>> #                        #
> >>> ##########################
> >>>
> >>>
> >>> NFS v4.1:
> >>>                             Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
> >>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
> >>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>> NFS v4.2:
> >>>                             Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
> >>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
> >>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>>
> >>> #######################
> >>> #                     #
> >>> #   With READ_PLUS    #
> >>> #                     #
> >>> #######################
> >>>
> >>>
> >>> NFS v4.1:
> >>>                             Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
> >>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
> >>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>>
> >>>
> >>>
> >>> NFS v4.2:
> >>>                             Trial
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
> >>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
> >>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
> >>> |---------|---------|---------|---------|---------|---------|---------|
> >>>
> >>
> >> So there is a clear win in the 100% hole case here, but otherwise the
> >> statistical fluctuations are dominating the numbers. Can you get us a
> >> little more stats and then perhaps run the results through nfsometer?
> > 
> > Also, could you describe the setup (are these still kvm's), and how
> > you're clearing the cache between runs?
> 
> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.

What sort of device is the exported xfs filesystem on?  (Can't there
be a second level of caching on the guest, depending on how it's set
up?)

Can we get results on bare metal?  (The kvm test might be a good
worst-case for read_plus, as I'd expect bandwidth to be relatively high
compared to the cost of the extra memcpy's or seek calls.  But it also
seems more complicated.)

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust March 26, 2015, 4:13 p.m. UTC | #26
On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
<Anna.Schumaker@netapp.com> wrote:
> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>> <Anna.Schumaker@netapp.com> wrote:
>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>
>>>>>> ##########################
>>>>>> #                        #
>>>>>> #   Without READ_PLUS    #
>>>>>> #                        #
>>>>>> ##########################
>>>>>>
>>>>>>
>>>>>> NFS v4.1:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.2:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> #######################
>>>>>> #                     #
>>>>>> #   With READ_PLUS    #
>>>>>> #                     #
>>>>>> #######################
>>>>>>
>>>>>>
>>>>>> NFS v4.1:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.2:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>
>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>
>>>> Also, could you describe the setup (are these still kvm's), and how
>>>> you're clearing the cache between runs?
>>>
>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>
>> I agree that you have to use the 'drop_caches' interface on the
>> server, but why not just use O_DIRECT on the clients?
>
> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>

'dd' can do that for you if the appropriate incantations are performed.
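
For example, something along these lines (assuming a 1M block size and one
of the test files on the mount):

    dd if=/mnt/data of=/dev/null bs=1M iflag=direct
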
Schumaker, Anna March 26, 2015, 4:14 p.m. UTC | #27
On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>> <Anna.Schumaker@netapp.com> wrote:
>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>
>>>>>>> ##########################
>>>>>>> #                        #
>>>>>>> #   Without READ_PLUS    #
>>>>>>> #                        #
>>>>>>> ##########################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> #######################
>>>>>>> #                     #
>>>>>>> #   With READ_PLUS    #
>>>>>>> #                     #
>>>>>>> #######################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>
>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>
>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>> you're clearing the cache between runs?
>>>>
>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>
>>> I agree that you have to use the 'drop_caches' interface on the
>>> server, but why not just use O_DIRECT on the clients?
>>
>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>
> 
> 'dd' can do that for you if the appropriate incantations are performed.

Got it.  I'll sacrifice a goat to 'dd' and rerun the tests with O_DIRECT!
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 26, 2015, 4:18 p.m. UTC | #28
On 03/26/2015 12:11 PM, J. Bruce Fields wrote:
> On Thu, Mar 26, 2015 at 11:47:03AM -0400, Anna Schumaker wrote:
>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>
>>>>> ##########################
>>>>> #                        #
>>>>> #   Without READ_PLUS    #
>>>>> #                        #
>>>>> ##########################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> #######################
>>>>> #                     #
>>>>> #   With READ_PLUS    #
>>>>> #                     #
>>>>> #######################
>>>>>
>>>>>
>>>>> NFS v4.1:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> NFS v4.2:
>>>>>                             Trial
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>
>>>>
>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>> little more stats and then perhaps run the results through nfsometer?
>>>
>>> Also, could you describe the setup (are these still kvm's), and how
>>> you're clearing the cache between runs?
>>
>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> 
> What sort of device is the exported xfs filesystem on?  (Can't there
> be a second level of caching on the guest, depending on how it's set
> up?)

My host is a MacBook Pro running Arch Linux, and I have all my virtio disks set to "cache mode = none".  Let me know if you were asking something different!


> 
> Can we get results on bare metal?  (The kvm test might be a good
> worst-case for read_plus, as I'd expect bandwidth to be relatively high
> compared to the cost of the extra memcpy's or seek calls.  But it also
> seems more complicated.)

I do all of my testing on KVM these days!  I'll see how difficult it is to set up rEFInd with a custom kernel to test between my laptop and my desktop (or I could run the test between my Raspberry Pis!)

Anna

> 
> --b.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 27, 2015, 7:04 p.m. UTC | #29
I did two separate dd tests with the same 5G files from yesterday, still using the same virtual machines.  First, I ran dd using direct IO for reads:
	dd if=/nfs/file iflag=direct of=/dev/null bs=128K

Mixed file performance was awful, so I reran without direct IO enabled for comparison:
	dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K

bs=128K sets the block size used by dd to the NFS rsize; without it, dd will only read 512 bytes at a time and take forever to complete.
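
For anyone who would rather skip dd entirely: iflag=direct is doing roughly what the small C reader below does -- open the file with O_DIRECT, read into an aligned buffer in 128K chunks, and throw the data away.  This is only a sketch, not part of my test harness; the path and block size are the ones from the commands above, and the 4K buffer alignment is an assumption that covers the usual device block sizes.

/* Minimal sketch of an O_DIRECT reader, roughly what
 * "dd if=/nfs/file iflag=direct of=/dev/null bs=128K" ends up doing.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t bs = 128 * 1024;	/* match the NFS rsize */
	void *buf;
	ssize_t n;
	int fd;

	/* O_DIRECT wants an aligned buffer; 4K covers the usual block sizes. */
	if (posix_memalign(&buf, 4096, bs))
		return 1;

	fd = open("/nfs/file", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Read the whole file and discard it, like of=/dev/null. */
	while ((n = read(fd, buf, bs)) > 0)
		;
	if (n < 0)
		perror("read");

	close(fd);
	free(buf);
	return n < 0;
}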


##########################
#                        #
#   Without READ_PLUS    #
#                        #
##########################


NFS v4.1, iflag=direct:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
|    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
|   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=direct:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
|    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
|   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.1, iflag=nocache oflag=nocache:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
|    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
|   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=nocache oflag=nocache:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
|    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
|   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
|---------|---------|---------|---------|---------|---------|---------|






#######################
#                     #
#   With READ_PLUS    #
#                     #
#######################


NFS v4.1, iflag=direct:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
|    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
|   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=direct:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
|    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
|   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
|---------|---------|---------|---------|---------|---------|---------|




NFS v4.1, iflag=nocache oflag=nocache:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
|    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
|   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2, iflag=nocache oflag=nocache:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
|    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
|   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
|---------|---------|---------|---------|---------|---------|---------|


On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>> <Anna.Schumaker@netapp.com> wrote:
>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>
>>>>>>> ##########################
>>>>>>> #                        #
>>>>>>> #   Without READ_PLUS    #
>>>>>>> #                        #
>>>>>>> ##########################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> #######################
>>>>>>> #                     #
>>>>>>> #   With READ_PLUS    #
>>>>>>> #                     #
>>>>>>> #######################
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.1:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> NFS v4.2:
>>>>>>>                             Trial
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>
>>>>>>
>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>
>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>> you're clearing the cache between runs?
>>>>
>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>
>>> I agree that you have to use the 'drop_caches' interface on the
>>> server, but why not just use O_DIRECT on the clients?
>>
>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>
> 
> 'dd' can do that for you if the appropriate incantations are performed.
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust March 27, 2015, 8:22 p.m. UTC | #30
On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
<Anna.Schumaker@netapp.com> wrote:
> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>
> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>
> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>
>
> ##########################
> #                        #
> #   Without READ_PLUS    #
> #                        #
> ##########################
>
>
> NFS v4.1, iflag=direct:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=direct:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.1, iflag=nocache oflag=nocache:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=nocache oflag=nocache:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
>
>
> #######################
> #                     #
> #   With READ_PLUS    #
> #                     #
> #######################
>
>
> NFS v4.1, iflag=direct:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=direct:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |

That's a bit nasty. Any idea what is going on with the Mixed case here?

> |---------|---------|---------|---------|---------|---------|---------|
>
>
>
>
> NFS v4.1, iflag=nocache oflag=nocache:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> NFS v4.2, iflag=nocache oflag=nocache:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
> |---------|---------|---------|---------|---------|---------|---------|
>
>
> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>> <Anna.Schumaker@netapp.com> wrote:
>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>
>>>>>>>> ##########################
>>>>>>>> #                        #
>>>>>>>> #   Without READ_PLUS    #
>>>>>>>> #                        #
>>>>>>>> ##########################
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.1:
>>>>>>>>                             Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.2:
>>>>>>>>                             Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> #######################
>>>>>>>> #                     #
>>>>>>>> #   With READ_PLUS    #
>>>>>>>> #                     #
>>>>>>>> #######################
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.1:
>>>>>>>>                             Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> NFS v4.2:
>>>>>>>>                             Trial
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>
>>>>>>>
>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>
>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>> you're clearing the cache between runs?
>>>>>
>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>
>>>> I agree that you have to use the 'drop_caches' interface on the
>>>> server, but why not just use O_DIRECT on the clients?
>>>
>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>>
>>
>> 'dd' can do that for you if the appropriate incantations are performed.
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 27, 2015, 8:46 p.m. UTC | #31
On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> <Anna.Schumaker@netapp.com> wrote:
>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
>>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>
>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>
>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>
>>
>> ##########################
>> #                        #
>> #   Without READ_PLUS    #
>> #                        #
>> ##########################
>>
>>
>> NFS v4.1, iflag=direct:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
>> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=direct:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
>> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.1, iflag=nocache oflag=nocache:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
>> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
>> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=nocache oflag=nocache:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
>> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
>> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>>
>>
>> #######################
>> #                     #
>> #   With READ_PLUS    #
>> #                     #
>> #######################
>>
>>
>> NFS v4.1, iflag=direct:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
>> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
>> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=direct:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
>> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
> 
> That's a bit nasty. Any idea what is going on with the Mixed case here?

Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
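
To make that concrete: finding out how long each hole and data segment is means something like the SEEK_DATA/SEEK_HOLE walk sketched below, i.e. an extra seek into the filesystem for every segment boundary in the requested range.  This is only an illustration of the pattern (userspace, minimal error handling), not the actual nfsd code path.

/* Sketch: discovering hole/data segment lengths with SEEK_DATA and
 * SEEK_HOLE -- one extra seek per segment boundary.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void walk_segments(int fd, off_t offset, off_t end)
{
	while (offset < end) {
		off_t data = lseek(fd, offset, SEEK_DATA);
		off_t hole;

		if (data == (off_t)-1 || data >= end) {
			/* Nothing but hole from here to the end. */
			printf("hole: %lld +%lld\n", (long long)offset,
			       (long long)(end - offset));
			break;
		}
		if (data > offset)	/* hole before the next data segment */
			printf("hole: %lld +%lld\n", (long long)offset,
			       (long long)(data - offset));

		hole = lseek(fd, data, SEEK_HOLE);  /* end of this data segment */
		if (hole > end)
			hole = end;
		printf("data: %lld +%lld\n", (long long)data,
		       (long long)(hole - data));
		offset = hole;
	}
}

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "/nfs/file", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	walk_segments(fd, 0, lseek(fd, 0, SEEK_END));
	close(fd);
	return 0;
}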

Anna

> 
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>>
>>
>> NFS v4.1, iflag=nocache oflag=nocache:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
>> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
>> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> NFS v4.2, iflag=nocache oflag=nocache:
>>                             Trial
>> |---------|---------|---------|---------|---------|---------|---------|
>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>> |---------|---------|---------|---------|---------|---------|---------|
>> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
>> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
>> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
>> |---------|---------|---------|---------|---------|---------|---------|
>>
>>
>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>> <Anna.Schumaker@netapp.com> wrote:
>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>
>>>>>>>>> ##########################
>>>>>>>>> #                        #
>>>>>>>>> #   Without READ_PLUS    #
>>>>>>>>> #                        #
>>>>>>>>> ##########################
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.1:
>>>>>>>>>                             Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.2:
>>>>>>>>>                             Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> #######################
>>>>>>>>> #                     #
>>>>>>>>> #   With READ_PLUS    #
>>>>>>>>> #                     #
>>>>>>>>> #######################
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.1:
>>>>>>>>>                             Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> NFS v4.2:
>>>>>>>>>                             Trial
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>
>>>>>>>>
>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>
>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>> you're clearing the cache between runs?
>>>>>>
>>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>
>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>> server, but why not just use O_DIRECT on the clients?
>>>>
>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>>>
>>>
>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
J. Bruce Fields March 27, 2015, 8:54 p.m. UTC | #32
On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> > On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> > <Anna.Schumaker@netapp.com> wrote:
> >> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
> >>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
> >>
> >> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
> >>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
> >>
> >> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
> >>
> >>
> >> ##########################
> >> #                        #
> >> #   Without READ_PLUS    #
> >> #                        #
> >> ##########################
> >>
> >>
> >> NFS v4.1, iflag=direct:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> >> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
> >> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=direct:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> >> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
> >> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >> NFS v4.1, iflag=nocache oflag=nocache:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
> >> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
> >> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=nocache oflag=nocache:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
> >> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
> >> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >>
> >>
> >> #######################
> >> #                     #
> >> #   With READ_PLUS    #
> >> #                     #
> >> #######################
> >>
> >>
> >> NFS v4.1, iflag=direct:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
> >> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
> >> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=direct:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> >> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
> >> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
> > 
> > That's a bit nasty. Any idea what is going on with the Mixed case here?
> 
> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.

Remind us what "mixed" means?  (I think you were alternating, but how
large is each segment?)

--b.

> 
> Anna
> 
> > 
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >>
> >>
> >> NFS v4.1, iflag=nocache oflag=nocache:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
> >> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
> >> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> NFS v4.2, iflag=nocache oflag=nocache:
> >>                             Trial
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
> >> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
> >> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
> >> |---------|---------|---------|---------|---------|---------|---------|
> >>
> >>
> >> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> >>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> >>> <Anna.Schumaker@netapp.com> wrote:
> >>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> >>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> >>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> >>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >>>>>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>>>>>>>
> >>>>>>>>> ##########################
> >>>>>>>>> #                        #
> >>>>>>>>> #   Without READ_PLUS    #
> >>>>>>>>> #                        #
> >>>>>>>>> ##########################
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.1:
> >>>>>>>>>                             Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
> >>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
> >>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.2:
> >>>>>>>>>                             Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
> >>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
> >>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> #######################
> >>>>>>>>> #                     #
> >>>>>>>>> #   With READ_PLUS    #
> >>>>>>>>> #                     #
> >>>>>>>>> #######################
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.1:
> >>>>>>>>>                             Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
> >>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
> >>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> NFS v4.2:
> >>>>>>>>>                             Trial
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
> >>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
> >>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
> >>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
> >>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
> >>>>>>>> little more stats and then perhaps run the results through nfsometer?
> >>>>>>>
> >>>>>>> Also, could you describe the setup (are these still kvm's), and how
> >>>>>>> you're clearing the cache between runs?
> >>>>>>
> >>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> >>>>>
> >>>>> I agree that you have to use the 'drop_caches' interface on the
> >>>>> server, but why not just use O_DIRECT on the clients?
> >>>>
> >>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
> >>>>
> >>>
> >>> 'dd' can do that for you if the appropriate incantations are performed.
> >>>
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna March 27, 2015, 8:55 p.m. UTC | #33
On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
> On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
>> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
>>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
>>> <Anna.Schumaker@netapp.com> wrote:
>>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
>>>>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>>>
>>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>>>>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>>>
>>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>>>
>>>>
>>>> ##########################
>>>> #                        #
>>>> #   Without READ_PLUS    #
>>>> #                        #
>>>> ##########################
>>>>
>>>>
>>>> NFS v4.1, iflag=direct:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>>>> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
>>>> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=direct:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>>>> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
>>>> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
>>>> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
>>>> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
>>>> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
>>>> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> #######################
>>>> #                     #
>>>> #   With READ_PLUS    #
>>>> #                     #
>>>> #######################
>>>>
>>>>
>>>> NFS v4.1, iflag=direct:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
>>>> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
>>>> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=direct:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>>>> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
>>>> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
>>>
>>> That's a bit nasty. Any idea what is going on with the Mixed case here?
>>
>> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
> 
> Remind us what "mixed" means?  (I think you were alternating, but how
> large is each segment?)

"Mixed" is alternating 4K segments.

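Concretely, the mixed file is laid out like what this sketch produces: 4K of data, then a 4K hole, repeated out to 5G.  It's not the script I actually used to create the file -- the output name, the fill byte, and starting with data rather than a hole are arbitrary choices here.

/* Sketch of a "mixed" test file: 4K of data followed by a 4K hole,
 * repeated out to 5G.
 */
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const off_t total = 5LL * 1024 * 1024 * 1024;	/* 5G */
	const size_t seg = 4096;			/* 4K segments */
	char buf[4096];
	off_t pos = 0;
	int fd;

	memset(buf, 'a', sizeof(buf));
	fd = open("mixed", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	while (pos < total) {
		if (write(fd, buf, seg) != (ssize_t)seg) {	/* 4K of data */
			perror("write");
			return 1;
		}
		/* Skip forward 4K; the next write (or the final truncate)
		 * turns the gap into a hole. */
		pos = lseek(fd, seg, SEEK_CUR);
	}
	/* Extend the file so the final 4K skip really is a hole --
	 * seeking past EOF by itself doesn't change the file size. */
	if (ftruncate(fd, total))
		perror("ftruncate");
	close(fd);
	return 0;
}
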
> 
> --b.
> 
>>
>> Anna
>>
>>>
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>>
>>>>
>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
>>>> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
>>>> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>                             Trial
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
>>>> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
>>>> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>
>>>>
>>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>>>
>>>>>>>>>>> ##########################
>>>>>>>>>>> #                        #
>>>>>>>>>>> #   Without READ_PLUS    #
>>>>>>>>>>> #                        #
>>>>>>>>>>> ##########################
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>                             Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>                             Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> #######################
>>>>>>>>>>> #                     #
>>>>>>>>>>> #   With READ_PLUS    #
>>>>>>>>>>> #                     #
>>>>>>>>>>> #######################
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>                             Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>                             Trial
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>>>
>>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>>>> you're clearing the cache between runs?
>>>>>>>>
>>>>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>>>
>>>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>>>> server, but why not just use O_DIRECT on the clients?
>>>>>>
>>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>>>>>
>>>>>
>>>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>>>
>>>>
>>>
>>>
>>>

J. Bruce Fields March 27, 2015, 9:08 p.m. UTC | #34
On Fri, Mar 27, 2015 at 04:55:26PM -0400, Anna Schumaker wrote:
> On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
> > On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
> >> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
> >>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
> >>> <Anna.Schumaker@netapp.com> wrote:
> >>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
> >>>>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
> >>>>
> >>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
> >>>>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
> >>>>
> >>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
> >>>>
> >>>>
> >>>> ##########################
> >>>> #                        #
> >>>> #   Without READ_PLUS    #
> >>>> #                        #
> >>>> ##########################
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=direct:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
> >>>> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
> >>>> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=direct:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
> >>>> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
> >>>> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=nocache oflag=nocache:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
> >>>> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
> >>>> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=nocache oflag=nocache:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
> >>>> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
> >>>> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> #######################
> >>>> #                     #
> >>>> #   With READ_PLUS    #
> >>>> #                     #
> >>>> #######################
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=direct:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
> >>>> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
> >>>> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=direct:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
> >>>> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
> >>>> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
> >>>
> >>> That's a bit nasty. Any idea what is going on with the Mixed case here?
> >>
> >> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
> > 
> > Remind us what "mixed" means?  (I think you were alternating, but how
> > large is each segment?)
> 
> "Mixed" is alternating 4K segments.

So it's probably doing 128/4 = 32 reads where previously one was
necessary.  You could confirm that by looking at the READ counts in
/proc/self/mountstats.  With O_DIRECT turned off, maybe that's hidden by
readahead?
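
For example, here is a minimal sketch (not something used in these tests)
that dumps the READ-related per-op lines from /proc/self/mountstats, so the
counts can be compared before and after a run.  The exact field layout
varies by kernel version, so it just prints the raw lines:

/* Sketch: print the READ (and, if the client exposes it, READ_PLUS)
 * per-op lines from /proc/self/mountstats.  The counters are cumulative
 * per mount, so run it once before and once after the dd and diff the
 * output to see how many ops the mixed file actually generated. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/mountstats", "r");
	char line[512];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "READ:") || strstr(line, "READ_PLUS:"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}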

--b.

> 
> > 
> > --b.
> > 
> >>
> >> Anna
> >>
> >>>
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>>
> >>>>
> >>>> NFS v4.1, iflag=nocache oflag=nocache:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
> >>>> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
> >>>> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> NFS v4.2, iflag=nocache oflag=nocache:
> >>>>                             Trial
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
> >>>> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
> >>>> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
> >>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>
> >>>>
> >>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
> >>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
> >>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
> >>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
> >>>>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
> >>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> >>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
> >>>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
> >>>>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
> >>>>>>>>>>>
> >>>>>>>>>>> ##########################
> >>>>>>>>>>> #                        #
> >>>>>>>>>>> #   Without READ_PLUS    #
> >>>>>>>>>>> #                        #
> >>>>>>>>>>> ##########################
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.1:
> >>>>>>>>>>>                             Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
> >>>>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
> >>>>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.2:
> >>>>>>>>>>>                             Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
> >>>>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
> >>>>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> #######################
> >>>>>>>>>>> #                     #
> >>>>>>>>>>> #   With READ_PLUS    #
> >>>>>>>>>>> #                     #
> >>>>>>>>>>> #######################
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.1:
> >>>>>>>>>>>                             Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
> >>>>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
> >>>>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> NFS v4.2:
> >>>>>>>>>>>                             Trial
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
> >>>>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
> >>>>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
> >>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
> >>>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
> >>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
> >>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
> >>>>>>>>>
> >>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
> >>>>>>>>> you're clearing the cache between runs?
> >>>>>>>>
> >>>>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
> >>>>>>>
> >>>>>>> I agree that you have to use the 'drop_caches' interface on the
> >>>>>>> server, but why not just use O_DIRECT on the clients?
> >>>>>>
> >>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
> >>>>>>
> >>>>>
> >>>>> 'dd' can do that for you if the appropriate incantations are performed.
> >>>>>
> >>>>
> >>>
> >>>
> >>>
Christoph Hellwig March 30, 2015, 2:06 p.m. UTC | #35
On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
> So there is a clear win in the 100% hole case here, but otherwise the
> statistical fluctuations are dominating the numbers. Can you get us a
> little more stats and then perhaps run the results through nfsometer?

And that's just the uncached reads if I understand the thread correctly.
The cached case isn't uncommon in real life, so regressing it isn't
really an option either.
Schumaker, Anna April 15, 2015, 7:32 p.m. UTC | #36
I just ran some more tests comparing the direct I/O case across different filesystem types.  These tests used three 1G files: one 100% data, one 100% hole, and one mixed file alternating 4K data and hole segments.  The mixed case is consistently slower over v4.2 with READ_PLUS than over NFS v4.1, and I'm at a loss for anything I could do to make it faster.  Here are my numbers:
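
For reference, here is a minimal sketch (not the script used for these runs)
of how a mixed file like that can be laid out: it writes every other 4K
block, leaves the alternating 4K unwritten so it stays a hole, and finishes
with an ftruncate so the file ends in a hole.

/* Sketch only: create a sparse file alternating 4K data and 4K holes.
 * The size and block size below are assumptions matching the test
 * description above (1G file, 4K segments). */
#define _GNU_SOURCE
#include <sys/types.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

int main(int argc, char *argv[])
{
	const off_t size = 1024 * 1024 * 1024;	/* 1G */
	const off_t blk = 4096;			/* 4K data, then 4K hole */
	char buf[4096];
	off_t off;
	int fd;

	if (argc != 2)
		errx(1, "usage: %s <filename>", argv[0]);
	fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		err(1, "open");
	memset(buf, 'a', sizeof(buf));
	for (off = 0; off < size; off += 2 * blk) {
		if (pwrite(fd, buf, blk, off) != blk)
			err(1, "pwrite");
	}
	if (ftruncate(fd, size) == -1)	/* leave the final 4K as a hole */
		err(1, "ftruncate");
	close(fd);
	return 0;
}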

###########
#         #
#   XFS   #
#         #
###########


NFS v4.1:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  1.883s |  1.808s |  1.781s |  1.685s |  1.591s |  1.746s |
|    Hole |  1.815s |  1.635s |  1.682s |  1.698s |  1.653s |  1.697s |
|   Mixed |  2.089s |  2.024s |  1.970s |  1.925s |  2.049s |  2.011s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  1.849s |  1.879s |  1.852s |  1.799s |  1.781s |  1.832s |
|    Hole |  0.668s |  0.600s |  0.611s |  0.619s |  0.617s |  0.623s |
|   Mixed |  5.913s |  5.811s |  5.952s |  5.962s |  5.806s |  5.889s |
|---------|---------|---------|---------|---------|---------|---------|





############
#          #
#   EXT4   #
#          #
############


NFS v4.1:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  2.637s |  1.823s |  1.792s |  1.816s |  2.000s |  2.014s |
|    Hole |  1.734s |  1.743s |  1.709s |  1.761s |  1.871s |  1.764s |
|   Mixed |  5.465s |  2.158s |  2.254s |  2.676s |  2.422s |  2.995s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  1.934s |  1.783s |  1.800s |  2.010s |  1.982s |  1.902s |
|    Hole | 63.568s | 63.423s | 64.671s | 66.190s | 65.985s | 64.767s |
|   Mixed |  6.010s |  5.798s |  6.146s |  6.460s |  6.720s |  6.225s |
|---------|---------|---------|---------|---------|---------|---------|





#############
#           #
#   BTRFS   #
#           #
#############


NFS v4.1:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  2.386s |  1.952s |  1.832s |  1.818s |  1.826s |  1.963s |
|    Hole |  1.759s |  1.717s |  1.754s |  1.621s |  1.708s |  1.712s |
|   Mixed |  2.889s |  2.272s |  2.778s |  2.277s |  2.255s |  2.494s |
|---------|---------|---------|---------|---------|---------|---------|


NFS v4.2:
                            Trial
|---------|---------|---------|---------|---------|---------|---------|
|         |    1    |    2    |    3    |    4    |    5    | Average |
|---------|---------|---------|---------|---------|---------|---------|
|    Data |  2.586s |  1.816s |  2.022s |  1.862s |  1.975s |  2.052s |
|    Hole |  0.646s |  0.659s |  0.669s |  0.628s |  0.605s |  0.641s |
|   Mixed |  8.555s |  8.553s |  7.904s |  8.567s |  8.286s |  8.373s |
|---------|---------|---------|---------|---------|---------|---------|


On 03/27/2015 05:08 PM, J. Bruce Fields wrote:
> On Fri, Mar 27, 2015 at 04:55:26PM -0400, Anna Schumaker wrote:
>> On 03/27/2015 04:54 PM, J. Bruce Fields wrote:
>>> On Fri, Mar 27, 2015 at 04:46:55PM -0400, Anna Schumaker wrote:
>>>> On 03/27/2015 04:22 PM, Trond Myklebust wrote:
>>>>> On Fri, Mar 27, 2015 at 3:04 PM, Anna Schumaker
>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>> I did two separate dd tests with the same 5G file from yesterday, and still using the same virtual machines.  First, I ran dd using direct IO for reads:
>>>>>>         dd if=/nfs/file iflag=direct of=/dev/null bs=128K
>>>>>>
>>>>>> Mixed file performance was awful, so I reran without direct IO enabled for comparison:
>>>>>>         dd if=/nfs/file iflag=nocache of=/dev/null oflag=nocache bs=128K
>>>>>>
>>>>>> bs=128K sets the block size used by dd to the NFS rsize, without this dd will only read 512 bytes at a time and take forever to complete.
>>>>>>
>>>>>>
>>>>>> ##########################
>>>>>> #                        #
>>>>>> #   Without READ_PLUS    #
>>>>>> #                        #
>>>>>> ##########################
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=direct:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data | 11.704s | 11.055s | 11.329s | 11.453s | 10.741s | 11.256s |
>>>>>> |    Hole |  9.839s |  9.326s |  9.381s |  9.430s |  8.875s |  9.370s |
>>>>>> |   Mixed | 19.150s | 19.468s | 18.650s | 18.537s | 19.312s | 19.023s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=direct:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data | 10.927s | 10.885s | 11.114s | 11.283s | 10.371s | 10.916s |
>>>>>> |    Hole |  9.515s |  9.039s |  9.116s |  8.867s |  8.905s |  9.088s |
>>>>>> |   Mixed | 19.149s | 18.656s | 19.400s | 18.834s | 20.041s | 19.216s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  6.808s |  6.698s |  7.482s |  6.761s |  7.235s |  6.995s |
>>>>>> |    Hole |  5.350s |  5.148s |  5.161s |  5.070s |  5.089s |  5.164s |
>>>>>> |   Mixed |  9.316s |  8.731s |  9.072s |  9.145s |  8.627s |  8.978s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  6.686s |  6.848s |  6.876s |  6.799s |  7.815s |  7.004s |
>>>>>> |    Hole |  5.092s |  5.330s |  5.050s |  5.280s |  5.030s |  5.156s |
>>>>>> |   Mixed |  8.142s |  7.897s |  8.040s |  7.960s |  8.050s |  8.018s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> #######################
>>>>>> #                     #
>>>>>> #   With READ_PLUS    #
>>>>>> #                     #
>>>>>> #######################
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=direct:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  9.464s | 10.181s | 10.048s |  9.452s | 10.795s |  9.988s |
>>>>>> |    Hole |  7.954s |  8.486s |  7.762s |  7.969s |  8.299s |  8.094s |
>>>>>> |   Mixed | 19.037s | 18.323s | 18.965s | 18.156s | 19.185s | 18.733s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=direct:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data | 11.923s | 10.026s | 10.222s | 12.387s | 11.431s | 11.198s |
>>>>>> |    Hole |  3.247s |  3.155s |  3.191s |  3.243s |  3.202s |  3.208s |
>>>>>> |   Mixed | 54.677s | 54.697s | 52.978s | 53.704s | 54.054s | 54.022s |
>>>>>
>>>>> That's a bit nasty. Any idea what is going on with the Mixed case here?
>>>>
>>>> Not offhand, but my first guess would be something to do with extra seeks to find how long each hole and data segment is.
>>>
>>> Remind us what "mixed" means?  (I think you were alternating, but how
>>> large is each segment?)
>>
>> "Mixed" is alternating 4K segments.
> 
> So it's probably doing 128/4 = 32 reads where previously one was
> necessary.  You could confirm that by looking at the READ counts in
> /proc/self/mountstats.  With odirect turned off maybe that's hidden by
> readahead?
> 
> --b.
> 
>>
>>>
>>> --b.
>>>
>>>>
>>>> Anna
>>>>
>>>>>
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> NFS v4.1, iflag=nocache oflag=nocache:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  6.788s |  6.802s |  6.750s |  6.756s |  6.852s |  6.790s |
>>>>>> |    Hole |  5.143s |  5.165s |  5.104s |  5.154s |  5.116s |  5.136s |
>>>>>> |   Mixed |  7.902s |  7.693s |  9.169s |  8.186s |  9.157s |  8.421s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> NFS v4.2, iflag=nocache oflag=nocache:
>>>>>>                             Trial
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>> |    Data |  6.897s |  6.862s |  7.054s |  6.961s |  7.081s |  6.971s |
>>>>>> |    Hole |  1.690s |  1.673s |  1.553s |  1.554s |  1.490s |  1.592s |
>>>>>> |   Mixed |  9.009s |  7.840s |  7.661s |  8.945s |  7.649s |  8.221s |
>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>
>>>>>>
>>>>>> On 03/26/2015 12:13 PM, Trond Myklebust wrote:
>>>>>>> On Thu, Mar 26, 2015 at 12:11 PM, Anna Schumaker
>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>> On 03/26/2015 12:06 PM, Trond Myklebust wrote:
>>>>>>>>> On Thu, Mar 26, 2015 at 11:47 AM, Anna Schumaker
>>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>>>> On 03/26/2015 11:38 AM, J. Bruce Fields wrote:
>>>>>>>>>>> On Thu, Mar 26, 2015 at 11:32:25AM -0400, Trond Myklebust wrote:
>>>>>>>>>>>> On Thu, Mar 26, 2015 at 11:21 AM, Anna Schumaker
>>>>>>>>>>>> <Anna.Schumaker@netapp.com> wrote:
>>>>>>>>>>>>> Here are my updated numbers!  I tested with files 5G in size: one 100% data, one 100% hole, and one alternating between hole and data every 4K.  I collected data for both v4.1 and v4.2 with and without the READ_PLUS patches:
>>>>>>>>>>>>>
>>>>>>>>>>>>> ##########################
>>>>>>>>>>>>> #                        #
>>>>>>>>>>>>> #   Without READ_PLUS    #
>>>>>>>>>>>>> #                        #
>>>>>>>>>>>>> ##########################
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>>>                             Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |    Data |  8.723s |  7.243s |  8.252s |  6.997s |  6.980s |  7.639s |
>>>>>>>>>>>>> |    Hole |  5.271s |  5.224s |  5.060s |  4.897s |  5.321s |  5.155s |
>>>>>>>>>>>>> |   Mixed |  8.050s | 10.057s |  7.919s |  8.060s |  9.557s |  8.729s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>>>                             Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |    Data |  6.707s |  7.070s |  6.722s |  6.761s |  6.810s |  6.814s |
>>>>>>>>>>>>> |    Hole |  5.152s |  5.149s |  5.213s |  5.206s |  5.312s |  5.206s |
>>>>>>>>>>>>> |   Mixed |  7.979s |  7.985s |  8.177s |  7.772s |  8.280s |  8.039s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> #######################
>>>>>>>>>>>>> #                     #
>>>>>>>>>>>>> #   With READ_PLUS    #
>>>>>>>>>>>>> #                     #
>>>>>>>>>>>>> #######################
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.1:
>>>>>>>>>>>>>                             Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |    Data |  9.082s |  7.008s |  7.116s |  6.771s |  7.902s |  7.576s |
>>>>>>>>>>>>> |    Hole |  5.333s |  5.358s |  5.380s |  5.161s |  5.282s |  5.303s |
>>>>>>>>>>>>> |   Mixed |  8.189s |  8.308s |  9.540s |  7.937s |  8.420s |  8.479s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> NFS v4.2:
>>>>>>>>>>>>>                             Trial
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |         |    1    |    2    |    3    |    4    |    5    | Average |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>> |    Data |  7.033s |  6.829s |  7.025s |  6.873s |  7.134s |  6.979s |
>>>>>>>>>>>>> |    Hole |  1.794s |  1.800s |  1.905s |  1.811s |  1.725s |  1.807s |
>>>>>>>>>>>>> |   Mixed |  7.590s |  8.777s |  9.423s | 10.366s |  8.024s |  8.836s |
>>>>>>>>>>>>> |---------|---------|---------|---------|---------|---------|---------|
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> So there is a clear win in the 100% hole case here, but otherwise the
>>>>>>>>>>>> statistical fluctuations are dominating the numbers. Can you get us a
>>>>>>>>>>>> little more stats and then perhaps run the results through nfsometer?
>>>>>>>>>>>
>>>>>>>>>>> Also, could you describe the setup (are these still kvm's), and how
>>>>>>>>>>> you're clearing the cache between runs?
>>>>>>>>>>
>>>>>>>>>> These are still KVMs and my server is exporting an xfs filesystem.  I clear caches by running "echo 3 > /proc/sys/vm/drop_caches" on the server before every read, and I remount my client after reading each set of three files once.
>>>>>>>>>
>>>>>>>>> I agree that you have to use the 'drop_caches' interface on the
>>>>>>>>> server, but why not just use O_DIRECT on the clients?
>>>>>>>>
>>>>>>>> I've been reading by using cat from my test shell script: `time cat /nfs/file > /dev/null`.  I can write something to read files with O_DIRECT if that would be more useful!
>>>>>>>>
>>>>>>>
>>>>>>> 'dd' can do that for you if the appropriate incantations are performed.
>>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>>

J. Bruce Fields April 15, 2015, 7:56 p.m. UTC | #37
On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> I just ran some more tests comparing the directio case across
> different filesystem types.  These tests used three 1G files:  100%
> data, 100% hole, and mixed file with alternating 4k data and hole
> segments.  The mixed case seems to be consistently slower compared to
> NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> Here are my numbers:

Have you tried the implementation we discussed that always returns a
single segment covering the whole requested range, by treating holes as
data if necessary when they don't cover the whole range?
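
For what it's worth, here is a user-space sketch of that policy (an
illustration only, not code from this series): report a single HOLE segment
only when the entire requested range sits inside a hole, and otherwise treat
the whole range as one DATA segment.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <err.h>

enum seg_type { SEG_DATA, SEG_HOLE };

/* HOLE only if [offset, offset + length) falls entirely within a hole
 * (or within the hole running out to EOF); otherwise one DATA segment,
 * treating any embedded holes as data. */
static enum seg_type classify_range(int fd, off_t offset, off_t length)
{
	off_t hole, data;

	hole = lseek(fd, offset, SEEK_HOLE);
	if (hole == -1 || hole != offset)
		return SEG_DATA;	/* range starts in data (or lookup failed) */
	data = lseek(fd, offset, SEEK_DATA);
	if (data == -1 && errno == ENXIO)
		return SEG_HOLE;	/* no data past offset: hole to EOF */
	if (data >= offset + length)
		return SEG_HOLE;	/* next data begins beyond the range */
	return SEG_DATA;		/* hole ends mid-range: call it all data */
}

int main(int argc, char *argv[])
{
	off_t offset, length;
	int fd;

	if (argc != 4)
		errx(1, "usage: %s <file> <offset> <length>", argv[0]);
	fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		err(1, "open");
	offset = atoll(argv[2]);
	length = atoll(argv[3]);
	printf("%s\n", classify_range(fd, offset, length) == SEG_HOLE ?
			"HOLE" : "DATA");
	return 0;
}

With that policy the all-hole and all-data files behave the same as with
the current patches; only the mixed file changes, collapsing to a single
DATA segment per request.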

(Also: I assume it's the same as before, but when you post test
results, could you repost if necessary:

	- what the actual test is
	- what the hardware/software setup is on client and server

so that we have reproducible results for posterity's sake.)

Interesting that "Mixed" is a little slower even before READ_PLUS.

And I guess we should really report this to ext4 people, looks like they
may have a bug.

--b.

> 
> ###########
> #         #
> #   XFS   #
> #         #
> ###########
> 
> 
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  1.883s |  1.808s |  1.781s |  1.685s |  1.591s |  1.746s |
> |    Hole |  1.815s |  1.635s |  1.682s |  1.698s |  1.653s |  1.697s |
> |   Mixed |  2.089s |  2.024s |  1.970s |  1.925s |  2.049s |  2.011s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  1.849s |  1.879s |  1.852s |  1.799s |  1.781s |  1.832s |
> |    Hole |  0.668s |  0.600s |  0.611s |  0.619s |  0.617s |  0.623s |
> |   Mixed |  5.913s |  5.811s |  5.952s |  5.962s |  5.806s |  5.889s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> 
> 
> 
> ############
> #          #
> #   EXT4   #
> #          #
> ############
> 
> 
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  2.637s |  1.823s |  1.792s |  1.816s |  2.000s |  2.014s |
> |    Hole |  1.734s |  1.743s |  1.709s |  1.761s |  1.871s |  1.764s |
> |   Mixed |  5.465s |  2.158s |  2.254s |  2.676s |  2.422s |  2.995s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  1.934s |  1.783s |  1.800s |  2.010s |  1.982s |  1.902s |
> |    Hole | 63.568s | 63.423s | 64.671s | 66.190s | 65.985s | 64.767s |
> |   Mixed |  6.010s |  5.798s |  6.146s |  6.460s |  6.720s |  6.225s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> 
> 
> 
> #############
> #           #
> #   BTRFS   #
> #           #
> #############
> 
> 
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  2.386s |  1.952s |  1.832s |  1.818s |  1.826s |  1.963s |
> |    Hole |  1.759s |  1.717s |  1.754s |  1.621s |  1.708s |  1.712s |
> |   Mixed |  2.889s |  2.272s |  2.778s |  2.277s |  2.255s |  2.494s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  2.586s |  1.816s |  2.022s |  1.862s |  1.975s |  2.052s |
> |    Hole |  0.646s |  0.659s |  0.669s |  0.628s |  0.605s |  0.641s |
> |   Mixed |  8.555s |  8.553s |  7.904s |  8.567s |  8.286s |  8.373s |
> |---------|---------|---------|---------|---------|---------|---------|
J. Bruce Fields April 15, 2015, 8 p.m. UTC | #38
On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > I just ran some more tests comparing the directio case across
> > different filesystem types.  These tests used three 1G files:  100%
> > data, 100% hole, and mixed file with alternating 4k data and hole
> > segments.  The mixed case seems to be consistently slower compared to
> > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > Here are my numbers:
> 
> Have you tried the implementation we discussed that always returns a
> single segment covering the whole requested range, by treating holes as
> data if necessary when they don't cover the whole range?
> 
> (Also: I assume it's the same as before, but: when you post test
> results, could you repost if necessary:
> 
> 	- what the actual test is
> 	- what the hardware/software setup is on client and server
> 
> so that we have reproduceable results for posterity's sake.)
> 
> Interesting that "Mixed" is a little slower even before READ_PLUS.
> 
> And I guess we should really report this to ext4 people, looks like they
> may have a bug.

FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
holes on files on my local disk.  Might be worth checking whether the
ext4 slowdowns are reproducible just with something like this, to rule
out protocol problems.

--b.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h> 
#include <errno.h>
#include <err.h>

long round_up(long n, long b)
{
	return ((n + b - 1)/b) * b;
}

long round_down(long n, long b)
{
	return (n/b) * b;
}

long hbytes = 0;
long rplusbytes = 0;
long num_holes = 0;

/* accumulate statistics for the hole [hole_start, hole_end) */
void do_stats(off_t hole_start, off_t hole_end)
{
	off_t hole_start_up, hole_end_down;

	hole_start_up = round_up(hole_start, 1024*1024);
	hole_end_down = round_down(hole_end, 1024*1024);

	hbytes += hole_end - hole_start;
	if (hole_start < hole_end)
		num_holes++;
	if (hole_start_up < hole_end_down)
		rplusbytes += hole_end_down - hole_start_up;
}

int main(int argc, char *argv[])
{
	off_t hole_start, hole_end;
	int fd;
	char *name;

	/* Map out holes with SEEK_HOLE, SEEK_DATA */
	/* Useful statistics:
	 * 	- what percentage of file is in holes?
	 * 	- what percentage of file would be skipped if we read it
	 * 	  sequentially in 1MB chunks?
	 */

	if (argc != 2)
		errx(1, "usage: %s <filename>\n", argv[0]);
	name = argv[1];
	fd = open(name, O_RDONLY);
	if (fd == -1)
		err(1, "open");

	hole_end = 0;
	while (1) {
		hole_start = lseek(fd, hole_end, SEEK_HOLE);
		if (hole_start == -1)
			err(1, "lseek");
		hole_end = lseek(fd, hole_start, SEEK_DATA);
		if (hole_end == -1) {
			if (errno == ENXIO)
				break;
			err(1, "lseek");
		}
		do_stats(hole_start, hole_end);
	}
	hole_end = lseek(fd, 0, SEEK_END);
	do_stats(hole_start, hole_end);
	printf("%ld holes\n", num_holes);
	printf("total hole bytes:      %ld (%.0f%)\n", hbytes,
				100 * (float)hbytes/hole_end);
	printf("in aligned 1MB chunks: %ld (%.0f%)\n", rplusbytes,
				100 * (float)rplusbytes/hole_end);
}
Dave Chinner April 15, 2015, 10:50 p.m. UTC | #39
On Wed, Apr 15, 2015 at 04:00:16PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > > I just ran some more tests comparing the directio case across
> > > different filesystem types.  These tests used three 1G files:  100%
> > > data, 100% hole, and mixed file with alternating 4k data and hole
> > > segments.  The mixed case seems to be consistently slower compared to
> > > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > > Here are my numbers:
> > 
> > Have you tried the implementation we discussed that always returns a
> > single segment covering the whole requested range, by treating holes as
> > data if necessary when they don't cover the whole range?
> > 
> > (Also: I assume it's the same as before, but: when you post test
> > results, could you repost if necessary:
> > 
> > 	- what the actual test is
> > 	- what the hardware/software setup is on client and server
> > 
> > so that we have reproduceable results for posterity's sake.)
> > 
> > Interesting that "Mixed" is a little slower even before READ_PLUS.
> > 
> > And I guess we should really report this to ext4 people, looks like they
> > may have a bug.
> 
> FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
> holes on files on my local disk.  Might be worth checking whether the
> ext4 slowdowns are reproduceable just with something like this, to rule
> out protocol problems.

Wheel reinvention. :)

$ rm -f /mnt/scratch/bar
$ for i in `seq 20 -2 0`; do
> sudo xfs_io -f -c "pwrite $((i * 8192)) 4096" /mnt/scratch/bar
> done
.....
$ sync
$ sudo xfs_io -c "seek -ar 0" /mnt/scratch/bar
Whence  Result
DATA    0
HOLE    4096
DATA    16384
HOLE    20480
DATA    32768
HOLE    36864
DATA    49152
HOLE    53248
DATA    65536
HOLE    69632
DATA    81920
HOLE    86016
DATA    98304
HOLE    102400
DATA    114688
HOLE    118784
DATA    131072
HOLE    135168
DATA    147456
HOLE    151552
DATA    163840
HOLE    167936
$

-Dave.
Dave Chinner April 15, 2015, 10:57 p.m. UTC | #40
On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> I just ran some more tests comparing the directio case across
> different filesystem types.  These tests used three 1G files:
> 100% data, 100% hole, and mixed file with alternating 4k data and
> hole segments.  The mixed case seems to be consistently slower
> compared to NFS v4.1, and I'm at a loss for anything I could do to
> make it faster.  Here are my numbers:
> 
> ###########
> #         #
> #   XFS   #
> #         #
> ###########
> 
> 
> NFS v4.1:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  1.883s |  1.808s |  1.781s |  1.685s |  1.591s |  1.746s |
> |    Hole |  1.815s |  1.635s |  1.682s |  1.698s |  1.653s |  1.697s |
> |   Mixed |  2.089s |  2.024s |  1.970s |  1.925s |  2.049s |  2.011s |
> |---------|---------|---------|---------|---------|---------|---------|
> 
> 
> NFS v4.2:
>                             Trial
> |---------|---------|---------|---------|---------|---------|---------|
> |         |    1    |    2    |    3    |    4    |    5    | Average |
> |---------|---------|---------|---------|---------|---------|---------|
> |    Data |  1.849s |  1.879s |  1.852s |  1.799s |  1.781s |  1.832s |
> |    Hole |  0.668s |  0.600s |  0.611s |  0.619s |  0.617s |  0.623s |
> |   Mixed |  5.913s |  5.811s |  5.952s |  5.962s |  5.806s |  5.889s |
> |---------|---------|---------|---------|---------|---------|---------|

What that says to me is that READ_PLUS with the (worst case) mixed
hole/data layout is either burning a lot more CPU than we expected
or it is serialising somewhere (not sure where, everything in XFS
should be shared locks on read/seek).

Can you run a perf profile (even just a snapshot from perf top) on
the server so we can see a bit about what is happening on the CPU
for the different workloads?

Cheers,

Dave.
J. Bruce Fields April 17, 2015, 10:07 p.m. UTC | #41
On Thu, Apr 16, 2015 at 08:50:02AM +1000, Dave Chinner wrote:
> On Wed, Apr 15, 2015 at 04:00:16PM -0400, J. Bruce Fields wrote:
> > On Wed, Apr 15, 2015 at 03:56:14PM -0400, J. Bruce Fields wrote:
> > > On Wed, Apr 15, 2015 at 03:32:02PM -0400, Anna Schumaker wrote:
> > > > I just ran some more tests comparing the directio case across
> > > > different filesystem types.  These tests used three 1G files:  100%
> > > > data, 100% hole, and mixed file with alternating 4k data and hole
> > > > segments.  The mixed case seems to be consistently slower compared to
> > > > NFS v4.1, and I'm at a loss for anything I could do to make it faster.
> > > > Here are my numbers:
> > > 
> > > Have you tried the implementation we discussed that always returns a
> > > single segment covering the whole requested range, by treating holes as
> > > data if necessary when they don't cover the whole range?

Uh, sorry, I forgot: I think you're running with the patches that
support full multi-segment READ_PLUS on both sides, so there's not that
issue with multiplying RPCs in this case.

Still, might be interesting to compare.  And it wouldn't hurt to remind us
of these details when you repost this stuff, to keep my forgetful
self from going in circles.

> > > (Also: I assume it's the same as before, but: when you post test
> > > results, could you repost if necessary:
> > > 
> > > 	- what the actual test is
> > > 	- what the hardware/software setup is on client and server
> > > 
> > > so that we have reproduceable results for posterity's sake.)
> > > 
> > > Interesting that "Mixed" is a little slower even before READ_PLUS.
> > > 
> > > And I guess we should really report this to ext4 people, looks like they
> > > may have a bug.
> > 
> > FWIW, this is what I was using to test SEEK_HOLE/SEEK_DATA and map out
> > holes on files on my local disk.  Might be worth checking whether the
> > ext4 slowdowns are reproduceable just with something like this, to rule
> > out protocol problems.
> 
> Wheel reinvention. :)

xfs_io appears to have a lot of wheels.  OK, I'll go read that man page
one of these days.

--b.

> 
> $ rm -f /mnt/scratch/bar
> $ for i in `seq 20 -2 0`; do
> > sudo xfs_io -f -c "pwrite $((i * 8192)) 4096" /mnt/scratch/bar
> > done
> .....
> $ sync
> $ sudo xfs_io -c "seek -ar 0" /mnt/scratch/bar
> Whence  Result
> DATA    0
> HOLE    4096
> DATA    16384
> HOLE    20480
> DATA    32768
> HOLE    36864
> DATA    49152
> HOLE    53248
> DATA    65536
> HOLE    69632
> DATA    81920
> HOLE    86016
> DATA    98304
> HOLE    102400
> DATA    114688
> HOLE    118784
> DATA    131072
> HOLE    135168
> DATA    147456
> HOLE    151552
> DATA    163840
> HOLE    167936
> $
> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
diff mbox

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e9f4d8f..6801973 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1862,8 +1862,8 @@  static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
 {
 	u32 maxcount = svc_max_payload(rqstp);
 	u32 rlen = min(op->u.read.rd_length, maxcount);
-	/* enough extra xdr space for encoding either a hole or data segment. */
-	u32 xdr  = 5;
+	/* Extra xdr padding for encoding multiple segments. */
+	u32 xdr  = 20;
 
 	return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 799d52c..5eaecd2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4117,7 +4117,7 @@  nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 static __be32
 nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
-			    struct file *file)
+			    struct file *file, loff_t hole_pos)
 {
 	__be32 *p, err;
 	unsigned long maxcount;
@@ -4128,20 +4128,26 @@  nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
 		return nfserr_resource;
 	xdr_commit_encode(xdr);
 
+	if (hole_pos <= read->rd_offset)
+		hole_pos = i_size_read(file_inode(file));
+
 	maxcount = svc_max_payload(resp->rqstp);
 	maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
 	maxcount = min_t(unsigned long, maxcount, read->rd_length);
+	maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
 
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
 		err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
 	else
 		err = nfsd4_encode_readv(resp, read, file, &maxcount);
+	clear_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
 
 	*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
 	p = xdr_encode_hyper(p, read->rd_offset);
 	*p++ = cpu_to_be32(maxcount);
 
 	read->rd_offset += maxcount;
+	read->rd_length -= maxcount;
 	return err;
 }
 
@@ -4156,7 +4162,7 @@  nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
 	if (data_pos == -ENXIO)
 		data_pos = i_size_read(file_inode(file));
 	if (data_pos <= read->rd_offset)
-		return nfsd4_encode_read_plus_data(resp, read, file);
+		return nfsd4_encode_read_plus_data(resp, read, file, 0);
 
 	maxcount = data_pos - read->rd_offset;
 	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
@@ -4165,6 +4171,10 @@  nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
 	p = xdr_encode_hyper(p, maxcount);
 
 	read->rd_offset += maxcount;
+	if (maxcount > read->rd_length)
+		read->rd_length = 0;
+	else
+		read->rd_length -= maxcount;
 	return nfs_ok;
 }
 
@@ -4197,17 +4207,20 @@  nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
 			goto err_truncate;
 	}
 
-	hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
-	if (hole_pos == -ENXIO)
-		goto out_encode;
+	do {
+		hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+		if (hole_pos == -ENXIO)
+			break;
 
-	if (hole_pos == read->rd_offset)
-		err = nfsd4_encode_read_plus_hole(resp, read, file);
-	else
-		err = nfsd4_encode_read_plus_data(resp, read, file);
-	segments++;
+		if (hole_pos == read->rd_offset)
+			err = nfsd4_encode_read_plus_hole(resp, read, file);
+		else
+			err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+		if (err)
+			break;
+		segments++;
+	} while (read->rd_length > 0);
 
-out_encode:
 	eof = (read->rd_offset >= i_size_read(file_inode(file)));
 	*p++ = cpu_to_be32(eof);
 	*p++ = cpu_to_be32(segments);