
[87/88] Add configurable prefetch size for layoutget

Message ID 09142112ff0115f7f22124a69ead7b9bb5e0958f.1307464382.git.rees@umich.edu (mailing list archive)
State New, archived

Commit Message

Jim Rees June 7, 2011, 5:36 p.m. UTC
From: Peng Tao <peng_tao@emc.com>

pnfs_layout_prefetch_kb can be modified via sysctl.

Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Jim Rees <rees@umich.edu>
---
 fs/nfs/pnfs.c   |   17 +++++++++++++++++
 fs/nfs/pnfs.h   |    1 +
 fs/nfs/sysctl.c |   10 ++++++++++
 3 files changed, 28 insertions(+), 0 deletions(-)

Comments

Benny Halevy June 8, 2011, 2:01 a.m. UTC | #1
NAK.
This affects all layout types.  In particular it is undesired
for write layouts that extend the file with the objects layout.
The server can extend the layout segments range
over what the client requested so why would the client
ask for artificially large layouts?

Benny

On 2011-06-07 13:36, Jim Rees wrote:
> From: Peng Tao <peng_tao@emc.com>
> 
> pnfs_layout_prefetch_kb can be modified via sysctl.
> 
> Signed-off-by: Peng Tao <peng_tao@emc.com>
> Signed-off-by: Jim Rees <rees@umich.edu>
> ---
>  fs/nfs/pnfs.c   |   17 +++++++++++++++++
>  fs/nfs/pnfs.h   |    1 +
>  fs/nfs/sysctl.c |   10 ++++++++++
>  3 files changed, 28 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 9920bff..9c2b569 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -46,6 +46,11 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
>   */
>  static LIST_HEAD(pnfs_modules_tbl);
>  
> +/*
> + * layoutget prefetch size
> + */
> +unsigned int pnfs_layout_prefetch_kb = 2 << 10;
> +
>  /* Return the registered pnfs layout driver module matching given id */
>  static struct pnfs_layoutdriver_type *
>  find_pnfs_driver_locked(u32 id)
> @@ -906,6 +911,16 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
>  }
>  
>  /*
> + * Set layout prefetch length.
> + */
> +static void
> +pnfs_set_layout_prefetch(struct pnfs_layout_range *range)
> +{
> +	if (range->length < (pnfs_layout_prefetch_kb << 10))
> +		range->length = pnfs_layout_prefetch_kb << 10;
> +}
> +
> +/*
>   * Layout segment is retreived from the server if not cached.
>   * The appropriate layout segment is referenced and returned to the caller.
>   */
> @@ -956,6 +971,8 @@ pnfs_update_layout(struct inode *ino,
>  
>  	if (pnfs_layoutgets_blocked(lo, NULL, 0))
>  		goto out_unlock;
> +
> +	pnfs_set_layout_prefetch(&arg);
>  	atomic_inc(&lo->plh_outstanding);
>  
>  	get_layout_hdr(lo);
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 28d57c9..563c67b 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -182,6 +182,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
>  extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
>  
>  /* pnfs.c */
> +extern unsigned int pnfs_layout_prefetch_kb;
>  void get_layout_hdr(struct pnfs_layout_hdr *lo);
>  void put_lseg(struct pnfs_layout_segment *lseg);
>  struct pnfs_layout_segment *
> diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
> index 978aaeb..79a5134 100644
> --- a/fs/nfs/sysctl.c
> +++ b/fs/nfs/sysctl.c
> @@ -14,6 +14,7 @@
>  #include <linux/nfs_fs.h>
>  
>  #include "callback.h"
> +#include "pnfs.h"
>  
>  #ifdef CONFIG_NFS_V4
>  static const int nfs_set_port_min = 0;
> @@ -42,6 +43,15 @@ static ctl_table nfs_cb_sysctls[] = {
>  	},
>  #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
>  #endif
> +#ifdef CONFIG_NFS_V4_1
> +	{
> +		.procname	= "pnfs_layout_prefetch_kb",
> +		.data		= &pnfs_layout_prefetch_kb,
> +		.maxlen		= sizeof(pnfs_layout_prefetch_kb),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +#endif
>  	{
>  		.procname	= "nfs_mountpoint_timeout",
>  		.data		= &nfs_mountpoint_expiry_timeout,
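For reference, the default above is 2 << 10 = 2048 KB, i.e. 2 MB, and pnfs_set_layout_prefetch() shifts the KB value left by 10 once more to convert it to bytes before comparing it with the requested range length. A standalone check of that arithmetic (plain C, not part of the patch):

  #include <stdio.h>

  int main(void)
  {
      unsigned int prefetch_kb = 2 << 10;                 /* 2048 KB */
      unsigned long long prefetch_bytes =
          (unsigned long long)prefetch_kb << 10;          /* KB -> bytes */

      printf("%u KB = %llu bytes = %llu MB\n",
             prefetch_kb, prefetch_bytes, prefetch_bytes >> 20);
      return 0;
  }

This prints 2048 KB = 2097152 bytes = 2 MB.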
Jim Rees June 8, 2011, 2:18 a.m. UTC | #2
Benny Halevy wrote:

  NAK.
  This affects all layout types.  In particular it is undesired
  for write layouts that extend the file with the objects layout.
  The server can extend the layout segments range
  over what the client requested so why would the client
  ask for artificially large layouts?

This has actually been the subject of some debate over Thursday night
beers.  The problem we're trying to solve is that the client is spending 98%
of its time in layoutget.  This patch gives us something like a 10x
speedup.  But many of us think it's not the right fix.  I suggest we discuss
next week.

But note that this patch doesn't change anything unless you set the sysctl.
Benny Halevy June 9, 2011, 6:06 a.m. UTC | #3
On 2011-06-08 03:15, Peng Tao wrote:
> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>> Benny Halevy wrote:
>>
>>   NAK.
>>   This affects all layout types.  In particular it is undesired
>>   for write layouts that extend the file with the objects layout.
>>   The server can extend the layout segments range
>>   over what the client requested so why would the client
>>   ask for artificially large layouts?
>>
>> This has actually been the subject of some debate over Thursday night
>> beers.  The problem we're trying to solve is that the client is spending 98%
>> of its time in layoutget.  This patch gives us something like a 10x
>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>> next week.
>>

Sure.

>> But note that this patch doesn't change anything unless you set the sysctl.
> there is a default value of 2M. maybe we can set it to page size by
> default so other layout are not affected and block layout can let
> users set it by hand if they care about performance. does this make
> sense?

If doing it at all why use a sysctl rather than a mount option?
Or maybe coding the logic for prefetching the layout iff sequential
access is detected is the right thing to do.

Benny
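A rough sketch of what such a sequential-access heuristic could look like, kept deliberately abstract: none of the names below exist in the kernel, and the window sizes are made up for illustration. The idea is to remember where the next sequential request would start and to grow the desired layout length only while that pattern holds:

  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical per-file state for detecting sequential layout requests. */
  struct layout_prefetch_state {
      uint64_t next_expected;   /* offset a sequential request would start at */
      uint64_t window;          /* current desired length, in bytes           */
  };

  #define PREFETCH_MIN    (64ULL << 10)   /* start small: 64 KB   */
  #define PREFETCH_MAX    (4ULL << 20)    /* cap the window: 4 MB */

  /* Desired LAYOUTGET length for an I/O of 'minlength' bytes at 'offset'. */
  static uint64_t desired_length(struct layout_prefetch_state *st,
                                 uint64_t offset, uint64_t minlength)
  {
      if (st->window && offset == st->next_expected) {
          st->window *= 2;                    /* sequential: grow the window */
          if (st->window > PREFETCH_MAX)
              st->window = PREFETCH_MAX;
      } else {
          st->window = PREFETCH_MIN;          /* random access: reset        */
      }
      st->next_expected = offset + minlength;
      return st->window > minlength ? st->window : minlength;
  }

  int main(void)
  {
      struct layout_prefetch_state st = { 0, 0 };
      uint64_t off;

      /* Sequential 4K writes: the desired range ramps up toward the cap. */
      for (off = 0; off < 10 * 4096ULL; off += 4096)
          printf("offset %llu -> desire %llu bytes\n",
                 (unsigned long long)off,
                 (unsigned long long)desired_length(&st, off, 4096));
      return 0;
  }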

Jim Rees June 9, 2011, 11:49 a.m. UTC | #5
Benny Halevy wrote:

  >> But note that this patch doesn't change anything unless you set the sysctl.
  > there is a default value of 2M. maybe we can set it to page size by
  > default so other layout are not affected and block layout can let
  > users set it by hand if they care about performance. does this make
  > sense?
  
  If doing it at all why use a sysctl rather than a mount option?
  Or maybe coding the logic for prefetching the layout iff sequential
  access is detected is the right thing to do.

I would rather see some automatic solution than to add either a sysctl or a
mount option.  For now you can just drop that patch, as it's not needed for
basic pnfs block.

My understanding is that layoutget specifies a min and max, and the server
is returning the min.  Trond and Fred believe this should be fixed on the
server.  Here's the original report of the problem:

From: Bergwolf

From the network trace for pnfs, we can see the root cause of the slow performance
is too many small layoutgets. Specifically, the client asks for a layout of only 4K
(one page) each time, and the server returns 8K due to block size alignment.

The total IO time is 256/1.68 = 152 seconds.
There are 256*1024/8 = 32768 layoutgets for the 256MB file.
On average, the time spent on each layoutget is 0.00456 seconds according to the
trace.
The total layoutget time is 32768 * 0.00456 = 149 seconds, which takes up about
98% of the total IO time.

So we should optimize layoutget's granularity to get better performance. For
instance, use a configurable prefetch size of 2MB or so.
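A quick back-of-the-envelope check of these numbers, and of what a 2MB prefetch would do to them, assuming the per-layoutget cost of 0.00456 seconds from the trace (plain C, illustrative only):

  #include <stdio.h>

  int main(void)
  {
      const double file_mb = 256.0;
      const double per_layoutget_s = 0.00456;     /* from the trace */

      /* 8 KB granted per LAYOUTGET (4K request rounded up to the block size) */
      double small = file_mb * 1024.0 / 8.0;
      /* 2 MB granted per LAYOUTGET with the proposed prefetch default */
      double prefetch = file_mb / 2.0;

      printf("8KB layouts: %.0f LAYOUTGETs, %.1f s overhead\n",
             small, small * per_layoutget_s);
      printf("2MB layouts: %.0f LAYOUTGETs, %.1f s overhead\n",
             prefetch, prefetch * per_layoutget_s);
      return 0;
  }

With 8KB layouts this gives 32768 LAYOUTGETs and roughly 149 seconds of overhead; with 2MB layouts it drops to 128 LAYOUTGETs and well under a second, which is in line with the large speedup reported earlier in the thread.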
Benny Halevy June 9, 2011, 1:32 p.m. UTC | #6
On 2011-06-09 07:49, Jim Rees wrote:
> Benny Halevy wrote:
> 
>   >> But note that this patch doesn't change anything unless you set the sysctl.
>   > there is a default value of 2M. maybe we can set it to page size by
>   > default so other layout are not affected and block layout can let
>   > users set it by hand if they care about performance. does this make
>   > sense?
>   
>   If doing it at all why use a sysctl rather than a mount option?
>   Or maybe coding the logic for prefetching the layout iff sequential
>   access is detected is the right thing to do.
> 
> I would rather see some automatic solution than to add either a sysctl or a
> mount option.  For now you can just drop that patch, as it's not needed for
> basic pnfs block.
> 
> My understanding is that layoutget specifies a min and max, and the server

There's a min.  What do you consider the max?
Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?

> is returning the min.  Trond and Fred believe this should be fixed on the
> server.  

Agreed.

Benny

> Here's the original report of the problem:
> 
> From: Bergwolf
> 
> From the network trace for pnfs, we can see the root cause for slow performance
> is too many small layoutget. In specific, client asks for a layout of only 4K
> pagesize (and server returns 8K due to block size alignment) at each time.
> 
> The total IO time is 256/1.68 = 152 second.
> There are 256*1024/8 = 32768 layoutget for the 256MB file.
> On average, the time spent on each layoutget is 0.00456 second according to the
> trace.
> The total layoutget time is 32768* 0.00456 = 149 second, which takes up about
> 98% of total IO time.
> 
> So we should optimize layoutget's granularity to get better performance. For
> instance, use a configurable prefetch size of 2MB or so.
Jim Rees June 9, 2011, 1:58 p.m. UTC | #7
Benny Halevy wrote:

  > My understanding is that layoutget specifies a min and max, and the server
  
  There's a min.  What do you consider the max?
  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?

The spec doesn't say max, it says "desired."  I guess I assumed the server
wouldn't normally return more than desired.

18.43.3.  DESCRIPTION
...

   The LAYOUTGET operation returns layout information for the specified
   byte-range: a layout.  The client actually specifies two ranges, both
   starting at the offset in the loga_offset field.  The first range is
   between loga_offset and loga_offset + loga_length - 1 inclusive.
   This range indicates the desired range the client wants the layout to
   cover.  The second range is between loga_offset and loga_offset +
   loga_minlength - 1 inclusive.  This range indicates the required
   range the client needs the layout to cover.  Thus, loga_minlength
   MUST be less than or equal to loga_length.
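In other words, the prefetch idea maps onto the two ranges above: keep loga_minlength at what the I/O actually needs, and raise only loga_length (the desired range). A self-contained illustration of that split, with purely hypothetical structure and function names that do not correspond to the kernel's internal types:

  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical stand-in for the two LAYOUTGET ranges of RFC 5661 18.43.3. */
  struct layoutget_args {
      uint64_t offset;
      uint64_t minlength;   /* required: what the I/O actually needs     */
      uint64_t length;      /* desired: may be larger, enabling prefetch */
  };

  static void fill_ranges(struct layoutget_args *a, uint64_t offset,
                          uint64_t needed, uint64_t prefetch)
  {
      a->offset = offset;
      a->minlength = needed;
      /* loga_minlength MUST be <= loga_length */
      a->length = prefetch > needed ? prefetch : needed;
  }

  int main(void)
  {
      struct layoutget_args a;

      fill_ranges(&a, 0, 4096, 2ULL << 20);   /* need one page, desire 2 MB */
      printf("minlength=%llu length=%llu\n",
             (unsigned long long)a.minlength, (unsigned long long)a.length);
      return 0;
  }

Whether the client should raise the desired range this way, and by how much, is what the rest of the thread debates.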
Peng Tao June 9, 2011, 2:54 p.m. UTC | #8
On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
> On 2011-06-08 03:15, Peng Tao wrote:
>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>> Benny Halevy wrote:
>>>
>>>   NAK.
>>>   This affects all layout types.  In particular it is undesired
>>>   for write layouts that extend the file with the objects layout.
>>>   The server can extend the layout segments range
>>>   over what the client requested so why would the client
>>>   ask for artificially large layouts?
>>>
>>> This has actually been the subject of some debate over Thursday night
>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>> of its time in layoutget.  This patch gives us something like a 10x
>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>> next week.
>>>
>
> Sure.
>
>>> But note that this patch doesn't change anything unless you set the sysctl.
>> there is a default value of 2M. maybe we can set it to page size by
>> default so other layout are not affected and block layout can let
>> users set it by hand if they care about performance. does this make
>> sense?
>
> If doing it at all why use a sysctl rather than a mount option?
The purpose of using a sysctl is to give the client the ability to change
it on the fly. In theory, layout prefetching can benefit all layout
types, so the patch tries to solve it in the generic pnfs layer.

> Or maybe coding the logic for prefetching the layout iff sequential
> access is detected is the right thing to do.
Yeah, automatic decision should be a better way.

>
> Benny
>
Peng Tao June 9, 2011, 3:01 p.m. UTC | #9
Hi, Jim,

On Thu, Jun 9, 2011 at 7:49 PM, Jim Rees <rees@umich.edu> wrote:
> Benny Halevy wrote:
>
>  >> But note that this patch doesn't change anything unless you set the sysctl.
>  > there is a default value of 2M. maybe we can set it to page size by
>  > default so other layout are not affected and block layout can let
>  > users set it by hand if they care about performance. does this make
>  > sense?
>
>  If doing it at all why use a sysctl rather than a mount option?
>  Or maybe coding the logic for prefetching the layout iff sequential
>  access is detected is the right thing to do.
>
> I would rather see some automatic solution than to add either a sysctl or a
> mount option.  For now you can just drop that patch, as it's not needed for
> basic pnfs block.
Without the patch, the current code makes block layout performance very poor.
Can we take it in for now, and change it later when we have something
smarter?

>
> My understanding is that layoutget specifies a min and max, and the server
> is returning the min.  Trond and Fred believe this should be fixed on the
> server.  Here's the original report of the problem:
>
> From: Bergwolf
>
> From the network trace for pnfs, we can see the root cause for slow performance
> is too many small layoutget. In specific, client asks for a layout of only 4K
> pagesize (and server returns 8K due to block size alignment) at each time.
>
> The total IO time is 256/1.68 = 152 second.
> There are 256*1024/8 = 32768 layoutget for the 256MB file.
> On average, the time spent on each layoutget is 0.00456 second according to the
> trace.
> The total layoutget time is 32768* 0.00456 = 149 second, which takes up about
> 98% of total IO time.
>
> So we should optimize layoutget's granularity to get better performance. For
> instance, use a configurable prefetch size of 2MB or so.
>
Peng Tao June 9, 2011, 3:07 p.m. UTC | #10
Hi, Jim and Benny,

On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
> Benny Halevy wrote:
>
>  > My understanding is that layoutget specifies a min and max, and the server
>
>  There's a min.  What do you consider the max?
>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>
> The spec doesn't say max, it says "desired."  I guess I assumed the server
> wouldn't normally return more than desired.
In fact the server is returning the "desired" length. The problem is that we
call pnfs_update_layout in nfs_write_begin, and it ends up setting
both minlength and length to the page size. There is no room for the client
to coalesce the layoutget range in nfs_write_begin.

>
> 18.43.3.  DESCRIPTION
> ...
>
>   The LAYOUTGET operation returns layout information for the specified
>   byte-range: a layout.  The client actually specifies two ranges, both
>   starting at the offset in the loga_offset field.  The first range is
>   between loga_offset and loga_offset + loga_length - 1 inclusive.
>   This range indicates the desired range the client wants the layout to
>   cover.  The second range is between loga_offset and loga_offset +
>   loga_minlength - 1 inclusive.  This range indicates the required
>   range the client needs the layout to cover.  Thus, loga_minlength
>   MUST be less than or equal to loga_length.
>
Benny Halevy June 9, 2011, 9:22 p.m. UTC | #11
On 2011-06-09 08:07, Peng Tao wrote:
> Hi, Jim and Benny,
> 
> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>> Benny Halevy wrote:
>>
>>  > My understanding is that layoutget specifies a min and max, and the server
>>
>>  There's a min.  What do you consider the max?
>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>
>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>> wouldn't normally return more than desired.
> In fact server is returning "desired" length. The problem is that we
> call pnfs_update_layout in nfs_write_begin, and it will end up setting
> both minlength and length to page size. There is no space for client
> to collapse layoutget range in nfs_write_begin.
> 

That's a different issue.  Deferring pnfs_update_layout to flush
time, rather than calling it in write_begin when the whole page is being written,
would help send a more meaningful desired range as well as avoid needless
read-modify-writes in case the application also wrote the whole
preallocated block.

Benny

>>
>> 18.43.3.  DESCRIPTION
>> ...
>>
>>   The LAYOUTGET operation returns layout information for the specified
>>   byte-range: a layout.  The client actually specifies two ranges, both
>>   starting at the offset in the loga_offset field.  The first range is
>>   between loga_offset and loga_offset + loga_length - 1 inclusive.
>>   This range indicates the desired range the client wants the layout to
>>   cover.  The second range is between loga_offset and loga_offset +
>>   loga_minlength - 1 inclusive.  This range indicates the required
>>   range the client needs the layout to cover.  Thus, loga_minlength
>>   MUST be less than or equal to loga_length.
>>
> 
> 
>
Benny Halevy June 9, 2011, 9:23 p.m. UTC | #12
On 2011-06-09 06:58, Jim Rees wrote:
> Benny Halevy wrote:
> 
>   > My understanding is that layoutget specifies a min and max, and the server
>   
>   There's a min.  What do you consider the max?
>   Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
> 
> The spec doesn't say max, it says "desired."  I guess I assumed the server
> wouldn't normally return more than desired.

No, the server may freely upgrade the returned layout segment by returning
a layout for a larger byte range or even returning a RW layout where a READ
layout was asked for.

Benny

> 
> 18.43.3.  DESCRIPTION
> ...
> 
>    The LAYOUTGET operation returns layout information for the specified
>    byte-range: a layout.  The client actually specifies two ranges, both
>    starting at the offset in the loga_offset field.  The first range is
>    between loga_offset and loga_offset + loga_length - 1 inclusive.
>    This range indicates the desired range the client wants the layout to
>    cover.  The second range is between loga_offset and loga_offset +
>    loga_minlength - 1 inclusive.  This range indicates the required
>    range the client needs the layout to cover.  Thus, loga_minlength
>    MUST be less than or equal to loga_length.
Benny Halevy June 9, 2011, 9:30 p.m. UTC | #13
On 2011-06-09 07:54, Peng Tao wrote:
> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On 2011-06-08 03:15, Peng Tao wrote:
>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>> Benny Halevy wrote:
>>>>
>>>>   NAK.
>>>>   This affects all layout types.  In particular it is undesired
>>>>   for write layouts that extend the file with the objects layout.
>>>>   The server can extend the layout segments range
>>>>   over what the client requested so why would the client
>>>>   ask for artificially large layouts?
>>>>
>>>> This has actually been the subject of some debate over Thursday night
>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>> next week.
>>>>
>>
>> Sure.
>>
>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>> there is a default value of 2M. maybe we can set it to page size by
>>> default so other layout are not affected and block layout can let
>>> users set it by hand if they care about performance. does this make
>>> sense?
>>
>> If doing it at all why use a sysctl rather than a mount option?
> The purpose of using a sysctl is to give client the ability to change
> it on the fly. In theory, layout prefetching can benefit all layout
> types. So the patch tries to solve it in the pnfs generic layer.
> 

But the need for this varies per server, and many times per application.
Think sequential vs. random I/O.  Therefore a mount option would help
tune the behavior on a per-use basis.  Global behavior must be implemented
using a dynamic algorithm that takes both the workload and the observed
server behavior into account.

Benny

>> Or maybe coding the logic for prefetching the layout iff sequential
>> access is detected is the right thing to do.
> Yeah, automatic decision should be a better way.
> 
>>
>> Benny
>>
tao.peng@emc.com June 10, 2011, 5:36 a.m. UTC | #14
Hi, Benny,

-----Original Message-----
From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
Sent: Friday, June 10, 2011 5:23 AM
To: Jim Rees
Cc: Peng Tao; linux-nfs@vger.kernel.org; peter honeyman
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-09 06:58, Jim Rees wrote:
> Benny Halevy wrote:
> 
>   > My understanding is that layoutget specifies a min and max, and the server
>   
>   There's a min.  What do you consider the max?
>   Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
> 
> The spec doesn't say max, it says "desired."  I guess I assumed the server
> wouldn't normally return more than desired.

No, the server may freely upgrade the returned layout segment by returning
a layout for a larger byte range or even returning a RW layout where a READ
layout was asked for.
[PT] It is true that the server can upgrade the layout segment freely, but there is always a price to pay. The server has to deal with all kinds of clients.
If the server returns more than is asked for, it may hurt other clients.

tao.peng@emc.com June 10, 2011, 6 a.m. UTC | #15
Hi, Benny,

Cheers,
-Bergwolf


-----Original Message-----
From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
Sent: Friday, June 10, 2011 5:23 AM
To: Peng Tao
Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-09 08:07, Peng Tao wrote:
> Hi, Jim and Benny,
> 
> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>> Benny Halevy wrote:
>>
>>  > My understanding is that layoutget specifies a min and max, and the server
>>
>>  There's a min.  What do you consider the max?
>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>
>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>> wouldn't normally return more than desired.
> In fact server is returning "desired" length. The problem is that we
> call pnfs_update_layout in nfs_write_begin, and it will end up setting
> both minlength and length to page size. There is no space for client
> to collapse layoutget range in nfs_write_begin.
> 

That's a different issue.  Waiting with pnfs_update_layout to flush
time rather than write_begin if the whole page is written would help
sending a more meaningful desired range as well as avoiding needless
read-modify-writes in case the application also wrote the whole
preallocated block.
[PT] That is also the reason why we want to introduce layout prefetching: to get a larger segment than the single page passed in nfs_write_begin.

>>
>> 18.43.3.  DESCRIPTION
>> ...
>>
>>   The LAYOUTGET operation returns layout information for the specified
>>   byte-range: a layout.  The client actually specifies two ranges, both
>>   starting at the offset in the loga_offset field.  The first range is
>>   between loga_offset and loga_offset + loga_length - 1 inclusive.
>>   This range indicates the desired range the client wants the layout to
>>   cover.  The second range is between loga_offset and loga_offset +
>>   loga_minlength - 1 inclusive.  This range indicates the required
>>   range the client needs the layout to cover.  Thus, loga_minlength
>>   MUST be less than or equal to loga_length.
>>
> 
> 
>
tao.peng@emc.com June 10, 2011, 6:02 a.m. UTC | #16
Hi, Benny,

Cheers,
-Bergwolf


-----Original Message-----
From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
Sent: Friday, June 10, 2011 5:30 AM
To: Peng Tao
Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-09 07:54, Peng Tao wrote:
> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>> On 2011-06-08 03:15, Peng Tao wrote:
>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>> Benny Halevy wrote:
>>>>
>>>>   NAK.
>>>>   This affects all layout types.  In particular it is undesired
>>>>   for write layouts that extend the file with the objects layout.
>>>>   The server can extend the layout segments range
>>>>   over what the client requested so why would the client
>>>>   ask for artificially large layouts?
>>>>
>>>> This has actually been the subject of some debate over Thursday night
>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>> next week.
>>>>
>>
>> Sure.
>>
>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>> there is a default value of 2M. maybe we can set it to page size by
>>> default so other layout are not affected and block layout can let
>>> users set it by hand if they care about performance. does this make
>>> sense?
>>
>> If doing it at all why use a sysctl rather than a mount option?
> The purpose of using a sysctl is to give client the ability to change
> it on the fly. In theory, layout prefetching can benefit all layout
> types. So the patch tries to solve it in the pnfs generic layer.
> 

But the need for this varies per-server and many times per application.
Think sequential vs. random I/O.  Therefore a mount option would help
tuning the behavior on a per-use basis.  Global behavior must be implemented
using a dynamic algorithm that would take both the workload and the server
observed behavior into account.
[PT] Indeed. A dynamic algorithm is supposed to be able to solve all this, but it often takes longer to be designed and accepted. It has to prove to be better in most scenarios and not hurt the rest.


Benny Halevy June 10, 2011, 12:33 p.m. UTC | #17
On 2011-06-10 02:00, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> Cheers,
> -Bergwolf
> 
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:23 AM
> To: Peng Tao
> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 08:07, Peng Tao wrote:
>> Hi, Jim and Benny,
>>
>> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>>> Benny Halevy wrote:
>>>
>>>  > My understanding is that layoutget specifies a min and max, and the server
>>>
>>>  There's a min.  What do you consider the max?
>>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>>
>>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>>> wouldn't normally return more than desired.
>> In fact server is returning "desired" length. The problem is that we
>> call pnfs_update_layout in nfs_write_begin, and it will end up setting
>> both minlength and length to page size. There is no space for client
>> to collapse layoutget range in nfs_write_begin.
>>
> 
> That's a different issue.  Waiting with pnfs_update_layout to flush
> time rather than write_begin if the whole page is written would help
> sending a more meaningful desired range as well as avoiding needless
> read-modify-writes in case the application also wrote the whole
> preallocated block.
> [PT] It is also the reason why we want to introduce layout prefetching, to get more segment than the page passed in nfs_write_begin.
> 

Peng, I understand what you want to achieve, but the proposed way
just doesn't fly. The server knows its allocation policies better than the client,
and it knows better the combined workload of the different clients and the possible
conflicts between them; therefore it should be making the ultimate decision
about the actual segment sizes.

That said, the client should indeed do its best to ask for the most appropriate
segment size for its use, and we should be doing a better job at that.
It's just that blindly asking for more is not a good strategy, and requiring
manual admin help to tune the clients is not acceptable.

Benny
Benny Halevy June 10, 2011, 12:36 p.m. UTC | #18
On 2011-06-10 01:36, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:23 AM
> To: Jim Rees
> Cc: Peng Tao; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 06:58, Jim Rees wrote:
>> Benny Halevy wrote:
>>
>>   > My understanding is that layoutget specifies a min and max, and the server
>>   
>>   There's a min.  What do you consider the max?
>>   Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>
>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>> wouldn't normally return more than desired.
> 
> No, the server may freely upgrade the returned layout segment by returning
> a layout for a larger byte range or even returning a RW layout where a READ
> layout was asked for.
> [PT] It is true that server can upgrade the layout segment freely. But there is always a price to pay. Server has to be dealing with all kind of clients.
> If server returns more than being asked for, it may hurt other clients.

And if all clients ask for more than they need and the server just
gives it to them, what do you get out of that?

Benny
Benny Halevy June 10, 2011, 12:47 p.m. UTC | #19
On 2011-06-10 02:02, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> Cheers,
> -Bergwolf
> 
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:30 AM
> To: Peng Tao
> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 07:54, Peng Tao wrote:
>> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>>> On 2011-06-08 03:15, Peng Tao wrote:
>>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>>> Benny Halevy wrote:
>>>>>
>>>>>   NAK.
>>>>>   This affects all layout types.  In particular it is undesired
>>>>>   for write layouts that extend the file with the objects layout.
>>>>>   The server can extend the layout segments range
>>>>>   over what the client requested so why would the client
>>>>>   ask for artificially large layouts?
>>>>>
>>>>> This has actually been the subject of some debate over Thursday night
>>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>>> next week.
>>>>>
>>>
>>> Sure.
>>>
>>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>>> there is a default value of 2M. maybe we can set it to page size by
>>>> default so other layout are not affected and block layout can let
>>>> users set it by hand if they care about performance. does this make
>>>> sense?
>>>
>>> If doing it at all why use a sysctl rather than a mount option?
>> The purpose of using a sysctl is to give client the ability to change
>> it on the fly. In theory, layout prefetching can benefit all layout
>> types. So the patch tries to solve it in the pnfs generic layer.
>>
> 
> But the need for this varies per-server and many times per application.
> Think sequential vs. random I/O.  Therefore a mount option would help
> tuning the behavior on a per-use basis.  Global behavior must be implemented
> using a dynamic algorithm that would take both the workload and the server
> observed behavior into account.
> [PT] Indeed. Dynamic algorithm is supposed to be able to solve all this. And it often takes longer to be designed/accepted. It has to prove to be better in most scenarios and does not hurt the left.

We need to find an acceptable solution to push this driver upstream.
I understand that developing a dynamic algorithm in the given time frame is
too big of a challenge, but hacking in yet another client tunable is out of the
question as well.  For testing at the Bakeathon I'd consider taking a DEVONLY version
of this patch that is enabled using a config option and defaults to zero, so it has no effect
at run time until the sysctl sets it differently.
But keep in mind this is not suitable for pushing upstream.

Benny
tao.peng@emc.com June 10, 2011, 2:09 p.m. UTC | #20
Hi, Benny,

-----Original Message-----
From: Benny Halevy [mailto:benny@tonian.com] 
Sent: Friday, June 10, 2011 8:33 PM
To: Peng, Tao
Cc: bergwolf@gmail.com; rees@umich.edu; linux-nfs@vger.kernel.org; honey@citi.umich.edu
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-10 02:00, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> Cheers,
> -Bergwolf
> 
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:23 AM
> To: Peng Tao
> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 08:07, Peng Tao wrote:
>> Hi, Jim and Benny,
>>
>> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>>> Benny Halevy wrote:
>>>
>>>  > My understanding is that layoutget specifies a min and max, and the server
>>>
>>>  There's a min.  What do you consider the max?
>>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>>
>>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>>> wouldn't normally return more than desired.
>> In fact server is returning "desired" length. The problem is that we
>> call pnfs_update_layout in nfs_write_begin, and it will end up setting
>> both minlength and length to page size. There is no space for client
>> to collapse layoutget range in nfs_write_begin.
>>
> 
> That's a different issue.  Waiting with pnfs_update_layout to flush
> time rather than write_begin if the whole page is written would help
> sending a more meaningful desired range as well as avoiding needless
> read-modify-writes in case the application also wrote the whole
> preallocated block.
> [PT] It is also the reason why we want to introduce layout prefetching, to get more segment than the page passed in nfs_write_begin.
> 

Peng, I understand what you want to achieve but the proposed way
just doesn't fly. The server knows better than the client its allocation policies
and it knows better the combined workload of different client and possible
conflicts between them therefore it should be making the ultimate decision
about the actual segment sizes.
[PT] Yes, you are right. The server should know the combined workload of all clients and make its decision based on that.
And it always has the right to return more (or less) than what is specified in loga_length.
 
That said, the client should indeed do its best to ask for the most appropriate
segments size for its use and we should be making a better job at that.
It's just that blindly asking for more is not a good strategy and requiring
manual admin help to tune the clients is not acceptable.
[PT] Yeah, determining the most appropriate size is always the hard part. Do you have any suggestions for that?

Thanks,
Tao

tao.peng@emc.com June 10, 2011, 2:17 p.m. UTC | #21
-----Original Message-----
From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
Sent: Friday, June 10, 2011 8:36 PM
To: Peng, Tao
Cc: rees@umich.edu; bergwolf@gmail.com; linux-nfs@vger.kernel.org; honey@citi.umich.edu
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-10 01:36, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:23 AM
> To: Jim Rees
> Cc: Peng Tao; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 06:58, Jim Rees wrote:
>> Benny Halevy wrote:
>>
>>   > My understanding is that layoutget specifies a min and max, and the server
>>   
>>   There's a min.  What do you consider the max?
>>   Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>
>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>> wouldn't normally return more than desired.
> 
> No, the server may freely upgrade the returned layout segment by returning
> a layout for a larger byte range or even returning a RW layout where a READ
> layout was asked for.
> [PT] It is true that server can upgrade the layout segment freely. But there is always a price to pay. Server has to be dealing with all kind of clients.
> If server returns more than being asked for, it may hurt other clients.

And if all clients ask for more than they need and the server just
gives it to them, what do you get out of that?
[PT] We cannot avoid this even if the client has an automatic layout prefetch algorithm implemented... think about the case where all clients are doing sequential IO...


-Tao
tao.peng@emc.com June 10, 2011, 2:30 p.m. UTC | #22
-----Original Message-----
From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
Sent: Friday, June 10, 2011 8:48 PM
To: Peng, Tao
Cc: bergwolf@gmail.com; rees@umich.edu; linux-nfs@vger.kernel.org; honey@citi.umich.edu
Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget

On 2011-06-10 02:02, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> Cheers,
> -Bergwolf
> 
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 5:30 AM
> To: Peng Tao
> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-09 07:54, Peng Tao wrote:
>> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>>> On 2011-06-08 03:15, Peng Tao wrote:
>>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>>> Benny Halevy wrote:
>>>>>
>>>>>   NAK.
>>>>>   This affects all layout types.  In particular it is undesired
>>>>>   for write layouts that extend the file with the objects layout.
>>>>>   The server can extend the layout segments range
>>>>>   over what the client requested so why would the client
>>>>>   ask for artificially large layouts?
>>>>>
>>>>> This has actually been the subject of some debate over Thursday night
>>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>>> next week.
>>>>>
>>>
>>> Sure.
>>>
>>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>>> there is a default value of 2M. maybe we can set it to page size by
>>>> default so other layout are not affected and block layout can let
>>>> users set it by hand if they care about performance. does this make
>>>> sense?
>>>
>>> If doing it at all why use a sysctl rather than a mount option?
>> The purpose of using a sysctl is to give client the ability to change
>> it on the fly. In theory, layout prefetching can benefit all layout
>> types. So the patch tries to solve it in the pnfs generic layer.
>>
> 
> But the need for this varies per-server and many times per application.
> Think sequential vs. random I/O.  Therefore a mount option would help
> tuning the behavior on a per-use basis.  Global behavior must be implemented
> using a dynamic algorithm that would take both the workload and the server
> observed behavior into account.
> [PT] Indeed. Dynamic algorithm is supposed to be able to solve all this. And it often takes longer to be designed/accepted. It has to prove to be better in most scenarios and does not hurt the left.

We need to find an acceptable solution to push this driver upstream.
I understand that developing a dynamic algorithm in the given time frame is
too big of a challenge, but hacking in yet another client tunable is out of the
question as well.  For testing at the Bakeathon I'd consider taking a DEVONLY version
of this patch that is enabled using a config option and defaults to zero, so it has no effect
at run time until the sysctl sets it differently.
But keep in mind this is not suitable for pushing upstream.
[PT] Thanks for your understanding. We truly want to solve the performance problem and are open to suggestions. And this will be a feature that benefits all layout types, am I right?

-Tao
Boaz Harrosh June 10, 2011, 4:23 p.m. UTC | #23
On 06/10/2011 05:47 AM, Benny Halevy wrote:
> On 2011-06-10 02:02, tao.peng@emc.com wrote:
>> Hi, Benny,
>>
>> Cheers,
>> -Bergwolf
>>
>>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
>> Sent: Friday, June 10, 2011 5:30 AM
>> To: Peng Tao
>> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>
>> On 2011-06-09 07:54, Peng Tao wrote:
>>> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>>>> On 2011-06-08 03:15, Peng Tao wrote:
>>>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>>>> Benny Halevy wrote:
>>>>>>
>>>>>>   NAK.
>>>>>>   This affects all layout types.  In particular it is undesired
>>>>>>   for write layouts that extend the file with the objects layout.
>>>>>>   The server can extend the layout segments range
>>>>>>   over what the client requested so why would the client
>>>>>>   ask for artificially large layouts?
>>>>>>
>>>>>> This has actually been the subject of some debate over Thursday night
>>>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>>>> next week.
>>>>>>
>>>>
>>>> Sure.
>>>>
>>>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>>>> there is a default value of 2M. maybe we can set it to page size by
>>>>> default so other layout are not affected and block layout can let
>>>>> users set it by hand if they care about performance. does this make
>>>>> sense?
>>>>
>>>> If doing it at all why use a sysctl rather than a mount option?
>>> The purpose of using a sysctl is to give client the ability to change
>>> it on the fly. In theory, layout prefetching can benefit all layout
>>> types. So the patch tries to solve it in the pnfs generic layer.
>>>
>>
>> But the need for this varies per-server and many times per application.
>> Think sequential vs. random I/O.  Therefore a mount option would help
>> tuning the behavior on a per-use basis.  Global behavior must be implemented
>> using a dynamic algorithm that would take both the workload and the server
>> observed behavior into account.
>> [PT] Indeed. Dynamic algorithm is supposed to be able to solve all this. And it often takes longer to be designed/accepted. It has to prove to be better in most scenarios and does not hurt the left.
> 
> We need to find an acceptable solution to push this driver upstream.
> I understand that developing a dynamic algorithm in the given time frame is
> too big of a challenge, but hacking yet another client tunable is out of the
> question either.   For testing in the Bakeathon I'd consider taking a DEVONLY version
> of this patch that is enabled using a config option and defaults to zero to have no effect
> in run-time until the sysctl is sets it differently.
> But keep in mind this is not suitable for pushing upstream.
> 

I disagree. Please make that same exact patch for the server you are using!

Leave the client alone. Don't even consider getting used to this, sticking
broken stuff in client scripts. If anywhere, it should be in server configuration
files.

Boaz

> Benny
Boaz Harrosh June 10, 2011, 4:44 p.m. UTC | #24
On 06/10/2011 09:23 AM, Boaz Harrosh wrote:
> 
> I disagree. Please make that same exact patch for the server you are using!
> 
> Leave the client alone. Don't even consider getting use to this, sticking
> broken stuff on client scripts. If anywhere it should be in Server configuration
> files
> 
> Boaz
> 

BTW the algorithm at the *server* side need not be dynamic and can be very simple.
Just a simple rule-based one:

1. If the file is new/zero-size, just give out a small layout, say 1-2 stripe_units.
2. If the file is, or is becoming, bigger, give out some BIG maximal optimum IO size:
   a size beyond which anything bigger will not increase performance.

3. If the file is opened by a second client, recall the first big layout and
   give out a one-to-few stripes layout for shared access.
   You will find that this shared-file case is very rare. There are not many
   applications that share the same file. If they do, they usually use range
   locks. Query your lock manager: if the client has a lock on this range of
   the file, give out the full locked range as a layout. If that is not the proper
   hint from the application, then what is? It is a much better hint than the
   nfs-client can ever guess.

4. Make it simple as hell....

Just my $0.017
Boaz
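A toy version of those rules, just to make the shape of the policy concrete; all names, sizes, and the notion of a per-file client count are hypothetical and not taken from any server implementation:

  #include <stdint.h>
  #include <stdio.h>

  #define STRIPE_UNIT   (64ULL << 10)   /* hypothetical stripe unit: 64 KB      */
  #define BIG_IO_SIZE   (8ULL << 20)    /* hypothetical "maximal optimum": 8 MB */

  /*
   * Rule 1: new/empty file    -> a couple of stripe units.
   * Rule 2: established file  -> the big optimal I/O size.
   * Rule 3: shared file       -> the locked range if any, else a narrow layout.
   */
  static uint64_t server_layout_length(uint64_t file_size, unsigned int nr_clients,
                                       uint64_t locked_range_len)
  {
      if (nr_clients > 1)
          return locked_range_len ? locked_range_len : STRIPE_UNIT;
      if (file_size == 0)
          return 2 * STRIPE_UNIT;
      return BIG_IO_SIZE;
  }

  int main(void)
  {
      printf("new file, one client:  %llu bytes\n",
             (unsigned long long)server_layout_length(0, 1, 0));
      printf("big file, one client:  %llu bytes\n",
             (unsigned long long)server_layout_length(1ULL << 30, 1, 0));
      printf("big file, two clients: %llu bytes\n",
             (unsigned long long)server_layout_length(1ULL << 30, 2, 1ULL << 20));
      return 0;
  }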
Benny Halevy June 10, 2011, 7:02 p.m. UTC | #25
On 2011-06-10 10:17, tao.peng@emc.com wrote:
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 8:36 PM
> To: Peng, Tao
> Cc: rees@umich.edu; bergwolf@gmail.com; linux-nfs@vger.kernel.org; honey@citi.umich.edu
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-10 01:36, tao.peng@emc.com wrote:
>> Hi, Benny,
>>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
>> Sent: Friday, June 10, 2011 5:23 AM
>> To: Jim Rees
>> Cc: Peng Tao; linux-nfs@vger.kernel.org; peter honeyman
>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>
>> On 2011-06-09 06:58, Jim Rees wrote:
>>> Benny Halevy wrote:
>>>
>>>   > My understanding is that layoutget specifies a min and max, and the server
>>>   
>>>   There's a min.  What do you consider the max?
>>>   Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>>
>>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>>> wouldn't normally return more than desired.
>>
>> No, the server may freely upgrade the returned layout segment by returning
>> a layout for a larger byte range or even returning a RW layout where a READ
>> layout was asked for.
>> [PT] It is true that server can upgrade the layout segment freely. But there is always a price to pay. Server has to be dealing with all kind of clients.
>> If server returns more than being asked for, it may hurt other clients.
> 
> And if all clients ask for more than they need and the server just
> gives it to them, what do you get out of that?
> [PT] We cannot avoid this even if client has automatic layout prefetch algorithem implemented... think about all clients are doing sequential IO...

Right and that's why the server has to be intelligent about it.

Benny
Benny Halevy June 10, 2011, 7:07 p.m. UTC | #26
On 2011-06-10 10:30, tao.peng@emc.com wrote:
> 
> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
> Sent: Friday, June 10, 2011 8:48 PM
> To: Peng, Tao
> Cc: bergwolf@gmail.com; rees@umich.edu; linux-nfs@vger.kernel.org; honey@citi.umich.edu
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-10 02:02, tao.peng@emc.com wrote:
>> Hi, Benny,
>>
>> Cheers,
>> -Bergwolf
>>
>>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
>> Sent: Friday, June 10, 2011 5:30 AM
>> To: Peng Tao
>> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>
>> On 2011-06-09 07:54, Peng Tao wrote:
>>> On Thu, Jun 9, 2011 at 2:06 PM, Benny Halevy <bhalevy@panasas.com> wrote:
>>>> On 2011-06-08 03:15, Peng Tao wrote:
>>>>> On 6/8/11, Jim Rees <rees@umich.edu> wrote:
>>>>>> Benny Halevy wrote:
>>>>>>
>>>>>>   NAK.
>>>>>>   This affects all layout types.  In particular it is undesired
>>>>>>   for write layouts that extend the file with the objects layout.
>>>>>>   The server can extend the layout segments range
>>>>>>   over what the client requested so why would the client
>>>>>>   ask for artificially large layouts?
>>>>>>
>>>>>> This has actually been the subject of some debate over Thursday night
>>>>>> beers.  The problem we're trying to solve is that the client is spending 98%
>>>>>> of its time in layoutget.  This patch gives us something like a 10x
>>>>>> speedup.  But many of us think it's not the right fix.  I suggest we discuss
>>>>>> next week.
>>>>>>
>>>>
>>>> Sure.
>>>>
>>>>>> But note that this patch doesn't change anything unless you set the sysctl.
>>>>> there is a default value of 2M. maybe we can set it to page size by
>>>>> default so other layout are not affected and block layout can let
>>>>> users set it by hand if they care about performance. does this make
>>>>> sense?
>>>>
>>>> If doing it at all why use a sysctl rather than a mount option?
>>> The purpose of using a sysctl is to give client the ability to change
>>> it on the fly. In theory, layout prefetching can benefit all layout
>>> types. So the patch tries to solve it in the pnfs generic layer.
>>>
>>
>> But the need for this varies per-server and many times per application.
>> Think sequential vs. random I/O.  Therefore a mount option would help
>> tuning the behavior on a per-use basis.  Global behavior must be implemented
>> using a dynamic algorithm that would take both the workload and the server
>> observed behavior into account.
>> [PT] Indeed. Dynamic algorithm is supposed to be able to solve all this. And it often takes longer to be designed/accepted. It has to prove to be better in most scenarios and does not hurt the left.
> 
> We need to find an acceptable solution to push this driver upstream.
> I understand that developing a dynamic algorithm in the given time frame is
> too big of a challenge, but hacking yet another client tunable is out of the
> question either.   For testing in the Bakeathon I'd consider taking a DEVONLY version
> of this patch that is enabled using a config option and defaults to zero to have no effect
> in run-time until the sysctl is sets it differently.
> But keep in mind this is not suitable for pushing upstream.
> [PT] Thanks for your understanding. We truly want to solve the performance problem and are open to suggestions. And this will be a feature that benefits all layout types, am I right?

Adaptive layout prefetching at the client side is a nice workaround for
naive server implementations, but you can't get around implementing
a more sophisticated algorithm on the server side.  As for the benefit,
if you implement such an algorithm on the client side, I would like it
to be implemented generically so that all layout types can benefit
from it.

Benny

> 
> -Tao
Benny Halevy June 10, 2011, 7:23 p.m. UTC | #27
On 2011-06-10 10:09, tao.peng@emc.com wrote:
> Hi, Benny,
> 
> -----Original Message-----
> From: Benny Halevy [mailto:benny@tonian.com] 
> Sent: Friday, June 10, 2011 8:33 PM
> To: Peng, Tao
> Cc: bergwolf@gmail.com; rees@umich.edu; linux-nfs@vger.kernel.org; honey@citi.umich.edu
> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
> 
> On 2011-06-10 02:00, tao.peng@emc.com wrote:
>> Hi, Benny,
>>
>> Cheers,
>> -Bergwolf
>>
>>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
>> Sent: Friday, June 10, 2011 5:23 AM
>> To: Peng Tao
>> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>
>> On 2011-06-09 08:07, Peng Tao wrote:
>>> Hi, Jim and Benny,
>>>
>>> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>>>> Benny Halevy wrote:
>>>>
>>>>  > My understanding is that layoutget specifies a min and max, and the server
>>>>
>>>>  There's a min.  What do you consider the max?
>>>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>>>
>>>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>>>> wouldn't normally return more than desired.
>>> In fact server is returning "desired" length. The problem is that we
>>> call pnfs_update_layout in nfs_write_begin, and it will end up setting
>>> both minlength and length to page size. There is no space for client
>>> to collapse layoutget range in nfs_write_begin.
>>>
>>
>> That's a different issue.  Waiting with pnfs_update_layout to flush
>> time rather than write_begin if the whole page is written would help
>> sending a more meaningful desired range as well as avoiding needless
>> read-modify-writes in case the application also wrote the whole
>> preallocated block.
>> [PT] It is also the reason why we want to introduce layout prefetching, to get more segment than the page passed in nfs_write_begin.
>>
> 
> Peng, I understand what you want to achieve but the proposed way
> just doesn't fly. The server knows better than the client its allocation policies
> and it knows better the combined workload of different client and possible
> conflicts between them therefore it should be making the ultimate decision
> about the actual segment sizes.
> [PT] Yes, you are right. Server should know combined workload of all clients and make its decision based on that.
> And it always has the right to return more than (or less than) specified in loga_length.
>  
> That said, the client should indeed do its best to ask for the most appropriate
> segments size for its use and we should be making a better job at that.
> It's just that blindly asking for more is not a good strategy and requiring
> manual admin help to tune the clients is not acceptable.
> [PT] yeah, determing the most appropriate is always the hart part. Do you have any suggestions to that?

A simple algorithm I can suggest is:
- on initialization, calculate and save, per layout driver
  - maximum layout size
    - take into account csr_fore_chan_attrs.ca_maxresponsesize and possibly other parameters
  - keep a working copy of the maximum value and the calculated copy.
  - alignment value.
- on miss, see if there's an adjacent layout segment in cache
- if found, ask for twice the found segment size, up to the maximum value,
  aligned on the alignment value.
- if the server returns less than the requested layoutget range, keep note of the returned length
  (but do not adjust the maximum yet, as the server may return a short segment for various
   reasons)
- if the server is consistent about returning less than was asked, adjust the
  working copy of the maximum length
- if the maximum was adjusted, try bumping it up after X (TBD) layoutgets or T seconds
  to see if that was just due to high load or conflicts on the server
- on any error returned for LAYOUTGET, reset the algorithm parameters
- on session reestablishment recalculate maximums.
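
Purely as a sketch of what that might look like (all names below are
hypothetical and do not exist in fs/nfs; the thresholds stand in for the
X/T values above, and the hard requirement would still go in the
layoutget minlength field as today):

#include <stdint.h>
#include <stdbool.h>

struct layout_prefetch_state {
	uint64_t calc_max;	/* computed at init from ca_maxresponsesize etc. */
	uint64_t working_max;	/* trimmed when the server keeps returning less */
	uint64_t alignment;	/* layout driver alignment, assumed power of two */
	unsigned int short_replies;	/* consecutive short LAYOUTGET replies */
	unsigned int since_trim;	/* layoutgets since working_max was trimmed */
};

static uint64_t align_up(uint64_t len, uint64_t align)
{
	return (len + align - 1) & ~(align - 1);
}

/* On a cache miss: size the request from an adjacent cached segment, if any. */
static uint64_t prefetch_length(const struct layout_prefetch_state *st,
				uint64_t requested, uint64_t adjacent_len)
{
	uint64_t len = requested;

	if (adjacent_len)
		len = adjacent_len * 2;		/* double the adjacent segment */
	if (len > st->working_max)
		len = st->working_max;		/* clamp to the working maximum */
	if (len < requested)
		len = requested;		/* never ask for less than needed */
	return align_up(len, st->alignment);
}

/* After LAYOUTGET completes: adapt working_max to the server's behavior. */
static void note_return(struct layout_prefetch_state *st,
			uint64_t asked, uint64_t got, bool error)
{
	if (error) {				/* reset the algorithm parameters */
		st->working_max = st->calc_max;
		st->short_replies = 0;
		st->since_trim = 0;
		return;
	}
	if (got < asked && ++st->short_replies >= 4) {	/* "consistently" short */
		st->working_max = got;
		st->since_trim = 0;
	} else if (got >= asked) {
		st->short_replies = 0;
	}
	/* some number of layoutgets later, try bumping the maximum back up */
	if (st->working_max < st->calc_max && ++st->since_trim >= 64)
		st->working_max = st->calc_max;
}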

Benny

> 
> Thanks,
> Tao
Fred Isaman June 10, 2011, 8:03 p.m. UTC | #28
On Fri, Jun 10, 2011 at 3:23 PM, Benny Halevy <benny@tonian.com> wrote:
>
> A simple algorithm I can suggest is:
> - on initialization, calculate and save, per layout driver
>  - maximum layout size

I must be misunderstanding something.  Layout size has nothing to do
with io size (other than the obvious fact that you want the layout >
io).

I don't know about the object driver, but for both the file and block
drivers the client wants as much as the server will give it.

Fred

>    - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>  - keep a working copy of the maximum value and the calculated copy.
>  - alignment value.
> - on miss, see if there's an adjacent layout segment in cache
> - if found, ask for twice the found segment size, up to the maximum value,
>  aligned on the alignment value.
> - if the server returns less the layoutget range, keep note of the returned length
>  (but not adjust maximum yet, as the server may return a short segment for various
>   reasons)
> - if the server is consistent about returning less than was asked, adjust the
>  - working copy of the maximum length
> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>  to see if that was just due to high load or conflicts on the server
> - on any error returned for LAYOUTGET reset the algorithm parameters
> - on session reestablishment recalculate maximums.
>
> Benny
>
Benny Halevy June 10, 2011, 9:15 p.m. UTC | #29
On 2011-06-10 16:03, Fred Isaman wrote:
> On Fri, Jun 10, 2011 at 3:23 PM, Benny Halevy <benny@tonian.com> wrote:
>>
>> A simple algorithm I can suggest is:
>> - on initialization, calculate and save, per layout driver
>>  - maximum layout size
> 
> I must be misunderstanding something.  Layout size has nothing to do
> with io size (other than the obvious fact that you want the layout >
> io).

Exactly, and if the layouts you get from the server are too small,
it's hard to do efficient I/O (modulo layout segment merging
or gathering).
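
For illustration only, merging two cached ranges in the generic layer
could look roughly like this, using the iomode/offset/length fields of
struct pnfs_layout_range from fs/nfs/pnfs.h.  This is a sketch, not
actual fs/nfs code, and it ignores the NFS4_MAX_UINT64 "whole file"
length for brevity:

/* Hypothetical helper: merge src into dst when the two ranges have the
 * same iomode and touch or overlap.  Returns true if merged. */
static bool lseg_range_merge(struct pnfs_layout_range *dst,
			     const struct pnfs_layout_range *src)
{
	u64 dst_end = dst->offset + dst->length;
	u64 src_end = src->offset + src->length;

	if (dst->iomode != src->iomode)
		return false;
	if (src->offset > dst_end || dst->offset > src_end)
		return false;			/* disjoint: nothing to merge */

	if (src->offset < dst->offset)
		dst->offset = src->offset;
	dst->length = (dst_end > src_end ? dst_end : src_end) - dst->offset;
	return true;
}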

> 
> I don't know about the object driver, but for both the file and block
> drivers the client wants as much as the server will give it.
> 

For blocks the message buffer size (as mentioned below) may be
a limiting factor, hence the limit.
Another constraint, for blocks and to some extent for objects, is
provisional allocation: you don't want to just arbitrarily
ask for artificially large write layouts, as this may be interpreted
as intent to write to the whole range and will result in excessive
provisional allocation of resources.

Bottom line, I completely agree with "the client wants as much as the
server will give it" so it should ask for what it needs and let the
server decide whether to extend or trim the range if need be.

Benny

> Fred
> 
>>    - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>>  - keep a working copy of the maximum value and the calculated copy.
>>  - alignment value.
>> - on miss, see if there's an adjacent layout segment in cache
>> - if found, ask for twice the found segment size, up to the maximum value,
>>  aligned on the alignment value.
>> - if the server returns less the layoutget range, keep note of the returned length
>>  (but not adjust maximum yet, as the server may return a short segment for various
>>   reasons)
>> - if the server is consistent about returning less than was asked, adjust the
>>  - working copy of the maximum length
>> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>>  to see if that was just due to high load or conflicts on the server
>> - on any error returned for LAYOUTGET reset the algorithm parameters
>> - on session reestablishment recalculate maximums.
>>
>> Benny
>>
Boaz Harrosh June 10, 2011, 11:20 p.m. UTC | #30
On 06/10/2011 12:23 PM, Benny Halevy wrote:
> On 2011-06-10 10:09, tao.peng@emc.com wrote:
> 
> A simple algorithm I can suggest is:
> - on initialization, calculate and save, per layout driver
>   - maximum layout size
>     - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>   - keep a working copy of the maximum value and the calculated copy.
>   - alignment value.
> - on miss, see if there's an adjacent layout segment in cache
> - if found, ask for twice the found segment size, up to the maximum value,
>   aligned on the alignment value.
> - if the server returns less the layoutget range, keep note of the returned length
>   (but not adjust maximum yet, as the server may return a short segment for various
>    reasons)
> - if the server is consistent about returning less than was asked, adjust the
>   - working copy of the maximum length
> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>   to see if that was just due to high load or conflicts on the server
> - on any error returned for LAYOUTGET reset the algorithm parameters
> - on session reestablishment recalculate maximums.
> 
> Benny
> 

I completely disagree with all this. NACK!

The only proper thing a client can do is ask for what it needs, and only the application
can do that, because at the VFS level it is only second guessing, and is completely
pointless.

The only one that can know about structure, alignments, optimal IO sizes and layouts
is the server. The server even has more information with which to second-guess the
application, from the file size information and its share and lock disposition. Please
see my simple server-side algorithm.

Because you must understand one most important thing: any smart decision a client can
make will come after it has received the layout (stripe_unit, number-of-devices etc.). But
at that time it is too late; it has already sent the layoutget. Only the server knows
beforehand what the optimal size is. The client should just be a transparent
pipe from application to server. It should never ever set policy. Only the server
can/should do that.

Lets put the efforts and algorithms where they belong, please?

Boaz
Peng Tao June 11, 2011, 1:35 a.m. UTC | #31
On Sat, Jun 11, 2011 at 3:23 AM, Benny Halevy <benny@tonian.com> wrote:
> On 2011-06-10 10:09, tao.peng@emc.com wrote:
>> Hi, Benny,
>>
>> -----Original Message-----
>> From: Benny Halevy [mailto:benny@tonian.com]
>> Sent: Friday, June 10, 2011 8:33 PM
>> To: Peng, Tao
>> Cc: bergwolf@gmail.com; rees@umich.edu; linux-nfs@vger.kernel.org; honey@citi.umich.edu
>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>
>> On 2011-06-10 02:00, tao.peng@emc.com wrote:
>>> Hi, Benny,
>>>
>>> Cheers,
>>> -Bergwolf
>>>
>>>
>>> -----Original Message-----
>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org] On Behalf Of Benny Halevy
>>> Sent: Friday, June 10, 2011 5:23 AM
>>> To: Peng Tao
>>> Cc: Jim Rees; linux-nfs@vger.kernel.org; peter honeyman
>>> Subject: Re: [PATCH 87/88] Add configurable prefetch size for layoutget
>>>
>>> On 2011-06-09 08:07, Peng Tao wrote:
>>>> Hi, Jim and Benny,
>>>>
>>>> On Thu, Jun 9, 2011 at 9:58 PM, Jim Rees <rees@umich.edu> wrote:
>>>>> Benny Halevy wrote:
>>>>>
>>>>>  > My understanding is that layoutget specifies a min and max, and the server
>>>>>
>>>>>  There's a min.  What do you consider the max?
>>>>>  Whatever gets into csa_fore_chan_attrs.ca_maxresponsesize?
>>>>>
>>>>> The spec doesn't say max, it says "desired."  I guess I assumed the server
>>>>> wouldn't normally return more than desired.
>>>> In fact server is returning "desired" length. The problem is that we
>>>> call pnfs_update_layout in nfs_write_begin, and it will end up setting
>>>> both minlength and length to page size. There is no space for client
>>>> to collapse layoutget range in nfs_write_begin.
>>>>
>>>
>>> That's a different issue.  Waiting with pnfs_update_layout to flush
>>> time rather than write_begin if the whole page is written would help
>>> sending a more meaningful desired range as well as avoiding needless
>>> read-modify-writes in case the application also wrote the whole
>>> preallocated block.
>>> [PT] It is also the reason why we want to introduce layout prefetching, to get more segment than the page passed in nfs_write_begin.
>>>
>>
>> Peng, I understand what you want to achieve but the proposed way
>> just doesn't fly. The server knows better than the client its allocation policies
>> and it knows better the combined workload of different client and possible
>> conflicts between them therefore it should be making the ultimate decision
>> about the actual segment sizes.
>> [PT] Yes, you are right. Server should know combined workload of all clients and make its decision based on that.
>> And it always has the right to return more than (or less than) specified in loga_length.
>>
>> That said, the client should indeed do its best to ask for the most appropriate
>> segments size for its use and we should be making a better job at that.
>> It's just that blindly asking for more is not a good strategy and requiring
>> manual admin help to tune the clients is not acceptable.
>> [PT] yeah, determing the most appropriate is always the hart part. Do you have any suggestions to that?
>
> A simple algorithm I can suggest is:
> - on initialization, calculate and save, per layout driver
>  - maximum layout size
>    - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>  - keep a working copy of the maximum value and the calculated copy.
>  - alignment value.
> - on miss, see if there's an adjacent layout segment in cache
Err, that's another issue. The generic layer should really merge adjacent
layout segments when necessary, instead of letting the lookup code figure
out which segments are adjacent...

> - if found, ask for twice the found segment size, up to the maximum value,
>  aligned on the alignment value.
> - if the server returns less the layoutget range, keep note of the returned length
>  (but not adjust maximum yet, as the server may return a short segment for various
>   reasons)
> - if the server is consistent about returning less than was asked, adjust the
>  - working copy of the maximum length
If the server is consistent about returning more/less than asked, it is an
indicator that the server is adjusting the range automatically. Then the client
should stop using this algorithm and trust the server's behavior...

> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>  to see if that was just due to high load or conflicts on the server
> - on any error returned for LAYOUTGET reset the algorithm parameters
> - on session reestablishment recalculate maximums.
>
> Benny
>
>>
>> Thanks,
>> Tao
>
Peng Tao June 11, 2011, 1:46 a.m. UTC | #32
Hi, Benny,

On Sat, Jun 11, 2011 at 5:15 AM, Benny Halevy <benny@tonian.com> wrote:
> On 2011-06-10 16:03, Fred Isaman wrote:
>> On Fri, Jun 10, 2011 at 3:23 PM, Benny Halevy <benny@tonian.com> wrote:
>>>
>>> A simple algorithm I can suggest is:
>>> - on initialization, calculate and save, per layout driver
>>>  - maximum layout size
>>
>> I must be misunderstanding something.  Layout size has nothing to do
>> with io size (other than the obvious fact that you want the layout >
>> io).
>
> Exactly, and if the layouts you get from the server are too small
> it's hard to do efficient I/O (modulo layout segment merging
> or gathering)
>
>>
>> I don't know about the object driver, but for both the file and block
>> drivers the client wants as much as the server will give it.
>>
>
> For blocks the message buffer size (as mentioned below) may be
> a limiting factor hence the limit.
It is true that the message buffer size can limit the number of extents
returned by a block server. But the length that each extent is
addressing is unknown. It can be of any size (from layout_blksize to
even larger than the file size), determined by the file's on-disk layout.
The client cannot guess a maximum segment size based on the message buffer size.

> Another constraint for blocks and for objects to some extent is
> provisional allocation where you don't want to just arbitrarily
> ask for artificially large write layouts and this may be interpreted
> as intent to write to the whole range and will result in excessive
> provisional allocation of resources.
>
> Bottom line, I completely agree with "the client wants as much as the
> server will give it" so it should ask for what it needs and let the
> server decide whether to extend or trim the range if need be.
>
> Benny
>
>> Fred
>>
>>>    - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>>>  - keep a working copy of the maximum value and the calculated copy.
>>>  - alignment value.
>>> - on miss, see if there's an adjacent layout segment in cache
>>> - if found, ask for twice the found segment size, up to the maximum value,
>>>  aligned on the alignment value.
>>> - if the server returns less the layoutget range, keep note of the returned length
>>>  (but not adjust maximum yet, as the server may return a short segment for various
>>>   reasons)
>>> - if the server is consistent about returning less than was asked, adjust the
>>>  - working copy of the maximum length
>>> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>>>  to see if that was just due to high load or conflicts on the server
>>> - on any error returned for LAYOUTGET reset the algorithm parameters
>>> - on session reestablishment recalculate maximums.
>>>
>>> Benny
>>>
>
Peng Tao June 11, 2011, 2:19 a.m. UTC | #33
On Sat, Jun 11, 2011 at 7:20 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 06/10/2011 12:23 PM, Benny Halevy wrote:
>> On 2011-06-10 10:09, tao.peng@emc.com wrote:
>>
>> A simple algorithm I can suggest is:
>> - on initialization, calculate and save, per layout driver
>>   - maximum layout size
>>     - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>>   - keep a working copy of the maximum value and the calculated copy.
>>   - alignment value.
>> - on miss, see if there's an adjacent layout segment in cache
>> - if found, ask for twice the found segment size, up to the maximum value,
>>   aligned on the alignment value.
>> - if the server returns less the layoutget range, keep note of the returned length
>>   (but not adjust maximum yet, as the server may return a short segment for various
>>    reasons)
>> - if the server is consistent about returning less than was asked, adjust the
>>   - working copy of the maximum length
>> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>>   to see if that was just due to high load or conflicts on the server
>> - on any error returned for LAYOUTGET reset the algorithm parameters
>> - on session reestablishment recalculate maximums.
>>
>> Benny
>>
>
> I completely disagree with all this. NACK!
>
> The only proper thing a client can do is ask for what it needs, and only the application
> can do that, because at the VFS level it is only second guessing, and is completely
> pointless.
>
> The only one that can know about structure, alignments, optimal IO sizes and layouts
> is the server. The server even have more information to second guess the application
> from the file size information and it's share and lock disposition. Please see my
> simple Server side algorithm.
Well, IMO, the client is closer to the applications and should be in a better
position to "guess" an application's workload.

A simple example: when a client asks for a layout, the server has
no idea whether the client is doing layout prefetch or whether it really needs
that range to complete its work. But the client knows it for sure.

>
> Because you must understand one most important thing. Any smart decision a client can
> make will be after it received the layout (stripe_unit, number-of-devices etc..) But
> at that time it is too late it already sent the layout_get. Only the server knows
> before hand what is the most optimal size. The client should just be a transparent
> pipe from application to the server. It should never ever set policy. Only a Server
> can/should do that.
>
> Lets put the efforts and algorithms where they belong, please?
>
> Boaz
>
Boaz Harrosh June 12, 2011, 2:40 p.m. UTC | #34
On 06/10/2011 07:19 PM, Peng Tao wrote:
> On Sat, Jun 11, 2011 at 7:20 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>> On 06/10/2011 12:23 PM, Benny Halevy wrote:
>>> On 2011-06-10 10:09, tao.peng@emc.com wrote:
>>>
>>> A simple algorithm I can suggest is:
>>> - on initialization, calculate and save, per layout driver
>>>   - maximum layout size
>>>     - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>>>   - keep a working copy of the maximum value and the calculated copy.
>>>   - alignment value.
>>> - on miss, see if there's an adjacent layout segment in cache
>>> - if found, ask for twice the found segment size, up to the maximum value,
>>>   aligned on the alignment value.
>>> - if the server returns less the layoutget range, keep note of the returned length
>>>   (but not adjust maximum yet, as the server may return a short segment for various
>>>    reasons)
>>> - if the server is consistent about returning less than was asked, adjust the
>>>   - working copy of the maximum length
>>> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>>>   to see if that was just due to high load or conflicts on the server
>>> - on any error returned for LAYOUTGET reset the algorithm parameters
>>> - on session reestablishment recalculate maximums.
>>>
>>> Benny
>>>
>>
>> I completely disagree with all this. NACK!
>>
>> The only proper thing a client can do is ask for what it needs, and only the application
>> can do that, because at the VFS level it is only second guessing, and is completely
>> pointless.
>>
>> The only one that can know about structure, alignments, optimal IO sizes and layouts
>> is the server. The server even have more information to second guess the application
>> from the file size information and it's share and lock disposition. Please see my
>> simple Server side algorithm.
> Well, IMO, client is closer to applications and should have a better
> position at "guessing" application's workload.
> 
> A simple example, when a client asks for a layout, server would have
> no idea if client is doing layout prefetch, or if it really need that
> range to complete its work. But the client knows it for sure.
> 

What is layout prefetch? We don't do any in the Linux client.

And your example does not make any sense. If a theoretical stupid client does
"layout prefetch", whatever that means, how is it related to the application?

There is not a single piece of information the client has that the server does not. Look at
your patch: it sets an arbitrary value at client setup. That same arbitrary value
can be set up at the server. Look at Benny's algorithm. It can be done just the same
at the server side. (Which does not mean it is a good algorithm.)

Just take that patch of yours, but put it at the server side instead of the
client side. You will achieve the exact same results, only you configure one
server instead of every client.
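
To make that concrete, a purely hypothetical server-side counterpart of the
same knob (not actual nfsd/MDS code; the function and parameter names are
made up, only loga_length/loga_minlength come from the LAYOUTGET args) would
be something like:

/* One tunable on the server extends every granted range instead. */
static u64 mds_grant_length(u64 loga_length, u64 loga_minlength,
			    u64 server_prefetch_bytes)
{
	u64 len = loga_length;

	if (len < server_prefetch_bytes)
		len = server_prefetch_bytes;	/* configured once, on the server */
	if (len < loga_minlength)
		len = loga_minlength;		/* never below the client's minimum */
	return len;
}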

And no way in hell will I let that go into the generic Linux client!

Boaz

>>
>> Because you must understand one most important thing. Any smart decision a client can
>> make will be after it received the layout (stripe_unit, number-of-devices etc..) But
>> at that time it is too late it already sent the layout_get. Only the server knows
>> before hand what is the most optimal size. The client should just be a transparent
>> pipe from application to the server. It should never ever set policy. Only a Server
>> can/should do that.
>>
>> Lets put the efforts and algorithms where they belong, please?
>>
>> Boaz
>>
> 
> 
> 

Peng Tao June 12, 2011, 6:46 p.m. UTC | #35
On Sun, Jun 12, 2011 at 10:40 PM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 06/10/2011 07:19 PM, Peng Tao wrote:
>> On Sat, Jun 11, 2011 at 7:20 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>>> On 06/10/2011 12:23 PM, Benny Halevy wrote:
>>>> On 2011-06-10 10:09, tao.peng@emc.com wrote:
>>>>
>>>> A simple algorithm I can suggest is:
>>>> - on initialization, calculate and save, per layout driver
>>>>   - maximum layout size
>>>>     - take into account csr_fore_chan_attrs.ca_maxresponsesize and possible other parameters
>>>>   - keep a working copy of the maximum value and the calculated copy.
>>>>   - alignment value.
>>>> - on miss, see if there's an adjacent layout segment in cache
>>>> - if found, ask for twice the found segment size, up to the maximum value,
>>>>   aligned on the alignment value.
>>>> - if the server returns less the layoutget range, keep note of the returned length
>>>>   (but not adjust maximum yet, as the server may return a short segment for various
>>>>    reasons)
>>>> - if the server is consistent about returning less than was asked, adjust the
>>>>   - working copy of the maximum length
>>>> - if the maximum was adjusted try bumping it up after X (TBD) layoutgets or T seconds
>>>>   to see if that was just due to high load or conflicts on the server
>>>> - on any error returned for LAYOUTGET reset the algorithm parameters
>>>> - on session reestablishment recalculate maximums.
>>>>
>>>> Benny
>>>>
>>>
>>> I completely disagree with all this. NACK!
>>>
>>> The only proper thing a client can do is ask for what it needs, and only the application
>>> can do that, because at the VFS level it is only second guessing, and is completely
>>> pointless.
>>>
>>> The only one that can know about structure, alignments, optimal IO sizes and layouts
>>> is the server. The server even have more information to second guess the application
>>> from the file size information and it's share and lock disposition. Please see my
>>> simple Server side algorithm.
>> Well, IMO, client is closer to applications and should have a better
>> position at "guessing" application's workload.
>>
>> A simple example, when a client asks for a layout, server would have
>> no idea if client is doing layout prefetch, or if it really need that
>> range to complete its work. But the client knows it for sure.
>>
>
> What is layout prefetch? we don't do any in the Linux client.
>
> And your example does not make any sense. If a theoretical stupid client does
> "layout prefetch" what ever that means, how is it related to the application?
The example is just showing you that sometimes the server does not know
everything, and the thing it does not know can have some impact on
application IO performance. If the client is asking for more than it needs
and the server is returning more than the client asks for, application
performance is impacted for sure.

>
> There is not a single information a client has the the server does not. Look at
> your patch. Set an arbitrary value at client setup. That same arbitrary value
> can be setup at server. Look at benny's algorithm. It can be done just the same
> at the server side. (Which does not mean it is a good algorithm)
True. And both the client and the server should have some kind of
algorithm implemented. The server can be smart, but that does not prevent
the client from being smart too. There will always be naive clients and
naive servers, and we can't prevent users from using them. When users
are dealing with naive servers, we can do better with an algorithm at the
client side than simply telling them "it's your fault for not using
a smart server!".

>
> Just take that patch of yours, but instead of at client side. Put it at server
> side. You will achieve exact same results. Only you configure one server instead
> of every client.
>
> And no way in hell I let that go into the Linux generic client!
While taking your side so firmly, would you mind telling us why such an
algorithm cannot be implemented at the client side?

>
> Boaz
>
>>>
>>> Because you must understand one most important thing. Any smart decision a client can
>>> make will be after it received the layout (stripe_unit, number-of-devices etc..) But
>>> at that time it is too late it already sent the layout_get. Only the server knows
>>> before hand what is the most optimal size. The client should just be a transparent
>>> pipe from application to the server. It should never ever set policy. Only a Server
>>> can/should do that.
>>>
>>> Lets put the efforts and algorithms where they belong, please?
>>>
>>> Boaz
>>>
>>
>>
>>
>
>
diff mbox

Patch

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 9920bff..9c2b569 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -46,6 +46,11 @@  static DEFINE_SPINLOCK(pnfs_spinlock);
  */
 static LIST_HEAD(pnfs_modules_tbl);
 
+/*
+ * layoutget prefetch size
+ */
+unsigned int pnfs_layout_prefetch_kb = 2 << 10;
+
 /* Return the registered pnfs layout driver module matching given id */
 static struct pnfs_layoutdriver_type *
 find_pnfs_driver_locked(u32 id)
@@ -906,6 +911,16 @@  pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 }
 
 /*
+ * Set layout prefetch length.
+ */
+static void
+pnfs_set_layout_prefetch(struct pnfs_layout_range *range)
+{
+	if (range->length < (pnfs_layout_prefetch_kb << 10))
+		range->length = pnfs_layout_prefetch_kb << 10;
+}
+
+/*
  * Layout segment is retreived from the server if not cached.
  * The appropriate layout segment is referenced and returned to the caller.
  */
@@ -956,6 +971,8 @@  pnfs_update_layout(struct inode *ino,
 
 	if (pnfs_layoutgets_blocked(lo, NULL, 0))
 		goto out_unlock;
+
+	pnfs_set_layout_prefetch(&arg);
 	atomic_inc(&lo->plh_outstanding);
 
 	get_layout_hdr(lo);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 28d57c9..563c67b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -182,6 +182,7 @@  extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
 /* pnfs.c */
+extern unsigned int pnfs_layout_prefetch_kb;
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 978aaeb..79a5134 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -14,6 +14,7 @@ 
 #include <linux/nfs_fs.h>
 
 #include "callback.h"
+#include "pnfs.h"
 
 #ifdef CONFIG_NFS_V4
 static const int nfs_set_port_min = 0;
@@ -42,6 +43,15 @@  static ctl_table nfs_cb_sysctls[] = {
 	},
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
+#ifdef CONFIG_NFS_V4_1
+	{
+		.procname	= "pnfs_layout_prefetch_kb",
+		.data		= &pnfs_layout_prefetch_kb,
+		.maxlen		= sizeof(pnfs_layout_prefetch_kb),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "nfs_mountpoint_timeout",
 		.data		= &nfs_mountpoint_expiry_timeout,