diff mbox

[v4,2/3] xfs: Set realtime flag based on initial allocation size

Message ID 20170919035238.3976871-3-rwareing@fb.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Richard Wareing Sept. 19, 2017, 3:52 a.m. UTC
- The rt_alloc_min sysfs option automatically selects the device (data
  device, or realtime) based on the size of the initial allocation of the
  file.
- This option can be used to route the storage of small files (and the
  inefficient workloads associated with them) to a suitable storage
  device such a SSD, while larger allocations are sent to a traditional
  HDD.
- Supports writes via O_DIRECT, buffered (i.e. page cache), and
  pre-allocations (i.e. fallocate)
- Available only when kernel is compiled w/ CONFIG_XFS_RT option.

Signed-off-by: Richard Wareing <rwareing@fb.com>
---
Changes since v3:
* Now functions via initial allocation regardless of O_DIRECT, buffered or
  pre-allocation code paths.  Provides a consistent user-experience.
* I Did do some experiments putting this in the xfs_bmapi_write code path
  however pre-allocation accounting unfortunately prevents this cleaner
  approach.  As such, this proved to be the cleanest and functional approach.
* No longer a mount option, now a sysfs tunable

Changes since v2:
* None

 fs/xfs/xfs_bmap_util.c |  3 +++
 fs/xfs/xfs_inode.c     | 18 ++++++++++++------
 fs/xfs/xfs_iomap.c     |  8 ++++++++
 fs/xfs/xfs_mount.h     |  1 +
 fs/xfs/xfs_rtalloc.c   | 26 ++++++++++++++++++++++++++
 fs/xfs/xfs_rtalloc.h   |  4 ++++
 fs/xfs/xfs_sysfs.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 95 insertions(+), 6 deletions(-)

Comments

Dave Chinner Sept. 22, 2017, 5:54 a.m. UTC | #1
On Mon, Sep 18, 2017 at 08:52:37PM -0700, Richard Wareing wrote:
> - The rt_alloc_min sysfs option automatically selects the device (data
>   device, or realtime) based on the size of the initial allocation of the
>   file.
> - This option can be used to route the storage of small files (and the
>   inefficient workloads associated with them) to a suitable storage
>   device such a SSD, while larger allocations are sent to a traditional
>   HDD.
> - Supports writes via O_DIRECT, buffered (i.e. page cache), and
>   pre-allocations (i.e. fallocate)
> - Available only when kernel is compiled w/ CONFIG_XFS_RT option.
> 
> Signed-off-by: Richard Wareing <rwareing@fb.com>
> ---
> Changes since v3:
> * Now functions via initial allocation regardless of O_DIRECT, buffered or
>   pre-allocation code paths.  Provides a consistent user-experience.
> * I Did do some experiments putting this in the xfs_bmapi_write code path
>   however pre-allocation accounting unfortunately prevents this cleaner
>   approach.  As such, this proved to be the cleanest and functional approach.
> * No longer a mount option, now a sysfs tunable
> 
> Changes since v2:
> * None
> 
>  fs/xfs/xfs_bmap_util.c |  3 +++
>  fs/xfs/xfs_inode.c     | 18 ++++++++++++------
>  fs/xfs/xfs_iomap.c     |  8 ++++++++
>  fs/xfs/xfs_mount.h     |  1 +
>  fs/xfs/xfs_rtalloc.c   | 26 ++++++++++++++++++++++++++
>  fs/xfs/xfs_rtalloc.h   |  4 ++++
>  fs/xfs/xfs_sysfs.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>  7 files changed, 95 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 9e3cc21..2d253fb 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1026,6 +1026,9 @@ xfs_alloc_file_space(
>  	if (len <= 0)
>  		return -EINVAL;
>  
> +	if (XFS_IS_REALTIME_MOUNT(mp))
> +		xfs_rt_alloc_min(ip, len);

I'd put the XFS_IS_REALTIME_MOUNT() check inside xfs_rt_alloc_min().
That way we can compile the code out completely when
CONFIG_XFS_RT=n.

>  	rt = XFS_IS_REALTIME_INODE(ip);
>  	extsz = xfs_get_extsz_hint(ip);
>  
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index ec9826c..f9e2deb 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1620,12 +1620,18 @@ xfs_itruncate_extents(
>  	if (error)
>  		goto out;
>  
> -	/*
> -	 * Clear the reflink flag if we truncated everything.
> -	 */
> -	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
> -		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
> -		xfs_inode_clear_cowblocks_tag(ip);
> +	if (ip->i_d.di_nblocks == 0) {
> +		/*
> +		 * Clear the reflink flag if we truncated everything.
> +		 */
> +		if (xfs_is_reflink_inode(ip)) {
> +			ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
> +			xfs_inode_clear_cowblocks_tag(ip);
> +		}
> +		/* Clear realtime flag if m_rt_alloc_min policy is in place */
> +		if (XFS_IS_REALTIME_MOUNT(mp) && mp->m_rt_alloc_min) {

Won't m_rt_alloc_min always be zero on non-rt filesystems?

> +			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
> +		}
>  	}
>  
>  	/*
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 94e5bdf..11f1c95 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -40,6 +40,7 @@
>  #include "xfs_dquot_item.h"
>  #include "xfs_dquot.h"
>  #include "xfs_reflink.h"
> +#include "xfs_rtalloc.h"
>  
>  
>  #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
> @@ -174,6 +175,10 @@ xfs_iomap_write_direct(
>  	int		bmapi_flags = XFS_BMAPI_PREALLOC;
>  	uint		tflags = 0;
>  
> +
> +	if (XFS_IS_REALTIME_MOUNT(mp))
> +		xfs_rt_alloc_min(ip, count);

Reading this makes me wonder what we are allocating here :/
A better name might be in order - something that indicates we're
selecting the target device rather allocating something. e.g.
xfs_inode_select_target()?

> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 9fa312a..067be3b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -197,6 +197,7 @@ typedef struct xfs_mount {
>  	__uint32_t		m_generation;
>  
>  	bool			m_fail_unmount;
> +        uint                    m_rt_alloc_min; /* Min RT allocation */

WHitespace problem - looks like spaces instead of tabs....

>  #ifdef DEBUG
>  	/*
>  	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index c57aa7f..e51cb25 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -1284,3 +1284,29 @@ xfs_rtpick_extent(
>  	*pick = b;
>  	return 0;
>  }
> +
> +/*
> + * Automatically set real-time flag if initial write to inode is > m_rt_alloc_min
> + *
> + * Also valid on truncations.
> + *
> + */
> +void xfs_rt_alloc_min(
> +	struct xfs_inode	*ip,
> +	xfs_off_t       	len)
> +{
> +	struct xfs_mount    *mp = ip->i_mount;
> +
> +	if (!mp->m_rt_alloc_min || ip->i_d.di_size)
> +		return;

I kinda prefer stacking single checks like

	/*
	 * m_rt_alloc_min controls the target selection. It is
	 * inactive if it is not set.
	 */
	if (!mp->m_rt_alloc_min)
		return;

	/*
	 * Can't select a different target if we've already
	 * allocated blocks (e.g. fallocate() beyond EOF) or has
	 * data in it already.
	 */
	if (!ip->i_nextents)
		return;
	if (!ip->i_d.di_size)
		return;

> +	if (XFS_IS_REALTIME_INODE(ip)) {
> +		if (len < mp->m_rt_alloc_min) {
> +			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
> +		}
> +	} else {
> +		if (len >= mp->m_rt_alloc_min) {
> +			ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
> +		}
> +	}

Checking for XFS_IS_REALTIME_INODE() is redundant here. This does
the same thing:

	/*
	 * if the allocation length is less than the threshold,
	 * always select the data device. Otherwise we should
	 * select the realtime device.
	 */
	if (len < mp->m_rt_alloc_min)
		ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
	else
		ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;

> +}
> diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
> index f13133e..12939d9 100644
> --- a/fs/xfs/xfs_rtalloc.h
> +++ b/fs/xfs/xfs_rtalloc.h
> @@ -136,6 +136,9 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp,
>  int xfs_rtalloc_query_all(struct xfs_trans *tp,
>  			  xfs_rtalloc_query_range_fn fn,
>  			  void *priv);
> +void xfs_rt_alloc_min(struct xfs_inode *ip, xfs_off_t len);
> +
> +
>  #else
>  # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
>  # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
> @@ -155,6 +158,7 @@ xfs_rtmount_init(
>  }
>  # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
>  # define xfs_rtunmount_inodes(m)
> +# define xfs_rt_alloc_min(i,l)                          (ENOSYS)
>  #endif	/* CONFIG_XFS_RT */
>  
>  #endif	/* __XFS_RTALLOC_H__ */
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index 80ac15f..3c8dedb 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -129,10 +129,51 @@ XFS_SYSFS_ATTR_RW(drop_writes);
>  
>  #endif /* DEBUG */
>  
> +#ifdef CONFIG_XFS_RT
> +STATIC ssize_t
> +rt_alloc_min_store(
> +	struct kobject			*kobject,
> +	const char				*buf,
> +	size_t					count)
> +{
> +	struct xfs_mount		*mp = to_mp(kobject);
> +	int						ret;
> +	int						val;
> +
> +	ret = kstrtoint(buf, 0, &val);
> +	if (ret)
> +		return ret;
> +
> +	/* Only valid if using a real-time device */
> +	if (XFS_IS_REALTIME_MOUNT(mp) && val > 0)
> +		mp->m_rt_alloc_min = val;
> +	else if (val <= 0)
> +		mp->m_rt_alloc_min = 0;
> +	else
> +		return -EINVAL;

Seems inconsistent. This will allow a value <= 0 for a non-realtime
mount, but will return EINVAL for values > 0. Perhaps it would be
more consistent to return EINVAL for a non-realtime mount or
any value < 0?

> +
> +	return count;
> +}
> +
> +STATIC ssize_t
> +rt_alloc_min_show(
> +	struct kobject          *kobject,
> +	char                    *buf)
> +{
> +	struct xfs_mount        *mp = to_mp(kobject);
> +
> +	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_rt_alloc_min);
> +}
> +XFS_SYSFS_ATTR_RW(rt_alloc_min);
> +#endif /* CONFIG_XFS_RT */
> +
>  static struct attribute *xfs_mp_attrs[] = {
>  #ifdef DEBUG
>  	ATTR_LIST(drop_writes),
>  #endif
> +#ifdef CONFIG_XFS_RT
> +	ATTR_LIST(rt_alloc_min),
> +#endif

This is userspace visible - shouldn't we always compile this in,
even if all it does it return EINVAL to attempts to change it when
CONFIG_XFS_RT=n?

Cheers,

Dave.
Richard Wareing Sept. 22, 2017, 7:06 p.m. UTC | #2
Dave Chinner <david@fromorbit.com> wrote:

> On Mon, Sep 18, 2017 at 08:52:37PM -0700, Richard Wareing wrote:
>> - The rt_alloc_min sysfs option automatically selects the device (data
>>   device, or realtime) based on the size of the initial allocation of the
>>   file.
>> - This option can be used to route the storage of small files (and the
>>   inefficient workloads associated with them) to a suitable storage
>>   device such a SSD, while larger allocations are sent to a traditional
>>   HDD.
>> - Supports writes via O_DIRECT, buffered (i.e. page cache), and
>>   pre-allocations (i.e. fallocate)
>> - Available only when kernel is compiled w/ CONFIG_XFS_RT option.
>>
>> Signed-off-by: Richard Wareing <rwareing@fb.com>
>> ---
>> Changes since v3:
>> * Now functions via initial allocation regardless of O_DIRECT, buffered or
>>   pre-allocation code paths.  Provides a consistent user-experience.
>> * I Did do some experiments putting this in the xfs_bmapi_write code path
>>   however pre-allocation accounting unfortunately prevents this cleaner
>>   approach.  As such, this proved to be the cleanest and functional approach.
>> * No longer a mount option, now a sysfs tunable
>>
>> Changes since v2:
>> * None
>>
>>  fs/xfs/xfs_bmap_util.c |  3 +++
>>  fs/xfs/xfs_inode.c     | 18 ++++++++++++------
>>  fs/xfs/xfs_iomap.c     |  8 ++++++++
>>  fs/xfs/xfs_mount.h     |  1 +
>>  fs/xfs/xfs_rtalloc.c   | 26 ++++++++++++++++++++++++++
>>  fs/xfs/xfs_rtalloc.h   |  4 ++++
>>  fs/xfs/xfs_sysfs.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>>  7 files changed, 95 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
>> index 9e3cc21..2d253fb 100644
>> --- a/fs/xfs/xfs_bmap_util.c
>> +++ b/fs/xfs/xfs_bmap_util.c
>> @@ -1026,6 +1026,9 @@ xfs_alloc_file_space(
>>  	if (len <= 0)
>>  		return -EINVAL;
>>
>> +	if (XFS_IS_REALTIME_MOUNT(mp))
>> +		xfs_rt_alloc_min(ip, len);
>
> I'd put the XFS_IS_REALTIME_MOUNT() check inside xfs_rt_alloc_min().
> That way we can compile the code out completely when
> CONFIG_XFS_RT=n.
>

I was wondering about this, but elected to model the
XFS_IS_REALTIME_INODE code structure.  I'm partial to putting
this in the function as well, as it reads cleaner too.

>> rt = XFS_IS_REALTIME_INODE(ip);
>>  	extsz = xfs_get_extsz_hint(ip);
>>
>> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
>> index ec9826c..f9e2deb 100644
>> --- a/fs/xfs/xfs_inode.c
>> +++ b/fs/xfs/xfs_inode.c
>> @@ -1620,12 +1620,18 @@ xfs_itruncate_extents(
>>  	if (error)
>>  		goto out;
>>
>> -	/*
>> -	 * Clear the reflink flag if we truncated everything.
>> -	 */
>> -	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
>> -		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
>> -		xfs_inode_clear_cowblocks_tag(ip);
>> +	if (ip->i_d.di_nblocks == 0) {
>> +		/*
>> +		 * Clear the reflink flag if we truncated everything.
>> +		 */
>> +		if (xfs_is_reflink_inode(ip)) {
>> +			ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
>> +			xfs_inode_clear_cowblocks_tag(ip);
>> +		}
>> +		/* Clear realtime flag if m_rt_alloc_min policy is in place */
>> +		if (XFS_IS_REALTIME_MOUNT(mp) && mp->m_rt_alloc_min) {
>
> Won't m_rt_alloc_min always be zero on non-rt filesystems?
>

Yes, but was trying to be defensive, in case some future me or other
developer screwed something up in the future :).

>> +			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
>> +		}
>>  	}
>>
>>  	/*
>> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
>> index 94e5bdf..11f1c95 100644
>> --- a/fs/xfs/xfs_iomap.c
>> +++ b/fs/xfs/xfs_iomap.c
>> @@ -40,6 +40,7 @@
>>  #include "xfs_dquot_item.h"
>>  #include "xfs_dquot.h"
>>  #include "xfs_reflink.h"
>> +#include "xfs_rtalloc.h"
>>
>>
>>  #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
>> @@ -174,6 +175,10 @@ xfs_iomap_write_direct(
>>  	int		bmapi_flags = XFS_BMAPI_PREALLOC;
>>  	uint		tflags = 0;
>>
>> +
>> +	if (XFS_IS_REALTIME_MOUNT(mp))
>> +		xfs_rt_alloc_min(ip, count);
>
> Reading this makes me wonder what we are allocating here :/
> A better name might be in order - something that indicates we're
> selecting the target device rather allocating something. e.g.
> xfs_inode_select_target()?

I like your function name better than mine, changing this.

>
>> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
>> index 9fa312a..067be3b 100644
>> --- a/fs/xfs/xfs_mount.h
>> +++ b/fs/xfs/xfs_mount.h
>> @@ -197,6 +197,7 @@ typedef struct xfs_mount {
>>  	__uint32_t		m_generation;
>>
>>  	bool			m_fail_unmount;
>> +        uint                    m_rt_alloc_min; /* Min RT allocation */
>
> WHitespace problem - looks like spaces instead of tabs....
>

Fixing.

>> #ifdef DEBUG
>>  	/*
>>  	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
>> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
>> index c57aa7f..e51cb25 100644
>> --- a/fs/xfs/xfs_rtalloc.c
>> +++ b/fs/xfs/xfs_rtalloc.c
>> @@ -1284,3 +1284,29 @@ xfs_rtpick_extent(
>>  	*pick = b;
>>  	return 0;
>>  }
>> +
>> +/*
>> + * Automatically set real-time flag if initial write to inode is >  
>> m_rt_alloc_min
>> + *
>> + * Also valid on truncations.
>> + *
>> + */
>> +void xfs_rt_alloc_min(
>> +	struct xfs_inode	*ip,
>> +	xfs_off_t       	len)
>> +{
>> +	struct xfs_mount    *mp = ip->i_mount;
>> +
>> +	if (!mp->m_rt_alloc_min || ip->i_d.di_size)
>> +		return;
>
> I kinda prefer stacking single checks like
>
> 	/*
> 	 * m_rt_alloc_min controls the target selection. It is
> 	 * inactive if it is not set.
> 	 */
> 	if (!mp->m_rt_alloc_min)
> 		return;
>
> 	/*
> 	 * Can't select a different target if we've already
> 	 * allocated blocks (e.g. fallocate() beyond EOF) or has
> 	 * data in it already.
> 	 */
> 	if (!ip->i_nextents)
> 		return;
> 	if (!ip->i_d.di_size)
> 		return;
>

Fixing.

>> +	if (XFS_IS_REALTIME_INODE(ip)) {
>> +		if (len < mp->m_rt_alloc_min) {
>> +			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
>> +		}
>> +	} else {
>> +		if (len >= mp->m_rt_alloc_min) {
>> +			ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
>> +		}
>> +	}
>
> Checking for XFS_IS_REALTIME_INODE() is redundant here. This does
> the same thing:
>
> 	/*
> 	 * if the allocation length is less than the threshold,
> 	 * always select the data device. Otherwise we should
> 	 * select the realtime device.
> 	 */
> 	if (len < mp->m_rt_alloc_min)
> 		ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
> 	else
> 		ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
>

Fixing.


>> +}
>> diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
>> index f13133e..12939d9 100644
>> --- a/fs/xfs/xfs_rtalloc.h
>> +++ b/fs/xfs/xfs_rtalloc.h
>> @@ -136,6 +136,9 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp,
>>  int xfs_rtalloc_query_all(struct xfs_trans *tp,
>>  			  xfs_rtalloc_query_range_fn fn,
>>  			  void *priv);
>> +void xfs_rt_alloc_min(struct xfs_inode *ip, xfs_off_t len);
>> +
>> +
>>  #else
>>  # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
>>  # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
>> @@ -155,6 +158,7 @@ xfs_rtmount_init(
>>  }
>>  # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
>>  # define xfs_rtunmount_inodes(m)
>> +# define xfs_rt_alloc_min(i,l)                          (ENOSYS)
>>  #endif	/* CONFIG_XFS_RT */
>>
>>  #endif	/* __XFS_RTALLOC_H__ */
>> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
>> index 80ac15f..3c8dedb 100644
>> --- a/fs/xfs/xfs_sysfs.c
>> +++ b/fs/xfs/xfs_sysfs.c
>> @@ -129,10 +129,51 @@ XFS_SYSFS_ATTR_RW(drop_writes);
>>
>>  #endif /* DEBUG */
>>
>> +#ifdef CONFIG_XFS_RT
>> +STATIC ssize_t
>> +rt_alloc_min_store(
>> +	struct kobject			*kobject,
>> +	const char				*buf,
>> +	size_t					count)
>> +{
>> +	struct xfs_mount		*mp = to_mp(kobject);
>> +	int						ret;
>> +	int						val;
>> +
>> +	ret = kstrtoint(buf, 0, &val);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Only valid if using a real-time device */
>> +	if (XFS_IS_REALTIME_MOUNT(mp) && val > 0)
>> +		mp->m_rt_alloc_min = val;
>> +	else if (val <= 0)
>> +		mp->m_rt_alloc_min = 0;
>> +	else
>> +		return -EINVAL;
>
> Seems inconsistent. This will allow a value <= 0 for a non-realtime
> mount, but will return EINVAL for values > 0. Perhaps it would be
> more consistent to return EINVAL for a non-realtime mount or
> any value < 0?

Fair observation, fixing.

>
>> +
>> +	return count;
>> +}
>> +
>> +STATIC ssize_t
>> +rt_alloc_min_show(
>> +	struct kobject          *kobject,
>> +	char                    *buf)
>> +{
>> +	struct xfs_mount        *mp = to_mp(kobject);
>> +
>> +	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_rt_alloc_min);
>> +}
>> +XFS_SYSFS_ATTR_RW(rt_alloc_min);
>> +#endif /* CONFIG_XFS_RT */
>> +
>>  static struct attribute *xfs_mp_attrs[] = {
>>  #ifdef DEBUG
>>  	ATTR_LIST(drop_writes),
>>  #endif
>> +#ifdef CONFIG_XFS_RT
>> +	ATTR_LIST(rt_alloc_min),
>> +#endif
>
> This is userspace visible - shouldn't we always compile this in,
> even if all it does it return EINVAL to attempts to change it when
> CONFIG_XFS_RT=n?
>

I'm fine with changing this, my thinking here was to simply not
show options which aren't tunable without a realtime device.  But
I'm fine with throwing EINVAL too.


Richard



--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 9e3cc21..2d253fb 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1026,6 +1026,9 @@  xfs_alloc_file_space(
 	if (len <= 0)
 		return -EINVAL;
 
+	if (XFS_IS_REALTIME_MOUNT(mp))
+		xfs_rt_alloc_min(ip, len);
+
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ec9826c..f9e2deb 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1620,12 +1620,18 @@  xfs_itruncate_extents(
 	if (error)
 		goto out;
 
-	/*
-	 * Clear the reflink flag if we truncated everything.
-	 */
-	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
-		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
-		xfs_inode_clear_cowblocks_tag(ip);
+	if (ip->i_d.di_nblocks == 0) {
+		/*
+		 * Clear the reflink flag if we truncated everything.
+		 */
+		if (xfs_is_reflink_inode(ip)) {
+			ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+			xfs_inode_clear_cowblocks_tag(ip);
+		}
+		/* Clear realtime flag if m_rt_alloc_min policy is in place */
+		if (XFS_IS_REALTIME_MOUNT(mp) && mp->m_rt_alloc_min) {
+			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
+		}
 	}
 
 	/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 94e5bdf..11f1c95 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -40,6 +40,7 @@ 
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 #include "xfs_reflink.h"
+#include "xfs_rtalloc.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
@@ -174,6 +175,10 @@  xfs_iomap_write_direct(
 	int		bmapi_flags = XFS_BMAPI_PREALLOC;
 	uint		tflags = 0;
 
+
+	if (XFS_IS_REALTIME_MOUNT(mp))
+		xfs_rt_alloc_min(ip, count);
+
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
 	lockmode = XFS_ILOCK_SHARED;	/* locked by caller */
@@ -981,6 +986,9 @@  xfs_file_iomap_begin(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	if (XFS_IS_REALTIME_MOUNT(mp))
+		xfs_rt_alloc_min(ip, length);
+
 	if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
 			!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
 		/* Reserve delalloc blocks for regular writeback. */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9fa312a..067be3b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -197,6 +197,7 @@  typedef struct xfs_mount {
 	__uint32_t		m_generation;
 
 	bool			m_fail_unmount;
+        uint                    m_rt_alloc_min; /* Min RT allocation */
 #ifdef DEBUG
 	/*
 	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index c57aa7f..e51cb25 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1284,3 +1284,29 @@  xfs_rtpick_extent(
 	*pick = b;
 	return 0;
 }
+
+/*
+ * Automatically set real-time flag if initial write to inode is > m_rt_alloc_min
+ *
+ * Also valid on truncations.
+ *
+ */
+void xfs_rt_alloc_min(
+	struct xfs_inode	*ip,
+	xfs_off_t       	len)
+{
+	struct xfs_mount    *mp = ip->i_mount;
+
+	if (!mp->m_rt_alloc_min || ip->i_d.di_size)
+		return;
+
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		if (len < mp->m_rt_alloc_min) {
+			ip->i_d.di_flags &= ~XFS_DIFLAG_REALTIME;
+		}
+	} else {
+		if (len >= mp->m_rt_alloc_min) {
+			ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
+		}
+	}
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f13133e..12939d9 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -136,6 +136,9 @@  int xfs_rtalloc_query_range(struct xfs_trans *tp,
 int xfs_rtalloc_query_all(struct xfs_trans *tp,
 			  xfs_rtalloc_query_range_fn fn,
 			  void *priv);
+void xfs_rt_alloc_min(struct xfs_inode *ip, xfs_off_t len);
+
+
 #else
 # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
@@ -155,6 +158,7 @@  xfs_rtmount_init(
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
+# define xfs_rt_alloc_min(i,l)                          (ENOSYS)
 #endif	/* CONFIG_XFS_RT */
 
 #endif	/* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 80ac15f..3c8dedb 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -129,10 +129,51 @@  XFS_SYSFS_ATTR_RW(drop_writes);
 
 #endif /* DEBUG */
 
+#ifdef CONFIG_XFS_RT
+STATIC ssize_t
+rt_alloc_min_store(
+	struct kobject			*kobject,
+	const char				*buf,
+	size_t					count)
+{
+	struct xfs_mount		*mp = to_mp(kobject);
+	int						ret;
+	int						val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	/* Only valid if using a real-time device */
+	if (XFS_IS_REALTIME_MOUNT(mp) && val > 0)
+		mp->m_rt_alloc_min = val;
+	else if (val <= 0)
+		mp->m_rt_alloc_min = 0;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+STATIC ssize_t
+rt_alloc_min_show(
+	struct kobject          *kobject,
+	char                    *buf)
+{
+	struct xfs_mount        *mp = to_mp(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_rt_alloc_min);
+}
+XFS_SYSFS_ATTR_RW(rt_alloc_min);
+#endif /* CONFIG_XFS_RT */
+
 static struct attribute *xfs_mp_attrs[] = {
 #ifdef DEBUG
 	ATTR_LIST(drop_writes),
 #endif
+#ifdef CONFIG_XFS_RT
+	ATTR_LIST(rt_alloc_min),
+#endif
 	NULL,
 };