diff mbox series

[RFC,2/2] ceph: test basic ceph.quota.max_bytes quota

Message ID 20190402103428.21435-3-lhenriques@suse.com (mailing list archive)
State New, archived
Headers show
Series Initial CephFS tests | expand

Commit Message

Luis Henriques April 2, 2019, 10:34 a.m. UTC
Simple set of checks for CephFS max_bytes directory quota implementation.

Signed-off-by: Luis Henriques <lhenriques@suse.com>
---
 tests/ceph/002     | 147 +++++++++++++++++++++++++++++++++++++++++++++
 tests/ceph/002.out |   1 +
 tests/ceph/group   |   1 +
 3 files changed, 149 insertions(+)
 create mode 100755 tests/ceph/002
 create mode 100644 tests/ceph/002.out

Comments

Dave Chinner April 2, 2019, 9:09 p.m. UTC | #1
On Tue, Apr 02, 2019 at 11:34:28AM +0100, Luis Henriques wrote:
> Simple set of checks for CephFS max_bytes directory quota implementation.
> 
> Signed-off-by: Luis Henriques <lhenriques@suse.com>
> ---
>  tests/ceph/002     | 147 +++++++++++++++++++++++++++++++++++++++++++++
>  tests/ceph/002.out |   1 +
>  tests/ceph/group   |   1 +
>  3 files changed, 149 insertions(+)
>  create mode 100755 tests/ceph/002
>  create mode 100644 tests/ceph/002.out
> 
> diff --git a/tests/ceph/002 b/tests/ceph/002
> new file mode 100755
> index 000000000000..313865dc639e
> --- /dev/null
> +++ b/tests/ceph/002
> @@ -0,0 +1,147 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2019 SUSE LLC. All Rights Reserved.
> +#
> +# FS QA Test No. 002
> +#
> +# This tests basic ceph.quota.max_bytes quota features.
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +testdir=$TEST_DIR/quota-test

Try not to name local variables the same as well-known global
variables. When we talk about "test dir", we mean the mount point
for the test device, not the local, test-specific work directory.
 i.e. this is a "work dir", not a "test dir".

And, often, we just name them after the test that is running,
so we can identify what test left them behind. i.e.

workdir=$TEST_DIR/$seq

> +
> +tmp=/tmp/$$
> +status=1    # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +	cd /
> +	rm -rf $tmp.*
> +	rm -rf $testdir

Leave it behind for post-mortem analysis if necessary, remove it
before starting this test execution....

> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +. ./common/attr
> +
> +# real QA test starts here
> +_supported_os Linux
> +_supported_fs ceph
> +
> +_require_attrs
> +
> +set_quota()
> +{
> +	val=$1
> +	dir=$2
> +	$SETFATTR_PROG -n ceph.quota.max_bytes -v $val $dir >/dev/null 2>&1
> +}
> +
> +get_quota()
> +{
> +	dir=$1
> +	$GETFATTR_PROG --only-values -n ceph.quota.max_bytes $dir 2> /dev/null
> +}
> +
> +# function to write a file.  We use a loop because quotas in CephFS is a
> +# "best-effort" implementation, i.e. a write may actually be allowed even if the
> +# quota is being exceeded.  Using a loop reduces the chances of this to happen.
> +#
> +# NOTE: 'size' parameter is in M

xfs_io accepts "1m" as one megabyte.

> +write_file()
> +{
> +	file=$1
> +	size=$2 # size in M
> +	for (( i = 1; i < $size; i++ )); do
> +		$XFS_IO_PROG -f -c "pwrite -W $((i * 1048576)) 1048576" \
> +			     $file >/dev/null 2>&1
> +	done
> +}

Makes no sense to me. xfs_io does a write() loop internally with
this pwrite command of 4kB writes - the default buffer size. If you
want xfs_io to loop doing 1MB sized pwrite() calls, then all you
need is this:

	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io

> +
> +# Check a file size
> +#
> +# NOTE: 'expected' (size) parameter is in M
> +check_file_size()
> +{
> +	file=$1
> +	expected=$(($2 * 1048576))
> +	size=$(stat -c %s $file)
> +	if [ "$size" -ne "$expected" ]; then
> +		_fail "Expecting file with $expected got $size"
> +	fi

Just emit the size into the output, and if it's not the correct size
the test will fail because of a golden output mismatch.

i.e. just do:

stat -c %s $file

and nothing else.

> +}
> +
> +mkdir $testdir
> +
> +# test setting quota
> +set_quota 1000000 $testdir
> +ret=$(get_quota $testdir)
> +if [ "$ret" -ne 1000000 ]; then
> +	_fail "expected max_bytes quota to be 1000000, got '$ret' instead"
> +fi

Ditto. You should almost never need to use "if [comparison] then _fail
...." in fstests, because the golden output matching is precisely
for this purpose.

> +# set quota to largest acceptable value (0x7FFFFFFFFFFFFFFF)
> +set_quota 9223372036854775807 $testdir
> +ret=$(get_quota $testdir)
> +if [ "$ret" -ne 9223372036854775807 ]; then
> +	_fail "expected max_bytes quota to be 9223372036854775807, got '$ret' instead"
> +fi
> +# test resetting quota
> +set_quota 0 $testdir
> +ret=$(get_quota $testdir)
> +if [ -n "$ret" ]; then
> +	_fail "expected 0 max_bytes quota, got '$ret' instead"
> +fi
> +# set quota to invalid values (0x8000000000000000 and -1)
> +set_quota 9223372036854775808 $testdir
> +ret=$(get_quota $testdir)
> +if [ -n "$ret" ]; then
> +	_fail "expected max_bytes quota to be 0, got '$ret' instead"
> +fi
> +set_quota -1 $testdir
> +ret=$(get_quota $testdir)
> +if [ -n "$ret" ]; then
> +	_fail "expected max_bytes quota to be 0, got '$ret' instead"
> +fi

But didn't you test these boundary conditions in the last test? Why
repeat them here?

> +bigfile="$testdir/bigfile"
> +
> +# set quota to 10M
> +set_quota $((10 * 1048576)) $testdir
> +
> +# write 9M file
> +write_file $bigfile 9
> +check_file_size $bigfile 9
> +rm $bigfile

which is:

$XFS_IO_PROG -f -c "pwrite -W -B 1m 0 9m" $bigfile | _filter_xfs_io
stat -c %s $bigfile
rm -f $bigfile

So no need for functions here...

> diff --git a/tests/ceph/002.out b/tests/ceph/002.out
> new file mode 100644
> index 000000000000..c57ca23e5cbe
> --- /dev/null
> +++ b/tests/ceph/002.out
> @@ -0,0 +1 @@
> +QA output created by 002

An empty golden image file and lots of _fail calls in the test is an
indication you are probably not quite doing things the right way :P

Cheers,

Dave.
Luis Henriques April 3, 2019, 9:45 a.m. UTC | #2
Dave Chinner <david@fromorbit.com> writes:

> On Tue, Apr 02, 2019 at 11:34:28AM +0100, Luis Henriques wrote:
>> Simple set of checks for CephFS max_bytes directory quota implementation.
>> 
>> Signed-off-by: Luis Henriques <lhenriques@suse.com>
>> ---
>>  tests/ceph/002     | 147 +++++++++++++++++++++++++++++++++++++++++++++
>>  tests/ceph/002.out |   1 +
>>  tests/ceph/group   |   1 +
>>  3 files changed, 149 insertions(+)
>>  create mode 100755 tests/ceph/002
>>  create mode 100644 tests/ceph/002.out
>> 
>> diff --git a/tests/ceph/002 b/tests/ceph/002
>> new file mode 100755
>> index 000000000000..313865dc639e
>> --- /dev/null
>> +++ b/tests/ceph/002
>> @@ -0,0 +1,147 @@
>> +#! /bin/bash
>> +# SPDX-License-Identifier: GPL-2.0
>> +# Copyright (c) 2019 SUSE LLC. All Rights Reserved.
>> +#
>> +# FS QA Test No. 002
>> +#
>> +# This tests basic ceph.quota.max_bytes quota features.
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +testdir=$TEST_DIR/quota-test
>
> Try not to name local variables the same as when known global
> variables. When we talk about "test dir", we mean the mount point
> for the test device, not the local, tests specific work directory.
>  i.e. this is a "work dir", not a "test dir".
>
> And, often, we just name them after the test that is running,
> so we can identify what test left them behind. i.e.
>
> workdir=$TEST_DIR/$seq
>
>> +
>> +tmp=/tmp/$$
>> +status=1    # failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> +	cd /
>> +	rm -rf $tmp.*
>> +	rm -rf $testdir
>
> Leave it behind for post-mortem analysis if necessary, remove it
> before starting this test execution....
>
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/filter
>> +. ./common/attr
>> +
>> +# real QA test starts here
>> +_supported_os Linux
>> +_supported_fs ceph
>> +
>> +_require_attrs
>> +
>> +set_quota()
>> +{
>> +	val=$1
>> +	dir=$2
>> +	$SETFATTR_PROG -n ceph.quota.max_bytes -v $val $dir >/dev/null 2>&1
>> +}
>> +
>> +get_quota()
>> +{
>> +	dir=$1
>> +	$GETFATTR_PROG --only-values -n ceph.quota.max_bytes $dir 2> /dev/null
>> +}
>> +
>> +# function to write a file.  We use a loop because quotas in CephFS is a
>> +# "best-effort" implementation, i.e. a write may actually be allowed even if the
>> +# quota is being exceeded.  Using a loop reduces the chances of this to happen.
>> +#
>> +# NOTE: 'size' parameter is in M
>
> xfs_io accepts "1m" as one megabyte.
>
>> +write_file()
>> +{
>> +	file=$1
>> +	size=$2 # size in M
>> +	for (( i = 1; i < $size; i++ )); do
>> +		$XFS_IO_PROG -f -c "pwrite -W $((i * 1048576)) 1048576" \
>> +			     $file >/dev/null 2>&1
>> +	done
>> +}
>
> Makes no sense to me. xfs_io does a write() loop internally with
> this pwrite command of 4kB writes - the default buffer size. If you
> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
> need is this:
>
> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>

Thank you for your review, Dave.  I'll make sure the next revision of
these tests will include all your comments implemented... except for
this one.

The reason I'm using a loop for writing a file is due to the nature of
the (very!) loose definition of quotas in CephFS.  Basically, clients
will likely write some amount of data over the configured limit because
the servers they are communicating with to write the data (the OSDs)
have no idea about the concept of quotas (or files even); the filesystem
view in the cluster is managed at a different level, with the help of
the MDS and the client itself.

So, the loop in this function is simply to allow the metadata associated
with the file to be updated while we're writing the file.  If I use a
single pwrite, the whole file will be written before we get a -EDQUOT.

If an admin wants to really enforce some hard quotas in the filesystem,
there are other means to do that, but not at the filesystem level.
There are some more details on the quota implementation in Ceph here:

  http://docs.ceph.com/docs/master/cephfs/quota/

I hope this makes sense and helps understanding why I need a loop to be
used in this test.

Cheers
Nikolay Borisov April 3, 2019, 12:17 p.m. UTC | #3
On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
>> On Tue, Apr 02, 2019 at 11:34:28AM +0100, Luis Henriques wrote:
>>> Simple set of checks for CephFS max_bytes directory quota implementation.
>>>
>>> Signed-off-by: Luis Henriques <lhenriques@suse.com>
>>> ---
>>>  tests/ceph/002     | 147 +++++++++++++++++++++++++++++++++++++++++++++
>>>  tests/ceph/002.out |   1 +
>>>  tests/ceph/group   |   1 +
>>>  3 files changed, 149 insertions(+)
>>>  create mode 100755 tests/ceph/002
>>>  create mode 100644 tests/ceph/002.out
>>>
>>> diff --git a/tests/ceph/002 b/tests/ceph/002
>>> new file mode 100755
>>> index 000000000000..313865dc639e
>>> --- /dev/null
>>> +++ b/tests/ceph/002
>>> @@ -0,0 +1,147 @@
>>> +#! /bin/bash
>>> +# SPDX-License-Identifier: GPL-2.0
>>> +# Copyright (c) 2019 SUSE LLC. All Rights Reserved.
>>> +#
>>> +# FS QA Test No. 002
>>> +#
>>> +# This tests basic ceph.quota.max_bytes quota features.
>>> +#
>>> +
>>> +seq=`basename $0`
>>> +seqres=$RESULT_DIR/$seq
>>> +echo "QA output created by $seq"
>>> +
>>> +testdir=$TEST_DIR/quota-test
>>
>> Try not to name local variables the same as when known global
>> variables. When we talk about "test dir", we mean the mount point
>> for the test device, not the local, tests specific work directory.
>>  i.e. this is a "work dir", not a "test dir".
>>
>> And, often, we just name them after the test that is running,
>> so we can identify what test left them behind. i.e.
>>
>> workdir=$TEST_DIR/$seq
>>
>>> +
>>> +tmp=/tmp/$$
>>> +status=1    # failure is the default!
>>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>>> +
>>> +_cleanup()
>>> +{
>>> +	cd /
>>> +	rm -rf $tmp.*
>>> +	rm -rf $testdir
>>
>> Leave it behind for post-mortem analysis if necessary, remove it
>> before starting this test execution....
>>
>>> +}
>>> +
>>> +# get standard environment, filters and checks
>>> +. ./common/rc
>>> +. ./common/filter
>>> +. ./common/attr
>>> +
>>> +# real QA test starts here
>>> +_supported_os Linux
>>> +_supported_fs ceph
>>> +
>>> +_require_attrs
>>> +
>>> +set_quota()
>>> +{
>>> +	val=$1
>>> +	dir=$2
>>> +	$SETFATTR_PROG -n ceph.quota.max_bytes -v $val $dir >/dev/null 2>&1
>>> +}
>>> +
>>> +get_quota()
>>> +{
>>> +	dir=$1
>>> +	$GETFATTR_PROG --only-values -n ceph.quota.max_bytes $dir 2> /dev/null
>>> +}
>>> +
>>> +# function to write a file.  We use a loop because quotas in CephFS is a
>>> +# "best-effort" implementation, i.e. a write may actually be allowed even if the
>>> +# quota is being exceeded.  Using a loop reduces the chances of this to happen.
>>> +#
>>> +# NOTE: 'size' parameter is in M
>>
>> xfs_io accepts "1m" as one megabyte.
>>
>>> +write_file()
>>> +{
>>> +	file=$1
>>> +	size=$2 # size in M
>>> +	for (( i = 1; i < $size; i++ )); do
>>> +		$XFS_IO_PROG -f -c "pwrite -W $((i * 1048576)) 1048576" \
>>> +			     $file >/dev/null 2>&1
>>> +	done
>>> +}
>>
>> Makes no sense to me. xfs_io does a write() loop internally with
>> this pwrite command of 4kB writes - the default buffer size. If you
>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>> need is this:
>>
>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>>
> 
> Thank you for your review, Dave.  I'll make sure the next revision of
> these tests will include all your comments implemented... except for
> this one.
> 
> The reason I'm using a loop for writing a file is due to the nature of
> the (very!) loose definition of quotas in CephFS.  Basically, clients
> will likely write some amount of data over the configured limit because
> the servers they are communicating with to write the data (the OSDs)
> have no idea about the concept of quotas (or files even); the filesystem
> view in the cluster is managed at a different level, with the help of
> the MDS and the client itself.
> 
> So, the loop in this function is simply to allow the metadata associated
> with the file to be updated while we're writing the file.  If I use a

But the metadata will be modified while writing the file even with a
single invocation of xfs_io. It's just that you are moving the loop
inside xfs_io rather than having to invoke xfs_io a lot of times. Also,
just because you are using a single -c "pwrite" command doesn't mean
this will translate to a single call to pwrite. As Dave mentioned, the
default block size is 4k meaning :

"pwrite -w -B 1m 0 ${size}m"

will result in 'size / 1m' writes of size 1m, each being a distinct call
to pwrite.

> single pwrite, the whole file will be written before we get a -EDQUOT.
> 
> If an admin wants to really enforce some hard quotas in the filesystem,
> there are other means to do that, but not at the filesystem level.
> There are some more details on the quota implementation in Ceph here:
> 
>   http://docs.ceph.com/docs/master/cephfs/quota/
> 
> I hope this makes sense and helps understanding why I need a loop to be
> used in this test.
> 
> Cheers
>
Luis Henriques April 3, 2019, 1:19 p.m. UTC | #4
Nikolay Borisov <nborisov@suse.com> writes:

> On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
>> Dave Chinner <david@fromorbit.com> writes:
>> 
>>> On Tue, Apr 02, 2019 at 11:34:28AM +0100, Luis Henriques wrote:
>>>> Simple set of checks for CephFS max_bytes directory quota implementation.
>>>>
>>>> Signed-off-by: Luis Henriques <lhenriques@suse.com>
>>>> ---
>>>>  tests/ceph/002     | 147 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  tests/ceph/002.out |   1 +
>>>>  tests/ceph/group   |   1 +
>>>>  3 files changed, 149 insertions(+)
>>>>  create mode 100755 tests/ceph/002
>>>>  create mode 100644 tests/ceph/002.out
>>>>
>>>> diff --git a/tests/ceph/002 b/tests/ceph/002
>>>> new file mode 100755
>>>> index 000000000000..313865dc639e
>>>> --- /dev/null
>>>> +++ b/tests/ceph/002
>>>> @@ -0,0 +1,147 @@
>>>> +#! /bin/bash
>>>> +# SPDX-License-Identifier: GPL-2.0
>>>> +# Copyright (c) 2019 SUSE LLC. All Rights Reserved.
>>>> +#
>>>> +# FS QA Test No. 002
>>>> +#
>>>> +# This tests basic ceph.quota.max_bytes quota features.
>>>> +#
>>>> +
>>>> +seq=`basename $0`
>>>> +seqres=$RESULT_DIR/$seq
>>>> +echo "QA output created by $seq"
>>>> +
>>>> +testdir=$TEST_DIR/quota-test
>>>
>>> Try not to name local variables the same as when known global
>>> variables. When we talk about "test dir", we mean the mount point
>>> for the test device, not the local, tests specific work directory.
>>>  i.e. this is a "work dir", not a "test dir".
>>>
>>> And, often, we just name them after the test that is running,
>>> so we can identify what test left them behind. i.e.
>>>
>>> workdir=$TEST_DIR/$seq
>>>
>>>> +
>>>> +tmp=/tmp/$$
>>>> +status=1    # failure is the default!
>>>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>>>> +
>>>> +_cleanup()
>>>> +{
>>>> +	cd /
>>>> +	rm -rf $tmp.*
>>>> +	rm -rf $testdir
>>>
>>> Leave it behind for post-mortem analysis if necessary, remove it
>>> before starting this test execution....
>>>
>>>> +}
>>>> +
>>>> +# get standard environment, filters and checks
>>>> +. ./common/rc
>>>> +. ./common/filter
>>>> +. ./common/attr
>>>> +
>>>> +# real QA test starts here
>>>> +_supported_os Linux
>>>> +_supported_fs ceph
>>>> +
>>>> +_require_attrs
>>>> +
>>>> +set_quota()
>>>> +{
>>>> +	val=$1
>>>> +	dir=$2
>>>> +	$SETFATTR_PROG -n ceph.quota.max_bytes -v $val $dir >/dev/null 2>&1
>>>> +}
>>>> +
>>>> +get_quota()
>>>> +{
>>>> +	dir=$1
>>>> +	$GETFATTR_PROG --only-values -n ceph.quota.max_bytes $dir 2> /dev/null
>>>> +}
>>>> +
>>>> +# function to write a file.  We use a loop because quotas in CephFS is a
>>>> +# "best-effort" implementation, i.e. a write may actually be allowed even if the
>>>> +# quota is being exceeded.  Using a loop reduces the chances of this to happen.
>>>> +#
>>>> +# NOTE: 'size' parameter is in M
>>>
>>> xfs_io accepts "1m" as one megabyte.
>>>
>>>> +write_file()
>>>> +{
>>>> +	file=$1
>>>> +	size=$2 # size in M
>>>> +	for (( i = 1; i < $size; i++ )); do
>>>> +		$XFS_IO_PROG -f -c "pwrite -W $((i * 1048576)) 1048576" \
>>>> +			     $file >/dev/null 2>&1
>>>> +	done
>>>> +}
>>>
>>> Makes no sense to me. xfs_io does a write() loop internally with
>>> this pwrite command of 4kB writes - the default buffer size. If you
>>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>>> need is this:
>>>
>>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>>>
>> 
>> Thank you for your review, Dave.  I'll make sure the next revision of
>> these tests will include all your comments implemented... except for
>> this one.
>> 
>> The reason I'm using a loop for writing a file is due to the nature of
>> the (very!) loose definition of quotas in CephFS.  Basically, clients
>> will likely write some amount of data over the configured limit because
>> the servers they are communicating with to write the data (the OSDs)
>> have no idea about the concept of quotas (or files even); the filesystem
>> view in the cluster is managed at a different level, with the help of
>> the MDS and the client itself.
>> 
>> So, the loop in this function is simply to allow the metadata associated
>> with the file to be updated while we're writing the file.  If I use a
>
> But the metadata will be modified while writing the file even with a
> single invocation of xfs_io.

No, that's not true.  It would be too expensive to keep the metadata
server updated while writing to a file.  So, making sure there's
actually an open/close to the file (plus the fsync in pwrite) helps
make sure the metadata is flushed into the MDS.

(And yes, I _did_ try to use xfs_io with the 1m loop and the test fails
because it simply writes all the data and gets no EDQUOT error.)

Cheers,
Dave Chinner April 3, 2019, 9:47 p.m. UTC | #5
On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
> Nikolay Borisov <nborisov@suse.com> writes:
> > On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
> >> Dave Chinner <david@fromorbit.com> writes:
> >>> Makes no sense to me. xfs_io does a write() loop internally with
> >>> this pwrite command of 4kB writes - the default buffer size. If you
> >>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
> >>> need is this:
> >>>
> >>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
> >>>
> >> 
> >> Thank you for your review, Dave.  I'll make sure the next revision of
> >> these tests will include all your comments implemented... except for
> >> this one.
> >> 
> >> The reason I'm using a loop for writing a file is due to the nature of
> >> the (very!) loose definition of quotas in CephFS.  Basically, clients
> >> will likely write some amount of data over the configured limit because
> >> the servers they are communicating with to write the data (the OSDs)
> >> have no idea about the concept of quotas (or files even); the filesystem
> >> view in the cluster is managed at a different level, with the help of
> >> the MDS and the client itself.
> >> 
> >> So, the loop in this function is simply to allow the metadata associated
> >> with the file to be updated while we're writing the file.  If I use a
> >
> > But the metadata will be modified while writing the file even with a
> > single invocation of xfs_io.
> 
> No, that's not true.  It would be too expensive to keep the metadata
> server updated while writing to a file.  So, making sure there's
> actually an open/close to the file (plus the fsync in pwrite) helps
> making sure the metadata is flushed into the MDS.

/me sighs.

So you want:

	loop until ${size}MB written:
		write 1MB
		fsync
		  -> flush data to server
		  -> flush metadata to server

i.e. this one liner:

xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file

Fundamentally, if you find yourself writing a loop around xfs_io to
break up a sequential IO stream into individual chunks, then you are
most likely doing something xfs_io can already do. And if xfs_io
cannot do it, then the right thing to do is to modify xfs_io to be
able to do it and then use xfs_io....

Cheers,

Dave.
Luis Henriques April 4, 2019, 10:18 a.m. UTC | #6
Dave Chinner <david@fromorbit.com> writes:

> On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
>> Nikolay Borisov <nborisov@suse.com> writes:
>> > On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
>> >> Dave Chinner <david@fromorbit.com> writes:
>> >>> Makes no sense to me. xfs_io does a write() loop internally with
>> >>> this pwrite command of 4kB writes - the default buffer size. If you
>> >>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>> >>> need is this:
>> >>>
>> >>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>> >>>
>> >> 
>> >> Thank you for your review, Dave.  I'll make sure the next revision of
>> >> these tests will include all your comments implemented... except for
>> >> this one.
>> >> 
>> >> The reason I'm using a loop for writing a file is due to the nature of
>> >> the (very!) loose definition of quotas in CephFS.  Basically, clients
>> >> will likely write some amount of data over the configured limit because
>> >> the servers they are communicating with to write the data (the OSDs)
>> >> have no idea about the concept of quotas (or files even); the filesystem
>> >> view in the cluster is managed at a different level, with the help of
>> >> the MDS and the client itself.
>> >> 
>> >> So, the loop in this function is simply to allow the metadata associated
>> >> with the file to be updated while we're writing the file.  If I use a
>> >
>> > But the metadata will be modified while writing the file even with a
>> > single invocation of xfs_io.
>> 
>> No, that's not true.  It would be too expensive to keep the metadata
>> server updated while writing to a file.  So, making sure there's
>> actually an open/close to the file (plus the fsync in pwrite) helps
>> making sure the metadata is flushed into the MDS.
>
> /me sighs.
>
> So you want:
>
> 	loop until ${size}MB written:
> 		write 1MB
> 		fsync
> 		  -> flush data to server
> 		  -> flush metadata to server
>
> i.e. this one liner:
>
> xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file

Unfortunately, that doesn't do what I want either :-/
(and I guess you meant '-b 1m', not '-B 1m', right?)

[ Zheng: please feel free to correct me if I'm saying something really
  stupid below. ]

So, one of the key things in my loop is the open/close operations.  When
a file is closed in cephfs the capabilities (that's ceph jargon for what
sort of operations a client is allowed to perform on an inode) will
likely be released and that's when the metadata server will get the
updated file size.  Before that, the client is allowed to modify the
file size if it has acquired the capabilities for doing so.

OTOH, a pwrite operation will eventually get the -EDQUOT even with the
one-liner above because the client itself will realize it has exceeded a
certain threshold set by the MDS and will eventually update the server
with the new file size.  However that won't happen at a deterministic
file size.  For example, if quota is 10m and we're writing 20m, we may
get the error after writing 15m.

Does this make sense?

So, I guess I *could* use your one-liner in the test, but I would need
to slightly change the test logic -- I would need to write enough data
to the file to make sure I would get the -EDQUOT but I wouldn't be able
to actually check the file size as it will not be constant.

> Fundamentally, if you find yourself writing a loop around xfs_io to
> break up a sequential IO stream into individual chunks, then you are
> most likely doing something xfs_io can already do. And if xfs_io
> cannot do it, then the right thing to do is to modify xfs_io to be
> able to do it and then use xfs_io....

Got it!  But I guess it wouldn't make sense to change xfs_io for this
specific scenario where I want several open-write-close cycles.

Cheers,
Dave Chinner April 12, 2019, 1:15 a.m. UTC | #7
On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
> >> Nikolay Borisov <nborisov@suse.com> writes:
> >> > On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
> >> >> Dave Chinner <david@fromorbit.com> writes:
> >> >>> Makes no sense to me. xfs_io does a write() loop internally with
> >> >>> this pwrite command of 4kB writes - the default buffer size. If you
> >> >>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
> >> >>> need is this:
> >> >>>
> >> >>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
> >> >>>
> >> >> 
> >> >> Thank you for your review, Dave.  I'll make sure the next revision of
> >> >> these tests will include all your comments implemented... except for
> >> >> this one.
> >> >> 
> >> >> The reason I'm using a loop for writing a file is due to the nature of
> >> >> the (very!) loose definition of quotas in CephFS.  Basically, clients
> >> >> will likely write some amount of data over the configured limit because
> >> >> the servers they are communicating with to write the data (the OSDs)
> >> >> have no idea about the concept of quotas (or files even); the filesystem
> >> >> view in the cluster is managed at a different level, with the help of
> >> >> the MDS and the client itself.
> >> >> 
> >> >> So, the loop in this function is simply to allow the metadata associated
> >> >> with the file to be updated while we're writing the file.  If I use a
> >> >
> >> > But the metadata will be modified while writing the file even with a
> >> > single invocation of xfs_io.
> >> 
> >> No, that's not true.  It would be too expensive to keep the metadata
> >> server updated while writing to a file.  So, making sure there's
> >> actually an open/close to the file (plus the fsync in pwrite) helps
> >> making sure the metadata is flushed into the MDS.
> >
> > /me sighs.
> >
> > So you want:
> >
> > 	loop until ${size}MB written:
> > 		write 1MB
> > 		fsync
> > 		  -> flush data to server
> > 		  -> flush metadata to server
> >
> > i.e. this one liner:
> >
> > xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file
> 
> Unfortunately, that doesn't do what I want either :-/
> (and I guess you meant '-b 1m', not '-B 1m', right?)

Yes. But I definitely did mean "-D" so that RWF_DSYNC was used with
each 1MB write.

> [ Zheng: please feel free to correct me if I'm saying something really
>   stupid below. ]
> 
> So, one of the key things in my loop is the open/close operations.  When
> a file is closed in cephfs the capabilities (that's ceph jargon for what
> sort of operations a client is allowed to perform on an inode) will
> likely be released and that's when the metadata server will get the
> updated file size.  Before that, the client is allowed to modify the
> file size if it has acquired the capabilities for doing so.

So you are saying that O_DSYNC writes on ceph do not force file
size metadata changes to the metadata server to be made stable?

> OTOH, a pwrite operation will eventually get the -EDQUOT even with the
> one-liner above because the client itself will realize it has exceeded a
> certain threshold set by the MDS and will eventually update the server
> with the new file size.

Sure, but if the client crashes without having sent the updated file
size to the server as part of an extending O_DSYNC write, then how
is it recovered when the client reconnects to the server and
accesses the file again?

> However that won't happen at a deterministic
> file size.  For example, if quota is 10m and we're writing 20m, we may
> get the error after writing 15m.
> 
> Does this make sense?

Only makes sense to me if O_DSYNC is ignored by the ceph client...

> So, I guess I *could* use your one-liner in the test, but I would need
> to slightly change the test logic -- I would need to write enough data
> to the file to make sure I would get the -EDQUOT but I wouldn't be able
> to actually check the file size as it will not be constant.
> 
> > Fundamentally, if you find yourself writing a loop around xfs_io to
> > break up a sequential IO stream into individual chunks, then you are
> > most likely doing something xfs_io can already do. And if xfs_io
> > cannot do it, then the right thing to do is to modify xfs_io to be
> > able to do it and then use xfs_io....
> 
> Got it!  But I guess it wouldn't make sense to change xfs_io for this
> specific scenario where I want several open-write-close cycles.

That's how individual NFS client writes appear to the filesystem under
the NFS server. I've previously considered adding an option in
xfs_io to mimic this open-write-close loop per buffer so it's easy
to exercise such behaviours, but never actually required it to
reproduce the problems I was chasing. So it's definitely something
that xfs_io /could/ do if necessary.

Cheers,

Dave.
Yan, Zheng April 12, 2019, 3:37 a.m. UTC | #8
On 4/12/19 9:15 AM, Dave Chinner wrote:
> On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
>> Dave Chinner <david@fromorbit.com> writes:
>>
>>> On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
>>>> Nikolay Borisov <nborisov@suse.com> writes:
>>>>> On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
>>>>>> Dave Chinner <david@fromorbit.com> writes:
>>>>>>> Makes no sense to me. xfs_io does a write() loop internally with
>>>>>>> this pwrite command of 4kB writes - the default buffer size. If you
>>>>>>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>>>>>>> need is this:
>>>>>>>
>>>>>>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>>>>>>>
>>>>>>
>>>>>> Thank you for your review, Dave.  I'll make sure the next revision of
>>>>>> these tests will include all your comments implemented... except for
>>>>>> this one.
>>>>>>
>>>>>> The reason I'm using a loop for writing a file is due to the nature of
>>>>>> the (very!) loose definition of quotas in CephFS.  Basically, clients
>>>>>> will likely write some amount of data over the configured limit because
>>>>>> the servers they are communicating with to write the data (the OSDs)
>>>>>> have no idea about the concept of quotas (or files even); the filesystem
>>>>>> view in the cluster is managed at a different level, with the help of
>>>>>> the MDS and the client itself.
>>>>>>
>>>>>> So, the loop in this function is simply to allow the metadata associated
>>>>>> with the file to be updated while we're writing the file.  If I use a
>>>>>
>>>>> But the metadata will be modified while writing the file even with a
>>>>> single invocation of xfs_io.
>>>>
>>>> No, that's not true.  It would be too expensive to keep the metadata
>>>> server updated while writing to a file.  So, making sure there's
>>>> actually an open/close to the file (plus the fsync in pwrite) helps
>>>> making sure the metadata is flushed into the MDS.
>>>
>>> /me sighs.
>>>
>>> So you want:
>>>
>>> 	loop until ${size}MB written:
>>> 		write 1MB
>>> 		fsync
>>> 		  -> flush data to server
>>> 		  -> flush metadata to server
>>>
>>> i.e. this one liner:
>>>
>>> xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file
>>
>> Unfortunately, that doesn't do what I want either :-/
>> (and I guess you meant '-b 1m', not '-B 1m', right?)
> 
> Yes. But I definitely did mean "-D" so that RWF_DSYNC was used with
> each 1MB write.
> 
>> [ Zheng: please feel free to correct me if I'm saying something really
>>    stupid below. ]
>>
>> So, one of the key things in my loop is the open/close operations.  When
>> a file is closed in cephfs the capabilities (that's ceph jargon for what
>> sort of operations a client is allowed to perform on an inode) will
>> likely be released and that's when the metadata server will get the
>> updated file size.  Before that, the client is allowed to modify the
>> file size if it has acquired the capabilities for doing so.
> 
> So you are saying that O_DSYNC writes on ceph do not force file
> size metadata changes to the metadata server to be made stable?
> 
>> OTOH, a pwrite operation will eventually get the -EDQUOT even with the
>> one-liner above because the client itself will realize it has exceeded a
>> certain threshold set by the MDS and will eventually update the server
>> with the new file size.
> 
> Sure, but if the client crashes without having sent the updated file
> size to the server as part of an extending O_DSYNC write, then how
> is it recovered when the client reconnects to the server and
> accesses the file again?


For DSYNC write, client has already written data to object store. If 
client crashes, MDS will set file to 'recovering' state and probe file 
size by checking object store. Accessing the file is blocked during 
recovery.

Regards
Yan, Zheng




> 
>> However that won't happen at a deterministic
>> file size.  For example, if quota is 10m and we're writing 20m, we may
>> get the error after writing 15m.
>>
>> Does this make sense?
> 
> Only makes sense to me if O_DSYNC is ignored by the ceph client...
> 
>> So, I guess I *could* use your one-liner in the test, but I would need
>> to slightly change the test logic -- I would need to write enough data
>> to the file to make sure I would get the -EDQUOT but I wouldn't be able
>> to actually check the file size as it will not be constant.
>>
>>> Fundamentally, if you find yourself writing a loop around xfs_io to
>>> break up a sequential IO stream into individual chunks, then you are
>>> most likely doing something xfs_io can already do. And if xfs_io
>>> cannot do it, then the right thing to do is to modify xfs_io to be
>>> able to do it and then use xfs_io....
>>
>> Got it!  But I guess it wouldn't make sense to change xfs_io for this
>> specific scenario where I want several open-write-close cycles.
> 
> That's how individual NFS client writes appear to filesystem under
> the NFS server. I've previously considered adding an option in
> xfs_io to mimic this open-write-close loop per buffer so it's easy
> to exercise such behaviours, but never actually required it to
> reproduce the problems I was chasing. So it's definitely something
> that xfs_io /could/ do if necessary.
> 
> Cheers,
> 
> Dave.
>
Luis Henriques April 12, 2019, 11:04 a.m. UTC | #9
"Yan, Zheng" <zyan@redhat.com> writes:

> On 4/12/19 9:15 AM, Dave Chinner wrote:
>> On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
>>> Dave Chinner <david@fromorbit.com> writes:
>>>
>>>> On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
>>>>> Nikolay Borisov <nborisov@suse.com> writes:
>>>>>> On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
>>>>>>> Dave Chinner <david@fromorbit.com> writes:
>>>>>>>> Makes no sense to me. xfs_io does a write() loop internally with
>>>>>>>> this pwrite command of 4kB writes - the default buffer size. If you
>>>>>>>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>>>>>>>> need is this:
>>>>>>>>
>>>>>>>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>>>>>>>>
>>>>>>>
>>>>>>> Thank you for your review, Dave.  I'll make sure the next revision of
>>>>>>> these tests will include all your comments implemented... except for
>>>>>>> this one.
>>>>>>>
>>>>>>> The reason I'm using a loop for writing a file is due to the nature of
>>>>>>> the (very!) loose definition of quotas in CephFS.  Basically, clients
>>>>>>> will likely write some amount of data over the configured limit because
>>>>>>> the servers they are communicating with to write the data (the OSDs)
>>>>>>> have no idea about the concept of quotas (or files even); the filesystem
>>>>>>> view in the cluster is managed at a different level, with the help of
>>>>>>> the MDS and the client itself.
>>>>>>>
>>>>>>> So, the loop in this function is simply to allow the metadata associated
>>>>>>> with the file to be updated while we're writing the file.  If I use a
>>>>>>
>>>>>> But the metadata will be modified while writing the file even with a
>>>>>> single invocation of xfs_io.
>>>>>
>>>>> No, that's not true.  It would be too expensive to keep the metadata
>>>>> server updated while writing to a file.  So, making sure there's
>>>>> actually an open/close to the file (plus the fsync in pwrite) helps
>>>>> making sure the metadata is flushed into the MDS.
>>>>
>>>> /me sighs.
>>>>
>>>> So you want:
>>>>
>>>> 	loop until ${size}MB written:
>>>> 		write 1MB
>>>> 		fsync
>>>> 		  -> flush data to server
>>>> 		  -> flush metadata to server
>>>>
>>>> i.e. this one liner:
>>>>
>>>> xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file
>>>
>>> Unfortunately, that doesn't do what I want either :-/
>>> (and I guess you meant '-b 1m', not '-B 1m', right?)
>>
>> Yes. But I definitely did mean "-D" so that RWF_DSYNC was used with
>> each 1MB write.
>>
>>> [ Zheng: please feel free to correct me if I'm saying something really
>>>    stupid below. ]
>>>
>>> So, one of the key things in my loop is the open/close operations.  When
>>> a file is closed in cephfs the capabilities (that's ceph jargon for what
>>> sort of operations a client is allowed to perform on an inode) will
>>> likely be released and that's when the metadata server will get the
>>> updated file size.  Before that, the client is allowed to modify the
>>> file size if it has acquired the capabilities for doing so.
>>
>> So you are saying that O_DSYNC writes on ceph do not force file
>> size metadata changes to the metadata server to be made stable?
>>
>>> OTOH, a pwrite operation will eventually get the -EDQUOT even with the
>>> one-liner above because the client itself will realize it has exceeded a
>>> certain threshold set by the MDS and will eventually update the server
>>> with the new file size.
>>
>> Sure, but if the client crashes without having sent the updated file
>> size to the server as part of an extending O_DSYNC write, then how
>> is it recovered when the client reconnects to the server and
>> accesses the file again?
>
>
> For DSYNC write, client has already written data to object store. If client
> crashes, MDS will set file to 'recovering' state and probe file size by checking
> object store. Accessing the file is blocked during recovery.

Thank you for chiming in, Zheng.

>
> Regards
> Yan, Zheng
>
>
>
>
>>
>>> However that won't happen at a deterministic
>>> file size.  For example, if quota is 10m and we're writing 20m, we may
>>> get the error after writing 15m.
>>>
>>> Does this make sense?
>>
>> Only makes sense to me if O_DSYNC is ignored by the ceph client...
>>
>>> So, I guess I *could* use your one-liner in the test, but I would need
>>> to slightly change the test logic -- I would need to write enough data
>>> to the file to make sure I would get the -EDQUOT but I wouldn't be able
>>> to actually check the file size as it will not be constant.
>>>
>>>> Fundamentally, if you find yourself writing a loop around xfs_io to
>>>> break up a sequential IO stream into individual chunks, then you are
>>>> most likely doing something xfs_io can already do. And if xfs_io
>>>> cannot do it, then the right thing to do is to modify xfs_io to be
>>>> able to do it and then use xfs_io....
>>>
>>> Got it!  But I guess it wouldn't make sense to change xfs_io for this
>>> specific scenario where I want several open-write-close cycles.
>>
>> That's how individual NFS client writes appear to filesystem under
>> the NFS server. I've previously considered adding an option in
>> xfs_io to mimic this open-write-close loop per buffer so it's easy
>> to exercise such behaviours, but never actually required it to
>> reproduce the problems I was chasing. So it's definitely something
>> that xfs_io /could/ do if necessary.

Ok, since there seems to be other use-cases for this, I agree it may be
worth adding that option then.  I'll see if I can come up with a patch
for that.

Cheers,
Dave Chinner April 14, 2019, 10:15 p.m. UTC | #10
On Fri, Apr 12, 2019 at 11:37:55AM +0800, Yan, Zheng wrote:
> On 4/12/19 9:15 AM, Dave Chinner wrote:
> > On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
> > > Dave Chinner <david@fromorbit.com> writes:
> > > 
> > > > On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
> > > > > Nikolay Borisov <nborisov@suse.com> writes:
> > > > > > On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
> > > > > > > Dave Chinner <david@fromorbit.com> writes:
> > > > > > > > Makes no sense to me. xfs_io does a write() loop internally with
> > > > > > > > this pwrite command of 4kB writes - the default buffer size. If you
> > > > > > > > want xfs_io to loop doing 1MB sized pwrite() calls, then all you
> > > > > > > > need is this:
> > > > > > > > 
> > > > > > > > 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
> > > > > > > > 
> > > > > > > 
> > > > > > > Thank you for your review, Dave.  I'll make sure the next revision of
> > > > > > > these tests will include all your comments implemented... except for
> > > > > > > this one.
> > > > > > > 
> > > > > > > The reason I'm using a loop for writing a file is due to the nature of
> > > > > > > the (very!) loose definition of quotas in CephFS.  Basically, clients
> > > > > > > will likely write some amount of data over the configured limit because
> > > > > > > the servers they are communicating with to write the data (the OSDs)
> > > > > > > have no idea about the concept of quotas (or files even); the filesystem
> > > > > > > view in the cluster is managed at a different level, with the help of
> > > > > > > the MDS and the client itself.
> > > > > > > 
> > > > > > > So, the loop in this function is simply to allow the metadata associated
> > > > > > > with the file to be updated while we're writing the file.  If I use a
> > > > > > 
> > > > > > But the metadata will be modified while writing the file even with a
> > > > > > single invocation of xfs_io.
> > > > > 
> > > > > No, that's not true.  It would be too expensive to keep the metadata
> > > > > server updated while writing to a file.  So, making sure there's
> > > > > actually an open/close to the file (plus the fsync in pwrite) helps
> > > > > making sure the metadata is flushed into the MDS.
> > > > 
> > > > /me sighs.
> > > > 
> > > > So you want:
> > > > 
> > > > 	loop until ${size}MB written:
> > > > 		write 1MB
> > > > 		fsync
> > > > 		  -> flush data to server
> > > > 		  -> flush metadata to server
> > > > 
> > > > i.e. this one liner:
> > > > 
> > > > xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file
> > > 
> > > Unfortunately, that doesn't do what I want either :-/
> > > (and I guess you meant '-b 1m', not '-B 1m', right?)
> > 
> > Yes. But I definitely did mean "-D" so that RWF_DSYNC was used with
> > each 1MB write.
> > 
> > > [ Zheng: please feel free to correct me if I'm saying something really
> > >    stupid below. ]
> > > 
> > > So, one of the key things in my loop is the open/close operations.  When
> > > a file is closed in cephfs the capabilities (that's ceph jargon for what
> > > sort of operations a client is allowed to perform on an inode) will
> > > likely be released and that's when the metadata server will get the
> > > updated file size.  Before that, the client is allowed to modify the
> > > file size if it has acquired the capabilities for doing so.
> > 
> > So you are saying that O_DSYNC writes on ceph do not force file
> > size metadata changes to the metadata server to be made stable?
> > 
> > > OTOH, a pwrite operation will eventually get the -EDQUOT even with the
> > > one-liner above because the client itself will realize it has exceeded a
> > > certain threshold set by the MDS and will eventually update the server
> > > with the new file size.
> > 
> > Sure, but if the client crashes without having sent the updated file
> > size to the server as part of an extending O_DSYNC write, then how
> > is it recovered when the client reconnects to the server and
> > accesses the file again?
> 
> 
> For DSYNC write, client has already written data to object store. If client
> crashes, MDS will set file to 'recovering' state and probe file size by
> checking object store. Accessing the file is blocked during recovery.

IOWs, ceph allows data integrity writes to the object store even
though those writes breach quota limits on that object store? i.e.
ceph quota essentially ignores O_SYNC/O_DSYNC metadata requirements?

FWIW, quotas normally have soft and hard limits - soft limits can be
breached with a warning and a time limit to return under the soft
limit, but the quota hard limit should /never/ be breached by users.

I guess that's the way of the world these days - fast and loose
because everyone demands fast before correct....

Cheers,

Dave.
Yan, Zheng April 15, 2019, 2:16 a.m. UTC | #11
On 4/15/19 6:15 AM, Dave Chinner wrote:
> On Fri, Apr 12, 2019 at 11:37:55AM +0800, Yan, Zheng wrote:
>> On 4/12/19 9:15 AM, Dave Chinner wrote:
>>> On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
>>>> Dave Chinner <david@fromorbit.com> writes:
>>>>
>>>>> On Wed, Apr 03, 2019 at 02:19:11PM +0100, Luis Henriques wrote:
>>>>>> Nikolay Borisov <nborisov@suse.com> writes:
>>>>>>> On 3.04.19 г. 12:45 ч., Luis Henriques wrote:
>>>>>>>> Dave Chinner <david@fromorbit.com> writes:
>>>>>>>>> Makes no sense to me. xfs_io does a write() loop internally with
>>>>>>>>> this pwrite command of 4kB writes - the default buffer size. If you
>>>>>>>>> want xfs_io to loop doing 1MB sized pwrite() calls, then all you
>>>>>>>>> need is this:
>>>>>>>>>
>>>>>>>>> 	$XFS_IO_PROG -f -c "pwrite -w -B 1m 0 ${size}m" $file | _filter_xfs_io
>>>>>>>>>
>>>>>>>>
>>>>>>>> Thank you for your review, Dave.  I'll make sure the next revision of
>>>>>>>> these tests will include all your comments implemented... except for
>>>>>>>> this one.
>>>>>>>>
>>>>>>>> The reason I'm using a loop for writing a file is due to the nature of
>>>>>>>> the (very!) loose definition of quotas in CephFS.  Basically, clients
>>>>>>>> will likely write some amount of data over the configured limit because
>>>>>>>> the servers they are communicating with to write the data (the OSDs)
>>>>>>>> have no idea about the concept of quotas (or files even); the filesystem
>>>>>>>> view in the cluster is managed at a different level, with the help of
>>>>>>>> the MDS and the client itself.
>>>>>>>>
>>>>>>>> So, the loop in this function is simply to allow the metadata associated
>>>>>>>> with the file to be updated while we're writing the file.  If I use a
>>>>>>>
>>>>>>> But the metadata will be modified while writing the file even with a
>>>>>>> single invocation of xfs_io.
>>>>>>
>>>>>> No, that's not true.  It would be too expensive to keep the metadata
>>>>>> server updated while writing to a file.  So, making sure there's
>>>>>> actually an open/close to the file (plus the fsync in pwrite) helps
>>>>>> making sure the metadata is flushed into the MDS.
>>>>>
>>>>> /me sighs.
>>>>>
>>>>> So you want:
>>>>>
>>>>> 	loop until ${size}MB written:
>>>>> 		write 1MB
>>>>> 		fsync
>>>>> 		  -> flush data to server
>>>>> 		  -> flush metadata to server
>>>>>
>>>>> i.e. this one liner:
>>>>>
>>>>> xfs_io -f -c "pwrite -D -B 1m 0 ${size}m" /path/to/file
>>>>
>>>> Unfortunately, that doesn't do what I want either :-/
>>>> (and I guess you meant '-b 1m', not '-B 1m', right?)
>>>
>>> Yes. But I definitely did mean "-D" so that RWF_DSYNC was used with
>>> each 1MB write.
>>>
>>>> [ Zheng: please feel free to correct me if I'm saying something really
>>>>     stupid below. ]
>>>>
>>>> So, one of the key things in my loop is the open/close operations.  When
>>>> a file is closed in cephfs the capabilities (that's ceph jargon for what
>>>> sort of operations a client is allowed to perform on an inode) will
>>>> likely be released and that's when the metadata server will get the
>>>> updated file size.  Before that, the client is allowed to modify the
>>>> file size if it has acquired the capabilities for doing so.
>>>
>>> So you are saying that O_DSYNC writes on ceph do not force file
>>> size metadata changes to the metadata server to be made stable?
>>>
>>>> OTOH, a pwrite operation will eventually get the -EDQUOT even with the
>>>> one-liner above because the client itself will realize it has exceeded a
>>>> certain threshold set by the MDS and will eventually update the server
>>>> with the new file size.
>>>
>>> Sure, but if the client crashes without having sent the updated file
>>> size to the server as part of an extending O_DSYNC write, then how
>>> is it recovered when the client reconnects to the server and
>>> accesses the file again?
>>
>>
>> For DSYNC write, client has already written data to object store. If client
>> crashes, MDS will set file to 'recovering' state and probe file size by
>> checking object store. Accessing the file is blocked during recovery.
> 
> IOWs, ceph allows data integrity writes to the object store even
> though those writes breach quota limits on that object store? i.e.
> ceph quota essentially ignores O_SYNC/O_DSYNC metadata requirements?
> 

Current cephfs quota implementation checks quota (compare i_size and 
quota setting) at very beginning of ceph_write_iter(). Nothing do with 
O_SYNC and O_DSYNC.

Regards
Yan, Zheng


> FWIW, quotas normally have soft and hard limits - soft limits can be
> breached with a warning and a time limit to return under the soft
> limit, but the quota hard limit should /never/ be breached by users.
> 
> I guess that's the way of the world these days - fast and loose
> because everyone demands fast before correct....
> 
> Cheers,
> 
> Dave.
>
Dave Chinner April 16, 2019, 8:13 a.m. UTC | #12
On Mon, Apr 15, 2019 at 10:16:18AM +0800, Yan, Zheng wrote:
> On 4/15/19 6:15 AM, Dave Chinner wrote:
> > On Fri, Apr 12, 2019 at 11:37:55AM +0800, Yan, Zheng wrote:
> > > On 4/12/19 9:15 AM, Dave Chinner wrote:
> > > > On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
> > > > > Dave Chinner <david@fromorbit.com> writes:
> > > For DSYNC write, client has already written data to object store. If client
> > > crashes, MDS will set file to 'recovering' state and probe file size by
> > > checking object store. Accessing the file is blocked during recovery.
> > 
> > IOWs, ceph allows data integrity writes to the object store even
> > though those writes breach quota limits on that object store? i.e.
> > ceph quota essentially ignores O_SYNC/O_DSYNC metadata requirements?
> > 
> 
> Current cephfs quota implementation checks quota (compare i_size and quota
> setting) at very beginning of ceph_write_iter(). Nothing do with O_SYNC and
> O_DSYNC.

Hold on, if the quota is checked on the client at the start of every
write, then why is it not enforced /exactly/? Where does this "we
didn't notice we'd run out of quota" overrun come from then?

i.e. the test changes are implying that quota is not accurately
checked and enforced on every write, and that there is something
less that exact about quotas on the ceph client. Yet you say they
are checked on every write.

Where does the need to open/close files and force flushing client
state to the MDS come from if quota is actually being checked
on every write as you say it is?

i.e. I'm trying to work out if this change is just working around
bugs in ceph quota accounting and I'm being told conflicting things
about how the ceph client accounts and enforces quota limits. Can
> you please clearly explain how the quota enforcement works and why
close/open between writes is necessary for accurate quota
enforcement so that we have some clue as to why these rubbery limit
hacks are necessary?

If we don't understand why a test does something and it's not
adequately documented, we can't really be expected to maintain
it in working order....

Cheers,

Dave.
Luis Henriques April 16, 2019, 10:48 a.m. UTC | #13
Dave Chinner <david@fromorbit.com> writes:

> On Mon, Apr 15, 2019 at 10:16:18AM +0800, Yan, Zheng wrote:
>> On 4/15/19 6:15 AM, Dave Chinner wrote:
>> > On Fri, Apr 12, 2019 at 11:37:55AM +0800, Yan, Zheng wrote:
>> > > On 4/12/19 9:15 AM, Dave Chinner wrote:
>> > > > On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
>> > > > > Dave Chinner <david@fromorbit.com> writes:
>> > > For DSYNC write, client has already written data to object store. If client
>> > > crashes, MDS will set file to 'recovering' state and probe file size by
>> > > checking object store. Accessing the file is blocked during recovery.
>> > 
>> > IOWs, ceph allows data integrity writes to the object store even
>> > though those writes breach quota limits on that object store? i.e.
>> > ceph quota essentially ignores O_SYNC/O_DSYNC metadata requirements?
>> > 
>> 
>> Current cephfs quota implementation checks quota (compare i_size and quota
>> setting) at very beginning of ceph_write_iter(). Nothing do with O_SYNC and
>> O_DSYNC.
>
> Hold on, if the quota is checked on the client at the start of every
> write, then why is it not enforced /exactly/? Where does this "we
> didn't notice we'd run out of quota" overrun come from then?

Ok, there's an extra piece of information that I don't think it was
mentioned in the thread yet, which is the (recursive) directory
statistics.  These stats are maintained by the MDS and it's against
these stats that the client actually checks if there's an overrun.  The
checks can be done against the amount of bytes ('max_bytes' quota)
and/or number of files ('max_files' quota), depending on which quotas
are enabled.

_Maybe_ there's space for some optimization on the client-side by
locally updating the stats received from the MDS with local writes,
files creation/deletion, etc.  But I'm not sure it's worth the effort,
because:

- for each operation (write, truncate, create, ...) we would also have
  the extra overhead of walking up the directory tree (or at least the
  quota realms tree) to update the stats for each directory;  and

- we may have more clients writing to the same directories, and thus
  messing up with the dir quotas anyway.  I.e. the quotas overrun
  problem would still be there, and we would still require to open/close
  files in the test.

Hopefully this helps clarifying why the open/close loop is a needed hack
with the current quotas design.

Cheers,
Gregory Farnum April 16, 2019, 6:38 p.m. UTC | #14
On Tue, Apr 16, 2019 at 3:48 AM Luis Henriques <lhenriques@suse.com> wrote:
>
> Dave Chinner <david@fromorbit.com> writes:
>
> > On Mon, Apr 15, 2019 at 10:16:18AM +0800, Yan, Zheng wrote:
> >> On 4/15/19 6:15 AM, Dave Chinner wrote:
> >> > On Fri, Apr 12, 2019 at 11:37:55AM +0800, Yan, Zheng wrote:
> >> > > On 4/12/19 9:15 AM, Dave Chinner wrote:
> >> > > > On Thu, Apr 04, 2019 at 11:18:22AM +0100, Luis Henriques wrote:
> >> > > > > Dave Chinner <david@fromorbit.com> writes:
> >> > > For DSYNC write, client has already written data to object store. If client
> >> > > crashes, MDS will set file to 'recovering' state and probe file size by
> >> > > checking object store. Accessing the file is blocked during recovery.
> >> >
> >> > IOWs, ceph allows data integrity writes to the object store even
> >> > though those writes breach quota limits on that object store? i.e.
> >> > ceph quota essentially ignores O_SYNC/O_DSYNC metadata requirements?
> >> >
> >>
> >> Current cephfs quota implementation checks quota (compare i_size and quota
> >> setting) at very beginning of ceph_write_iter(). Nothing do with O_SYNC and
> >> O_DSYNC.
> >
> > Hold on, if the quota is checked on the client at the start of every
> > write, then why is it not enforced /exactly/? Where does this "we
> > didn't notice we'd run out of quota" overrun come from then?
>
> Ok, there's an extra piece of information that I don't think it was
> mentioned in the thread yet, which is the (recursive) directory
> statistics.  These stats are maintained by the MDS and it's against
> these stats that the client actually checks if there's an overrun.  The
> checks can be done against the amount of bytes ('max_bytes' quota)
> and/or number of files ('max_files' quota), depending on which quotas
> are enabled.
>
> _Maybe_ there's space for some optimization on the client-side by
> locally updating the stats received from the MDS with local writes,
> files creation/deletion, etc.  But I'm not sure it's worth the effort,
> because:
>
> - for each operation (write, truncate, create, ...) we would also have
>   the extra overhead of walking up the directory tree (or at least the
>   quota realms tree) to update the stats for each directory;  and
>
> - we may have more clients writing to the same directories, and thus
>   messing up with the dir quotas anyway.  I.e. the quotas overrun
>   problem would still be there, and we would still require to open/close
>   files in the test.
>
> Hopefully this helps clarifying why the open/close loop is a needed hack
> with the current quotas design.

More broadly, and without reference to any specific design, Ceph is a
distributed storage system that allows multiple active clients to
independently do activity against multiple servers, and which allows
directory quotas.

Strict enforcement of quotas requires a single centralized authority
to know the data within each directory and decide on every individual
write operation if there is enough space to allow it; this is easy on
a local filesystem but fundamentally cripples a multi-computer one
since it implies serializing every IO on a single server. (Yes, there
are LOTS of tricks you can pull so that "normally" it's not a problem
and only impacts IO when you are actually approaching the quota limit;
yes, there are ways to try and proactively reserve quota for a given
writer; they all break down into the "contact a single server and
serialize on it" when you actually approach the quota and nobody has
yet expressed any interest in Ceph's quotas being that precise
anyway.)

>
> Cheers,
> --
> Luis
>
> >
> > i.e. the test changes are implying that quota is not accurately
> > checked and enforced on every write, and that there is something
> > less that exact about quotas on the ceph client. Yet you say they
> > are checked on every write.
> >
> > Where does the need to open/close files and force flushing client
> > state to the MDS come from if quota is actually being checked
> > on every write as you say it is?
> >
> > i.e. I'm trying to work out if this change is just working around
> > bugs in ceph quota accounting and I'm being told conflicting things
> > about how the ceph client accounts and enforces quota limits. Can
> > you please clearly explain how the quota enforcement works and why
> > close/open between writes is necessary for accurate quota
> > enforcement so that we have some clue as to why these rubbery limit
> > hacks are necessary?
> >
> > If we don't understand why a test does something and it's not
> > adequately documented, we can't really be expected to maintain
> > it in working order....
> >
> > Cheers,
> >
> > Dave.
diff mbox series

Patch

diff --git a/tests/ceph/002 b/tests/ceph/002
new file mode 100755
index 000000000000..313865dc639e
--- /dev/null
+++ b/tests/ceph/002
@@ -0,0 +1,147 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2019 SUSE LLC. All Rights Reserved.
+#
+# FS QA Test No. 002
+#
+# This tests basic ceph.quota.max_bytes quota features.
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+testdir=$TEST_DIR/quota-test
+
+tmp=/tmp/$$
+status=1    # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+	cd /
+	rm -rf $tmp.*
+	rm -rf $testdir
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/attr
+
+# real QA test starts here
+_supported_os Linux
+_supported_fs ceph
+
+_require_attrs
+
+set_quota()
+{
+	val=$1
+	dir=$2
+	$SETFATTR_PROG -n ceph.quota.max_bytes -v $val $dir >/dev/null 2>&1
+}
+
+get_quota()
+{
+	dir=$1
+	$GETFATTR_PROG --only-values -n ceph.quota.max_bytes $dir 2> /dev/null
+}
+
+# function to write a file.  We use a loop because quota enforcement in CephFS
+# is a "best-effort" implementation, i.e. a write may actually be allowed even
+# if the quota is being exceeded.  Using a loop reduces the chances of this happening.
+#
+# NOTE: 'size' parameter is in M
+write_file()
+{
+	file=$1
+	size=$2 # size in M
+	for (( i = 1; i < $size; i++ )); do
+		$XFS_IO_PROG -f -c "pwrite -W $((i * 1048576)) 1048576" \
+			     $file >/dev/null 2>&1
+	done
+}
+
+# Check a file size
+#
+# NOTE: 'expected' (size) parameter is in M
+check_file_size()
+{
+	file=$1
+	expected=$(($2 * 1048576))
+	size=$(stat -c %s $file)
+	if [ "$size" -ne "$expected" ]; then
+		_fail "Expecting file with $expected got $size"
+	fi
+}
+
+mkdir $testdir
+
+# test setting quota
+set_quota 1000000 $testdir
+ret=$(get_quota $testdir)
+if [ "$ret" -ne 1000000 ]; then
+	_fail "expected max_bytes quota to be 1000000, got '$ret' instead"
+fi
+# set quota to largest acceptable value (0x7FFFFFFFFFFFFFFF)
+set_quota 9223372036854775807 $testdir
+ret=$(get_quota $testdir)
+if [ "$ret" -ne 9223372036854775807 ]; then
+	_fail "expected max_bytes quota to be 9223372036854775807, got '$ret' instead"
+fi
+# test resetting quota
+set_quota 0 $testdir
+ret=$(get_quota $testdir)
+if [ -n "$ret" ]; then
+	_fail "expected 0 max_bytes quota, got '$ret' instead"
+fi
+# set quota to invalid values (0x8000000000000000 and -1)
+set_quota 9223372036854775808 $testdir
+ret=$(get_quota $testdir)
+if [ -n "$ret" ]; then
+	_fail "expected max_bytes quota to be 0, got '$ret' instead"
+fi
+set_quota -1 $testdir
+ret=$(get_quota $testdir)
+if [ -n "$ret" ]; then
+	_fail "expected max_bytes quota to be 0, got '$ret' instead"
+fi
+
+bigfile="$testdir/bigfile"
+
+# set quota to 10M
+set_quota $((10 * 1048576)) $testdir
+
+# write 9M file
+write_file $bigfile 9
+check_file_size $bigfile 9
+rm $bigfile
+
+# try to write 11M file
+write_file $bigfile 11 # 11M
+check_file_size $bigfile 10
+rm $bigfile
+
+# write 5 x 2M files
+for (( j = 1; j < 6; j++ )); do
+	smallfile="$testdir/smallfile_$j"
+	write_file $smallfile 2 # 2M
+	check_file_size $smallfile 2
+done
+
+# try write another 2M file
+smallfile="$testdir/smallfile_fail"
+write_file $smallfile 2
+check_file_size $smallfile 0
+
+# reset quota
+set_quota 0 $testdir
+
+# write 2M file
+write_file $smallfile 2
+check_file_size $smallfile 2
+
+# success, all done
+status=0
+exit
diff --git a/tests/ceph/002.out b/tests/ceph/002.out
new file mode 100644
index 000000000000..c57ca23e5cbe
--- /dev/null
+++ b/tests/ceph/002.out
@@ -0,0 +1 @@ 
+QA output created by 002
diff --git a/tests/ceph/group b/tests/ceph/group
index e389bc6ec7ee..02da95169c67 100644
--- a/tests/ceph/group
+++ b/tests/ceph/group
@@ -1 +1,2 @@ 
 001 auto quick quota
+002 auto quick quota