Message ID | eae6d3a1c16e440f18fd60d69b061d15ffbfe8e7.1619818517.git.matheus.bernardino@usp.br (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Parallel Checkout (part 3) | expand |
On Fri, Apr 30, 2021 at 06:40:32PM -0300, Matheus Tavares wrote: > Add tests to confirm that path collisions are properly detected by > checkout workers, both to avoid race conditions and to report colliding > entries on clone. > > Co-authored-by: Jeff Hostetler <jeffhost@microsoft.com> > Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br> > --- > parallel-checkout.c | 4 + > t/lib-parallel-checkout.sh | 4 +- > t/t2081-parallel-checkout-collisions.sh | 162 ++++++++++++++++++++++++ > 3 files changed, 168 insertions(+), 2 deletions(-) > create mode 100755 t/t2081-parallel-checkout-collisions.sh > > diff --git a/parallel-checkout.c b/parallel-checkout.c > index 09e8b10a35..6fb3f1e6c9 100644 > --- a/parallel-checkout.c > +++ b/parallel-checkout.c > @@ -8,6 +8,7 @@ > #include "sigchain.h" > #include "streaming.h" > #include "thread-utils.h" > +#include "trace2.h" > > struct pc_worker { > struct child_process cp; > @@ -326,6 +327,7 @@ void write_pc_item(struct parallel_checkout_item *pc_item, > if (dir_sep && !has_dirs_only_path(path.buf, dir_sep - path.buf, > state->base_dir_len)) { > pc_item->status = PC_ITEM_COLLIDED; > + trace2_data_string("pcheckout", NULL, "collision/dirname", path.buf); > goto out; > } > > @@ -341,6 +343,8 @@ void write_pc_item(struct parallel_checkout_item *pc_item, > * call should have already caught these cases. > */ > pc_item->status = PC_ITEM_COLLIDED; > + trace2_data_string("pcheckout", NULL, > + "collision/basename", path.buf); > } else { > error_errno("failed to open file '%s'", path.buf); > pc_item->status = PC_ITEM_FAILED; > diff --git a/t/lib-parallel-checkout.sh b/t/lib-parallel-checkout.sh > index f60b22ef34..d6740425b1 100644 > --- a/t/lib-parallel-checkout.sh > +++ b/t/lib-parallel-checkout.sh > @@ -22,12 +22,12 @@ test_checkout_workers () { > > local trace_file=trace-test-checkout-workers && > rm -f "$trace_file" && > - GIT_TRACE2="$(pwd)/$trace_file" "$@" && > + GIT_TRACE2="$(pwd)/$trace_file" "$@" 2>&8 && > > local workers=$(grep "child_start\[..*\] git checkout--worker" "$trace_file" | wc -l) && > test $workers -eq $expected_workers && > rm "$trace_file" > -} > +} 8>&2 2>&4 > > # Verify that both the working tree and the index were created correctly > verify_checkout () { > diff --git a/t/t2081-parallel-checkout-collisions.sh b/t/t2081-parallel-checkout-collisions.sh > new file mode 100755 > index 0000000000..f6fcfc0c1e > --- /dev/null > +++ b/t/t2081-parallel-checkout-collisions.sh > @@ -0,0 +1,162 @@ > +#!/bin/sh > + > +test_description="path collisions during parallel checkout > + > +Parallel checkout must detect path collisions to: > + > +1) Avoid racily writing to different paths that represent the same file on disk. > +2) Report the colliding entries on clone. > + > +The tests in this file exercise parallel checkout's collision detection code in > +both these mechanics. > +" > + > +. ./test-lib.sh > +. "$TEST_DIRECTORY/lib-parallel-checkout.sh" Why this $TEST_DIRECTORY ? Aren't all files under the t/ directory, where test-lib.sh is as well ? (The $TEST_DIRECTORY macro is used at different places, so I may have missed something) > + > +TEST_ROOT="$PWD" > + > +test_expect_success CASE_INSENSITIVE_FS 'setup' ' > + empty_oid=$(git hash-object -w --stdin </dev/null) && > + cat >objs <<-EOF && > + 100644 $empty_oid FILE_X > + 100644 $empty_oid FILE_x > + 100644 $empty_oid file_X > + 100644 $empty_oid file_x > + EOF > + git update-index --index-info <objs && > + git commit -m "colliding files" && > + git tag basename_collision && > + > + write_script "$TEST_ROOT"/logger_script <<-\EOF > + echo "$@" >>filter.log > + EOF > +' > + > +test_workers_in_event_trace () > +{ > + test $1 -eq $(grep ".event.:.child_start..*checkout--worker" $2 | wc -l) > +} > + > +test_expect_success CASE_INSENSITIVE_FS 'worker detects basename collision' ' > + GIT_TRACE2_EVENT="$(pwd)/trace" git \ > + -c checkout.workers=2 -c checkout.thresholdForParallelism=0 \ > + checkout . && > + > + test_workers_in_event_trace 2 trace && > + collisions=$(grep -i "category.:.pcheckout.,.key.:.collision/basename.,.value.:.file_x.}" trace | wc -l) && > + test $collisions -eq 3 > +' > + > +test_expect_success CASE_INSENSITIVE_FS 'worker detects dirname collision' ' > + test_config filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" && > + empty_oid=$(git hash-object -w --stdin </dev/null) && > + > + # By setting a filter command to "a", we make it ineligible for parallel > + # checkout, and thus it is checked out *first*. This way we can ensure > + # that "A/B" and "A/C" will both collide with the regular file "a". > + # > + attr_oid=$(echo "a filter=logger" | git hash-object -w --stdin) && > + > + cat >objs <<-EOF && > + 100644 $empty_oid A/B > + 100644 $empty_oid A/C > + 100644 $empty_oid a > + 100644 $attr_oid .gitattributes > + EOF > + git rm -rf . && > + git update-index --index-info <objs && > + > + rm -f trace filter.log && > + GIT_TRACE2_EVENT="$(pwd)/trace" git \ > + -c checkout.workers=2 -c checkout.thresholdForParallelism=0 \ > + checkout . && > + > + # Check that "a" (and only "a") was filtered > + echo a >expected.log && > + test_cmp filter.log expected.log && > + > + # Check that it used the right number of workers and detected the collisions > + test_workers_in_event_trace 2 trace && > + grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/B.}" trace && > + grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/C.}" trace > +' > + > +test_expect_success SYMLINKS,CASE_INSENSITIVE_FS 'do not follow symlinks colliding with leading dir' ' > + empty_oid=$(git hash-object -w --stdin </dev/null) && > + symlink_oid=$(echo "./e" | git hash-object -w --stdin) && > + mkdir e && > + > + cat >objs <<-EOF && > + 120000 $symlink_oid D > + 100644 $empty_oid d/x > + 100644 $empty_oid e/y > + EOF > + git rm -rf . && > + git update-index --index-info <objs && > + > + set_checkout_config 2 0 && > + test_checkout_workers 2 git checkout . && > + test_path_is_dir e && > + test_path_is_missing e/x > +' > + > +# The two following tests check that parallel checkout correctly reports > +# colliding entries on clone. The sequential code detects a collision by > +# calling lstat() before trying to open(O_CREAT) a file. (Note that this only > +# works for clone.) Then, to find the pair of a colliding item k, it searches > +# cache_entry[0, k-1]. This is not sufficient in parallel checkout because: > +# > +# - A colliding file may be created between the lstat() and open() calls; > +# - A colliding entry might appear in the second half of the cache_entry array. > +# > +test_expect_success CASE_INSENSITIVE_FS 'collision report on clone (w/ racy file creation)' ' > + git reset --hard basename_collision && > + set_checkout_config 2 0 && > + test_checkout_workers 2 git clone . clone-repo 2>stderr && > + > + grep FILE_X stderr && > + grep FILE_x stderr && > + grep file_X stderr && > + grep file_x stderr && > + grep "the following paths have collided" stderr > +' > + > +# This test ensures that the collision report code is correctly looking for > +# colliding peers in the second half of the cache_entry array. This is done by > +# defining a smudge command for the *last* array entry, which makes it > +# non-eligible for parallel-checkout. Thus, it is checked out *first*, before > +# spawning the workers. > +# > +# Note: this test doesn't work on Windows because, on this system, the > +# collision report code uses strcmp() to find the colliding pairs when > +# core.ignoreCase is false. And we need this setting for this test so that only > +# 'file_x' matches the pattern of the filter attribute. But the test works on > +# OSX, where the colliding pairs are found using inode. > +# > +test_expect_success CASE_INSENSITIVE_FS,!MINGW,!CYGWIN \ > + 'collision report on clone (w/ colliding peer after the detected entry)' ' > + > + test_config_global filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" && > + git reset --hard basename_collision && > + echo "file_x filter=logger" >.gitattributes && > + git add .gitattributes && > + git commit -m "filter for file_x" && > + > + rm -rf clone-repo && > + set_checkout_config 2 0 && > + test_checkout_workers 2 \ > + git -c core.ignoreCase=false clone . clone-repo 2>stderr && > + > + grep FILE_X stderr && > + grep FILE_x stderr && > + grep file_X stderr && > + grep file_x stderr && > + grep "the following paths have collided" stderr && > + > + # Check that only "file_x" was filtered > + echo file_x >expected.log && > + test_cmp clone-repo/filter.log expected.log > +' > + > +test_done > -- > 2.30.1 >
On Sun, May 2, 2021 at 4:59 AM Torsten Bögershausen <tboegi@web.de> wrote: > > On Fri, Apr 30, 2021 at 06:40:32PM -0300, Matheus Tavares wrote: > > > > diff --git a/t/t2081-parallel-checkout-collisions.sh b/t/t2081-parallel-checkout-collisions.sh > > new file mode 100755 > > index 0000000000..f6fcfc0c1e > > --- /dev/null > > +++ b/t/t2081-parallel-checkout-collisions.sh > > @@ -0,0 +1,162 @@ > > +#!/bin/sh > > + > > +test_description="path collisions during parallel checkout > > + > > +Parallel checkout must detect path collisions to: > > + > > +1) Avoid racily writing to different paths that represent the same file on disk. > > +2) Report the colliding entries on clone. > > + > > +The tests in this file exercise parallel checkout's collision detection code in > > +both these mechanics. > > +" > > + > > +. ./test-lib.sh > > +. "$TEST_DIRECTORY/lib-parallel-checkout.sh" > > Why this $TEST_DIRECTORY ? > Aren't all files under the t/ directory, where test-lib.sh is as well ? Good point. From what I understand, the reason why we need the macro here is that, when running the test with --root=<path>, the trash dir might actually be at a path outside `t/`. So the test can't rely on `./` to read files from `t/`. We don't need the macro for `test-lib.sh` because the working dir is only changed after it is sourced (by `test-lib.sh` itself).
diff --git a/parallel-checkout.c b/parallel-checkout.c index 09e8b10a35..6fb3f1e6c9 100644 --- a/parallel-checkout.c +++ b/parallel-checkout.c @@ -8,6 +8,7 @@ #include "sigchain.h" #include "streaming.h" #include "thread-utils.h" +#include "trace2.h" struct pc_worker { struct child_process cp; @@ -326,6 +327,7 @@ void write_pc_item(struct parallel_checkout_item *pc_item, if (dir_sep && !has_dirs_only_path(path.buf, dir_sep - path.buf, state->base_dir_len)) { pc_item->status = PC_ITEM_COLLIDED; + trace2_data_string("pcheckout", NULL, "collision/dirname", path.buf); goto out; } @@ -341,6 +343,8 @@ void write_pc_item(struct parallel_checkout_item *pc_item, * call should have already caught these cases. */ pc_item->status = PC_ITEM_COLLIDED; + trace2_data_string("pcheckout", NULL, + "collision/basename", path.buf); } else { error_errno("failed to open file '%s'", path.buf); pc_item->status = PC_ITEM_FAILED; diff --git a/t/lib-parallel-checkout.sh b/t/lib-parallel-checkout.sh index f60b22ef34..d6740425b1 100644 --- a/t/lib-parallel-checkout.sh +++ b/t/lib-parallel-checkout.sh @@ -22,12 +22,12 @@ test_checkout_workers () { local trace_file=trace-test-checkout-workers && rm -f "$trace_file" && - GIT_TRACE2="$(pwd)/$trace_file" "$@" && + GIT_TRACE2="$(pwd)/$trace_file" "$@" 2>&8 && local workers=$(grep "child_start\[..*\] git checkout--worker" "$trace_file" | wc -l) && test $workers -eq $expected_workers && rm "$trace_file" -} +} 8>&2 2>&4 # Verify that both the working tree and the index were created correctly verify_checkout () { diff --git a/t/t2081-parallel-checkout-collisions.sh b/t/t2081-parallel-checkout-collisions.sh new file mode 100755 index 0000000000..f6fcfc0c1e --- /dev/null +++ b/t/t2081-parallel-checkout-collisions.sh @@ -0,0 +1,162 @@ +#!/bin/sh + +test_description="path collisions during parallel checkout + +Parallel checkout must detect path collisions to: + +1) Avoid racily writing to different paths that represent the same file on disk. +2) Report the colliding entries on clone. + +The tests in this file exercise parallel checkout's collision detection code in +both these mechanics. +" + +. ./test-lib.sh +. "$TEST_DIRECTORY/lib-parallel-checkout.sh" + +TEST_ROOT="$PWD" + +test_expect_success CASE_INSENSITIVE_FS 'setup' ' + empty_oid=$(git hash-object -w --stdin </dev/null) && + cat >objs <<-EOF && + 100644 $empty_oid FILE_X + 100644 $empty_oid FILE_x + 100644 $empty_oid file_X + 100644 $empty_oid file_x + EOF + git update-index --index-info <objs && + git commit -m "colliding files" && + git tag basename_collision && + + write_script "$TEST_ROOT"/logger_script <<-\EOF + echo "$@" >>filter.log + EOF +' + +test_workers_in_event_trace () +{ + test $1 -eq $(grep ".event.:.child_start..*checkout--worker" $2 | wc -l) +} + +test_expect_success CASE_INSENSITIVE_FS 'worker detects basename collision' ' + GIT_TRACE2_EVENT="$(pwd)/trace" git \ + -c checkout.workers=2 -c checkout.thresholdForParallelism=0 \ + checkout . && + + test_workers_in_event_trace 2 trace && + collisions=$(grep -i "category.:.pcheckout.,.key.:.collision/basename.,.value.:.file_x.}" trace | wc -l) && + test $collisions -eq 3 +' + +test_expect_success CASE_INSENSITIVE_FS 'worker detects dirname collision' ' + test_config filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" && + empty_oid=$(git hash-object -w --stdin </dev/null) && + + # By setting a filter command to "a", we make it ineligible for parallel + # checkout, and thus it is checked out *first*. This way we can ensure + # that "A/B" and "A/C" will both collide with the regular file "a". + # + attr_oid=$(echo "a filter=logger" | git hash-object -w --stdin) && + + cat >objs <<-EOF && + 100644 $empty_oid A/B + 100644 $empty_oid A/C + 100644 $empty_oid a + 100644 $attr_oid .gitattributes + EOF + git rm -rf . && + git update-index --index-info <objs && + + rm -f trace filter.log && + GIT_TRACE2_EVENT="$(pwd)/trace" git \ + -c checkout.workers=2 -c checkout.thresholdForParallelism=0 \ + checkout . && + + # Check that "a" (and only "a") was filtered + echo a >expected.log && + test_cmp filter.log expected.log && + + # Check that it used the right number of workers and detected the collisions + test_workers_in_event_trace 2 trace && + grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/B.}" trace && + grep "category.:.pcheckout.,.key.:.collision/dirname.,.value.:.A/C.}" trace +' + +test_expect_success SYMLINKS,CASE_INSENSITIVE_FS 'do not follow symlinks colliding with leading dir' ' + empty_oid=$(git hash-object -w --stdin </dev/null) && + symlink_oid=$(echo "./e" | git hash-object -w --stdin) && + mkdir e && + + cat >objs <<-EOF && + 120000 $symlink_oid D + 100644 $empty_oid d/x + 100644 $empty_oid e/y + EOF + git rm -rf . && + git update-index --index-info <objs && + + set_checkout_config 2 0 && + test_checkout_workers 2 git checkout . && + test_path_is_dir e && + test_path_is_missing e/x +' + +# The two following tests check that parallel checkout correctly reports +# colliding entries on clone. The sequential code detects a collision by +# calling lstat() before trying to open(O_CREAT) a file. (Note that this only +# works for clone.) Then, to find the pair of a colliding item k, it searches +# cache_entry[0, k-1]. This is not sufficient in parallel checkout because: +# +# - A colliding file may be created between the lstat() and open() calls; +# - A colliding entry might appear in the second half of the cache_entry array. +# +test_expect_success CASE_INSENSITIVE_FS 'collision report on clone (w/ racy file creation)' ' + git reset --hard basename_collision && + set_checkout_config 2 0 && + test_checkout_workers 2 git clone . clone-repo 2>stderr && + + grep FILE_X stderr && + grep FILE_x stderr && + grep file_X stderr && + grep file_x stderr && + grep "the following paths have collided" stderr +' + +# This test ensures that the collision report code is correctly looking for +# colliding peers in the second half of the cache_entry array. This is done by +# defining a smudge command for the *last* array entry, which makes it +# non-eligible for parallel-checkout. Thus, it is checked out *first*, before +# spawning the workers. +# +# Note: this test doesn't work on Windows because, on this system, the +# collision report code uses strcmp() to find the colliding pairs when +# core.ignoreCase is false. And we need this setting for this test so that only +# 'file_x' matches the pattern of the filter attribute. But the test works on +# OSX, where the colliding pairs are found using inode. +# +test_expect_success CASE_INSENSITIVE_FS,!MINGW,!CYGWIN \ + 'collision report on clone (w/ colliding peer after the detected entry)' ' + + test_config_global filter.logger.smudge "\"$TEST_ROOT/logger_script\" %f" && + git reset --hard basename_collision && + echo "file_x filter=logger" >.gitattributes && + git add .gitattributes && + git commit -m "filter for file_x" && + + rm -rf clone-repo && + set_checkout_config 2 0 && + test_checkout_workers 2 \ + git -c core.ignoreCase=false clone . clone-repo 2>stderr && + + grep FILE_X stderr && + grep FILE_x stderr && + grep file_X stderr && + grep file_x stderr && + grep "the following paths have collided" stderr && + + # Check that only "file_x" was filtered + echo file_x >expected.log && + test_cmp clone-repo/filter.log expected.log +' + +test_done
Add tests to confirm that path collisions are properly detected by checkout workers, both to avoid race conditions and to report colliding entries on clone. Co-authored-by: Jeff Hostetler <jeffhost@microsoft.com> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br> --- parallel-checkout.c | 4 + t/lib-parallel-checkout.sh | 4 +- t/t2081-parallel-checkout-collisions.sh | 162 ++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 2 deletions(-) create mode 100755 t/t2081-parallel-checkout-collisions.sh