[GSoC,2/2] clone: use dir-iterator to avoid explicit dir traversal
diff mbox series

Message ID 20190215154913.18800-3-matheus.bernardino@usp.br
State New
Headers show
Series
  • clone: convert explicit traversal to
Related show

Commit Message

Matheus Tavares Bernardino Feb. 15, 2019, 3:49 p.m. UTC
Replace usage of opendir/readdir/closedir API to traverse directories
recursively, at copy_or_link_directory function, by the dir-iterator
API.

Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
---
 builtin/clone.c | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

Comments

Christian Couder Feb. 16, 2019, 6:41 a.m. UTC | #1
On Fri, Feb 15, 2019 at 5:39 PM Matheus Tavares
<matheus.bernardino@usp.br> wrote:
>
> Replace usage of opendir/readdir/closedir API to traverse directories
> recursively, at copy_or_link_directory function, by the dir-iterator
> API.

You may want to add that this simplifies the code and avoid recursive
calls of copy_or_link_directory().

Otherwise the patch looks good to me. Thanks.
Thomas Gummerer Feb. 16, 2019, 2:38 p.m. UTC | #2
On 02/15, Matheus Tavares wrote:
> Replace usage of opendir/readdir/closedir API to traverse directories
> recursively, at copy_or_link_directory function, by the dir-iterator
> API.
> 
> Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> ---
>  builtin/clone.c | 39 +++++++++++++++++++--------------------
>  1 file changed, 19 insertions(+), 20 deletions(-)
> 
> diff --git a/builtin/clone.c b/builtin/clone.c
> index 2a1cc4dab9..66ae347f79 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -23,6 +23,8 @@
>  #include "transport.h"
>  #include "strbuf.h"
>  #include "dir.h"
> +#include "dir-iterator.h"
> +#include "iterator.h"
>  #include "sigchain.h"
>  #include "branch.h"
>  #include "remote.h"
> @@ -413,40 +415,33 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
>  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>  				   const char *src_repo, int src_baselen)
>  {

The src_baselen parameter is now unused, and should be removed.  Our
codebase currently doesn't compile with -Wunused-parameter, so this is
not something the compiler can catch at the moment unfortunately.
However there is some work going on towards removing unused parameter
from the codebase, so it would be nice to not make things worse here.

*1*: https://public-inbox.org/git/20190214054736.GA20091@sigill.intra.peff.net

> -	struct dirent *de;
> -	struct stat buf;
>  	int src_len, dest_len;
> -	DIR *dir;
> -
> -	dir = opendir(src->buf);
> -	if (!dir)
> -		die_errno(_("failed to open '%s'"), src->buf);
> +	struct dir_iterator *iter;
> +	int iter_status;
>  
>  	mkdir_if_missing(dest->buf, 0777);
>  
> +	iter = dir_iterator_begin(src->buf);
> +
>  	strbuf_addch(src, '/');
>  	src_len = src->len;
>  	strbuf_addch(dest, '/');
>  	dest_len = dest->len;
>  
> -	while ((de = readdir(dir)) != NULL) {
> +	while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
>  		strbuf_setlen(src, src_len);
> -		strbuf_addstr(src, de->d_name);
> +		strbuf_addstr(src, iter->relative_path);
>  		strbuf_setlen(dest, dest_len);
> -		strbuf_addstr(dest, de->d_name);
> -		if (stat(src->buf, &buf)) {
> -			warning (_("failed to stat %s\n"), src->buf);
> -			continue;
> -		}

Why was this warning removed?  I don't see a corresponding warning in
the iterator API.  The one thing the iterator API is doing is that it
does an lstat on the path to check if it exists.  However that does
not follow symlinks, and ignores possible other errors such as
permission errors.

If there is a good reason to remove the warning, that would be useful
to describe in the commit message.

> -		if (S_ISDIR(buf.st_mode)) {
> -			if (de->d_name[0] != '.')
> -				copy_or_link_directory(src, dest,
> -						       src_repo, src_baselen);
> +		strbuf_addstr(dest, iter->relative_path);
> +
> +		if (S_ISDIR(iter->st.st_mode)) {
> +			if (iter->basename[0] != '.')
> +				mkdir_if_missing(dest->buf, 0777);
>  			continue;
>  		}
>  
>  		/* Files that cannot be copied bit-for-bit... */
> -		if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> +		if (!strcmp(iter->relative_path, "info/alternates")) {
>  			copy_alternates(src, dest, src_repo);
>  			continue;
>  		}
> @@ -463,7 +458,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
>  		if (copy_file_with_time(dest->buf, src->buf, 0666))
>  			die_errno(_("failed to copy file to '%s'"), dest->buf);
>  	}
> -	closedir(dir);
> +
> +	if (iter_status != ITER_DONE) {
> +		strbuf_setlen(src, src_len);
> +		die(_("failed to iterate over '%s'"), src->buf);
> +	}

Interestingly enough, this is not something that can currently
happen if I read the dir-iterator code correctly.  Even though the
dir_iterator_advance function says it can return ITER_ERROR, it never
actually does.  The only way the iter_status can be not ITER_DONE at
this point is if we would 'break' out of the loop.

I don't think it hurts to be defensive here in case someone decides to
break out of the loop in the future, just something odd I noticed
while reviewing the code.

>  }
>  
>  static void clone_local(const char *src_repo, const char *dest_repo)
> -- 
> 2.20.1
>
Matheus Tavares Bernardino Feb. 18, 2019, 9:13 p.m. UTC | #3
On Sat, Feb 16, 2019 at 12:38 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> On 02/15, Matheus Tavares wrote:
> > Replace usage of opendir/readdir/closedir API to traverse directories
> > recursively, at copy_or_link_directory function, by the dir-iterator
> > API.
> >
> > Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
> > ---
> >  builtin/clone.c | 39 +++++++++++++++++++--------------------
> >  1 file changed, 19 insertions(+), 20 deletions(-)
> >
> > diff --git a/builtin/clone.c b/builtin/clone.c
> > index 2a1cc4dab9..66ae347f79 100644
> > --- a/builtin/clone.c
> > +++ b/builtin/clone.c
> > @@ -23,6 +23,8 @@
> >  #include "transport.h"
> >  #include "strbuf.h"
> >  #include "dir.h"
> > +#include "dir-iterator.h"
> > +#include "iterator.h"
> >  #include "sigchain.h"
> >  #include "branch.h"
> >  #include "remote.h"
> > @@ -413,40 +415,33 @@ static void mkdir_if_missing(const char *pathname, mode_t mode)
> >  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> >                                  const char *src_repo, int src_baselen)
> >  {
>

Hi, Thomas. Thanks for the review.

> The src_baselen parameter is now unused, and should be removed.  Our
> codebase currently doesn't compile with -Wunused-parameter, so this is
> not something the compiler can catch at the moment unfortunately.
> However there is some work going on towards removing unused parameter
> from the codebase, so it would be nice to not make things worse here.
>

Nice! I will fix that in v2.

> *1*: https://public-inbox.org/git/20190214054736.GA20091@sigill.intra.peff.net
>
> > -     struct dirent *de;
> > -     struct stat buf;
> >       int src_len, dest_len;
> > -     DIR *dir;
> > -
> > -     dir = opendir(src->buf);
> > -     if (!dir)
> > -             die_errno(_("failed to open '%s'"), src->buf);
> > +     struct dir_iterator *iter;
> > +     int iter_status;
> >
> >       mkdir_if_missing(dest->buf, 0777);
> >
> > +     iter = dir_iterator_begin(src->buf);
> > +
> >       strbuf_addch(src, '/');
> >       src_len = src->len;
> >       strbuf_addch(dest, '/');
> >       dest_len = dest->len;
> >
> > -     while ((de = readdir(dir)) != NULL) {
> > +     while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
> >               strbuf_setlen(src, src_len);
> > -             strbuf_addstr(src, de->d_name);
> > +             strbuf_addstr(src, iter->relative_path);
> >               strbuf_setlen(dest, dest_len);
> > -             strbuf_addstr(dest, de->d_name);
> > -             if (stat(src->buf, &buf)) {
> > -                     warning (_("failed to stat %s\n"), src->buf);
> > -                     continue;
> > -             }
>
> Why was this warning removed?  I don't see a corresponding warning in
> the iterator API.  The one thing the iterator API is doing is that it
> does an lstat on the path to check if it exists.  However that does
> not follow symlinks, and ignores possible other errors such as
> permission errors.
>

You are right. I didn't know the differences from lstat and stat. And
reflecting on this now, I realize that the problem is even deeper:
copy_or_link_directory follows symlinks but dir-iterator don't, so I
cannot use dir-iterator without falling back to recursion on
copy_or_link_directory. Because of that, I thought off adding an
option for dir-iterator to follow symlinks. Does this seem like a good
idea?

Also, I just noticed that dir-iterator follows hidden paths while
copy_or_link_directory don't. Maybe another option to add for
dir-iterator?

> If there is a good reason to remove the warning, that would be useful
> to describe in the commit message.
>
> > -             if (S_ISDIR(buf.st_mode)) {
> > -                     if (de->d_name[0] != '.')
> > -                             copy_or_link_directory(src, dest,
> > -                                                    src_repo, src_baselen);
> > +             strbuf_addstr(dest, iter->relative_path);
> > +
> > +             if (S_ISDIR(iter->st.st_mode)) {
> > +                     if (iter->basename[0] != '.')
> > +                             mkdir_if_missing(dest->buf, 0777);
> >                       continue;
> >               }
> >
> >               /* Files that cannot be copied bit-for-bit... */
> > -             if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> > +             if (!strcmp(iter->relative_path, "info/alternates")) {
> >                       copy_alternates(src, dest, src_repo);
> >                       continue;
> >               }
> > @@ -463,7 +458,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> >               if (copy_file_with_time(dest->buf, src->buf, 0666))
> >                       die_errno(_("failed to copy file to '%s'"), dest->buf);
> >       }
> > -     closedir(dir);
> > +
> > +     if (iter_status != ITER_DONE) {
> > +             strbuf_setlen(src, src_len);
> > +             die(_("failed to iterate over '%s'"), src->buf);
> > +     }
>
> Interestingly enough, this is not something that can currently
> happen if I read the dir-iterator code correctly.  Even though the
> dir_iterator_advance function says it can return ITER_ERROR, it never
> actually does.  The only way the iter_status can be not ITER_DONE at
> this point is if we would 'break' out of the loop.
>
> I don't think it hurts to be defensive here in case someone decides to
> break out of the loop in the future, just something odd I noticed
> while reviewing the code.
>

Yes, I also noticed that. But I thought it would be nice to make this
check so that this code stays consistent to the documentation at
dir-iterator.h (although implementation is different).

Something I just noticed now is that copy_or_link_directory dies upon
an opendir error, but dir-iterator just prints a warning and keeps
going (looking for another file/dir to return for the caller). Is this
ok? Or should I, perhaps, add a "pedantic" option to dir-iterator so
that, when enabled, it immediately returns ITER_ERROR when an error
occurs instead of keep going?

I'm proposing some new options to dir-iterator because as the patch
that adds it says "There are obviously a lot of features that could
easily be added to this class", and maybe those would be useful for
other dir-iterator users. But I don't know if that would be the best
way of doing it, so any feedback on this will be much appreciated.
Also, I saw on public-inbox[1] a patchset from 2017 proposing new
features/options for dir-iterator, but I don't really know if (and
why) it was rejected. (I couldn't find the patches on master/pu log)

[1] https://public-inbox.org/git/1493226219-33423-1-git-send-email-bnmvco@gmail.com/

Thanks,
Matheus Tavares


> >  }
> >
> >  static void clone_local(const char *src_repo, const char *dest_repo)
> > --
> > 2.20.1
> >
Thomas Gummerer Feb. 18, 2019, 11:35 p.m. UTC | #4
On 02/18, Matheus Tavares Bernardino wrote:
> On Sat, Feb 16, 2019 at 12:38 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
> >
> > On 02/15, Matheus Tavares wrote:
> > > -     struct dirent *de;
> > > -     struct stat buf;
> > >       int src_len, dest_len;
> > > -     DIR *dir;
> > > -
> > > -     dir = opendir(src->buf);
> > > -     if (!dir)
> > > -             die_errno(_("failed to open '%s'"), src->buf);
> > > +     struct dir_iterator *iter;
> > > +     int iter_status;
> > >
> > >       mkdir_if_missing(dest->buf, 0777);
> > >
> > > +     iter = dir_iterator_begin(src->buf);
> > > +
> > >       strbuf_addch(src, '/');
> > >       src_len = src->len;
> > >       strbuf_addch(dest, '/');
> > >       dest_len = dest->len;
> > >
> > > -     while ((de = readdir(dir)) != NULL) {
> > > +     while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
> > >               strbuf_setlen(src, src_len);
> > > -             strbuf_addstr(src, de->d_name);
> > > +             strbuf_addstr(src, iter->relative_path);
> > >               strbuf_setlen(dest, dest_len);
> > > -             strbuf_addstr(dest, de->d_name);
> > > -             if (stat(src->buf, &buf)) {
> > > -                     warning (_("failed to stat %s\n"), src->buf);
> > > -                     continue;
> > > -             }
> >
> > Why was this warning removed?  I don't see a corresponding warning in
> > the iterator API.  The one thing the iterator API is doing is that it
> > does an lstat on the path to check if it exists.  However that does
> > not follow symlinks, and ignores possible other errors such as
> > permission errors.
> >
> 
> You are right. I didn't know the differences from lstat and stat. And
> reflecting on this now, I realize that the problem is even deeper:
> copy_or_link_directory follows symlinks but dir-iterator don't, so I
> cannot use dir-iterator without falling back to recursion on
> copy_or_link_directory. Because of that, I thought off adding an
> option for dir-iterator to follow symlinks. Does this seem like a good
> idea?

Hmm looking at how this function is actually called, I don't think we
need to worry about symlinks too much.  It only copies the
.git/objects directory, which normally shouldn't contain any
symlinks.  I think it's still good to preserve the warning, in case
someone gave some object directories bad permissions, but following
symlinks in there may be overly paranoid, dunno.

Adding an option to dir-iterator to follow symlinks may be a good
idea, but it raises a few questions I think, e.g. how are we going to
deal with symlinks that point to a parent directory.  The answer for
this function is that we currently don't and just keep recursing,
until 'stat()' fails, but that is probably not the behaviour we'd want
if we were to add this functionality to dir-iterator.

At this point I'm almost convinced that not following symlinks may be
the better solution here.  That is a change in behaviour though, and
it should be described in the commit message why that change is a good
change.

> Also, I just noticed that dir-iterator follows hidden paths while
> copy_or_link_directory don't. Maybe another option to add for
> dir-iterator?

That feels like quite a special case in 'copy_or_link_directory()'.  I
don't think it's worth adding a feature like that for now, at least
not until we find another use case for it.  It's easy enough to skip
over hidden directories here.

> > If there is a good reason to remove the warning, that would be useful
> > to describe in the commit message.
> >
> > > -             if (S_ISDIR(buf.st_mode)) {
> > > -                     if (de->d_name[0] != '.')
> > > -                             copy_or_link_directory(src, dest,
> > > -                                                    src_repo, src_baselen);
> > > +             strbuf_addstr(dest, iter->relative_path);
> > > +
> > > +             if (S_ISDIR(iter->st.st_mode)) {
> > > +                     if (iter->basename[0] != '.')
> > > +                             mkdir_if_missing(dest->buf, 0777);
> > >                       continue;
> > >               }
> > >
> > >               /* Files that cannot be copied bit-for-bit... */
> > > -             if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> > > +             if (!strcmp(iter->relative_path, "info/alternates")) {
> > >                       copy_alternates(src, dest, src_repo);
> > >                       continue;
> > >               }
> > > @@ -463,7 +458,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> > >               if (copy_file_with_time(dest->buf, src->buf, 0666))
> > >                       die_errno(_("failed to copy file to '%s'"), dest->buf);
> > >       }
> > > -     closedir(dir);
> > > +
> > > +     if (iter_status != ITER_DONE) {
> > > +             strbuf_setlen(src, src_len);
> > > +             die(_("failed to iterate over '%s'"), src->buf);
> > > +     }
> >
> > Interestingly enough, this is not something that can currently
> > happen if I read the dir-iterator code correctly.  Even though the
> > dir_iterator_advance function says it can return ITER_ERROR, it never
> > actually does.  The only way the iter_status can be not ITER_DONE at
> > this point is if we would 'break' out of the loop.
> >
> > I don't think it hurts to be defensive here in case someone decides to
> > break out of the loop in the future, just something odd I noticed
> > while reviewing the code.
> >
> 
> Yes, I also noticed that. But I thought it would be nice to make this
> check so that this code stays consistent to the documentation at
> dir-iterator.h (although implementation is different).

Yeah, I'm not opposed to keep this check to be more defensive against
future changes of the API and the code here.

> Something I just noticed now is that copy_or_link_directory dies upon
> an opendir error, but dir-iterator just prints a warning and keeps
> going (looking for another file/dir to return for the caller). Is this
> ok? Or should I, perhaps, add a "pedantic" option to dir-iterator so
> that, when enabled, it immediately returns ITER_ERROR when an error
> occurs instead of keep going?

Good point, I failed to notice that during my earlier review.  I do
think a "pedantic" option would be a good idea here, to preserve the
previous behaviour.

> I'm proposing some new options to dir-iterator because as the patch
> that adds it says "There are obviously a lot of features that could
> easily be added to this class", and maybe those would be useful for
> other dir-iterator users. But I don't know if that would be the best
> way of doing it, so any feedback on this will be much appreciated.
> Also, I saw on public-inbox[1] a patchset from 2017 proposing new
> features/options for dir-iterator, but I don't really know if (and
> why) it was rejected. (I couldn't find the patches on master/pu log)

I don't think we should necessarily add new options because the
original patch said we should do so.  Instead the best way to
introduce new options is to introduce them when they are actually
going to be needed in subsequent patches.  So it would be nice to
implement features that are actually needed by your patches, but I
don't think it's worth trying to introduce new features that could
potentially be useful in the future, but where we don't know yet
whether they really will be or not.

I can't recall those patches, and from a quick look at the history I
can't tell why they were never merged.  The what's cooking emails from
Junio around that time may have a bit more background, but I don't
have time to dig them up now.

> [1] https://public-inbox.org/git/1493226219-33423-1-git-send-email-bnmvco@gmail.com/
> 
> Thanks,
> Matheus Tavares
> 
> 
> > >  }
> > >
> > >  static void clone_local(const char *src_repo, const char *dest_repo)
> > > --
> > > 2.20.1
> > >
Matheus Tavares Bernardino Feb. 19, 2019, 4:23 p.m. UTC | #5
On Mon, Feb 18, 2019 at 8:35 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> >
> > You are right. I didn't know the differences from lstat and stat. And
> > reflecting on this now, I realize that the problem is even deeper:
> > copy_or_link_directory follows symlinks but dir-iterator don't, so I
> > cannot use dir-iterator without falling back to recursion on
> > copy_or_link_directory. Because of that, I thought off adding an
> > option for dir-iterator to follow symlinks. Does this seem like a good
> > idea?
>
> Hmm looking at how this function is actually called, I don't think we
> need to worry about symlinks too much.  It only copies the
> .git/objects directory, which normally shouldn't contain any
> symlinks.  I think it's still good to preserve the warning, in case
> someone gave some object directories bad permissions, but following
> symlinks in there may be overly paranoid, dunno.
>
> Adding an option to dir-iterator to follow symlinks may be a good
> idea, but it raises a few questions I think, e.g. how are we going to
> deal with symlinks that point to a parent directory.  The answer for
> this function is that we currently don't and just keep recursing,
> until 'stat()' fails, but that is probably not the behaviour we'd want
> if we were to add this functionality to dir-iterator.
>
> At this point I'm almost convinced that not following symlinks may be
> the better solution here.  That is a change in behaviour though, and
> it should be described in the commit message why that change is a good
> change.
>

Ok, so I will add this note at v2's commit message.

> > Also, I just noticed that dir-iterator follows hidden paths while
> > copy_or_link_directory don't. Maybe another option to add for
> > dir-iterator?
>
> That feels like quite a special case in 'copy_or_link_directory()'.  I
> don't think it's worth adding a feature like that for now, at least
> not until we find another use case for it.  It's easy enough to skip
> over hidden directories here.
>

Ok, I agree. I noticed copy_or_link_directory only skips hidden
directories, not hidden files. And I couldn't think why we would want
to skip one but not the other... Could that be unintentional? I mean,
maybe the idea was to skip just "." and ".."? If that is the case, I
don't even need to make an if check at copy_or_link_directory, since
dir-iterator already skips "." and "..". What do you think about that?

> > > If there is a good reason to remove the warning, that would be useful
> > > to describe in the commit message.
> > >
> > > > -             if (S_ISDIR(buf.st_mode)) {
> > > > -                     if (de->d_name[0] != '.')
> > > > -                             copy_or_link_directory(src, dest,
> > > > -                                                    src_repo, src_baselen);
> > > > +             strbuf_addstr(dest, iter->relative_path);
> > > > +
> > > > +             if (S_ISDIR(iter->st.st_mode)) {
> > > > +                     if (iter->basename[0] != '.')
> > > > +                             mkdir_if_missing(dest->buf, 0777);
> > > >                       continue;
> > > >               }
> > > >
> > > >               /* Files that cannot be copied bit-for-bit... */
> > > > -             if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
> > > > +             if (!strcmp(iter->relative_path, "info/alternates")) {
> > > >                       copy_alternates(src, dest, src_repo);
> > > >                       continue;
> > > >               }
> > > > @@ -463,7 +458,11 @@ static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
> > > >               if (copy_file_with_time(dest->buf, src->buf, 0666))
> > > >                       die_errno(_("failed to copy file to '%s'"), dest->buf);
> > > >       }
> > > > -     closedir(dir);
> > > > +
> > > > +     if (iter_status != ITER_DONE) {
> > > > +             strbuf_setlen(src, src_len);
> > > > +             die(_("failed to iterate over '%s'"), src->buf);
> > > > +     }
> > >
> > > Interestingly enough, this is not something that can currently
> > > happen if I read the dir-iterator code correctly.  Even though the
> > > dir_iterator_advance function says it can return ITER_ERROR, it never
> > > actually does.  The only way the iter_status can be not ITER_DONE at
> > > this point is if we would 'break' out of the loop.
> > >
> > > I don't think it hurts to be defensive here in case someone decides to
> > > break out of the loop in the future, just something odd I noticed
> > > while reviewing the code.
> > >
> >
> > Yes, I also noticed that. But I thought it would be nice to make this
> > check so that this code stays consistent to the documentation at
> > dir-iterator.h (although implementation is different).
>
> Yeah, I'm not opposed to keep this check to be more defensive against
> future changes of the API and the code here.
>
> > Something I just noticed now is that copy_or_link_directory dies upon
> > an opendir error, but dir-iterator just prints a warning and keeps
> > going (looking for another file/dir to return for the caller). Is this
> > ok? Or should I, perhaps, add a "pedantic" option to dir-iterator so
> > that, when enabled, it immediately returns ITER_ERROR when an error
> > occurs instead of keep going?
>
> Good point, I failed to notice that during my earlier review.  I do
> think a "pedantic" option would be a good idea here, to preserve the
> previous behaviour.
>

Ok, I'm going to work on that. Thanks.

> > I'm proposing some new options to dir-iterator because as the patch
> > that adds it says "There are obviously a lot of features that could
> > easily be added to this class", and maybe those would be useful for
> > other dir-iterator users. But I don't know if that would be the best
> > way of doing it, so any feedback on this will be much appreciated.
> > Also, I saw on public-inbox[1] a patchset from 2017 proposing new
> > features/options for dir-iterator, but I don't really know if (and
> > why) it was rejected. (I couldn't find the patches on master/pu log)
>
> I don't think we should necessarily add new options because the
> original patch said we should do so.  Instead the best way to
> introduce new options is to introduce them when they are actually
> going to be needed in subsequent patches.  So it would be nice to
> implement features that are actually needed by your patches, but I
> don't think it's worth trying to introduce new features that could
> potentially be useful in the future, but where we don't know yet
> whether they really will be or not.
>

Yes, I agree with you. What I meant is that there is some space for
new features at dir-iterator and maybe those that I suggested could be
nice not just for my patch, on clone.c, but for others. But I see your
point. We really should try to keep it simpler and just add the
feature when (and if) there is a more expressive usage for it.

Thanks again for all the help and feedback.
Thomas Gummerer Feb. 19, 2019, 11:45 p.m. UTC | #6
On 02/19, Matheus Tavares Bernardino wrote:
> On Mon, Feb 18, 2019 at 8:35 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
> > > Also, I just noticed that dir-iterator follows hidden paths while
> > > copy_or_link_directory don't. Maybe another option to add for
> > > dir-iterator?
> >
> > That feels like quite a special case in 'copy_or_link_directory()'.  I
> > don't think it's worth adding a feature like that for now, at least
> > not until we find another use case for it.  It's easy enough to skip
> > over hidden directories here.
> >
> 
> Ok, I agree. I noticed copy_or_link_directory only skips hidden
> directories, not hidden files. And I couldn't think why we would want
> to skip one but not the other... Could that be unintentional? I mean,
> maybe the idea was to skip just "." and ".."? If that is the case, I
> don't even need to make an if check at copy_or_link_directory, since
> dir-iterator already skips "." and "..". What do you think about that?

I don't know.  The best way to look at these things is usually to use
'git blame' and have a look at the commit that introduced whatever you
are looking at, in this case why we are skipping anything starting
with a '.'.  But looking at that commit it's just the translation from
shell script to builtin.  It's also more than 10 years old by now, so
it's unlikely that the original author would still remember why
exactly they made this choice.  (Note that I didn't take the time to
actually dig into the original shell script).

I feel like you are probably right in that it could be unintentional,
and I don't think anyone should be messing with the objects
directory.  However that is no guarantee that changing this wouldn't
break *something*.

For the purpose of this change I would try to keep the same behaviour
as we currently have where possible, to not increase the scope too
much.

> > > I'm proposing some new options to dir-iterator because as the patch
> > > that adds it says "There are obviously a lot of features that could
> > > easily be added to this class", and maybe those would be useful for
> > > other dir-iterator users. But I don't know if that would be the best
> > > way of doing it, so any feedback on this will be much appreciated.
> > > Also, I saw on public-inbox[1] a patchset from 2017 proposing new
> > > features/options for dir-iterator, but I don't really know if (and
> > > why) it was rejected. (I couldn't find the patches on master/pu log)
> >
> > I don't think we should necessarily add new options because the
> > original patch said we should do so.  Instead the best way to
> > introduce new options is to introduce them when they are actually
> > going to be needed in subsequent patches.  So it would be nice to
> > implement features that are actually needed by your patches, but I
> > don't think it's worth trying to introduce new features that could
> > potentially be useful in the future, but where we don't know yet
> > whether they really will be or not.
> >
> 
> Yes, I agree with you. What I meant is that there is some space for
> new features at dir-iterator and maybe those that I suggested could be
> nice not just for my patch, on clone.c, but for others. But I see your
> point. We really should try to keep it simpler and just add the
> feature when (and if) there is a more expressive usage for it.

Right, if you wanted to pursue this, that could be a separate patch
series.  However generally patch series that just introduce something
that might be used later, but doesn't add much value on its own tend
to be much more likely to be rejected.

> Thanks again for all the help and feedback.

Thanks for working on this!
Matheus Tavares Bernardino Feb. 21, 2019, 4:46 a.m. UTC | #7
Ok, I think I'm almost there and I should be able to send a v2 on the
weekend. But again, a few questions arose while I'm coding v2. Please,
see inline.

On Tue, Feb 19, 2019 at 8:45 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
>
> On 02/19, Matheus Tavares Bernardino wrote:
> > Ok, I agree. I noticed copy_or_link_directory only skips hidden
> > directories, not hidden files. And I couldn't think why we would want
> > to skip one but not the other... Could that be unintentional? I mean,
> > maybe the idea was to skip just "." and ".."? If that is the case, I
> > don't even need to make an if check at copy_or_link_directory, since
> > dir-iterator already skips "." and "..". What do you think about that?

[...]

> I feel like you are probably right in that it could be unintentional,
> and I don't think anyone should be messing with the objects
> directory.  However that is no guarantee that changing this wouldn't
> break *something*.
>
> For the purpose of this change I would try to keep the same behaviour
> as we currently have where possible, to not increase the scope too
> much.
>

I understand your point, but to make copy_or_link_directory() skip
just hidden directories using dir-iterator seems, now, a little
tricker than I though before. The "if (iter->basename[0] != '.')"
statement in the patch I sent for v1 only skips the hidden directories
creation, but it does not avoid the attempt to copy the hidden
directory contents (which would obviously fail). If we were to do
that, we would need to check every directory in the relative path
string of each iteration entry looking for a hidden directory. That
seems a little too much, IMHO. Because of that and the intuition that
skipping over hidden dirs was an unintentional operation, I think we
could modify copy_or_link_directory() to skip '.' and '..', only.
Also, hidden dirs/files are not expected to be at .git/objects anyway,
are they?

And to have copy_or_link_directory()'s behaviour changed as little as
possible, I could add the option to not follow hidden paths (dirs and
regular files) at dir-iterator.c and use it at
copy_or_link_directory() (now that I'm adding the 'pedantic' option,
this won't be so difficult, anyway). Would these suggestions be a good
plan?

Thanks,
Matheus Tavares
Thomas Gummerer Feb. 21, 2019, 9:25 p.m. UTC | #8
On 02/21, Matheus Tavares Bernardino wrote:
> Ok, I think I'm almost there and I should be able to send a v2 on the
> weekend. But again, a few questions arose while I'm coding v2. Please,
> see inline.
> 
> On Tue, Feb 19, 2019 at 8:45 PM Thomas Gummerer <t.gummerer@gmail.com> wrote:
> >
> > On 02/19, Matheus Tavares Bernardino wrote:
> > > Ok, I agree. I noticed copy_or_link_directory only skips hidden
> > > directories, not hidden files. And I couldn't think why we would want
> > > to skip one but not the other... Could that be unintentional? I mean,
> > > maybe the idea was to skip just "." and ".."? If that is the case, I
> > > don't even need to make an if check at copy_or_link_directory, since
> > > dir-iterator already skips "." and "..". What do you think about that?
> 
> [...]
> 
> > I feel like you are probably right in that it could be unintentional,
> > and I don't think anyone should be messing with the objects
> > directory.  However that is no guarantee that changing this wouldn't
> > break *something*.
> >
> > For the purpose of this change I would try to keep the same behaviour
> > as we currently have where possible, to not increase the scope too
> > much.
> >
> 
> I understand your point, but to make copy_or_link_directory() skip
> just hidden directories using dir-iterator seems, now, a little
> tricker than I though before. The "if (iter->basename[0] != '.')"
> statement in the patch I sent for v1 only skips the hidden directories
> creation, but it does not avoid the attempt to copy the hidden
> directory contents (which would obviously fail). If we were to do
> that, we would need to check every directory in the relative path
> string of each iteration entry looking for a hidden directory. That
> seems a little too much, IMHO. Because of that and the intuition that
> skipping over hidden dirs was an unintentional operation, I think we
> could modify copy_or_link_directory() to skip '.' and '..', only.
> Also, hidden dirs/files are not expected to be at .git/objects anyway,
> are they?

No, as far as I know hidden dirs/files should not be in .git/objects.
The only reason they would be there is if somebody would create them
manually.  I didn't think of hidden child directories, which as you
noticed should be excluded as well to keep current behaviour.

With that in mind, I think I agree with the change of only excluding
'.' and '..', for which we don't need to do anything in your
dir-iterator patches.  Also thinking of what the worst case that could
happen is, it's that while cloning a repository locally a few more
directories could be copied, which is not all that bad.

So I think I was overly paranoid here and am on board with skipping
'.' and '..' only.  I even think that this can be just described in
detail in the commit message, rather than being done in a separate
patch, though others may disagree with that.  

> And to have copy_or_link_directory()'s behaviour changed as little as
> possible, I could add the option to not follow hidden paths (dirs and
> regular files) at dir-iterator.c and use it at
> copy_or_link_directory() (now that I'm adding the 'pedantic' option,
> this won't be so difficult, anyway). Would these suggestions be a good
> plan?

I think this would be a valid plan as well, however thinking this
through again I don't feel like it is necessary.  I'm happy with
whichever way you prefer here.

> Thanks,
> Matheus Tavares
Daniel Ferreira (theiostream) Feb. 22, 2019, 12:22 a.m. UTC | #9
Hi there! I'm happy to see this series has gotten renewed attention.

The series was fine per se, and it even got through to the pu branch.
But then it got ejected since it conflicted non-trivially with another
patch series that was added along the way (see
https://lwn.net/Articles/724166/). At that point I did not keep
pushing it forward.

I'd be happy to help anyone willing to make some progress on this! (Or
even ultimately solve the conflict myself, if it serves a greater goal
than just being a GSoC microproject.)

Best,
-- Daniel.

Patch
diff mbox series

diff --git a/builtin/clone.c b/builtin/clone.c
index 2a1cc4dab9..66ae347f79 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -23,6 +23,8 @@ 
 #include "transport.h"
 #include "strbuf.h"
 #include "dir.h"
+#include "dir-iterator.h"
+#include "iterator.h"
 #include "sigchain.h"
 #include "branch.h"
 #include "remote.h"
@@ -413,40 +415,33 @@  static void mkdir_if_missing(const char *pathname, mode_t mode)
 static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 				   const char *src_repo, int src_baselen)
 {
-	struct dirent *de;
-	struct stat buf;
 	int src_len, dest_len;
-	DIR *dir;
-
-	dir = opendir(src->buf);
-	if (!dir)
-		die_errno(_("failed to open '%s'"), src->buf);
+	struct dir_iterator *iter;
+	int iter_status;
 
 	mkdir_if_missing(dest->buf, 0777);
 
+	iter = dir_iterator_begin(src->buf);
+
 	strbuf_addch(src, '/');
 	src_len = src->len;
 	strbuf_addch(dest, '/');
 	dest_len = dest->len;
 
-	while ((de = readdir(dir)) != NULL) {
+	while ((iter_status = dir_iterator_advance(iter)) == ITER_OK) {
 		strbuf_setlen(src, src_len);
-		strbuf_addstr(src, de->d_name);
+		strbuf_addstr(src, iter->relative_path);
 		strbuf_setlen(dest, dest_len);
-		strbuf_addstr(dest, de->d_name);
-		if (stat(src->buf, &buf)) {
-			warning (_("failed to stat %s\n"), src->buf);
-			continue;
-		}
-		if (S_ISDIR(buf.st_mode)) {
-			if (de->d_name[0] != '.')
-				copy_or_link_directory(src, dest,
-						       src_repo, src_baselen);
+		strbuf_addstr(dest, iter->relative_path);
+
+		if (S_ISDIR(iter->st.st_mode)) {
+			if (iter->basename[0] != '.')
+				mkdir_if_missing(dest->buf, 0777);
 			continue;
 		}
 
 		/* Files that cannot be copied bit-for-bit... */
-		if (!strcmp(src->buf + src_baselen, "/info/alternates")) {
+		if (!strcmp(iter->relative_path, "info/alternates")) {
 			copy_alternates(src, dest, src_repo);
 			continue;
 		}
@@ -463,7 +458,11 @@  static void copy_or_link_directory(struct strbuf *src, struct strbuf *dest,
 		if (copy_file_with_time(dest->buf, src->buf, 0666))
 			die_errno(_("failed to copy file to '%s'"), dest->buf);
 	}
-	closedir(dir);
+
+	if (iter_status != ITER_DONE) {
+		strbuf_setlen(src, src_len);
+		die(_("failed to iterate over '%s'"), src->buf);
+	}
 }
 
 static void clone_local(const char *src_repo, const char *dest_repo)