diff mbox series

archive: support compression levels beyond 9

Message ID 96e6e2ce-fc7b-1e73-0112-93589b28506d@web.de (mailing list archive)
State New, archived
Headers show
Series archive: support compression levels beyond 9 | expand

Commit Message

René Scharfe Nov. 9, 2020, 4:05 p.m. UTC
Compression programs like zip, gzip, bzip2 and xz allow to adjust the
trade-off between CPU cost and size gain with numerical options from -1
for fast compression and -9 for high compression ratio.  zip also
accepts -0 for storing files verbatim.  git archive directly support
these single-digit compression levels for ZIP output and passes them to
filters like gzip.

Zstandard additionally supports compression level options -10 to -19, or
up to -22 with --ultra.  This *seems* to work with git archive in most
cases, e.g. it will produce an archive with -19 without complaining, but
since it only supports single-digit compression level options this is
the same as -1 -9 and thus -9.

Allow git archive to accept multi-digit compression levels to support
the full range supported by zstd.  Explicitly reject them for the ZIP
format, as otherwise deflateInit2() would just fail with a somewhat
cryptic "stream consistency error".

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 archive-tar.c |  3 ++-
 archive.c     | 26 +++++++++++---------------
 archive.h     |  1 +
 3 files changed, 14 insertions(+), 16 deletions(-)

--
2.29.2

Comments

Junio C Hamano Nov. 9, 2020, 6:35 p.m. UTC | #1
René Scharfe <l.s.r@web.de> writes:

> Compression programs like zip, gzip, bzip2 and xz allow to adjust the
> trade-off between CPU cost and size gain with numerical options from -1
> for fast compression and -9 for high compression ratio.  zip also
> accepts -0 for storing files verbatim.  git archive directly support
> these single-digit compression levels for ZIP output and passes them to
> filters like gzip.
>
> Zstandard additionally supports compression level options -10 to -19, or
> up to -22 with --ultra.  This *seems* to work with git archive in most
> cases, e.g. it will produce an archive with -19 without complaining, but
> since it only supports single-digit compression level options this is
> the same as -1 -9 and thus -9.
>
> Allow git archive to accept multi-digit compression levels to support
> the full range supported by zstd.  Explicitly reject them for the ZIP
> format, as otherwise deflateInit2() would just fail with a somewhat
> cryptic "stream consistency error".

The implementation looks more like "not enable them for the ZIP
format", but the symptom observable to end-users is exactly
"explicitly reject", so that's OK ;-)

As with the usual compression levels, this is only about how
deflator finds a better results, and the stream is understandable by
any existing inflator, right?

> diff --git a/archive.h b/archive.h
> index 82b226011a..e3d04e8ab3 100644
> --- a/archive.h
> +++ b/archive.h
> @@ -36,6 +36,7 @@ const char *archive_format_from_filename(const char *filename);
>
>  #define ARCHIVER_WANT_COMPRESSION_LEVELS 1
>  #define ARCHIVER_REMOTE 2
> +#define ARCHIVER_HIGH_COMPRESSION_LEVELS 4
>  struct archiver {
>  	const char *name;
>  	int (*write_archive)(const struct archiver *, struct archiver_args *);
> diff --git a/archive-tar.c b/archive-tar.c
> index f1a1447ebd..a971fdc0f6 100644
> --- a/archive-tar.c
> +++ b/archive-tar.c
> @@ -374,7 +374,8 @@ static int tar_filter_config(const char *var, const char *value, void *data)
>  		ar = xcalloc(1, sizeof(*ar));
>  		ar->name = xmemdupz(name, namelen);
>  		ar->write_archive = write_tar_filter_archive;
> -		ar->flags = ARCHIVER_WANT_COMPRESSION_LEVELS;
> +		ar->flags = ARCHIVER_WANT_COMPRESSION_LEVELS |
> +			    ARCHIVER_HIGH_COMPRESSION_LEVELS;

Nice.  

Hindsight tells me that WANT should have been ACCEPT, though---and
an addition of ARCHIVER_ACCEPT_HIGH_COMPRESSION_LEVELS would be in
line with that.  But that probably is too minor---it just stood out
a bit funny to me.

> diff --git a/archive.c b/archive.c
> index 3c1541af9e..7a888c5338 100644
> --- a/archive.c
> +++ b/archive.c
> @@ -529,10 +529,12 @@ static int add_file_cb(const struct option *opt, const char *arg, int unset)
>  	return 0;
>  }
>
> -#define OPT__COMPR(s, v, h, p) \
> -	OPT_SET_INT_F(s, NULL, v, h, p, PARSE_OPT_NONEG)
> -#define OPT__COMPR_HIDDEN(s, v, p) \
> -	OPT_SET_INT_F(s, NULL, v, "", p, PARSE_OPT_NONEG | PARSE_OPT_HIDDEN)
> +static int number_callback(const struct option *opt, const char *arg, int unset)
> +{
> +	BUG_ON_OPT_NEG(unset);
> +	*(int *)opt->value = strtol(arg, NULL, 10);
> +	return 0;
> +}
>
>  static int parse_archive_args(int argc, const char **argv,
>  		const struct archiver **ar, struct archiver_args *args,
> @@ -561,16 +563,8 @@ static int parse_archive_args(int argc, const char **argv,
>  		OPT_BOOL(0, "worktree-attributes", &worktree_attributes,
>  			N_("read .gitattributes in working directory")),
>  		OPT__VERBOSE(&verbose, N_("report archived files on stderr")),
> -		OPT__COMPR('0', &compression_level, N_("store only"), 0),
> -		OPT__COMPR('1', &compression_level, N_("compress faster"), 1),
> -		OPT__COMPR_HIDDEN('2', &compression_level, 2),
> -		OPT__COMPR_HIDDEN('3', &compression_level, 3),
> -		OPT__COMPR_HIDDEN('4', &compression_level, 4),
> -		OPT__COMPR_HIDDEN('5', &compression_level, 5),
> -		OPT__COMPR_HIDDEN('6', &compression_level, 6),
> -		OPT__COMPR_HIDDEN('7', &compression_level, 7),
> -		OPT__COMPR_HIDDEN('8', &compression_level, 8),
> -		OPT__COMPR('9', &compression_level, N_("compress better"), 9),
> +		OPT_NUMBER_CALLBACK(&compression_level,
> +			N_("set compression level"), number_callback),

Doubly nice.  Adds a feature while removing lines.  

Do we miss the description given in "git archive -h" though?

    usage: git archive [<options>] <tree-ish> [<path>...]
       or: git archive --list
       ...
        -v, --verbose         report archived files on stderr
        -0                    store only
        -1                    compress faster
        -9                    compress better
René Scharfe Nov. 9, 2020, 11:48 p.m. UTC | #2
Am 09.11.20 um 19:35 schrieb Junio C Hamano:
> René Scharfe <l.s.r@web.de> writes:
>
>> Compression programs like zip, gzip, bzip2 and xz allow to adjust the
>> trade-off between CPU cost and size gain with numerical options from -1
>> for fast compression and -9 for high compression ratio.  zip also
>> accepts -0 for storing files verbatim.  git archive directly support
>> these single-digit compression levels for ZIP output and passes them to
>> filters like gzip.
>>
>> Zstandard additionally supports compression level options -10 to -19, or
>> up to -22 with --ultra.  This *seems* to work with git archive in most
>> cases, e.g. it will produce an archive with -19 without complaining, but
>> since it only supports single-digit compression level options this is
>> the same as -1 -9 and thus -9.
>>
>> Allow git archive to accept multi-digit compression levels to support
>> the full range supported by zstd.  Explicitly reject them for the ZIP
>> format, as otherwise deflateInit2() would just fail with a somewhat
>> cryptic "stream consistency error".
>
> The implementation looks more like "not enable them for the ZIP
> format", but the symptom observable to end-users is exactly
> "explicitly reject", so that's OK ;-)
>
> As with the usual compression levels, this is only about how
> deflator finds a better results, and the stream is understandable by
> any existing inflator, right?

Support for higher levels might have been added in later versions of
Zstandard -- https://github.com/facebook/zstd/blob/dev/CHANGELOG
mentions "Command line utility compatible with high compression levels"
for v.0.4.0.  I'm not aware of other implementations  of the algorithm
than the original one from Facebook, so I don't know how compatible
they are.  It's not a problem we can solve in Git, though.

Side note: Using Zstandard with git archive requires the config setting
tar.tar.zst.command=zstd.

>> diff --git a/archive.c b/archive.c
>> index 3c1541af9e..7a888c5338 100644
>> --- a/archive.c
>> +++ b/archive.c
>> @@ -529,10 +529,12 @@ static int add_file_cb(const struct option *opt, const char *arg, int unset)
>>  	return 0;
>>  }
>>
>> -#define OPT__COMPR(s, v, h, p) \
>> -	OPT_SET_INT_F(s, NULL, v, h, p, PARSE_OPT_NONEG)
>> -#define OPT__COMPR_HIDDEN(s, v, p) \
>> -	OPT_SET_INT_F(s, NULL, v, "", p, PARSE_OPT_NONEG | PARSE_OPT_HIDDEN)
>> +static int number_callback(const struct option *opt, const char *arg, int unset)
>> +{
>> +	BUG_ON_OPT_NEG(unset);
>> +	*(int *)opt->value = strtol(arg, NULL, 10);
>> +	return 0;
>> +}
>>
>>  static int parse_archive_args(int argc, const char **argv,
>>  		const struct archiver **ar, struct archiver_args *args,
>> @@ -561,16 +563,8 @@ static int parse_archive_args(int argc, const char **argv,
>>  		OPT_BOOL(0, "worktree-attributes", &worktree_attributes,
>>  			N_("read .gitattributes in working directory")),
>>  		OPT__VERBOSE(&verbose, N_("report archived files on stderr")),
>> -		OPT__COMPR('0', &compression_level, N_("store only"), 0),
>> -		OPT__COMPR('1', &compression_level, N_("compress faster"), 1),
>> -		OPT__COMPR_HIDDEN('2', &compression_level, 2),
>> -		OPT__COMPR_HIDDEN('3', &compression_level, 3),
>> -		OPT__COMPR_HIDDEN('4', &compression_level, 4),
>> -		OPT__COMPR_HIDDEN('5', &compression_level, 5),
>> -		OPT__COMPR_HIDDEN('6', &compression_level, 6),
>> -		OPT__COMPR_HIDDEN('7', &compression_level, 7),
>> -		OPT__COMPR_HIDDEN('8', &compression_level, 8),
>> -		OPT__COMPR('9', &compression_level, N_("compress better"), 9),
>> +		OPT_NUMBER_CALLBACK(&compression_level,
>> +			N_("set compression level"), number_callback),
>
> Doubly nice.  Adds a feature while removing lines.
>
> Do we miss the description given in "git archive -h" though?
>
>     usage: git archive [<options>] <tree-ish> [<path>...]
>        or: git archive --list
>        ...
>         -v, --verbose         report archived files on stderr
>         -0                    store only
>         -1                    compress faster
>         -9                    compress better
>

Perhaps; I just couldn't cram it all into a single line.  Showing an
acceptable range would be nice and terse, but that depends on the
compressor.

Hmm, adding an option for passing arbitrary options to the filter and
removing the feature flag ARCHIVER_WANT_COMPRESSION_LEVELS from
archive-tar.c would be cleaner overall.  The latter would be a
regression, though.

René
René Scharfe Nov. 10, 2020, 11:37 a.m. UTC | #3
Am 10.11.20 um 00:48 schrieb René Scharfe:
> Hmm, adding an option for passing arbitrary options to the filter and
> removing the feature flag ARCHIVER_WANT_COMPRESSION_LEVELS from
> archive-tar.c would be cleaner overall.  The latter would be a
> regression, though.

And the former can allow remote code execution with --remote if we're
not careful, since filters are called via the shell.

René
diff mbox series

Patch

diff --git a/archive-tar.c b/archive-tar.c
index f1a1447ebd..a971fdc0f6 100644
--- a/archive-tar.c
+++ b/archive-tar.c
@@ -374,7 +374,8 @@  static int tar_filter_config(const char *var, const char *value, void *data)
 		ar = xcalloc(1, sizeof(*ar));
 		ar->name = xmemdupz(name, namelen);
 		ar->write_archive = write_tar_filter_archive;
-		ar->flags = ARCHIVER_WANT_COMPRESSION_LEVELS;
+		ar->flags = ARCHIVER_WANT_COMPRESSION_LEVELS |
+			    ARCHIVER_HIGH_COMPRESSION_LEVELS;
 		ALLOC_GROW(tar_filters, nr_tar_filters + 1, alloc_tar_filters);
 		tar_filters[nr_tar_filters++] = ar;
 	}
diff --git a/archive.c b/archive.c
index 3c1541af9e..7a888c5338 100644
--- a/archive.c
+++ b/archive.c
@@ -529,10 +529,12 @@  static int add_file_cb(const struct option *opt, const char *arg, int unset)
 	return 0;
 }

-#define OPT__COMPR(s, v, h, p) \
-	OPT_SET_INT_F(s, NULL, v, h, p, PARSE_OPT_NONEG)
-#define OPT__COMPR_HIDDEN(s, v, p) \
-	OPT_SET_INT_F(s, NULL, v, "", p, PARSE_OPT_NONEG | PARSE_OPT_HIDDEN)
+static int number_callback(const struct option *opt, const char *arg, int unset)
+{
+	BUG_ON_OPT_NEG(unset);
+	*(int *)opt->value = strtol(arg, NULL, 10);
+	return 0;
+}

 static int parse_archive_args(int argc, const char **argv,
 		const struct archiver **ar, struct archiver_args *args,
@@ -561,16 +563,8 @@  static int parse_archive_args(int argc, const char **argv,
 		OPT_BOOL(0, "worktree-attributes", &worktree_attributes,
 			N_("read .gitattributes in working directory")),
 		OPT__VERBOSE(&verbose, N_("report archived files on stderr")),
-		OPT__COMPR('0', &compression_level, N_("store only"), 0),
-		OPT__COMPR('1', &compression_level, N_("compress faster"), 1),
-		OPT__COMPR_HIDDEN('2', &compression_level, 2),
-		OPT__COMPR_HIDDEN('3', &compression_level, 3),
-		OPT__COMPR_HIDDEN('4', &compression_level, 4),
-		OPT__COMPR_HIDDEN('5', &compression_level, 5),
-		OPT__COMPR_HIDDEN('6', &compression_level, 6),
-		OPT__COMPR_HIDDEN('7', &compression_level, 7),
-		OPT__COMPR_HIDDEN('8', &compression_level, 8),
-		OPT__COMPR('9', &compression_level, N_("compress better"), 9),
+		OPT_NUMBER_CALLBACK(&compression_level,
+			N_("set compression level"), number_callback),
 		OPT_GROUP(""),
 		OPT_BOOL('l', "list", &list,
 			N_("list supported archive formats")),
@@ -617,7 +611,9 @@  static int parse_archive_args(int argc, const char **argv,

 	args->compression_level = Z_DEFAULT_COMPRESSION;
 	if (compression_level != -1) {
-		if ((*ar)->flags & ARCHIVER_WANT_COMPRESSION_LEVELS)
+		int levels_ok = (*ar)->flags & ARCHIVER_WANT_COMPRESSION_LEVELS;
+		int high_ok = (*ar)->flags & ARCHIVER_HIGH_COMPRESSION_LEVELS;
+		if (levels_ok && (compression_level <= 9 || high_ok))
 			args->compression_level = compression_level;
 		else {
 			die(_("Argument not supported for format '%s': -%d"),
diff --git a/archive.h b/archive.h
index 82b226011a..e3d04e8ab3 100644
--- a/archive.h
+++ b/archive.h
@@ -36,6 +36,7 @@  const char *archive_format_from_filename(const char *filename);

 #define ARCHIVER_WANT_COMPRESSION_LEVELS 1
 #define ARCHIVER_REMOTE 2
+#define ARCHIVER_HIGH_COMPRESSION_LEVELS 4
 struct archiver {
 	const char *name;
 	int (*write_archive)(const struct archiver *, struct archiver_args *);