diff mbox series

[v2] gitweb: redacted e-mail addresses feature.

Message ID pull.910.v2.git.1616297564158.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series [v2] gitweb: redacted e-mail addresses feature. | expand

Commit Message

Georgios Kontaxis March 21, 2021, 3:32 a.m. UTC
From: Georgios Kontaxis <geko1702+commits@99rst.org>

Gitweb extracts content from the Git log and makes it accessible
over HTTP. As a result, e-mail addresses found in commits are
exposed to web crawlers, which may not respect robots.txt.
This may result in unsolicited messages.
This patch adds a feature for redacting e-mail addresses
from the generated HTML and other output.

This feature does not prevent someone from downloading the
unredacted commit log, e.g., by cloning the repository, and
extracting information from it.
It aims to hinder the low-effort bulk collection of e-mail
addresses by web crawlers.

Changes since v1:
- Turned off the feature by default.
- Removed duplicate code.
- Added note about Gitweb consumers receiving redacted logs.

Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>
---
    gitweb: Redacted e-mail addresses feature.
    
    Gitweb extracts content from the Git log and makes it accessible over
    HTTP. As a result, e-mail addresses found in commits are exposed to web
    crawlers. This may result in unsolicited messages. This is a feature for
    redacting e-mail addresses from the generated HTML content.
    
    This feature does not prevent someone from downloading the unredacted
    commit log and extracting information from it. It aims to hinder the
    low-effort bulk collection of e-mail addresses by web crawlers.
    
    Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-910%2Fkontaxis%2Fkontaxis%2Femail_privacy-v2
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-910/kontaxis/kontaxis/email_privacy-v2
Pull-Request: https://github.com/gitgitgadget/git/pull/910

Range-diff vs v1:

 1:  6fe6ebdb8e59 ! 1:  74af11ca8bf2 gitweb: redacted e-mail addresses feature.
     @@ Commit message
      
          Gitweb extracts content from the Git log and makes it accessible
          over HTTP. As a result, e-mail addresses found in commits are
     -    exposed to web crawlers. This may result in unsolicited messages.
     -    This is a feature for redacting e-mail addresses from the generated
     -    HTML content.
     +    exposed to web crawlers and they may not respect robots.txt.
     +    This may result in unsolicited messages.
     +    This is a feature for redacting e-mail addresses
     +    from the generated HTML, etc. content.
      
          This feature does not prevent someone from downloading the
     -    unredacted commit log and extracting information from it.
     +    unredacted commit log, e.g., by cloning the repository, and
     +    extracting information from it.
          It aims to hinder the low-effort bulk collection of e-mail
          addresses by web crawlers.
      
     +    Changes since v1:
     +    - Turned off the feature by default.
     +    - Removed duplicate code.
     +    - Added note about Gitweb consumers receiving redacted logs.
     +
          Signed-off-by: Georgios Kontaxis <geko1702+commits@99rst.org>
      
       ## Documentation/gitweb.conf.txt ##
     @@ Documentation/gitweb.conf.txt: same as of the snippet above:
      +email_privacy::
      +    Redact e-mail addresses from the generated HTML, etc. content.
      +    This hides e-mail addresses found in the commit log from web crawlers.
     -+    Enabled by default.
     ++    Disabled by default.
      ++
     -+It is highly recommended to keep this feature enabled unless web crawlers
     -+are hindered in some other way. You can disable this feature as shown below:
     ++It is highly recommended to enable this feature unless web crawlers are
     ++hindered in some other way. Note that crawlers intent on harvesting e-mail
     ++addresses may disregard robots.txt. You can enable this feature like so:
      ++
      +---------------------------------------------------------------------------
     -+$feature{'email_privacy'}{'default'} = [0];
     ++$feature{'email_privacy'}{'default'} = [1];
      +---------------------------------------------------------------------------
     +++
     ++Note that if Gitweb is not the final step in a workflow then subsequent
     ++steps may misbehave because of the redacted information they receive.
      +
       
       EXAMPLES
     @@ gitweb/gitweb.perl: sub evaluate_uri {
       		'override' => 0,
       		'default' => []},
      +
     -+    # Redact e-mail addresses.
     ++	# Redact e-mail addresses.
      +
     -+    # To disable system wide have in $GITWEB_CONFIG
     -+    # $feature{'email_privacy'}{'default'} = [0];
     ++	# To enable system wide have in $GITWEB_CONFIG
     ++	# $feature{'email_privacy'}{'default'} = [1];
      +	'email_privacy' => {
      +		'sub' => sub { feature_bool('email_privacy', @_) },
      +		'override' => 0,
     -+		'default' => [1]},
     ++		'default' => [0]},
       );
       
       sub gitweb_get_feature {
     +@@ gitweb/gitweb.perl: sub parse_date {
     + 	return %date;
     + }
     + 
     ++sub hide_mailaddr_if_private {
     ++	my $line = shift;
     ++	return $line unless (gitweb_check_feature('email_privacy') &&
     ++						$line =~ m/^([^<]+) <([^>]*)>/);
     ++	return hide_mailaddr($line)
     ++}
     ++
     ++sub hide_mailaddr {
     ++	my $mailaddr = shift;
     ++	$mailaddr =~ s/<([^>]*)>/<private>/;
     ++	return $mailaddr;
     ++}
     ++
     + sub parse_tag {
     + 	my $tag_id = shift;
     + 	my %tag;
      @@ gitweb/gitweb.perl: sub parse_tag {
       			if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) {
       				$tag{'author_name'}  = $1;
       				$tag{'author_email'} = $2;
      +				if (gitweb_check_feature('email_privacy')) {
      +					$tag{'author_email'} = "private";
     -+					$tag{'author'} =~ s/<([^>]+)>/<private>/;
     ++					$tag{'author'} = hide_mailaddr($tag{'author'});
      +				}
       			} else {
       				$tag{'author_name'} = $tag{'author'};
     @@ gitweb/gitweb.perl: sub parse_commit_text {
       				$co{'author_email'} = $2;
      +				if (gitweb_check_feature('email_privacy')) {
      +					$co{'author_email'} = "private";
     -+					$co{'author'} =~ s/<([^>]+)>/<private>/;
     ++					$co{'author'} = hide_mailaddr($co{'author'});
      +				}
       			} else {
       				$co{'author_name'} = $co{'author'};
     @@ gitweb/gitweb.perl: sub parse_commit_text {
       				$co{'committer_email'} = $2;
      +				if (gitweb_check_feature('email_privacy')) {
      +					$co{'committer_email'} = "private";
     -+					$co{'committer'} =~ s/<([^>]+)>/<private>/;
     ++					$co{'committer'} = hide_mailaddr($co{'committer'});
      +				}
       			} else {
       				$co{'committer_name'} = $co{'committer'};
     @@ gitweb/gitweb.perl: sub parse_commit_text {
      +	# remove added spaces, redact e-mail addresses if applicable.
       	foreach my $line (@commit_lines) {
       		$line =~ s/^    //;
     -+		if (gitweb_check_feature('email_privacy') &&
     -+			$line =~ m/^([^<]+) <([^>]*)>/) {
     -+			$line =~ s/<([^>]+)>/<private>/;
     -+		}
     ++		$line = hide_mailaddr_if_private($line);
       	}
       	$co{'comment'} = \@commit_lines;
       
     @@ gitweb/gitweb.perl: sub git_commitdiff {
      -		local $/ = undef;
      -		print <$fd>;
      +		while (my $line = <$fd>) {
     -+			if (gitweb_check_feature('email_privacy') &&
     -+				$line =~ m/^([^<]+) <([^>]*)>/) {
     -+				$line =~ s/<([^>]+)>/<private>/;
     -+			}
     -+			print $line;
     ++			print hide_mailaddr_if_private($line);
      +		}
       		close $fd
       			or print "Reading git-format-patch failed\n";


 Documentation/gitweb.conf.txt | 16 +++++++++++++
 gitweb/gitweb.perl            | 42 ++++++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 3 deletions(-)


base-commit: a5828ae6b52137b913b978e16cd2334482eb4c1f
diff mbox series

Patch

diff --git a/Documentation/gitweb.conf.txt b/Documentation/gitweb.conf.txt
index 7963a79ba98b..b7af3240177d 100644
--- a/Documentation/gitweb.conf.txt
+++ b/Documentation/gitweb.conf.txt
@@ -896,6 +896,22 @@  same as of the snippet above:
 It is an error to specify a ref that does not pass "git check-ref-format"
 scrutiny. Duplicated values are filtered.
 
+email_privacy::
+    Redact e-mail addresses from the generated HTML and other output.
+    This hides e-mail addresses found in the commit log from web crawlers.
+    Disabled by default.
++
+It is highly recommended to enable this feature unless web crawlers are
+hindered in some other way. Note that crawlers intent on harvesting e-mail
+addresses may disregard robots.txt. You can enable this feature like so:
++
+---------------------------------------------------------------------------
+$feature{'email_privacy'}{'default'} = [1];
+---------------------------------------------------------------------------
++
+Note that if Gitweb is not the final step in a workflow then subsequent
+steps may misbehave because of the redacted information they receive.
+
 
 EXAMPLES
 --------
diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl
index 0959a782eccb..210228f32efd 100755
--- a/gitweb/gitweb.perl
+++ b/gitweb/gitweb.perl
@@ -569,6 +569,15 @@  sub evaluate_uri {
 		'sub' => \&feature_extra_branch_refs,
 		'override' => 0,
 		'default' => []},
+
+	# Redact e-mail addresses.
+
+	# To enable system wide have in $GITWEB_CONFIG
+	# $feature{'email_privacy'}{'default'} = [1];
+	'email_privacy' => {
+		'sub' => sub { feature_bool('email_privacy', @_) },
+		'override' => 0,
+		'default' => [0]},
 );
 
 sub gitweb_get_feature {
@@ -3449,6 +3458,19 @@  sub parse_date {
 	return %date;
 }
 
+sub hide_mailaddr_if_private {	# redact <user@host> tokens in free-form text
+	my $line = shift;	# one line of a commit message or of a patch
+	return $line unless gitweb_check_feature('email_privacy');
+	$line =~ s/<[^<>\@\s]+\@[^<>\s]+>/<private>/g; # '@' needed: keep <stdio.h>
+	return $line;	# every address on the line is now "<private>"
+}
+
+sub hide_mailaddr {	# replace the first <...> span of an ident with <private>
+	my ($ident) = @_;	# e.g. "A U Thor <author@example.com>"
+	$ident =~ s/<[^>]*>/<private>/;	# first span only; no /g on purpose
+	return $ident;
+}
+
 sub parse_tag {
 	my $tag_id = shift;
 	my %tag;
@@ -3471,6 +3493,10 @@  sub parse_tag {
 			if ($tag{'author'} =~ m/^([^<]+) <([^>]*)>/) {
 				$tag{'author_name'}  = $1;
 				$tag{'author_email'} = $2;
+				if (gitweb_check_feature('email_privacy')) {
+					$tag{'author_email'} = "private";
+					$tag{'author'} = hide_mailaddr($tag{'author'});
+				}
 			} else {
 				$tag{'author_name'} = $tag{'author'};
 			}
@@ -3519,6 +3545,10 @@  sub parse_commit_text {
 			if ($co{'author'} =~ m/^([^<]+) <([^>]*)>/) {
 				$co{'author_name'}  = $1;
 				$co{'author_email'} = $2;
+				if (gitweb_check_feature('email_privacy')) {
+					$co{'author_email'} = "private";
+					$co{'author'} = hide_mailaddr($co{'author'});
+				}
 			} else {
 				$co{'author_name'} = $co{'author'};
 			}
@@ -3529,6 +3559,10 @@  sub parse_commit_text {
 			if ($co{'committer'} =~ m/^([^<]+) <([^>]*)>/) {
 				$co{'committer_name'}  = $1;
 				$co{'committer_email'} = $2;
+				if (gitweb_check_feature('email_privacy')) {
+					$co{'committer_email'} = "private";
+					$co{'committer'} = hide_mailaddr($co{'committer'});
+				}
 			} else {
 				$co{'committer_name'} = $co{'committer'};
 			}
@@ -3568,9 +3602,10 @@  sub parse_commit_text {
 	if (! defined $co{'title'} || $co{'title'} eq "") {
 		$co{'title'} = $co{'title_short'} = '(no commit message)';
 	}
-	# remove added spaces
+	# remove added spaces, redact e-mail addresses if applicable.
 	foreach my $line (@commit_lines) {
 		$line =~ s/^    //;
+		$line = hide_mailaddr_if_private($line);
 	}
 	$co{'comment'} = \@commit_lines;
 
@@ -8060,8 +8095,9 @@  sub git_commitdiff {
 		close $fd
 			or print "Reading git-diff-tree failed\n";
 	} elsif ($format eq 'patch') {
-		local $/ = undef;
-		print <$fd>;
+		while (my $line = <$fd>) {
+			print hide_mailaddr_if_private($line);
+		}
 		close $fd
 			or print "Reading git-format-patch failed\n";
 	}