diff mbox series

[v4,05/10] git-contributors: better handling of hash mark/multiple emails

Message ID 20250213-update-release-v4-5-c06883a8bbd6@kernel.org (mailing list archive)
State New
Headers show
Series Update release.sh | expand

Commit Message

Andrey Albershteyn Feb. 13, 2025, 8:14 p.m. UTC
Better handling of hash marks, tags with multiple emails, and
unquoted names in emails. See comments in the script.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
 tools/git-contributors.py | 109 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 90 insertions(+), 19 deletions(-)

Comments

Darrick J. Wong Feb. 13, 2025, 9:47 p.m. UTC | #1
On Thu, Feb 13, 2025 at 09:14:27PM +0100, Andrey Albershteyn wrote:
> Better handling of hash mark, tags with multiple emails and not
> quoted names in emails. See comments in the script.
> 
> Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>

Matches my original git-contributors script, good enough for now
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  tools/git-contributors.py | 109 ++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 90 insertions(+), 19 deletions(-)
> 
> diff --git a/tools/git-contributors.py b/tools/git-contributors.py
> index 70ac8abb26c8ce65de336c5ae48abcfee39508b2..1a0f2b80e3dad9124b86b29f8507389ef91fe813 100755
> --- a/tools/git-contributors.py
> +++ b/tools/git-contributors.py
> @@ -37,35 +37,106 @@ class find_developers(object):
>  
>          self.r1 = re.compile(regex1, re.I)
>  
> +        # regex to guess if this is a list of multiple addresses.
> +        # Not sure why the initial "^.*" is needed here.
> +        self.r2 = re.compile(r'^.*,[^,]*@[^@]*,[^,]*@', re.I)
> +
> +        # regex to match on anything inside a pair of angle brackets
> +        self.r3 = re.compile(r'^.*<(.+)>', re.I)
> +
> +    def _handle_addr(self, addr):
> +        # The next split removes everything after an octothorpe (hash
> +        # mark), because someone could have provided an improperly
> +        # formatted email address:
> +        #
> +        # Cc: stable@vger.kernel.org # v6.19+
> +        #
> +        # This, according to my reading of RFC5322, is allowed because
> +        # octothorpes can be part of atom text.  However, it is
> +        # interpreted as if there weren't any whitespace
> +        # ("stable@vger.kernel.org#v6.19+").  The grammar allows for
> +        # this form, even though this is not a correct Internet domain
> +        # name.
> +        #
> +        # Worse, if you follow the format specified in the kernel's
> +        # SubmittingPatches file:
> +        #
> +        # Cc: <stable@vger.kernel.org> # v6.9
> +        #
> +        # emailutils will not know how to parse this, and returns empty
> +        # strings.  I think this is because the angle-addr
> +        # specification allows only whitespace between the closing
> +        # angle bracket and the CRLF.
> +        #
> +        # Hack around both problems by ignoring everything after an
> +        # octothorpe, no matter where it occurs in the string.  If
> +        # someone has one in their name or the email address, too bad.
> +        a = addr.split('#')[0]
> +
> +        # emailutils can extract email addresses from headers that
> +        # roughly follow the destination address field format:
> +        #
> +        # Reviewed-by: Bogus J. Simpson <bogus@simpson.com>
> +        # Reviewed-by: "Bogus J. Simpson" <bogus@simpson.com>
> +        # Reviewed-by: bogus@simpson.com
> +        #
> +        # Use it to extract the email address, because we don't care
> +        # about the display name.
> +        (name, addr) = email.utils.parseaddr(a)
> +        if DEBUG:
> +            print(f'A:{a}:NAME:{name}:ADDR:{addr}:')
> +        if len(addr) > 0:
> +            return addr
> +
> +        # If emailutils fails to find anything, let's see if there's
> +        # a sequence of characters within angle brackets and hope that
> +        # is an email address.  This works around things like:
> +        #
> +        # Reported-by: Xu, Wen <wen.xu@gatech.edu>
> +        #
> +        # Which should have had the name in quotations because there's
> +        # a comma.
> +        m = self.r3.match(a)
> +        if m:
> +            addr = m.expand(r'\g<1>')
> +            if DEBUG:
> +                print(f"M3:{addr}:M:{m}:")
> +            return addr
> +
> +        # No idea, just spit the whole thing out and hope for the best.
> +        return a
> +
>      def run(self, lines):
>          addr_list = []
>  
>          for line in lines:
>              l = line.strip()
>  
> -            # emailutils can handle abominations like:
> -            #
> -            # Reviewed-by: Bogus J. Simpson <bogus@simpson.com>
> -            # Reviewed-by: "Bogus J. Simpson" <bogus@simpson.com>
> -            # Reviewed-by: bogus@simpson.com
> -            # Cc: <stable@vger.kernel.org> # v6.9
> -            # Tested-by: Moo Cow <foo@bar.com> # powerpc
> +            # First, does this line match any of the headers we
> +            # know about?
>              m = self.r1.match(l)
>              if not m:
>                  continue
> -            (name, addr) = email.utils.parseaddr(m.expand(r'\g<2>'))
> +            rightside = m.expand(r'\g<2>')
>  
> -            # This last split removes anything after a hash mark,
> -            # because someone could have provided an improperly
> -            # formatted email address:
> -            #
> -            # Cc: stable@vger.kernel.org # v6.19+
> -            #
> -            # emailutils doesn't seem to catch this, and I can't
> -            # fully tell from RFC2822 that this isn't allowed.  I
> -            # think it is because dtext doesn't forbid spaces or
> -            # hash marks.
> -            addr_list.append(addr.split('#')[0])
> +            n = self.r2.match(rightside)
> +            if n:
> +                # Break the line into an array of addresses,
> +                # delimited by commas, then handle each
> +                # address.
> +                addrs = rightside.split(',')
> +                if DEBUG:
> +                    print(f"0LINE:{rightside}:ADDRS:{addrs}:M:{n}")
> +                for addr in addrs:
> +                    a = self._handle_addr(addr)
> +                    addr_list.append(a)
> +            else:
> +                # Otherwise treat the line as a single email
> +                # address.
> +                if DEBUG:
> +                    print(f"1LINE:{rightside}:M:{n}")
> +                a = self._handle_addr(rightside)
> +                addr_list.append(a)
>  
>          return sorted(set(addr_list))
>  
> 
> -- 
> 2.47.2
> 
>
diff mbox series

Patch

diff --git a/tools/git-contributors.py b/tools/git-contributors.py
index 70ac8abb26c8ce65de336c5ae48abcfee39508b2..1a0f2b80e3dad9124b86b29f8507389ef91fe813 100755
--- a/tools/git-contributors.py
+++ b/tools/git-contributors.py
@@ -37,35 +37,106 @@  class find_developers(object):
 
         self.r1 = re.compile(regex1, re.I)
 
+        # regex to guess if this is a list of multiple addresses.
+        # The leading ".*" is needed because match() anchors at the start.
+        self.r2 = re.compile(r'^.*,[^,]*@[^@]*,[^,]*@', re.I)
+
+        # regex to match on anything inside a pair of angle brackets
+        self.r3 = re.compile(r'^.*<(.+)>', re.I)
+
+    def _handle_addr(self, addr):
+        # The next split removes everything after an octothorpe (hash
+        # mark), because someone could have provided an improperly
+        # formatted email address:
+        #
+        # Cc: stable@vger.kernel.org # v6.19+
+        #
+        # This, according to my reading of RFC5322, is allowed because
+        # octothorpes can be part of atom text.  However, it is
+        # interpreted as if there weren't any whitespace
+        # ("stable@vger.kernel.org#v6.19+").  The grammar allows for
+        # this form, even though this is not a correct Internet domain
+        # name.
+        #
+        # Worse, if you follow the format specified in the kernel's
+        # SubmittingPatches file:
+        #
+        # Cc: <stable@vger.kernel.org> # v6.9
+        #
+        # emailutils will not know how to parse this, and returns empty
+        # strings.  I think this is because the angle-addr
+        # specification allows only whitespace between the closing
+        # angle bracket and the CRLF.
+        #
+        # Hack around both problems by ignoring everything after an
+        # octothorpe, no matter where it occurs in the string.  If
+        # someone has one in their name or the email address, too bad.
+        a = addr.split('#')[0]
+
+        # emailutils can extract email addresses from headers that
+        # roughly follow the destination address field format:
+        #
+        # Reviewed-by: Bogus J. Simpson <bogus@simpson.com>
+        # Reviewed-by: "Bogus J. Simpson" <bogus@simpson.com>
+        # Reviewed-by: bogus@simpson.com
+        #
+        # Use it to extract the email address, because we don't care
+        # about the display name.
+        (name, addr) = email.utils.parseaddr(a)
+        if DEBUG:
+            print(f'A:{a}:NAME:{name}:ADDR:{addr}:')
+        if len(addr) > 0:
+            return addr
+
+        # If emailutils fails to find anything, let's see if there's
+        # a sequence of characters within angle brackets and hope that
+        # is an email address.  This works around things like:
+        #
+        # Reported-by: Xu, Wen <wen.xu@gatech.edu>
+        #
+        # Which should have had the name in quotations because there's
+        # a comma.
+        m = self.r3.match(a)
+        if m:
+            addr = m.expand(r'\g<1>')
+            if DEBUG:
+                print(f"M3:{addr}:M:{m}:")
+            return addr
+
+        # No idea, just spit the whole thing out and hope for the best.
+        return a
+
     def run(self, lines):
         addr_list = []
 
         for line in lines:
             l = line.strip()
 
-            # emailutils can handle abominations like:
-            #
-            # Reviewed-by: Bogus J. Simpson <bogus@simpson.com>
-            # Reviewed-by: "Bogus J. Simpson" <bogus@simpson.com>
-            # Reviewed-by: bogus@simpson.com
-            # Cc: <stable@vger.kernel.org> # v6.9
-            # Tested-by: Moo Cow <foo@bar.com> # powerpc
+            # First, does this line match any of the headers we
+            # know about?
             m = self.r1.match(l)
             if not m:
                 continue
-            (name, addr) = email.utils.parseaddr(m.expand(r'\g<2>'))
+            rightside = m.expand(r'\g<2>')
 
-            # This last split removes anything after a hash mark,
-            # because someone could have provided an improperly
-            # formatted email address:
-            #
-            # Cc: stable@vger.kernel.org # v6.19+
-            #
-            # emailutils doesn't seem to catch this, and I can't
-            # fully tell from RFC2822 that this isn't allowed.  I
-            # think it is because dtext doesn't forbid spaces or
-            # hash marks.
-            addr_list.append(addr.split('#')[0])
+            n = self.r2.match(rightside)
+            if n:
+                # Break the line into an array of addresses,
+                # delimited by commas, then handle each
+                # address.
+                addrs = rightside.split(',')
+                if DEBUG:
+                    print(f"0LINE:{rightside}:ADDRS:{addrs}:M:{n}")
+                for addr in addrs:
+                    a = self._handle_addr(addr)
+                    addr_list.append(a)
+            else:
+                # Otherwise treat the line as a single email
+                # address.
+                if DEBUG:
+                    print(f"1LINE:{rightside}:M:{n}")
+                a = self._handle_addr(rightside)
+                addr_list.append(a)
 
         return sorted(set(addr_list))