diff mbox series

[v3] userdiff: add built-in pattern for rust

Message ID 20190520170403.16672-1-marcandre.lureau@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v3] userdiff: add built-in pattern for rust | expand

Commit Message

Marc-André Lureau May 20, 2019, 5:04 p.m. UTC
From: Marc-André Lureau <mlureau@redhat.com>

This adds xfuncname and word_regex patterns for Rust, a quite
popular programming language. It also includes test cases for the
xfuncname regex (t4018) and updated documentation.

The word_regex pattern finds identifiers, integers, floats and
operators, according to the Rust Reference Book.

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 Documentation/gitattributes.txt | 2 ++
 t/t4018-diff-funcname.sh        | 1 +
 t/t4018/rust-fn                 | 5 +++++
 t/t4018/rust-impl               | 5 +++++
 t/t4018/rust-struct             | 5 +++++
 t/t4018/rust-trait              | 5 +++++
 t/t4018/rust-unsafe             | 6 ++++++
 userdiff.c                      | 6 ++++++
 8 files changed, 35 insertions(+)
 create mode 100644 t/t4018/rust-fn
 create mode 100644 t/t4018/rust-impl
 create mode 100644 t/t4018/rust-struct
 create mode 100644 t/t4018/rust-trait
 create mode 100644 t/t4018/rust-unsafe


base-commit: aa25c82427ae70aebf3b8f970f2afd54e9a2a8c6

Comments

Johannes Sixt May 20, 2019, 7:52 p.m. UTC | #1
Am 20.05.19 um 19:04 schrieb marcandre.lureau@redhat.com:
> From: Marc-André Lureau <mlureau@redhat.com>
> 
> This adds xfuncname and word_regex patterns for Rust, a quite
> popular programming language. It also includes test cases for the
> xfuncname regex (t4018) and updated documentation.
> 
> The word_regex pattern finds identifiers, integers, floats and
> operators, according to the Rust Reference Book.

This looks very good. I have a few questions regarding the hunk header
regex.

> diff --git a/userdiff.c b/userdiff.c
> index 3a78fbf504..e45b5920c6 100644
> --- a/userdiff.c
> +++ b/userdiff.c
> @@ -130,6 +130,12 @@ PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$",
>  	 "(@|@@|\\$)?[a-zA-Z_][a-zA-Z0-9_]*"
>  	 "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?."
>  	 "|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"),
> +PATTERNS("rust",
> +	 "^[\t ]*((pub(\\([^\\)]+\\))?[\t ]+)?((async|const|unsafe|extern([\t ]+\"[^\"]+\"))[\t ]+)?(struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+[^;]*)$",

This pattern matches only if there is no semicolon behind the signal
words on the line. Is that important? Can you show a (test) case where a
line with a semicolon would be picked incorrectly if '[^;]*' were
simplified to '.*'?

You permit whitespace at the beginning of an anchor line. I guess that
is to catch nested definitions. Or is it common style to write indented
code? Can you show a test case where this makes sense?

Would it be sufficient to simplify

    (struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+
to
    (struct|enum|union|mod|trait|fn|impl)[< \t]+

as it is only important to exclude identifiers that start with these
keywords.

> +	 /* -- */
> +	 "[a-zA-Z_][a-zA-Z0-9_]*"
> +	 "|[0-9][0-9_a-fA-Fiosuxz]*(\\.([0-9]*[eE][+-]?)?[0-9_fF]*)?"
> +	 "|[-+*\\/<>%&^|=!:]=|<<=?|>>=?|&&|\\|\\||->|=>|\\.{2}=|\\.{3}|::"),
>  PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
>  	 "[={}\"]|[^={}\" \t]+"),
>  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
> 
> base-commit: aa25c82427ae70aebf3b8f970f2afd54e9a2a8c6

-- Hannes
Marc-André Lureau May 21, 2019, 10:57 a.m. UTC | #2
Hi

On Mon, May 20, 2019 at 9:52 PM Johannes Sixt <j6t@kdbg.org> wrote:
>
> Am 20.05.19 um 19:04 schrieb marcandre.lureau@redhat.com:
> > From: Marc-André Lureau <mlureau@redhat.com>
> >
> > This adds xfuncname and word_regex patterns for Rust, a quite
> > popular programming language. It also includes test cases for the
> > xfuncname regex (t4018) and updated documentation.
> >
> > The word_regex pattern finds identifiers, integers, floats and
> > operators, according to the Rust Reference Book.
>
> This looks very good. I have a few questions regarding the hunk header
> regex.
>
> > diff --git a/userdiff.c b/userdiff.c
> > index 3a78fbf504..e45b5920c6 100644
> > --- a/userdiff.c
> > +++ b/userdiff.c
> > @@ -130,6 +130,12 @@ PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$",
> >        "(@|@@|\\$)?[a-zA-Z_][a-zA-Z0-9_]*"
> >        "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?."
> >        "|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"),
> > +PATTERNS("rust",
> > +      "^[\t ]*((pub(\\([^\\)]+\\))?[\t ]+)?((async|const|unsafe|extern([\t ]+\"[^\"]+\"))[\t ]+)?(struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+[^;]*)$",
>
> This pattern matches only if there is no semicolon behind the signal
> words on the line. Is that important? Can you show a (test) case where a
> line with a semicolon would be picked incorrectly if '[^;]*' were
> simplified to '.*'?


Ok, I am adding:

trait RIGHT {

    fn new(name: &'static str) -> Self;

    fn ChangeMe(&self) {
        // should skip "new", and return trait name
    }
}

> You permit whitespace at the beginning of an anchor line. I guess that
> is to catch nested definitions. Or is it common style to write indented
> code? Can you show a test case where this makes sense?
>

sure, I thought it was already covered.

fn foo() {
    fn RIGHT() {
        // must catch nested function
        ChangeMe;
    }
}

(a simpler example would be a method implementation)

> Would it be sufficient to simplify
>
>     (struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+
> to
>     (struct|enum|union|mod|trait|fn|impl)[< \t]+
>
> as it is only important to exclude identifiers that start with these
> keywords.

I think that would be fine, ok I am changing it

>
> > +      /* -- */
> > +      "[a-zA-Z_][a-zA-Z0-9_]*"
> > +      "|[0-9][0-9_a-fA-Fiosuxz]*(\\.([0-9]*[eE][+-]?)?[0-9_fF]*)?"
> > +      "|[-+*\\/<>%&^|=!:]=|<<=?|>>=?|&&|\\|\\||->|=>|\\.{2}=|\\.{3}|::"),
> >  PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
> >        "[={}\"]|[^={}\" \t]+"),
> >  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
> >
> > base-commit: aa25c82427ae70aebf3b8f970f2afd54e9a2a8c6
>
> -- Hannes
Junio C Hamano May 28, 2019, 4:34 p.m. UTC | #3
Marc-André Lureau <marcandre.lureau@redhat.com> writes:

> Ok, I am adding:
> ...
> sure, I thought it was already covered.
> ...
> I think that would be fine, ok I am changing it

Thanks, both.

The previous round has already hit 'next' (which means that we won't
replacing the patch wholesale), so whatever you do, please make the
update relative to / on top of what is queued as d74e7860
("userdiff: add built-in pattern for rust", 2019-05-17).
Johannes Sixt May 28, 2019, 8:31 p.m. UTC | #4
Am 28.05.19 um 18:34 schrieb Junio C Hamano:
> Marc-André Lureau <marcandre.lureau@redhat.com> writes:
> 
>> Ok, I am adding:
>> ...
>> sure, I thought it was already covered.
>> ...
>> I think that would be fine, ok I am changing it
> 
> Thanks, both.
> 
> The previous round has already hit 'next' (which means that we won't
> replacing the patch wholesale), so whatever you do, please make the
> update relative to / on top of what is queued as d74e7860
> ("userdiff: add built-in pattern for rust", 2019-05-17).

Ok. So, Marc-André, would you mind resending an incremental patch,
because the word-regexp that is currently in 'next' would catch certain
expressions that should be multiple words as a single word?

-- Hannes
Marc-André Lureau May 28, 2019, 9:01 p.m. UTC | #5
Hi Johannes

On Tue, May 28, 2019 at 10:31 PM Johannes Sixt <j6t@kdbg.org> wrote:
>
> Am 28.05.19 um 18:34 schrieb Junio C Hamano:
> > Marc-André Lureau <marcandre.lureau@redhat.com> writes:
> >
> >> Ok, I am adding:
> >> ...
> >> sure, I thought it was already covered.
> >> ...
> >> I think that would be fine, ok I am changing it
> >
> > Thanks, both.
> >
> > The previous round has already hit 'next' (which means that we won't
> > replacing the patch wholesale), so whatever you do, please make the
> > update relative to / on top of what is queued as d74e7860
> > ("userdiff: add built-in pattern for rust", 2019-05-17).
>
> Ok. So, Marc-André, would you mind resending an incremental patch,
> because the word-regexp that is currently in 'next' would catch certain
> expressions that should be multiple words as a single word?

Beside a few extras tests, the diff is:

@@ -134,11 +134,10 @@ PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$",
         "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?."
         "|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"),
 PATTERNS("rust",
-        "^[\t ]*((pub(\\([^\\)]+\\))?[\t
]+)?((async|const|unsafe|extern([\t ]+\"[^\"]+\"))[\t
]+)?(struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+[^;]*)$",
+        "^[\t ]*((pub(\\([^\\)]+\\))?[\t
]+)?((async|const|unsafe|extern([\t ]+\"[^\"]+\"))[\t
]+)?(struct|enum|union|mod|trait|fn|impl)[< \t]+[^;]*)$",
         /* -- */
         "[a-zA-Z_][a-zA-Z0-9_]*"
-        "|[-+_0-9.eE]+(f32|f64|u8|u16|u32|u64|u128|usize|i8|i16|i32|i64|i128|isize)?"
-        "|0[box]?[0-9a-fA-F_]+(u8|u16|u32|u64|u128|usize|i8|i16|i32|i64|i128|isize)?"
+        "|[0-9][0-9_a-fA-Fiosuxz]*(\\.([0-9]*[eE][+-]?)?[0-9_fF]*)?"
         "|[-+*\\/<>%&^|=!:]=|<<=?|>>=?|&&|\\|\\||->|=>|\\.{2}=|\\.{3}|::"),

So it is simplifying handling of type parameters, and lowering the
complexity of literal numbers.

Both of these changes were based on your recommendations. Would you
mind sending a follow-up patch yourself?

I can send a seperate patch for the 3 extra tests.

thanks
diff mbox series

Patch

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index 4fb20cd0e9..07da08fb27 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -833,6 +833,8 @@  patterns are available:
 
 - `ruby` suitable for source code in the Ruby language.
 
+- `rust` suitable for source code in the Rust language.
+
 - `tex` suitable for source code for LaTeX documents.
 
 
diff --git a/t/t4018-diff-funcname.sh b/t/t4018-diff-funcname.sh
index 22f9f88f0a..9261d6d3a0 100755
--- a/t/t4018-diff-funcname.sh
+++ b/t/t4018-diff-funcname.sh
@@ -43,6 +43,7 @@  diffpatterns="
 	php
 	python
 	ruby
+	rust
 	tex
 	custom1
 	custom2
diff --git a/t/t4018/rust-fn b/t/t4018/rust-fn
new file mode 100644
index 0000000000..cbe02155f1
--- /dev/null
+++ b/t/t4018/rust-fn
@@ -0,0 +1,5 @@ 
+pub(self) fn RIGHT<T>(x: &[T]) where T: Debug {
+    let _ = x;
+    // a comment
+    let a = ChangeMe;
+}
diff --git a/t/t4018/rust-impl b/t/t4018/rust-impl
new file mode 100644
index 0000000000..09df3cd93b
--- /dev/null
+++ b/t/t4018/rust-impl
@@ -0,0 +1,5 @@ 
+impl<'a, T: AsRef<[u8]>>  std::RIGHT for Git<'a> {
+
+    pub fn ChangeMe(&self) -> () {
+    }
+}
diff --git a/t/t4018/rust-struct b/t/t4018/rust-struct
new file mode 100644
index 0000000000..76aff1c0d8
--- /dev/null
+++ b/t/t4018/rust-struct
@@ -0,0 +1,5 @@ 
+#[derive(Debug)]
+pub(super) struct RIGHT<'a> {
+    name: &'a str,
+    age: ChangeMe,
+}
diff --git a/t/t4018/rust-trait b/t/t4018/rust-trait
new file mode 100644
index 0000000000..ea397f09ed
--- /dev/null
+++ b/t/t4018/rust-trait
@@ -0,0 +1,5 @@ 
+unsafe trait RIGHT<T> {
+    fn len(&self) -> u32;
+    fn ChangeMe(&self, n: u32) -> T;
+    fn iter<F>(&self, f: F) where F: Fn(T);
+}
diff --git a/t/t4018/rust-unsafe b/t/t4018/rust-unsafe
new file mode 100644
index 0000000000..fd4661a934
--- /dev/null
+++ b/t/t4018/rust-unsafe
@@ -0,0 +1,6 @@ 
+unsafe fn RIGHT(inc: u32) {
+    unsafe {
+        // don't catch unsafe block
+        ChangeMe += inc;
+    }
+}
diff --git a/userdiff.c b/userdiff.c
index 3a78fbf504..e45b5920c6 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -130,6 +130,12 @@  PATTERNS("ruby", "^[ \t]*((class|module|def)[ \t].*)$",
 	 "(@|@@|\\$)?[a-zA-Z_][a-zA-Z0-9_]*"
 	 "|[-+0-9.e]+|0[xXbB]?[0-9a-fA-F]+|\\?(\\\\C-)?(\\\\M-)?."
 	 "|//=?|[-+*/<>%&^|=!]=|<<=?|>>=?|===|\\.{1,3}|::|[!=]~"),
+PATTERNS("rust",
+	 "^[\t ]*((pub(\\([^\\)]+\\))?[\t ]+)?((async|const|unsafe|extern([\t ]+\"[^\"]+\"))[\t ]+)?(struct|enum|union|mod|trait|fn|impl(<.+>)?)[ \t]+[^;]*)$",
+	 /* -- */
+	 "[a-zA-Z_][a-zA-Z0-9_]*"
+	 "|[0-9][0-9_a-fA-Fiosuxz]*(\\.([0-9]*[eE][+-]?)?[0-9_fF]*)?"
+	 "|[-+*\\/<>%&^|=!:]=|<<=?|>>=?|&&|\\|\\||->|=>|\\.{2}=|\\.{3}|::"),
 PATTERNS("bibtex", "(@[a-zA-Z]{1,}[ \t]*\\{{0,1}[ \t]*[^ \t\"@',\\#}{~%]*).*$",
 	 "[={}\"]|[^={}\" \t]+"),
 PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",