Skip to content

Commit 97ab29a

Browse files
committed
heuristics: refactoring, extracting rule package
Signed-off-by: Alexander Bezzubov <[email protected]>
1 parent c4f3dbe commit 97ab29a

File tree

8 files changed

+1076
-1051
lines changed

8 files changed

+1076
-1051
lines changed

data/content.go

Lines changed: 443 additions & 447 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

data/doc.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Package data contains only auto-generated data-structures for all the language
2+
// identification strategies from the Linguist project sources.
3+
package data

data/heuristics.go

Lines changed: 15 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,24 @@
11
package data
22

3-
// Implmentation of a rule-based content heuristics matching engine.
4-
// Every Rule defines a patterns that content must match in order to be identifed as
5-
// belonging to a language(s).
6-
// It is used to generate a content.go code for disambiguation of languages with
7-
// colliding extensions based on regexps from Linguist.
3+
import "gopkg.in/src-d/enry.v1/data/rule"
84

9-
import "regexp"
5+
// Heuristics implements a rule-based content matching engine.
106

11-
type (
12-
// Heuristics consists of a number of sequntially applied Matchers.
13-
Heuristics []Matcher
7+
// Heuristics is a number of sequentially applied rule.Heuristic where a
8+
// matching one disambiguates language(s) for a single file extension.
9+
type Heuristics []rule.Heuristic
1410

15-
// Matcher checks if a given data matches (number of) patterns.
16-
Matcher interface {
17-
Match(data []byte) bool
18-
}
19-
20-
// Languages incapsulates data common to every Rule: number of languages
21-
// it identifies.
22-
Languages struct {
23-
langs []string
24-
}
25-
26-
// Rule interface provides access to a languages that this rule identifies.
27-
Rule interface {
28-
GetLanguages() []string
29-
}
30-
)
31-
32-
// Match returns languages identified by the matching rules of the heuristic.
33-
func (h *Heuristics) Match(data []byte) []string {
11+
// Match returns languages identified by the matching rule of the heuristic.
12+
func (hs *Heuristics) Match(data []byte) []string {
3413
var matchedLangs []string
35-
for _, matcher := range *h {
36-
if matcher.Match(data) {
37-
for _, langOrAlias := range matcher.(Rule).GetLanguages() {
14+
for _, heuristic := range *hs {
15+
if heuristic.Match(data) {
16+
for _, langOrAlias := range heuristic.Languages() {
3817
lang, ok := LanguageByAlias(langOrAlias)
3918
if !ok { // should never happen
40-
// language name/alias in heuristics.yml is not consistent with languages.yml
41-
// but we do not surface any error on the API
19+
// reaching here means language name/alias in heuristics.yml
20+
// is not consistent with languages.yml
21+
// but we do not surface any such error at the API
4222
continue
4323
}
4424
matchedLangs = append(matchedLangs, lang)
@@ -50,71 +30,6 @@ func (h *Heuristics) Match(data []byte) []string {
5030
}
5131

5232
// matchString is a convenience used only in tests.
53-
func (h *Heuristics) matchString(data string) []string {
54-
return h.Match([]byte(data))
55-
}
56-
57-
// GetLanguages returns languages, defined by this data.Rule.
58-
func (l *Languages) GetLanguages() []string {
59-
return l.langs
60-
}
61-
62-
// OrRule matches if a single matching pattern exists.
63-
// It defines only one pattern as it relis on compile-time optimization that
64-
// represtes union with | in a single regexp pattern.
65-
type OrRule struct {
66-
*Languages
67-
Pattern *regexp.Regexp
68-
}
69-
70-
// Match implements data.Matcher.
71-
func (r *OrRule) Match(data []byte) bool {
72-
return r.Pattern.Match(data)
73-
}
74-
75-
// AndRule matches if all of the patterns match.
76-
type AndRule struct {
77-
*Languages
78-
Patterns []Matcher
79-
}
80-
81-
// Match implements data.Matcher.
82-
func (r *AndRule) Match(data []byte) bool {
83-
allMatch := true
84-
for _, p := range r.Patterns {
85-
if !p.Match(data) {
86-
allMatch = false
87-
break
88-
}
89-
}
90-
return allMatch
91-
}
92-
93-
// NotRule matches if none of the patterns match.
94-
type NotRule struct {
95-
*Languages
96-
Patterns []*regexp.Regexp
97-
}
98-
99-
// Match implements data.Matcher.
100-
func (r *NotRule) Match(data []byte) bool {
101-
allDontMatch := true
102-
for _, p := range r.Patterns {
103-
if p.Match(data) {
104-
allDontMatch = false
105-
break
106-
}
107-
}
108-
return allDontMatch
109-
}
110-
111-
// AlwaysRule always matches.
112-
// Used as default fallback.
113-
type AlwaysRule struct {
114-
*Languages
115-
}
116-
117-
// Match implements data.Matcher.
118-
func (r *AlwaysRule) Match(data []byte) bool {
119-
return true
33+
func (hs *Heuristics) matchString(data string) []string {
34+
return hs.Match([]byte(data))
12035
}

data/heuristics_test.go

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -5,61 +5,57 @@ import (
55
"testing"
66

77
"github.com/stretchr/testify/assert"
8+
"gopkg.in/src-d/enry.v1/data/rule"
89
)
910

1011
var testContentHeuristics = map[string]*Heuristics{
1112
".md": &Heuristics{ // final pattern for parsed YAML rule
12-
&OrRule{
13-
&Languages{[]string{"Markdown"}},
13+
rule.Or(
14+
rule.MatchingLanguages("Markdown"),
1415
regexp.MustCompile(`(^[-A-Za-z0-9=#!\*\[|>])|<\/ | \A\z`),
15-
},
16-
&OrRule{
17-
&Languages{[]string{"GCC Machine Description"}},
16+
),
17+
rule.Or(
18+
rule.MatchingLanguages("GCC Machine Description"),
1819
regexp.MustCompile(`^(;;|\(define_)`),
19-
},
20-
&AlwaysRule{
21-
&Languages{[]string{"Markdown"}},
22-
},
20+
),
21+
rule.Always(
22+
rule.MatchingLanguages("Markdown"),
23+
),
2324
},
2425
".ms": &Heuristics{
2526
// Order defines precedence: And, Or, Not, Named, Always
26-
&AndRule{
27-
&Languages{[]string{"Unix Assembly"}},
28-
[]Matcher{
29-
&NotRule{
30-
nil,
31-
[]*regexp.Regexp{regexp.MustCompile(`/\*`)},
32-
},
33-
&OrRule{
34-
nil,
35-
regexp.MustCompile(`^\s*\.(?:include\s|globa?l\s|[A-Za-z][_A-Za-z0-9]*:)`),
36-
},
37-
},
38-
},
39-
&OrRule{
40-
&Languages{[]string{"Roff"}},
27+
rule.And(
28+
rule.MatchingLanguages("Unix Assembly"),
29+
rule.Not(nil, regexp.MustCompile(`/\*`)),
30+
rule.Or(
31+
nil,
32+
regexp.MustCompile(`^\s*\.(?:include\s|globa?l\s|[A-Za-z][_A-Za-z0-9]*:)`),
33+
),
34+
),
35+
rule.Or(
36+
rule.MatchingLanguages("Roff"),
4137
regexp.MustCompile(`^[.''][A-Za-z]{2}(\s|$)`),
42-
},
43-
&AlwaysRule{
44-
&Languages{[]string{"MAXScript"}},
45-
},
38+
),
39+
rule.Always(
40+
rule.MatchingLanguages("MAXScript"),
41+
),
4642
},
4743
}
4844

49-
func TestContentHeuristics_MatchingAlways(t *testing.T) {
45+
func TestContentHeuristic_MatchingAlways(t *testing.T) {
5046
lang := testContentHeuristics[".md"].matchString("")
5147
assert.Equal(t, []string{"Markdown"}, lang)
5248

5349
lang = testContentHeuristics[".ms"].matchString("")
5450
assert.Equal(t, []string{"MAXScript"}, lang)
5551
}
5652

57-
func TestContentHeuristics_MatchingAnd(t *testing.T) {
53+
func TestContentHeuristic_MatchingAnd(t *testing.T) {
5854
lang := testContentHeuristics[".md"].matchString(";;")
5955
assert.Equal(t, []string{"GCC Machine Description"}, lang)
6056
}
6157

62-
func TestContentHeuristics_MatchingOr(t *testing.T) {
58+
func TestContentHeuristic_MatchingOr(t *testing.T) {
6359
lang := testContentHeuristics[".ms"].matchString(" .include \"math.s\"")
6460
assert.Equal(t, []string{"Unix Assembly"}, lang)
6561
}

data/rule/rule.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// Package rule contains rule-based heuristic implementations.
2+
// It is used in the generated code in content.go for disambiguation of languages
3+
// with colliding extensions based on regexps from Linguist data.
4+
package rule
5+
6+
import (
7+
"regexp"
8+
)
9+
10+
// Heuristic consist of a number of rules where each, if matches,
11+
// identifes content as belonging to a programming language(s).
12+
type Heuristic interface {
13+
Matcher
14+
Languages() []string
15+
}
16+
17+
// Matcher checks whether the data matches one or more patterns.
// Every rule in this package implements this interface: a rule is a Matcher
// that identifies the given programming language(s) when it matches.
type Matcher interface {
	// Match reports whether data satisfies the matcher.
	Match(data []byte) bool
}
23+
24+
// languages encapsulates the data common to every rule: all the languages
// that the rule identifies.
type languages struct {
	langs []string
}

// Languages returns all languages identified by the rule.
//
// It is safe to call on a nil receiver, in which case it returns nil. Rules
// nested inside other rules are constructed with nil languages, so a rule
// embedding a nil *languages must not panic when asked for its languages.
func (l *languages) Languages() []string {
	if l == nil {
		return nil
	}
	return l.langs
}

// MatchingLanguages is a helper to create new languages from the given
// language names.
func MatchingLanguages(langs ...string) *languages {
	return &languages{langs}
}
39+
40+
// Implements a Heuristic.
41+
type or struct {
42+
*languages
43+
Pattern *regexp.Regexp
44+
}
45+
46+
// Or rule matches, if a single matching pattern exists.
47+
// It defines only one pattern as it relies on compile-time optimization that
48+
// represtes union with | in a single regexp.
49+
func Or(l *languages, r *regexp.Regexp) *or {
50+
return &or{l, r}
51+
}
52+
53+
// Match implements rule.Matcher.
54+
func (r *or) Match(data []byte) bool {
55+
return r.Pattern.Match(data)
56+
}
57+
58+
// Implements a Heuristic.
59+
type and struct {
60+
*languages
61+
Patterns []Matcher
62+
}
63+
64+
// And rule matches, if each of the patterns does match.
65+
func And(l *languages, m ...Matcher) *and {
66+
return &and{l, m}
67+
}
68+
69+
// Match implements data.Matcher.
70+
func (r *and) Match(data []byte) bool {
71+
allMatch := true
72+
for _, p := range r.Patterns {
73+
if !p.Match(data) {
74+
allMatch = false
75+
break
76+
}
77+
}
78+
return allMatch
79+
}
80+
81+
// Implements a Heuristic.
82+
type not struct {
83+
*languages
84+
Patterns []*regexp.Regexp
85+
}
86+
87+
// Not rule matches if none of the patterns match.
88+
func Not(l *languages, r ...*regexp.Regexp) *not {
89+
return &not{l, r}
90+
}
91+
92+
// Match implements data.Matcher.
93+
func (r *not) Match(data []byte) bool {
94+
allDontMatch := true
95+
for _, p := range r.Patterns {
96+
if p.Match(data) {
97+
allDontMatch = false
98+
break
99+
}
100+
}
101+
return allDontMatch
102+
}
103+
104+
// Implements a Heuristic.
105+
type always struct {
106+
*languages
107+
}
108+
109+
// Always rule always matches. Often is used as a default fallback.
110+
func Always(l *languages) *always {
111+
return &always{l}
112+
}
113+
114+
// Match implements Matcher.
115+
func (r *always) Match(data []byte) bool {
116+
return true
117+
}

0 commit comments

Comments
 (0)