Skip to content

Commit 682f026

Browse files
committed
tokenizer: cleanup & attributions
Signed-off-by: Alexander Bezzubov <[email protected]>
1 parent d45cddf commit 682f026

File tree

4 files changed

+4
-58
lines changed

4 files changed

+4
-58
lines changed

internal/tokenizer/flex/linguist.h

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
12
enum tokenizer_type {
23
NO_ACTION,
34
REGULAR_TOKEN,
@@ -10,11 +11,5 @@ struct tokenizer_extra {
1011
enum tokenizer_type type;
1112
};
1213

13-
// #include <stddef.h>
14-
15-
// #ifdef __APPLE__
16-
// char *strndup(const char *s1, size_t n);
17-
// #elif defined(_WIN32) || defined(_WIN64)
18-
// char *strndup(const char *s1, size_t n);
19-
// #pragma warning (disable: 4244)
20-
// #endif // _WIN32 || _WIN64
14+
// TODO(bzz) port Win support from
15+
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0

internal/tokenizer/flex/tokenize_c.go

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,15 @@ package flex
99
import "C"
1010
import "unsafe"
1111

12-
// TokenizeC is only calling a C-flex based tokenizer from linguist
13-
func TokenizeC(content []byte) []string {
14-
cs := C.CBytes(content)
15-
defer C.free(unsafe.Pointer(cs))
16-
// C.tokenizer_extract_tokens((*C.char)(cs))
17-
return nil
18-
}
19-
2012
const maxTokenLen = 32
2113

2214

2315
// TokenizeFlex implements the tokenizer by calling Flex-generated code from linguist in C
16+
// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
2417
func TokenizeFlex(content []byte) []string {
2518
var buf C.YY_BUFFER_STATE
2619
var scanner C.yyscan_t
2720
var extra C.struct_tokenizer_extra
28-
// var scanner *C.yyscan_t = (*C.yyscan_t)(C.malloc(C.sizeof_yyscan_t))
29-
// var extra *C.struct_tokenizer_extra = (*C.struct_tokenizer_extra)(C.malloc(C.sizeof_struct_tokenizer_extra))
3021
var _len C.ulong
3122
var r C.int
3223

@@ -50,7 +41,6 @@ func TokenizeFlex(content []byte) []string {
5041
_len = C.strlen(extra.token)
5142
if (_len <= maxTokenLen) {
5243
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
53-
//rb_ary_push(ary, rb_str_new(extra.token, len))
5444
}
5545
C.free(unsafe.Pointer(extra.token))
5646
break
@@ -59,9 +49,6 @@ func TokenizeFlex(content []byte) []string {
5949
if (_len <= maxTokenLen) {
6050
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
6151
ary = append(ary, s)
62-
//s = rb_str_new2("SHEBANG#!");
63-
//rb_str_cat(s, extra.token, len);
64-
//rb_ary_push(ary, s);
6552
}
6653
C.free(unsafe.Pointer(extra.token))
6754
break
@@ -70,9 +57,6 @@ func TokenizeFlex(content []byte) []string {
7057
if (_len <= maxTokenLen) {
7158
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
7259
ary = append(ary, s)
73-
//s = rb_str_new(extra.token, len);
74-
//rb_str_cat2(s, ">");
75-
//rb_ary_push(ary, s);
7660
}
7761
C.free(unsafe.Pointer(extra.token))
7862
break
@@ -84,8 +68,6 @@ func TokenizeFlex(content []byte) []string {
8468

8569
C.linguist_yy_delete_buffer(buf, scanner)
8670
C.linguist_yylex_destroy(scanner)
87-
// C.free(unsafe.Pointer(extra))
88-
// C.free(unsafe.Pointer(scanner))
8971

9072
return ary
9173
}

internal/tokenizer/flex/tokenize_c_test.go

Lines changed: 0 additions & 25 deletions
This file was deleted.

internal/tokenizer/tokenize_test.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,3 @@ func BenchmarkTokenizer(b *testing.B) {
132132
}
133133
}
134134
}
135-
136-
//TODO(bzz): introduce tokenizer benchmark suite
137-
// baseline - just read the files
138-
// RE2
139-
// oniguruma
140-
// cgo to flex-based impl

0 commit comments

Comments
 (0)