Commit b72c682

Combine mixed word/number tokens (#498)
Prevents some extensions like somefile.mp4 from getting interpreted as URLs
1 parent 22e58d1 commit b72c682

5 files changed: 28 additions and 8 deletions
packages/linkify-plugin-hashtag/src/hashtag.mjs

Lines changed: 5 additions & 1 deletion

@@ -8,7 +8,7 @@ const HashtagToken = createTokenClass('hashtag', { isLink: true });
  */
 export default function hashtag({ scanner, parser }) {
 	// Various tokens that may compose a hashtag
-	const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT } = scanner.tokens;
+	const { POUND, UNDERSCORE, FULLWIDTHMIDDLEDOT, ASCIINUMERICAL, ALPHANUMERICAL } = scanner.tokens;
 	const { alpha, numeric, alphanumeric, emoji } = scanner.tokens.groups;

 	// Take or create a transition from start to the '#' sign (non-accepting)
@@ -18,10 +18,14 @@ export default function hashtag({ scanner, parser }) {
 	const HashPrefix = Hash.tt(UNDERSCORE);
 	const Hashtag = new State(HashtagToken);

+	Hash.tt(ASCIINUMERICAL, Hashtag);
+	Hash.tt(ALPHANUMERICAL, Hashtag);
 	Hash.ta(numeric, HashPrefix);
 	Hash.ta(alpha, Hashtag);
 	Hash.ta(emoji, Hashtag);
 	Hash.ta(FULLWIDTHMIDDLEDOT, Hashtag);
+	HashPrefix.tt(ASCIINUMERICAL, Hashtag);
+	HashPrefix.tt(ALPHANUMERICAL, Hashtag);
 	HashPrefix.ta(alpha, Hashtag);
 	HashPrefix.ta(emoji, Hashtag);
 	HashPrefix.ta(FULLWIDTHMIDDLEDOT, Hashtag);
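
These plugin changes track the scanner changes below: mixed runs such as 'Hello123' now arrive as a single ASCIINUMERICAL or ALPHANUMERICAL token instead of WORD + NUM, so the hashtag state machine needs transitions for the new token types to keep matching tags like '#Hello123'. A rough usage sketch (assuming the plugin's documented self-registering import; output shape abbreviated):

import * as linkify from 'linkifyjs';
import 'linkify-plugin-hashtag'; // registers the hashtag plugin on import

// Without the ASCIINUMERICAL/ALPHANUMERICAL transitions added above, the
// single-token scan of 'Hello123' would fall out of the hashtag state machine.
console.log(linkify.find('#Hello123'));
// expected: one match with type 'hashtag' and value '#Hello123'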

packages/linkifyjs/src/scanner.mjs

Lines changed: 14 additions & 1 deletion

@@ -99,15 +99,24 @@ export function init(customSchemes = []) {

 	const Num = tr(Start, re.DIGIT, tk.NUM, { [fsm.numeric]: true });
 	tr(Num, re.DIGIT, Num);
+	const Asciinumeric = tr(Num, re.ASCII_LETTER, tk.ASCIINUMERICAL, { [fsm.asciinumeric]: true });
+	const Alphanumeric = tr(Num, re.LETTER, tk.ALPHANUMERICAL, { [fsm.alphanumeric]: true });

 	// State which emits a word token
 	const Word = tr(Start, re.ASCII_LETTER, tk.WORD, { [fsm.ascii]: true });
+	tr(Word, re.DIGIT, Asciinumeric);
 	tr(Word, re.ASCII_LETTER, Word);
+	tr(Asciinumeric, re.DIGIT, Asciinumeric);
+	tr(Asciinumeric, re.ASCII_LETTER, Asciinumeric);

 	// Same as previous, but specific to non-fsm.ascii alphabet words
 	const UWord = tr(Start, re.LETTER, tk.UWORD, { [fsm.alpha]: true });
 	tr(UWord, re.ASCII_LETTER); // Non-accepting
+	tr(UWord, re.DIGIT, Alphanumeric);
 	tr(UWord, re.LETTER, UWord);
+	tr(Alphanumeric, re.DIGIT, Alphanumeric);
+	tr(Alphanumeric, re.ASCII_LETTER); // Non-accepting
+	tr(Alphanumeric, re.LETTER, Alphanumeric); // Non-accepting

 	// Whitespace jumps
 	// Tokens of only non-newline whitespace are arbitrarily long
@@ -132,10 +141,14 @@ export function init(customSchemes = []) {

 	// Generates states for top-level domains
 	// Note that this is most accurate when tlds are in alphabetical order
-	const wordjr = [[re.ASCII_LETTER, Word]];
+	const wordjr = [
+		[re.ASCII_LETTER, Word],
+		[re.DIGIT, Asciinumeric],
+	];
 	const uwordjr = [
 		[re.ASCII_LETTER, null],
 		[re.LETTER, UWord],
+		[re.DIGIT, Alphanumeric],
 	];
 	for (let i = 0; i < tlds.length; i++) {
 		fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
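
For readers unfamiliar with the tr()-based state table, a standalone sketch (not the library's implementation) of the rule these transitions encode: once a run of ASCII letters and digits mixes both character classes, it is emitted as one ASCIINUMERICAL token rather than alternating WORD and NUM tokens.

// Simplified illustration only; the real scanner is a character-by-character
// finite state machine that also handles TLDs, schemes, non-ASCII letters, etc.
function scanAsciiRuns(str) {
	const tokens = [];
	let i = 0;
	while (i < str.length) {
		if (!/[a-zA-Z0-9]/.test(str[i])) {
			tokens.push({ t: 'SYM', v: str[i] });
			i++;
			continue;
		}
		let j = i;
		let hasLetter = false;
		let hasDigit = false;
		while (j < str.length && /[a-zA-Z0-9]/.test(str[j])) {
			if (/[0-9]/.test(str[j])) hasDigit = true;
			else hasLetter = true;
			j++;
		}
		const t = hasLetter && hasDigit ? 'ASCIINUMERICAL' : hasDigit ? 'NUM' : 'WORD';
		tokens.push({ t, v: str.slice(i, j) });
		i = j;
	}
	return tokens;
}

console.log(scanAsciiRuns('500px'));  // [{ t: 'ASCIINUMERICAL', v: '500px' }]
console.log(scanAsciiRuns('500-px')); // NUM '500', SYM '-', WORD 'px' (matches the test expectations)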

packages/linkifyjs/src/text.mjs

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,8 @@ Identifiers for token outputs from the regexp scanner
 // A valid web domain token
 export const WORD = 'WORD'; // only contains a-z
 export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
+export const ASCIINUMERICAL = 'ASCIINUMERICAL'; // contains a-z, 0-9
+export const ALPHANUMERICAL = 'ALPHANUMERICAL'; // contains numbers and letters other than a-z, used for IDN

 // Special case of word
 export const LOCALHOST = 'LOCALHOST';

test/spec/linkifyjs/parser.test.mjs

Lines changed: 1 addition & 0 deletions

@@ -318,6 +318,7 @@ const tests = [
 		[Url, Text],
 		['https://google.com', '\ufffcthis'],
 	],
+	['some string with somefile.mp4 token', [Text], ['some string with somefile.mp4 token']],
 ];

 describe('linkifyjs/parser#run()', () => {

test/spec/linkifyjs/scanner.test.mjs

Lines changed: 6 additions & 6 deletions

@@ -36,10 +36,10 @@ const tests = [
 	],
 	["!,;'", [t.EXCLAMATION, t.COMMA, t.SEMI, t.APOSTROPHE], ['!', ',', ';', "'"]],
 	['hello', [t.WORD], ['hello']],
-	['Hello123', [t.WORD, t.NUM], ['Hello', '123']],
-	['hello123world', [t.WORD, t.NUM, t.TLD], ['hello', '123', 'world']],
+	['Hello123', [t.ASCIINUMERICAL], ['Hello123']],
+	['hello123world', [t.ASCIINUMERICAL], ['hello123world']],
 	['0123', [t.NUM], ['0123']],
-	['123abc', [t.NUM, t.TLD], ['123', 'abc']],
+	['123abc', [t.ASCIINUMERICAL], ['123abc']],
 	['http', [t.SLASH_SCHEME], ['http']],
 	['http:', [t.SLASH_SCHEME, t.COLON], ['http', ':']],
 	['https:', [t.SLASH_SCHEME, t.COLON], ['https', ':']],
@@ -66,10 +66,10 @@ const tests = [
 	['local', [t.WORD], ['local']],
 	['localhost', [t.LOCALHOST], ['localhost']],
 	['localhosts', [t.WORD], ['localhosts']],
-	['500px', [t.NUM, t.WORD], ['500', 'px']],
+	['500px', [t.ASCIINUMERICAL], ['500px']],
 	['500-px', [t.NUM, t.HYPHEN, t.WORD], ['500', '-', 'px']],
-	['-500px', [t.HYPHEN, t.NUM, t.WORD], ['-', '500', 'px']],
-	['500px-', [t.NUM, t.WORD, t.HYPHEN], ['500', 'px', '-']],
+	['-500px', [t.HYPHEN, t.ASCIINUMERICAL], ['-', '500px']],
+	['500px-', [t.ASCIINUMERICAL, t.HYPHEN], ['500px', '-']],
 	['123-456', [t.NUM, t.HYPHEN, t.NUM], ['123', '-', '456']],
 	['foo\u00a0bar', [t.TLD, t.WS, t.TLD], ['foo', '\u00a0', 'bar']], // nbsp
 	['çïrâ.ca', [t.UWORD, t.WORD, t.UWORD, t.DOT, t.TLD], ['çï', 'r', 'â', '.', 'ca']],
