Skip to content

Commit 223e317

Browse files
authored
Bracket parsing refactor and support for 「」『』＜＞ brackets (#463)
* Cleaner brace/bracket/parens state transition definitions * Add additional brace kinds * Check that secrets are defined before running Browserify tests so it does not fail
1 parent 7471c52 commit 223e317

File tree

5 files changed

+347
-331
lines changed

5 files changed

+347
-331
lines changed

packages/linkifyjs/src/parser.js

Lines changed: 63 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -43,31 +43,37 @@ export function init({ groups }) {
4343
tk.SLASH,
4444
tk.SYM,
4545
tk.TILDE,
46-
tk.UNDERSCORE
46+
tk.UNDERSCORE,
4747
]);
4848

4949
// Types of tokens that can follow a URL and be part of the query string
5050
// but cannot be the very last characters
5151
// Characters that cannot appear in the URL at all should be excluded
5252
const qsNonAccepting = [
5353
tk.APOSTROPHE,
54-
tk.CLOSEANGLEBRACKET,
55-
tk.CLOSEBRACE,
56-
tk.CLOSEBRACKET,
57-
tk.CLOSEPAREN,
58-
tk.FULLWIDTH_CLOSEPAREN,
5954
tk.COLON,
6055
tk.COMMA,
6156
tk.DOT,
6257
tk.EXCLAMATION,
58+
tk.QUERY,
59+
tk.QUOTE,
60+
tk.SEMI,
6361
tk.OPENANGLEBRACKET,
62+
tk.CLOSEANGLEBRACKET,
6463
tk.OPENBRACE,
64+
tk.CLOSEBRACE,
65+
tk.CLOSEBRACKET,
6566
tk.OPENBRACKET,
6667
tk.OPENPAREN,
67-
tk.FULLWIDTH_OPENPAREN,
68-
tk.QUERY,
69-
tk.QUOTE,
70-
tk.SEMI
68+
tk.CLOSEPAREN,
69+
tk.FULLWIDTHLEFTPAREN,
70+
tk.FULLWIDTHRIGHTPAREN,
71+
tk.LEFTCORNERBRACKET,
72+
tk.RIGHTCORNERBRACKET,
73+
tk.LEFTWHITECORNERBRACKET,
74+
tk.RIGHTWHITECORNERBRACKET,
75+
tk.FULLWIDTHLESSTHAN,
76+
tk.FULLWIDTHGREATERTHAN,
7177
];
7278

7379
// For addresses without the mailto prefix
@@ -79,11 +85,11 @@ export function init({ groups }) {
7985
tk.BACKSLASH,
8086
tk.BACKTICK,
8187
tk.CARET,
82-
tk.CLOSEBRACE,
8388
tk.DOLLAR,
8489
tk.EQUALS,
8590
tk.HYPHEN,
8691
tk.OPENBRACE,
92+
tk.CLOSEBRACE,
8793
tk.PERCENT,
8894
tk.PIPE,
8995
tk.PLUS,
@@ -92,7 +98,7 @@ export function init({ groups }) {
9298
tk.SLASH,
9399
tk.SYM,
94100
tk.TILDE,
95-
tk.UNDERSCORE
101+
tk.UNDERSCORE,
96102
];
97103

98104
// The universal starting state.
@@ -104,7 +110,9 @@ export function init({ groups }) {
104110
ta(Localpart, localpartAccepting, Localpart);
105111
ta(Localpart, groups.domain, Localpart);
106112

107-
const Domain = makeState(), Scheme = makeState(), SlashScheme = makeState();
113+
const Domain = makeState(),
114+
Scheme = makeState(),
115+
SlashScheme = makeState();
108116
ta(Start, groups.domain, Domain); // parsed string ends with a potential domain name (A)
109117
ta(Start, groups.scheme, Scheme); // e.g., 'mailto'
110118
ta(Start, groups.slashscheme, SlashScheme); // e.g., 'http'
@@ -144,7 +152,7 @@ export function init({ groups }) {
144152

145153
// Final possible email states
146154
const EmailColon = tt(Email, tk.COLON); // URL followed by colon (potential port number here)
147-
/*const EmailColonPort = */ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port numner
155+
/*const EmailColonPort = */ ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port number
148156

149157
// Account for dots and hyphens. Hyphens are usually parts of domain names
150158
// (but not TLDs)
@@ -206,86 +214,46 @@ export function init({ groups }) {
206214
ta(UriPrefix, qsAccepting, Url);
207215
tt(UriPrefix, tk.SLASH, Url);
208216

209-
// URL, followed by an opening bracket
210-
const UrlOpenbrace = tt(Url, tk.OPENBRACE); // URL followed by {
211-
const UrlOpenbracket = tt(Url, tk.OPENBRACKET); // URL followed by [
212-
const UrlOpenanglebracket = tt(Url, tk.OPENANGLEBRACKET); // URL followed by <
213-
const UrlOpenparen = tt(Url, tk.OPENPAREN); // URL followed by (
214-
const UrlFullwidthOpenparen = tt(Url, tk.FULLWIDTH_OPENPAREN); // URL followed by (
215-
216-
tt(UrlNonaccept, tk.OPENBRACE, UrlOpenbrace);
217-
tt(UrlNonaccept, tk.OPENBRACKET, UrlOpenbracket);
218-
tt(UrlNonaccept, tk.OPENANGLEBRACKET, UrlOpenanglebracket);
219-
tt(UrlNonaccept, tk.OPENPAREN, UrlOpenparen);
220-
tt(UrlNonaccept, tk.FULLWIDTH_OPENPAREN, UrlFullwidthOpenparen);
221-
222-
// Closing bracket component. This character WILL be included in the URL
223-
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);
224-
tt(UrlOpenbracket, tk.CLOSEBRACKET, Url);
225-
tt(UrlOpenanglebracket, tk.CLOSEANGLEBRACKET, Url);
226-
tt(UrlOpenparen, tk.CLOSEPAREN, Url);
227-
tt(UrlFullwidthOpenparen, tk.FULLWIDTH_CLOSEPAREN, Url);
228-
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);
229-
230-
// URL that beings with an opening bracket, followed by a symbols.
231-
// Note that the final state can still be `UrlOpenbrace` (if the URL only
232-
// has a single opening bracket for some reason).
233-
const UrlOpenbraceQ = makeState(mtk.Url); // URL followed by { and some symbols that the URL can end it
234-
const UrlOpenbracketQ = makeState(mtk.Url); // URL followed by [ and some symbols that the URL can end it
235-
const UrlOpenanglebracketQ = makeState(mtk.Url); // URL followed by < and some symbols that the URL can end it
236-
const UrlOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
237-
const UrlFullwidthOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
238-
ta(UrlOpenbrace, qsAccepting, UrlOpenbraceQ);
239-
ta(UrlOpenbracket, qsAccepting, UrlOpenbracketQ);
240-
ta(UrlOpenanglebracket, qsAccepting, UrlOpenanglebracketQ);
241-
ta(UrlOpenparen, qsAccepting, UrlOpenparenQ);
242-
ta(UrlFullwidthOpenparen, qsAccepting, UrlFullwidthOpenparenQ);
243-
244-
const UrlOpenbraceSyms = makeState(); // UrlOpenbrace followed by some symbols it cannot end it
245-
const UrlOpenbracketSyms = makeState(); // UrlOpenbracketQ followed by some symbols it cannot end it
246-
const UrlOpenanglebracketSyms = makeState(); // UrlOpenanglebracketQ followed by some symbols it cannot end it
247-
const UrlOpenparenSyms = makeState(); // UrlOpenparenQ followed by some symbols it cannot end it
248-
const UrlFullwidthOpenparenSyms = makeState(); // UrlFullwidthOpenparenQ followed by some symbols it cannot end it
249-
ta(UrlOpenbrace, qsNonAccepting);
250-
ta(UrlOpenbracket, qsNonAccepting);
251-
ta(UrlOpenanglebracket, qsNonAccepting);
252-
ta(UrlOpenparen, qsNonAccepting);
253-
ta(UrlFullwidthOpenparen, qsNonAccepting);
254-
255-
// URL that begins with an opening bracket, followed by some symbols
256-
ta(UrlOpenbraceQ, qsAccepting, UrlOpenbraceQ);
257-
ta(UrlOpenbracketQ, qsAccepting, UrlOpenbracketQ);
258-
ta(UrlOpenanglebracketQ, qsAccepting, UrlOpenanglebracketQ);
259-
ta(UrlOpenparenQ, qsAccepting, UrlOpenparenQ);
260-
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);
261-
ta(UrlOpenbraceQ, qsNonAccepting, UrlOpenbraceQ);
262-
ta(UrlOpenbracketQ, qsNonAccepting, UrlOpenbracketQ);
263-
ta(UrlOpenanglebracketQ, qsNonAccepting, UrlOpenanglebracketQ);
264-
ta(UrlOpenparenQ, qsNonAccepting, UrlOpenparenQ);
265-
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);
266-
267-
ta(UrlOpenbraceSyms, qsAccepting, UrlOpenbraceSyms);
268-
ta(UrlOpenbracketSyms, qsAccepting, UrlOpenbracketQ);
269-
ta(UrlOpenanglebracketSyms, qsAccepting, UrlOpenanglebracketQ);
270-
ta(UrlOpenparenSyms, qsAccepting, UrlOpenparenQ);
271-
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenQ);
272-
ta(UrlOpenbraceSyms, qsNonAccepting, UrlOpenbraceSyms);
273-
ta(UrlOpenbracketSyms, qsNonAccepting, UrlOpenbracketSyms);
274-
ta(UrlOpenanglebracketSyms, qsNonAccepting, UrlOpenanglebracketSyms);
275-
ta(UrlOpenparenSyms, qsNonAccepting, UrlOpenparenSyms);
276-
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenSyms);
277-
278-
// Close brace/bracket to become regular URL
279-
tt(UrlOpenbracketQ, tk.CLOSEBRACKET, Url);
280-
tt(UrlOpenanglebracketQ, tk.CLOSEANGLEBRACKET, Url);
281-
tt(UrlOpenparenQ, tk.CLOSEPAREN, Url);
282-
tt(UrlFullwidthOpenparenQ, tk.FULLWIDTH_CLOSEPAREN, Url);
283-
tt(UrlOpenbraceQ, tk.CLOSEBRACE, Url);
284-
tt(UrlOpenbracketSyms, tk.CLOSEBRACKET, Url);
285-
tt(UrlOpenanglebracketSyms, tk.CLOSEANGLEBRACKET, Url);
286-
tt(UrlFullwidthOpenparenSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
287-
tt(UrlOpenbraceSyms, tk.CLOSEPAREN, Url);
288-
tt(UrlOpenbraceSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
217+
const bracketPairs = [
218+
[tk.OPENBRACE, tk.CLOSEBRACE], // {}
219+
[tk.OPENBRACKET, tk.CLOSEBRACKET], // []
220+
[tk.OPENPAREN, tk.CLOSEPAREN], // ()
221+
[tk.OPENANGLEBRACKET, tk.CLOSEANGLEBRACKET], // <>
222+
[tk.FULLWIDTHLEFTPAREN, tk.FULLWIDTHRIGHTPAREN], // ()
223+
[tk.LEFTCORNERBRACKET, tk.RIGHTCORNERBRACKET], // 「」
224+
[tk.LEFTWHITECORNERBRACKET, tk.RIGHTWHITECORNERBRACKET], // 『』
225+
[tk.FULLWIDTHLESSTHAN, tk.FULLWIDTHGREATERTHAN], // ＜＞
226+
];
227+
228+
for (let i = 0; i < bracketPairs.length; i++) {
229+
const [OPEN, CLOSE] = bracketPairs[i];
230+
const UrlOpen = tt(Url, OPEN); // URL followed by open bracket
231+
232+
// Continue not accepting for open brackets
233+
tt(UrlNonaccept, OPEN, UrlOpen);
234+
235+
// Closing bracket component. This character WILL be included in the URL
236+
tt(UrlOpen, CLOSE, Url);
237+
238+
// URL that begins with an opening bracket, followed by some symbols.
239+
// Note that the final state can still be `UrlOpen` (if the URL has a
240+
// single opening bracket for some reason).
241+
const UrlOpenQ = makeState(mtk.Url);
242+
ta(UrlOpen, qsAccepting, UrlOpenQ);
243+
244+
const UrlOpenSyms = makeState(); // UrlOpen followed by some symbols it cannot end in
245+
ta(UrlOpen, qsNonAccepting);
246+
247+
// URL that begins with an opening bracket, followed by some symbols
248+
ta(UrlOpenQ, qsAccepting, UrlOpenQ);
249+
ta(UrlOpenQ, qsNonAccepting, UrlOpenSyms);
250+
ta(UrlOpenSyms, qsAccepting, UrlOpenQ);
251+
ta(UrlOpenSyms, qsNonAccepting, UrlOpenSyms);
252+
253+
// Close brace/bracket to become regular URL
254+
tt(UrlOpenQ, CLOSE, Url);
255+
tt(UrlOpenSyms, CLOSE, Url);
256+
}
289257

290258
tt(Start, tk.LOCALHOST, DomainDotTld); // localhost is a valid URL state
291259
tt(Start, tk.NL, mtk.Nl); // single new line
@@ -323,10 +291,7 @@ export function run(start, input, tokens) {
323291
textTokens.push(tokens[cursor++]);
324292
}
325293

326-
while (cursor < len && (
327-
nextState = secondState || state.go(tokens[cursor].t))
328-
) {
329-
294+
while (cursor < len && (nextState = secondState || state.go(tokens[cursor].t))) {
330295
// Get the next state
331296
secondState = null;
332297
state = nextState;

packages/linkifyjs/src/scanner.js

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ const NL = '\n'; // New line character
1414
const EMOJI_VARIATION = '\ufe0f'; // Variation selector, follows heart and others
1515
const EMOJI_JOINER = '\u200d'; // zero-width joiner
1616

17-
let tlds = null, utlds = null; // don't change so only have to be computed once
17+
let tlds = null,
18+
utlds = null; // don't change so only have to be computed once
1819

1920
/**
2021
* Scanner output token:
@@ -55,15 +56,21 @@ export function init(customSchemes = []) {
5556
// States for special URL symbols that accept immediately after start
5657
tt(Start, "'", tk.APOSTROPHE);
5758
tt(Start, '{', tk.OPENBRACE);
58-
tt(Start, '[', tk.OPENBRACKET);
59-
tt(Start, '<', tk.OPENANGLEBRACKET);
60-
tt(Start, '(', tk.OPENPAREN);
61-
tt(Start, '(', tk.FULLWIDTH_OPENPAREN);
6259
tt(Start, '}', tk.CLOSEBRACE);
60+
tt(Start, '[', tk.OPENBRACKET);
6361
tt(Start, ']', tk.CLOSEBRACKET);
64-
tt(Start, '>', tk.CLOSEANGLEBRACKET);
62+
tt(Start, '(', tk.OPENPAREN);
6563
tt(Start, ')', tk.CLOSEPAREN);
66-
tt(Start, ')', tk.FULLWIDTH_CLOSEPAREN);
64+
tt(Start, '<', tk.OPENANGLEBRACKET);
65+
tt(Start, '>', tk.CLOSEANGLEBRACKET);
66+
tt(Start, '(', tk.FULLWIDTHLEFTPAREN);
67+
tt(Start, ')', tk.FULLWIDTHRIGHTPAREN);
68+
tt(Start, '「', tk.LEFTCORNERBRACKET);
69+
tt(Start, '」', tk.RIGHTCORNERBRACKET);
70+
tt(Start, '『', tk.LEFTWHITECORNERBRACKET);
71+
tt(Start, '』', tk.RIGHTWHITECORNERBRACKET);
72+
tt(Start, '＜', tk.FULLWIDTHLESSTHAN);
73+
tt(Start, '＞', tk.FULLWIDTHGREATERTHAN);
6774
tt(Start, '&', tk.AMPERSAND);
6875
tt(Start, '*', tk.ASTERISK);
6976
tt(Start, '@', tk.AT);
@@ -122,7 +129,10 @@ export function init(customSchemes = []) {
122129
// Generates states for top-level domains
123130
// Note that this is most accurate when tlds are in alphabetical order
124131
const wordjr = [[re.ASCII_LETTER, Word]];
125-
const uwordjr = [[re.ASCII_LETTER, null], [re.LETTER, UWord]];
132+
const uwordjr = [
133+
[re.ASCII_LETTER, null],
134+
[re.LETTER, UWord],
135+
];
126136
for (let i = 0; i < tlds.length; i++) {
127137
fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
128138
}
@@ -145,7 +155,7 @@ export function init(customSchemes = []) {
145155
addToGroups(tk.SLASH_SCHEME, { slashscheme: true, ascii: true }, groups);
146156

147157
// Register custom schemes. Assumes each scheme is asciinumeric with hyphens
148-
customSchemes = customSchemes.sort((a, b) => a[0] > b[0] ? 1 : -1);
158+
customSchemes = customSchemes.sort((a, b) => (a[0] > b[0] ? 1 : -1));
149159
for (let i = 0; i < customSchemes.length; i++) {
150160
const sch = customSchemes[i][0];
151161
const optionalSlashSlash = customSchemes[i][1];
@@ -233,7 +243,7 @@ export function run(start, str) {
233243
t: latestAccepting.t, // token type/name
234244
v: str.slice(cursor - tokenLength, cursor), // string value
235245
s: cursor - tokenLength, // start index
236-
e: cursor // end index (excluding)
246+
e: cursor, // end index (excluding)
237247
});
238248
}
239249

@@ -258,10 +268,14 @@ export function stringToArray(str) {
258268
while (index < len) {
259269
let first = str.charCodeAt(index);
260270
let second;
261-
let char = first < 0xd800 || first > 0xdbff || index + 1 === len
262-
|| (second = str.charCodeAt(index + 1)) < 0xdc00 || second > 0xdfff
263-
? str[index] // single character
264-
: str.slice(index, index + 2); // two-index characters
271+
let char =
272+
first < 0xd800 ||
273+
first > 0xdbff ||
274+
index + 1 === len ||
275+
(second = str.charCodeAt(index + 1)) < 0xdc00 ||
276+
second > 0xdfff
277+
? str[index] // single character
278+
: str.slice(index, index + 2); // two-index characters
265279
result.push(char);
266280
index += char.length;
267281
}

packages/linkifyjs/src/text.js

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ Identifiers for token outputs from the regexp scanner
44
******************************************************************************/
55

66
// A valid web domain token
7-
export const WORD = 'WORD'; // only contains a-z
8-
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
7+
export const WORD = 'WORD'; // only contains a-z
8+
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
99

1010
// Special case of word
1111
export const LOCALHOST = 'LOCALHOST';
@@ -36,16 +36,24 @@ export const WS = 'WS';
3636
export const NL = 'NL'; // \n
3737

3838
// Opening/closing bracket classes
39+
// TODO: Rename OPEN -> LEFT and CLOSE -> RIGHT in v5 to fit with Unicode names
40+
// Also rename angle brackets to LESSTHAN and GREATERTHAN
3941
export const OPENBRACE = 'OPENBRACE'; // {
40-
export const OPENBRACKET = 'OPENBRACKET'; // [
41-
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
42-
export const OPENPAREN = 'OPENPAREN'; // (
4342
export const CLOSEBRACE = 'CLOSEBRACE'; // }
43+
export const OPENBRACKET = 'OPENBRACKET'; // [
4444
export const CLOSEBRACKET = 'CLOSEBRACKET'; // ]
45-
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
45+
export const OPENPAREN = 'OPENPAREN'; // (
4646
export const CLOSEPAREN = 'CLOSEPAREN'; // )
47-
export const FULLWIDTH_OPENPAREN = 'FULLWIDTH_OPENPAREN'; // (
48-
export const FULLWIDTH_CLOSEPAREN = 'FULLWIDTH_CLOSEPAREN'; // )
47+
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
48+
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
49+
export const FULLWIDTHLEFTPAREN = 'FULLWIDTHLEFTPAREN'; // (
50+
export const FULLWIDTHRIGHTPAREN = 'FULLWIDTHRIGHTPAREN'; // )
51+
export const LEFTCORNERBRACKET = 'LEFTCORNERBRACKET'; // 「
52+
export const RIGHTCORNERBRACKET = 'RIGHTCORNERBRACKET'; // 」
53+
export const LEFTWHITECORNERBRACKET = 'LEFTWHITECORNERBRACKET'; // 『
54+
export const RIGHTWHITECORNERBRACKET = 'RIGHTWHITECORNERBRACKET'; // 』
55+
export const FULLWIDTHLESSTHAN = 'FULLWIDTHLESSTHAN'; // ＜
56+
export const FULLWIDTHGREATERTHAN = 'FULLWIDTHGREATERTHAN'; // ＞
4957

5058
// Various symbols
5159
export const AMPERSAND = 'AMPERSAND'; // &
@@ -79,4 +87,3 @@ export const EMOJI = 'EMOJI';
7987

8088
// Default token - anything that is not one of the above
8189
export const SYM = 'SYM';
82-

test/run.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ if [[ "$1" == "--dist" ]]; then
88
npm run test:coverage
99
npm run build:ci
1010
npm run copy
11-
npm run test:ci
12-
sleep 3 # Wait for threads to exit?
11+
if [[ "${BROWSERSTACK_USERNAME}" != "" ]] && [[ "${BROWSERSTACK_ACCESS_KEY}" != "" ]]; then
12+
npm run test:ci
13+
sleep 3 # Wait for threads to exit?
14+
fi
1315
else
1416
# Run basic tests
1517
echo "Running basic tests..."

0 commit comments

Comments
 (0)