Skip to content

Commit 82b87e0

Browse files
author
Nick Frasser
committed
Merge pull request #93 from SoapBox/query-string-parens
Mostly correct parsing for links containing or wrapped in brackets
2 parents fbbd6f0 + a91db73 commit 82b87e0

File tree

5 files changed

+144
-11
lines changed

5 files changed

+144
-11
lines changed

src/linkify/core/parser.js

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ TT_PROTOCOL = TEXT_TOKENS.PROTOCOL,
3333
TT_QUERY = TEXT_TOKENS.QUERY,
3434
TT_SLASH = TEXT_TOKENS.SLASH,
3535
TT_SYM = TEXT_TOKENS.SYM,
36-
TT_TLD = TEXT_TOKENS.TLD;
36+
TT_TLD = TEXT_TOKENS.TLD,
37+
TT_OPENBRACE = TEXT_TOKENS.OPENBRACE,
38+
TT_OPENBRACKET = TEXT_TOKENS.OPENBRACKET,
39+
TT_OPENPAREN = TEXT_TOKENS.OPENPAREN,
40+
TT_CLOSEBRACE = TEXT_TOKENS.CLOSEBRACE,
41+
TT_CLOSEBRACKET = TEXT_TOKENS.CLOSEBRACKET,
42+
TT_CLOSEPAREN = TEXT_TOKENS.CLOSEPAREN;
43+
3744
// TT_WS = TEXT_TOKENS.WS;
3845

3946
const
@@ -66,6 +73,15 @@ S_PSS_TLD_COLON = makeState(), // (A) URL followed by colon (potential port nu
6673
S_PSS_TLD_PORT = makeState(T_URL), // TLD followed by a port number
6774
S_URL = makeState(T_URL), // Long URL with optional port and maybe query string
6875
S_URL_SYMS = makeState(), // URL followed by some symbols (will not be part of the final URL)
76+
S_URL_OPENBRACE = makeState(), // URL followed by {
77+
S_URL_OPENBRACKET = makeState(), // URL followed by [
78+
S_URL_OPENPAREN = makeState(), // URL followed by (
79+
S_URL_OPENBRACE_Q = makeState(T_URL), // URL followed by { and some symbols that the URL can end in
80+
S_URL_OPENBRACKET_Q = makeState(T_URL), // URL followed by [ and some symbols that the URL can end in
81+
S_URL_OPENPAREN_Q = makeState(T_URL), // URL followed by ( and some symbols that the URL can end in
82+
S_URL_OPENBRACE_SYMS = makeState(), // S_URL_OPENBRACE_Q followed by some symbols that the URL cannot end in
83+
S_URL_OPENBRACKET_SYMS = makeState(), // S_URL_OPENBRACKET_Q followed by some symbols that the URL cannot end in
84+
S_URL_OPENPAREN_SYMS = makeState(), // S_URL_OPENPAREN_Q followed by some symbols that the URL cannot end in
6985
S_EMAIL_DOMAIN = makeState(), // parsed string starts with local email info + @ with a potential domain name (C)
7086
S_EMAIL_DOMAIN_DOT = makeState(), // (C) domain followed by DOT
7187
S_EMAIL = makeState(T_EMAIL), // (C) Possible email address (could have more tlds)
@@ -144,8 +160,7 @@ let qsAccepting = [
144160
TT_POUND,
145161
TT_PROTOCOL,
146162
TT_SLASH,
147-
TT_TLD,
148-
TT_SYM
163+
TT_TLD
149164
];
150165

151166
// Types of tokens that can follow a URL and be part of the query string
@@ -155,9 +170,65 @@ let qsNonAccepting = [
155170
TT_COLON,
156171
TT_DOT,
157172
TT_QUERY,
158-
TT_PUNCTUATION
173+
TT_PUNCTUATION,
174+
TT_CLOSEBRACE,
175+
TT_CLOSEBRACKET,
176+
TT_CLOSEPAREN,
177+
TT_OPENBRACE,
178+
TT_OPENBRACKET,
179+
TT_OPENPAREN,
180+
TT_SYM
159181
];
160182

183+
// These states are responsible primarily for determining whether or not to
184+
// include the final closing bracket (brace, square bracket, or parenthesis).
185+
186+
// URL, followed by an opening bracket
187+
S_URL.on(TT_OPENBRACE, S_URL_OPENBRACE);
188+
S_URL.on(TT_OPENBRACKET, S_URL_OPENBRACKET);
189+
S_URL.on(TT_OPENPAREN, S_URL_OPENPAREN);
190+
191+
// URL with extra symbols at the end, followed by an opening bracket
192+
S_URL_SYMS.on(TT_OPENBRACE, S_URL_OPENBRACE);
193+
S_URL_SYMS.on(TT_OPENBRACKET, S_URL_OPENBRACKET);
194+
S_URL_SYMS.on(TT_OPENPAREN, S_URL_OPENPAREN);
195+
196+
// Closing bracket component. This character WILL be included in the URL
197+
S_URL_OPENBRACE.on(TT_CLOSEBRACE, S_URL);
198+
S_URL_OPENBRACKET.on(TT_CLOSEBRACKET, S_URL);
199+
S_URL_OPENPAREN.on(TT_CLOSEPAREN, S_URL);
200+
S_URL_OPENBRACE_Q.on(TT_CLOSEBRACE, S_URL);
201+
S_URL_OPENBRACKET_Q.on(TT_CLOSEBRACKET, S_URL);
202+
S_URL_OPENPAREN_Q.on(TT_CLOSEPAREN, S_URL);
203+
S_URL_OPENBRACE_SYMS.on(TT_CLOSEBRACE, S_URL);
204+
S_URL_OPENBRACKET_SYMS.on(TT_CLOSEBRACKET, S_URL);
205+
S_URL_OPENPAREN_SYMS.on(TT_CLOSEPAREN, S_URL);
206+
207+
// URL that begins with an opening bracket, followed by symbols.
208+
// Note that the final state can still be `S_URL_OPENBRACE_Q` (if the URL only
209+
// has a single opening bracket for some reason).
210+
S_URL_OPENBRACE.on(qsAccepting, S_URL_OPENBRACE_Q);
211+
S_URL_OPENBRACKET.on(qsAccepting, S_URL_OPENBRACKET_Q);
212+
S_URL_OPENPAREN.on(qsAccepting, S_URL_OPENPAREN_Q);
213+
S_URL_OPENBRACE.on(qsNonAccepting, S_URL_OPENBRACE_SYMS);
214+
S_URL_OPENBRACKET.on(qsNonAccepting, S_URL_OPENBRACKET_SYMS);
215+
S_URL_OPENPAREN.on(qsNonAccepting, S_URL_OPENPAREN_SYMS);
216+
217+
// URL that begins with an opening bracket, followed by some symbols
218+
S_URL_OPENBRACE_Q.on(qsAccepting, S_URL_OPENBRACE_Q);
219+
S_URL_OPENBRACKET_Q.on(qsAccepting, S_URL_OPENBRACKET_Q);
220+
S_URL_OPENPAREN_Q.on(qsAccepting, S_URL_OPENPAREN_Q);
221+
S_URL_OPENBRACE_Q.on(qsNonAccepting, S_URL_OPENBRACE_Q);
222+
S_URL_OPENBRACKET_Q.on(qsNonAccepting, S_URL_OPENBRACKET_Q);
223+
S_URL_OPENPAREN_Q.on(qsNonAccepting, S_URL_OPENPAREN_Q);
224+
225+
S_URL_OPENBRACE_SYMS.on(qsAccepting, S_URL_OPENBRACE_Q);
226+
S_URL_OPENBRACKET_SYMS.on(qsAccepting, S_URL_OPENBRACKET_Q);
227+
S_URL_OPENPAREN_SYMS.on(qsAccepting, S_URL_OPENPAREN_Q);
228+
S_URL_OPENBRACE_SYMS.on(qsNonAccepting, S_URL_OPENBRACE_SYMS);
229+
S_URL_OPENBRACKET_SYMS.on(qsNonAccepting, S_URL_OPENBRACKET_SYMS);
230+
S_URL_OPENPAREN_SYMS.on(qsNonAccepting, S_URL_OPENPAREN_SYMS);
231+
161232
// Account for the query string
162233
S_URL.on(qsAccepting, S_URL);
163234
S_URL_SYMS.on(qsAccepting, S_URL);

src/linkify/core/scanner.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ S_START.on('#', makeState(TOKENS.POUND));
4444
S_START.on('?', makeState(TOKENS.QUERY));
4545
S_START.on('/', makeState(TOKENS.SLASH));
4646
S_START.on(COLON, makeState(TOKENS.COLON));
47+
S_START.on('{', makeState(TOKENS.OPENBRACE));
48+
S_START.on('[', makeState(TOKENS.OPENBRACKET));
49+
S_START.on('(', makeState(TOKENS.OPENPAREN));
50+
S_START.on('}', makeState(TOKENS.CLOSEBRACE));
51+
S_START.on(']', makeState(TOKENS.CLOSEBRACKET));
52+
S_START.on(')', makeState(TOKENS.CLOSEPAREN));
4753
S_START.on(/[,;!]/, makeState(TOKENS.PUNCTUATION));
4854

4955
// Whitespace jumps

src/linkify/core/tokens.js

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,34 @@ class TLD extends TextToken {}
161161
*/
162162
class WS extends TextToken {}
163163

164+
/**
165+
Opening/closing bracket classes
166+
*/
167+
168+
class OPENBRACE extends TextToken {
169+
constructor() { super('{'); }
170+
}
171+
172+
class OPENBRACKET extends TextToken {
173+
constructor() { super('['); }
174+
}
175+
176+
class OPENPAREN extends TextToken {
177+
constructor() { super('('); }
178+
}
179+
180+
class CLOSEBRACE extends TextToken {
181+
constructor() { super('}'); }
182+
}
183+
184+
class CLOSEBRACKET extends TextToken {
185+
constructor() { super(']'); }
186+
}
187+
188+
class CLOSEPAREN extends TextToken {
189+
constructor() { super(')'); }
190+
}
191+
164192
let text = {
165193
Base: TextToken,
166194
DOMAIN,
@@ -178,7 +206,13 @@ let text = {
178206
SLASH,
179207
SYM,
180208
TLD,
181-
WS
209+
WS,
210+
OPENBRACE,
211+
OPENBRACKET,
212+
OPENPAREN,
213+
CLOSEBRACE,
214+
CLOSEBRACKET,
215+
CLOSEPAREN
182216
};
183217

184218
/******************************************************************************

test/spec/linkify/core/parser-test.js

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ var tests = [
3737
], [
3838
'This [i.imgur.com/ckSj2Ba.jpg)] should also work',
3939
[TEXT, URL, TEXT],
40-
['This [', 'i.imgur.com/ckSj2Ba.jpg)]', ' should also work']
40+
['This [', 'i.imgur.com/ckSj2Ba.jpg', ')] should also work']
4141
], [
4242
'A link is http://nick.is.awesome/?q=nick+amazing&nick=yo%29%30hellp another is http://nick.con/?q=look',
4343
[TEXT, URL, TEXT],
@@ -118,6 +118,18 @@ var tests = [
118118
'Bu haritanın verileri Direniş İzleme Grubu\'nun yaptığı Türkiye İşçi Eylemleri haritası ile birleşebilir esasen. https://graphcommons.com/graphs/00af1cd8-5a67-40b1-86e5-32beae436f7c?show=Comments',
119119
[TEXT, URL],
120120
['Bu haritanın verileri Direniş İzleme Grubu\'nun yaptığı Türkiye İşçi Eylemleri haritası ile birleşebilir esasen. ', 'https://graphcommons.com/graphs/00af1cd8-5a67-40b1-86e5-32beae436f7c?show=Comments']
121+
], [
122+
'Links with brackets and parens https://en.wikipedia.org/wiki/Blur_[band] wat',
123+
[TEXT, URL, TEXT],
124+
['Links with brackets and parens ', 'https://en.wikipedia.org/wiki/Blur_[band]', ' wat'],
125+
], [
126+
'This has dots {https://msdn.microsoft.com/en-us/library/aa752574(VS.85).aspx}',
127+
[TEXT, URL, TEXT],
128+
['This has dots {', 'https://msdn.microsoft.com/en-us/library/aa752574(VS.85).aspx', '}']
129+
], [ // This test is correct, will count nested brackets as being part of the first
130+
'A really funky one (example.com/?id=asd2{hellow}and%20it%20continues(23&((@)) and it ends',
131+
[TEXT, URL, TEXT],
132+
['A really funky one (', 'example.com/?id=asd2{hellow}and%20it%20continues(23&((@)', ') and it ends']
121133
]
122134
];
123135

@@ -131,13 +143,14 @@ describe('linkify/core/parser#run()', function () {
131143
values = test[2],
132144
result = parser.run(scanner.run(str));
133145

146+
expect(result.map(function (token) {
147+
return token.toString();
148+
})).to.eql(values);
149+
134150
expect(result.map(function (token) {
135151
return token.constructor;
136152
})).to.eql(types);
137153

138-
expect(result.map(function (token) {
139-
return token.toString();
140-
})).to.eql(values);
141154
});
142155
}
143156

test/spec/linkify/core/scanner-test.js

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,15 @@ QUERY = TEXT_TOKENS.QUERY,
1818
SLASH = TEXT_TOKENS.SLASH,
1919
SYM = TEXT_TOKENS.SYM,
2020
TLD = TEXT_TOKENS.TLD,
21-
WS = TEXT_TOKENS.WS;
21+
WS = TEXT_TOKENS.WS,
22+
23+
OPENBRACE = TEXT_TOKENS.OPENBRACE,
24+
OPENBRACKET = TEXT_TOKENS.OPENBRACKET,
25+
OPENPAREN = TEXT_TOKENS.OPENPAREN,
26+
CLOSEBRACE = TEXT_TOKENS.CLOSEBRACE,
27+
CLOSEBRACKET = TEXT_TOKENS.CLOSEBRACKET,
28+
CLOSEPAREN = TEXT_TOKENS.CLOSEPAREN;
29+
2230

2331
// The elements are
2432
// 1. input string
@@ -35,7 +43,8 @@ var tests = [
3543
['#', [POUND], ['#']],
3644
['/', [SLASH], ['/']],
3745
['&', [SYM], ['&']],
38-
['&?<>(', [SYM, QUERY, SYM, SYM, SYM], ['&', '?', '<', '>', '(']],
46+
['&?<>(', [SYM, QUERY, SYM, SYM, OPENPAREN], ['&', '?', '<', '>', '(']],
47+
['([{}])', [OPENPAREN, OPENBRACKET, OPENBRACE, CLOSEBRACE, CLOSEBRACKET, CLOSEPAREN], ['(', '[', '{', '}', ']', ')']],
3948
['!,;', [PUNCTUATION, PUNCTUATION, PUNCTUATION], ['!', ',', ';']],
4049
['hello', [DOMAIN], ['hello']],
4150
['Hello123', [DOMAIN], ['Hello123']],

0 commit comments

Comments
 (0)