Skip to content

Commit 5f51707

Browse files
authored
regexp: fixed the zero advance logic in quantifiers
Ref: bellard/quickjs@10fc744
1 parent 56da486 commit 5f51707

File tree

10 files changed

+1078
-1109
lines changed

10 files changed

+1078
-1109
lines changed

gen/function_source.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
const uint32_t qjsc_function_source_size = 384;
66

77
const uint8_t qjsc_function_source[384] = {
8-
0x0d, 0x06, 0x0c, 0x61, 0x63, 0x74, 0x75, 0x61,
8+
0x0e, 0x06, 0x0c, 0x61, 0x63, 0x74, 0x75, 0x61,
99
0x6c, 0x02, 0x66, 0x30, 0x74, 0x65, 0x73, 0x74,
1010
0x73, 0x2f, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
1111
0x6f, 0x6e, 0x5f, 0x73, 0x6f, 0x75, 0x72, 0x63,

gen/hello.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
const uint32_t qjsc_hello_size = 89;
66

77
const uint8_t qjsc_hello[89] = {
8-
0x0d, 0x04, 0x0e, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
8+
0x0e, 0x04, 0x0e, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
99
0x6c, 0x65, 0x06, 0x6c, 0x6f, 0x67, 0x16, 0x48,
1010
0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72,
1111
0x6c, 0x64, 0x22, 0x65, 0x78, 0x61, 0x6d, 0x70,

gen/hello_module.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
const uint32_t qjsc_fib_module_size = 311;
66

77
const uint8_t qjsc_fib_module[311] = {
8-
0x0d, 0x03, 0x2c, 0x65, 0x78, 0x61, 0x6d, 0x70,
8+
0x0e, 0x03, 0x2c, 0x65, 0x78, 0x61, 0x6d, 0x70,
99
0x6c, 0x65, 0x73, 0x2f, 0x66, 0x69, 0x62, 0x5f,
1010
0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x2e, 0x6a,
1111
0x73, 0x06, 0x66, 0x69, 0x62, 0x02, 0x6e, 0x0d,
@@ -49,7 +49,7 @@ const uint8_t qjsc_fib_module[311] = {
4949
const uint32_t qjsc_hello_module_size = 178;
5050

5151
const uint8_t qjsc_hello_module[178] = {
52-
0x0d, 0x07, 0x30, 0x65, 0x78, 0x61, 0x6d, 0x70,
52+
0x0e, 0x07, 0x30, 0x65, 0x78, 0x61, 0x6d, 0x70,
5353
0x6c, 0x65, 0x73, 0x2f, 0x68, 0x65, 0x6c, 0x6c,
5454
0x6f, 0x5f, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65,
5555
0x2e, 0x6a, 0x73, 0x1e, 0x2e, 0x2f, 0x66, 0x69,

gen/repl.c

Lines changed: 1026 additions & 1026 deletions
Large diffs are not rendered by default.

gen/test_fib.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
const uint32_t qjsc_test_fib_size = 293;
66

77
const uint8_t qjsc_test_fib[293] = {
8-
0x0d, 0x0d, 0x28, 0x65, 0x78, 0x61, 0x6d, 0x70,
8+
0x0e, 0x0d, 0x28, 0x65, 0x78, 0x61, 0x6d, 0x70,
99
0x6c, 0x65, 0x73, 0x2f, 0x74, 0x65, 0x73, 0x74,
1010
0x5f, 0x66, 0x69, 0x62, 0x2e, 0x6a, 0x73, 0x04,
1111
0x6f, 0x73, 0x0a, 0x69, 0x73, 0x57, 0x69, 0x6e,

libregexp-opcode.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,7 @@ DEF(range32, 3) /* variable length */
5151
DEF(lookahead, 5)
5252
DEF(negative_lookahead, 5)
5353
DEF(push_char_pos, 1) /* push the character position on the stack */
54-
DEF(bne_char_pos, 5) /* pop one stack element and jump if equal to the character
55-
position */
54+
DEF(check_advance, 1) /* pop one stack element and check that it is different from the character position */
5655
DEF(prev, 1) /* go to the previous char */
5756
DEF(simple_greedy_quant, 17)
5857

libregexp.c

Lines changed: 37 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,6 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
283283
case REOP_loop:
284284
case REOP_lookahead:
285285
case REOP_negative_lookahead:
286-
case REOP_bne_char_pos:
287286
val = get_u32(buf + pos + 1);
288287
val += (pos + 5);
289288
printf(" %u", val);
@@ -921,21 +920,17 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
921920
}
922921

923922
/* Return:
924-
1 if the opcodes in bc_buf[] always advance the character pointer.
925-
0 if the character pointer may not be advanced.
926-
-1 if the code may depend on side effects of its previous execution (backreference)
923+
- true if the opcodes may not advance the char pointer
924+
- false if the opcodes always advance the char pointer
927925
*/
928-
static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
926+
static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
929927
{
930-
int pos, opcode, ret, len, i;
931-
uint32_t val, last;
932-
BOOL has_back_reference;
933-
uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
928+
int pos, opcode, len;
929+
uint32_t val;
930+
BOOL ret;
934931

935-
ret = -2; /* not known yet */
932+
ret = TRUE;
936933
pos = 0;
937-
has_back_reference = FALSE;
938-
memset(capture_bitmap, 0, sizeof(capture_bitmap));
939934

940935
while (pos < bc_buf_len) {
941936
opcode = bc_buf[pos];
@@ -955,8 +950,7 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
955950
case REOP_dot:
956951
case REOP_any:
957952
simple_char:
958-
if (ret == -2)
959-
ret = 1;
953+
ret = FALSE;
960954
break;
961955
case REOP_line_start:
962956
case REOP_line_end:
@@ -970,41 +964,16 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
970964
break;
971965
case REOP_save_start:
972966
case REOP_save_end:
973-
val = bc_buf[pos + 1];
974-
capture_bitmap[val] |= 1;
975-
break;
976967
case REOP_save_reset:
977-
{
978-
val = bc_buf[pos + 1];
979-
last = bc_buf[pos + 2];
980-
while (val < last)
981-
capture_bitmap[val++] |= 1;
982-
}
983-
break;
984968
case REOP_back_reference:
985969
case REOP_backward_back_reference:
986-
val = bc_buf[pos + 1];
987-
capture_bitmap[val] |= 2;
988-
has_back_reference = TRUE;
989970
break;
990971
default:
991972
/* safe behavior: we cannot predict the outcome */
992-
if (ret == -2)
993-
ret = 0;
994-
break;
973+
return TRUE;
995974
}
996975
pos += len;
997976
}
998-
if (has_back_reference) {
999-
/* check if there is back reference which references a capture
1000-
made in the same code */
1001-
for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
1002-
if (capture_bitmap[i] == 3)
1003-
return -1;
1004-
}
1005-
}
1006-
if (ret == -2)
1007-
ret = 0;
1008977
return ret;
1009978
}
1010979

@@ -1583,8 +1552,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
15831552
running the atom after the first quant_min times,
15841553
then there is no match. We remove this test when we
15851554
are sure the atom always advances the position. */
1586-
add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
1587-
s->byte_code.size - last_atom_start) == 0);
1555+
add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
1556+
s->byte_code.size - last_atom_start);
15881557

15891558
{
15901559
int len, pos;
@@ -1601,38 +1570,34 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
16011570
}
16021571
if (quant_max == 0) {
16031572
s->byte_code.size = last_atom_start;
1604-
} else if (quant_max == 1) {
1605-
if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1606-
goto out_of_memory;
1607-
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1608-
greedy;
1609-
put_u32(s->byte_code.buf + last_atom_start + 1, len);
1610-
} else if (quant_max == INT32_MAX) {
1573+
} else if (quant_max == 1 || quant_max == INT32_MAX) {
1574+
BOOL has_goto = (quant_max == INT32_MAX);
16111575
if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
16121576
goto out_of_memory;
16131577
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
16141578
greedy;
16151579
put_u32(s->byte_code.buf + last_atom_start + 1,
1616-
len + 5 + add_zero_advance_check);
1580+
len + 5 * has_goto + add_zero_advance_check * 2);
16171581
if (add_zero_advance_check) {
1618-
/* avoid infinite loop by stopping the
1619-
recursion if no advance was made in the
1620-
atom (only works if the atom has no
1621-
side effect) */
16221582
s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1623-
re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
1624-
} else {
1625-
re_emit_goto(s, REOP_goto, last_atom_start);
1583+
re_emit_op(s, REOP_check_advance);
16261584
}
1585+
if (has_goto)
1586+
re_emit_goto(s, REOP_goto, last_atom_start);
16271587
} else {
1628-
if (dbuf_insert(&s->byte_code, last_atom_start, 10))
1588+
if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
16291589
goto out_of_memory;
16301590
pos = last_atom_start;
16311591
s->byte_code.buf[pos++] = REOP_push_i32;
16321592
put_u32(s->byte_code.buf + pos, quant_max);
16331593
pos += 4;
16341594
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1635-
put_u32(s->byte_code.buf + pos, len + 5);
1595+
put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
1596+
pos += 4;
1597+
if (add_zero_advance_check) {
1598+
s->byte_code.buf[pos++] = REOP_push_char_pos;
1599+
re_emit_op(s, REOP_check_advance);
1600+
}
16361601
re_emit_goto(s, REOP_loop, last_atom_start + 5);
16371602
re_emit_op(s, REOP_drop);
16381603
}
@@ -1656,22 +1621,25 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
16561621
if (quant_max == INT32_MAX) {
16571622
pos = s->byte_code.size;
16581623
re_emit_op_u32(s, REOP_split_goto_first + greedy,
1659-
len + 5 + add_zero_advance_check);
1624+
len + 5 + add_zero_advance_check * 2);
16601625
if (add_zero_advance_check)
16611626
re_emit_op(s, REOP_push_char_pos);
16621627
/* copy the atom */
16631628
dbuf_put_self(&s->byte_code, last_atom_start, len);
16641629
if (add_zero_advance_check)
1665-
re_emit_goto(s, REOP_bne_char_pos, pos);
1666-
else
1667-
re_emit_goto(s, REOP_goto, pos);
1630+
re_emit_op(s, REOP_check_advance);
1631+
re_emit_goto(s, REOP_goto, pos);
16681632
} else if (quant_max > quant_min) {
16691633
re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
16701634
pos = s->byte_code.size;
1671-
re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
1635+
re_emit_op_u32(s, REOP_split_goto_first + greedy,
1636+
len + 5 + add_zero_advance_check * 2);
1637+
if (add_zero_advance_check)
1638+
re_emit_op(s, REOP_push_char_pos);
16721639
/* copy the atom */
16731640
dbuf_put_self(&s->byte_code, last_atom_start, len);
1674-
1641+
if (add_zero_advance_check)
1642+
re_emit_op(s, REOP_check_advance);
16751643
re_emit_goto(s, REOP_loop, pos);
16761644
re_emit_op(s, REOP_drop);
16771645
}
@@ -1785,7 +1753,7 @@ static int lre_compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
17851753
}
17861754
break;
17871755
case REOP_drop:
1788-
case REOP_bne_char_pos:
1756+
case REOP_check_advance:
17891757
assert(stack_size > 0);
17901758
stack_size--;
17911759
break;
@@ -2281,11 +2249,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
22812249
case REOP_push_char_pos:
22822250
stack[stack_len++] = (uintptr_t)cptr;
22832251
break;
2284-
case REOP_bne_char_pos:
2285-
val = get_u32(pc);
2286-
pc += 4;
2287-
if (stack[--stack_len] != (uintptr_t)cptr)
2288-
pc += (int)val;
2252+
case REOP_check_advance:
2253+
if (stack[--stack_len] == (uintptr_t)cptr)
2254+
goto no_match;
22892255
break;
22902256
case REOP_word_boundary:
22912257
case REOP_not_word_boundary:

quickjs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33312,7 +33312,7 @@ typedef enum BCTagEnum {
3331233312
BC_TAG_SET,
3331333313
} BCTagEnum;
3331433314

33315-
#define BC_VERSION 13
33315+
#define BC_VERSION 14
3331633316

3331733317
typedef struct BCWriterState {
3331833318
JSContext *ctx;

test262_errors.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,6 @@ test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-retu
1818
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-not-object.js:72: strict mode: TypeError: $DONE() not called
1919
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: TypeError: $DONE() not called
2020
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: strict mode: TypeError: $DONE() not called
21-
test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier
22-
test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: strict mode: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier
23-
test262/test/built-ins/RegExp/nullable-quantifier.js:21: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true
24-
test262/test/built-ins/RegExp/nullable-quantifier.js:21: strict mode: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true
2521
test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`)
2622
test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: strict mode: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`)
2723
test262/test/built-ins/RegExp/property-escapes/generated/Assigned.js:16: Test262Error: `\p{Assigned}` should match U+002FFC (`⿼`)

tests/test_builtin.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,14 @@ function test_regexp()
775775
/* test zero length matches */
776776
a = /()*?a/.exec(",");
777777
assert(a, null);
778+
a = /(?:(?=(abc)))a/.exec("abc");
779+
assert(a, ["a", "abc"]);
780+
a = /(?:(?=(abc)))?a/.exec("abc");
781+
assert(a, ["a", undefined]);
782+
a = /(?:(?=(abc))){0,2}a/.exec("abc");
783+
assert(a, ["a", undefined]);
784+
a = /(?:|[\w])+([0-9])/.exec("123a23");
785+
assert(a, ["123a23", "3"]);
778786
}
779787

780788
function test_symbol()

0 commit comments

Comments
 (0)