aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Dániel Bátyai <dbatyai@inf.u-szeged.hu> 2020-05-26 15:28:54 +0200
committer GitHub <noreply@github.com> 2020-05-26 15:28:54 +0200
commit 8f76a1f38223cce4cf64064fc1b67670e1f4b806 (patch)
tree ed761435ac581507f0a21355651177dd3aa20857
parent 908240ba6223fb54e72c2ab85c3765db3492a968 (diff)
Rework RegExp engine and add support for proper unicode matching (#3746)
This change includes several bugfixes, general improvements, and support for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
-rw-r--r-- jerry-core/api/jerry-snapshot.c 7
-rw-r--r-- jerry-core/ecma/base/ecma-gc.c 2
-rw-r--r-- jerry-core/ecma/base/ecma-helpers-string.c 6
-rw-r--r-- jerry-core/ecma/builtin-objects/ecma-builtin-global.c 23
-rw-r--r-- jerry-core/ecma/builtin-objects/ecma-builtin-json.c 11
-rw-r--r-- jerry-core/ecma/builtin-objects/ecma-builtins.c 8
-rw-r--r-- jerry-core/ecma/operations/ecma-regexp-object.c 1444
-rw-r--r-- jerry-core/ecma/operations/ecma-regexp-object.h 86
-rw-r--r-- jerry-core/jcontext/jcontext.h 2
-rw-r--r-- jerry-core/lit/lit-char-helpers.c 116
-rw-r--r-- jerry-core/lit/lit-char-helpers.h 12
-rw-r--r-- jerry-core/lit/lit-strings.c 6
-rw-r--r-- jerry-core/parser/js/js-lexer.c 12
-rw-r--r-- jerry-core/parser/js/js-parser.c 8
-rw-r--r-- jerry-core/parser/regexp/re-bytecode.c 696
-rw-r--r-- jerry-core/parser/regexp/re-bytecode.h 122
-rw-r--r-- jerry-core/parser/regexp/re-compiler-context.h 60
-rw-r--r-- jerry-core/parser/regexp/re-compiler.c 899
-rw-r--r-- jerry-core/parser/regexp/re-compiler.h 23
-rw-r--r-- jerry-core/parser/regexp/re-parser.c 1543
-rw-r--r-- jerry-core/parser/regexp/re-parser.h 70
-rw-r--r-- jerry-core/parser/regexp/re-token.h 72
-rw-r--r-- tests/jerry/es2015/regexp-unicode.js 361
-rw-r--r-- tests/jerry/regexp-alternatives.js 3
-rw-r--r-- tests/jerry/regexp-backreference.js 3
-rw-r--r-- tests/jerry/regexp-backtrack.js 115
-rw-r--r-- tests/jerry/regexp-capture-groups.js 9
-rw-r--r-- tests/jerry/regexp-simple-atom-and-iterations.js 3
-rw-r--r-- tests/jerry/regression-test-issue-2190.js 2
-rw-r--r-- tests/jerry/string-prototype-trim.js 2
30 files changed, 3360 insertions, 2366 deletions
diff --git a/jerry-core/api/jerry-snapshot.c b/jerry-core/api/jerry-snapshot.c
index c25cfc5c..e3f77f12 100644
--- a/jerry-core/api/jerry-snapshot.c
+++ b/jerry-core/api/jerry-snapshot.c
@@ -559,7 +559,6 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th
#if ENABLED (JERRY_BUILTIN_REGEXP)
if (!(bytecode_p->status_flags & CBC_CODE_FLAGS_FUNCTION))
{
- const re_compiled_code_t *re_bytecode_p = NULL;
const uint8_t *regex_start_p = ((const uint8_t *) bytecode_p) + sizeof (ecma_compiled_code_t);
@@ -567,10 +566,8 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th
ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p,
bytecode_p->refs);
- re_compile_bytecode (&re_bytecode_p,
- pattern_str_p,
- bytecode_p->status_flags);
-
+ const re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p,
+ bytecode_p->status_flags);
ecma_deref_ecma_string (pattern_str_p);
return (ecma_compiled_code_t *) re_bytecode_p;
diff --git a/jerry-core/ecma/base/ecma-gc.c b/jerry-core/ecma/base/ecma-gc.c
index d828598d..ee2cffed 100644
--- a/jerry-core/ecma/base/ecma-gc.c
+++ b/jerry-core/ecma/base/ecma-gc.c
@@ -1467,7 +1467,7 @@ ecma_gc_run (void)
#if ENABLED (JERRY_BUILTIN_REGEXP)
/* Free RegExp bytecodes stored in cache */
- re_cache_gc_run ();
+ re_cache_gc ();
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
} /* ecma_gc_run */
diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c
index 3cd53da9..4c94038d 100644
--- a/jerry-core/ecma/base/ecma-helpers-string.c
+++ b/jerry-core/ecma/base/ecma-helpers-string.c
@@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);
- if (!lit_char_is_white_space (ch)
- && !lit_char_is_line_terminator (ch))
+ if (!lit_char_is_white_space (ch))
{
nonws_start_p = current_p;
break;
@@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
{
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);
- if (!lit_char_is_white_space (ch)
- && !lit_char_is_line_terminator (ch))
+ if (!lit_char_is_white_space (ch))
{
break;
}
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
index 0c00244a..76f5de37 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
@@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
continue;
}
- ecma_char_t decoded_byte;
-
- if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
+ uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
+ if (hex_value == UINT32_MAX)
{
return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
}
+ ecma_char_t decoded_byte = (ecma_char_t) hex_value;
input_char_p += URI_ENCODED_BYTE_SIZE;
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
/* Input decode. */
if (*input_char_p != '%')
{
- *output_char_p = *input_char_p;
- output_char_p++;
- input_char_p++;
+ *output_char_p++ = *input_char_p++;
continue;
}
- ecma_char_t decoded_byte;
-
- if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
+ uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
+ if (hex_value == UINT32_MAX)
{
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
break;
}
+ ecma_char_t decoded_byte = (ecma_char_t) hex_value;
input_char_p += URI_ENCODED_BYTE_SIZE;
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
}
else
{
- ecma_char_t chr;
+ hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
- if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
- || ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
+ if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
{
is_valid = false;
break;
}
- octets[i] = (lit_utf8_byte_t) chr;
input_char_p += URI_ENCODED_BYTE_SIZE;
+ octets[i] = (lit_utf8_byte_t) hex_value;
}
}
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-json.c b/jerry-core/ecma/builtin-objects/ecma-builtin-json.c
index 3c329915..ea6c2f61 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.c
@@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
}
case LIT_CHAR_LOWERCASE_U:
{
- if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
+ uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
+ if (hex_value == UINT32_MAX)
{
goto invalid_string;
}
- ecma_char_t code_unit;
- if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
- {
- goto invalid_string;
- }
-
- ecma_stringbuilder_append_char (&result_builder, code_unit);
+ ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value);
current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
break;
}
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtins.c b/jerry-core/ecma/builtin-objects/ecma-builtins.c
index c173b5fe..21b26cd4 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtins.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtins.c
@@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;
- const re_compiled_code_t *bc_p = NULL;
- ecma_value_t ret_value = re_compile_bytecode (&bc_p,
- ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
- RE_FLAG_EMPTY);
+ re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
+ RE_FLAG_EMPTY);
- JERRY_ASSERT (ecma_is_value_empty (ret_value));
+ JERRY_ASSERT (bc_p != NULL);
ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);
diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c
index b2b0f275..5d6ad068 100644
--- a/jerry-core/ecma/operations/ecma-regexp-object.c
+++ b/jerry-core/ecma/operations/ecma-regexp-object.c
@@ -47,11 +47,6 @@
#define RE_GLOBAL_CAPTURE 0
/**
- * Check if a RegExp opcode is a capture group or not
- */
-#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
-
-/**
* Parse RegExp flags (global, ignoreCase, multiline)
*
* See also: ECMA-262 v5, 15.10.4.1
@@ -200,36 +195,6 @@ ecma_regexp_update_props (ecma_object_t *re_object_p, /**< RegExp object */
} /* ecma_regexp_update_props */
#endif /* !ENABLED (JERRY_ES2015) */
-#if ENABLED (JERRY_ES2015)
-/**
- * Helper function to get current code point and advance the string pointer.
- *
- * @return lit_code_point_t current code point
- */
-static lit_code_point_t
-ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */
- const lit_utf8_byte_t *end_p) /**< string end pointer */
-{
- JERRY_ASSERT (str_p != NULL);
- const lit_utf8_byte_t *current_p = *str_p;
-
- lit_code_point_t ch = lit_cesu8_read_next (&current_p);
- if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch)
- && current_p < end_p)
- {
- const ecma_char_t next_ch = lit_cesu8_peek_next (current_p);
- if (lit_is_code_point_utf16_low_surrogate (next_ch))
- {
- lit_utf8_incr (&current_p);
- ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch);
- }
- }
-
- *str_p = current_p;
- return ch;
-} /* ecma_regexp_unicode_advance */
-#endif /* ENABLED (JERRY_ES2015) */
-
/**
* RegExpAlloc method
*
@@ -379,17 +344,14 @@ ecma_op_create_regexp_from_pattern (ecma_object_t *regexp_obj_p, /**< RegExp obj
JERRY_ASSERT (ecma_is_value_empty (parse_flags_value));
}
- const re_compiled_code_t *bc_p = NULL;
- ecma_value_t ret_value = re_compile_bytecode (&bc_p, pattern_str_p, flags);
+ re_compiled_code_t *bc_p = re_compile_bytecode (pattern_str_p, flags);
- if (ECMA_IS_VALUE_ERROR (ret_value))
+ if (JERRY_UNLIKELY (bc_p == NULL))
{
ecma_deref_ecma_string (pattern_str_p);
- return ret_value;
+ return ECMA_VALUE_ERROR;
}
- JERRY_ASSERT (ecma_is_value_empty (ret_value));
-
ecma_op_regexp_initialize (regexp_obj_p, bc_p, pattern_str_p, flags);
ecma_deref_ecma_string (pattern_str_p);
@@ -437,19 +399,14 @@ ecma_op_create_regexp_with_flags (ecma_object_t *regexp_obj_p, /**< RegExp objec
return ECMA_VALUE_ERROR;
}
- const re_compiled_code_t *bc_p = NULL;
-
- ecma_value_t ret_value = re_compile_bytecode (&bc_p, pattern_str_p, flags);
-
+ re_compiled_code_t *bc_p = re_compile_bytecode (pattern_str_p, flags);
ecma_deref_ecma_string (pattern_str_p);
- if (ECMA_IS_VALUE_ERROR (ret_value))
+ if (JERRY_UNLIKELY (bc_p == NULL))
{
- return ret_value;
+ return ECMA_VALUE_ERROR;
}
- JERRY_ASSERT (ecma_is_value_empty (ret_value));
-
ecma_op_regexp_initialize (regexp_obj_p, bc_p, pattern_str_p, flags);
return ecma_make_object_value (regexp_obj_p);
@@ -461,7 +418,8 @@ ecma_op_create_regexp_with_flags (ecma_object_t *regexp_obj_p, /**< RegExp objec
* @return ecma_char_t canonicalized character
*/
lit_code_point_t
-ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */
+ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */
+ bool unicode) /**< unicode */
{
if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
{
@@ -484,21 +442,19 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */
ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH];
const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH);
- /* 3. */
if (size != 1)
{
return ch;
}
- /* 4. */
+
const ecma_char_t cu = u[0];
- /* 5. */
- if (cu >= 128)
+ if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode)
{
/* 6. */
- return cu;
+ return ch;
}
- return ch;
+ return cu;
} /* ecma_regexp_canonicalize_char */
/**
@@ -508,31 +464,159 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */
*
* @return ecma_char_t canonicalized character
*/
-inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE
+static inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE
ecma_regexp_canonicalize (lit_code_point_t ch, /**< character */
- bool is_ignorecase) /**< IgnoreCase flag */
+ uint16_t flags) /**< flags */
{
- if (is_ignorecase)
+ if (flags & RE_FLAG_IGNORE_CASE)
{
- return ecma_regexp_canonicalize_char (ch);
+ return ecma_regexp_canonicalize_char (ch, flags & RE_FLAG_UNICODE);
}
return ch;
} /* ecma_regexp_canonicalize */
/**
- * Recursive function for RegExp matching.
+ * Check if a code point is matched by a class escape.
+ *
+ * @return true, if code point matches escape
+ * false, otherwise
+ */
+static bool
+ecma_regexp_check_class_escape (lit_code_point_t cp, /**< char */
+ ecma_class_escape_t escape) /**< escape */
+{
+ switch (escape)
+ {
+ case RE_ESCAPE_DIGIT:
+ {
+ return (cp >= LIT_CHAR_0 && cp <= LIT_CHAR_9);
+ }
+ case RE_ESCAPE_NOT_DIGIT:
+ {
+ return (cp < LIT_CHAR_0 || cp > LIT_CHAR_9);
+ }
+ case RE_ESCAPE_WORD_CHAR:
+ {
+ return lit_char_is_word_char (cp);
+ }
+ case RE_ESCAPE_NOT_WORD_CHAR:
+ {
+ return !lit_char_is_word_char (cp);
+ }
+ case RE_ESCAPE_WHITESPACE:
+ {
+ return lit_char_is_white_space ((ecma_char_t) cp);
+ }
+ case RE_ESCAPE_NOT_WHITESPACE:
+ {
+ return !lit_char_is_white_space ((ecma_char_t) cp);
+ }
+ default:
+ {
+ JERRY_UNREACHABLE ();
+ }
+ }
+} /* ecma_regexp_check_class_escape */
+
+/**
+ * Helper function to get current code point or code unit depending on execution mode,
+ * and advance the string pointer.
+ *
+ * @return lit_code_point_t current code point
+ */
+static lit_code_point_t
+ecma_regexp_advance (ecma_regexp_ctx_t *re_ctx_p, /**< regexp context */
+ const lit_utf8_byte_t **str_p) /**< reference to string pointer */
+{
+ JERRY_ASSERT (str_p != NULL);
+ lit_code_point_t cp = lit_cesu8_read_next (str_p);
+
+#if ENABLED (JERRY_ES2015)
+ if (JERRY_UNLIKELY (re_ctx_p->flags & RE_FLAG_UNICODE)
+ && lit_is_code_point_utf16_high_surrogate ((ecma_char_t) cp)
+ && *str_p < re_ctx_p->input_end_p)
+ {
+ const ecma_char_t next_ch = lit_cesu8_peek_next (*str_p);
+ if (lit_is_code_point_utf16_low_surrogate (next_ch))
+ {
+ cp = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) cp, next_ch);
+ *str_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
+ }
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ return ecma_regexp_canonicalize (cp, re_ctx_p->flags);
+} /* ecma_regexp_advance */
+
+#if ENABLED (JERRY_ES2015)
+/**
+ * Helper function to get current full unicode code point and advance the string pointer.
+ *
+ * @return lit_code_point_t current code point
+ */
+lit_code_point_t
+ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */
+ const lit_utf8_byte_t *end_p) /**< string end pointer */
+{
+ JERRY_ASSERT (str_p != NULL);
+ const lit_utf8_byte_t *current_p = *str_p;
+
+ lit_code_point_t ch = lit_cesu8_read_next (&current_p);
+ if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch)
+ && current_p < end_p)
+ {
+ const ecma_char_t next_ch = lit_cesu8_peek_next (current_p);
+ if (lit_is_code_point_utf16_low_surrogate (next_ch))
+ {
+ ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch);
+ current_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
+ }
+ }
+
+ *str_p = current_p;
+ return ch;
+} /* ecma_regexp_unicode_advance */
+#endif /* ENABLED (JERRY_ES2015) */
+
+/**
+ * Helper function to revert the string pointer to the previous code point.
+ *
+ * @return pointer to previous code point
+ */
+static JERRY_ATTR_NOINLINE const lit_utf8_byte_t *
+ecma_regexp_step_back (ecma_regexp_ctx_t *re_ctx_p, /**< regexp context */
+ const lit_utf8_byte_t *str_p) /**< reference to string pointer */
+{
+ JERRY_ASSERT (str_p != NULL);
+#if ENABLED (JERRY_ES2015)
+ lit_code_point_t ch = lit_cesu8_read_prev (&str_p);
+ if (JERRY_UNLIKELY (re_ctx_p->flags & RE_FLAG_UNICODE)
+ && lit_is_code_point_utf16_low_surrogate (ch)
+ && lit_is_code_point_utf16_high_surrogate (lit_cesu8_peek_prev (str_p)))
+ {
+ str_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
+ }
+#else /* !ENABLED (JERRY_ES2015) */
+ JERRY_UNUSED (re_ctx_p);
+ lit_utf8_decr (&str_p);
+#endif /* !ENABLED (JERRY_ES2015) */
+ return str_p;
+} /* ecma_regexp_step_back */
+
+/**
+ * Recursive function for executing RegExp bytecode.
*
* See also:
* ECMA-262 v5, 15.10.2.1
*
- * @return true - if matched
- * false - otherwise
+ * @return pointer to the end of the currently matched substring
+ * NULL, if pattern did not match
*/
static const lit_utf8_byte_t *
-ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
- const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */
- const lit_utf8_byte_t *str_curr_p) /**< input string pointer */
+ecma_regexp_run (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
+ const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */
+ const lit_utf8_byte_t *str_curr_p) /**< input string pointer */
{
#if (JERRY_STACK_LIMIT != 0)
if (JERRY_UNLIKELY (ecma_get_current_stack_usage () > CONFIG_MEM_STACK_LIMIT))
@@ -541,725 +625,950 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
#endif /* JERRY_STACK_LIMIT != 0 */
+ const lit_utf8_byte_t *str_start_p = str_curr_p;
+ const uint8_t *next_alternative_p = NULL;
+
while (true)
{
- re_opcode_t op = re_get_opcode (&bc_p);
+ const re_opcode_t op = re_get_opcode (&bc_p);
switch (op)
{
- case RE_OP_MATCH:
+ case RE_OP_EOF:
+ {
+ re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p;
+ /* FALLTHRU */
+ }
+ case RE_OP_ASSERT_END:
+ case RE_OP_ITERATOR_END:
{
- JERRY_TRACE_MSG ("Execute RE_OP_MATCH: match\n");
return str_curr_p;
}
- case RE_OP_CHAR:
+ case RE_OP_ALTERNATIVE_START:
{
- if (str_curr_p >= re_ctx_p->input_end_p)
+ const uint32_t offset = re_get_value (&bc_p);
+ next_alternative_p = bc_p + offset;
+ continue;
+ }
+ case RE_OP_ALTERNATIVE_NEXT:
+ {
+ while (true)
{
- return NULL; /* fail */
- }
-
- const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
- lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. */
- lit_code_point_t ch2 = lit_cesu8_read_next (&str_curr_p);
+ const uint32_t offset = re_get_value (&bc_p);
+ bc_p += offset;
-#if ENABLED (JERRY_ES2015)
- if (re_ctx_p->flags & RE_FLAG_UNICODE
- && lit_is_code_point_utf16_high_surrogate (ch2)
- && str_curr_p < re_ctx_p->input_end_p)
- {
- const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p);
- if (lit_is_code_point_utf16_low_surrogate (next_ch))
+ if (*bc_p != RE_OP_ALTERNATIVE_NEXT)
{
- lit_utf8_incr (&str_curr_p);
- ch2 = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch2, next_ch);
+ break;
}
- }
-#endif /* ENABLED (JERRY_ES2015) */
-
- ch2 = ecma_regexp_canonicalize (ch2, is_ignorecase);
- JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2);
- if (ch1 != ch2)
- {
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ bc_p++;
}
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ continue;
}
- case RE_OP_PERIOD:
+ case RE_OP_NO_ALTERNATIVE:
{
- if (str_curr_p >= re_ctx_p->input_end_p)
- {
- return NULL; /* fail */
- }
+ return NULL;
+ }
+ case RE_OP_CAPTURING_GROUP_START:
+ {
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx;
+ group_p->subcapture_count = re_get_value (&bc_p);
- const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p);
- JERRY_TRACE_MSG ("Period matching '.' to %u: ", (unsigned int) ch);
+ const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p;
+ const lit_utf8_byte_t *const saved_end_p = group_p->end_p;
+ const uint32_t saved_iterator = group_p->iterator;
- if (lit_char_is_line_terminator (ch))
+ const uint32_t qmin = re_get_value (&bc_p);
+ group_p->end_p = NULL;
+
+ /* If zero iterations are allowed, then execute the end opcode which will handle further iterations,
+ * otherwise run the 1st iteration immediately by executing group bytecode. */
+ if (qmin == 0)
+ {
+ group_p->iterator = 0;
+ group_p->begin_p = NULL;
+ const uint32_t end_offset = re_get_value (&bc_p);
+ group_p->bc_p = bc_p;
+
+ bc_p += end_offset;
+ }
+ else
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ group_p->iterator = 1;
+ group_p->begin_p = str_curr_p;
+ group_p->bc_p = bc_p;
}
-#if ENABLED (JERRY_ES2015)
- if (re_ctx_p->flags & RE_FLAG_UNICODE
- && lit_is_code_point_utf16_high_surrogate (ch)
- && str_curr_p < re_ctx_p->input_end_p)
+ const lit_utf8_byte_t *matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+ group_p->iterator = saved_iterator;
+
+ if (matched_p == NULL)
{
- const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p);
- if (lit_is_code_point_utf16_low_surrogate (next_ch))
- {
- lit_utf8_incr (&str_curr_p);
- }
+ group_p->begin_p = saved_begin_p;
+ group_p->end_p = saved_end_p;
+ goto fail;
}
-#endif /* ENABLED (JERRY_ES2015) */
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ return matched_p;
}
- case RE_OP_ASSERT_START:
+ case RE_OP_NON_CAPTURING_GROUP_START:
{
- JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_START: ");
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx;
- if (str_curr_p <= re_ctx_p->input_start_p)
+ group_p->subcapture_start = re_get_value (&bc_p);
+ group_p->subcapture_count = re_get_value (&bc_p);
+
+ const uint32_t saved_iterator = group_p->iterator;
+ const uint32_t qmin = re_get_value (&bc_p);
+
+ /* If zero iterations are allowed, then execute the end opcode which will handle further iterations,
+ * otherwise run the 1st iteration immediately by executing group bytecode. */
+ if (qmin == 0)
{
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
- }
+ group_p->iterator = 0;
+ group_p->begin_p = NULL;
+ const uint32_t end_offset = re_get_value (&bc_p);
+ group_p->bc_p = bc_p;
- if (!(re_ctx_p->flags & RE_FLAG_MULTILINE))
+ bc_p += end_offset;
+ }
+ else
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ group_p->iterator = 1;
+ group_p->begin_p = str_curr_p;
+ group_p->bc_p = bc_p;
}
- if (lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p)))
+ const lit_utf8_byte_t *matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+ group_p->iterator = saved_iterator;
+
+ if (matched_p == NULL)
{
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ goto fail;
}
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ return matched_p;
}
- case RE_OP_ASSERT_END:
+ case RE_OP_GREEDY_CAPTURING_GROUP_END:
{
- JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_END: ");
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx;
+ const uint32_t qmin = re_get_value (&bc_p);
- if (str_curr_p >= re_ctx_p->input_end_p)
+ if (group_p->iterator < qmin)
{
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ /* No need to save begin_p since we don't have to backtrack beyond the minimum iteration count, but we have
+ * to clear nested capturing groups. */
+ group_p->begin_p = str_curr_p;
+ for (uint32_t i = 1; i < group_p->subcapture_count; ++i)
+ {
+ group_p[i].begin_p = NULL;
+ }
+
+ group_p->iterator++;
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
+
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ group_p->iterator--;
+ goto fail;
}
- if (!(re_ctx_p->flags & RE_FLAG_MULTILINE))
+ /* Empty matches are not allowed after reaching the minimum number of iterations. */
+ if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin))
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ goto fail;
}
- if (lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p)))
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
+ if (JERRY_UNLIKELY (group_p->iterator >= qmax))
{
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
- }
+ /* Reached maximum number of iterations, try to match tail bytecode. */
+ group_p->end_p = str_curr_p;
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
- }
- case RE_OP_ASSERT_WORD_BOUNDARY:
- case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
- {
- const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p)
- && lit_char_is_word_char (lit_cesu8_peek_prev (str_curr_p)));
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
- const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p)
- && lit_char_is_word_char (lit_cesu8_peek_next (str_curr_p)));
+ goto fail;
+ }
- if (op == RE_OP_ASSERT_WORD_BOUNDARY)
{
- JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_WORD_BOUNDARY: ");
- if (is_wordchar_left == is_wordchar_right)
+ /* Save and clear all nested capturing groups, and try to iterate. */
+ JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count);
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ saved_captures_p[i] = group_p[i].begin_p;
+ group_p[i].begin_p = NULL;
}
- }
- else
- {
- JERRY_ASSERT (op == RE_OP_ASSERT_NOT_WORD_BOUNDARY);
- JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_NOT_WORD_BOUNDARY: ");
- if (is_wordchar_left != is_wordchar_right)
+ group_p->iterator++;
+ group_p->begin_p = str_curr_p;
+
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
+
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ /* Failed to iterate again, backtrack to current match, and try to run tail bytecode. */
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ group_p[i].begin_p = saved_captures_p[i];
}
+
+ group_p->iterator--;
+ group_p->end_p = str_curr_p;
}
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (tail_match_p != NULL)
+ {
+ return tail_match_p;
+ }
+
+ goto fail;
}
- case RE_OP_LOOKAHEAD_POS:
- case RE_OP_LOOKAHEAD_NEG:
+ case RE_OP_GREEDY_NON_CAPTURING_GROUP_END:
{
- const lit_utf8_byte_t *matched_p = NULL;
- const size_t captures_size = re_ctx_p->captures_count * sizeof (ecma_regexp_capture_t);
- ecma_regexp_capture_t *saved_captures_p = (ecma_regexp_capture_t *) jmem_heap_alloc_block (captures_size);
- memcpy (saved_captures_p, re_ctx_p->captures_p, captures_size);
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx;
+ const uint32_t qmin = re_get_value (&bc_p);
- do
+ if (group_p->iterator < qmin)
{
- const uint32_t offset = re_get_value (&bc_p);
+ /* No need to save begin_p but we have to clear nested capturing groups. */
+ group_p->begin_p = str_curr_p;
- if (matched_p == NULL)
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start;
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
{
- matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ capture_p[i].begin_p = NULL;
+ }
- if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
- {
- jmem_heap_free_block (saved_captures_p, captures_size);
- return matched_p;
- }
+ group_p->iterator++;
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
+
+ if (matched_p != NULL)
+ {
+ return matched_p;
}
- bc_p += offset;
- }
- while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
- JERRY_TRACE_MSG ("Execute RE_OP_LOOKAHEAD_POS/NEG: ");
- if ((op == RE_OP_LOOKAHEAD_POS && matched_p != NULL)
- || (op == RE_OP_LOOKAHEAD_NEG && matched_p == NULL))
- {
- JERRY_TRACE_MSG ("match\n");
- matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ group_p->iterator--;
+ goto fail;
}
- else
+
+ /* Empty matches are not allowed after reaching the minimum number of iterations. */
+ if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin))
{
- JERRY_TRACE_MSG ("fail\n");
- matched_p = NULL; /* fail */
+ goto fail;
}
- if (matched_p == NULL)
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
+ if (JERRY_UNLIKELY (group_p->iterator >= qmax))
{
- /* restore saved */
- memcpy (re_ctx_p->captures_p, saved_captures_p, captures_size);
+ /* Reached maximum number of iterations, try to match tail bytecode. */
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ goto fail;
}
- jmem_heap_free_block (saved_captures_p, captures_size);
- return matched_p;
- }
- case RE_OP_CHAR_CLASS:
- case RE_OP_INV_CHAR_CLASS:
- {
- JERRY_TRACE_MSG ("Execute RE_OP_CHAR_CLASS/RE_OP_INV_CHAR_CLASS, ");
- if (str_curr_p >= re_ctx_p->input_end_p)
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
- }
+ /* Save and clear all nested capturing groups, and try to iterate. */
+ JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count);
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i;
+ saved_captures_p[i] = capture_p->begin_p;
+ capture_p->begin_p = NULL;
+ }
- uint32_t range_count = re_get_value (&bc_p);
- const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
- bool is_match = false;
+ group_p->iterator++;
+ const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p;
+ group_p->begin_p = str_curr_p;
-#if ENABLED (JERRY_ES2015)
- if (re_ctx_p->flags & RE_FLAG_UNICODE)
- {
- lit_code_point_t curr_ch = ecma_regexp_unicode_advance (&str_curr_p,
- re_ctx_p->input_end_p);
- curr_ch = ecma_regexp_canonicalize (curr_ch, is_ignorecase);
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
- while (range_count-- > 0)
+ if (matched_p != NULL)
{
- const lit_code_point_t ch1 = re_get_value (&bc_p);
- if (curr_ch < ch1)
- {
- bc_p += sizeof (uint32_t);
- continue;
- }
-
- const lit_code_point_t ch2 = re_get_value (&bc_p);
- is_match = (curr_ch <= ch2);
- if (is_match)
- {
- /* Skip the remaining ranges in the bytecode. */
- bc_p += range_count * 2 * sizeof (uint32_t);
- break;
- }
+ return matched_p;
}
- }
- else
- {
-#endif /* ENABLED (JERRY_ES2015) */
- const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_cesu8_read_next (&str_curr_p),
- is_ignorecase);
- while (range_count-- > 0)
+ /* Failed to iterate again, backtrack to current match, and try to run tail bytecode. */
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
{
- const ecma_char_t ch1 = re_get_char (&bc_p);
- if (curr_ch < ch1)
- {
- bc_p += sizeof (ecma_char_t);
- continue;
- }
-
- const ecma_char_t ch2 = re_get_char (&bc_p);
- is_match = (curr_ch <= ch2);
- if (is_match)
- {
- /* Skip the remaining ranges in the bytecode. */
- bc_p += range_count * 2 * sizeof (ecma_char_t);
- break;
- }
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i;
+ capture_p->begin_p = saved_captures_p[i];
}
-#if ENABLED (JERRY_ES2015)
+
+ group_p->iterator--;
+ group_p->begin_p = saved_begin_p;
}
-#endif /* ENABLED (JERRY_ES2015) */
- JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS);
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- if ((op == RE_OP_CHAR_CLASS) != is_match)
+ if (tail_match_p != NULL)
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ return tail_match_p;
}
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
+ goto fail;
}
- case RE_OP_BACKREFERENCE:
+ case RE_OP_LAZY_CAPTURING_GROUP_END:
{
- const uint32_t backref_idx = re_get_value (&bc_p);
- JERRY_TRACE_MSG ("Execute RE_OP_BACKREFERENCE (idx: %u): ", (unsigned int) backref_idx);
- JERRY_ASSERT (backref_idx >= 1 && backref_idx < re_ctx_p->captures_count);
- const ecma_regexp_capture_t capture = re_ctx_p->captures_p[backref_idx];
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_capture_t *const group_p = re_ctx_p->captures_p + group_idx;
+ const uint32_t qmin = re_get_value (&bc_p);
- if (capture.begin_p == NULL || capture.end_p == NULL)
+ if (group_p->iterator < qmin)
{
- JERRY_TRACE_MSG ("match\n");
- break; /* capture is 'undefined', always matches! */
- }
+ /* No need to save begin_p but we have to clear nested capturing groups. */
+ group_p->begin_p = str_curr_p;
+ for (uint32_t i = 1; i < group_p->subcapture_count; ++i)
+ {
+ group_p[i].begin_p = NULL;
+ }
- const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p);
+ group_p->iterator++;
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
- if (str_curr_p + capture_size > re_ctx_p->input_end_p)
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ group_p->iterator--;
+ goto fail;
+ }
+
+ /* Empty matches are not allowed after reaching the minimum number of iterations. */
+ if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin))
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ goto fail;
}
- if (memcmp (str_curr_p, capture.begin_p, capture_size))
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
+ group_p->end_p = str_curr_p;
+
+ /* Try to match tail bytecode. */
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (tail_match_p != NULL)
{
- JERRY_TRACE_MSG ("fail\n");
- return NULL; /* fail */
+ return tail_match_p;
}
- str_curr_p += capture_size;
- JERRY_TRACE_MSG ("match\n");
- break; /* tail merge */
- }
- case RE_OP_SAVE_AT_START:
- {
- JERRY_TRACE_MSG ("Execute RE_OP_SAVE_AT_START\n");
- re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p;
+ if (JERRY_UNLIKELY (group_p->iterator >= qmax))
+ {
+ /* Reached maximum number of iterations and tail bytecode did not match. */
+ goto fail;
+ }
- do
{
- const uint32_t offset = re_get_value (&bc_p);
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ /* Save and clear all nested capturing groups, and try to iterate. */
+ JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count);
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ saved_captures_p[i] = group_p[i].begin_p;
+ group_p[i].begin_p = NULL;
+ }
+
+ group_p->iterator++;
+ group_p->begin_p = str_curr_p;
+
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
if (matched_p != NULL)
{
- return matched_p; /* match */
+ return matched_p;
}
- bc_p += offset;
+ /* Backtrack to current match. */
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ group_p[i].begin_p = saved_captures_p[i];
+ }
+
+ group_p->iterator--;
}
- while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
- bc_p -= sizeof (uint8_t);
- return NULL; /* fail */
+ goto fail;
}
- case RE_OP_SAVE_AND_MATCH:
+ case RE_OP_LAZY_NON_CAPTURING_GROUP_END:
{
- JERRY_TRACE_MSG ("End of pattern is reached: match\n");
- re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p;
- return str_curr_p; /* match */
- }
- case RE_OP_ALTERNATIVE:
- {
- /*
- * Alternatives should be jumped over, when an alternative opcode appears.
- */
- uint32_t offset = re_get_value (&bc_p);
- JERRY_TRACE_MSG ("Execute RE_OP_ALTERNATIVE");
- bc_p += offset;
+ const uint32_t group_idx = re_get_value (&bc_p);
+ ecma_regexp_non_capture_t *const group_p = re_ctx_p->non_captures_p + group_idx;
+ const uint32_t qmin = re_get_value (&bc_p);
- while (*bc_p == RE_OP_ALTERNATIVE)
+ if (group_p->iterator < qmin)
{
- JERRY_TRACE_MSG (", jump: %u", (unsigned int) offset);
- bc_p++;
- offset = re_get_value (&bc_p);
- bc_p += offset;
- }
+ /* Clear nested captures. */
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start;
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ capture_p[i].begin_p = NULL;
+ }
- JERRY_TRACE_MSG ("\n");
- break; /* tail merge */
- }
- case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
- case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
- {
- /*
- * On non-greedy iterations we have to execute the bytecode
- * after the group first, if zero iteration is allowed.
- */
- const lit_utf8_byte_t *old_begin_p = NULL;
- const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group start */
- const uint32_t start_idx = re_get_value (&bc_p);
- const uint32_t offset = re_get_value (&bc_p);
+ group_p->iterator++;
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
- uint32_t *iterator_p;
- if (RE_IS_CAPTURE_GROUP (op))
- {
- JERRY_ASSERT (start_idx < re_ctx_p->captures_count);
- re_ctx_p->captures_p[start_idx].begin_p = str_curr_p;
- iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]);
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ group_p->iterator--;
+ goto fail;
}
- else
+
+ /* Empty matches are not allowed after reaching the minimum number of iterations. */
+ if (JERRY_UNLIKELY (group_p->begin_p >= str_curr_p) && (group_p->iterator > qmin))
{
- JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count);
- iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]);
+ goto fail;
}
- *iterator_p = 0;
- /* Jump all over to the end of the END opcode. */
- bc_p += offset;
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
- /* Try to match after the close paren if zero is allowed */
- const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ /* Try to match tail bytecode. */
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- if (matched_p != NULL)
+ if (tail_match_p != NULL)
{
- return str_curr_p; /* match */
+ return tail_match_p;
}
- if (RE_IS_CAPTURE_GROUP (op))
+ if (JERRY_UNLIKELY (group_p->iterator >= qmax))
{
- re_ctx_p->captures_p[start_idx].begin_p = old_begin_p;
+ /* Reached maximum number of iterations and tail bytecode did not match. */
+ goto fail;
}
- bc_p = bc_start_p;
- /* FALLTHRU */
+ {
+ /* Save and clear all nested capturing groups, and try to iterate. */
+ JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, group_p->subcapture_count);
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i;
+ saved_captures_p[i] = capture_p->begin_p;
+ capture_p->begin_p = NULL;
+ }
+
+ group_p->iterator++;
+ const lit_utf8_byte_t *const saved_begin_p = group_p->begin_p;
+ group_p->begin_p = str_curr_p;
+
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, group_p->bc_p, str_curr_p);
+
+ if (matched_p != NULL)
+ {
+ return matched_p;
+ }
+
+ /* Backtrack to current match. */
+ for (uint32_t i = 0; i < group_p->subcapture_count; ++i)
+ {
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + group_p->subcapture_start + i;
+ capture_p->begin_p = saved_captures_p[i];
+ }
+
+ group_p->iterator--;
+ group_p->begin_p = saved_begin_p;
+ }
+
+ goto fail;
}
- case RE_OP_CAPTURE_GROUP_START:
- case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
- case RE_OP_NON_CAPTURE_GROUP_START:
- case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
+ case RE_OP_GREEDY_ITERATOR:
{
- const uint8_t *bc_end_p = NULL;
- const uint32_t start_idx = re_get_value (&bc_p);
+ const uint32_t qmin = re_get_value (&bc_p);
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
+ const uint32_t end_offset = re_get_value (&bc_p);
- if (op != RE_OP_CAPTURE_GROUP_START
- && op != RE_OP_NON_CAPTURE_GROUP_START)
+ uint32_t iterator = 0;
+ while (iterator < qmin)
{
- const uint32_t offset = re_get_value (&bc_p);
- bc_end_p = bc_p + offset;
+ str_curr_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (str_curr_p == NULL)
+ {
+ goto fail;
+ }
+
+ if (ECMA_RE_STACK_LIMIT_REACHED (str_curr_p))
+ {
+ return str_curr_p;
+ }
+
+ iterator++;
}
- const lit_utf8_byte_t **group_begin_p;
- uint32_t *iterator_p;
- if (RE_IS_CAPTURE_GROUP (op))
+ while (iterator < qmax)
{
- JERRY_ASSERT (start_idx < re_ctx_p->captures_count);
- group_begin_p = &(re_ctx_p->captures_p[start_idx].begin_p);
- iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]);
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (matched_p == NULL)
+ {
+ break;
+ }
+
+ if (ECMA_RE_STACK_LIMIT_REACHED (str_curr_p))
+ {
+ return str_curr_p;
+ }
+
+ str_curr_p = matched_p;
+ iterator++;
}
- else
+
+ const uint8_t *const tail_bc_p = bc_p + end_offset;
+ while (true)
{
- JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count);
- group_begin_p = &(re_ctx_p->non_captures_p[start_idx].str_p);
- iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]);
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, tail_bc_p, str_curr_p);
+
+ if (tail_match_p != NULL)
+ {
+ return tail_match_p;
+ }
+
+ if (JERRY_UNLIKELY (iterator <= qmin))
+ {
+ goto fail;
+ }
+
+ iterator--;
+ JERRY_ASSERT (str_curr_p > re_ctx_p->input_start_p);
+ str_curr_p = ecma_regexp_step_back (re_ctx_p, str_curr_p);
}
- const lit_utf8_byte_t *const old_begin_p = *group_begin_p;
- const uint32_t old_iter_count = *iterator_p;
- *group_begin_p = str_curr_p;
- *iterator_p = 0;
+ JERRY_UNREACHABLE ();
+ }
+ case RE_OP_LAZY_ITERATOR:
+ {
+ const uint32_t qmin = re_get_value (&bc_p);
+ const uint32_t qmax = re_get_value (&bc_p) - RE_QMAX_OFFSET;
+ const uint32_t end_offset = re_get_value (&bc_p);
- do
+ uint32_t iterator = 0;
+ while (iterator < qmin)
{
- const uint32_t offset = re_get_value (&bc_p);
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ str_curr_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- if (matched_p != NULL)
+ if (str_curr_p == NULL)
{
- return matched_p; /* match */
+ goto fail;
}
- bc_p += offset;
- }
- while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
+ if (ECMA_RE_STACK_LIMIT_REACHED (str_curr_p))
+ {
+ return str_curr_p;
+ }
- bc_p -= sizeof (uint8_t);
- *iterator_p = old_iter_count;
+ iterator++;
+ }
- /* Try to match after the close paren if zero is allowed. */
- if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START
- || op == RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START)
+ const uint8_t *const tail_bc_p = bc_p + end_offset;
+ while (true)
{
- JERRY_ASSERT (bc_end_p);
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_end_p, str_curr_p);
+ const lit_utf8_byte_t *const tail_match_p = ecma_regexp_run (re_ctx_p, tail_bc_p, str_curr_p);
- if (matched_p != NULL)
+ if (tail_match_p != NULL)
+ {
+ return tail_match_p;
+ }
+
+ if (JERRY_UNLIKELY (iterator >= qmax))
+ {
+ goto fail;
+ }
+
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
+
+ if (matched_p == NULL)
{
- return matched_p; /* match */
+ goto fail;
}
+
+ if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
+ {
+ return matched_p;
+ }
+
+ iterator++;
+ str_curr_p = matched_p;
}
- *group_begin_p = old_begin_p;
- return NULL; /* fail */
+ JERRY_UNREACHABLE ();
}
- case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
- case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
+ case RE_OP_BACKREFERENCE:
{
- /*
- * On non-greedy iterations we have to execute the bytecode
- * after the group first. Try to iterate only if it fails.
- */
- const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group end */
- const uint32_t end_idx = re_get_value (&bc_p);
- const uint32_t min = re_get_value (&bc_p);
- const uint32_t max = re_get_value (&bc_p);
- re_get_value (&bc_p); /* start offset */
+ const uint32_t backref_idx = re_get_value (&bc_p);
+ JERRY_ASSERT (backref_idx >= 1 && backref_idx < re_ctx_p->captures_count);
+ const ecma_regexp_capture_t *capture_p = re_ctx_p->captures_p + backref_idx;
- const lit_utf8_byte_t **group_end_p;
- uint32_t *iterator_p;
- if (RE_IS_CAPTURE_GROUP (op))
+ if (!ECMA_RE_IS_CAPTURE_DEFINED (capture_p) || capture_p->end_p <= capture_p->begin_p)
{
- JERRY_ASSERT (end_idx < re_ctx_p->captures_count);
- group_end_p = &(re_ctx_p->captures_p[end_idx].end_p);
- iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]);
+ /* Undefined or zero length captures always match. */
+ continue;
}
- else
+
+ const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture_p->end_p - capture_p->begin_p);
+
+ if (str_curr_p + capture_size > re_ctx_p->input_end_p
+ || memcmp (str_curr_p, capture_p->begin_p, capture_size))
{
- JERRY_ASSERT (end_idx < re_ctx_p->non_captures_count);
- group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
- iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]);
+ goto fail;
}
- (*iterator_p)++;
-
- if (*iterator_p >= min && *iterator_p <= max)
+ str_curr_p += capture_size;
+ continue;
+ }
+ case RE_OP_ASSERT_LINE_START:
+ {
+ if (str_curr_p <= re_ctx_p->input_start_p)
{
- const lit_utf8_byte_t *const old_end_p = *group_end_p;
- *group_end_p = str_curr_p;
+ continue;
+ }
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ if (!(re_ctx_p->flags & RE_FLAG_MULTILINE) || !lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p)))
+ {
+ goto fail;
+ }
- if (matched_p != NULL)
- {
- return matched_p; /* match */
- }
+ continue;
+ }
+ case RE_OP_ASSERT_LINE_END:
+ {
+ if (str_curr_p >= re_ctx_p->input_end_p)
+ {
+ continue;
+ }
- *group_end_p = old_end_p;
+ if (!(re_ctx_p->flags & RE_FLAG_MULTILINE) || !lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p)))
+ {
+ goto fail;
}
- (*iterator_p)--;
- bc_p = bc_start_p;
- /* Non-greedy fails, try to iterate. */
- /* FALLTHRU */
+ continue;
}
- case RE_OP_CAPTURE_GREEDY_GROUP_END:
- case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
+ case RE_OP_ASSERT_WORD_BOUNDARY:
{
- const uint32_t end_idx = re_get_value (&bc_p);
- const uint32_t min = re_get_value (&bc_p);
- const uint32_t max = re_get_value (&bc_p);
- uint32_t offset = re_get_value (&bc_p);
-
- const lit_utf8_byte_t **group_begin_p;
- const lit_utf8_byte_t **group_end_p;
- uint32_t *iterator_p;
+ const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p)
+ && lit_char_is_word_char (str_curr_p[-1]));
- if (RE_IS_CAPTURE_GROUP (op))
+ const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p)
+ && lit_char_is_word_char (str_curr_p[0]));
+ if (is_wordchar_right == is_wordchar_left)
{
- JERRY_ASSERT (end_idx < re_ctx_p->captures_count);
- group_begin_p = &(re_ctx_p->captures_p[end_idx].begin_p);
- group_end_p = &(re_ctx_p->captures_p[end_idx].end_p);
- iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]);
+ goto fail;
}
- else
+
+ continue;
+ }
+ case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
+ {
+ const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p)
+ && lit_char_is_word_char (str_curr_p[-1]));
+
+ const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p)
+ && lit_char_is_word_char (str_curr_p[0]));
+ if (is_wordchar_right != is_wordchar_left)
{
- JERRY_ASSERT (end_idx <= re_ctx_p->non_captures_count);
- group_begin_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
- group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
- iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]);
+ goto fail;
}
- /* Check the empty iteration if the minimum number of iterations is reached. */
- if (*iterator_p >= min && str_curr_p == *group_begin_p)
+ continue;
+ }
+ case RE_OP_ASSERT_LOOKAHEAD_POS:
+ {
+ const uint8_t qmin = re_get_byte (&bc_p);
+ const uint32_t capture_start = re_get_value (&bc_p);
+ const uint32_t capture_count = re_get_value (&bc_p);
+ const uint32_t end_offset = re_get_value (&bc_p);
+
+ /* If qmin is zero, the assertion implicitly matches. */
+ if (qmin == 0)
{
- return NULL; /* fail */
+ bc_p += end_offset;
+ continue;
}
- (*iterator_p)++;
+ /* Capture end pointers might get clobbered and need to be restored after a tail match fail. */
+ JERRY_VLA (const lit_utf8_byte_t *, saved_captures_p, capture_count);
+ for (uint32_t i = 0; i < capture_count; ++i)
+ {
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + capture_start + i;
+ saved_captures_p[i] = capture_p->end_p;
+ }
- const uint8_t *const bc_start_p = bc_p; /* Save the bytecode end position of the END opcodes. */
- const lit_utf8_byte_t *const old_end_p = *group_end_p;
- *group_end_p = str_curr_p;
+ /* The first iteration will decide whether the assertion matches depending on whether
+ * the iteration matched or not. */
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- if (*iterator_p < max)
+ if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
- bc_p -= offset;
- offset = re_get_value (&bc_p);
+ return matched_p;
+ }
- const lit_utf8_byte_t *const old_begin_p = *group_begin_p;
- *group_begin_p = str_curr_p;
+ if (matched_p == NULL)
+ {
+ goto fail;
+ }
- const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ const lit_utf8_byte_t *tail_match_p = ecma_regexp_run (re_ctx_p, bc_p + end_offset, str_curr_p);
- if (matched_p != NULL)
+ if (tail_match_p == NULL)
+ {
+ for (uint32_t i = 0; i < capture_count; ++i)
{
- return matched_p; /* match */
+ ecma_regexp_capture_t *const capture_p = re_ctx_p->captures_p + capture_start + i;
+ capture_p->begin_p = NULL;
+ capture_p->end_p = saved_captures_p[i];
}
- /* Try to match alternatives if any. */
- bc_p += offset;
- while (*bc_p == RE_OP_ALTERNATIVE)
- {
- bc_p++; /* RE_OP_ALTERNATIVE */
- offset = re_get_value (&bc_p);
+ goto fail;
+ }
- *group_begin_p = str_curr_p;
+ return tail_match_p;
+ }
+ case RE_OP_ASSERT_LOOKAHEAD_NEG:
+ {
+ const uint8_t qmin = re_get_byte (&bc_p);
+ uint32_t capture_idx = re_get_value (&bc_p);
+ const uint32_t capture_count = re_get_value (&bc_p);
+ const uint32_t end_offset = re_get_value (&bc_p);
+
+ /* If qmin is zero, the assertion implicitly matches. */
+ if (qmin > 0)
+ {
+ /* The first iteration will decide whether the assertion matches depending on whether
+ * the iteration matched or not. */
+ const lit_utf8_byte_t *const matched_p = ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
- matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
+ {
+ return matched_p;
+ }
- if (matched_p != NULL)
+ if (matched_p != NULL)
+ {
+ /* Nested capturing groups inside a negative lookahead can never capture, so we clear their results. */
+ const uint32_t capture_end = capture_idx + capture_count;
+ while (capture_idx < capture_end)
{
- return matched_p; /* match */
+ re_ctx_p->captures_p[capture_idx++].begin_p = NULL;
}
- bc_p += offset;
+ goto fail;
}
-
- *group_begin_p = old_begin_p;
}
- if (*iterator_p >= min && *iterator_p <= max)
+ bc_p += end_offset;
+ continue;
+ }
+ case RE_OP_CLASS_ESCAPE:
+ {
+ if (str_curr_p >= re_ctx_p->input_end_p)
{
- /* Try to match the rest of the bytecode. */
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_start_p, str_curr_p);
+ goto fail;
+ }
- if (matched_p != NULL)
- {
- return matched_p; /* match */
- }
+ const lit_code_point_t cp = ecma_regexp_advance (re_ctx_p, &str_curr_p);
+
+ const ecma_class_escape_t escape = (ecma_class_escape_t) re_get_byte (&bc_p);
+ if (!ecma_regexp_check_class_escape (cp, escape))
+ {
+ goto fail;
}
- /* restore if fails */
- *group_end_p = old_end_p;
- (*iterator_p)--;
- return NULL; /* fail */
+ continue;
}
- case RE_OP_NON_GREEDY_ITERATOR:
+ case RE_OP_CHAR_CLASS:
{
- const uint32_t min = re_get_value (&bc_p);
- const uint32_t max = re_get_value (&bc_p);
+ if (str_curr_p >= re_ctx_p->input_end_p)
+ {
+ goto fail;
+ }
- const uint32_t offset = re_get_value (&bc_p);
- JERRY_TRACE_MSG ("Non-greedy iterator, min=%lu, max=%lu, offset=%ld\n",
- (unsigned long) min, (unsigned long) max, (long) offset);
+ uint8_t flags = re_get_byte (&bc_p);
+ uint32_t char_count = (flags & RE_CLASS_HAS_CHARS) ? re_get_value (&bc_p) : 0;
+ uint32_t range_count = (flags & RE_CLASS_HAS_RANGES) ? re_get_value (&bc_p) : 0;
+
+ const lit_code_point_t cp = ecma_regexp_advance (re_ctx_p, &str_curr_p);
- uint32_t iter_count = 0;
- while (iter_count <= max)
+ uint8_t escape_count = flags & RE_CLASS_ESCAPE_COUNT_MASK;
+ while (escape_count > 0)
{
- if (iter_count >= min)
+ escape_count--;
+ const ecma_class_escape_t escape = re_get_byte (&bc_p);
+ if (ecma_regexp_check_class_escape (cp, escape))
{
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p);
+ goto class_found;
+ }
+ }
- if (matched_p != NULL)
- {
- return matched_p; /* match */
- }
+ while (char_count > 0)
+ {
+ char_count--;
+ const lit_code_point_t curr = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE);
+ if (cp == curr)
+ {
+ goto class_found;
}
+ }
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ while (range_count > 0)
+ {
+ range_count--;
+ const lit_code_point_t begin = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE);
- if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
+ if (cp < begin)
{
- return matched_p;
+ bc_p += re_ctx_p->char_size;
+ continue;
}
- if (matched_p == NULL)
+ const lit_code_point_t end = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE);
+ if (cp <= end)
{
- break;
+ goto class_found;
}
+ }
- str_curr_p = matched_p;
- iter_count++;
+ /* Not found */
+ if (flags & RE_CLASS_INVERT)
+ {
+ continue;
+ }
+
+ goto fail;
+
+class_found:
+ if (flags & RE_CLASS_INVERT)
+ {
+ goto fail;
}
- return NULL; /* fail */
+ const uint32_t chars_size = char_count * re_ctx_p->char_size;
+ const uint32_t ranges_size = range_count * re_ctx_p->char_size * 2;
+ bc_p = bc_p + escape_count + chars_size + ranges_size;
+ continue;
}
- default:
+#if ENABLED (JERRY_ES2015)
+ case RE_OP_UNICODE_PERIOD:
{
- JERRY_ASSERT (op == RE_OP_GREEDY_ITERATOR);
-
- const uint32_t min = re_get_value (&bc_p);
- const uint32_t max = re_get_value (&bc_p);
+ if (str_curr_p >= re_ctx_p->input_end_p)
+ {
+ goto fail;
+ }
- const uint32_t offset = re_get_value (&bc_p);
- JERRY_TRACE_MSG ("Greedy iterator, min=%lu, max=%lu, offset=%ld\n",
- (unsigned long) min, (unsigned long) max, (long) offset);
+ const lit_code_point_t cp = ecma_regexp_unicode_advance (&str_curr_p, re_ctx_p->input_end_p);
- uint32_t iter_count = 0;
- while (iter_count < max)
+ if (JERRY_UNLIKELY (cp <= LIT_UTF16_CODE_UNIT_MAX && lit_char_is_line_terminator ((ecma_char_t) cp)))
{
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
+ goto fail;
+ }
- if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
- {
- return matched_p;
- }
+ continue;
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+ case RE_OP_PERIOD:
+ {
+ if (str_curr_p >= re_ctx_p->input_end_p)
+ {
+ goto fail;
+ }
- if (matched_p == NULL)
- {
- break;
- }
+ const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p);
- str_curr_p = matched_p;
- iter_count++;
+ if (lit_char_is_line_terminator (ch))
+ {
+ goto fail;
}
- if (iter_count >= min)
+ continue;
+ }
+ case RE_OP_CHAR:
+ {
+ if (str_curr_p >= re_ctx_p->input_end_p)
{
- while (true)
- {
- const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p);
+ goto fail;
+ }
- if (matched_p != NULL)
- {
- return matched_p; /* match */
- }
+ const lit_code_point_t ch1 = re_get_char (&bc_p, re_ctx_p->flags & RE_FLAG_UNICODE);
+ const lit_code_point_t ch2 = ecma_regexp_advance (re_ctx_p, &str_curr_p);
- if (iter_count == min)
- {
- break;
- }
+ if (ch1 != ch2)
+ {
+ goto fail;
+ }
- lit_cesu8_read_prev (&str_curr_p);
- iter_count--;
- }
+ continue;
+ }
+ default:
+ {
+ JERRY_ASSERT (op == RE_OP_BYTE);
+
+ if (str_curr_p >= re_ctx_p->input_end_p
+ || *bc_p++ != *str_curr_p++)
+ {
+ goto fail;
}
- return NULL; /* fail */
+ continue;
}
}
+
+ JERRY_UNREACHABLE ();
+fail:
+ bc_p = next_alternative_p;
+
+ if (bc_p == NULL || *bc_p++ != RE_OP_ALTERNATIVE_NEXT)
+ {
+ /* None of the alternatives matched. */
+ return NULL;
+ }
+
+ /* Get the end of the new alternative and continue execution. */
+ str_curr_p = str_start_p;
+ const uint32_t offset = re_get_value (&bc_p);
+ next_alternative_p = bc_p + offset;
+ }
+} /* ecma_regexp_run */
+
+/**
+ * Match a RegExp at a specific position in the input string.
+ *
+ * @return pointer to the end of the matched sub-string
+ * NULL, if pattern did not match
+ */
+static const lit_utf8_byte_t *
+ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
+ const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */
+ const lit_utf8_byte_t *str_curr_p) /**< input string pointer */
+{
+ re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p;
+
+ for (uint32_t i = 1; i < re_ctx_p->captures_count; ++i)
+ {
+ re_ctx_p->captures_p[i].begin_p = NULL;
}
+
+ return ecma_regexp_run (re_ctx_p, bc_p, str_curr_p);
} /* ecma_regexp_match */
/*
@@ -1273,6 +1582,7 @@ ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p) /**
{
if (ECMA_RE_IS_CAPTURE_DEFINED (capture_p))
{
+ JERRY_ASSERT (capture_p->end_p >= capture_p->begin_p);
const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture_p->end_p - capture_p->begin_p);
ecma_string_t *const capture_str_p = ecma_new_ecma_string_from_utf8 (capture_p->begin_p, capture_size);
return ecma_make_string_value (capture_str_p);
@@ -1331,20 +1641,21 @@ ecma_regexp_initialize_context (ecma_regexp_ctx_t *ctx_p, /**< regexp context */
JERRY_ASSERT (input_start_p != NULL);
JERRY_ASSERT (input_end_p >= input_start_p);
+ ctx_p->flags = bc_p->header.status_flags;
+ ctx_p->char_size = (ctx_p->flags & RE_FLAG_UNICODE) ? sizeof (lit_code_point_t) : sizeof (ecma_char_t);
+
ctx_p->input_start_p = input_start_p;
ctx_p->input_end_p = input_end_p;
ctx_p->captures_count = bc_p->captures_count;
- ctx_p->captures_p = jmem_heap_alloc_block (ctx_p->captures_count * sizeof (ecma_regexp_capture_t));
- memset (ctx_p->captures_p, 0, ctx_p->captures_count * sizeof (ecma_regexp_capture_t));
-
ctx_p->non_captures_count = bc_p->non_captures_count;
- ctx_p->non_captures_p = jmem_heap_alloc_block (ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t));
- memset (ctx_p->non_captures_p, 0, ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t));
- const uint32_t iters_length = ctx_p->captures_count + ctx_p->non_captures_count - 1;
- ctx_p->iterations_p = jmem_heap_alloc_block (iters_length * sizeof (uint32_t));
- memset (ctx_p->iterations_p, 0, iters_length * sizeof (uint32_t));
+ ctx_p->captures_p = jmem_heap_alloc_block (ctx_p->captures_count * sizeof (ecma_regexp_capture_t));
+
+ if (ctx_p->non_captures_count > 0)
+ {
+ ctx_p->non_captures_p = jmem_heap_alloc_block (ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t));
+ }
} /* ecma_regexp_initialize_context */
/**
@@ -1355,15 +1666,11 @@ ecma_regexp_cleanup_context (ecma_regexp_ctx_t *ctx_p) /**< regexp context */
{
JERRY_ASSERT (ctx_p != NULL);
jmem_heap_free_block (ctx_p->captures_p, ctx_p->captures_count * sizeof (ecma_regexp_capture_t));
- if (ctx_p->non_captures_p != NULL)
+
+ if (ctx_p->non_captures_count > 0)
{
jmem_heap_free_block (ctx_p->non_captures_p, ctx_p->non_captures_count * sizeof (ecma_regexp_non_capture_t));
}
- if (ctx_p->iterations_p != NULL)
- {
- const uint32_t iters_length = ctx_p->captures_count + ctx_p->non_captures_count - 1;
- jmem_heap_free_block (ctx_p->iterations_p, iters_length * sizeof (uint32_t));
- }
} /* ecma_regexp_cleanup_context */
/**
@@ -1391,8 +1698,6 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */
re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t,
ext_object_p->u.class_prop.u.value);
- ecma_regexp_ctx_t re_ctx;
- re_ctx.flags = bc_p->header.status_flags;
lit_utf8_size_t input_size;
lit_utf8_size_t input_length;
uint8_t input_flags = ECMA_STRING_FLAG_IS_ASCII;
@@ -1404,7 +1709,7 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */
const lit_utf8_byte_t *input_curr_p = input_buffer_p;
uint32_t index = 0;
- if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY))
+ if (bc_p->header.status_flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY))
{
ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL);
ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p);
@@ -1464,6 +1769,7 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */
}
const lit_utf8_byte_t *input_end_p = input_buffer_p + input_size;
+ ecma_regexp_ctx_t re_ctx;
ecma_regexp_initialize_context (&re_ctx,
bc_p,
input_buffer_p,
@@ -1473,8 +1779,6 @@ ecma_regexp_exec_helper (ecma_object_t *regexp_object_p, /**< RegExp object */
uint8_t *bc_start_p = (uint8_t *) (bc_p + 1);
const lit_utf8_byte_t *matched_p = NULL;
- JERRY_TRACE_MSG ("Exec with flags [%x]\n", re_ctx.flags);
-
JERRY_ASSERT (index <= input_length);
while (true)
{
@@ -2077,7 +2381,6 @@ cleanup_string:
const lit_utf8_byte_t *const string_end_p = string_buffer_p + string_size;
ecma_regexp_ctx_t re_ctx;
- re_ctx.flags = bc_p->header.status_flags;
ecma_regexp_initialize_context (&re_ctx,
bc_p,
string_buffer_p,
@@ -2112,7 +2415,6 @@ cleanup_string:
while (current_str_p < string_end_p)
{
/* 13.a. */
- memset (re_ctx.captures_p, 0, re_ctx.captures_count);
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (&re_ctx, bc_start_p, current_str_p);
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
@@ -2223,8 +2525,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
{
JERRY_ASSERT (bc_p != NULL);
ecma_value_t result = ECMA_VALUE_EMPTY;
- ecma_regexp_ctx_t re_ctx;
- re_ctx.flags = bc_p->header.status_flags;
uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII;
lit_utf8_size_t string_length;
@@ -2260,6 +2560,7 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
}
#endif /* ENABLED (JERRY_ES2015) */
+ ecma_regexp_ctx_t re_ctx;
ecma_regexp_initialize_context (&re_ctx,
bc_p,
ctx_p->string_p,
@@ -2271,7 +2572,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
while (true)
{
- memset (re_ctx.captures_p, 0, re_ctx.captures_count);
matched_p = ecma_regexp_match (&re_ctx, bc_start_p, current_p);
if (matched_p != NULL)
diff --git a/jerry-core/ecma/operations/ecma-regexp-object.h b/jerry-core/ecma/operations/ecma-regexp-object.h
index bea4c092..fdc9971c 100644
--- a/jerry-core/ecma/operations/ecma-regexp-object.h
+++ b/jerry-core/ecma/operations/ecma-regexp-object.h
@@ -44,30 +44,73 @@ typedef enum
} ecma_regexp_flags_t;
/**
- * Structure for storing capturing group results
+ * Class escapes
*/
-typedef struct
+typedef enum
{
- const lit_utf8_byte_t *begin_p; /**< substring start pointer */
- const lit_utf8_byte_t *end_p; /**< substring end pointer */
-} ecma_regexp_capture_t;
+ RE_ESCAPE__START, /**< escapes start */
+ RE_ESCAPE_DIGIT = RE_ESCAPE__START, /**< digit */
+ RE_ESCAPE_NOT_DIGIT, /**< not digit */
+ RE_ESCAPE_WORD_CHAR, /**< word char */
+ RE_ESCAPE_NOT_WORD_CHAR, /**< not word char */
+ RE_ESCAPE_WHITESPACE, /**< whitespace */
+ RE_ESCAPE_NOT_WHITESPACE, /**< not whitespace */
+ RE_ESCAPE__COUNT, /**< escape count */
+} ecma_class_escape_t;
/**
- * Check if an ecma_regexp_capture_t contains a defined capture
+ * Character class flags escape count mask size.
*/
-#define ECMA_RE_IS_CAPTURE_DEFINED(c) ((c)->begin_p != NULL && (c)->end_p >= (c)->begin_p)
+#define RE_CLASS_ESCAPE_COUNT_MASK_SIZE (3u)
-ecma_value_t
-ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p);
+/**
+ * Character class flags escape count mask.
+ */
+#define RE_CLASS_ESCAPE_COUNT_MASK ((1 << RE_CLASS_ESCAPE_COUNT_MASK_SIZE) - 1u)
/**
- * Structure for storing non-capturing group results
+ * Character class flags that are present in the upper bits of the class flags byte, while the 3 least significant bits
+ * hold a value that contains the number of class escapes present in the character class.
+ */
+typedef enum
+{
+ RE_CLASS_HAS_CHARS = (1 << 5), /**< contains individual characters */
+ RE_CLASS_HAS_RANGES = (1 << 6), /**< contains character ranges */
+ RE_CLASS_INVERT = (1 << 7), /**< inverted */
+} ecma_char_class_flags_t;
+
+/**
+ * Structure for matching capturing groups and storing their result
*/
typedef struct
{
- const lit_utf8_byte_t *str_p; /**< string pointer */
+ const lit_utf8_byte_t *begin_p; /**< capture start pointer */
+ const lit_utf8_byte_t *end_p; /**< capture end pointer */
+ const uint8_t *bc_p; /**< group bytecode pointer */
+ uint32_t iterator; /**< iteration counter */
+ uint32_t subcapture_count; /**< number of nested capturing groups */
+} ecma_regexp_capture_t;
+
+/**
+ * Structure for matching non-capturing groups
+ */
+typedef struct
+{
+ const lit_utf8_byte_t *begin_p; /**< substring start pointer */
+ const uint8_t *bc_p; /**< group bytecode pointer */
+ uint32_t iterator; /**< iteration counter */
+ uint32_t subcapture_start; /**< first nested capturing group index */
+ uint32_t subcapture_count; /**< number of nested capturing groups */
} ecma_regexp_non_capture_t;
+/**
+ * Check if an ecma_regexp_capture_t contains a defined capture
+ */
+#define ECMA_RE_IS_CAPTURE_DEFINED(c) ((c)->begin_p != NULL)
+
+ecma_value_t
+ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p);
+
#if (JERRY_STACK_LIMIT != 0)
/**
* Value used ase result when stack limit is reached
@@ -83,26 +126,37 @@ typedef struct
#endif /* JERRY_STACK_LIMIT != 0 */
/**
+ * Offset applied to qmax when encoded into the bytecode.
+ *
+ * It's common for qmax to be Infinity, which is represented as UINT32_MAX. By applying the offset we are able to store
+ * it in a single byte as zero.
+ */
+#define RE_QMAX_OFFSET 1
+
+/**
* RegExp executor context
*/
typedef struct
{
- const lit_utf8_byte_t *input_end_p; /**< end of input string */
const lit_utf8_byte_t *input_start_p; /**< start of input string */
+ const lit_utf8_byte_t *input_end_p; /**< end of input string */
uint32_t captures_count; /**< number of capture groups */
- ecma_regexp_capture_t *captures_p; /**< capturing groups */
uint32_t non_captures_count; /**< number of non-capture groups */
+ ecma_regexp_capture_t *captures_p; /**< capturing groups */
ecma_regexp_non_capture_t *non_captures_p; /**< non-capturing groups */
- uint32_t *iterations_p; /**< number of iterations */
uint16_t flags; /**< RegExp flags */
+ uint8_t char_size; /**< size of encoded characters */
} ecma_regexp_ctx_t;
+#if ENABLED (JERRY_ES2015)
+lit_code_point_t ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, const lit_utf8_byte_t *end_p);
+#endif /* ENABLED (JERRY_ES2015) */
+
ecma_object_t *ecma_op_regexp_alloc (ecma_object_t *new_target_obj_p);
ecma_value_t ecma_regexp_exec_helper (ecma_object_t *regexp_object_p,
ecma_string_t *input_string_p);
ecma_string_t *ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg);
-lit_code_point_t ecma_regexp_canonicalize (lit_code_point_t ch, bool is_ignorecase);
-lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch);
+lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch, bool unicode);
ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p);
void ecma_regexp_create_and_initialize_props (ecma_object_t *re_object_p,
ecma_string_t *source_p,
diff --git a/jerry-core/jcontext/jcontext.h b/jerry-core/jcontext/jcontext.h
index 698450f4..fd655651 100644
--- a/jerry-core/jcontext/jcontext.h
+++ b/jerry-core/jcontext/jcontext.h
@@ -127,7 +127,7 @@ struct jerry_context_t
/* Update JERRY_CONTEXT_FIRST_MEMBER if the first non-external member changes */
jmem_cpointer_t ecma_builtin_objects[ECMA_BUILTIN_ID__COUNT]; /**< pointer to instances of built-in objects */
#if ENABLED (JERRY_BUILTIN_REGEXP)
- const re_compiled_code_t *re_cache[RE_CACHE_SIZE]; /**< regex cache */
+ re_compiled_code_t *re_cache[RE_CACHE_SIZE]; /**< regex cache */
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
jmem_cpointer_t ecma_gc_objects_cp; /**< List of currently alive objects. */
jmem_heap_free_t *jmem_heap_list_skip_p; /**< This is used to speed up deallocation. */
diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c
index 74c235cb..90606d32 100644
--- a/jerry-core/lit/lit-char-helpers.c
+++ b/jerry-core/lit/lit-char-helpers.c
@@ -103,31 +103,32 @@ search_char_in_interval_array (ecma_char_t c, /**< code unit */
} /* search_char_in_interval_array */
/**
- * Check if specified character is one of the Whitespace characters including those
- * that fall into "Space, Separator" ("Zs") Unicode character category.
+ * Check if specified character is one of the Whitespace characters including those that fall into
+ * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
*
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
* false - otherwise
*/
bool
-lit_char_is_white_space (ecma_char_t c) /**< code unit */
+lit_char_is_white_space (lit_code_point_t c) /**< code point */
{
if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
{
- return (c == LIT_CHAR_TAB
- || c == LIT_CHAR_VTAB
- || c == LIT_CHAR_FF
- || c == LIT_CHAR_SP);
+ return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
}
else
{
- return (c == LIT_CHAR_NBSP
- || c == LIT_CHAR_BOM
- || (c >= lit_unicode_separator_char_interval_sps[0]
- && c <= lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
- || search_char_in_char_array (c,
- lit_unicode_separator_chars,
- NUM_OF_ELEMENTS (lit_unicode_separator_chars)));
+ if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
+ {
+ return true;
+ }
+
+ return (c <= LIT_UTF16_CODE_UNIT_MAX
+ && ((c >= lit_unicode_separator_char_interval_sps[0]
+ && c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
+ || search_char_in_char_array ((ecma_char_t) c,
+ lit_unicode_separator_chars,
+ NUM_OF_ELEMENTS (lit_unicode_separator_chars))));
}
} /* lit_char_is_white_space */
@@ -429,51 +430,72 @@ lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
} /* lit_four_byte_utf8_char_to_cesu8 */
/**
- * Parse the next number_of_characters hexadecimal character,
- * and construct a code unit from them. The buffer must
- * be zero terminated.
+ * Lookup hex digits in a buffer
*
- * @return true if decoding was successful, false otherwise
+ * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
+ * value of hex number, otherwise
*/
-bool
-lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
- lit_utf8_size_t number_of_characters, /**< number of characters to be read */
- ecma_char_t *out_code_unit_p) /**< [out] decoded result */
+uint32_t
+lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
+ const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
+ uint32_t lookup) /**< size of lookup */
{
- ecma_char_t code_unit = LIT_CHAR_NULL;
+ JERRY_ASSERT (lookup <= 4);
- JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);
-
- for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
+ if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
{
- code_unit = (ecma_char_t) (code_unit << 4u);
+ return UINT32_MAX;
+ }
- if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
- && *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
- {
- code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
- }
- else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
- && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
- {
- code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
- }
- else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
- && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
+ uint32_t value = 0;
+
+ while (lookup--)
+ {
+ lit_utf8_byte_t ch = *buf_p++;
+ if (!lit_char_is_hex_digit (ch))
{
- code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
+ return UINT32_MAX;
}
- else
+
+ value <<= 4;
+ value += lit_char_hex_to_int (ch);
+ }
+
+ JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
+ return value;
+} /* lit_char_hex_lookup */
+
+/**
+ * Parse a decimal number with the value clamped to UINT32_MAX.
+ *
+ * @return parsed value, clamped to UINT32_MAX on overflow
+ */
+uint32_t
+lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
+ const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
+{
+ const lit_utf8_byte_t *current_p = *buffer_p;
+ JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
+
+ uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
+
+ while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
+ {
+ const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
+ uint32_t new_value = value * 10 + digit;
+
+ if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
{
- return false;
+ value = UINT32_MAX;
+ continue;
}
- buf_p++;
+ value = new_value;
}
- *out_code_unit_p = code_unit;
- return true;
-} /* lit_read_code_unit_from_hex */
+ *buffer_p = current_p;
+ return value;
+} /* lit_parse_decimal */
/**
* Check if specified character is a word character (part of IsWordChar abstract operation)
@@ -484,7 +506,7 @@ lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with char
* false - otherwise
*/
bool
-lit_char_is_word_char (ecma_char_t c) /**< code unit */
+lit_char_is_word_char (lit_code_point_t c) /**< code point */
{
return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
|| (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h
index e6dbe6c5..3ad25b7f 100644
--- a/jerry-core/lit/lit-char-helpers.h
+++ b/jerry-core/lit/lit-char-helpers.h
@@ -18,8 +18,6 @@
#include "lit-globals.h"
-#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */
-
/*
* Format control characters (ECMA-262 v5, Table 1)
*/
@@ -37,7 +35,7 @@
#define LIT_CHAR_NBSP ((ecma_char_t) 0x00A0) /* no-break space */
/* LIT_CHAR_BOM is defined above */
-bool lit_char_is_white_space (ecma_char_t c);
+bool lit_char_is_white_space (lit_code_point_t c);
/*
* Line terminator characters (ECMA-262 v5, Table 3)
@@ -219,10 +217,8 @@ uint32_t lit_char_hex_to_int (ecma_char_t c);
size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point);
size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point);
void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p);
-
-/* read a hex encoded code point from a zero terminated buffer */
-bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters,
- ecma_char_t *out_code_unit_p);
+uint32_t lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, const lit_utf8_byte_t *const buf_end_p, uint32_t lookup);
+uint32_t lit_parse_decimal (const lit_utf8_byte_t **buffer_p, const lit_utf8_byte_t *const buffer_end_p);
/**
* Null character
@@ -232,7 +228,7 @@ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t
/*
* Part of IsWordChar abstract operation (ECMA-262 v5, 15.10.2.6, step 3)
*/
-bool lit_char_is_word_char (ecma_char_t c);
+bool lit_char_is_word_char (lit_code_point_t c);
/*
* Utility functions for uppercasing / lowercasing
diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c
index 6f3b2ca0..c2fbb35f 100644
--- a/jerry-core/lit/lit-strings.c
+++ b/jerry-core/lit/lit-strings.c
@@ -513,7 +513,7 @@ lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with ch
*
* @return next code unit
*/
-ecma_char_t
+ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
{
JERRY_ASSERT (buf_p != NULL);
@@ -529,7 +529,7 @@ lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha
*
* @return previous code unit
*/
-ecma_char_t
+ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
{
JERRY_ASSERT (buf_p != NULL);
@@ -543,7 +543,7 @@ lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha
/**
* Increase cesu-8 encoded string pointer by one code unit.
*/
-void
+inline void JERRY_ATTR_ALWAYS_INLINE
lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
{
JERRY_ASSERT (*buf_p);
diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c
index d327fb32..2655d711 100644
--- a/jerry-core/parser/js/js-lexer.c
+++ b/jerry-core/parser/js/js-lexer.c
@@ -2847,9 +2847,6 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */
context_p->literal_count++;
/* Compile the RegExp literal and store the RegExp bytecode pointer */
- const re_compiled_code_t *re_bytecode_p = NULL;
- ecma_value_t completion_value;
-
ecma_string_t *pattern_str_p = NULL;
if (lit_is_valid_cesu8_string (regex_start_p, length))
@@ -2862,19 +2859,14 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */
pattern_str_p = ecma_new_ecma_string_from_utf8_converted_to_cesu8 (regex_start_p, length);
}
- completion_value = re_compile_bytecode (&re_bytecode_p,
- pattern_str_p,
- current_flags);
+ re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p, current_flags);
ecma_deref_ecma_string (pattern_str_p);
- if (ECMA_IS_VALUE_ERROR (completion_value))
+ if (JERRY_UNLIKELY (re_bytecode_p == NULL))
{
- jcontext_release_exception ();
parser_raise_error (context_p, PARSER_ERR_INVALID_REGEXP);
}
- ecma_free_value (completion_value);
-
literal_p->type = LEXER_REGEXP_LITERAL;
literal_p->u.bytecode_p = (ecma_compiled_code_t *) re_bytecode_p;
diff --git a/jerry-core/parser/js/js-parser.c b/jerry-core/parser/js/js-parser.c
index 3dd23f00..519440b5 100644
--- a/jerry-core/parser/js/js-parser.c
+++ b/jerry-core/parser/js/js-parser.c
@@ -2723,6 +2723,14 @@ parser_parse_script (const uint8_t *arg_list_p, /**< function argument list */
jcontext_raise_exception (ECMA_VALUE_NULL);
return ECMA_VALUE_ERROR;
}
+
+ if (parser_error.error == PARSER_ERR_INVALID_REGEXP)
+ {
+ /* The RegExp compiler has already raised an exception. */
+ JERRY_ASSERT (jcontext_has_pending_exception ());
+ return ECMA_VALUE_ERROR;
+ }
+
#if ENABLED (JERRY_ERROR_MESSAGES)
const lit_utf8_byte_t *err_bytes_p = (const lit_utf8_byte_t *) parser_error_to_string (parser_error.error);
lit_utf8_size_t err_bytes_size = lit_zt_utf8_string_size (err_bytes_p);
diff --git a/jerry-core/parser/regexp/re-bytecode.c b/jerry-core/parser/regexp/re-bytecode.c
index 1722f0c2..151d1baa 100644
--- a/jerry-core/parser/regexp/re-bytecode.c
+++ b/jerry-core/parser/regexp/re-bytecode.c
@@ -14,8 +14,9 @@
*/
#include "ecma-globals.h"
-#include "re-bytecode.h"
#include "ecma-regexp-object.h"
+#include "lit-strings.h"
+#include "re-bytecode.h"
#if ENABLED (JERRY_BUILTIN_REGEXP)
@@ -29,135 +30,103 @@
* @{
*/
-/**
- * Size of block of RegExp bytecode. Used for allocation
- *
- * @return pointer to the RegExp compiled code header
- */
-#define REGEXP_BYTECODE_BLOCK_SIZE 8UL
-
void
-re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
+re_initialize_regexp_bytecode (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */
{
- const size_t initial_size = JERRY_ALIGNUP (REGEXP_BYTECODE_BLOCK_SIZE + sizeof (re_compiled_code_t), JMEM_ALIGNMENT);
- bc_ctx_p->block_start_p = jmem_heap_alloc_block (initial_size);
- bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + initial_size;
- bc_ctx_p->current_p = bc_ctx_p->block_start_p + sizeof (re_compiled_code_t);
+ const size_t initial_size = sizeof (re_compiled_code_t);
+ re_ctx_p->bytecode_start_p = jmem_heap_alloc_block (initial_size);
+ re_ctx_p->bytecode_size = initial_size;
} /* re_initialize_regexp_bytecode */
-/**
- * Realloc the bytecode container
- *
- * @return current position in RegExp bytecode
- */
-static uint8_t *
-re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
+inline uint32_t JERRY_ATTR_ALWAYS_INLINE
+re_bytecode_size (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */
{
- JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p);
- const size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
-
- /* If one of the members of RegExp bytecode context is NULL, then all member should be NULL
- * (it means first allocation), otherwise all of the members should be a non NULL pointer. */
- JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p)
- || (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p));
-
- const size_t new_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE;
- JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p);
- const size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p);
-
- bc_ctx_p->block_start_p = jmem_heap_realloc_block (bc_ctx_p->block_start_p,
- old_size,
- new_size);
- bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + new_size;
- bc_ctx_p->current_p = bc_ctx_p->block_start_p + current_ptr_offset;
-
- return bc_ctx_p->current_p;
-} /* re_realloc_regexp_bytecode_block */
+ return (uint32_t) re_ctx_p->bytecode_size;
+} /* re_bytecode_size */
/**
* Append a new bytecode to the and of the bytecode container
*/
static uint8_t *
-re_bytecode_reserve (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
+re_bytecode_reserve (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
const size_t size) /**< size */
{
- JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE);
-
- uint8_t *current_p = bc_ctx_p->current_p;
- if (current_p + size > bc_ctx_p->block_end_p)
- {
- current_p = re_realloc_regexp_bytecode_block (bc_ctx_p);
- }
-
- bc_ctx_p->current_p += size;
- return current_p;
+ const size_t old_size = re_ctx_p->bytecode_size;
+ const size_t new_size = old_size + size;
+ re_ctx_p->bytecode_start_p = jmem_heap_realloc_block (re_ctx_p->bytecode_start_p, old_size, new_size);
+ re_ctx_p->bytecode_size = new_size;
+ return re_ctx_p->bytecode_start_p + old_size;
} /* re_bytecode_reserve */
/**
* Insert a new bytecode to the bytecode container
*/
-static void
-re_bytecode_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
+static uint8_t *
+re_bytecode_insert (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
const size_t offset, /**< distance from the start of the container */
const size_t size) /**< size */
{
- JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE);
-
- uint8_t *current_p = bc_ctx_p->current_p;
- if (current_p + size > bc_ctx_p->block_end_p)
- {
- re_realloc_regexp_bytecode_block (bc_ctx_p);
- }
+ const size_t tail_size = re_ctx_p->bytecode_size - offset;
+ re_bytecode_reserve (re_ctx_p, size);
- uint8_t *dest_p = bc_ctx_p->block_start_p + offset;
- const size_t bytecode_length = re_get_bytecode_length (bc_ctx_p);
- if (bytecode_length - offset > 0)
- {
- memmove (dest_p + size, dest_p, bytecode_length - offset);
- }
+ uint8_t *dest_p = re_ctx_p->bytecode_start_p + offset;
+ memmove (dest_p + size, dest_p, tail_size);
- bc_ctx_p->current_p += size;
+ return dest_p;
} /* re_bytecode_insert */
/**
- * Encode ecma_char_t into bytecode
+ * Append a byte
*/
-static void
-re_encode_char (uint8_t *dest_p, /**< destination */
- const ecma_char_t c) /**< character */
+void
+re_append_byte (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint8_t byte) /**< byte value */
{
- *dest_p++ = (uint8_t) ((c >> 8) & 0xFF);
- *dest_p = (uint8_t) (c & 0xFF);
-} /* re_encode_char */
+ uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, sizeof (uint8_t));
+ *dest_p = byte;
+} /* re_append_byte */
/**
- * Encode uint32_t into bytecode
+ * Insert a byte value
*/
-static void
-re_encode_u32 (uint8_t *dest_p, /**< destination */
- const uint32_t u) /**< uint32 value */
+void
+re_insert_byte (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint32_t offset, /**< distance from the start of the container */
+ const uint8_t byte) /**< byte value */
{
- *dest_p++ = (uint8_t) ((u >> 24) & 0xFF);
- *dest_p++ = (uint8_t) ((u >> 16) & 0xFF);
- *dest_p++ = (uint8_t) ((u >> 8) & 0xFF);
- *dest_p = (uint8_t) (u & 0xFF);
-} /* re_encode_u32 */
+ uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, sizeof (uint8_t));
+ *dest_p = byte;
+} /* re_insert_byte */
/**
- * Get a character from the RegExp bytecode and increase the bytecode position
- *
- * @return ecma character
+ * Get a single byte and increase bytecode position.
*/
-inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE
-re_get_char (const uint8_t **bc_p) /**< pointer to bytecode start */
+inline uint8_t JERRY_ATTR_ALWAYS_INLINE
+re_get_byte (const uint8_t **bc_p) /**< pointer to bytecode start */
{
- const uint8_t *src_p = *bc_p;
- ecma_char_t chr = (ecma_char_t) *src_p++;
- chr = (ecma_char_t) (chr << 8);
- chr = (ecma_char_t) (chr | *src_p);
- (*bc_p) += sizeof (ecma_char_t);
- return chr;
-} /* re_get_char */
+ return *((*bc_p)++);
+} /* re_get_byte */
+
+/**
+ * Append a RegExp opcode
+ */
+inline void JERRY_ATTR_ALWAYS_INLINE
+re_append_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const re_opcode_t opcode) /**< input opcode */
+{
+ re_append_byte (re_ctx_p, (uint8_t) opcode);
+} /* re_append_opcode */
+
+/**
+ * Insert a RegExp opcode
+ */
+inline void JERRY_ATTR_ALWAYS_INLINE
+re_insert_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint32_t offset, /**< distance from the start of the container */
+ const re_opcode_t opcode) /**< input opcode */
+{
+ re_insert_byte (re_ctx_p, offset, (uint8_t) opcode);
+} /* re_insert_opcode */
/**
* Get a RegExp opcode and increase the bytecode position
@@ -167,318 +136,497 @@ re_get_char (const uint8_t **bc_p) /**< pointer to bytecode start */
inline re_opcode_t JERRY_ATTR_ALWAYS_INLINE
re_get_opcode (const uint8_t **bc_p) /**< pointer to bytecode start */
{
- return (re_opcode_t) *((*bc_p)++);
+ return (re_opcode_t) re_get_byte (bc_p);
} /* re_get_opcode */
/**
- * Get a parameter of a RegExp opcode and increase the bytecode position
+ * Encode 2 byte unsigned integer into the bytecode
+ */
+static void
+re_encode_u16 (uint8_t *dest_p, /**< destination */
+ const uint16_t value) /**< value */
+{
+ *dest_p++ = (uint8_t) ((value >> 8) & 0xFF);
+ *dest_p = (uint8_t) (value & 0xFF);
+} /* re_encode_u16 */
+
+/**
+ * Encode 4 byte unsigned integer into the bytecode
+ */
+static void
+re_encode_u32 (uint8_t *dest_p, /**< destination */
+ const uint32_t value) /**< value */
+{
+ *dest_p++ = (uint8_t) ((value >> 24) & 0xFF);
+ *dest_p++ = (uint8_t) ((value >> 16) & 0xFF);
+ *dest_p++ = (uint8_t) ((value >> 8) & 0xFF);
+ *dest_p = (uint8_t) (value & 0xFF);
+} /* re_encode_u32 */
+
+/**
+ * Decode 2 byte unsigned integer from bytecode
*
- * @return opcode parameter
+ * @return uint16_t value
*/
-inline uint32_t JERRY_ATTR_ALWAYS_INLINE
-re_get_value (const uint8_t **bc_p) /**< pointer to bytecode start */
+static uint16_t
+re_decode_u16 (const uint8_t *src_p) /**< source */
{
- const uint8_t *src_p = *bc_p;
- uint32_t value = (uint32_t) (*src_p++);
- value <<= 8;
- value |= ((uint32_t) (*src_p++));
- value <<= 8;
- value |= ((uint32_t) (*src_p++));
- value <<= 8;
- value |= ((uint32_t) (*src_p++));
-
- (*bc_p) += sizeof (uint32_t);
+ uint16_t value = (uint16_t) (((uint16_t) *src_p++) << 8);
+ value = (uint16_t) (value + *src_p++);
return value;
-} /* re_get_value */
+} /* re_decode_u16 */
/**
- * Get length of bytecode
+ * Decode 4 byte unsigned integer from bytecode
*
- * @return bytecode length (unsigned integer)
+ * @return uint32_t value
*/
-inline uint32_t JERRY_ATTR_PURE JERRY_ATTR_ALWAYS_INLINE
-re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
+static uint32_t JERRY_ATTR_NOINLINE
+re_decode_u32 (const uint8_t *src_p) /**< source */
{
- return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p));
-} /* re_get_bytecode_length */
+ uint32_t value = (uint32_t) (((uint32_t) *src_p++) << 24);
+ value += (uint32_t) (((uint32_t) *src_p++) << 16);
+ value += (uint32_t) (((uint32_t) *src_p++) << 8);
+ value += (uint32_t) (*src_p++);
+ return value;
+} /* re_decode_u32 */
/**
- * Append a RegExp opcode
+ * Get the encoded size of an uint32_t value.
+ *
+ * @return encoded value size
*/
-void
-re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- const re_opcode_t opcode) /**< input opcode */
+inline static size_t JERRY_ATTR_ALWAYS_INLINE
+re_get_encoded_value_size (uint32_t value) /**< value */
{
- uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint8_t));
- *dest_p = (uint8_t) opcode;
-} /* re_append_opcode */
+ if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX))
+ {
+ return 1;
+ }
-/**
- * Append a parameter of a RegExp opcode
+ return 5;
+} /* re_get_encoded_value_size */
+
+/**
+ * Encode a value to the specified position in the bytecode.
*/
-void
-re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- const uint32_t value) /**< input value */
+static void
+re_encode_value (uint8_t *dest_p, /**< position in bytecode */
+ const uint32_t value) /**< value */
{
- uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint32_t));
+ if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX))
+ {
+ *dest_p = (uint8_t) value;
+ return;
+ }
+
+ *dest_p++ = (uint8_t) (RE_VALUE_4BYTE_MARKER);
re_encode_u32 (dest_p, value);
-} /* re_append_u32 */
+} /* re_encode_value */
/**
- * Append a character to the RegExp bytecode
+ * Append a value to the end of the bytecode.
*/
void
-re_append_char (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- const ecma_char_t input_char) /**< input char */
+re_append_value (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint32_t value) /**< value */
{
- uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (ecma_char_t));
- re_encode_char (dest_p, input_char);
-} /* re_append_char */
+ const size_t size = re_get_encoded_value_size (value);
+ uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, size);
+ re_encode_value (dest_p, value);
+} /* re_append_value */
/**
- * Append a jump offset parameter of a RegExp opcode
+ * Insert a value into the bytecode at a specific offset.
*/
void
-re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- uint32_t value) /**< input value */
+re_insert_value (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint32_t offset, /**< bytecode offset */
+ const uint32_t value) /**< value */
{
- value += (uint32_t) (sizeof (uint32_t));
- re_append_u32 (bc_ctx_p, value);
-} /* re_append_jump_offset */
+ const size_t size = re_get_encoded_value_size (value);
+ uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, size);
+ re_encode_value (dest_p, value);
+} /* re_insert_value */
/**
- * Insert a RegExp opcode
+ * Read an encoded value from the bytecode.
+ *
+ * @return decoded value
+ */
+uint32_t JERRY_ATTR_ALWAYS_INLINE
+re_get_value (const uint8_t **bc_p) /**< reference to bytecode pointer */
+{
+ uint32_t value = *(*bc_p)++;
+ if (JERRY_LIKELY (value <= RE_VALUE_1BYTE_MAX))
+ {
+ return value;
+ }
+
+ value = re_decode_u32 (*bc_p);
+ *bc_p += sizeof (uint32_t);
+ return value;
+} /* re_get_value */
+
+/**
+ * Append a character to the RegExp bytecode
*/
void
-re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- const uint32_t offset, /**< distance from the start of the container */
- const re_opcode_t opcode) /**< input opcode */
+re_append_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const lit_code_point_t cp) /**< code point */
{
- re_bytecode_insert (bc_ctx_p, offset, sizeof (uint8_t));
- *(bc_ctx_p->block_start_p + offset) = (uint8_t) opcode;
-} /* re_insert_opcode */
+#if ENABLED (JERRY_ES2015)
+ const size_t size = (re_ctx_p->flags & RE_FLAG_UNICODE) ? sizeof (lit_code_point_t) : sizeof (ecma_char_t);
+#else /* !ENABLED (JERRY_ES2015) */
+ JERRY_UNUSED (re_ctx_p);
+ const size_t size = sizeof (ecma_char_t);
+#endif /* !ENABLED (JERRY_ES2015) */
+
+ uint8_t *dest_p = re_bytecode_reserve (re_ctx_p, size);
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ re_encode_u32 (dest_p, cp);
+ return;
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ JERRY_ASSERT (cp <= LIT_UTF16_CODE_UNIT_MAX);
+ re_encode_u16 (dest_p, (ecma_char_t) cp);
+} /* re_append_char */
/**
- * Insert a parameter of a RegExp opcode
+ * Insert a character into the RegExp bytecode at a specific offset
*/
void
-re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
- uint32_t offset, /**< distance from the start of the container */
- uint32_t value) /**< input value */
+re_insert_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp bytecode context */
+ const uint32_t offset, /**< bytecode offset */
+ const lit_code_point_t cp) /**< code point*/
+{
+#if ENABLED (JERRY_ES2015)
+ const size_t size = (re_ctx_p->flags & RE_FLAG_UNICODE) ? sizeof (lit_code_point_t) : sizeof (ecma_char_t);
+#else /* !ENABLED (JERRY_ES2015) */
+ JERRY_UNUSED (re_ctx_p);
+ const size_t size = sizeof (ecma_char_t);
+#endif /* !ENABLED (JERRY_ES2015) */
+
+ uint8_t *dest_p = re_bytecode_insert (re_ctx_p, offset, size);
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ re_encode_u32 (dest_p, cp);
+ return;
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ JERRY_ASSERT (cp <= LIT_UTF16_CODE_UNIT_MAX);
+ re_encode_u16 (dest_p, (ecma_char_t) cp);
+} /* re_insert_char */
+
+/**
+ * Decode a character from the bytecode.
+ *
+ * @return decoded character
+ */
+inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE
+re_get_char (const uint8_t **bc_p, /**< reference to bytecode pointer */
+ bool unicode) /**< full unicode mode */
{
- re_bytecode_insert (bc_ctx_p, offset, sizeof (uint32_t));
- re_encode_u32 (bc_ctx_p->block_start_p + offset, value);
-} /* re_insert_u32 */
+ lit_code_point_t cp;
+
+#if !ENABLED (JERRY_ES2015)
+ JERRY_UNUSED (unicode);
+#else /* ENABLED (JERRY_ES2015) */
+ if (unicode)
+ {
+ cp = re_decode_u32 (*bc_p);
+ *bc_p += sizeof (lit_code_point_t);
+ }
+ else
+#endif /* ENABLED (JERRY_ES2015) */
+ {
+ cp = re_decode_u16 (*bc_p);
+ *bc_p += sizeof (ecma_char_t);
+ }
+
+ return cp;
+} /* re_get_char */
#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE)
+static uint32_t
+re_get_bytecode_offset (const uint8_t *start_p, /**< bytecode start pointer */
+ const uint8_t *current_p) /**< current bytecode pointer */
+{
+ return (uint32_t) ((uintptr_t) current_p - (uintptr_t) start_p);
+} /* re_get_bytecode_offset */
+
/**
* RegExp bytecode dumper
*/
void
-re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
+re_dump_bytecode (re_compiler_ctx_t *re_ctx_p) /**< RegExp bytecode context */
{
- re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) bc_ctx_p->block_start_p;
- JERRY_DEBUG_MSG ("%d ", compiled_code_p->header.status_flags);
- JERRY_DEBUG_MSG ("%d ", compiled_code_p->captures_count);
- JERRY_DEBUG_MSG ("%d | ", compiled_code_p->non_captures_count);
+ static const char escape_chars[] = {'d', 'D', 'w', 'W', 's', 'S'};
- const uint8_t *bytecode_p = (const uint8_t *) (compiled_code_p + 1);
+ re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) re_ctx_p->bytecode_start_p;
+ JERRY_DEBUG_MSG ("Flags: 0x%x ", compiled_code_p->header.status_flags);
+ JERRY_DEBUG_MSG ("Capturing groups: %d ", compiled_code_p->captures_count);
+ JERRY_DEBUG_MSG ("Non-capturing groups: %d\n", compiled_code_p->non_captures_count);
- re_opcode_t op;
- while ((op = re_get_opcode (&bytecode_p)))
+ const uint8_t *bytecode_start_p = (const uint8_t *) (compiled_code_p + 1);
+ const uint8_t *bytecode_p = bytecode_start_p;
+
+ while (true)
{
+ JERRY_DEBUG_MSG ("[%3u] ", (uint32_t) ((uintptr_t) bytecode_p - (uintptr_t) bytecode_start_p));
+ re_opcode_t op = *bytecode_p++;
switch (op)
{
- case RE_OP_MATCH:
+ case RE_OP_ALTERNATIVE_START:
{
- JERRY_DEBUG_MSG ("MATCH, ");
+ JERRY_DEBUG_MSG ("ALTERNATIVE_START ");
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
- case RE_OP_CHAR:
+ case RE_OP_ALTERNATIVE_NEXT:
{
- JERRY_DEBUG_MSG ("CHAR ");
- JERRY_DEBUG_MSG ("%c, ", (char) re_get_char (&bytecode_p));
+ JERRY_DEBUG_MSG ("ALTERNATIVE_NEXT ");
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
- case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
- {
- JERRY_DEBUG_MSG ("N");
- /* FALLTHRU */
- }
- case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
+ case RE_OP_NO_ALTERNATIVE:
{
- JERRY_DEBUG_MSG ("GZ_START ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("NO_ALTERNATIVES\n");
break;
}
- case RE_OP_CAPTURE_GROUP_START:
+ case RE_OP_CAPTURING_GROUP_START:
{
- JERRY_DEBUG_MSG ("START ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("CAPTURING_GROUP_START ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p));
+
+ const uint32_t qmin = re_get_value (&bytecode_p);
+ JERRY_DEBUG_MSG ("qmin: %u", qmin);
+ if (qmin == 0)
+ {
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG (", tail offset: [%3u]\n", offset);
+ }
+ else
+ {
+ JERRY_DEBUG_MSG ("\n");
+ }
+
break;
}
- case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
- {
- JERRY_DEBUG_MSG ("N");
- /* FALLTHRU */
- }
- case RE_OP_CAPTURE_GREEDY_GROUP_END:
+ case RE_OP_NON_CAPTURING_GROUP_START:
{
- JERRY_DEBUG_MSG ("G_END ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("NON_CAPTURING_GROUP_START ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p));
+
+ const uint32_t qmin = re_get_value (&bytecode_p);
+ JERRY_DEBUG_MSG ("qmin: %u", qmin);
+ if (qmin == 0)
+ {
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG (", tail offset: [%3u]\n", offset);
+ }
+ else
+ {
+ JERRY_DEBUG_MSG ("\n");
+ }
+
break;
}
- case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
+ case RE_OP_GREEDY_CAPTURING_GROUP_END:
{
- JERRY_DEBUG_MSG ("N");
- /* FALLTHRU */
+ JERRY_DEBUG_MSG ("GREEDY_CAPTURING_GROUP_END ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
+ break;
}
- case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
+ case RE_OP_LAZY_CAPTURING_GROUP_END:
{
- JERRY_DEBUG_MSG ("GZ_NC_START ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("LAZY_CAPTURING_GROUP_END ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
break;
}
- case RE_OP_NON_CAPTURE_GROUP_START:
+ case RE_OP_GREEDY_NON_CAPTURING_GROUP_END:
{
- JERRY_DEBUG_MSG ("NC_START ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("GREEDY_NON_CAPTURING_GROUP_END ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
break;
}
- case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
+ case RE_OP_LAZY_NON_CAPTURING_GROUP_END:
{
- JERRY_DEBUG_MSG ("N");
- /* FALLTHRU */
+ JERRY_DEBUG_MSG ("LAZY_NON_CAPTURING_GROUP_END ");
+ JERRY_DEBUG_MSG ("idx: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u\n", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
+ break;
}
- case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
+ case RE_OP_GREEDY_ITERATOR:
{
- JERRY_DEBUG_MSG ("G_NC_END ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("GREEDY_ITERATOR ");
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u, ", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
- case RE_OP_SAVE_AT_START:
+ case RE_OP_LAZY_ITERATOR:
{
- JERRY_DEBUG_MSG ("RE_START ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("LAZY_ITERATOR ");
+ JERRY_DEBUG_MSG ("qmin: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("qmax: %u, ", re_get_value (&bytecode_p) - RE_QMAX_OFFSET);
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
- case RE_OP_SAVE_AND_MATCH:
+ case RE_OP_ITERATOR_END:
{
- JERRY_DEBUG_MSG ("RE_END, ");
+ JERRY_DEBUG_MSG ("ITERATOR_END\n");
break;
}
- case RE_OP_GREEDY_ITERATOR:
+ case RE_OP_BACKREFERENCE:
{
- JERRY_DEBUG_MSG ("GREEDY_ITERATOR ");
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("BACKREFERENCE ");
+ JERRY_DEBUG_MSG ("idx: %d\n", re_get_value (&bytecode_p));
break;
}
- case RE_OP_NON_GREEDY_ITERATOR:
+ case RE_OP_ASSERT_LINE_START:
{
- JERRY_DEBUG_MSG ("NON_GREEDY_ITERATOR ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("ASSERT_LINE_START\n");
break;
}
- case RE_OP_PERIOD:
+ case RE_OP_ASSERT_LINE_END:
{
- JERRY_DEBUG_MSG ("PERIOD ");
+ JERRY_DEBUG_MSG ("ASSERT_LINE_END\n");
break;
}
- case RE_OP_ALTERNATIVE:
+ case RE_OP_ASSERT_LOOKAHEAD_POS:
{
- JERRY_DEBUG_MSG ("ALTERNATIVE ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("ASSERT_LOOKAHEAD_POS ");
+ JERRY_DEBUG_MSG ("qmin: %u, ", *bytecode_p++);
+ JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p));
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
- case RE_OP_ASSERT_START:
+ case RE_OP_ASSERT_LOOKAHEAD_NEG:
{
- JERRY_DEBUG_MSG ("ASSERT_START ");
+ JERRY_DEBUG_MSG ("ASSERT_LOOKAHEAD_NEG ");
+ JERRY_DEBUG_MSG ("qmin: %u, ", *bytecode_p++);
+ JERRY_DEBUG_MSG ("capture start: %u, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("capture count: %u, ", re_get_value (&bytecode_p));
+ const uint32_t offset = re_get_value (&bytecode_p) + re_get_bytecode_offset (bytecode_start_p, bytecode_p);
+ JERRY_DEBUG_MSG ("tail offset: [%3u]\n", offset);
break;
}
case RE_OP_ASSERT_END:
{
- JERRY_DEBUG_MSG ("ASSERT_END ");
+ JERRY_DEBUG_MSG ("ASSERT_END\n");
break;
}
case RE_OP_ASSERT_WORD_BOUNDARY:
{
- JERRY_DEBUG_MSG ("ASSERT_WORD_BOUNDARY ");
+ JERRY_DEBUG_MSG ("ASSERT_WORD_BOUNDARY\n");
break;
}
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
{
- JERRY_DEBUG_MSG ("ASSERT_NOT_WORD_BOUNDARY ");
+ JERRY_DEBUG_MSG ("ASSERT_NOT_WORD_BOUNDARY\n");
break;
}
- case RE_OP_LOOKAHEAD_POS:
+ case RE_OP_CLASS_ESCAPE:
{
- JERRY_DEBUG_MSG ("LOOKAHEAD_POS ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ ecma_class_escape_t escape = (ecma_class_escape_t) *bytecode_p++;
+ JERRY_DEBUG_MSG ("CLASS_ESCAPE \\%c\n", escape_chars[escape]);
+ break;
+ }
+ case RE_OP_CHAR_CLASS:
+ {
+ JERRY_DEBUG_MSG ("CHAR_CLASS ");
+ uint8_t flags = *bytecode_p++;
+ uint32_t char_count = (flags & RE_CLASS_HAS_CHARS) ? re_get_value (&bytecode_p) : 0;
+ uint32_t range_count = (flags & RE_CLASS_HAS_RANGES) ? re_get_value (&bytecode_p) : 0;
+
+ if (flags & RE_CLASS_INVERT)
+ {
+ JERRY_DEBUG_MSG ("inverted ");
+ }
+
+ JERRY_DEBUG_MSG ("escapes: ");
+ uint8_t escape_count = flags & RE_CLASS_ESCAPE_COUNT_MASK;
+ while (escape_count--)
+ {
+ JERRY_DEBUG_MSG ("\\%c, ", escape_chars[*bytecode_p++]);
+ }
+
+ JERRY_DEBUG_MSG ("chars: ");
+ while (char_count--)
+ {
+ JERRY_DEBUG_MSG ("\\u%04x, ", re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE));
+ }
+
+ JERRY_DEBUG_MSG ("ranges: ");
+ while (range_count--)
+ {
+ const lit_code_point_t begin = re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE);
+ const lit_code_point_t end = re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE);
+ JERRY_DEBUG_MSG ("\\u%04x-\\u%04x, ", begin, end);
+ }
+
+ JERRY_DEBUG_MSG ("\n");
break;
}
- case RE_OP_LOOKAHEAD_NEG:
+#if ENABLED (JERRY_ES2015)
+ case RE_OP_UNICODE_PERIOD:
{
- JERRY_DEBUG_MSG ("LOOKAHEAD_NEG ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("UNICODE_PERIOD\n");
break;
}
- case RE_OP_BACKREFERENCE:
+#endif /* ENABLED (JERRY_ES2015) */
+ case RE_OP_PERIOD:
{
- JERRY_DEBUG_MSG ("BACKREFERENCE ");
- JERRY_DEBUG_MSG ("%d, ", re_get_value (&bytecode_p));
+ JERRY_DEBUG_MSG ("PERIOD\n");
break;
}
- case RE_OP_INV_CHAR_CLASS:
+ case RE_OP_CHAR:
{
- JERRY_DEBUG_MSG ("INV_");
- /* FALLTHRU */
+ JERRY_DEBUG_MSG ("CHAR \\u%04x\n", re_get_char (&bytecode_p, re_ctx_p->flags & RE_FLAG_UNICODE));
+ break;
}
- case RE_OP_CHAR_CLASS:
+ case RE_OP_BYTE:
{
- JERRY_DEBUG_MSG ("CHAR_CLASS ");
- uint32_t num_of_class = re_get_value (&bytecode_p);
- JERRY_DEBUG_MSG ("%d", num_of_class);
- while (num_of_class)
- {
- if ((compiled_code_p->header.status_flags & RE_FLAG_UNICODE) != 0)
- {
- JERRY_DEBUG_MSG (" %u", re_get_value (&bytecode_p));
- JERRY_DEBUG_MSG ("-%u", re_get_value (&bytecode_p));
- }
- else
- {
- JERRY_DEBUG_MSG (" %u", re_get_char (&bytecode_p));
- JERRY_DEBUG_MSG ("-%u", re_get_char (&bytecode_p));
- }
- num_of_class--;
- }
- JERRY_DEBUG_MSG (", ");
+ const uint8_t ch = *bytecode_p++;
+ JERRY_DEBUG_MSG ("BYTE \\u%04x '%c'\n", ch, (char) ch);
break;
}
+ case RE_OP_EOF:
+ {
+ JERRY_DEBUG_MSG ("EOF\n");
+ return;
+ }
default:
{
- JERRY_DEBUG_MSG ("UNKNOWN(%d), ", (uint32_t) op);
+ JERRY_DEBUG_MSG ("UNKNOWN(%d)\n", (uint32_t) op);
break;
}
}
}
- JERRY_DEBUG_MSG ("EOF\n");
} /* re_dump_bytecode */
#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */
diff --git a/jerry-core/parser/regexp/re-bytecode.h b/jerry-core/parser/regexp/re-bytecode.h
index 715170bb..2a293a19 100644
--- a/jerry-core/parser/regexp/re-bytecode.h
+++ b/jerry-core/parser/regexp/re-bytecode.h
@@ -19,6 +19,7 @@
#if ENABLED (JERRY_BUILTIN_REGEXP)
#include "ecma-globals.h"
+#include "re-compiler-context.h"
/** \addtogroup parser Parser
* @{
@@ -41,42 +42,56 @@
#define RE_FLAGS_MASK 0x3F
/**
+ * Maximum value that can be encoded in the RegExp bytecode as a single byte.
+ */
+#define RE_VALUE_1BYTE_MAX 0xFE
+
+/**
+ * Marker that signals that the actual value is encoded in the following 4 bytes in the bytecode.
+ */
+#define RE_VALUE_4BYTE_MARKER 0xFF
+
+/**
* RegExp opcodes
*/
typedef enum
{
- RE_OP_EOF,
- /* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
- * Change it carefully. Capture opcodes should be at first.
- */
- RE_OP_CAPTURE_GROUP_START, /**< group start */
- RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */
- RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */
- RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */
- RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */
- RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */
- RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */
- RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */
- RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */
- RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */
-
- RE_OP_MATCH, /**< match */
- RE_OP_CHAR, /**< any character */
- RE_OP_SAVE_AT_START, /**< save at start */
- RE_OP_SAVE_AND_MATCH, /**< save and match */
- RE_OP_PERIOD, /**< "." */
- RE_OP_ALTERNATIVE, /**< "|" */
+ RE_OP_EOF, /**< end of pattern */
+
+ RE_OP_ALTERNATIVE_START, /**< start of alternatives */
+ RE_OP_ALTERNATIVE_NEXT, /**< next alternative */
+ RE_OP_NO_ALTERNATIVE, /**< no alternative */
+
+ RE_OP_CAPTURING_GROUP_START, /**< start of a capturing group */
+ RE_OP_NON_CAPTURING_GROUP_START, /**< start of a non-capturing group */
+
+ RE_OP_GREEDY_CAPTURING_GROUP_END, /**< end of a greedy capturing group */
+ RE_OP_GREEDY_NON_CAPTURING_GROUP_END, /**< end of a greedy non-capturing group */
+ RE_OP_LAZY_CAPTURING_GROUP_END, /**< end of a lazy capturing group */
+ RE_OP_LAZY_NON_CAPTURING_GROUP_END, /**< end of a lazy non-capturing group */
+
RE_OP_GREEDY_ITERATOR, /**< greedy iterator */
- RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */
- RE_OP_ASSERT_START, /**< "^" */
- RE_OP_ASSERT_END, /**< "$" */
- RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */
- RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
- RE_OP_LOOKAHEAD_POS, /**< lookahead pos */
- RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */
- RE_OP_BACKREFERENCE, /**< "\[0..9]" */
- RE_OP_CHAR_CLASS, /**< "[ ]" */
- RE_OP_INV_CHAR_CLASS /**< "[^ ]" */
+ RE_OP_LAZY_ITERATOR, /**< lazy iterator */
+ RE_OP_ITERATOR_END, /*** end of an iterator */
+
+ RE_OP_BACKREFERENCE, /**< backreference */
+
+ RE_OP_ASSERT_LINE_START, /**< line start assertion */
+ RE_OP_ASSERT_LINE_END, /**< line end assertion */
+ RE_OP_ASSERT_WORD_BOUNDARY, /**< word boundary assertion */
+ RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< not word boundary assertion */
+ RE_OP_ASSERT_LOOKAHEAD_POS, /**< positive lookahead assertion */
+ RE_OP_ASSERT_LOOKAHEAD_NEG, /**< negative lookahead assertion */
+ RE_OP_ASSERT_END, /**< end of an assertion */
+
+ RE_OP_CLASS_ESCAPE, /**< class escape */
+ RE_OP_CHAR_CLASS, /**< character class */
+#if ENABLED (JERRY_ES2015)
+ RE_OP_UNICODE_PERIOD, /**< period in full unicode mode */
+#endif /* ENABLED (JERRY_ES2015) */
+ RE_OP_PERIOD, /**< period in non-unicode mode */
+ RE_OP_CHAR, /**< any code point */
+ RE_OP_BYTE, /**< 1-byte utf8 character */
} re_opcode_t;
/**
@@ -85,42 +100,31 @@ typedef enum
typedef struct
{
ecma_compiled_code_t header; /**< compiled code header */
+ uint32_t captures_count; /**< number of capturing groups */
+ uint32_t non_captures_count; /**< number of non-capturing groups */
ecma_value_t source; /**< original RegExp pattern */
- uint32_t captures_count; /**< number of capturing brackets */
- uint32_t non_captures_count; /**< number of non capturing brackets */
} re_compiled_code_t;
-/**
- * Context of RegExp bytecode container
- */
-typedef struct
-{
- uint8_t *block_start_p; /**< start of bytecode block */
- uint8_t *block_end_p; /**< end of bytecode block */
- uint8_t *current_p; /**< current position in bytecode */
-} re_bytecode_ctx_t;
+void re_initialize_regexp_bytecode (re_compiler_ctx_t *re_ctx_p);
+uint32_t re_bytecode_size (re_compiler_ctx_t *re_ctx_p);
-re_opcode_t re_get_opcode (const uint8_t **bc_p);
-ecma_char_t re_get_char (const uint8_t **bc_p);
-uint32_t re_get_value (const uint8_t **bc_p);
-uint32_t JERRY_ATTR_PURE re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p);
-
-void re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p);
+void re_append_opcode (re_compiler_ctx_t *re_ctx_p, const re_opcode_t opcode);
+void re_append_byte (re_compiler_ctx_t *re_ctx_p, const uint8_t byte);
+void re_append_char (re_compiler_ctx_t *re_ctx_p, const lit_code_point_t cp);
+void re_append_value (re_compiler_ctx_t *re_ctx_p, const uint32_t value);
-void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, const re_opcode_t opcode);
-void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t value);
-void re_append_char (re_bytecode_ctx_t *bc_ctx_p, const ecma_char_t input_char);
-void re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, uint32_t value);
+void re_insert_opcode (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const re_opcode_t opcode);
+void re_insert_byte (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint8_t byte);
+void re_insert_char (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const lit_code_point_t cp);
+void re_insert_value (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint32_t value);
-void re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const re_opcode_t opcode);
-void re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const uint32_t value);
-void re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p,
- const size_t offset,
- const uint8_t *bytecode_p,
- const size_t length);
+re_opcode_t re_get_opcode (const uint8_t **bc_p);
+uint8_t re_get_byte (const uint8_t **bc_p);
+lit_code_point_t re_get_char (const uint8_t **bc_p, bool unicode);
+uint32_t re_get_value (const uint8_t **bc_p);
#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE)
-void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
+void re_dump_bytecode (re_compiler_ctx_t *bc_ctx);
#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */
/**
diff --git a/jerry-core/parser/regexp/re-compiler-context.h b/jerry-core/parser/regexp/re-compiler-context.h
new file mode 100644
index 00000000..6d7b7537
--- /dev/null
+++ b/jerry-core/parser/regexp/re-compiler-context.h
@@ -0,0 +1,60 @@
+/* Copyright JS Foundation and other contributors, http://js.foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RE_COMPILER_CONTEXT_H
+#define RE_COMPILER_CONTEXT_H
+
+#if ENABLED (JERRY_BUILTIN_REGEXP)
+
+#include "re-token.h"
+
+/** \addtogroup parser Parser
+ * @{
+ *
+ * \addtogroup regexparser Regular expression
+ * @{
+ *
+ * \addtogroup regexparser_compiler Compiler
+ * @{
+ */
+
+/**
+ * RegExp compiler context
+ */
+typedef struct
+{
+ const lit_utf8_byte_t *input_start_p; /**< start of input pattern */
+ const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */
+ const lit_utf8_byte_t *input_end_p; /**< end of input pattern */
+
+ uint8_t *bytecode_start_p; /**< start of bytecode block */
+ size_t bytecode_size; /**< size of bytecode */
+
+ uint32_t captures_count; /**< number of capture groups */
+ uint32_t non_captures_count; /**< number of non-capture groups */
+
+ int groups_count; /**< number of groups */
+ uint16_t flags; /**< RegExp flags */
+ re_token_t token; /**< current token */
+} re_compiler_ctx_t;
+
+/**
+ * @}
+ * @}
+ * @}
+ */
+
+#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
+#endif /* !RE_COMPILER_CONTEXT_H */
diff --git a/jerry-core/parser/regexp/re-compiler.c b/jerry-core/parser/regexp/re-compiler.c
index f82f8909..c28fd170 100644
--- a/jerry-core/parser/regexp/re-compiler.c
+++ b/jerry-core/parser/regexp/re-compiler.c
@@ -23,6 +23,7 @@
#include "jmem.h"
#include "re-bytecode.h"
#include "re-compiler.h"
+#include "re-compiler-context.h"
#include "re-parser.h"
#if ENABLED (JERRY_BUILTIN_REGEXP)
@@ -38,896 +39,140 @@
*/
/**
- * Insert simple atom iterator
+ * Search for the given pattern in the RegExp cache.
*
- * @return empty ecma value - if inserted successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
+ * @return pointer to bytecode if found
+ * NULL - otherwise
*/
-static ecma_value_t
-re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- uint32_t new_atom_start_offset) /**< atom start offset */
+static re_compiled_code_t *
+re_cache_lookup (ecma_string_t *pattern_str_p, /**< pattern string */
+ uint16_t flags) /**< flags */
{
- uint32_t atom_code_length;
- uint32_t offset;
- uint32_t qmin, qmax;
-
- qmin = re_ctx_p->current_token.qmin;
- qmax = re_ctx_p->current_token.qmax;
+ re_compiled_code_t **cache_p = JERRY_CONTEXT (re_cache);
- if (qmin == 1 && qmax == 1)
- {
- return ECMA_VALUE_EMPTY;
- }
- else if (qmin > qmax)
- {
- /* ECMA-262 v5.1 15.10.2.5 */
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: min > max."));
- }
-
- /* TODO: optimize bytecode length. Store 0 rather than INF */
-
- re_append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */
- uint32_t bytecode_length = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
- atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset);
-
- offset = new_atom_start_offset;
- re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length);
- re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax);
- re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin);
- if (re_ctx_p->current_token.greedy)
- {
- re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR);
- }
- else
+ for (uint8_t idx = 0u; idx < RE_CACHE_SIZE; idx++)
{
- re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR);
- }
-
- return ECMA_VALUE_EMPTY;
-} /* re_insert_simple_iterator */
+ re_compiled_code_t *cached_bytecode_p = cache_p[idx];
-/**
- * Get the type of a group start
- *
- * @return RegExp opcode
- */
-static re_opcode_t
-re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- bool is_capturable) /**< is capturable group */
-{
- if (is_capturable)
- {
- if (re_ctx_p->current_token.qmin == 0)
+ if (cached_bytecode_p == NULL)
{
- if (re_ctx_p->current_token.greedy)
- {
- return RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START;
- }
-
- return RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
- }
-
- return RE_OP_CAPTURE_GROUP_START;
- }
-
- if (re_ctx_p->current_token.qmin == 0)
- {
- if (re_ctx_p->current_token.greedy)
- {
- return RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START;
+ break;
}
- return RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
- }
-
- return RE_OP_NON_CAPTURE_GROUP_START;
-} /* re_get_start_opcode_type */
+ ecma_string_t *cached_pattern_str_p = ecma_get_string_from_value (cached_bytecode_p->source);
-/**
- * Get the type of a group end
- *
- * @return RegExp opcode
- */
-static re_opcode_t
-re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- bool is_capturable) /**< is capturable group */
-{
- if (is_capturable)
- {
- if (re_ctx_p->current_token.greedy)
+ if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags
+ && ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p))
{
- return RE_OP_CAPTURE_GREEDY_GROUP_END;
+ return cached_bytecode_p;
}
-
- return RE_OP_CAPTURE_NON_GREEDY_GROUP_END;
- }
-
- if (re_ctx_p->current_token.greedy)
- {
- return RE_OP_NON_CAPTURE_GREEDY_GROUP_END;
}
- return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END;
-} /* re_get_end_opcode_type */
+ return NULL;
+} /* re_cache_lookup */
/**
- * Enclose the given bytecode to a group
- *
- * @return empty ecma value - if inserted successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
+ * Run garbage collection in RegExp cache.
*/
-static ecma_value_t
-re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- uint32_t group_start_offset, /**< offset of group start */
- uint32_t idx, /**< index of group */
- bool is_capturable) /**< is capturable group */
-{
- uint32_t qmin = re_ctx_p->current_token.qmin;
- uint32_t qmax = re_ctx_p->current_token.qmax;
-
- if (qmin > qmax)
- {
- /* ECMA-262 v5.1 15.10.2.5 */
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: min > max."));
- }
-
- re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable);
- re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable);
-
- uint32_t start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
- re_insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
- re_insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
- start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
- re_append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
- re_append_u32 (re_ctx_p->bytecode_ctx_p, idx);
- re_append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
- re_append_u32 (re_ctx_p->bytecode_ctx_p, qmax);
-
- group_start_offset += start_head_offset_len;
- re_append_jump_offset (re_ctx_p->bytecode_ctx_p,
- re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
-
- if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START)
- {
- re_insert_u32 (re_ctx_p->bytecode_ctx_p,
- group_start_offset,
- re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
- }
-
- return ECMA_VALUE_EMPTY;
-} /* re_insert_into_group */
-
-/**
- * Enclose the given bytecode to a group and inster jump value
- *
- * @return empty ecma value - if inserted successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
- */
-static ecma_value_t
-re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- uint32_t group_start_offset, /**< offset of group start */
- uint32_t idx, /**< index of group */
- bool is_capturable) /**< is capturable group */
-{
- re_insert_u32 (re_ctx_p->bytecode_ctx_p,
- group_start_offset,
- re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
- return re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
-} /* re_insert_into_group_with_jump */
-
-/**
- * Append a character class range to the bytecode
- */
-static void
-re_append_char_class (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- lit_code_point_t start, /**< character class range from */
- lit_code_point_t end) /**< character class range to */
-{
- re_ctx_p->parser_ctx_p->classes_count++;
-
-#if ENABLED (JERRY_ES2015)
- if (re_ctx_p->flags & RE_FLAG_UNICODE)
- {
- re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
- re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
- return;
- }
-#endif /* ENABLED (JERRY_ES2015) */
-
- JERRY_ASSERT (start <= LIT_UTF16_CODE_UNIT_MAX);
- JERRY_ASSERT (end <= LIT_UTF16_CODE_UNIT_MAX);
-
- re_append_char (re_ctx_p->bytecode_ctx_p,
- (ecma_char_t) ecma_regexp_canonicalize (start,
- re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
- re_append_char (re_ctx_p->bytecode_ctx_p,
- (ecma_char_t) ecma_regexp_canonicalize (end,
- re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
-} /* re_append_char_class */
-
-/**
- * Read the input pattern and parse the range of character class
- *
- * @return empty ecma value - if parsed successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
- */
-static ecma_value_t
-re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */
- re_token_t *out_token_p) /**< [out] output token */
+void
+re_cache_gc (void)
{
- re_parser_ctx_t *const parser_ctx_p = re_ctx_p->parser_ctx_p;
- out_token_p->qmax = out_token_p->qmin = 1;
- parser_ctx_p->classes_count = 0;
-
- lit_code_point_t start = LIT_CHAR_UNDEF;
- bool is_range = false;
- const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS
- || re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS);
-
- const ecma_char_t prev_char = lit_cesu8_peek_prev (parser_ctx_p->input_curr_p);
- if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX)
- {
- lit_utf8_decr (&parser_ctx_p->input_curr_p);
- lit_utf8_decr (&parser_ctx_p->input_curr_p);
- }
+ re_compiled_code_t **cache_p = JERRY_CONTEXT (re_cache);
- do
+ for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++)
{
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string"));
- }
+ const re_compiled_code_t *cached_bytecode_p = cache_p[i];
- lit_code_point_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p);
-
- if (ch == LIT_CHAR_RIGHT_SQUARE)
+ if (cached_bytecode_p == NULL)
{
- if (start != LIT_CHAR_UNDEF)
- {
- re_append_char_class (re_ctx_p, start, start);
- }
break;
}
- else if (ch == LIT_CHAR_MINUS)
- {
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'"));
- }
-
- if (start != LIT_CHAR_UNDEF
- && !is_range
- && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE)
- {
- is_range = true;
- continue;
- }
- }
- else if (ch == LIT_CHAR_BACKSLASH)
- {
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'"));
- }
-
- ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p);
-
- if (ch == LIT_CHAR_LOWERCASE_B)
- {
- ch = LIT_CHAR_BS;
- }
- else if (ch == LIT_CHAR_LOWERCASE_F)
- {
- ch = LIT_CHAR_FF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_N)
- {
- ch = LIT_CHAR_LF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_T)
- {
- ch = LIT_CHAR_TAB;
- }
- else if (ch == LIT_CHAR_LOWERCASE_R)
- {
- ch = LIT_CHAR_CR;
- }
- else if (ch == LIT_CHAR_LOWERCASE_V)
- {
- ch = LIT_CHAR_VTAB;
- }
- else if (ch == LIT_CHAR_LOWERCASE_C)
- {
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
- {
- ch = *parser_ctx_p->input_curr_p;
-
- if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
- || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
- || (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9))
- {
- /* See ECMA-262 v5, 15.10.2.10 (Point 3) */
- ch = (ch % 32);
- parser_ctx_p->input_curr_p++;
- }
- else
- {
- ch = LIT_CHAR_LOWERCASE_C;
- }
- }
- }
- else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2))
- {
- ecma_char_t code_unit;
-
- if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'"));
- }
-
- parser_ctx_p->input_curr_p += 2;
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
- && is_range == false
- && lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS)
- {
- start = code_unit;
- continue;
- }
-
- ch = code_unit;
- }
- else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4))
- {
- ecma_char_t code_unit;
-
- if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'"));
- }
-
- parser_ctx_p->input_curr_p += 4;
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
- && is_range == false
- && lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS)
- {
- start = code_unit;
- continue;
- }
-
- ch = code_unit;
- }
- else if (ch == LIT_CHAR_LOWERCASE_D)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END);
- ch = LIT_CHAR_UNDEF;
- }
- else if (ch == LIT_CHAR_UPPERCASE_D)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX);
- ch = LIT_CHAR_UNDEF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_S)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR);
- re_append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP);
- re_append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP);
- re_append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */
- re_append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */
- re_append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */
- re_append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS);
- re_append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */
- re_append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */
- re_append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */
- re_append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM);
- ch = LIT_CHAR_UNDEF;
- }
- else if (ch == LIT_CHAR_UPPERCASE_S)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL);
- re_append_char_class (re_ctx_p, 0x1681UL, 0x180DUL);
- re_append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL);
- re_append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL);
- re_append_char_class (re_ctx_p, 0x2030UL, 0x205EUL);
- re_append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL);
- re_append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX);
- ch = LIT_CHAR_UNDEF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_W)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9);
- re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z);
- re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE);
- re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z);
- ch = LIT_CHAR_UNDEF;
- }
- else if (ch == LIT_CHAR_UPPERCASE_W)
- {
- /* See ECMA-262 v5, 15.10.2.12 */
- re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1);
- re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX);
- ch = LIT_CHAR_UNDEF;
- }
- else if (lit_char_is_octal_digit ((ecma_char_t) ch))
- {
- lit_utf8_decr (&parser_ctx_p->input_curr_p);
- ch = (ecma_char_t) re_parse_octal (parser_ctx_p);
- }
- } /* ch == LIT_CHAR_BACKSLASH */
-
-#if ENABLED (JERRY_ES2015)
- if (re_ctx_p->flags & RE_FLAG_UNICODE
- && lit_is_code_point_utf16_high_surrogate (ch)
- && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
- {
- const ecma_char_t next_ch = lit_cesu8_peek_next (parser_ctx_p->input_curr_p);
- if (lit_is_code_point_utf16_low_surrogate (next_ch))
- {
- ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch);
- lit_utf8_incr (&parser_ctx_p->input_curr_p);
- }
- }
-#endif /* ENABLED (JERRY_ES2015) */
-
- if (start != LIT_CHAR_UNDEF)
- {
- if (is_range)
- {
- if (start > ch)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order"));
- }
- else
- {
- re_append_char_class (re_ctx_p, start, ch);
- start = LIT_CHAR_UNDEF;
- is_range = false;
- }
- }
- else
- {
- re_append_char_class (re_ctx_p, start, start);
- start = ch;
- }
- }
- else
- {
- start = ch;
- }
- }
- while (is_char_class);
-
- return re_parse_iterator (parser_ctx_p, out_token_p);
-} /* re_parse_char_class */
-
-/**
- * Parse alternatives
- *
- * @return empty ecma value - if alternative was successfully parsed
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
- */
-static ecma_value_t
-re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
- bool expect_eof) /**< expect end of file */
-{
- ECMA_CHECK_STACK_USAGE ();
- uint32_t idx;
- re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
- ecma_value_t ret_value = ECMA_VALUE_EMPTY;
-
- uint32_t alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
-
- while (ecma_is_value_empty (ret_value))
- {
- ecma_value_t next_token_result = re_parse_next_token (re_ctx_p->parser_ctx_p,
- &(re_ctx_p->current_token));
- if (ECMA_IS_VALUE_ERROR (next_token_result))
- {
- return next_token_result;
- }
-
- JERRY_ASSERT (ecma_is_value_empty (next_token_result));
-
- uint32_t new_atom_start_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
-
- switch (re_ctx_p->current_token.type)
- {
- case RE_TOK_START_CAPTURE_GROUP:
- {
- idx = re_ctx_p->captures_count++;
- JERRY_TRACE_MSG ("Compile a capture group start (idx: %u)\n", (unsigned int) idx);
-
- ret_value = re_parse_alternative (re_ctx_p, false);
-
- if (ecma_is_value_empty (ret_value))
- {
- ret_value = re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, true);
- }
-
- break;
- }
- case RE_TOK_START_NON_CAPTURE_GROUP:
- {
- idx = re_ctx_p->non_captures_count++;
- JERRY_TRACE_MSG ("Compile a non-capture group start (idx: %u)\n", (unsigned int) idx);
-
- ret_value = re_parse_alternative (re_ctx_p, false);
-
- if (ecma_is_value_empty (ret_value))
- {
- ret_value = re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, false);
- }
-
- break;
- }
- case RE_TOK_CHAR:
- {
- JERRY_TRACE_MSG ("Compile character token: %c, qmin: %u, qmax: %u\n",
- (char) re_ctx_p->current_token.value, (unsigned int) re_ctx_p->current_token.qmin,
- (unsigned int) re_ctx_p->current_token.qmax);
-
- re_append_opcode (bc_ctx_p, RE_OP_CHAR);
- re_append_char (bc_ctx_p, (ecma_char_t) ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
- re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
-
- ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
- break;
- }
- case RE_TOK_PERIOD:
- {
- JERRY_TRACE_MSG ("Compile a period\n");
- re_append_opcode (bc_ctx_p, RE_OP_PERIOD);
-
- ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
- break;
- }
- case RE_TOK_ALTERNATIVE:
- {
- JERRY_TRACE_MSG ("Compile an alternative\n");
- re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset);
- re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
- alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
- break;
- }
- case RE_TOK_ASSERT_START:
- {
- JERRY_TRACE_MSG ("Compile a start assertion\n");
- re_append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
- break;
- }
- case RE_TOK_ASSERT_END:
- {
- JERRY_TRACE_MSG ("Compile an end assertion\n");
- re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
- break;
- }
- case RE_TOK_ASSERT_WORD_BOUNDARY:
- {
- JERRY_TRACE_MSG ("Compile a word boundary assertion\n");
- re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
- break;
- }
- case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
- {
- JERRY_TRACE_MSG ("Compile a not word boundary assertion\n");
- re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
- break;
- }
- case RE_TOK_ASSERT_START_POS_LOOKAHEAD:
- {
- JERRY_TRACE_MSG ("Compile a positive lookahead assertion\n");
- idx = re_ctx_p->non_captures_count++;
- re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);
-
- ret_value = re_parse_alternative (re_ctx_p, false);
-
- if (ecma_is_value_empty (ret_value))
- {
- re_append_opcode (bc_ctx_p, RE_OP_MATCH);
-
- ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
- }
-
- break;
- }
- case RE_TOK_ASSERT_START_NEG_LOOKAHEAD:
- {
- JERRY_TRACE_MSG ("Compile a negative lookahead assertion\n");
- idx = re_ctx_p->non_captures_count++;
- re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);
-
- ret_value = re_parse_alternative (re_ctx_p, false);
-
- if (ecma_is_value_empty (ret_value))
- {
- re_append_opcode (bc_ctx_p, RE_OP_MATCH);
-
- ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
- }
-
- break;
- }
- case RE_TOK_BACKREFERENCE:
- {
- uint32_t backref = (uint32_t) re_ctx_p->current_token.value;
- idx = re_ctx_p->non_captures_count++;
-
- if (backref > re_ctx_p->highest_backref)
- {
- re_ctx_p->highest_backref = backref;
- }
-
- JERRY_TRACE_MSG ("Compile a backreference: %u\n", (unsigned int) backref);
- re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
- re_append_u32 (bc_ctx_p, backref);
-
- ret_value = re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
- break;
- }
- case RE_TOK_DIGIT:
- case RE_TOK_NOT_DIGIT:
- case RE_TOK_WHITE:
- case RE_TOK_NOT_WHITE:
- case RE_TOK_WORD_CHAR:
- case RE_TOK_NOT_WORD_CHAR:
- case RE_TOK_START_CHAR_CLASS:
- case RE_TOK_START_INV_CHAR_CLASS:
- {
- JERRY_TRACE_MSG ("Compile a character class\n");
- re_append_opcode (bc_ctx_p,
- re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS
- ? RE_OP_INV_CHAR_CLASS
- : RE_OP_CHAR_CLASS);
- uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
-
- ret_value = re_parse_char_class (re_ctx_p,
- &(re_ctx_p->current_token));
-
- if (!ECMA_IS_VALUE_ERROR (ret_value))
- {
- re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->classes_count);
- ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
- }
-
- break;
- }
- case RE_TOK_END_GROUP:
- {
- JERRY_TRACE_MSG ("Compile a group end\n");
-
- if (expect_eof)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of paren."));
- }
-
- re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset);
- return ECMA_VALUE_EMPTY;
- }
- case RE_TOK_EOF:
- {
- if (!expect_eof)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern."));
- }
-
- re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset);
- return ECMA_VALUE_EMPTY;
- }
- default:
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected RegExp token."));
- }
- }
- }
-
- return ret_value;
-} /* re_parse_alternative */
-
-/**
- * Search for the given pattern in the RegExp cache
- *
- * @return index of bytecode in cache - if found
- * RE_CACHE_SIZE - otherwise
- */
-static uint8_t
-re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, /**< pattern string */
- uint16_t flags) /**< flags */
-{
- uint8_t free_idx = RE_CACHE_SIZE;
-
- for (uint8_t idx = 0u; idx < RE_CACHE_SIZE; idx++)
- {
- const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[idx];
- if (cached_bytecode_p != NULL)
- {
- ecma_string_t *cached_pattern_str_p = ecma_get_string_from_value (cached_bytecode_p->source);
-
- if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags
- && ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p))
- {
- JERRY_TRACE_MSG ("RegExp is found in cache\n");
- return idx;
- }
- }
- else
- {
- /* mark as free, so it can be overridden if the cache is full */
- free_idx = idx;
- }
+ ecma_bytecode_deref ((ecma_compiled_code_t *) cached_bytecode_p);
+ cache_p[i] = NULL;
}
- JERRY_TRACE_MSG ("RegExp is NOT found in cache\n");
- return free_idx;
-} /* re_find_bytecode_in_cache */
-
-/**
- * Run gerbage collection in RegExp cache
- */
-void
-re_cache_gc_run (void)
-{
- for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++)
- {
- const re_compiled_code_t *cached_bytecode_p = JERRY_CONTEXT (re_cache)[i];
-
- if (cached_bytecode_p != NULL
- && cached_bytecode_p->header.refs == 1)
- {
- /* Only the cache has reference for the bytecode */
- ecma_bytecode_deref ((ecma_compiled_code_t *) cached_bytecode_p);
- JERRY_CONTEXT (re_cache)[i] = NULL;
- }
- }
-} /* re_cache_gc_run */
+ JERRY_CONTEXT (re_cache_idx) = 0;
+} /* re_cache_gc */
/**
* Compilation of RegExp bytecode
*
- * @return empty ecma value - if bytecode was compiled successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
+ * @return pointer to bytecode if compilation was successful
+ * NULL - otherwise
*/
-ecma_value_t
-re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] pointer to bytecode */
- ecma_string_t *pattern_str_p, /**< pattern */
+re_compiled_code_t *
+re_compile_bytecode (ecma_string_t *pattern_str_p, /**< pattern */
uint16_t flags) /**< flags */
{
- ecma_value_t ret_value = ECMA_VALUE_EMPTY;
- uint8_t cache_idx = re_find_bytecode_in_cache (pattern_str_p, flags);
+ re_compiled_code_t *cached_bytecode_p = re_cache_lookup (pattern_str_p, flags);
- if (cache_idx < RE_CACHE_SIZE)
+ if (cached_bytecode_p != NULL)
{
- *out_bytecode_p = JERRY_CONTEXT (re_cache)[cache_idx];
-
- if (*out_bytecode_p != NULL)
- {
- ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p);
- return ret_value;
- }
+ ecma_bytecode_ref ((ecma_compiled_code_t *) cached_bytecode_p);
+ return cached_bytecode_p;
}
- /* not in the RegExp cache, so compile it */
re_compiler_ctx_t re_ctx;
re_ctx.flags = flags;
- re_ctx.highest_backref = 0;
+ re_ctx.captures_count = 1;
re_ctx.non_captures_count = 0;
- re_bytecode_ctx_t bc_ctx;
- re_ctx.bytecode_ctx_p = &bc_ctx;
- re_initialize_regexp_bytecode (&bc_ctx);
+ re_initialize_regexp_bytecode (&re_ctx);
ECMA_STRING_TO_UTF8_STRING (pattern_str_p, pattern_start_p, pattern_start_size);
- re_parser_ctx_t parser_ctx;
- parser_ctx.input_start_p = pattern_start_p;
- parser_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p;
- parser_ctx.input_end_p = pattern_start_p + pattern_start_size;
- parser_ctx.groups_count = -1;
- re_ctx.parser_ctx_p = &parser_ctx;
+ re_ctx.input_start_p = pattern_start_p;
+ re_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p;
+ re_ctx.input_end_p = pattern_start_p + pattern_start_size;
+ re_ctx.groups_count = -1;
/* Parse RegExp pattern */
- re_ctx.captures_count = 1;
- re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
-
ecma_value_t result = re_parse_alternative (&re_ctx, true);
ECMA_FINALIZE_UTF8_STRING (pattern_start_p, pattern_start_size);
if (ECMA_IS_VALUE_ERROR (result))
{
- ret_value = result;
- }
- /* Check for invalid backreference */
- else if (re_ctx.highest_backref >= re_ctx.captures_count)
- {
- ret_value = ecma_raise_syntax_error ("Invalid backreference.\n");
- }
- else
- {
- re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
- re_append_opcode (&bc_ctx, RE_OP_EOF);
-
- /* Initialize bytecode header */
- re_compiled_code_t *re_compiled_code_p = (re_compiled_code_t *) bc_ctx.block_start_p;
- re_compiled_code_p->header.refs = 1;
- re_compiled_code_p->header.status_flags = re_ctx.flags;
- ecma_ref_ecma_string (pattern_str_p);
- re_compiled_code_p->source = ecma_make_string_value (pattern_str_p);
- re_compiled_code_p->captures_count = re_ctx.captures_count;
- re_compiled_code_p->non_captures_count = re_ctx.non_captures_count;
- }
-
- size_t byte_code_size = (size_t) (bc_ctx.block_end_p - bc_ctx.block_start_p);
-
- if (!ecma_is_value_empty (ret_value))
- {
/* Compilation failed, free bytecode. */
- JERRY_TRACE_MSG ("RegExp compilation failed!\n");
- jmem_heap_free_block (bc_ctx.block_start_p, byte_code_size);
- *out_bytecode_p = NULL;
+ jmem_heap_free_block (re_ctx.bytecode_start_p, re_ctx.bytecode_size);
+ return NULL;
}
- else
- {
-#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE)
- if (JERRY_CONTEXT (jerry_init_flags) & ECMA_INIT_SHOW_REGEXP_OPCODES)
- {
- re_dump_bytecode (&bc_ctx);
- }
-#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */
- *out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p;
- ((re_compiled_code_t *) bc_ctx.block_start_p)->header.size = (uint16_t) (byte_code_size >> JMEM_ALIGNMENT_LOG);
+ /* Align bytecode size to JMEM_ALIGNMENT so that it can be stored in the bytecode header. */
+ const uint32_t final_size = JERRY_ALIGNUP (re_ctx.bytecode_size, JMEM_ALIGNMENT);
+ re_compiled_code_t *re_compiled_code_p = (re_compiled_code_t *) jmem_heap_realloc_block (re_ctx.bytecode_start_p,
+ re_ctx.bytecode_size,
+ final_size);
- if (cache_idx == RE_CACHE_SIZE)
- {
- if (JERRY_CONTEXT (re_cache_idx) == RE_CACHE_SIZE)
- {
- JERRY_CONTEXT (re_cache_idx) = 0;
- }
+ /* Bytecode will be inserted into the cache and returned to the caller, so refcount is implicitly set to 2. */
+ re_compiled_code_p->header.refs = 2;
+ re_compiled_code_p->header.size = (uint16_t) (final_size >> JMEM_ALIGNMENT_LOG);
+ re_compiled_code_p->header.status_flags = re_ctx.flags;
- JERRY_TRACE_MSG ("RegExp cache is full! Remove the element on idx: %d\n", JERRY_CONTEXT (re_cache_idx));
+ ecma_ref_ecma_string (pattern_str_p);
+ re_compiled_code_p->source = ecma_make_string_value (pattern_str_p);
+ re_compiled_code_p->captures_count = re_ctx.captures_count;
+ re_compiled_code_p->non_captures_count = re_ctx.non_captures_count;
- cache_idx = JERRY_CONTEXT (re_cache_idx)++;
+#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE)
+ if (JERRY_CONTEXT (jerry_init_flags) & ECMA_INIT_SHOW_REGEXP_OPCODES)
+ {
+ re_dump_bytecode (&re_ctx);
+ }
+#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */
- /* The garbage collector might run during the byte code
- * allocations above and it may free this entry. */
- if (JERRY_CONTEXT (re_cache)[cache_idx] != NULL)
- {
- ecma_bytecode_deref ((ecma_compiled_code_t *) JERRY_CONTEXT (re_cache)[cache_idx]);
- }
- }
+ uint8_t cache_idx = JERRY_CONTEXT (re_cache_idx);
- JERRY_TRACE_MSG ("Insert bytecode into RegExp cache (idx: %d).\n", cache_idx);
- ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p);
- JERRY_CONTEXT (re_cache)[cache_idx] = *out_bytecode_p;
+ if (JERRY_CONTEXT (re_cache)[cache_idx] != NULL)
+ {
+ ecma_bytecode_deref ((ecma_compiled_code_t *) JERRY_CONTEXT (re_cache)[cache_idx]);
}
- return ret_value;
+ JERRY_CONTEXT (re_cache)[cache_idx] = re_compiled_code_p;
+ JERRY_CONTEXT (re_cache_idx) = (uint8_t) (cache_idx + 1) % RE_CACHE_SIZE;
+
+ return re_compiled_code_p;
} /* re_compile_bytecode */
/**
diff --git a/jerry-core/parser/regexp/re-compiler.h b/jerry-core/parser/regexp/re-compiler.h
index 8dd2a72e..b5f1e8a7 100644
--- a/jerry-core/parser/regexp/re-compiler.h
+++ b/jerry-core/parser/regexp/re-compiler.h
@@ -20,7 +20,6 @@
#include "ecma-globals.h"
#include "re-bytecode.h"
-#include "re-parser.h"
/** \addtogroup parser Parser
* @{
@@ -32,24 +31,10 @@
* @{
*/
-/**
- * Context of RegExp compiler
- */
-typedef struct
-{
- uint16_t flags; /**< RegExp flags */
- uint32_t captures_count; /**< number of capture groups */
- uint32_t non_captures_count; /**< number of non-capture groups */
- uint32_t highest_backref; /**< highest backreference */
- re_bytecode_ctx_t *bytecode_ctx_p; /**< pointer of RegExp bytecode context */
- re_token_t current_token; /**< current token */
- re_parser_ctx_t *parser_ctx_p; /**< pointer of RegExp parser context */
-} re_compiler_ctx_t;
-
-ecma_value_t
-re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, ecma_string_t *pattern_str_p, uint16_t flags);
-
-void re_cache_gc_run (void);
+re_compiled_code_t *
+re_compile_bytecode (ecma_string_t *pattern_str_p, uint16_t flags);
+
+void re_cache_gc (void);
/**
* @}
diff --git a/jerry-core/parser/regexp/re-parser.c b/jerry-core/parser/regexp/re-parser.c
index 01f305e1..3820d679 100644
--- a/jerry-core/parser/regexp/re-parser.c
+++ b/jerry-core/parser/regexp/re-parser.c
@@ -35,234 +35,336 @@
*/
/**
- * Lookup a character in the input string.
+ * Get the start opcode for the current group.
*
- * @return true - if lookup number of characters ahead are hex digits
- * false - otherwise
+ * @return RegExp opcode
*/
-bool
-re_hex_lookup (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
- uint32_t lookup) /**< size of lookup */
+static re_opcode_t
+re_get_group_start_opcode (bool is_capturing) /**< true for a capturing group, false for a non-capturing one */
{
- const lit_utf8_byte_t *curr_p = parser_ctx_p->input_curr_p;
+ return (is_capturing) ? RE_OP_CAPTURING_GROUP_START : RE_OP_NON_CAPTURING_GROUP_START;
+} /* re_get_group_start_opcode */
- if (JERRY_UNLIKELY (curr_p + lookup > parser_ctx_p->input_end_p))
+/**
+ * Get the end opcode for the current group.
+ *
+ * @return RegExp opcode
+ */
+static re_opcode_t
+re_get_group_end_opcode (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ bool is_capturing) /**< is capturing group */
+{
+ if (is_capturing)
{
- return false;
+ if (re_ctx_p->token.greedy)
+ {
+ return RE_OP_GREEDY_CAPTURING_GROUP_END;
+ }
+
+ return RE_OP_LAZY_CAPTURING_GROUP_END;
}
- for (uint32_t i = 0; i < lookup; i++)
+ if (re_ctx_p->token.greedy)
{
- if (!lit_char_is_hex_digit (*curr_p++))
- {
- return false;
- }
+ return RE_OP_GREEDY_NON_CAPTURING_GROUP_END;
}
- return true;
-} /* re_hex_lookup */
+ return RE_OP_LAZY_NON_CAPTURING_GROUP_END;
+} /* re_get_group_end_opcode */
/**
- * Consume non greedy (question mark) character if present.
- *
- * @return true - if non-greedy character found
- * false - otherwise
+ * Enclose the given bytecode to a group.
*/
-static inline bool JERRY_ATTR_ALWAYS_INLINE
-re_parse_non_greedy_char (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
+static void
+re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ uint32_t group_start_offset, /**< offset of group start */
+ uint32_t idx, /**< index of group */
+ uint32_t capture_start, /**< index of first nested capture */
+ bool is_capturing) /**< is capturing group */
{
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
- && *parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
+ uint32_t qmin = re_ctx_p->token.qmin;
+ uint32_t qmax = re_ctx_p->token.qmax;
+
+ if (JERRY_UNLIKELY (!is_capturing && re_bytecode_size (re_ctx_p) == group_start_offset))
{
- parser_ctx_p->input_curr_p++;
- return true;
+ return;
}
- return false;
-} /* re_parse_non_greedy_char */
+ if (qmin == 0)
+ {
+ re_insert_value (re_ctx_p,
+ group_start_offset,
+ re_bytecode_size (re_ctx_p) - group_start_offset);
+ }
+
+ re_insert_value (re_ctx_p, group_start_offset, qmin);
+ re_insert_value (re_ctx_p, group_start_offset, re_ctx_p->captures_count - capture_start);
+
+ if (!is_capturing)
+ {
+ re_insert_value (re_ctx_p, group_start_offset, capture_start);
+ }
+ else
+ {
+ JERRY_ASSERT (idx == capture_start);
+ }
+
+ re_insert_value (re_ctx_p, group_start_offset, idx);
+ re_insert_opcode (re_ctx_p, group_start_offset, re_get_group_start_opcode (is_capturing));
+
+ re_append_opcode (re_ctx_p, re_get_group_end_opcode (re_ctx_p, is_capturing));
+ re_append_value (re_ctx_p, idx);
+ re_append_value (re_ctx_p, qmin);
+ re_append_value (re_ctx_p, qmax + RE_QMAX_OFFSET);
+} /* re_insert_into_group */
/**
- * Parse a max 3 digit long octal number from input string iterator.
- *
- * @return uint32_t - parsed octal number
+ * Insert simple atom iterator.
*/
-uint32_t
-re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
+static void
+re_insert_atom_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ uint32_t start_offset) /**< atom start offset */
{
- uint32_t number = 0;
- for (int index = 0;
- index < 3
- && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
- && lit_char_is_octal_digit (*parser_ctx_p->input_curr_p);
- index++)
+ const uint32_t qmin = re_ctx_p->token.qmin;
+ const uint32_t qmax = re_ctx_p->token.qmax;
+
+ if (qmin == 1 && qmax == 1)
{
- number = number * 8 + lit_char_hex_to_int (*parser_ctx_p->input_curr_p++);
+ return;
}
- return number;
-} /* re_parse_octal */
+ re_append_opcode (re_ctx_p, RE_OP_ITERATOR_END);
+ re_insert_value (re_ctx_p, start_offset, re_bytecode_size (re_ctx_p) - start_offset);
+ re_insert_value (re_ctx_p, start_offset, qmax + RE_QMAX_OFFSET);
+ re_insert_value (re_ctx_p, start_offset, qmin);
+ re_insert_opcode (re_ctx_p, start_offset, re_ctx_p->token.greedy ? RE_OP_GREEDY_ITERATOR : RE_OP_LAZY_ITERATOR);
+} /* re_insert_atom_iterator */
/**
- * Parse RegExp iterators
- *
- * @return empty ecma value - if parsed successfully
- * error ecma value - otherwise
- *
- * Returned value must be freed with ecma_free_value
+ * Insert a lookahead assertion.
*/
-ecma_value_t
-re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
- re_token_t *re_token_p) /**< [out] output token */
+static void
+re_insert_assertion_lookahead (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ uint32_t start_offset, /**< atom start offset */
+ uint32_t capture_start, /**< index of first nested capture */
+ bool negative) /**< true for a negative lookahead, false for a positive one */
{
- ecma_value_t ret_value = ECMA_VALUE_EMPTY;
+ const uint32_t qmin = re_ctx_p->token.qmin;
+
+ re_append_opcode (re_ctx_p, RE_OP_ASSERT_END);
+ re_insert_value (re_ctx_p, start_offset, re_bytecode_size (re_ctx_p) - start_offset);
+
+ /* We need to clear nested capturing group results when a negative assertion or the tail after a positive assertion
+ * does not match, so we store the start index and the count of nested capturing groups. */
+ re_insert_value (re_ctx_p, start_offset, re_ctx_p->captures_count - capture_start);
+ re_insert_value (re_ctx_p, start_offset, capture_start);
+
+ /* Lookaheads always result in zero length matches, which means iterations will always stop on the first match.
+ * This allows us to not have to deal with iterations beyond one. Either qmin == 0 which will implicitly match,
+ * or qmin > 0, in which case the first iteration will decide whether the assertion matches depending on whether
+ * the iteration matched or not. This also allows us to ignore qmax entirely. */
+ re_insert_byte (re_ctx_p, start_offset, (uint8_t) JERRY_MIN (qmin, 1));
- re_token_p->qmin = 1;
- re_token_p->qmax = 1;
- re_token_p->greedy = true;
+ const re_opcode_t opcode = (negative) ? RE_OP_ASSERT_LOOKAHEAD_NEG : RE_OP_ASSERT_LOOKAHEAD_POS;
+ re_insert_opcode (re_ctx_p, start_offset, opcode);
+} /* re_insert_assertion_lookahead */
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+/**
+ * Consume the lazy modifier (question mark) if present and update the greediness of the current token.
+ */
+static void
+re_parse_lazy_char (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p
+ && *re_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
{
- return ret_value;
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.greedy = false;
+ return;
}
- ecma_char_t ch = *parser_ctx_p->input_curr_p;
+ re_ctx_p->token.greedy = true;
+} /* re_parse_lazy_char */
- switch (ch)
+/**
+ * Parse a max 3 digit long octal number from the input string, with a decimal value less than 256.
+ *
+ * @return value of the octal number
+ */
+static uint32_t
+re_parse_octal (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p);
+ JERRY_ASSERT (lit_char_is_octal_digit (*re_ctx_p->input_curr_p));
+
+ uint32_t value = (uint32_t) (*re_ctx_p->input_curr_p++) - LIT_CHAR_0;
+
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p
+ && lit_char_is_octal_digit (*re_ctx_p->input_curr_p))
{
- case LIT_CHAR_QUESTION:
- {
- parser_ctx_p->input_curr_p++;
- re_token_p->qmin = 0;
- re_token_p->qmax = 1;
- re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
- break;
- }
- case LIT_CHAR_ASTERISK:
- {
- parser_ctx_p->input_curr_p++;
- re_token_p->qmin = 0;
- re_token_p->qmax = RE_ITERATOR_INFINITE;
- re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
- break;
- }
- case LIT_CHAR_PLUS:
+ value = value * 8 + (*re_ctx_p->input_curr_p++) - LIT_CHAR_0;
+ }
+
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p
+ && lit_char_is_octal_digit (*re_ctx_p->input_curr_p))
+ {
+ const uint32_t new_value = value * 8 + (*re_ctx_p->input_curr_p) - LIT_CHAR_0;
+
+ if (new_value <= RE_MAX_OCTAL_VALUE)
{
- parser_ctx_p->input_curr_p++;
- re_token_p->qmin = 1;
- re_token_p->qmax = RE_ITERATOR_INFINITE;
- re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
- break;
+ value = new_value;
+ re_ctx_p->input_curr_p++;
}
- case LIT_CHAR_LEFT_BRACE:
+ }
+
+ return value;
+} /* re_parse_octal */
+
+/**
+ * Check that the currently parsed quantifier is valid, i.e. its minimum does not exceed its maximum.
+ *
+ * @return ECMA_VALUE_ERROR, if quantifier is invalid (qmin > qmax)
+ * ECMA_VALUE_EMPTY, otherwise
+ */
+static ecma_value_t
+re_check_quantifier (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ if (re_ctx_p->token.qmin > re_ctx_p->token.qmax)
+ {
+ /* ECMA-262 v5.1 15.10.2.5 */
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("quantifier error: min > max."));
+ }
+
+ return ECMA_VALUE_EMPTY;
+} /* re_check_quantifier */
+
+/**
+ * Parse RegExp quantifier.
+ *
+ * @return ECMA_VALUE_TRUE - if a quantifier was parsed and consumed
+ * ECMA_VALUE_FALSE - if no valid quantifier follows (the token is reset to qmin = qmax = 1, greedy)
+ */
+static ecma_value_t
+re_parse_quantifier (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p)
+ {
+ switch (*re_ctx_p->input_curr_p)
{
- parser_ctx_p->input_curr_p++;
- uint32_t qmin = 0;
- uint32_t qmax = RE_ITERATOR_INFINITE;
- uint32_t digits = 0;
+ case LIT_CHAR_QUESTION:
+ {
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.qmin = 0;
+ re_ctx_p->token.qmax = 1;
- while (true)
+ re_parse_lazy_char (re_ctx_p);
+ return ECMA_VALUE_TRUE;
+ }
+ case LIT_CHAR_ASTERISK:
{
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier"));
- }
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.qmin = 0;
+ re_ctx_p->token.qmax = RE_INFINITY;
- ch = *parser_ctx_p->input_curr_p++;
+ re_parse_lazy_char (re_ctx_p);
+ return ECMA_VALUE_TRUE;
+ }
+ case LIT_CHAR_PLUS:
+ {
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.qmin = 1;
+ re_ctx_p->token.qmax = RE_INFINITY;
- if (lit_char_is_decimal_digit (ch))
+ re_parse_lazy_char (re_ctx_p);
+ return ECMA_VALUE_TRUE;
+ }
+ case LIT_CHAR_LEFT_BRACE:
+ {
+ const lit_utf8_byte_t *current_p = re_ctx_p->input_curr_p + 1;
+ uint32_t qmin = 0;
+ uint32_t qmax = RE_INFINITY;
+
+ if (current_p >= re_ctx_p->input_end_p)
{
- if (digits >= ECMA_NUMBER_MAX_DIGITS)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: too many digits."));
- }
- digits++;
- qmin = qmin * 10 + lit_char_hex_to_int (ch);
+ break;
}
- else if (ch == LIT_CHAR_COMMA)
+
+ if (!lit_char_is_decimal_digit (*current_p))
{
- if (qmax != RE_ITERATOR_INFINITE)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: double comma."));
- }
+ break;
+ }
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid quantifier"));
- }
+ qmin = lit_parse_decimal (&current_p, re_ctx_p->input_end_p);
- if (*parser_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE)
- {
- if (digits == 0)
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits."));
- }
+ if (current_p >= re_ctx_p->input_end_p)
+ {
+ break;
+ }
- parser_ctx_p->input_curr_p++;
- re_token_p->qmin = qmin;
- re_token_p->qmax = RE_ITERATOR_INFINITE;
- break;
- }
+ lit_utf8_byte_t ch = *current_p++;
+ if (ch == LIT_CHAR_RIGHT_BRACE)
+ {
qmax = qmin;
- qmin = 0;
- digits = 0;
}
- else if (ch == LIT_CHAR_RIGHT_BRACE)
+ else if (ch == LIT_CHAR_COMMA)
{
- if (digits == 0)
+ if (current_p >= re_ctx_p->input_end_p)
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: missing digits."));
+ break;
}
- if (qmax != RE_ITERATOR_INFINITE)
+ if (lit_char_is_decimal_digit (*current_p))
{
- re_token_p->qmin = qmax;
+ qmax = lit_parse_decimal (&current_p, re_ctx_p->input_end_p);
}
- else
+
+ if (current_p >= re_ctx_p->input_end_p || *current_p++ != LIT_CHAR_RIGHT_BRACE)
{
- re_token_p->qmin = qmin;
+ break;
}
-
- re_token_p->qmax = qmin;
-
- break;
}
else
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp quantifier error: unknown char."));
+ break;
}
- }
- re_token_p->greedy = !re_parse_non_greedy_char (parser_ctx_p);
- break;
- }
- default:
- {
- break;
+ re_ctx_p->token.qmin = qmin;
+ re_ctx_p->token.qmax = qmax;
+ re_ctx_p->input_curr_p = current_p;
+ re_parse_lazy_char (re_ctx_p);
+ return ECMA_VALUE_TRUE;
+ }
+ default:
+ {
+ break;
+ }
}
}
- JERRY_ASSERT (ecma_is_value_empty (ret_value));
+ re_ctx_p->token.qmin = 1;
+ re_ctx_p->token.qmax = 1;
+ re_ctx_p->token.greedy = true;
- return ret_value;
-} /* re_parse_iterator */
+ return ECMA_VALUE_FALSE;
+} /* re_parse_quantifier */
/**
- * Count the number of groups in pattern
+ * Count the number of groups in the current pattern.
*/
static void
-re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
+re_count_groups (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
{
- int char_class_in = 0;
- parser_ctx_p->groups_count = 0;
- const lit_utf8_byte_t *curr_p = parser_ctx_p->input_start_p;
+ bool is_char_class = 0;
+ re_ctx_p->groups_count = 0;
+ const lit_utf8_byte_t *curr_p = re_ctx_p->input_start_p;
- while (curr_p < parser_ctx_p->input_end_p)
+ while (curr_p < re_ctx_p->input_end_p)
{
switch (*curr_p++)
{
case LIT_CHAR_BACKSLASH:
{
- if (curr_p < parser_ctx_p->input_end_p)
+ if (curr_p < re_ctx_p->input_end_p)
{
lit_utf8_incr (&curr_p);
}
@@ -270,324 +372,424 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex
}
case LIT_CHAR_LEFT_SQUARE:
{
- char_class_in++;
+ is_char_class = true;
break;
}
case LIT_CHAR_RIGHT_SQUARE:
{
- if (char_class_in)
- {
- char_class_in--;
- }
+ is_char_class = false;
break;
}
case LIT_CHAR_LEFT_PAREN:
{
- if (curr_p < parser_ctx_p->input_end_p
+ if (curr_p < re_ctx_p->input_end_p
&& *curr_p != LIT_CHAR_QUESTION
- && !char_class_in)
+ && !is_char_class)
{
- parser_ctx_p->groups_count++;
+ re_ctx_p->groups_count++;
}
break;
}
}
}
-} /* re_count_num_of_groups */
+} /* re_count_groups */
+#if ENABLED (JERRY_ES2015)
/**
- * Read the input pattern and parse the next token for the RegExp compiler
+ * Check if a code point is a Syntax character
*
- * @return empty ecma value - if parsed successfully
- * error ecma value - otherwise
+ * @return true, if syntax character
+ * false, otherwise
+ */
+static bool
+re_is_syntax_char (lit_code_point_t cp) /**< code point */
+{
+ return (cp == LIT_CHAR_CIRCUMFLEX
+ || cp == LIT_CHAR_DOLLAR_SIGN
+ || cp == LIT_CHAR_BACKSLASH
+ || cp == LIT_CHAR_DOT
+ || cp == LIT_CHAR_ASTERISK
+ || cp == LIT_CHAR_PLUS
+ || cp == LIT_CHAR_QUESTION
+ || cp == LIT_CHAR_LEFT_PAREN
+ || cp == LIT_CHAR_RIGHT_PAREN
+ || cp == LIT_CHAR_LEFT_SQUARE
+ || cp == LIT_CHAR_RIGHT_SQUARE
+ || cp == LIT_CHAR_LEFT_BRACE
+ || cp == LIT_CHAR_RIGHT_BRACE
+ || cp == LIT_CHAR_VLINE);
+} /* re_is_syntax_char */
+#endif /* ENABLED (JERRY_ES2015) */
+
+/**
+ * Parse a Character Escape or a Character Class Escape.
*
- * Returned value must be freed with ecma_free_value
+ * @return ECMA_VALUE_EMPTY, if parsed successfully
+ * ECMA_VALUE_ERROR, otherwise
*/
-ecma_value_t
-re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
- re_token_t *out_token_p) /**< [out] output token */
+static ecma_value_t
+re_parse_char_escape (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
{
- ecma_value_t ret_value = ECMA_VALUE_EMPTY;
+ JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p);
+ re_ctx_p->token.type = RE_TOK_CHAR;
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+ if (lit_char_is_decimal_digit (*re_ctx_p->input_curr_p))
{
- out_token_p->type = RE_TOK_EOF;
- return ret_value;
- }
+ /* NULL code point escape, only valid if there are no following digits. */
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_0
+ && (re_ctx_p->input_curr_p + 1 >= re_ctx_p->input_end_p
+ || !lit_char_is_decimal_digit (re_ctx_p->input_curr_p[1])))
+ {
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.value = LIT_UNICODE_CODE_POINT_NULL;
+ return ECMA_VALUE_EMPTY;
+ }
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape sequence"));
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ /* Legacy octal escape sequence */
+ if (lit_char_is_octal_digit (*re_ctx_p->input_curr_p))
+ {
+ re_ctx_p->token.value = re_parse_octal (re_ctx_p);
+ return ECMA_VALUE_EMPTY;
+ }
- ecma_char_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p);
+ /* Identity escape */
+ re_ctx_p->token.value = *re_ctx_p->input_curr_p++;
+ return ECMA_VALUE_EMPTY;
+ }
+ lit_code_point_t ch = lit_cesu8_read_next (&re_ctx_p->input_curr_p);
switch (ch)
{
- case LIT_CHAR_VLINE:
+ /* Character Class escapes */
+ case LIT_CHAR_LOWERCASE_D:
{
- out_token_p->type = RE_TOK_ALTERNATIVE;
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_DIGIT;
break;
}
- case LIT_CHAR_CIRCUMFLEX:
+ case LIT_CHAR_UPPERCASE_D:
{
- out_token_p->type = RE_TOK_ASSERT_START;
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_NOT_DIGIT;
break;
}
- case LIT_CHAR_DOLLAR_SIGN:
+ case LIT_CHAR_LOWERCASE_S:
{
- out_token_p->type = RE_TOK_ASSERT_END;
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_WHITESPACE;
break;
}
- case LIT_CHAR_DOT:
+ case LIT_CHAR_UPPERCASE_S:
{
- out_token_p->type = RE_TOK_PERIOD;
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_NOT_WHITESPACE;
break;
}
- case LIT_CHAR_BACKSLASH:
+ case LIT_CHAR_LOWERCASE_W:
+ {
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_WORD_CHAR;
+ break;
+ }
+ case LIT_CHAR_UPPERCASE_W:
+ {
+ re_ctx_p->token.type = RE_TOK_CLASS_ESCAPE;
+ re_ctx_p->token.value = RE_ESCAPE_NOT_WORD_CHAR;
+ break;
+ }
+ /* Control escapes */
+ case LIT_CHAR_LOWERCASE_F:
+ {
+ re_ctx_p->token.value = LIT_CHAR_FF;
+ break;
+ }
+ case LIT_CHAR_LOWERCASE_N:
+ {
+ re_ctx_p->token.value = LIT_CHAR_LF;
+ break;
+ }
+ case LIT_CHAR_LOWERCASE_R:
{
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+ re_ctx_p->token.value = LIT_CHAR_CR;
+ break;
+ }
+ case LIT_CHAR_LOWERCASE_T:
+ {
+ re_ctx_p->token.value = LIT_CHAR_TAB;
+ break;
+ }
+ case LIT_CHAR_LOWERCASE_V:
+ {
+ re_ctx_p->token.value = LIT_CHAR_VTAB;
+ break;
+ }
+ /* Control letter */
+ case LIT_CHAR_LOWERCASE_C:
+ {
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p)
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid regular experssion"));
- }
-
- out_token_p->type = RE_TOK_CHAR;
- ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p);
+ ch = *re_ctx_p->input_curr_p;
- if (ch == LIT_CHAR_LOWERCASE_B)
- {
- out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY;
- }
- else if (ch == LIT_CHAR_UPPERCASE_B)
- {
- out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY;
- }
- else if (ch == LIT_CHAR_LOWERCASE_F)
- {
- out_token_p->value = LIT_CHAR_FF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_N)
- {
- out_token_p->value = LIT_CHAR_LF;
- }
- else if (ch == LIT_CHAR_LOWERCASE_T)
- {
- out_token_p->value = LIT_CHAR_TAB;
- }
- else if (ch == LIT_CHAR_LOWERCASE_R)
- {
- out_token_p->value = LIT_CHAR_CR;
- }
- else if (ch == LIT_CHAR_LOWERCASE_V)
- {
- out_token_p->value = LIT_CHAR_VTAB;
- }
- else if (ch == LIT_CHAR_LOWERCASE_C)
- {
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
+ if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
+ || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END))
{
- ch = *parser_ctx_p->input_curr_p;
+ re_ctx_p->token.value = (ch % 32);
+ re_ctx_p->input_curr_p++;
- if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
- || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END))
- {
- out_token_p->value = (ch % 32);
- parser_ctx_p->input_curr_p++;
- }
- else
- {
- out_token_p->value = LIT_CHAR_BACKSLASH;
- parser_ctx_p->input_curr_p--;
- }
- }
- else
- {
- out_token_p->value = LIT_CHAR_BACKSLASH;
- parser_ctx_p->input_curr_p--;
+ break;
}
}
- else if (ch == LIT_CHAR_LOWERCASE_X
- && re_hex_lookup (parser_ctx_p, 2))
- {
- ecma_char_t code_unit;
- if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
- }
-
- parser_ctx_p->input_curr_p += 2;
- out_token_p->value = code_unit;
- }
- else if (ch == LIT_CHAR_LOWERCASE_U
- && re_hex_lookup (parser_ctx_p, 4))
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
{
- ecma_char_t code_unit;
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid control escape sequence"));
+ }
+#endif /* ENABLED (JERRY_ES2015) */
- if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
- {
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
- }
+ re_ctx_p->token.value = LIT_CHAR_BACKSLASH;
+ re_ctx_p->input_curr_p--;
- parser_ctx_p->input_curr_p += 4;
- out_token_p->value = code_unit;
- }
- else if (ch == LIT_CHAR_LOWERCASE_D)
- {
- out_token_p->type = RE_TOK_DIGIT;
- break;
- }
- else if (ch == LIT_CHAR_UPPERCASE_D)
- {
- out_token_p->type = RE_TOK_NOT_DIGIT;
- break;
- }
- else if (ch == LIT_CHAR_LOWERCASE_S)
- {
- out_token_p->type = RE_TOK_WHITE;
- break;
- }
- else if (ch == LIT_CHAR_UPPERCASE_S)
- {
- out_token_p->type = RE_TOK_NOT_WHITE;
- break;
- }
- else if (ch == LIT_CHAR_LOWERCASE_W)
+ break;
+ }
+ /* Hex escape */
+ case LIT_CHAR_LOWERCASE_X:
+ {
+ uint32_t hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p, re_ctx_p->input_end_p, 2);
+ if (hex_value != UINT32_MAX)
{
- out_token_p->type = RE_TOK_WORD_CHAR;
+ re_ctx_p->token.value = hex_value;
+ re_ctx_p->input_curr_p += 2;
break;
}
- else if (ch == LIT_CHAR_UPPERCASE_W)
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
{
- out_token_p->type = RE_TOK_NOT_WORD_CHAR;
- break;
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid hex escape sequence"));
}
- else if (lit_char_is_decimal_digit (ch))
+#endif /* ENABLED (JERRY_ES2015) */
+
+ re_ctx_p->token.value = LIT_CHAR_LOWERCASE_X;
+ break;
+ }
+ /* Unicode escape */
+ case LIT_CHAR_LOWERCASE_U:
+ {
+ uint32_t hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p, re_ctx_p->input_end_p, 4);
+ if (hex_value != UINT32_MAX)
{
- if (ch == LIT_CHAR_0)
+ re_ctx_p->token.value = hex_value;
+ re_ctx_p->input_curr_p += 4;
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE
+ && lit_is_code_point_utf16_high_surrogate (re_ctx_p->token.value)
+ && re_ctx_p->input_curr_p + 6 <= re_ctx_p->input_end_p
+ && re_ctx_p->input_curr_p[0] == '\\'
+ && re_ctx_p->input_curr_p[1] == 'u')
{
- if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
- && lit_char_is_decimal_digit (*parser_ctx_p->input_curr_p))
+ hex_value = lit_char_hex_lookup (re_ctx_p->input_curr_p + 2, re_ctx_p->input_end_p, 4);
+ if (lit_is_code_point_utf16_low_surrogate (hex_value))
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape pattern error."));
+ re_ctx_p->token.value = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) re_ctx_p->token.value,
+ (ecma_char_t) hex_value);
+ re_ctx_p->input_curr_p += 6;
}
-
- out_token_p->value = LIT_UNICODE_CODE_POINT_NULL;
}
- else
+#endif /* ENABLED (JERRY_ES2015) */
+
+ break;
+ }
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_LEFT_BRACE)
{
- if (parser_ctx_p->groups_count == -1)
- {
- re_count_num_of_groups (parser_ctx_p);
- }
+ re_ctx_p->input_curr_p++;
- if (parser_ctx_p->groups_count)
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p && lit_char_is_hex_digit (*re_ctx_p->input_curr_p))
{
- parser_ctx_p->input_curr_p--;
- uint32_t number = 0;
- int index = 0;
+ lit_code_point_t cp = lit_char_hex_to_int (*re_ctx_p->input_curr_p++);
- do
+ while (re_ctx_p->input_curr_p < re_ctx_p->input_end_p && lit_char_is_hex_digit (*re_ctx_p->input_curr_p))
{
- if (index >= RE_MAX_RE_DECESC_DIGITS)
- {
- ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("RegExp escape error: decimal escape too long."));
- return ret_value;
- }
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
- {
- break;
- }
+ cp = cp * 16 + lit_char_hex_to_int (*re_ctx_p->input_curr_p++);
- ecma_char_t digit = *parser_ctx_p->input_curr_p++;
-
- if (!lit_char_is_decimal_digit (digit))
+ if (JERRY_UNLIKELY (cp > LIT_UNICODE_CODE_POINT_MAX))
{
- parser_ctx_p->input_curr_p--;
- break;
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid unicode escape sequence"));
}
- number = number * 10 + lit_char_hex_to_int (digit);
- index++;
}
- while (true);
- if ((int) number <= parser_ctx_p->groups_count)
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p && *re_ctx_p->input_curr_p == LIT_CHAR_RIGHT_BRACE)
{
- out_token_p->type = RE_TOK_BACKREFERENCE;
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.value = cp;
+ break;
}
- else
- /* Invalid backreference, fallback to octal */
- {
- /* Rewind to start of number. */
- parser_ctx_p->input_curr_p -= index;
+ }
+ }
- /* Try to reparse as octal. */
- ecma_char_t digit = *parser_ctx_p->input_curr_p;
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid unicode escape sequence"));
+ }
+#endif /* ENABLED (JERRY_ES2015) */
- if (!lit_char_is_octal_digit (digit))
- {
- /* Not octal, keep digit character value. */
- number = digit;
- parser_ctx_p->input_curr_p++;
- }
- else
- {
- number = re_parse_octal (parser_ctx_p);
- }
- }
- out_token_p->value = number;
- }
- else
- /* Invalid backreference, fallback to octal if possible */
- {
- if (!lit_char_is_octal_digit (ch))
- {
- /* Not octal, keep character value. */
- out_token_p->value = ch;
- }
- else
- {
- parser_ctx_p->input_curr_p--;
- out_token_p->value = re_parse_octal (parser_ctx_p);
- }
- }
+ re_ctx_p->token.value = LIT_CHAR_LOWERCASE_U;
+ break;
+ }
+ /* Identity escape */
+ default:
+ {
+#if ENABLED (JERRY_ES2015)
+ /* Must be '/', or one of SyntaxCharacter */
+ if (re_ctx_p->flags & RE_FLAG_UNICODE
+ && ch != LIT_CHAR_SLASH
+ && !re_is_syntax_char (ch))
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape"));
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+ re_ctx_p->token.value = ch;
+ }
+ }
+
+ return ECMA_VALUE_EMPTY;
+} /* re_parse_char_escape */
+
+/**
+ * Read the input pattern and parse the next token for the RegExp compiler
+ *
+ * @return empty ecma value - if parsed successfully
+ * error ecma value - otherwise
+ *
+ * Returned value must be freed with ecma_free_value
+ */
+static ecma_value_t
+re_parse_next_token (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
+ {
+ re_ctx_p->token.type = RE_TOK_EOF;
+ return ECMA_VALUE_EMPTY;
+ }
+
+ ecma_char_t ch = lit_cesu8_read_next (&re_ctx_p->input_curr_p);
+
+ switch (ch)
+ {
+ case LIT_CHAR_CIRCUMFLEX:
+ {
+ re_ctx_p->token.type = RE_TOK_ASSERT_START;
+ return ECMA_VALUE_EMPTY;
+ }
+ case LIT_CHAR_DOLLAR_SIGN:
+ {
+ re_ctx_p->token.type = RE_TOK_ASSERT_END;
+ return ECMA_VALUE_EMPTY;
+ }
+ case LIT_CHAR_VLINE:
+ {
+ re_ctx_p->token.type = RE_TOK_ALTERNATIVE;
+ return ECMA_VALUE_EMPTY;
+ }
+ case LIT_CHAR_DOT:
+ {
+ re_ctx_p->token.type = RE_TOK_PERIOD;
+ /* Check quantifier */
+ break;
+ }
+ case LIT_CHAR_BACKSLASH:
+ {
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape"));
+ }
+
+ /* DecimalEscape, Backreferences cannot start with a zero digit. */
+ if (*re_ctx_p->input_curr_p > LIT_CHAR_0 && *re_ctx_p->input_curr_p <= LIT_CHAR_9)
+ {
+ const lit_utf8_byte_t *digits_p = re_ctx_p->input_curr_p;
+ const uint32_t value = lit_parse_decimal (&digits_p, re_ctx_p->input_end_p);
+
+ if (re_ctx_p->groups_count < 0)
+ {
+ re_count_groups (re_ctx_p);
+ }
+
+ if (value <= (uint32_t) re_ctx_p->groups_count)
+ {
+ /* Valid backreference */
+ re_ctx_p->input_curr_p = digits_p;
+ re_ctx_p->token.type = RE_TOK_BACKREFERENCE;
+ re_ctx_p->token.value = value;
+
+ /* Check quantifier */
+ break;
}
}
- else
+
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_B)
{
- out_token_p->value = ch;
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.type = RE_TOK_ASSERT_WORD_BOUNDARY;
+ return ECMA_VALUE_EMPTY;
}
+ else if (*re_ctx_p->input_curr_p == LIT_CHAR_UPPERCASE_B)
+ {
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY;
+ return ECMA_VALUE_EMPTY;
+ }
+
+ const ecma_value_t parse_result = re_parse_char_escape (re_ctx_p);
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
+ if (ECMA_IS_VALUE_ERROR (parse_result))
+ {
+ return parse_result;
+ }
+
+ /* Check quantifier */
break;
}
case LIT_CHAR_LEFT_PAREN:
{
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
{
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated group"));
}
- if (*parser_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_QUESTION)
{
- parser_ctx_p->input_curr_p++;
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+ re_ctx_p->input_curr_p++;
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
{
return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid group"));
}
- ch = *parser_ctx_p->input_curr_p++;
+ ch = *re_ctx_p->input_curr_p++;
if (ch == LIT_CHAR_EQUALS)
{
- /* (?= */
- out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD;
+ re_ctx_p->token.type = RE_TOK_ASSERT_LOOKAHEAD;
+ re_ctx_p->token.value = false;
}
else if (ch == LIT_CHAR_EXCLAMATION)
{
- /* (?! */
- out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD;
+ re_ctx_p->token.type = RE_TOK_ASSERT_LOOKAHEAD;
+ re_ctx_p->token.value = true;
}
else if (ch == LIT_CHAR_COLON)
{
- /* (?: */
- out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP;
+ re_ctx_p->token.type = RE_TOK_START_NON_CAPTURE_GROUP;
}
else
{
@@ -596,105 +798,584 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
}
else
{
- /* ( */
- out_token_p->type = RE_TOK_START_CAPTURE_GROUP;
+ re_ctx_p->token.type = RE_TOK_START_CAPTURE_GROUP;
}
- break;
+
+ return ECMA_VALUE_EMPTY;
}
case LIT_CHAR_RIGHT_PAREN:
{
- out_token_p->type = RE_TOK_END_GROUP;
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
- break;
+ re_ctx_p->token.type = RE_TOK_END_GROUP;
+
+ return ECMA_VALUE_EMPTY;
}
case LIT_CHAR_LEFT_SQUARE:
{
- out_token_p->type = RE_TOK_START_CHAR_CLASS;
+ re_ctx_p->token.type = RE_TOK_CHAR_CLASS;
- if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class"));
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated character class."));
}
- if (*parser_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX)
- {
- out_token_p->type = RE_TOK_START_INV_CHAR_CLASS;
- parser_ctx_p->input_curr_p++;
- }
-
- break;
+ return ECMA_VALUE_EMPTY;
}
case LIT_CHAR_QUESTION:
case LIT_CHAR_ASTERISK:
case LIT_CHAR_PLUS:
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
- }
- case LIT_CHAR_NULL:
- {
- out_token_p->type = RE_TOK_EOF;
- break;
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid quantifier."));
}
case LIT_CHAR_LEFT_BRACE:
{
-#if ENABLED (JERRY_REGEXP_STRICT_MODE)
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
-#else /* !ENABLED (JERRY_REGEXP_STRICT_MODE) */
-
- /* Make sure that the current '{' does not start an iterator.
- *
- * E.g: /\s+{3,4}/ should fail as there is nothing to iterate.
- * However /\s+{3,4/ should be valid in web compatibility mode.
- */
- const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p;
+ re_ctx_p->input_curr_p--;
+ if (ecma_is_value_true (re_parse_quantifier (re_ctx_p)))
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Nothing to repeat."));
+ }
- lit_utf8_decr (&parser_ctx_p->input_curr_p);
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
- if (ecma_is_value_empty (ret_value))
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
{
- return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Lone quantifier bracket."));
}
+#endif /* ENABLED (JERRY_ES2015) */
+
+ re_ctx_p->input_curr_p++;
+ re_ctx_p->token.type = RE_TOK_CHAR;
+ re_ctx_p->token.value = ch;
- JERRY_ASSERT (ECMA_IS_VALUE_ERROR (ret_value));
- jcontext_release_exception ();
+ /* Check quantifier */
+ break;
+ }
+#if ENABLED (JERRY_ES2015)
+ case LIT_CHAR_RIGHT_SQUARE:
+ case LIT_CHAR_RIGHT_BRACE:
+ {
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Lone quantifier bracket."));
+ }
- parser_ctx_p->input_curr_p = input_curr_p;
- /* It was not an iterator, continue the parsing. */
-#endif /* ENABLED (JERRY_REGEXP_STRICT_MODE) */
/* FALLTHRU */
}
+#endif /* ENABLED (JERRY_ES2015) */
default:
{
- out_token_p->type = RE_TOK_CHAR;
- out_token_p->value = ch;
-#if ENABLED (JERRY_REGEXP_STRICT_MODE)
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
-#else
- /* In case of compatiblity mode try the following:
- * 1. Try parsing an iterator after the character.
- * 2.a. If no error is reported: it was an iterator so return an empty value.
- * 2.b. If there was an error: it was not an iterator thus return the current position
- * to the start of the iterator parsing and set the return value to the empty value.
- * 3. The next 're_parse_next_token' call will handle the further parsing of characters.
- */
- const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p;
- ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
-
- if (!ecma_is_value_empty (ret_value))
- {
- jcontext_release_exception ();
- parser_ctx_p->input_curr_p = input_curr_p;
- ret_value = ECMA_VALUE_EMPTY;
- }
-#endif
+ re_ctx_p->token.type = RE_TOK_CHAR;
+ re_ctx_p->token.value = ch;
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE
+ && lit_is_code_point_utf16_high_surrogate (ch))
+ {
+ const ecma_char_t next = lit_cesu8_peek_next (re_ctx_p->input_curr_p);
+ if (lit_is_code_point_utf16_low_surrogate (next))
+ {
+ re_ctx_p->token.value = lit_convert_surrogate_pair_to_code_point (ch, next);
+ re_ctx_p->input_curr_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
+ }
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ /* Check quantifier */
break;
}
}
- return ret_value;
+ re_parse_quantifier (re_ctx_p);
+ return re_check_quantifier (re_ctx_p);
} /* re_parse_next_token */
/**
+ * Append a character class range to the bytecode.
+ */
+static void
+re_class_add_range (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ lit_code_point_t start, /**< range begin */
+ lit_code_point_t end) /**< range end */
+{
+ if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE)
+ {
+ start = ecma_regexp_canonicalize_char (start, re_ctx_p->flags & RE_FLAG_UNICODE);
+ end = ecma_regexp_canonicalize_char (end, re_ctx_p->flags & RE_FLAG_UNICODE);
+ }
+
+ re_append_char (re_ctx_p, start);
+ re_append_char (re_ctx_p, end);
+} /* re_class_add_range */
+
+/**
+ * Add a single character to the character class
+ */
+static void
+re_class_add_char (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ uint32_t class_offset, /**< character class bytecode offset*/
+ lit_code_point_t cp) /**< code point */
+{
+ if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE)
+ {
+ cp = ecma_regexp_canonicalize_char (cp, re_ctx_p->flags & RE_FLAG_UNICODE);
+ }
+
+ re_insert_char (re_ctx_p, class_offset, cp);
+} /* re_class_add_char */
+
+/**
+ * Invalid character code point
+ */
+#define RE_INVALID_CP 0xFFFFFFFF
+
+/**
+ * Read the input pattern and parse the range of character class
+ *
+ * @return empty ecma value - if parsed successfully
+ * error ecma value - otherwise
+ *
+ * Returned value must be freed with ecma_free_value
+ */
+static ecma_value_t
+re_parse_char_class (re_compiler_ctx_t *re_ctx_p) /**< RegExp compiler context */
+{
+ static const uint8_t escape_flags[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20};
+ const uint32_t class_offset = re_bytecode_size (re_ctx_p);
+
+ uint8_t found_escape_flags = 0;
+ uint8_t out_class_flags = 0;
+
+ uint32_t range_count = 0;
+ uint32_t char_count = 0;
+ bool is_range = false;
+
+ JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p);
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_CIRCUMFLEX)
+ {
+ re_ctx_p->input_curr_p++;
+ out_class_flags |= RE_CLASS_INVERT;
+ }
+
+ lit_code_point_t start = RE_INVALID_CP;
+
+ while (true)
+ {
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unterminated character class."));
+ }
+
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_RIGHT_SQUARE)
+ {
+ if (is_range)
+ {
+ if (start != RE_INVALID_CP)
+ {
+ re_class_add_char (re_ctx_p, class_offset, start);
+ char_count++;
+ }
+
+ re_class_add_char (re_ctx_p, class_offset, LIT_CHAR_MINUS);
+ char_count++;
+ }
+
+ re_ctx_p->input_curr_p++;
+ break;
+ }
+
+ JERRY_ASSERT (re_ctx_p->input_curr_p < re_ctx_p->input_end_p);
+ lit_code_point_t current;
+
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_BACKSLASH)
+ {
+ re_ctx_p->input_curr_p++;
+ if (re_ctx_p->input_curr_p >= re_ctx_p->input_end_p)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid escape"));
+ }
+
+ if (*re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_B)
+ {
+ re_ctx_p->input_curr_p++;
+ current = LIT_CHAR_BS;
+ }
+#if ENABLED (JERRY_ES2015)
+ else if (*re_ctx_p->input_curr_p == LIT_CHAR_MINUS)
+ {
+ re_ctx_p->input_curr_p++;
+ current = LIT_CHAR_MINUS;
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+ else if ((re_ctx_p->flags & RE_FLAG_UNICODE) == 0
+ && *re_ctx_p->input_curr_p == LIT_CHAR_LOWERCASE_C
+ && re_ctx_p->input_curr_p + 1 < re_ctx_p->input_end_p
+ && (lit_char_is_decimal_digit (*(re_ctx_p->input_curr_p + 1))
+ || *(re_ctx_p->input_curr_p + 1) == LIT_CHAR_UNDERSCORE))
+ {
+ current = ((uint8_t) *(re_ctx_p->input_curr_p + 1) % 32);
+ re_ctx_p->input_curr_p += 2;
+ }
+ else
+ {
+ if (ECMA_IS_VALUE_ERROR (re_parse_char_escape (re_ctx_p)))
+ {
+ return ECMA_VALUE_ERROR;
+ }
+
+ if (re_ctx_p->token.type == RE_TOK_CLASS_ESCAPE)
+ {
+ const uint8_t escape = (uint8_t) re_ctx_p->token.value;
+ found_escape_flags |= escape_flags[escape];
+ current = RE_INVALID_CP;
+ }
+ else
+ {
+ JERRY_ASSERT (re_ctx_p->token.type == RE_TOK_CHAR);
+ current = re_ctx_p->token.value;
+ }
+ }
+ }
+#if ENABLED (JERRY_ES2015)
+ else if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ current = ecma_regexp_unicode_advance (&re_ctx_p->input_curr_p, re_ctx_p->input_end_p);
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+ else
+ {
+ current = lit_cesu8_read_next (&re_ctx_p->input_curr_p);
+ }
+
+ if (is_range)
+ {
+ is_range = false;
+
+ if (start != RE_INVALID_CP && current != RE_INVALID_CP)
+ {
+ if (start > current)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Range out of order in character class"));
+ }
+
+ re_class_add_range (re_ctx_p, start, current);
+ range_count++;
+ continue;
+ }
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid character class"));
+ }
+#endif /* ENABLED (JERRY_ES2015) */
+
+ if (start != RE_INVALID_CP)
+ {
+ re_class_add_char (re_ctx_p, class_offset, start);
+ char_count++;
+ }
+ else if (current != RE_INVALID_CP)
+ {
+ re_class_add_char (re_ctx_p, class_offset, current);
+ char_count++;
+ }
+
+ re_class_add_char (re_ctx_p, class_offset, LIT_CHAR_MINUS);
+ char_count++;
+ continue;
+ }
+
+ if (re_ctx_p->input_curr_p < re_ctx_p->input_end_p
+ && *re_ctx_p->input_curr_p == LIT_CHAR_MINUS)
+ {
+ re_ctx_p->input_curr_p++;
+ start = current;
+ is_range = true;
+ continue;
+ }
+
+ if (current != RE_INVALID_CP)
+ {
+ re_class_add_char (re_ctx_p, class_offset, current);
+ char_count++;
+ }
+ }
+
+ uint8_t escape_count = 0;
+ for (ecma_class_escape_t escape = RE_ESCAPE__START; escape < RE_ESCAPE__COUNT; ++escape)
+ {
+ if (found_escape_flags & escape_flags[escape])
+ {
+ re_insert_byte (re_ctx_p, class_offset, (uint8_t) escape);
+ escape_count++;
+ }
+ }
+
+ if (range_count > 0)
+ {
+ re_insert_value (re_ctx_p, class_offset, range_count);
+ out_class_flags |= RE_CLASS_HAS_RANGES;
+ }
+
+ if (char_count > 0)
+ {
+ re_insert_value (re_ctx_p, class_offset, char_count);
+ out_class_flags |= RE_CLASS_HAS_CHARS;
+ }
+
+ JERRY_ASSERT (escape_count <= RE_CLASS_ESCAPE_COUNT_MASK);
+ out_class_flags |= escape_count;
+
+ re_insert_byte (re_ctx_p, class_offset, out_class_flags);
+ re_insert_opcode (re_ctx_p, class_offset, RE_OP_CHAR_CLASS);
+
+ re_parse_quantifier (re_ctx_p);
+ return re_check_quantifier (re_ctx_p);
+} /* re_parse_char_class */
+
+/**
+ * Parse alternatives
+ *
+ * @return empty ecma value - if alternative was successfully parsed
+ * error ecma value - otherwise
+ *
+ * Returned value must be freed with ecma_free_value
+ */
+ecma_value_t
+re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
+ bool expect_eof) /**< expect end of file */
+{
+ ECMA_CHECK_STACK_USAGE ();
+ uint32_t alternative_offset = re_bytecode_size (re_ctx_p);
+ bool first_alternative = true;
+
+ while (true)
+ {
+ ecma_value_t next_token_result = re_parse_next_token (re_ctx_p);
+ if (ECMA_IS_VALUE_ERROR (next_token_result))
+ {
+ return next_token_result;
+ }
+
+ JERRY_ASSERT (ecma_is_value_empty (next_token_result));
+
+ uint32_t atom_offset = re_bytecode_size (re_ctx_p);
+
+ switch (re_ctx_p->token.type)
+ {
+ case RE_TOK_START_CAPTURE_GROUP:
+ {
+ const uint32_t idx = re_ctx_p->captures_count++;
+ const uint32_t capture_start = idx;
+
+ ecma_value_t result = re_parse_alternative (re_ctx_p, false);
+ if (ECMA_IS_VALUE_ERROR (result))
+ {
+ return result;
+ }
+
+ re_parse_quantifier (re_ctx_p);
+
+ if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p)))
+ {
+ return ECMA_VALUE_ERROR;
+ }
+
+ re_insert_into_group (re_ctx_p, atom_offset, idx, capture_start, true);
+ break;
+ }
+ case RE_TOK_START_NON_CAPTURE_GROUP:
+ {
+ const uint32_t idx = re_ctx_p->non_captures_count++;
+ const uint32_t capture_start = re_ctx_p->captures_count;
+
+ ecma_value_t result = re_parse_alternative (re_ctx_p, false);
+ if (ECMA_IS_VALUE_ERROR (result))
+ {
+ return result;
+ }
+
+ re_parse_quantifier (re_ctx_p);
+
+ if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p)))
+ {
+ return ECMA_VALUE_ERROR;
+ }
+
+ re_insert_into_group (re_ctx_p, atom_offset, idx, capture_start, false);
+ break;
+ }
+ case RE_TOK_PERIOD:
+ {
+#if ENABLED (JERRY_ES2015)
+ re_append_opcode (re_ctx_p, (re_ctx_p->flags & RE_FLAG_UNICODE) ? RE_OP_UNICODE_PERIOD : RE_OP_PERIOD);
+#else /* !ENABLED (JERRY_ES2015) */
+ re_append_opcode (re_ctx_p, RE_OP_PERIOD);
+#endif /* !ENABLED (JERRY_ES2015) */
+
+ re_insert_atom_iterator (re_ctx_p, atom_offset);
+ break;
+ }
+ case RE_TOK_ALTERNATIVE:
+ {
+ re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset);
+ re_insert_opcode (re_ctx_p, alternative_offset, first_alternative ? RE_OP_ALTERNATIVE_START
+ : RE_OP_ALTERNATIVE_NEXT);
+
+ alternative_offset = re_bytecode_size (re_ctx_p);
+ first_alternative = false;
+ break;
+ }
+ case RE_TOK_ASSERT_START:
+ {
+ re_append_opcode (re_ctx_p, RE_OP_ASSERT_LINE_START);
+ break;
+ }
+ case RE_TOK_ASSERT_END:
+ {
+ re_append_opcode (re_ctx_p, RE_OP_ASSERT_LINE_END);
+ break;
+ }
+ case RE_TOK_ASSERT_WORD_BOUNDARY:
+ {
+ re_append_opcode (re_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
+ break;
+ }
+ case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
+ {
+ re_append_opcode (re_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
+ break;
+ }
+ case RE_TOK_ASSERT_LOOKAHEAD:
+ {
+ const uint32_t start_capture_count = re_ctx_p->captures_count;
+ const bool is_negative = !!re_ctx_p->token.value;
+
+ ecma_value_t result = re_parse_alternative (re_ctx_p, false);
+
+ if (ECMA_IS_VALUE_ERROR (result))
+ {
+ return result;
+ }
+
+#if ENABLED (JERRY_ES2015)
+ if (re_ctx_p->flags & RE_FLAG_UNICODE)
+ {
+ re_ctx_p->token.qmin = 1;
+ re_ctx_p->token.qmax = 1;
+ re_ctx_p->token.greedy = true;
+ }
+ else
+#endif /* ENABLED (JERRY_ES2015) */
+ {
+ re_parse_quantifier (re_ctx_p);
+
+ if (ECMA_IS_VALUE_ERROR (re_check_quantifier (re_ctx_p)))
+ {
+ return ECMA_VALUE_ERROR;
+ }
+ }
+
+ re_insert_assertion_lookahead (re_ctx_p, atom_offset, start_capture_count, is_negative);
+ break;
+ }
+ case RE_TOK_BACKREFERENCE:
+ {
+ const uint32_t backref_idx = re_ctx_p->token.value;
+ re_append_opcode (re_ctx_p, RE_OP_BACKREFERENCE);
+ re_append_value (re_ctx_p, backref_idx);
+
+ if (re_ctx_p->token.qmin != 1 || re_ctx_p->token.qmax != 1)
+ {
+ const uint32_t group_idx = re_ctx_p->non_captures_count++;
+ re_insert_into_group (re_ctx_p, atom_offset, group_idx, re_ctx_p->captures_count, false);
+ }
+
+ break;
+ }
+ case RE_TOK_CLASS_ESCAPE:
+ {
+ const ecma_class_escape_t escape = (ecma_class_escape_t) re_ctx_p->token.value;
+ re_append_opcode (re_ctx_p, RE_OP_CLASS_ESCAPE);
+ re_append_byte (re_ctx_p, (uint8_t) escape);
+
+ re_insert_atom_iterator (re_ctx_p, atom_offset);
+ break;
+ }
+ case RE_TOK_CHAR_CLASS:
+ {
+ ecma_value_t result = re_parse_char_class (re_ctx_p);
+
+ if (ECMA_IS_VALUE_ERROR (result))
+ {
+ return result;
+ }
+
+ re_insert_atom_iterator (re_ctx_p, atom_offset);
+ break;
+ }
+ case RE_TOK_END_GROUP:
+ {
+ if (expect_eof)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unmatched ')'"));
+ }
+
+ if (!first_alternative)
+ {
+ re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset);
+ re_insert_opcode (re_ctx_p, alternative_offset, RE_OP_ALTERNATIVE_NEXT);
+ }
+
+ return ECMA_VALUE_EMPTY;
+ }
+ case RE_TOK_EOF:
+ {
+ if (!expect_eof)
+ {
+ return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern."));
+ }
+
+ if (!first_alternative)
+ {
+ re_insert_value (re_ctx_p, alternative_offset, re_bytecode_size (re_ctx_p) - alternative_offset);
+ re_insert_opcode (re_ctx_p, alternative_offset, RE_OP_ALTERNATIVE_NEXT);
+ }
+
+ re_append_opcode (re_ctx_p, RE_OP_EOF);
+ return ECMA_VALUE_EMPTY;
+ }
+ default:
+ {
+ JERRY_ASSERT (re_ctx_p->token.type == RE_TOK_CHAR);
+
+ lit_code_point_t ch = re_ctx_p->token.value;
+
+ if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && (re_ctx_p->flags & RE_FLAG_IGNORE_CASE) == 0)
+ {
+ re_append_opcode (re_ctx_p, RE_OP_BYTE);
+ re_append_byte (re_ctx_p, (uint8_t) ch);
+
+ re_insert_atom_iterator (re_ctx_p, atom_offset);
+ break;
+ }
+
+ if (re_ctx_p->flags & RE_FLAG_IGNORE_CASE)
+ {
+ ch = ecma_regexp_canonicalize_char (ch, re_ctx_p->flags & RE_FLAG_UNICODE);
+ }
+
+ re_append_opcode (re_ctx_p, RE_OP_CHAR);
+ re_append_char (re_ctx_p, ch);
+
+ re_insert_atom_iterator (re_ctx_p, atom_offset);
+ break;
+ }
+ }
+ }
+
+ return ECMA_VALUE_EMPTY;
+} /* re_parse_alternative */
+
+/**
* @}
* @}
* @}
diff --git a/jerry-core/parser/regexp/re-parser.h b/jerry-core/parser/regexp/re-parser.h
index 7e3c2e2c..1540968b 100644
--- a/jerry-core/parser/regexp/re-parser.h
+++ b/jerry-core/parser/regexp/re-parser.h
@@ -18,46 +18,19 @@
#if ENABLED (JERRY_BUILTIN_REGEXP)
+#include "re-compiler-context.h"
+
/** \addtogroup parser Parser
* @{
*
* \addtogroup regexparser Regular expression
* @{
*
- * \addtogroup regexparser_bytecode Bytecode
+ * \addtogroup regexparser_parser Parser
* @{
*/
/**
- * RegExp token type definitions
- */
-typedef enum
-{
- RE_TOK_EOF, /**< EOF */
- RE_TOK_BACKREFERENCE, /**< "\[0..9]" */
- RE_TOK_CHAR, /**< any character */
- RE_TOK_ALTERNATIVE, /**< "|" */
- RE_TOK_ASSERT_START, /**< "^" */
- RE_TOK_ASSERT_END, /**< "$" */
- RE_TOK_PERIOD, /**< "." */
- RE_TOK_START_CAPTURE_GROUP, /**< "(" */
- RE_TOK_START_NON_CAPTURE_GROUP, /**< "(?:" */
- RE_TOK_END_GROUP, /**< ")" */
- RE_TOK_ASSERT_START_POS_LOOKAHEAD, /**< "(?=" */
- RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /**< "(?!" */
- RE_TOK_ASSERT_WORD_BOUNDARY, /**< "\b" */
- RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
- RE_TOK_DIGIT, /**< "\d" */
- RE_TOK_NOT_DIGIT, /**< "\D" */
- RE_TOK_WHITE, /**< "\s" */
- RE_TOK_NOT_WHITE, /**< "\S" */
- RE_TOK_WORD_CHAR, /**< "\w" */
- RE_TOK_NOT_WORD_CHAR, /**< "\W" */
- RE_TOK_START_CHAR_CLASS, /**< "[ ]" */
- RE_TOK_START_INV_CHAR_CLASS, /**< "[^ ]" */
-} re_token_type_t;
-
-/**
* @}
*
* \addtogroup regexparser_parser Parser
@@ -65,43 +38,16 @@ typedef enum
*/
/**
- * RegExp constant of infinite
+ * Value used for an infinite quantifier.
*/
-#define RE_ITERATOR_INFINITE ((uint32_t) - 1)
+#define RE_INFINITY UINT32_MAX
/**
- * Maximum number of decimal escape digits
+ * Maximum decimal value of an octal escape
*/
-#define RE_MAX_RE_DECESC_DIGITS 9
-
-/**
- * RegExp token type
- */
-typedef struct
-{
- re_token_type_t type; /**< type of the token */
- uint32_t value; /**< value of the token */
- uint32_t qmin; /**< minimum number of token iterations */
- uint32_t qmax; /**< maximum number of token iterations */
- bool greedy; /**< type of iteration */
-} re_token_t;
-
-/**
- * RegExp parser context
- */
-typedef struct
-{
- const lit_utf8_byte_t *input_start_p; /**< start of input pattern */
- const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */
- const lit_utf8_byte_t *input_end_p; /**< end of input pattern */
- int groups_count; /**< number of groups */
- uint32_t classes_count; /**< number of character classes */
-} re_parser_ctx_t;
+#define RE_MAX_OCTAL_VALUE 0xff
-bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, uint32_t lookup);
-uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p);
-ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, re_token_t *re_token_p);
-ecma_value_t re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p);
+ecma_value_t re_parse_alternative (re_compiler_ctx_t *re_ctx_p, bool expect_eof);
/**
* @}
diff --git a/jerry-core/parser/regexp/re-token.h b/jerry-core/parser/regexp/re-token.h
new file mode 100644
index 00000000..fd203a19
--- /dev/null
+++ b/jerry-core/parser/regexp/re-token.h
@@ -0,0 +1,72 @@
+/* Copyright JS Foundation and other contributors, http://js.foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RE_TOKEN_H
+#define RE_TOKEN_H
+
+#if ENABLED (JERRY_BUILTIN_REGEXP)
+
+/** \addtogroup parser Parser
+ * @{
+ *
+ * \addtogroup regexparser Regular expression
+ * @{
+ *
+ * \addtogroup regexparser_parser Parser
+ * @{
+ */
+
+/**
+ * RegExp token types, stored in the 'type' member of re_token_t
+ */
+typedef enum
+{
+ RE_TOK_EOF, /**< end of the pattern */
+ RE_TOK_BACKREFERENCE, /**< "\[0..9]" */
+ RE_TOK_ALTERNATIVE, /**< "|" */
+ RE_TOK_ASSERT_START, /**< "^" */
+ RE_TOK_ASSERT_END, /**< "$" */
+ RE_TOK_PERIOD, /**< "." */
+ RE_TOK_START_CAPTURE_GROUP, /**< "(" */
+ RE_TOK_START_NON_CAPTURE_GROUP, /**< "(?:" */
+ RE_TOK_END_GROUP, /**< ")" */
+ RE_TOK_ASSERT_LOOKAHEAD, /**< "(?=" -- NOTE(review): presumably also covers "(?!", confirm */
+ RE_TOK_ASSERT_WORD_BOUNDARY, /**< "\b" */
+ RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
+ RE_TOK_CLASS_ESCAPE, /**< "\d \D \w \W \s \S" */
+ RE_TOK_CHAR_CLASS, /**< "[ ]" -- NOTE(review): presumably also covers "[^ ]", confirm */
+ RE_TOK_CHAR, /**< a literal character; the token value holds its code point */
+} re_token_type_t;
+
+/**
+ * RegExp token: a parsed pattern element together with its iteration bounds
+ */
+typedef struct
+{
+ uint32_t value; /**< value of the token (e.g. the code point for RE_TOK_CHAR) */
+ uint32_t qmin; /**< minimum number of token iterations */
+ uint32_t qmax; /**< maximum number of token iterations */
+ re_token_type_t type; /**< type of the token */
+ bool greedy; /**< whether the iteration is greedy */
+} re_token_t;
+
+/**
+ * @}
+ * @}
+ * @}
+ */
+
+#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
+#endif /* !RE_TOKEN_H */
diff --git a/tests/jerry/es2015/regexp-unicode.js b/tests/jerry/es2015/regexp-unicode.js
new file mode 100644
index 00000000..60ac33e8
--- /dev/null
+++ b/tests/jerry/es2015/regexp-unicode.js
@@ -0,0 +1,361 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+var result = /\0/.exec("\u0000");
+assert (result !== null);
+assert (result[0] === "\u0000");
+
+result = /\0/u.exec("\u0000");
+assert (result !== null);
+assert (result[0] === "\u0000");
+
+result = /\000/.exec("\u0000");
+assert (result !== null);
+assert (result[0] === "\u0000");
+
+try {
+ new RegExp("\\000", 'u').exec("\u0000");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\0000/.exec("\u0000\u0030");
+assert (result !== null);
+assert (result[0] === "\u0000\u0030");
+
+result = /\377/.exec("\u00ff");
+assert (result !== null);
+assert (result[0] === "\u00ff");
+
+try {
+ new RegExp("\\377", 'u').exec("\u00ff");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\3777/.exec("\u00ff\u0037");
+assert (result !== null);
+assert (result[0] === "\u00ff\u0037");
+
+try {
+ new RegExp("\\3777", 'u').exec("\u00ff\u0037");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\400/.exec("\u0020\u0030");
+assert (result !== null);
+assert (result[0] === "\u0020\u0030");
+
+try {
+ new RegExp("\\400", 'u').exec("\u0020\u0030");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /(\1)/.exec("\u0001");
+assert (result !== null);
+assert (result[0].length === 0);
+
+result = /(\1)/u.exec("\u0001");
+assert (result !== null);
+assert (result[0].length === 0);
+
+result = /(\2)/.exec("\u0002");
+assert (result !== null);
+assert (result[0] === '\u0002');
+
+try {
+ new RegExp("(\\2)", 'u').exec("\u0002");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\8/.exec("\u0038");
+assert (result !== null);
+assert (result[0] === '8');
+
+result = /\99/.exec("\u0039\u0039");
+assert (result !== null);
+assert (result[0] === "99");
+
+// CharClassEscape
+assert (/\d+/.exec("123")[0] === "123");
+assert (/\D+/.exec("abc")[0] === "abc");
+assert (/\s+/.exec(" ")[0] === " ");
+assert (/\S+/.exec("abc")[0] === "abc");
+assert (/\w+/.exec("abc")[0] === "abc");
+assert (/\W+/.exec("|||")[0] === "|||");
+assert (/\d+/u.exec("123")[0] === "123");
+assert (/\D+/u.exec("abc")[0] === "abc");
+assert (/\s+/u.exec(" ")[0] === " ");
+assert (/\S+/u.exec("abc")[0] === "abc");
+assert (/\w+/u.exec("abc")[0] === "abc");
+assert (/\W+/u.exec("|||")[0] === "|||");
+
+assert (/\d+/u.exec("\u{10CAF}") === null);
+assert (/\D+/u.exec("\u{10CAF}")[0] === "\u{10CAF}");
+assert (/\s+/u.exec("\u{10CAF}") === null);
+assert (/\S+/u.exec("\u{10CAF}")[0] === "\u{10CAF}");
+assert (/\w+/u.exec("\u{10CAF}") === null);
+assert (/\W+/u.exec("\u{10CAF}")[0] === "\u{10CAF}");
+
+result = /\xz/.exec("xz");
+assert (result !== null);
+assert (result[0] === "xz");
+
+try {
+ new RegExp("\\xz", "u").exec("xz");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\c/.exec("\\c");
+assert (result !== null);
+assert (result[0] === "\\c");
+
+try {
+ new RegExp("\\c", 'u').exec("\\c")
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+result = /\c1/.exec("\\c1");
+assert (result !== null);
+assert (result[0] === "\\c1");
+
+try {
+ new RegExp("\\c1", 'u').exec("\\c1");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("^+");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("$+");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("\\b+");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("\\B+");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/[\b]/.exec("\u0008")[0] === "\u0008");
+assert (/[\b]/u.exec("\u0008")[0] === "\u0008");
+assert (/[\B]/.exec("\u0042")[0] === "\u0042");
+
+try {
+ new RegExp ("[\\B]", 'u').exec("\u0042");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/[\c1]/.exec("\u0011")[0] === "\u0011");
+assert (/[\c_]/.exec("\u001f")[0] === "\u001f");
+assert (/[\c]/.exec("\\")[0] === "\\");
+assert (/[\c]/.exec("c")[0] === "c");
+
+try {
+ new RegExp("[\\c1]", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("[\\c]", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("[\\c_]", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/{{1,2}/.exec("{{")[0] === "{{");
+
+try {
+ new RegExp("{{1,2}", 'u').exec("{{");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/a{1,2/.exec("a{1,2")[0] === "a{1,2");
+
+try {
+ new RegExp("a{1,2", 'u').exec("a{1,2");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/\u017f/i.exec("s") === null);
+assert (/\u017f/ui.exec("s")[0] === "s");
+
+assert (/𐲯/.exec("𐲯")[0] === "𐲯");
+assert (/𐲯/u.exec("𐲯")[0] === "𐲯");
+assert (/𐲯*?/.exec("𐲯")[0] === "\ud803");
+assert (/𐲯*?/u.exec("𐲯")[0] === "");
+assert (/𐲯+/.exec("𐲯𐲯𐲯")[0] === "𐲯");
+assert (/𐲯+/u.exec("𐲯𐲯𐲯")[0] === "𐲯𐲯𐲯");
+
+assert (/\ud803\udc96*?/.exec("𐲖")[0] === '\ud803');
+assert (/\ud803\udc96*?/u.exec("𐲖")[0] === '');
+assert (/\ud803\udc96+/.exec("𐲖𐲖𐲖")[0] === '𐲖');
+assert (/\ud803\udc96+/u.exec("𐲖𐲖𐲖")[0] === '𐲖𐲖𐲖');
+
+assert (/.*𐲗𐲘/u.exec("𐲓𐲔𐲕𐲖𐲗𐲘")[0] === '𐲓𐲔𐲕𐲖𐲗𐲘');
+
+assert (/[\u{10000}]/.exec("\u{10000}") === null);
+assert (/[\u{10000}]/.exec("{")[0] === "{");
+assert (/[^\u{10000}]/.exec("\u{10000}")[0] === "\ud800");
+assert (/[^\u{10000}]/.exec("{") === null);
+
+assert (/[\uffff]/.exec("\uffff")[0] === "\uffff");
+assert (/[^\uffff]/.exec("\uffff") === null);
+
+assert (/[\u{10000}]/u.exec("\u{10000}")[0] === "\u{10000}");
+assert (/[\u{10000}]/u.exec("{") === null);
+assert (/[^\u{10000}]/u.exec("\u{10000}") === null);
+assert (/[^\u{10000}]/u.exec("{")[0] === "{");
+
+assert (/[\uffff]/u.exec("\uffff")[0] === "\uffff");
+assert (/[^\uffff]/u.exec("\uffff") === null);
+
+assert (/a{4294967296,4294967297}/.exec("aaaa") === null);
+assert (/a{4294967294,4294967295}/.exec("aaaa") === null);
+assert (/a{0000000000000000001,0000000000000000002}/u.exec("aaaa")[0] === 'aa');
+assert (/(\4294967297)/.exec("\4294967297")[0] === "\4294967297");
+assert (/(\1)/u.exec("aaaa")[0] === "");
+
+try {
+ new RegExp("a{4294967295,4294967294}", '');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/[\d-\s]/.exec("-")[0] === "-");
+assert (/[0-\s]/.exec("-")[0] === "-");
+assert (/[\d-0]/.exec("-")[0] === "-");
+
+try {
+ new RegExp("[\\d-\\s]", 'u').exec("-");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("[0-\\s]", 'u').exec("-");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("[\\d-0]", 'u').exec("-");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/[-]/.exec("-")[0] === "-");
+assert (/[-]/u.exec("-")[0] === "-");
+assert (/[--]/.exec("-")[0] === "-");
+assert (/[--]/u.exec("-")[0] === "-");
+
+assert (/}/.exec("}")[0] === "}");
+assert (/\}/u.exec("}")[0] === "}");
+
+try {
+ new RegExp("}", 'u').exec("}");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/]/.exec("]")[0] === "]");
+assert (/\]/u.exec("]")[0] === "]");
+
+try {
+ new RegExp("]", 'u').exec("]");
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+assert (/(?=)*/.exec("")[0] === "");
+assert (/(?=)+/.exec("")[0] === "");
+assert (/(?=){1,2}/.exec("")[0] === "");
+
+try {
+ new RegExp("(?=)*", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("(?=)+", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("(?=){1,2}", 'u');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
+
+try {
+ new RegExp("(?=){2,1}", '');
+ assert (false);
+} catch (e) {
+ assert (e instanceof SyntaxError);
+}
diff --git a/tests/jerry/regexp-alternatives.js b/tests/jerry/regexp-alternatives.js
index d084d459..379702c3 100644
--- a/tests/jerry/regexp-alternatives.js
+++ b/tests/jerry/regexp-alternatives.js
@@ -58,3 +58,6 @@ assert (r.exec("a") == "a");
r = new RegExp ("a|bb|c|d");
assert (r.exec("b") == undefined);
+
+r = new RegExp("(?:a|b)\\b|\\.\\w+", "g");
+assert (r.exec("name.lower()")[0] === ".lower")
diff --git a/tests/jerry/regexp-backreference.js b/tests/jerry/regexp-backreference.js
index 2551cd54..55b92f36 100644
--- a/tests/jerry/regexp-backreference.js
+++ b/tests/jerry/regexp-backreference.js
@@ -24,3 +24,6 @@ assert (r == undefined);
r = new RegExp ("(a)*b\\1").exec("b");
assert (r[0] == "b");
assert (r[1] == undefined);
+
+assert (JSON.stringify (/[[]?(a)\1/.exec("aa")) === '["aa","a"]');
+assert (JSON.stringify (/\1{2,5}()\B/.exec("asd")) === '["",""]');
diff --git a/tests/jerry/regexp-backtrack.js b/tests/jerry/regexp-backtrack.js
new file mode 100644
index 00000000..3099fe78
--- /dev/null
+++ b/tests/jerry/regexp-backtrack.js
@@ -0,0 +1,115 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+assert (JSON.stringify (/(?:(a)*){3,}/.exec("aaaab")) === '["aaaa",null]');
+assert (JSON.stringify (/((a)*){3,}/.exec("aaaab")) === '["aaaa","",null]');
+assert (JSON.stringify (/((a)+){3,}/.exec("aaaab")) === '["aaaa","a","a"]');
+assert (JSON.stringify (/((.)*){3,}/.exec("abcd")) === '["abcd","",null]');
+assert (JSON.stringify (/((.)+){3,}/.exec("abcd")) === '["abcd","d","d"]');
+
+assert (JSON.stringify (/((.){1,2}){1,2}/.exec("abc")) === '["abc","c","c"]');
+assert (JSON.stringify (/(?:(a)*?)asd/.exec("aaasd")) === '["aaasd","a"]');
+assert (JSON.stringify (/(?:(a)*)asd/.exec("aaasd")) === '["aaasd","a"]');
+
+assert (JSON.stringify (/(.)*((a)*|(b)*)/.exec("ab")) === '["ab","b","",null,null]');
+assert (JSON.stringify (/(.)*((x)|(y))+/.exec("xy")) === '["xy","x","y",null,"y"]');
+assert (JSON.stringify (/(.)*((y)|(x))+/.exec("xy")) === '["xy","x","y","y",null]');
+
+assert (JSON.stringify (/((?:a)*)/.exec("aaaad")) === '["aaaa","aaaa"]');
+assert (JSON.stringify (/((y)+|x)+/.exec("x")) === '["x","x",null]');
+assert (JSON.stringify (/((?:y)*|x)+/.exec("x")) === '["x","x"]');
+assert (JSON.stringify (/((y)*|x)+/.exec("x")) === '["x","x",null]');
+assert (JSON.stringify (/((y)*|x)*/.exec("x")) === '["x","x",null]');
+assert (JSON.stringify (/(?:(y)*|x)*/.exec("x")) === '["x",null]');
+assert (JSON.stringify (/(?:(y)*|(x))*/.exec("x")) === '["x",null,"x"]');
+
+assert (JSON.stringify (/((?:a)*)asd/.exec("aaasd")) === '["aaasd","aa"]');
+assert (JSON.stringify (/((?:a)+)asd/.exec("aaasd")) === '["aaasd","aa"]');
+assert (JSON.stringify (/((?:a)*?)asd/.exec("aaasd")) === '["aaasd","aa"]');
+assert (JSON.stringify (/((?:a)+?)asd/.exec("aaasd")) === '["aaasd","aa"]');
+
+assert (JSON.stringify (/((y)|(z)|(a))*/.exec("yazx")) === '["yaz","z",null,"z",null]');
+assert (JSON.stringify (/((y)|(z)|(.))*/.exec("yaz")) === '["yaz","z",null,"z",null]');
+assert (JSON.stringify (/((y)*|(z)*|(a)*)*/.exec("yazx")) === '["yaz","z",null,"z",null]');
+assert (JSON.stringify (/((y)|(z)|(a))*/.exec("yazx")) === '["yaz","z",null,"z",null]'); // NOTE(review): duplicates an earlier assertion in this file
+assert (JSON.stringify (/(?:(y)|(z)|(a))*/.exec("yazx")) === '["yaz",null,"z",null]');
+assert (JSON.stringify (/((y)|(z)|(a))+?/.exec("yazx")) === '["y","y","y",null,null]');
+assert (JSON.stringify (/(?:(y)|(z)|(a))+?/.exec("yazx")) === '["y","y",null,null]');
+
+assert (JSON.stringify (/(?:(x|y)*|z)*/.exec("yz")) === '["yz",null]');
+assert (JSON.stringify (/((x|y)*|z)*/.exec("yz")) === '["yz","z",null]');
+assert (JSON.stringify (/(((x|y)*|(v|w)*|z)*)asd/.exec("xyzwvxzasd")) === '["xyzwvxzasd","xyzwvxz","z",null,null]');
+
+assert (JSON.stringify (/((a)*){1,3}b/.exec("ab")) === '["ab","a","a"]');
+assert (JSON.stringify (/((a)*){2,3}b/.exec("ab")) === '["ab","",null]');
+assert (JSON.stringify (/((a)*){3,3}b/.exec("ab")) === '["ab","",null]');
+
+assert (JSON.stringify (/((a)*){3,}b/.exec("aaaab")) === '["aaaab","",null]');
+assert (JSON.stringify (/((a)*)*b/.exec("aaaab")) === '["aaaab","aaaa","a"]');
+
+assert (JSON.stringify (/((bb?)*)*a/.exec("bbba")) === '["bbba","bbb","b"]');
+assert (JSON.stringify (/((b)*)*a/.exec("bbba")) === '["bbba","bbb","b"]');
+
+assert (JSON.stringify (/(aa|a)a/.exec("aa")) === '["aa","a"]');
+assert (JSON.stringify (/(aa|a)?a/.exec("aa")) === '["aa","a"]');
+assert (JSON.stringify (/(aa|a)+?a/.exec("aa")) === '["aa","a"]');
+assert (JSON.stringify (/(?:aa|a)a/.exec("aa")) === '["aa"]');
+assert (JSON.stringify (/(?:aa|a)?a/.exec("aa")) === '["aa"]');
+assert (JSON.stringify (/(?:aa|a)+?a/.exec("aa")) === '["aa"]');
+
+assert (JSON.stringify (/(aa|a)a/.exec("a")) === 'null');
+assert (JSON.stringify (/(aa|a)?a/.exec("a")) === '["a",null]');
+assert (JSON.stringify (/(aa|a)+?a/.exec("a")) === 'null');
+assert (JSON.stringify (/(?:aa|a)a/.exec("a")) === 'null');
+assert (JSON.stringify (/(?:aa|a)?a/.exec("a")) === '["a"]');
+assert (JSON.stringify (/(?:aa|a)+?a/.exec("a")) === 'null');
+
+assert (JSON.stringify (/a+/.exec("aaasd")) === '["aaa"]');
+assert (JSON.stringify (/a+?/.exec("aaasd")) === '["a"]');
+
+assert (JSON.stringify (/a+sd/.exec("aaasd")) === '["aaasd"]');
+assert (JSON.stringify (/a+?sd/.exec("aaasd")) === '["aaasd"]');
+
+assert (JSON.stringify (/a{2}sd/.exec("aaasd")) === '["aasd"]');
+assert (JSON.stringify (/a{3}sd/.exec("aaasd")) === '["aaasd"]');
+
+assert (JSON.stringify (/(?=a)/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?=a)+/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?=a)*/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?=(a))?/.exec("a")) === '["",null]');
+assert (JSON.stringify (/(?=(a))+?/.exec("a")) === '["","a"]');
+assert (JSON.stringify (/(?=(a))*?/.exec("a")) === '["",null]');
+
+assert (JSON.stringify (/(?!a)/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?!a)+/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?!a)*/.exec("a")) === '[""]');
+assert (JSON.stringify (/(?!(a))?/.exec("a")) === '["",null]');
+assert (JSON.stringify (/(?!(a))+?/.exec("a")) === '["",null]');
+assert (JSON.stringify (/(?!(a))*?/.exec("a")) === '["",null]');
+
+assert (JSON.stringify (/al(?=(ma))*ma/.exec("alma")) === '["alma",null]');
+assert (JSON.stringify (/al(?!(ma))*ma/.exec("alma")) === '["alma",null]');
+assert (JSON.stringify (/al(?=(ma))+ma/.exec("alma")) === '["alma","ma"]');
+assert (JSON.stringify (/al(?!(ma))+ma/.exec("alma")) === 'null');
+
+assert (JSON.stringify (/(?=())x|/.exec("asd")) === '["",null]');
+assert (JSON.stringify (/(?!())x|/.exec("asd")) === '["",null]');
+
+assert (JSON.stringify (/(().*)+.$/.exec("abcdefg")) === '["abcdefg","abcdef",""]');
+assert (JSON.stringify (/(().*)+?.$/.exec("abcdefg")) === '["abcdefg","abcdef",""]');
+assert (JSON.stringify (/(?:().*)+.$/.exec("abcdefg")) === '["abcdefg",""]');
+assert (JSON.stringify (/(?:().*)+?.$/.exec("abcdefg")) === '["abcdefg",""]');
+
+assert (JSON.stringify(/((?=())|.)+^/.exec("a")) === '["","",""]');
+assert (JSON.stringify(/(?:(|\b\w+?){2})+$/.exec("aaaa")) === '["aaaa","aaaa"]');
diff --git a/tests/jerry/regexp-capture-groups.js b/tests/jerry/regexp-capture-groups.js
index 801e062a..c3644d56 100644
--- a/tests/jerry/regexp-capture-groups.js
+++ b/tests/jerry/regexp-capture-groups.js
@@ -196,3 +196,12 @@ assert (r.exec("aa") == "aa,a");
r = new RegExp ("(a{0,1}?){0,1}a");
assert (r.exec("aa") == "aa,a");
+
+r = new RegExp ("(|.)+");
+assert (JSON.stringify (r.exec("asdfgh")) === '["asdfgh","h"]');
+
+assert (JSON.stringify (/([^\W](){8,}?){5}/.exec("asdfghijk")) === '["asdfg","g",""]');
+assert (JSON.stringify (/(()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi","",null,null]');
+assert (JSON.stringify (/(()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi","",null,null]');
+assert (JSON.stringify (/(?:()+?(.+)|){3,}./u.exec("asdfghi")) === '["asdfghi",null,null]');
+assert (JSON.stringify (/(?:()+?(.+)|){3,}?./u.exec("asdfghi")) === '["asdfghi",null,null]');
diff --git a/tests/jerry/regexp-simple-atom-and-iterations.js b/tests/jerry/regexp-simple-atom-and-iterations.js
index c1f15da9..19cf7010 100644
--- a/tests/jerry/regexp-simple-atom-and-iterations.js
+++ b/tests/jerry/regexp-simple-atom-and-iterations.js
@@ -88,3 +88,6 @@ assert (r.exec ("\\c3") == "\\c3");
r = /\cIasd/;
assert (r.exec ("\tasd") == "\tasd");
+
+r = /.??$/;
+assert (JSON.stringify (r.exec("asd")) === '["d"]');
diff --git a/tests/jerry/regression-test-issue-2190.js b/tests/jerry/regression-test-issue-2190.js
index ed229a20..a811572b 100644
--- a/tests/jerry/regression-test-issue-2190.js
+++ b/tests/jerry/regression-test-issue-2190.js
@@ -13,7 +13,7 @@
// limitations under the License.
try {
- /(?:(?=x)){1000}xyz/.exec('xyz');
+ /(?:(?=x)){10000}xyz/.exec('xyz');
assert(false);
} catch (e) {
assert(e instanceof RangeError);
diff --git a/tests/jerry/string-prototype-trim.js b/tests/jerry/string-prototype-trim.js
index 2750e0ce..689d9d33 100644
--- a/tests/jerry/string-prototype-trim.js
+++ b/tests/jerry/string-prototype-trim.js
@@ -85,3 +85,5 @@ assert("\u000A\u000D\u2028\u202911".trim() === "11");
assert("\u0009\u000B\u000C\u0020\u00A01\u0009\u000B\u000C\u0020\u00A0".trim() === "1");
assert("\u000A\u000D\u2028\u202911\u000A\u000D\u2028\u2029".trim() === "11");
+
+assert ("\u200B".trim() === '\u200B');