1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-06-06 11:41:02 +03:00
2004-11-08  Ulrich Drepper  <drepper@redhat.com>

	* posix/regcomp.c (utf8_sb_map): Define.
	(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
	utf8_sb_map.
	(init_dfa): Use utf8_sb_map instead of initializing memory when the
	encoding is UTF-8.

	* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
	well.  Check if it is spelled UTF8 as well as UTF-8, and check
	case-insensitively.  Set dfa->map_notascii manually when outside
	glibc.
	* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
	optimizations based on map_notascii.
	* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
	|| _LIBC]: Include langinfo.h.

	* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
	field.
	* posix/regexec.c (check_dst_limits): Hoist computation of the source
	and destination bkref_idx out of the loop.  Pass it to
	check_dst_limits_calc_pos.
	(check_dst_limits_calc_pos_1): New function, containing the recursive
	loop of check_dst_limits_calc_pos; uses the "more" field of
	struct re_backref_cache to control the loop.
	(check_dst_limits_calc_pos): Store into "boundaries" the position
	relative to lim's start and end positions.  Do not accept eclosures,
	accept bkref_idx instead.  Call check_dst_limits_calc_pos_1 to do the
	work.
	(sift_states_bkref): Use the "more" field of struct re_backref_cache
	to control the loop.  A big "if" was turned into a continue and the
	function was reindented.
	(get_subexp): Use the "more" field of struct re_backref_cache
	to control the loop.
	(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
	(search_cur_bkref_entry): Return -1 if out of bounds.

	* posix/regexec.c (empty_set): Remove.
	(sift_states_backward): Remove cur_src variable.  Move inner loop
	to build_sifted_states.
	(build_sifted_states): Extract from sift_states_backward.  Do not
	use empty_set.
	(update_cur_sifted_state): Do not use empty_set.  Special case
	dest_nodes->nelem == 0.
This commit is contained in:
Ulrich Drepper 2004-11-08 22:49:44 +00:00
parent d2c38eb3fa
commit e40a38b383
5 changed files with 363 additions and 221 deletions

View File

@ -1,5 +1,50 @@
2004-11-08 Ulrich Drepper <drepper@redhat.com>
* posix/regcomp.c (utf8_sb_map): Define.
(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
utf8_sb_map.
(init_dfa): Use utf8_sb_map instead of initializing memory when the
encoding is UTF-8.
2004-11-03 Paolo Bonzini <bonzini@gnu.org> 2004-11-03 Paolo Bonzini <bonzini@gnu.org>
* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
well. Check if it is spelled UTF8 as well as UTF-8, and check
case-insensitively. Set dfa->map_notascii manually when outside
glibc.
* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
optimizations based on map_notascii.
* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
|| _LIBC]: Include langinfo.h.
* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
field.
* posix/regexec.c (check_dst_limits): Hoist computation of the source
and destination bkref_idx out of the loop. Pass it to
check_dst_limits_calc_pos.
(check_dst_limits_calc_pos_1): New function, containing the recursive
loop of check_dst_limits_calc_pos; uses the "more" field of
struct re_backref_cache to control the loop.
(check_dst_limits_calc_pos): Store into "boundaries" the position
relative to lim's start and end positions. Do not accept eclosures,
accept bkref_idx instead. Call check_dst_limits_calc_pos_1 to do the
work.
(sift_states_bkref): Use the "more" field of struct re_backref_cache
to control the loop. A big "if" was turned into a continue and the
function was reindented.
(get_subexp): Use the "more" field of struct re_backref_cache
to control the loop.
(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
(search_cur_bkref_entry): Return -1 if out of bounds.
* posix/regexec.c (empty_set): Remove.
(sift_states_backward): Remove cur_src variable. Move inner loop
to build_sifted_states.
(build_sifted_states): Extract from sift_states_backward. Do not
use empty_set.
(update_cur_sifted_state): Do not use empty_set. Special case
dest_nodes->nelem == 0.
* posix/regex_internal.h (struct re_backref_cache_entry): Remove flag * posix/regex_internal.h (struct re_backref_cache_entry): Remove flag
field. field.
(struct re_sift_context_t): Remove cur_bkref, cls_subexp_idx, (struct re_sift_context_t): Remove cur_bkref, cls_subexp_idx,

View File

@ -566,6 +566,23 @@ weak_alias (__regerror, regerror)
#endif #endif
#ifdef RE_ENABLE_I18N
/* Shared, read-only map of the byte values that form complete
   single-byte characters when the encoding is UTF-8.  Only the first
   128 bits are set.  Keeping one constant copy avoids allocating a
   bitset and filling it with the same contents every time a DFA is
   initialized; UTF-8 is the preferred encoding, so this is a
   worthwhile optimization.  Because this object is not heap memory,
   code that releases a DFA's sb_char pointer must not free it when it
   points here.  */
static const bitset utf8_sb_map =
{
/* Set the first 128 bits: four 32-bit words of all ones.  Elements
   without an explicit initializer are zero.  The preprocessor test
   below forces a build failure, rather than a silently wrong map, on
   a platform whose unsigned int is not 32 bits wide.  */
# if UINT_MAX == 0xffffffff
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
# else
# error "Add case for new unsigned int size"
# endif
};
#endif
static void static void
free_dfa_content (re_dfa_t *dfa) free_dfa_content (re_dfa_t *dfa)
{ {
@ -613,7 +630,8 @@ free_dfa_content (re_dfa_t *dfa)
} }
re_free (dfa->state_table); re_free (dfa->state_table);
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
re_free (dfa->sb_char); if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif #endif
#ifdef DEBUG #ifdef DEBUG
re_free (dfa->re_str); re_free (dfa->re_str);
@ -824,6 +842,9 @@ init_dfa (dfa, pat_len)
int pat_len; int pat_len;
{ {
int table_size; int table_size;
#ifndef _LIBC
char *codeset_name;
#endif
memset (dfa, '\0', sizeof (re_dfa_t)); memset (dfa, '\0', sizeof (re_dfa_t));
@ -853,22 +874,59 @@ init_dfa (dfa, pat_len)
dfa->is_utf8 = 1; dfa->is_utf8 = 1;
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0); != 0);
#else
# ifdef HAVE_LANGINFO_CODESET
  /* Ask the C library for the codeset of the current locale.  */
  codeset_name = nl_langinfo (CODESET);
# else
  /* No nl_langinfo available: approximate the locale name from the
     environment, in the precedence order LC_ALL, LC_CTYPE, LANG.
     LC_ALL and LC_CTYPE are skipped when unset or set to the empty
     string.  The emptiness tests must look at CODESET_NAME itself;
     there is no variable called "codeset" in this function.  */
  codeset_name = getenv ("LC_ALL");
  if (codeset_name == NULL || codeset_name[0] == '\0')
    codeset_name = getenv ("LC_CTYPE");
  if (codeset_name == NULL || codeset_name[0] == '\0')
    codeset_name = getenv ("LANG");
  /* With no locale name at all, compare against the empty string.
     Otherwise, if the name contains a '.', only the part after the
     first '.' is compared (e.g. the "UTF-8" of "en_US.UTF-8").  */
  if (codeset_name == NULL)
    codeset_name = "";
  else if (strchr (codeset_name, '.') != NULL)
    codeset_name = strchr (codeset_name, '.') + 1;
# endif
if (strcasecmp (codeset_name, "UTF-8") == 0
|| strcasecmp (codeset_name, "UTF8") == 0)
dfa->is_utf8 = 1;
/* We check exhaustively in the loop below if this charset is a
superset of ASCII. */
dfa->map_notascii = 0;
#endif #endif
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max > 1) if (dfa->mb_cur_max > 1)
{ {
int i, j, ch;
dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;
if (dfa->is_utf8) if (dfa->is_utf8)
memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2); dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
else else
for (i = 0, ch = 0; i < BITSET_UINTS; ++i) {
for (j = 0; j < UINT_BITS; ++j, ++ch) int i, j, ch;
if (__btowc (ch) != WEOF)
dfa->sb_char[i] |= 1 << j; dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
if (BE (dfa->sb_char == NULL, 0))
return REG_ESPACE;
/* Clear all bits, then set those corresponding to single-byte
characters. */
bitset_empty (dfa->sb_char);
for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
for (j = 0; j < UINT_BITS; ++j, ++ch)
{
wchar_t wch = __btowc (ch);
if (wch != WEOF)
dfa->sb_char[i] |= 1 << j;
# ifndef _LIBC
if (isascii (ch) && wch != (wchar_t) ch)
dfa->map_notascii = 1;
# endif
}
}
} }
#endif #endif

View File

@ -293,9 +293,8 @@ build_wcs_upper_buffer (pstr)
byte_idx = pstr->valid_len; byte_idx = pstr->valid_len;
end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
#ifdef _LIBC /* The following optimization assumes that ASCII characters can be
/* The following optimization assumes that the wchar_t encoding is mapped to wide characters with a simple cast. */
always ISO 10646. */
if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
{ {
while (byte_idx < end_idx) while (byte_idx < end_idx)
@ -309,8 +308,7 @@ build_wcs_upper_buffer (pstr)
pstr->mbs[byte_idx] pstr->mbs[byte_idx]
= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
/* The next step uses the assumption that wchar_t is encoded /* The next step uses the assumption that wchar_t is encoded
with ISO 10646: all ASCII values can be converted like ASCII-safe: all ASCII values can be converted like this. */
this. */
pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
++byte_idx; ++byte_idx;
continue; continue;
@ -368,14 +366,11 @@ build_wcs_upper_buffer (pstr)
return REG_NOERROR; return REG_NOERROR;
} }
else else
#endif
for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
{ {
wchar_t wc; wchar_t wc;
const char *p; const char *p;
#ifdef _LIBC offsets_needed:
offsets_needed:
#endif
remain_len = end_idx - byte_idx; remain_len = end_idx - byte_idx;
prev_st = pstr->cur_state; prev_st = pstr->cur_state;
if (BE (pstr->trans != NULL, 0)) if (BE (pstr->trans != NULL, 0))
@ -647,7 +642,6 @@ re_string_reconstruct (pstr, idx, eflags)
int wcs_idx; int wcs_idx;
wint_t wc = WEOF; wint_t wc = WEOF;
#ifdef _LIBC
if (pstr->is_utf8) if (pstr->is_utf8)
{ {
const unsigned char *raw, *p, *q, *end; const unsigned char *raw, *p, *q, *end;
@ -687,7 +681,7 @@ re_string_reconstruct (pstr, idx, eflags)
break; break;
} }
} }
#endif
if (wc == WEOF) if (wc == WEOF)
pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
if (BE (pstr->valid_len, 0)) if (BE (pstr->valid_len, 0))

View File

@ -27,6 +27,9 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
# include <langinfo.h>
#endif
#if defined HAVE_LOCALE_H || defined _LIBC #if defined HAVE_LOCALE_H || defined _LIBC
# include <locale.h> # include <locale.h>
#endif #endif
@ -545,6 +548,9 @@ struct re_backref_cache_entry
int str_idx; int str_idx;
int subexp_from; int subexp_from;
int subexp_to; int subexp_to;
/* We need only one byte from the following field. If other small
fields are added the type could be changed to 'char'. */
int more;
}; };
typedef struct typedef struct

View File

@ -93,6 +93,9 @@ static int sift_states_iter_mb (const re_match_context_t *mctx,
#endif /* RE_ENABLE_I18N */ #endif /* RE_ENABLE_I18N */
static reg_errcode_t sift_states_backward (re_match_context_t *mctx, static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
re_sift_context_t *sctx) internal_function; re_sift_context_t *sctx) internal_function;
static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
re_sift_context_t *sctx, int str_idx,
re_node_set *cur_dest) internal_function;
static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx, static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
re_sift_context_t *sctx, re_sift_context_t *sctx,
int str_idx, int str_idx,
@ -106,9 +109,13 @@ static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits, static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
int dst_node, int dst_idx, int src_node, int dst_node, int dst_idx, int src_node,
int src_idx) internal_function; int src_idx) internal_function;
static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
int boundaries, int subexp_idx,
int from_node, int bkref_idx) internal_function;
static int check_dst_limits_calc_pos (re_match_context_t *mctx, static int check_dst_limits_calc_pos (re_match_context_t *mctx,
int limit, re_node_set *eclosures, int limit, int subexp_idx,
int subexp_idx, int node, int str_idx) internal_function; int node, int str_idx,
int bkref_idx) internal_function;
static reg_errcode_t check_subexp_limits (re_dfa_t *dfa, static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
re_node_set *dest_nodes, re_node_set *dest_nodes,
const re_node_set *candidates, const re_node_set *candidates,
@ -576,8 +583,6 @@ re_exec (s)
} }
#endif /* _REGEX_RE_COMP */ #endif /* _REGEX_RE_COMP */
static re_node_set empty_set;
/* Internal entry point. */ /* Internal entry point. */
/* Searches for a compiled pattern PREG in the string STRING, whose /* Searches for a compiled pattern PREG in the string STRING, whose
@ -640,8 +645,6 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
start = range = 0; start = range = 0;
} }
re_node_set_init_empty (&empty_set);
/* We must check the longest matching, if nmatch > 0. */ /* We must check the longest matching, if nmatch > 0. */
fl_longest_match = (nmatch != 0 || dfa->nbackref); fl_longest_match = (nmatch != 0 || dfa->nbackref);
@ -1492,17 +1495,14 @@ sift_states_backward (mctx, sctx)
re_match_context_t *mctx; re_match_context_t *mctx;
re_sift_context_t *sctx; re_sift_context_t *sctx;
{ {
re_dfa_t *const dfa = mctx->dfa;
reg_errcode_t err; reg_errcode_t err;
int null_cnt = 0; int null_cnt = 0;
int str_idx = sctx->last_str_idx; int str_idx = sctx->last_str_idx;
re_node_set cur_dest; re_node_set cur_dest;
re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
#ifdef DEBUG #ifdef DEBUG
assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
#endif #endif
cur_src = &mctx->state_log[str_idx]->nodes;
/* Build sifted state_log[str_idx]. It has the nodes which can epsilon /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
transit to the last_node and the last_node itself. */ transit to the last_node and the last_node itself. */
@ -1516,7 +1516,6 @@ sift_states_backward (mctx, sctx)
/* Then check each states in the state_log. */ /* Then check each states in the state_log. */
while (str_idx > 0) while (str_idx > 0)
{ {
int i, ret;
/* Update counters. */ /* Update counters. */
null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
if (null_cnt > mctx->max_mb_elem_len) if (null_cnt > mctx->max_mb_elem_len)
@ -1528,56 +1527,12 @@ sift_states_backward (mctx, sctx)
} }
re_node_set_empty (&cur_dest); re_node_set_empty (&cur_dest);
--str_idx; --str_idx;
cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
: &mctx->state_log[str_idx]->nodes);
/* Then build the next sifted state. if (mctx->state_log[str_idx])
We build the next sifted state on `cur_dest', and update
`sifted_states[str_idx]' with `cur_dest'.
Note:
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
`cur_src' points the node_set of the old `state_log[str_idx]'. */
for (i = 0; i < cur_src->nelem; i++)
{ {
int prev_node = cur_src->elems[i]; err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
int naccepted = 0; if (BE (err != REG_NOERROR, 0))
re_token_type_t type = dfa->nodes[prev_node].type; goto free_return;
if (IS_EPSILON_NODE (type))
continue;
#ifdef RE_ENABLE_I18N
/* If the node may accept `multi byte'. */
if (ACCEPT_MB_NODE (type))
naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
str_idx, sctx->last_str_idx);
#endif /* RE_ENABLE_I18N */
/* We don't check backreferences here.
See update_cur_sifted_state(). */
if (!naccepted
&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
dfa->nexts[prev_node]))
naccepted = 1;
if (naccepted == 0)
continue;
if (sctx->limits.nelem)
{
int to_idx = str_idx + naccepted;
if (check_dst_limits (mctx, &sctx->limits,
dfa->nexts[prev_node], to_idx,
prev_node, str_idx))
continue;
}
ret = re_node_set_insert (&cur_dest, prev_node);
if (BE (ret == -1, 0))
{
err = REG_ESPACE;
goto free_return;
}
} }
/* Add all the nodes which satisfy the following conditions: /* Add all the nodes which satisfy the following conditions:
@ -1594,6 +1549,66 @@ sift_states_backward (mctx, sctx)
return err; return err;
} }
/* Collect into CUR_DEST the nodes of state_log[STR_IDX] that can
   consume input at STR_IDX and arrive in the already sifted state
   that follows.  CUR_DEST is later used to update
   `sifted_states[str_idx]'.

   A node is kept when it is not an epsilon node, when it accepts the
   input at STR_IDX (as a multibyte element or as a single character
   whose successor is in `sifted_states[str_idx + 1]'), and when the
   limits recorded in SCTX do not forbid the transition.
   Backreferences are not examined here; see update_cur_sifted_state().

   MCTX->state_log[STR_IDX] is dereferenced unconditionally, so the
   caller must only call this when that entry is non-NULL.

   Returns REG_NOERROR on success, or REG_ESPACE if a node could not
   be inserted into CUR_DEST.  */
static reg_errcode_t
build_sifted_states (mctx, sctx, str_idx, cur_dest)
     re_match_context_t *mctx;
     re_sift_context_t *sctx;
     int str_idx;
     re_node_set *cur_dest;
{
  re_dfa_t *const dfa = mctx->dfa;
  const re_node_set *src_nodes = &mctx->state_log[str_idx]->nodes;
  int pos;

  for (pos = 0; pos < src_nodes->nelem; ++pos)
    {
      int node = src_nodes->elems[pos];
      re_token_type_t node_type = dfa->nodes[node].type;
      /* Number of input positions NODE consumes; 0 means "rejected".  */
      int accepted_len = 0;

      /* Epsilon nodes consume no input and are skipped here.  */
      if (IS_EPSILON_NODE (node_type))
	continue;

#ifdef RE_ENABLE_I18N
      /* The node may accept a multibyte element.  */
      if (ACCEPT_MB_NODE (node_type))
	accepted_len = sift_states_iter_mb (mctx, sctx, node, str_idx,
					    sctx->last_str_idx);
#endif /* RE_ENABLE_I18N */

      /* No multibyte match: fall back to the single character test,
	 which also requires the successor node to be present in the
	 following sifted state.  */
      if (accepted_len == 0)
	{
	  if (!(check_node_accept (mctx, dfa->nodes + node, str_idx)
		&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
					dfa->nexts[node])))
	    continue;
	  accepted_len = 1;
	}

      /* Drop the node if the current limits reject the transition
	 from (NODE, STR_IDX) to its successor.  */
      if (sctx->limits.nelem
	  && check_dst_limits (mctx, &sctx->limits, dfa->nexts[node],
			       str_idx + accepted_len, node, str_idx))
	continue;

      if (BE (re_node_set_insert (cur_dest, node) == -1, 0))
	return REG_ESPACE;
    }

  return REG_NOERROR;
}
/* Helper functions. */ /* Helper functions. */
static reg_errcode_t static reg_errcode_t
@ -1661,34 +1676,37 @@ update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
re_dfa_t *const dfa = mctx->dfa; re_dfa_t *const dfa = mctx->dfa;
reg_errcode_t err; reg_errcode_t err;
const re_node_set *candidates; const re_node_set *candidates;
candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
: &mctx->state_log[str_idx]->nodes); : &mctx->state_log[str_idx]->nodes);
/* At first, add the nodes which can epsilon transit to a node in if (dest_nodes->nelem == 0)
DEST_NODE. */ sctx->sifted_states[str_idx] = NULL;
if (dest_nodes->nelem) else
{ {
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); if (candidates)
{
/* At first, add the nodes which can epsilon transit to a node in
DEST_NODE. */
err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
if (BE (err != REG_NOERROR, 0))
return err;
/* Then, check the limitations in the current sift_context. */
if (sctx->limits.nelem)
{
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
mctx->bkref_ents, str_idx);
if (BE (err != REG_NOERROR, 0))
return err;
}
}
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
if (BE (err != REG_NOERROR, 0)) if (BE (err != REG_NOERROR, 0))
return err; return err;
} }
/* Then, check the limitations in the current sift_context. */ if (candidates && mctx->state_log[str_idx]->has_backref)
if (dest_nodes->nelem && sctx->limits.nelem)
{
err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
mctx->bkref_ents, str_idx);
if (BE (err != REG_NOERROR, 0))
return err;
}
/* Update state_log. */
sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
return err;
if ((mctx->state_log[str_idx] != NULL
&& mctx->state_log[str_idx]->has_backref))
{ {
err = sift_states_bkref (mctx, sctx, str_idx, candidates); err = sift_states_bkref (mctx, sctx, str_idx, candidates);
if (BE (err != REG_NOERROR, 0)) if (BE (err != REG_NOERROR, 0))
@ -1785,6 +1803,8 @@ check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
re_dfa_t *const dfa = mctx->dfa; re_dfa_t *const dfa = mctx->dfa;
int lim_idx, src_pos, dst_pos; int lim_idx, src_pos, dst_pos;
int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
{ {
int subexp_idx; int subexp_idx;
@ -1793,11 +1813,11 @@ check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
subexp_idx = dfa->nodes[ent->node].opr.idx - 1; subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
dfa->eclosures + dst_node, subexp_idx, dst_node, dst_idx,
subexp_idx, dst_node, dst_idx); dst_bkref_idx);
src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
dfa->eclosures + src_node, subexp_idx, src_node, src_idx,
subexp_idx, src_node, src_idx); src_bkref_idx);
/* In case of: /* In case of:
<src> <dst> ( <subexp> ) <src> <dst> ( <subexp> )
@ -1812,27 +1832,14 @@ check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
} }
static int static int
check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node, check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
str_idx)
re_match_context_t *mctx; re_match_context_t *mctx;
re_node_set *eclosures; int boundaries, subexp_idx, from_node, bkref_idx;
int limit, subexp_idx, from_node, str_idx;
{ {
re_dfa_t *const dfa = mctx->dfa; re_dfa_t *const dfa = mctx->dfa;
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; re_node_set *eclosures = dfa->eclosures + from_node;
int node_idx; int node_idx;
/* If we are outside the range of the subexpression, return -1 or 1. */
if (str_idx < lim->subexp_from)
return -1;
if (lim->subexp_to < str_idx)
return 1;
/* If we are within the subexpression, return 0. */
if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
return 0;
/* Else, we are on the boundary: examine the nodes on the epsilon /* Else, we are on the boundary: examine the nodes on the epsilon
closure. */ closure. */
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
@ -1842,17 +1849,11 @@ check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
{ {
case OP_BACK_REF: case OP_BACK_REF:
{ {
int bi = search_cur_bkref_entry (mctx, str_idx); struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
for (; bi < mctx->nbkref_ents; ++bi) do
{ {
struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
int dst, cpos; int dst, cpos;
/* If this backreference goes beyond the point we're
examining, don't go any further. */
if (ent->str_idx > str_idx)
break;
if (ent->node != node || ent->subexp_from != ent->subexp_to) if (ent->node != node || ent->subexp_from != ent->subexp_to)
continue; continue;
@ -1865,33 +1866,32 @@ check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
dst = dfa->edests[node].elems[0]; dst = dfa->edests[node].elems[0];
if (dst == from_node) if (dst == from_node)
{ {
if (str_idx == lim->subexp_from) if (boundaries & 1)
return -1; return -1;
else /* if (str_idx == lim->subexp_to) */ else /* if (boundaries & 2) */
return 0; return 0;
} }
cpos = check_dst_limits_calc_pos (mctx, limit, cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
dfa->eclosures + dst, subexp_idx, dst, bkref_idx);
subexp_idx, dst,
str_idx);
if (cpos == -1 && str_idx == lim->subexp_from) if (cpos == -1 && (boundaries & 1))
return -1; return -1;
if (cpos == 0 /* && str_idx == lim->lim->subexp_to */) if (cpos == 0 /* && (boundaries & 2) */)
return 0; return 0;
} }
break; while (ent++->more);
} break;
}
case OP_OPEN_SUBEXP: case OP_OPEN_SUBEXP:
if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx) if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
return -1; return -1;
break; break;
case OP_CLOSE_SUBEXP: case OP_CLOSE_SUBEXP:
if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx) if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
return 0; return 0;
break; break;
@ -1900,10 +1900,33 @@ check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
} }
} }
if (str_idx == lim->subexp_to) return (boundaries & 2) ? 1 : 0;
}
static int
check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
re_match_context_t *mctx;
int limit, subexp_idx, from_node, str_idx, bkref_idx;
{
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
int boundaries;
/* If we are outside the range of the subexpression, return -1 or 1. */
if (str_idx < lim->subexp_from)
return -1;
if (lim->subexp_to < str_idx)
return 1; return 1;
else
/* If we are within the subexpression, return 0. */
boundaries = (str_idx == lim->subexp_from);
boundaries |= (str_idx == lim->subexp_to) << 1;
if (boundaries == 0)
return 0; return 0;
/* Else, examine epsilon closure. */
return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
from_node, bkref_idx);
} }
/* Check the limitations of sub expressions LIMITS, and remove the nodes /* Check the limitations of sub expressions LIMITS, and remove the nodes
@ -2015,75 +2038,81 @@ sift_states_bkref (mctx, sctx, str_idx, candidates)
reg_errcode_t err; reg_errcode_t err;
int node_idx, node; int node_idx, node;
re_sift_context_t local_sctx; re_sift_context_t local_sctx;
int first_idx = search_cur_bkref_entry (mctx, str_idx);
if (first_idx == -1)
return REG_NOERROR;
local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
{ {
int enabled_idx;
re_token_type_t type; re_token_type_t type;
struct re_backref_cache_entry *entry;
node = candidates->elems[node_idx]; node = candidates->elems[node_idx];
type = dfa->nodes[node].type; type = dfa->nodes[node].type;
/* Avoid infinite loop for the REs like "()\1+". */ /* Avoid infinite loop for the REs like "()\1+". */
if (node == sctx->last_node && str_idx == sctx->last_str_idx) if (node == sctx->last_node && str_idx == sctx->last_str_idx)
continue; continue;
if (type == OP_BACK_REF) if (type != OP_BACK_REF)
{ continue;
int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
{
int subexp_len, to_idx, dst_node;
struct re_backref_cache_entry *entry;
entry = mctx->bkref_ents + enabled_idx;
if (entry->str_idx > str_idx)
break;
if (entry->node != node)
continue;
subexp_len = entry->subexp_to - entry->subexp_from;
to_idx = str_idx + subexp_len;
dst_node = (subexp_len ? dfa->nexts[node]
: dfa->edests[node].elems[0]);
if (to_idx > sctx->last_str_idx entry = mctx->bkref_ents + first_idx;
|| sctx->sifted_states[to_idx] == NULL enabled_idx = first_idx;
|| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], do
dst_node) {
|| check_dst_limits (mctx, &sctx->limits, node, int subexp_len, to_idx, dst_node;
str_idx, dst_node, to_idx)) re_dfastate_t *cur_state;
continue;
{ if (entry->node != node)
re_dfastate_t *cur_state; continue;
if (local_sctx.sifted_states == NULL) subexp_len = entry->subexp_to - entry->subexp_from;
{ to_idx = str_idx + subexp_len;
local_sctx = *sctx; dst_node = (subexp_len ? dfa->nexts[node]
err = re_node_set_init_copy (&local_sctx.limits, : dfa->edests[node].elems[0]);
&sctx->limits);
if (BE (err != REG_NOERROR, 0)) if (to_idx > sctx->last_str_idx
goto free_return; || sctx->sifted_states[to_idx] == NULL
} || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
local_sctx.last_node = node; || check_dst_limits (mctx, &sctx->limits, node,
local_sctx.last_str_idx = str_idx; str_idx, dst_node, to_idx))
err = re_node_set_insert (&local_sctx.limits, enabled_idx); continue;
if (BE (err < 0, 0))
{ if (local_sctx.sifted_states == NULL)
err = REG_ESPACE; {
goto free_return; local_sctx = *sctx;
} err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
cur_state = local_sctx.sifted_states[str_idx]; if (BE (err != REG_NOERROR, 0))
err = sift_states_backward (mctx, &local_sctx); goto free_return;
if (BE (err != REG_NOERROR, 0))
goto free_return;
if (sctx->limited_states != NULL)
{
err = merge_state_array (dfa, sctx->limited_states,
local_sctx.sifted_states,
str_idx + 1);
if (BE (err != REG_NOERROR, 0))
goto free_return;
}
local_sctx.sifted_states[str_idx] = cur_state;
re_node_set_remove (&local_sctx.limits, enabled_idx);
}
} }
local_sctx.last_node = node;
local_sctx.last_str_idx = str_idx;
err = re_node_set_insert (&local_sctx.limits, enabled_idx);
if (BE (err < 0, 0))
{
err = REG_ESPACE;
goto free_return;
}
cur_state = local_sctx.sifted_states[str_idx];
err = sift_states_backward (mctx, &local_sctx);
if (BE (err != REG_NOERROR, 0))
goto free_return;
if (sctx->limited_states != NULL)
{
err = merge_state_array (dfa, sctx->limited_states,
local_sctx.sifted_states,
str_idx + 1);
if (BE (err != REG_NOERROR, 0))
goto free_return;
}
local_sctx.sifted_states[str_idx] = cur_state;
re_node_set_remove (&local_sctx.limits, enabled_idx);
/* mctx->bkref_ents may have changed, reload the pointer. */
entry = mctx->bkref_ents + enabled_idx;
} }
while (enabled_idx++, entry++->more);
} }
err = REG_NOERROR; err = REG_NOERROR;
free_return: free_return:
@ -2577,15 +2606,15 @@ get_subexp (mctx, bkref_node, bkref_str_idx)
const char *buf = (const char *) re_string_get_buffer (&mctx->input); const char *buf = (const char *) re_string_get_buffer (&mctx->input);
/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
for (; cache_idx < mctx->nbkref_ents; ++cache_idx) if (cache_idx != -1)
{ {
const struct re_backref_cache_entry *entry const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
= &mctx->bkref_ents[cache_idx]; do
if (entry->str_idx > bkref_str_idx) if (entry->node == bkref_node)
break; return REG_NOERROR; /* We already checked it. */
if (entry->node == bkref_node) while (entry++->more);
return REG_NOERROR; /* We already checked it. */
} }
subexp_num = dfa->nodes[bkref_node].opr.idx - 1; subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
/* For each sub expression */ /* For each sub expression */
@ -3115,16 +3144,18 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
{ {
re_dfa_t *const dfa = mctx->dfa; re_dfa_t *const dfa = mctx->dfa;
reg_errcode_t err; reg_errcode_t err;
int cache_idx, cache_idx_start; int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
/* The current state. */ struct re_backref_cache_entry *ent;
cache_idx_start = search_cur_bkref_entry (mctx, cur_str); if (cache_idx_start == -1)
for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx) return REG_NOERROR;
restart:
ent = mctx->bkref_ents + cache_idx_start;
do
{ {
int to_idx, next_node; int to_idx, next_node;
struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
if (ent->str_idx > cur_str)
break;
/* Is this entry ENT is appropriate? */ /* Is this entry ENT is appropriate? */
if (!re_node_set_contains (cur_nodes, ent->node)) if (!re_node_set_contains (cur_nodes, ent->node))
continue; /* No. */ continue; /* No. */
@ -3153,8 +3184,7 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
return err; return err;
} }
/* TODO: It is still inefficient... */ /* TODO: It is still inefficient... */
cache_idx = cache_idx_start - 1; goto restart;
continue;
} }
else else
{ {
@ -3189,6 +3219,7 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
return err; return err;
} }
} }
while (ent++->more);
return REG_NOERROR; return REG_NOERROR;
} }
@ -4115,25 +4146,30 @@ match_ctx_add_entry (mctx, node, str_idx, from, to)
sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
mctx->abkref_ents *= 2; mctx->abkref_ents *= 2;
} }
if (mctx->nbkref_ents > 0
&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
mctx->bkref_ents[mctx->nbkref_ents].node = node; mctx->bkref_ents[mctx->nbkref_ents].node = node;
mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
mctx->bkref_ents[mctx->nbkref_ents++].subexp_to = to; mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
if (mctx->max_mb_elem_len < to - from) if (mctx->max_mb_elem_len < to - from)
mctx->max_mb_elem_len = to - from; mctx->max_mb_elem_len = to - from;
return REG_NOERROR; return REG_NOERROR;
} }
/* Search for the first entry which has the same str_idx. /* Search for the first entry which has the same str_idx, or -1 if none is
Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
static int static int
search_cur_bkref_entry (mctx, str_idx) search_cur_bkref_entry (mctx, str_idx)
re_match_context_t *mctx; re_match_context_t *mctx;
int str_idx; int str_idx;
{ {
int left, right, mid; int left, right, mid, last;
right = mctx->nbkref_ents; last = right = mctx->nbkref_ents;
for (left = 0; left < right;) for (left = 0; left < right;)
{ {
mid = (left + right) / 2; mid = (left + right) / 2;
@ -4142,7 +4178,10 @@ search_cur_bkref_entry (mctx, str_idx)
else else
right = mid; right = mid;
} }
return left; if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
return left;
else
return -1;
} }
/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches