1
0
mirror of https://sourceware.org/git/glibc.git synced 2025-08-05 19:35:52 +03:00
Update.
2004-11-18  Jakub Jelinek  <jakub@redhat.com>

	[BZ #544]
	* posix/regex.h (RE_NO_SUB): New define.
	* posix/regex_internal.h (OP_DELETED_SUBEXP): New.
	(re_dfa_t): Add subexp_map.
	* posix/regcomp.c (struct subexp_optimize): New type.
	(optimize_subexps): New routine.
	(re_compile_internal): Call it.
	(re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB.
	(free_dfa_content): Free subexp_map.
	(calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP
	nodes.
	* posix/regexec.c (re_search_internal): If subexp_map
	is not NULL, duplicate registers as needed.
	* posix/Makefile: Add rules to build and run tst-regex2.
	* posix/tst-regex2.c: New test.
	* posix/rxspencer/tests: Fix last two tests (\0 -> \1).
	Add some new tests for nested subexpressions.
This commit is contained in:
Ulrich Drepper
2004-11-18 23:57:34 +00:00
parent 1b1d36792e
commit c06a6956a4
9 changed files with 408 additions and 6 deletions

View File

@@ -1,3 +1,23 @@
2004-11-18 Jakub Jelinek <jakub@redhat.com>
[BZ #544]
* posix/regex.h (RE_NO_SUB): New define.
* posix/regex_internal.h (OP_DELETED_SUBEXP): New.
(re_dfa_t): Add subexp_map.
* posix/regcomp.c (struct subexp_optimize): New type.
(optimize_subexps): New routine.
(re_compile_internal): Call it.
(re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB.
(free_dfa_content): Free subexp_map.
(calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP
nodes.
* posix/regexec.c (re_search_internal): If subexp_map
is not NULL, duplicate registers as needed.
* posix/Makefile: Add rules to build and run tst-regex2.
* posix/tst-regex2.c: New test.
* posix/rxspencer/tests: Fix last two tests (\0 -> \1).
Add some new tests for nested subexpressions.
2004-11-18 Ulrich Drepper <drepper@redhat.com> 2004-11-18 Ulrich Drepper <drepper@redhat.com>
* libio/libio.h (_IO_FLAGS2_FORTIFY): Renamed from * libio/libio.h (_IO_FLAGS2_FORTIFY): Renamed from

View File

@@ -80,7 +80,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \ bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \ bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
bug-regex21 bug-regex22 bug-regex23 bug-regex24 \ bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
tst-nice tst-nanosleep \ tst-nice tst-nanosleep tst-regex2 \
transbug tst-rxspencer tst-pcre tst-boost \ transbug tst-rxspencer tst-pcre tst-boost \
bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \ bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
tst-getaddrinfo2 bug-glob1 bug-glob2 tst-getaddrinfo2 bug-glob1 bug-glob2
@@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex-ENV = LOCPATH=$(common-objpfx)localedata tst-regex-ENV = LOCPATH=$(common-objpfx)localedata
tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
@@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf
ifeq (yes,$(build-shared)) ifeq (yes,$(build-shared))
$(objpfx)tst-regex: $(common-objpfx)rt/librt.so $(objpfx)tst-regex: $(common-objpfx)rt/librt.so
$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so
else else
$(objpfx)tst-regex: $(common-objpfx)rt/librt.a $(objpfx)tst-regex: $(common-objpfx)rt/librt.a
$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a
endif endif
$(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out $(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out

View File

@@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa); static void optimize_utf8 (re_dfa_t *dfa);
#endif #endif
struct subexp_optimize
{
re_dfa_t *dfa;
re_token_t *nodes;
int no_sub, re_nsub;
};
static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
bin_tree_t *node, int sidx, int depth);
static reg_errcode_t analyze (re_dfa_t *dfa); static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node); static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp)
/* And GNU code determines whether or not to get register information /* And GNU code determines whether or not to get register information
by passing null for the REGS argument to re_match, etc., not by by passing null for the REGS argument to re_match, etc., not by
setting no_sub. */ setting no_sub, unless RE_NO_SUB is set. */
bufp->no_sub = 0; bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
/* Match anchors at newline. */ /* Match anchors at newline. */
bufp->newline_anchor = 1; bufp->newline_anchor = 1;
@@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa)
if (dfa->sb_char != utf8_sb_map) if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char); re_free (dfa->sb_char);
#endif #endif
re_free (dfa->subexp_map);
#ifdef DEBUG #ifdef DEBUG
re_free (dfa->re_str); re_free (dfa->re_str);
#endif #endif
@@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax)
optimize_utf8 (dfa); optimize_utf8 (dfa);
#endif #endif
if (preg->re_nsub > 0)
{
struct subexp_optimize so;
so.dfa = dfa;
so.nodes = dfa->nodes;
so.no_sub = preg->no_sub;
so.re_nsub = preg->re_nsub;
dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
}
/* Analyze the tree and collect information which is necessary to /* Analyze the tree and collect information which is necessary to
create the dfa. */ create the dfa. */
err = analyze (dfa); err = analyze (dfa);
@@ -1121,6 +1141,82 @@ optimize_utf8 (dfa)
} }
#endif #endif
static bin_tree_t *
optimize_subexps (so, node, sidx, depth)
struct subexp_optimize *so;
bin_tree_t *node;
int sidx, depth;
{
int idx, new_depth, new_sidx;
bin_tree_t *ret;
if (node == NULL)
return NULL;
new_depth = 0;
new_sidx = sidx;
if ((depth & 1) && node->type == CONCAT
&& node->right && node->right->type == 0
&& so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
{
new_depth = depth + 1;
if (new_depth == 2
|| (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
&& so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
new_sidx = so->nodes[idx].opr.idx;
}
node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
new_depth = (depth & 1) == 0 && node->type == CONCAT
&& node->left && node->left->type == 0
&& so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
? depth + 1 : 0;
node->right = optimize_subexps (so, node->right, sidx, new_depth);
if (node->type != CONCAT)
return node;
if ((depth & 1) == 0
&& node->left
&& node->left->type == 0
&& so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
ret = node->right;
else if ((depth & 1)
&& node->right
&& node->right->type == 0
&& so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
ret = node->left;
else
return node;
if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
&& so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
return node;
if (!so->no_sub)
{
int i;
if (depth < 2)
return node;
if (so->dfa->subexp_map == NULL)
{
so->dfa->subexp_map = re_malloc (int, so->re_nsub);
if (so->dfa->subexp_map == NULL)
return node;
for (i = 0; i < so->re_nsub; i++)
so->dfa->subexp_map[i] = i;
}
i = so->nodes[idx].opr.idx;
assert (sidx < i);
so->dfa->subexp_map[i] = sidx;
}
so->nodes[idx].type = OP_DELETED_SUBEXP;
ret->parent = node->parent;
return ret;
}
/* Analyze the structure tree, and calculate "first", "next", "edest", /* Analyze the structure tree, and calculate "first", "next", "edest",
"eclosure", and "inveclosure". */ "eclosure", and "inveclosure". */
@@ -1525,6 +1621,8 @@ calc_inveclosure (dfa)
int src, idx, dest; int src, idx, dest;
for (src = 0; src < dfa->nodes_len; ++src) for (src = 0; src < dfa->nodes_len; ++src)
{ {
if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
continue;
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
{ {
dest = dfa->eclosures[src].elems[idx]; dest = dfa->eclosures[src].elems[idx];
@@ -1560,6 +1658,9 @@ calc_eclosure (dfa)
#ifdef DEBUG #ifdef DEBUG
assert (dfa->eclosures[node_idx].nelem != -1); assert (dfa->eclosures[node_idx].nelem != -1);
#endif #endif
if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
continue;
/* If we have already calculated, skip it. */ /* If we have already calculated, skip it. */
if (dfa->eclosures[node_idx].nelem != 0) if (dfa->eclosures[node_idx].nelem != 0)
continue; continue;

View File

@@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t;
immediately after an alternation or begin-group operator. */ immediately after an alternation or begin-group operator. */
#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) #define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
/* If this bit is set, then no_sub will be set to 1 during
re_compile_pattern. */
#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
/* This global variable defines the particular regexp syntax to use (for /* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect stored in the pattern buffer, so changing this does not affect

View File

@@ -189,6 +189,7 @@ typedef enum
OP_DUP_PLUS = EPSILON_BIT | 4, OP_DUP_PLUS = EPSILON_BIT | 4,
OP_DUP_QUESTION = EPSILON_BIT | 5, OP_DUP_QUESTION = EPSILON_BIT | 5,
ANCHOR = EPSILON_BIT | 6, ANCHOR = EPSILON_BIT | 6,
OP_DELETED_SUBEXP = EPSILON_BIT | 7,
/* Tree type, these are used only by tree. */ /* Tree type, these are used only by tree. */
CONCAT = 16, CONCAT = 16,
@@ -644,6 +645,7 @@ struct re_dfa_t
int mb_cur_max; int mb_cur_max;
bitset word_char; bitset word_char;
reg_syntax_t syntax; reg_syntax_t syntax;
int *subexp_map;
#ifdef DEBUG #ifdef DEBUG
char* re_str; char* re_str;
#endif #endif

View File

@@ -882,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
pmatch[reg_idx].rm_so += match_first; pmatch[reg_idx].rm_so += match_first;
pmatch[reg_idx].rm_eo += match_first; pmatch[reg_idx].rm_eo += match_first;
} }
if (dfa->subexp_map)
for (reg_idx = 0;
reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
reg_idx++)
if (dfa->subexp_map[reg_idx] != reg_idx)
{
pmatch[reg_idx + 1].rm_so
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
pmatch[reg_idx + 1].rm_eo
= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
}
} }
free_return: free_return:

View File

@@ -508,5 +508,21 @@ a*a*a*a*a*a*a* & aaaaaa aaaaaa
(\b){0} - x @x - (\b){0} - x @x -
\(\b\)\{0,0\} b abc @abc - \(\b\)\{0,0\} b abc @abc -
a(\b){0}c - ac ac - a(\b){0}c - ac ac -
a(.*)b(\0){0}c - abc abc @bc,- a(.*)b(\1){0}c - abc abc @bc,-
a(.*)b(\0){0}c - axbc axbc x,- a(.*)b(\1){0}c - axbc axbc x,-
a\(\(b*\)\)c\1d b abbcbbd abbcbbd bb,bb
a\(\([bc]\)\)\2d b abcdabbd abbd b,b
a\(\(\(\([bc]\)\)\3\)\)*d b abbccd abbccd cc,cc,c,c
a(b)(c)d - abcd abcd b,c
a(((b)))c - abc abc b,b,b
a(((b|(((c))))))d - abd abd b,b,b,-,-,-
a(((b*|c|e)))d - abbd abbd bb,bb,bb
a((b|c)){0,0}d - ad ad -,-
a((b|c)){0,1}d - abd abd b,b
a((b|c)){0,2}d - abcd abcd c,c
a((b+|((c)*)))+d - abd abd b,b,-,-
a((b+|((c)*)))+d - abcd abcd c,c,c,c
(((\b))){0} - x @x -,-,-
a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,-
a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,-

244
posix/tst-regex2.c Normal file
View File

@@ -0,0 +1,244 @@
#include <fcntl.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
#ifdef _POSIX_CPUTIME
static clockid_t cl;
static int use_clock;
#endif
static int
do_test (void)
{
#ifdef _POSIX_CPUTIME
/* See whether we can use the CPU clock. */
use_clock = clock_getcpuclockid (0, &cl) == 0;
#endif
static const char *pat[] = {
".?.?.?.?.?.?.?Log\\.13",
"(.?)(.?)(.?)(.?)(.?)(.?)(.?)Log\\.13",
"((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
"((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
"((((((((((.?))))))))))Log\\.13" };
int fd = open ("../ChangeLog.14", O_RDONLY);
if (fd < 0)
{
printf ("Couldn't open ChangeLog.14: %m\n");
return 1;
}
struct stat64 st;
if (fstat64 (fd, &st) < 0)
{
printf ("Couldn't fstat ChangeLog.14: %m\n");
return 1;
}
char *buf = malloc (st.st_size + 1);
if (buf == NULL)
{
printf ("Couldn't allocate buffer: %m\n");
return 1;
}
if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
{
puts ("Couldn't read ChangeLog.14");
return 1;
}
close (fd);
buf[st.st_size] = '\0';
setlocale (LC_ALL, "de_DE.UTF-8");
char *string = buf;
size_t len = st.st_size;
#ifndef WHOLE_FILE_TIMING
/* Don't search the whole file normally, it takes too long. */
if (len > 500000 + 64)
{
string += 500000;
len -= 500000;
}
#endif
for (int testno = 0; testno < 4; ++testno)
for (int i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
{
printf ("test %d pattern %d", testno, i);
regex_t rbuf;
struct re_pattern_buffer rpbuf;
int err;
if (testno < 2)
{
err = regcomp (&rbuf, pat[i],
REG_EXTENDED | (testno ? REG_NOSUB : 0));
if (err != 0)
{
putchar ('\n');
char errstr[300];
regerror (err, &rbuf, errstr, sizeof (errstr));
puts (errstr);
return err;
}
}
else
{
re_set_syntax (RE_SYNTAX_POSIX_EGREP
| (testno == 3 ? RE_NO_SUB : 0));
memset (&rpbuf, 0, sizeof (rpbuf));
const char *s = re_compile_pattern (pat[i], strlen (pat[i]),
&rpbuf);
if (s != NULL)
{
printf ("\n%s\n", s);
return 1;
}
/* Just so that this can be tested with earlier glibc as well. */
if (testno == 3)
rpbuf.no_sub = 1;
}
#ifdef _POSIX_CPUTIME
struct timespec start, stop;
if (use_clock)
use_clock = clock_gettime (cl, &start) == 0;
#endif
if (testno < 2)
{
regmatch_t pmatch[71];
err = regexec (&rbuf, string, 71, pmatch, 0);
if (err == REG_NOMATCH)
{
puts ("\nregexec failed");
return 1;
}
if (testno == 0)
{
if (pmatch[0].rm_eo != pmatch[0].rm_so + 13
|| pmatch[0].rm_eo > len
|| pmatch[0].rm_so < len - 100
|| strncmp (string + pmatch[0].rm_so,
" ChangeLog.13 for earlier changes",
sizeof " ChangeLog.13 for earlier changes" - 1)
!= 0)
{
puts ("\nregexec without REG_NOSUB did not find the correct match");
return 1;
}
if (i > 0)
for (int j = 0, l = 1; j < 7; ++j)
for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
if (pmatch[l].rm_so != pmatch[0].rm_so + j
|| pmatch[l].rm_eo != pmatch[l].rm_so + 1)
{
printf ("\npmatch[%d] incorrect\n", l);
return 1;
}
}
}
else
{
struct re_registers regs;
memset (&regs, 0, sizeof (regs));
int match = re_search (&rpbuf, string, len, 0, len,
&regs);
if (match < 0)
{
puts ("\nre_search failed");
return 1;
}
if (match + 13 > len
|| match < len - 100
|| strncmp (string + match,
" ChangeLog.13 for earlier changes",
sizeof " ChangeLog.13 for earlier changes" - 1)
!= 0)
{
puts ("\nre_search did not find the correct match");
return 1;
}
if (testno == 2)
{
if (regs.num_regs != 2 + (i == 0 ? 0 : i == 1 ? 7 : 70))
{
printf ("\nincorrect num_regs %d\n", regs.num_regs);
return 1;
}
if (regs.start[0] != match || regs.end[0] != match + 13)
{
printf ("\nincorrect regs.{start,end}[0] = { %d, %d}\n",
regs.start[0], regs.end[0]);
return 1;
}
if (regs.start[regs.num_regs - 1] != -1
|| regs.end[regs.num_regs - 1] != -1)
{
puts ("\nincorrect regs.{start,end}[num_regs - 1]");
return 1;
}
if (i > 0)
for (int j = 0, l = 1; j < 7; ++j)
for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
if (regs.start[l] != match + j
|| regs.end[l] != regs.start[l] + 1)
{
printf ("\nregs.{start,end}[%d] incorrect\n", l);
return 1;
}
}
}
#ifdef _POSIX_CPUTIME
if (use_clock)
use_clock = clock_gettime (cl, &stop) == 0;
if (use_clock)
{
stop.tv_sec -= start.tv_sec;
if (stop.tv_nsec < start.tv_nsec)
{
stop.tv_sec--;
stop.tv_nsec += 1000000000 - start.tv_nsec;
}
else
stop.tv_nsec -= start.tv_nsec;
printf (": %ld.%09lds\n", (long) stop.tv_sec, (long) stop.tv_nsec);
}
else
#endif
putchar ('\n');
if (testno < 2)
regfree (&rbuf);
else
regfree (&rpbuf);
}
return 0;
}
#define TIMEOUT 20
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"

View File

@@ -893,7 +893,7 @@ vfprintf (FILE *s, const CHAR_T *format, va_list ap)
* sizeof (CHAR_T))); \ * sizeof (CHAR_T))); \
} \ } \
if (readonly_format < 0) \ if (readonly_format < 0) \
__libc_fatal ("*** %n is writable segment detected ***\n"); \ __libc_fatal ("*** %n in writable segment detected ***\n"); \
} \ } \
/* Answer the count of characters written. */ \ /* Answer the count of characters written. */ \
if (fspec == NULL) \ if (fspec == NULL) \