mirror of
https://github.com/postgres/postgres.git
synced 2025-06-30 21:42:05 +03:00
Replace regular expression package with Henry Spencer's latest version
(extracted from Tcl 8.4.1 release, as Henry still hasn't got round to making it a separate library). This solves a performance problem for multibyte, as well as upgrading our regexp support to match recent Tcl and nearly match recent Perl.
This commit is contained in:
@ -1,56 +1,84 @@
|
||||
Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved.
|
||||
This software is not subject to any license of the American Telephone
|
||||
and Telegraph Company or of the Regents of the University of California.
|
||||
This regular expression package was originally developed by Henry Spencer.
|
||||
It bears the following copyright notice:
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose on
|
||||
any computer system, and to alter it and redistribute it, subject
|
||||
to the following restrictions:
|
||||
**********************************************************************
|
||||
|
||||
1. The author is not responsible for the consequences of use of this
|
||||
software, no matter how awful, even if they arise from flaws in it.
|
||||
Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission. Since few users ever read sources,
|
||||
credits must appear in the documentation.
|
||||
Development of this software was funded, in part, by Cray Research Inc.,
|
||||
UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
Corporation, none of whom are responsible for the results. The author
|
||||
thanks all of them.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software. Since few users
|
||||
ever read sources, credits must appear in the documentation.
|
||||
Redistribution and use in source and binary forms -- with or without
|
||||
modification -- are permitted for any purpose, provided that
|
||||
redistributions in source form retain this entire copyright notice and
|
||||
indicate the origin and nature of any modifications.
|
||||
|
||||
4. This notice may not be removed or altered.
|
||||
I'd appreciate being given credit for this package in the documentation
|
||||
of software which uses it, but that is not a requirement.
|
||||
|
||||
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
/*-
|
||||
* Copyright (c) 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the University of
|
||||
* California, Berkeley and its contributors.
|
||||
* 4. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94
|
||||
*/
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************
|
||||
|
||||
PostgreSQL adopted the code out of Tcl 8.4.1. Portions of regc_locale.c
|
||||
and re_syntax.n were developed by Tcl developers other than Henry; these
|
||||
files bear the Tcl copyright and license notice:
|
||||
|
||||
**********************************************************************
|
||||
|
||||
This software is copyrighted by the Regents of the University of
|
||||
California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
|
||||
Corporation and other parties. The following terms apply to all files
|
||||
associated with the software unless explicitly disclaimed in
|
||||
individual files.
|
||||
|
||||
The authors hereby grant permission to use, copy, modify, distribute,
|
||||
and license this software and its documentation for any purpose, provided
|
||||
that existing copyright notices are retained in all copies and that this
|
||||
notice is included verbatim in any distributions. No written agreement,
|
||||
license, or royalty fee is required for any of the authorized uses.
|
||||
Modifications to this software may be copyrighted by their authors
|
||||
and need not follow the licensing terms described here, provided that
|
||||
the new terms are clearly indicated on the first page of each file where
|
||||
they apply.
|
||||
|
||||
IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
|
||||
FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||
ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
|
||||
DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
||||
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
|
||||
IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
|
||||
NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
||||
MODIFICATIONS.
|
||||
|
||||
GOVERNMENT USE: If you are acquiring this software on behalf of the
|
||||
U.S. government, the Government shall have only "Restricted Rights"
|
||||
in the software and related documentation as defined in the Federal
|
||||
Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
|
||||
are acquiring the software on behalf of the Department of Defense, the
|
||||
software shall be classified as "Commercial Computer Software" and the
|
||||
Government shall have only "Restricted Rights" as defined in Clause
|
||||
252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
|
||||
authors grant the U.S. Government and others acting in its behalf
|
||||
permission to use and distribute the software in accordance with the
|
||||
terms specified in this license.
|
||||
|
||||
**********************************************************************
|
||||
|
||||
Subsequent modifications to the code by the PostgreSQL project follow
|
||||
the same license terms as the rest of PostgreSQL.
|
||||
|
@ -1,10 +1,10 @@
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# Makefile--
|
||||
# Makefile for regex
|
||||
# Makefile for backend/regex
|
||||
#
|
||||
# IDENTIFICATION
|
||||
# $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.19 2002/09/16 16:02:43 momjian Exp $
|
||||
# $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.20 2003/02/05 17:41:32 tgl Exp $
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
@ -12,23 +12,17 @@ subdir = src/backend/regex
|
||||
top_builddir = ../../..
|
||||
include $(top_builddir)/src/Makefile.global
|
||||
|
||||
override CPPFLAGS += -DPOSIX_MISTAKE
|
||||
|
||||
OBJS = regcomp.o regerror.o regexec.o regfree.o
|
||||
DEBUGOBJ += ../utils/mb/SUBSYS.o
|
||||
|
||||
all: SUBSYS.o
|
||||
|
||||
SUBSYS.o: $(OBJS)
|
||||
$(LD) $(LDREL) $(LDOUT) SUBSYS.o $(OBJS)
|
||||
|
||||
regexec.o: regexec.c engine.c
|
||||
# mark inclusion dependencies between .c files explicitly
|
||||
regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
|
||||
|
||||
# retest will not compile because multibyte is now enabled by default
|
||||
# and the multibyte calls require /mmgr, /adt, and other calls that
|
||||
# are complex to link, bjm 2002-09-16
|
||||
#retest: retest.o SUBSYS.o $(DEBUGOBJ)
|
||||
# $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
|
||||
regexec.o: regexec.c rege_dfa.c
|
||||
|
||||
clean:
|
||||
rm -f SUBSYS.o $(OBJS) retest retest.o
|
||||
rm -f SUBSYS.o $(OBJS)
|
||||
|
@ -1,94 +0,0 @@
|
||||
# @(#)WHATSNEW 8.3 (Berkeley) 3/18/94
|
||||
|
||||
New in alpha3.4: The complex bug alluded to below has been fixed (in a
|
||||
slightly kludgey temporary way that may hurt efficiency a bit; this is
|
||||
another "get it out the door for 4.4" release). The tests at the end of
|
||||
the tests file have accordingly been uncommented. The primary sign of
|
||||
the bug was that something like a?b matching ab matched b rather than ab.
|
||||
(The bug was essentially specific to this exact situation, else it would
|
||||
have shown up earlier.)
|
||||
|
||||
New in alpha3.3: The definition of word boundaries has been altered
|
||||
slightly, to more closely match the usual programming notion that "_"
|
||||
is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir,
|
||||
and the makefile no longer alludes to it in mysterious ways. The
|
||||
makefile has generally been cleaned up some. Fixes have been made
|
||||
(again!) so that the regression test will run without -DREDEBUG, at
|
||||
the cost of weaker checking. A workaround for a bug in some folks'
|
||||
<assert.h> has been added. And some more things have been added to
|
||||
tests, including a couple right at the end which are commented out
|
||||
because the code currently flunks them (complex bug; fix coming).
|
||||
Plus the usual minor cleanup.
|
||||
|
||||
New in alpha3.2: Assorted bits of cleanup and portability improvement
|
||||
(the development base is now a BSDI system using GCC instead of an ancient
|
||||
Sun system, and the newer compiler exposed some glitches). Fix for a
|
||||
serious bug that affected REs using many [] (including REG_ICASE REs
|
||||
because of the way they are implemented), *sometimes*, depending on
|
||||
memory-allocation patterns. The header-file prototypes no longer name
|
||||
the parameters, avoiding possible name conflicts. The possibility that
|
||||
some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is
|
||||
now handled gracefully. "uchar" is no longer used as an internal type
|
||||
name (too many people have the same idea). Still the same old lousy
|
||||
performance, alas.
|
||||
|
||||
New in alpha3.1: Basically nothing, this release is just a bookkeeping
|
||||
convenience. Stay tuned.
|
||||
|
||||
New in alpha3.0: Performance is no better, alas, but some fixes have been
|
||||
made and some functionality has been added. (This is basically the "get
|
||||
it out the door in time for 4.4" release.) One bug fix: regfree() didn't
|
||||
free the main internal structure (how embarrassing). It is now possible
|
||||
to put NULs in either the RE or the target string, using (resp.) a new
|
||||
REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to
|
||||
regcomp() makes all characters ordinary, so you can match a literal
|
||||
string easily (this will become more useful when performance improves!).
|
||||
There are now primitives to match beginnings and ends of words, although
|
||||
the syntax is disgusting and so is the implementation. The REG_ATOI
|
||||
debugging interface has changed a bit. And there has been considerable
|
||||
internal cleanup of various kinds.
|
||||
|
||||
New in alpha2.3: Split change list out of README, and moved flags notes
|
||||
into Makefile. Macro-ized the name of regex(7) in regex(3), since it has
|
||||
to change for 4.4BSD. Cleanup work in engine.c, and some new regression
|
||||
tests to catch tricky cases thereof.
|
||||
|
||||
New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two
|
||||
small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges
|
||||
in my own test program and might be useful to others for similar purposes.
|
||||
The regression test will now compile (and run) without REDEBUG. The
|
||||
BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now.
|
||||
Char/uchar parameters are now written int/unsigned, to avoid possible
|
||||
portability problems with unpromoted parameters. Some unsigned casts have
|
||||
been introduced to minimize portability problems with shifting into sign
|
||||
bits.
|
||||
|
||||
New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big
|
||||
thing is that regex.h is now generated, using mkh, rather than being
|
||||
supplied in the distribution; due to circularities in dependencies,
|
||||
you have to build regex.h explicitly by "make h". The two known bugs
|
||||
have been fixed (and the regression test now checks for them), as has a
|
||||
problem with assertions not being suppressed in the absence of REDEBUG.
|
||||
No performance work yet.
|
||||
|
||||
New in alpha2: Backslash-anything is an ordinary character, not an
|
||||
error (except, of course, for the handful of backslashed metacharacters
|
||||
in BREs), which should reduce script breakage. The regression test
|
||||
checks *where* null strings are supposed to match, and has generally
|
||||
been tightened up somewhat. Small bug fixes in parameter passing (not
|
||||
harmful, but technically errors) and some other areas. Debugging
|
||||
invoked by defining REDEBUG rather than not defining NDEBUG.
|
||||
|
||||
New in alpha+3: full prototyping for internal routines, using a little
|
||||
helper program, mkh, which extracts prototypes given in stylized comments.
|
||||
More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple
|
||||
pre-screening of input when a literal string is known to be part of the
|
||||
RE; this does wonders for performance.
|
||||
|
||||
New in alpha+2: minor bits of cleanup. Notably, the number "32" for the
|
||||
word width isn't hardwired into regexec.c any more, the public header
|
||||
file prototypes the functions if __STDC__ is defined, and some small typos
|
||||
in the manpages have been fixed.
|
||||
|
||||
New in alpha+1: improvements to the manual pages, and an important
|
||||
extension, the REG_STARTEND option to regexec().
|
File diff suppressed because it is too large
Load Diff
@ -1,269 +0,0 @@
|
||||
.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
.\" Copyright (c) 1992, 1993, 1994
|
||||
.\" The Regents of the University of California. All rights reserved.
|
||||
.\"
|
||||
.\" This code is derived from software contributed to Berkeley by
|
||||
.\" Henry Spencer.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\" 3. All advertising materials mentioning features or use of this software
|
||||
.\" must display the following acknowledgement:
|
||||
.\" This product includes software developed by the University of
|
||||
.\" California, Berkeley and its contributors.
|
||||
.\" 4. Neither the name of the University nor the names of its contributors
|
||||
.\" may be used to endorse or promote products derived from this software
|
||||
.\" without specific prior written permission.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94
|
||||
.\"
|
||||
.TH RE_FORMAT 7 "March 20, 1994"
|
||||
.SH NAME
|
||||
re_format \- POSIX 1003.2 regular expressions
|
||||
.SH DESCRIPTION
|
||||
Regular expressions (``RE''s),
|
||||
as defined in POSIX 1003.2, come in two forms:
|
||||
modern REs (roughly those of
|
||||
.IR egrep ;
|
||||
1003.2 calls these ``extended'' REs)
|
||||
and obsolete REs (roughly those of
|
||||
.IR ed ;
|
||||
1003.2 ``basic'' REs).
|
||||
Obsolete REs mostly exist for backward compatibility in some old programs;
|
||||
they will be discussed at the end.
|
||||
1003.2 leaves some aspects of RE syntax and semantics open;
|
||||
`\(dg' marks decisions on these aspects that
|
||||
may not be fully portable to other 1003.2 implementations.
|
||||
.PP
|
||||
A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR,
|
||||
separated by `|'.
|
||||
It matches anything that matches one of the branches.
|
||||
.PP
|
||||
A branch is one\(dg or more \fIpieces\fR, concatenated.
|
||||
It matches a match for the first, followed by a match for the second, etc.
|
||||
.PP
|
||||
A piece is an \fIatom\fR possibly followed
|
||||
by a single\(dg `*', `+', `?', or \fIbound\fR.
|
||||
An atom followed by `*' matches a sequence of 0 or more matches of the atom.
|
||||
An atom followed by `+' matches a sequence of 1 or more matches of the atom.
|
||||
An atom followed by `?' matches a sequence of 0 or 1 matches of the atom.
|
||||
.PP
|
||||
A \fIbound\fR is `{' followed by an unsigned decimal integer,
|
||||
possibly followed by `,'
|
||||
possibly followed by another unsigned decimal integer,
|
||||
always followed by `}'.
|
||||
The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive,
|
||||
and if there are two of them, the first may not exceed the second.
|
||||
An atom followed by a bound containing one integer \fIi\fR
|
||||
and no comma matches
|
||||
a sequence of exactly \fIi\fR matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing one integer \fIi\fR and a comma matches
|
||||
a sequence of \fIi\fR or more matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing two integers \fIi\fR and \fIj\fR matches
|
||||
a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom.
|
||||
.PP
|
||||
An atom is a regular expression enclosed in `()' (matching a match for the
|
||||
regular expression),
|
||||
an empty set of `()' (matching the null string)\(dg,
|
||||
a \fIbracket expression\fR (see below), `.'
|
||||
(matching any single character), `^' (matching the null string at the
|
||||
beginning of a line), `$' (matching the null string at the
|
||||
end of a line), a `\e' followed by one of the characters
|
||||
`^.[$()|*+?{\e'
|
||||
(matching that character taken as an ordinary character),
|
||||
a `\e' followed by any other character\(dg
|
||||
(matching that character taken as an ordinary character,
|
||||
as if the `\e' had not been present\(dg),
|
||||
or a single character with no other significance (matching that character).
|
||||
A `{' followed by a character other than a digit is an ordinary
|
||||
character, not the beginning of a bound\(dg.
|
||||
It is illegal to end an RE with `\e'.
|
||||
.PP
|
||||
A \fIbracket expression\fR is a list of characters enclosed in `[]'.
|
||||
It normally matches any single character from the list (but see below).
|
||||
If the list begins with `^',
|
||||
it matches any single character
|
||||
(but see below) \fInot\fR from the rest of the list.
|
||||
If two characters in the list are separated by `\-', this is shorthand
|
||||
for the full \fIrange\fR of characters between those two (inclusive) in the
|
||||
collating sequence,
|
||||
e.g. `[0-9]' in ASCII matches any decimal digit.
|
||||
It is illegal\(dg for two ranges to share an
|
||||
endpoint, e.g. `a-c-e'.
|
||||
Ranges are very collating-sequence-dependent,
|
||||
and portable programs should avoid relying on them.
|
||||
.PP
|
||||
To include a literal `]' in the list, make it the first character
|
||||
(following a possible `^').
|
||||
To include a literal `\-', make it the first or last character,
|
||||
or the second endpoint of a range.
|
||||
To use a literal `\-' as the first endpoint of a range,
|
||||
enclose it in `[.' and `.]' to make it a collating element (see below).
|
||||
With the exception of these and some combinations using `[' (see next
|
||||
paragraphs), all other special characters, including `\e', lose their
|
||||
special significance within a bracket expression.
|
||||
.PP
|
||||
Within a bracket expression, a collating element (a character,
|
||||
a multi-character sequence that collates as if it were a single character,
|
||||
or a collating-sequence name for either)
|
||||
enclosed in `[.' and `.]' stands for the
|
||||
sequence of characters of that collating element.
|
||||
The sequence is a single element of the bracket expression's list.
|
||||
A bracket expression containing a multi-character collating element
|
||||
can thus match more than one character,
|
||||
e.g. if the collating sequence includes a `ch' collating element,
|
||||
then the RE `[[.ch.]]*c' matches the first five characters
|
||||
of `chchcc'.
|
||||
.PP
|
||||
Within a bracket expression, a collating element enclosed in `[=' and
|
||||
`=]' is an equivalence class, standing for the sequences of characters
|
||||
of all collating elements equivalent to that one, including itself.
|
||||
(If there are no other equivalent collating elements,
|
||||
the treatment is as if the enclosing delimiters were `[.' and `.]'.)
|
||||
For example, if o and \o'o^' are the members of an equivalence class,
|
||||
then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous.
|
||||
An equivalence class may not\(dg be an endpoint
|
||||
of a range.
|
||||
.PP
|
||||
Within a bracket expression, the name of a \fIcharacter class\fR enclosed
|
||||
in `[:' and `:]' stands for the list of all characters belonging to that
|
||||
class.
|
||||
Standard character class names are:
|
||||
.PP
|
||||
.RS
|
||||
.nf
|
||||
.ta 3c 6c 9c
|
||||
alnum digit punct
|
||||
alpha graph space
|
||||
blank lower upper
|
||||
cntrl print xdigit
|
||||
.fi
|
||||
.RE
|
||||
.PP
|
||||
These stand for the character classes defined in
|
||||
.IR ctype (3).
|
||||
A locale may provide others.
|
||||
A character class may not be used as an endpoint of a range.
|
||||
.PP
|
||||
There are two special cases\(dg of bracket expressions:
|
||||
the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at
|
||||
the beginning and end of a word respectively.
|
||||
A word is defined as a sequence of
|
||||
word characters
|
||||
which is neither preceded nor followed by
|
||||
word characters.
|
||||
A word character is an
|
||||
.I alnum
|
||||
character (as defined by
|
||||
.IR ctype (3))
|
||||
or an underscore.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.PP
|
||||
In the event that an RE could match more than one substring of a given
|
||||
string,
|
||||
the RE matches the one starting earliest in the string.
|
||||
If the RE could match more than one substring starting at that point,
|
||||
it matches the longest.
|
||||
Subexpressions also match the longest possible substrings, subject to
|
||||
the constraint that the whole match be as long as possible,
|
||||
with subexpressions starting earlier in the RE taking priority over
|
||||
ones starting later.
|
||||
Note that higher-level subexpressions thus take priority over
|
||||
their lower-level component subexpressions.
|
||||
.PP
|
||||
Match lengths are measured in characters, not collating elements.
|
||||
A null string is considered longer than no match at all.
|
||||
For example,
|
||||
`bb*' matches the three middle characters of `abbbc',
|
||||
`(wee|week)(knights|nights)' matches all ten characters of `weeknights',
|
||||
when `(.*).*' is matched against `abc' the parenthesized subexpression
|
||||
matches all three characters, and
|
||||
when `(a*)*' is matched against `bc' both the whole RE and the parenthesized
|
||||
subexpression match the null string.
|
||||
.PP
|
||||
If case-independent matching is specified,
|
||||
the effect is much as if all case distinctions had vanished from the
|
||||
alphabet.
|
||||
When an alphabetic that exists in multiple cases appears as an
|
||||
ordinary character outside a bracket expression, it is effectively
|
||||
transformed into a bracket expression containing both cases,
|
||||
e.g. `x' becomes `[xX]'.
|
||||
When it appears inside a bracket expression, all case counterparts
|
||||
of it are added to the bracket expression, so that (e.g.) `[x]'
|
||||
becomes `[xX]' and `[^x]' becomes `[^xX]'.
|
||||
.PP
|
||||
No particular limit is imposed on the length of REs\(dg.
|
||||
Programs intended to be portable should not employ REs longer
|
||||
than 256 bytes,
|
||||
as an implementation can refuse to accept such REs and remain
|
||||
POSIX-compliant.
|
||||
.PP
|
||||
Obsolete (``basic'') regular expressions differ in several respects.
|
||||
`|', `+', and `?' are ordinary characters and there is no equivalent
|
||||
for their functionality.
|
||||
The delimiters for bounds are `\e{' and `\e}',
|
||||
with `{' and `}' by themselves ordinary characters.
|
||||
The parentheses for nested subexpressions are `\e(' and `\e)',
|
||||
with `(' and `)' by themselves ordinary characters.
|
||||
`^' is an ordinary character except at the beginning of the
|
||||
RE or\(dg the beginning of a parenthesized subexpression,
|
||||
`$' is an ordinary character except at the end of the
|
||||
RE or\(dg the end of a parenthesized subexpression,
|
||||
and `*' is an ordinary character if it appears at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression
|
||||
(after a possible leading `^').
|
||||
Finally, there is one new type of atom, a \fIback reference\fR:
|
||||
`\e' followed by a non-zero decimal digit \fId\fR
|
||||
matches the same sequence of characters
|
||||
matched by the \fId\fRth parenthesized subexpression
|
||||
(numbering subexpressions by the positions of their opening parentheses,
|
||||
left to right),
|
||||
so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'.
|
||||
.SH SEE ALSO
|
||||
regex(3)
|
||||
.PP
|
||||
POSIX 1003.2, section 2.8 (Regular Expression Notation).
|
||||
.SH BUGS
|
||||
Having two kinds of REs is a botch.
|
||||
.PP
|
||||
The current 1003.2 spec says that `)' is an ordinary character in
|
||||
the absence of an unmatched `(';
|
||||
this was an unintentional result of a wording error,
|
||||
and change is likely.
|
||||
Avoid relying on it.
|
||||
.PP
|
||||
Back references are a dreadful botch,
|
||||
posing major problems for efficient implementations.
|
||||
They are also somewhat vaguely defined
|
||||
(does
|
||||
`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?).
|
||||
Avoid using them.
|
||||
.PP
|
||||
1003.2's specification of case-independent matching is vague.
|
||||
The ``one case implies all cases'' definition given above
|
||||
is the current consensus among implementors as to the right interpretation.
|
||||
.PP
|
||||
The syntax for word boundaries is incredibly ugly.
|
970
src/backend/regex/re_syntax.n
Normal file
970
src/backend/regex/re_syntax.n
Normal file
@ -0,0 +1,970 @@
|
||||
'\"
|
||||
'\" Copyright (c) 1998 Sun Microsystems, Inc.
|
||||
'\" Copyright (c) 1999 Scriptics Corporation
|
||||
'\"
|
||||
'\" This software is copyrighted by the Regents of the University of
|
||||
'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
|
||||
'\" Corporation and other parties. The following terms apply to all files
|
||||
'\" associated with the software unless explicitly disclaimed in
|
||||
'\" individual files.
|
||||
'\"
|
||||
'\" The authors hereby grant permission to use, copy, modify, distribute,
|
||||
'\" and license this software and its documentation for any purpose, provided
|
||||
'\" that existing copyright notices are retained in all copies and that this
|
||||
'\" notice is included verbatim in any distributions. No written agreement,
|
||||
'\" license, or royalty fee is required for any of the authorized uses.
|
||||
'\" Modifications to this software may be copyrighted by their authors
|
||||
'\" and need not follow the licensing terms described here, provided that
|
||||
'\" the new terms are clearly indicated on the first page of each file where
|
||||
'\" they apply.
|
||||
'\"
|
||||
'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
|
||||
'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||
'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
|
||||
'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
|
||||
'\" POSSIBILITY OF SUCH DAMAGE.
|
||||
'\"
|
||||
'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
||||
'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
|
||||
'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
|
||||
'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
||||
'\" MODIFICATIONS.
|
||||
'\"
|
||||
'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
|
||||
'\" U.S. government, the Government shall have only "Restricted Rights"
|
||||
'\" in the software and related documentation as defined in the Federal
|
||||
'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
|
||||
'\" are acquiring the software on behalf of the Department of Defense, the
|
||||
'\" software shall be classified as "Commercial Computer Software" and the
|
||||
'\" Government shall have only "Restricted Rights" as defined in Clause
|
||||
'\" 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
|
||||
'\" authors grant the U.S. Government and others acting in its behalf
|
||||
'\" permission to use and distribute the software in accordance with the
|
||||
'\" terms specified in this license.
|
||||
'\"
|
||||
'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp
|
||||
'\"
|
||||
.so man.macros
|
||||
.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
|
||||
.BS
|
||||
.SH NAME
|
||||
re_syntax \- Syntax of Tcl regular expressions.
|
||||
.BE
|
||||
|
||||
.SH DESCRIPTION
|
||||
.PP
|
||||
A \fIregular expression\fR describes strings of characters.
|
||||
It's a pattern that matches certain strings and doesn't match others.
|
||||
|
||||
.SH "DIFFERENT FLAVORS OF REs"
|
||||
Regular expressions (``RE''s), as defined by POSIX, come in two
|
||||
flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
|
||||
EREs are roughly those of the traditional \fIegrep\fR, while BREs are
|
||||
roughly those of the traditional \fIed\fR. This implementation adds
|
||||
a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
|
||||
some significant extensions.
|
||||
.PP
|
||||
This manual page primarily describes AREs. BREs mostly exist for
|
||||
backward compatibility in some old programs; they will be discussed at
|
||||
the end. POSIX EREs are almost an exact subset of AREs. Features of
|
||||
AREs that are not present in EREs will be indicated.
|
||||
|
||||
.SH "REGULAR EXPRESSION SYNTAX"
|
||||
.PP
|
||||
Tcl regular expressions are implemented using the package written by
|
||||
Henry Spencer, based on the 1003.2 spec and some (not quite all) of
|
||||
the Perl5 extensions (thanks, Henry!). Much of the description of
|
||||
regular expressions below is copied verbatim from his manual entry.
|
||||
.PP
|
||||
An ARE is one or more \fIbranches\fR,
|
||||
separated by `\fB|\fR',
|
||||
matching anything that matches any of the branches.
|
||||
.PP
|
||||
A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
|
||||
concatenated.
|
||||
It matches a match for the first, followed by a match for the second, etc.;
|
||||
an empty branch matches the empty string.
|
||||
.PP
|
||||
A quantified atom is an \fIatom\fR possibly followed
|
||||
by a single \fIquantifier\fR.
|
||||
Without a quantifier, it matches a match for the atom.
|
||||
The quantifiers,
|
||||
and what a so-quantified atom matches, are:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB*\fR
|
||||
a sequence of 0 or more matches of the atom
|
||||
.TP
|
||||
\fB+\fR
|
||||
a sequence of 1 or more matches of the atom
|
||||
.TP
|
||||
\fB?\fR
|
||||
a sequence of 0 or 1 matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB}\fR
|
||||
a sequence of exactly \fIm\fR matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB,}\fR
|
||||
a sequence of \fIm\fR or more matches of the atom
|
||||
.TP
|
||||
\fB{\fIm\fB,\fIn\fB}\fR
|
||||
a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
|
||||
\fIm\fR may not exceed \fIn\fR
|
||||
.TP
|
||||
\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR
|
||||
\fInon-greedy\fR quantifiers,
|
||||
which match the same possibilities,
|
||||
but prefer the smallest number rather than the largest number
|
||||
of matches (see MATCHING)
|
||||
.RE
|
||||
.PP
|
||||
The forms using
|
||||
\fB{\fR and \fB}\fR
|
||||
are known as \fIbound\fRs.
|
||||
The numbers
|
||||
\fIm\fR and \fIn\fR are unsigned decimal integers
|
||||
with permissible values from 0 to 255 inclusive.
|
||||
.PP
|
||||
An atom is one of:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB(\fIre\fB)\fR
|
||||
(where \fIre\fR is any regular expression)
|
||||
matches a match for
|
||||
\fIre\fR, with the match noted for possible reporting
|
||||
.TP
|
||||
\fB(?:\fIre\fB)\fR
|
||||
as previous,
|
||||
but does no reporting
|
||||
(a ``non-capturing'' set of parentheses)
|
||||
.TP
|
||||
\fB()\fR
|
||||
matches an empty string,
|
||||
noted for possible reporting
|
||||
.TP
|
||||
\fB(?:)\fR
|
||||
matches an empty string,
|
||||
without reporting
|
||||
.TP
|
||||
\fB[\fIchars\fB]\fR
|
||||
a \fIbracket expression\fR,
|
||||
matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
|
||||
.TP
|
||||
\fB.\fR
|
||||
matches any single character
|
||||
.TP
|
||||
\fB\e\fIk\fR
|
||||
(where \fIk\fR is a non-alphanumeric character)
|
||||
matches that character taken as an ordinary character,
|
||||
e.g. \e\e matches a backslash character
|
||||
.TP
|
||||
\fB\e\fIc\fR
|
||||
where \fIc\fR is alphanumeric
|
||||
(possibly followed by other characters),
|
||||
an \fIescape\fR (AREs only),
|
||||
see ESCAPES below
|
||||
.TP
|
||||
\fB{\fR
|
||||
when followed by a character other than a digit,
|
||||
matches the left-brace character `\fB{\fR';
|
||||
when followed by a digit, it is the beginning of a
|
||||
\fIbound\fR (see above)
|
||||
.TP
|
||||
\fIx\fR
|
||||
where \fIx\fR is
|
||||
a single character with no other significance, matches that character.
|
||||
.RE
|
||||
.PP
|
||||
A \fIconstraint\fR matches an empty string when specific conditions
|
||||
are met.
|
||||
A constraint may not be followed by a quantifier.
|
||||
The simple constraints are as follows; some more constraints are
|
||||
described later, under ESCAPES.
|
||||
.RS 2
|
||||
.TP 8
|
||||
\fB^\fR
|
||||
matches at the beginning of a line
|
||||
.TP
|
||||
\fB$\fR
|
||||
matches at the end of a line
|
||||
.TP
|
||||
\fB(?=\fIre\fB)\fR
|
||||
\fIpositive lookahead\fR (AREs only), matches at any point
|
||||
where a substring matching \fIre\fR begins
|
||||
.TP
|
||||
\fB(?!\fIre\fB)\fR
|
||||
\fInegative lookahead\fR (AREs only), matches at any point
|
||||
where no substring matching \fIre\fR begins
|
||||
.RE
|
||||
.PP
|
||||
The lookahead constraints may not contain back references (see later),
|
||||
and all parentheses within them are considered non-capturing.
|
||||
.PP
|
||||
An RE may not end with `\fB\e\fR'.
|
||||
|
||||
.SH "BRACKET EXPRESSIONS"
|
||||
A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
|
||||
It normally matches any single character from the list (but see below).
|
||||
If the list begins with `\fB^\fR',
|
||||
it matches any single character
|
||||
(but see below) \fInot\fR from the rest of the list.
|
||||
.PP
|
||||
If two characters in the list are separated by `\fB\-\fR',
|
||||
this is shorthand
|
||||
for the full \fIrange\fR of characters between those two (inclusive) in the
|
||||
collating sequence,
|
||||
e.g.
|
||||
\fB[0\-9]\fR
|
||||
in ASCII matches any decimal digit.
|
||||
Two ranges may not share an
|
||||
endpoint, so e.g.
|
||||
\fBa\-c\-e\fR
|
||||
is illegal.
|
||||
Ranges are very collating-sequence-dependent,
|
||||
and portable programs should avoid relying on them.
|
||||
.PP
|
||||
To include a literal
|
||||
\fB]\fR
|
||||
or
|
||||
\fB\-\fR
|
||||
in the list,
|
||||
the simplest method is to
|
||||
enclose it in
|
||||
\fB[.\fR and \fB.]\fR
|
||||
to make it a collating element (see below).
|
||||
Alternatively,
|
||||
make it the first character
|
||||
(following a possible `\fB^\fR'),
|
||||
or (AREs only) precede it with `\fB\e\fR'.
|
||||
Alternatively, for `\fB\-\fR',
|
||||
make it the last character,
|
||||
or the second endpoint of a range.
|
||||
To use a literal
|
||||
\fB\-\fR
|
||||
as the first endpoint of a range,
|
||||
make it a collating element
|
||||
or (AREs only) precede it with `\fB\e\fR'.
|
||||
With the exception of these, some combinations using
|
||||
\fB[\fR
|
||||
(see next
|
||||
paragraphs), and escapes,
|
||||
all other special characters lose their
|
||||
special significance within a bracket expression.
|
||||
.PP
|
||||
Within a bracket expression, a collating element (a character,
|
||||
a multi-character sequence that collates as if it were a single character,
|
||||
or a collating-sequence name for either)
|
||||
enclosed in
|
||||
\fB[.\fR and \fB.]\fR
|
||||
stands for the
|
||||
sequence of characters of that collating element.
|
||||
The sequence is a single element of the bracket expression's list.
|
||||
A bracket expression in a locale that has
|
||||
multi-character collating elements
|
||||
can thus match more than one character.
|
||||
.VS 8.2
|
||||
So (insidiously), a bracket expression that starts with \fB^\fR
|
||||
can match multi-character collating elements even if none of them
|
||||
appear in the bracket expression!
|
||||
(\fINote:\fR Tcl currently has no multi-character collating elements.
|
||||
This information is only for illustration.)
|
||||
.PP
|
||||
For example, assume the collating sequence includes a \fBch\fR
|
||||
multi-character collating element.
|
||||
Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
|
||||
matches the first five characters of `\fBchchcc\fR'.
|
||||
Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
|
||||
(because \fB[^c]\fR matches the multi-character \fBch\fR).
|
||||
.VE 8.2
|
||||
.PP
|
||||
Within a bracket expression, a collating element enclosed in
|
||||
\fB[=\fR
|
||||
and
|
||||
\fB=]\fR
|
||||
is an equivalence class, standing for the sequences of characters
|
||||
of all collating elements equivalent to that one, including itself.
|
||||
(If there are no other equivalent collating elements,
|
||||
the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
|
||||
and `\fB.]\fR'.)
|
||||
For example, if
|
||||
\fBo\fR
|
||||
and
|
||||
\fB\o'o^'\fR
|
||||
are the members of an equivalence class,
|
||||
then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
|
||||
and `\fB[o\o'o^']\fR'\&
|
||||
are all synonymous.
|
||||
An equivalence class may not be an endpoint
|
||||
of a range.
|
||||
.VS 8.2
|
||||
(\fINote:\fR
|
||||
Tcl currently implements only the Unicode locale.
|
||||
It doesn't define any equivalence classes.
|
||||
The examples above are just illustrations.)
|
||||
.VE 8.2
|
||||
.PP
|
||||
Within a bracket expression, the name of a \fIcharacter class\fR enclosed
|
||||
in
|
||||
\fB[:\fR
|
||||
and
|
||||
\fB:]\fR
|
||||
stands for the list of all characters
|
||||
(not all collating elements!)
|
||||
belonging to that
|
||||
class.
|
||||
Standard character classes are:
|
||||
.PP
|
||||
.RS
|
||||
.ne 5
|
||||
.nf
|
||||
.ta 3c
|
||||
\fBalpha\fR A letter.
|
||||
\fBupper\fR An upper-case letter.
|
||||
\fBlower\fR A lower-case letter.
|
||||
\fBdigit\fR A decimal digit.
|
||||
\fBxdigit\fR A hexadecimal digit.
|
||||
\fBalnum\fR An alphanumeric (letter or digit).
|
||||
\fBprint\fR An alphanumeric (same as alnum).
|
||||
\fBblank\fR A space or tab character.
|
||||
\fBspace\fR A character producing white space in displayed text.
|
||||
\fBpunct\fR A punctuation character.
|
||||
\fBgraph\fR A character with a visible representation.
|
||||
\fBcntrl\fR A control character.
|
||||
.fi
|
||||
.RE
|
||||
.PP
|
||||
A locale may provide others.
|
||||
.VS 8.2
|
||||
(Note that the current Tcl implementation has only one locale:
|
||||
the Unicode locale.)
|
||||
.VE 8.2
|
||||
A character class may not be used as an endpoint of a range.
|
||||
.PP
|
||||
There are two special cases of bracket expressions:
|
||||
the bracket expressions
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
are constraints, matching empty strings at
|
||||
the beginning and end of a word respectively.
|
||||
'\" note, discussion of escapes below references this definition of word
|
||||
A word is defined as a sequence of
|
||||
word characters
|
||||
that is neither preceded nor followed by
|
||||
word characters.
|
||||
A word character is an
|
||||
\fIalnum\fR
|
||||
character
|
||||
or an underscore
|
||||
(\fB_\fR).
|
||||
These special bracket expressions are deprecated;
|
||||
users of AREs should use constraint escapes instead (see below).
|
||||
.SH ESCAPES
|
||||
Escapes (AREs only), which begin with a
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character,
|
||||
come in several varieties:
|
||||
character entry, class shorthands, constraint escapes, and back references.
|
||||
A
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character but not constituting
|
||||
a valid escape is illegal in AREs.
|
||||
In EREs, there are no escapes:
|
||||
outside a bracket expression,
|
||||
a
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character merely stands for that
|
||||
character as an ordinary character,
|
||||
and inside a bracket expression,
|
||||
\fB\e\fR
|
||||
is an ordinary character.
|
||||
(The latter is the one actual incompatibility between EREs and AREs.)
|
||||
.PP
|
||||
Character-entry escapes (AREs only) exist to make it easier to specify
|
||||
non-printing and otherwise inconvenient characters in REs:
|
||||
.RS 2
|
||||
.TP 5
|
||||
\fB\ea\fR
|
||||
alert (bell) character, as in C
|
||||
.TP
|
||||
\fB\eb\fR
|
||||
backspace, as in C
|
||||
.TP
|
||||
\fB\eB\fR
|
||||
synonym for
|
||||
\fB\e\fR
|
||||
to help reduce backslash doubling in some
|
||||
applications where there are multiple levels of backslash processing
|
||||
.TP
|
||||
\fB\ec\fIX\fR
|
||||
(where \fIX\fR is any character) the character whose
|
||||
low-order 5 bits are the same as those of
|
||||
\fIX\fR,
|
||||
and whose other bits are all zero
|
||||
.TP
|
||||
\fB\ee\fR
|
||||
the character whose collating-sequence name
|
||||
is `\fBESC\fR',
|
||||
or failing that, the character with octal value 033
|
||||
.TP
|
||||
\fB\ef\fR
|
||||
formfeed, as in C
|
||||
.TP
|
||||
\fB\en\fR
|
||||
newline, as in C
|
||||
.TP
|
||||
\fB\er\fR
|
||||
carriage return, as in C
|
||||
.TP
|
||||
\fB\et\fR
|
||||
horizontal tab, as in C
|
||||
.TP
|
||||
\fB\eu\fIwxyz\fR
|
||||
(where
|
||||
\fIwxyz\fR
|
||||
is exactly four hexadecimal digits)
|
||||
the Unicode character
|
||||
\fBU+\fIwxyz\fR
|
||||
in the local byte ordering
|
||||
.TP
|
||||
\fB\eU\fIstuvwxyz\fR
|
||||
(where
|
||||
\fIstuvwxyz\fR
|
||||
is exactly eight hexadecimal digits)
|
||||
reserved for a somewhat-hypothetical Unicode extension to 32 bits
|
||||
.TP
|
||||
\fB\ev\fR
|
||||
vertical tab, as in C
|
||||
'\" stray fragment "are all available." removed: it was a leftover that attached to no sentence
|
||||
.TP
|
||||
\fB\ex\fIhhh\fR
|
||||
(where
|
||||
\fIhhh\fR
|
||||
is any sequence of hexadecimal digits)
|
||||
the character whose hexadecimal value is
|
||||
\fB0x\fIhhh\fR
|
||||
(a single character no matter how many hexadecimal digits are used).
|
||||
.TP
|
||||
\fB\e0\fR
|
||||
the character whose value is
|
||||
\fB0\fR
|
||||
.TP
|
||||
\fB\e\fIxy\fR
|
||||
(where
|
||||
\fIxy\fR
|
||||
is exactly two octal digits,
|
||||
and is not a
|
||||
\fIback reference\fR (see below))
|
||||
the character whose octal value is
|
||||
\fB0\fIxy\fR
|
||||
.TP
|
||||
\fB\e\fIxyz\fR
|
||||
(where
|
||||
\fIxyz\fR
|
||||
is exactly three octal digits,
|
||||
and is not a
|
||||
back reference (see below))
|
||||
the character whose octal value is
|
||||
\fB0\fIxyz\fR
|
||||
.RE
|
||||
.PP
|
||||
Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
|
||||
and `\fBA\fR'-`\fBF\fR'.
|
||||
Octal digits are `\fB0\fR'-`\fB7\fR'.
|
||||
.PP
|
||||
The character-entry escapes are always taken as ordinary characters.
|
||||
For example,
|
||||
\fB\e135\fR
|
||||
is
|
||||
\fB]\fR
|
||||
in ASCII,
|
||||
but
|
||||
\fB\e135\fR
|
||||
does not terminate a bracket expression.
|
||||
Beware, however, that some applications (e.g., C compilers) interpret
|
||||
such sequences themselves before the regular-expression package
|
||||
gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
|
||||
.PP
|
||||
Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
|
||||
character classes:
|
||||
.RS 2
|
||||
.TP 10
|
||||
\fB\ed\fR
|
||||
\fB[[:digit:]]\fR
|
||||
.TP
|
||||
\fB\es\fR
|
||||
\fB[[:space:]]\fR
|
||||
.TP
|
||||
\fB\ew\fR
|
||||
\fB[[:alnum:]_]\fR
|
||||
(note underscore)
|
||||
.TP
|
||||
\fB\eD\fR
|
||||
\fB[^[:digit:]]\fR
|
||||
.TP
|
||||
\fB\eS\fR
|
||||
\fB[^[:space:]]\fR
|
||||
.TP
|
||||
\fB\eW\fR
|
||||
\fB[^[:alnum:]_]\fR
|
||||
(note underscore)
|
||||
.RE
|
||||
.PP
|
||||
Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
|
||||
and `\fB\ew\fR'\&
|
||||
lose their outer brackets,
|
||||
and `\fB\eD\fR', `\fB\eS\fR',
|
||||
and `\fB\eW\fR'\&
|
||||
are illegal.
|
||||
.VS 8.2
|
||||
(So, for example, \fB[a\-c\ed]\fR is equivalent to \fB[a\-c[:digit:]]\fR.
|
||||
Also, \fB[a\-c\eD]\fR, which is equivalent to \fB[a\-c^[:digit:]]\fR, is illegal.)
|
||||
.VE 8.2
|
||||
.PP
|
||||
A constraint escape (AREs only) is a constraint,
|
||||
matching the empty string if specific conditions are met,
|
||||
written as an escape:
|
||||
.RS 2
|
||||
.TP 6
|
||||
\fB\eA\fR
|
||||
matches only at the beginning of the string
|
||||
(see MATCHING, below, for how this differs from `\fB^\fR')
|
||||
.TP
|
||||
\fB\em\fR
|
||||
matches only at the beginning of a word
|
||||
.TP
|
||||
\fB\eM\fR
|
||||
matches only at the end of a word
|
||||
.TP
|
||||
\fB\ey\fR
|
||||
matches only at the beginning or end of a word
|
||||
.TP
|
||||
\fB\eY\fR
|
||||
matches only at a point that is not the beginning or end of a word
|
||||
.TP
|
||||
\fB\eZ\fR
|
||||
matches only at the end of the string
|
||||
(see MATCHING, below, for how this differs from `\fB$\fR')
|
||||
.TP
|
||||
\fB\e\fIm\fR
|
||||
(where
|
||||
\fIm\fR
|
||||
is a nonzero digit) a \fIback reference\fR, see below
|
||||
.TP
|
||||
\fB\e\fImnn\fR
|
||||
(where
|
||||
\fIm\fR
|
||||
is a nonzero digit, and
|
||||
\fInn\fR
|
||||
is some more digits,
|
||||
and the decimal value
|
||||
\fImnn\fR
|
||||
is not greater than the number of closing capturing parentheses seen so far)
|
||||
a \fIback reference\fR, see below
|
||||
.RE
|
||||
.PP
|
||||
A word is defined as in the specification of
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
above.
|
||||
Constraint escapes are illegal within bracket expressions.
|
||||
.PP
|
||||
A back reference (AREs only) matches the same string matched by the parenthesized
|
||||
subexpression specified by the number,
|
||||
so that (e.g.)
|
||||
\fB([bc])\e1\fR
|
||||
matches
|
||||
\fBbb\fR
|
||||
or
|
||||
\fBcc\fR
|
||||
but not `\fBbc\fR'.
|
||||
The subexpression must entirely precede the back reference in the RE.
|
||||
Subexpressions are numbered in the order of their leading parentheses.
|
||||
Non-capturing parentheses do not define subexpressions.
|
||||
.PP
|
||||
There is an inherent historical ambiguity between octal character-entry
|
||||
escapes and back references, which is resolved by heuristics,
|
||||
as hinted at above.
|
||||
A leading zero always indicates an octal escape.
|
||||
A single non-zero digit, not followed by another digit,
|
||||
is always taken as a back reference.
|
||||
A multi-digit sequence not starting with a zero is taken as a back
|
||||
reference if it comes after a suitable subexpression
|
||||
(i.e. the number is in the legal range for a back reference),
|
||||
and otherwise is taken as octal.
|
||||
.SH "METASYNTAX"
|
||||
In addition to the main syntax described above, there are some special
|
||||
forms and miscellaneous syntactic facilities available.
|
||||
.PP
|
||||
Normally the flavor of RE being used is specified by
|
||||
application-dependent means.
|
||||
However, this can be overridden by a \fIdirector\fR.
|
||||
If an RE of any flavor begins with `\fB***:\fR',
|
||||
the rest of the RE is an ARE.
|
||||
If an RE of any flavor begins with `\fB***=\fR',
|
||||
the rest of the RE is taken to be a literal string,
|
||||
with all characters considered ordinary characters.
|
||||
.PP
|
||||
An ARE may begin with \fIembedded options\fR:
|
||||
a sequence
|
||||
\fB(?\fIxyz\fB)\fR
|
||||
(where
|
||||
\fIxyz\fR
|
||||
is one or more alphabetic characters)
|
||||
specifies options affecting the rest of the RE.
|
||||
These supplement, and can override,
|
||||
any options specified by the application.
|
||||
The available option letters are:
|
||||
.RS 2
|
||||
.TP 3
|
||||
\fBb\fR
|
||||
rest of RE is a BRE
|
||||
.TP 3
|
||||
\fBc\fR
|
||||
case-sensitive matching (usual default)
|
||||
.TP 3
|
||||
\fBe\fR
|
||||
rest of RE is an ERE
|
||||
.TP 3
|
||||
\fBi\fR
|
||||
case-insensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBm\fR
|
||||
historical synonym for
|
||||
\fBn\fR
|
||||
.TP 3
|
||||
\fBn\fR
|
||||
newline-sensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBp\fR
|
||||
partial newline-sensitive matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBq\fR
|
||||
rest of RE is a literal (``quoted'') string, all ordinary characters
|
||||
.TP 3
|
||||
\fBs\fR
|
||||
non-newline-sensitive matching (usual default)
|
||||
.TP 3
|
||||
\fBt\fR
|
||||
tight syntax (usual default; see below)
|
||||
.TP 3
|
||||
\fBw\fR
|
||||
inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
|
||||
.TP 3
|
||||
\fBx\fR
|
||||
expanded syntax (see below)
|
||||
.RE
|
||||
.PP
|
||||
Embedded options take effect at the
|
||||
\fB)\fR
|
||||
terminating the sequence.
|
||||
They are available only at the start of an ARE,
|
||||
and may not be used later within it.
|
||||
.PP
|
||||
In addition to the usual (\fItight\fR) RE syntax, in which all characters are
|
||||
significant, there is an \fIexpanded\fR syntax,
|
||||
available in all flavors of RE
|
||||
with the \fB\-expanded\fR switch, or in AREs with the embedded \fBx\fR option.
|
||||
In the expanded syntax,
|
||||
white-space characters are ignored
|
||||
and all characters between a
|
||||
\fB#\fR
|
||||
and the following newline (or the end of the RE) are ignored,
|
||||
permitting paragraphing and commenting a complex RE.
|
||||
There are three exceptions to that basic rule:
|
||||
.RS 2
|
||||
.PP
|
||||
a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
|
||||
.PP
|
||||
white space or `\fB#\fR' within a bracket expression is retained
|
||||
.PP
|
||||
white space and comments are illegal within multi-character symbols
|
||||
like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
|
||||
.RE
|
||||
.PP
|
||||
Expanded-syntax white-space characters are blank, tab, newline, and
|
||||
.VS 8.2
|
||||
any character that belongs to the \fIspace\fR character class.
|
||||
.VE 8.2
|
||||
.PP
|
||||
Finally, in an ARE,
|
||||
outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
|
||||
(where
|
||||
\fIttt\fR
|
||||
is any text not containing a `\fB)\fR')
|
||||
is a comment,
|
||||
completely ignored.
|
||||
Again, this is not allowed between the characters of
|
||||
multi-character symbols like `\fB(?:\fR'.
|
||||
Such comments are more a historical artifact than a useful facility,
|
||||
and their use is deprecated;
|
||||
use the expanded syntax instead.
|
||||
.PP
|
||||
\fINone\fR of these metasyntax extensions is available if the application
|
||||
(or an initial
|
||||
\fB***=\fR
|
||||
director)
|
||||
has specified that the user's input be treated as a literal string
|
||||
rather than as an RE.
|
||||
.SH MATCHING
|
||||
In the event that an RE could match more than one substring of a given
|
||||
string,
|
||||
the RE matches the one starting earliest in the string.
|
||||
If the RE could match more than one substring starting at that point,
|
||||
its choice is determined by its \fIpreference\fR:
|
||||
either the longest substring, or the shortest.
|
||||
.PP
|
||||
Most atoms, and all constraints, have no preference.
|
||||
A parenthesized RE has the same preference (possibly none) as the RE.
|
||||
A quantified atom with quantifier
|
||||
\fB{\fIm\fB}\fR
|
||||
or
|
||||
\fB{\fIm\fB}?\fR
|
||||
has the same preference (possibly none) as the atom itself.
|
||||
A quantified atom with other normal quantifiers (including
|
||||
\fB{\fIm\fB,\fIn\fB}\fR
|
||||
with
|
||||
\fIm\fR
|
||||
equal to
|
||||
\fIn\fR)
|
||||
prefers longest match.
|
||||
A quantified atom with other non-greedy quantifiers (including
|
||||
\fB{\fIm\fB,\fIn\fB}?\fR
|
||||
with
|
||||
\fIm\fR
|
||||
equal to
|
||||
\fIn\fR)
|
||||
prefers shortest match.
|
||||
A branch has the same preference as the first quantified atom in it
|
||||
which has a preference.
|
||||
An RE consisting of two or more branches connected by the
|
||||
\fB|\fR
|
||||
operator prefers longest match.
|
||||
.PP
|
||||
Subject to the constraints imposed by the rules for matching the whole RE,
|
||||
subexpressions also match the longest or shortest possible substrings,
|
||||
based on their preferences,
|
||||
with subexpressions starting earlier in the RE taking priority over
|
||||
ones starting later.
|
||||
Note that outer subexpressions thus take priority over
|
||||
their component subexpressions.
|
||||
.PP
|
||||
Note that the quantifiers
|
||||
\fB{1,1}\fR
|
||||
and
|
||||
\fB{1,1}?\fR
|
||||
can be used to force longest and shortest preference, respectively,
|
||||
on a subexpression or a whole RE.
|
||||
.PP
|
||||
Match lengths are measured in characters, not collating elements.
|
||||
An empty string is considered longer than no match at all.
|
||||
For example,
|
||||
\fBbb*\fR
|
||||
matches the three middle characters of `\fBabbbc\fR',
|
||||
\fB(week|wee)(night|knights)\fR
|
||||
matches all ten characters of `\fBweeknights\fR',
|
||||
when
|
||||
\fB(.*).*\fR
|
||||
is matched against
|
||||
\fBabc\fR
|
||||
the parenthesized subexpression
|
||||
matches all three characters, and
|
||||
when
|
||||
\fB(a*)*\fR
|
||||
is matched against
|
||||
\fBbc\fR
|
||||
both the whole RE and the parenthesized
|
||||
subexpression match an empty string.
|
||||
.PP
|
||||
If case-independent matching is specified,
|
||||
the effect is much as if all case distinctions had vanished from the
|
||||
alphabet.
|
||||
When an alphabetic that exists in multiple cases appears as an
|
||||
ordinary character outside a bracket expression, it is effectively
|
||||
transformed into a bracket expression containing both cases,
|
||||
so that
|
||||
\fBx\fR
|
||||
becomes `\fB[xX]\fR'.
|
||||
When it appears inside a bracket expression, all case counterparts
|
||||
of it are added to the bracket expression, so that
|
||||
\fB[x]\fR
|
||||
becomes
|
||||
\fB[xX]\fR
|
||||
and
|
||||
\fB[^x]\fR
|
||||
becomes `\fB[^xX]\fR'.
|
||||
.PP
|
||||
If newline-sensitive matching is specified, \fB.\fR
|
||||
and bracket expressions using
|
||||
\fB^\fR
|
||||
will never match the newline character
|
||||
(so that matches will never cross newlines unless the RE
|
||||
explicitly arranges it)
|
||||
and
|
||||
\fB^\fR
|
||||
and
|
||||
\fB$\fR
|
||||
will match the empty string after and before a newline
|
||||
respectively, in addition to matching at beginning and end of string
|
||||
respectively.
|
||||
ARE
|
||||
\fB\eA\fR
|
||||
and
|
||||
\fB\eZ\fR
|
||||
continue to match beginning or end of string \fIonly\fR.
|
||||
.PP
|
||||
If partial newline-sensitive matching is specified,
|
||||
this affects \fB.\fR
|
||||
and bracket expressions
|
||||
as with newline-sensitive matching, but not
|
||||
\fB^\fR
|
||||
and `\fB$\fR'.
|
||||
.PP
|
||||
If inverse partial newline-sensitive matching is specified,
|
||||
this affects
|
||||
\fB^\fR
|
||||
and
|
||||
\fB$\fR
|
||||
as with
|
||||
newline-sensitive matching,
|
||||
but not \fB.\fR
|
||||
and bracket expressions.
|
||||
This isn't very useful but is provided for symmetry.
|
||||
.SH "LIMITS AND COMPATIBILITY"
|
||||
No particular limit is imposed on the length of REs.
|
||||
Programs intended to be highly portable should not employ REs longer
|
||||
than 256 bytes,
|
||||
as a POSIX-compliant implementation can refuse to accept such REs.
|
||||
.PP
|
||||
The only feature of AREs that is actually incompatible with
|
||||
POSIX EREs is that
|
||||
\fB\e\fR
|
||||
does not lose its special
|
||||
significance inside bracket expressions.
|
||||
All other ARE features use syntax which is illegal or has
|
||||
undefined or unspecified effects in POSIX EREs;
|
||||
the
|
||||
\fB***\fR
|
||||
syntax of directors likewise is outside the POSIX
|
||||
syntax for both BREs and EREs.
|
||||
.PP
|
||||
Many of the ARE extensions are borrowed from Perl, but some have
|
||||
been changed to clean them up, and a few Perl extensions are not present.
|
||||
Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
|
||||
the lack of special treatment for a trailing newline,
|
||||
the addition of complemented bracket expressions to the things
|
||||
affected by newline-sensitive matching,
|
||||
the restrictions on parentheses and back references in lookahead constraints,
|
||||
and the longest/shortest-match (rather than first-match) matching semantics.
|
||||
.PP
|
||||
The matching rules for REs containing both normal and non-greedy quantifiers
|
||||
have changed since early beta-test versions of this package.
|
||||
(The new rules are much simpler and cleaner,
|
||||
but don't work as hard at guessing the user's real intentions.)
|
||||
.PP
|
||||
Henry Spencer's original 1986 \fIregexp\fR package,
|
||||
still in widespread use (e.g., in pre-8.1 releases of Tcl),
|
||||
implemented an early version of today's EREs.
|
||||
There are four incompatibilities between \fIregexp\fR's near-EREs
|
||||
(`RREs' for short) and AREs.
|
||||
In roughly increasing order of significance:
|
||||
.PP
|
||||
.RS
|
||||
In AREs,
|
||||
\fB\e\fR
|
||||
followed by an alphanumeric character is either an
|
||||
escape or an error,
|
||||
while in RREs, it was just another way of writing the
|
||||
alphanumeric.
|
||||
This should not be a problem because there was no reason to write
|
||||
such a sequence in RREs.
|
||||
.PP
|
||||
\fB{\fR
|
||||
followed by a digit in an ARE is the beginning of a bound,
|
||||
while in RREs,
|
||||
\fB{\fR
|
||||
was always an ordinary character.
|
||||
Such sequences should be rare,
|
||||
and will often result in an error because following characters
|
||||
will not look like a valid bound.
|
||||
.PP
|
||||
In AREs,
|
||||
\fB\e\fR
|
||||
remains a special character within `\fB[\|]\fR',
|
||||
so a literal
|
||||
\fB\e\fR
|
||||
within
|
||||
\fB[\|]\fR
|
||||
must be written `\fB\e\e\fR'.
|
||||
\fB\e\e\fR
|
||||
also gives a literal
|
||||
\fB\e\fR
|
||||
within
|
||||
\fB[\|]\fR
|
||||
in RREs,
|
||||
but only truly paranoid programmers routinely doubled the backslash.
|
||||
.PP
|
||||
AREs report the longest/shortest match for the RE,
|
||||
rather than the first found in a specified search order.
|
||||
This may affect some RREs which were written in the expectation that
|
||||
the first match would be reported.
|
||||
(The careful crafting of RREs to optimize the search order for fast
|
||||
matching is obsolete (AREs examine all possible matches
|
||||
in parallel, and their performance is largely insensitive to their
|
||||
complexity) but cases where the search order was exploited to deliberately
|
||||
find a match which was \fInot\fR the longest/shortest will need rewriting.)
|
||||
.RE
|
||||
|
||||
.SH "BASIC REGULAR EXPRESSIONS"
|
||||
BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR',
|
||||
and
|
||||
\fB?\fR
|
||||
are ordinary characters and there is no equivalent
|
||||
for their functionality.
|
||||
The delimiters for bounds are
|
||||
\fB\e{\fR
|
||||
and `\fB\e}\fR',
|
||||
with
|
||||
\fB{\fR
|
||||
and
|
||||
\fB}\fR
|
||||
by themselves ordinary characters.
|
||||
The parentheses for nested subexpressions are
|
||||
\fB\e(\fR
|
||||
and `\fB\e)\fR',
|
||||
with
|
||||
\fB(\fR
|
||||
and
|
||||
\fB)\fR
|
||||
by themselves ordinary characters.
|
||||
\fB^\fR
|
||||
is an ordinary character except at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression,
|
||||
\fB$\fR
|
||||
is an ordinary character except at the end of the
|
||||
RE or the end of a parenthesized subexpression,
|
||||
and
|
||||
\fB*\fR
|
||||
is an ordinary character if it appears at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression
|
||||
(after a possible leading `\fB^\fR').
|
||||
Finally,
|
||||
single-digit back references are available,
|
||||
and
|
||||
\fB\e<\fR
|
||||
and
|
||||
\fB\e>\fR
|
||||
are synonyms for
|
||||
\fB[[:<:]]\fR
|
||||
and
|
||||
\fB[[:>:]]\fR
|
||||
respectively;
|
||||
no other escapes are available.
|
||||
|
||||
.SH "SEE ALSO"
|
||||
RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
|
||||
|
||||
.SH KEYWORDS
|
||||
match, regular expression, string
|
728
src/backend/regex/regc_color.c
Normal file
728
src/backend/regex/regc_color.c
Normal file
@ -0,0 +1,728 @@
|
||||
/*
|
||||
* colorings of characters
|
||||
* This file is #included by regcomp.c.
|
||||
*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/regc_color.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
|
||||
*
|
||||
*
|
||||
* Note that there are some incestuous relationships between this code and
|
||||
* NFA arc maintenance, which perhaps ought to be cleaned up sometime.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* CISERR - shorthand for VISERR on the vars owning the colormap; needs a local named cm in scope */
#define CISERR() VISERR(cm->v)
|
||||
/* CERR - shorthand for VERR(e) on the vars owning the colormap; needs a local named cm in scope */
#define CERR(e) VERR(cm->v, (e))
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* initcm - set up new colormap
|
||||
*/
|
||||
static void
|
||||
initcm(struct vars *v,
|
||||
struct colormap *cm)
|
||||
{
|
||||
int i;
|
||||
int j;
|
||||
union tree *t;
|
||||
union tree *nextt;
|
||||
struct colordesc *cd;
|
||||
|
||||
cm->magic = CMMAGIC;
|
||||
cm->v = v;
|
||||
|
||||
cm->ncds = NINLINECDS;
|
||||
cm->cd = cm->cdspace;
|
||||
cm->max = 0;
|
||||
cm->free = 0;
|
||||
|
||||
cd = cm->cd; /* cm->cd[WHITE] */
|
||||
cd->sub = NOSUB;
|
||||
cd->arcs = NULL;
|
||||
cd->flags = 0;
|
||||
cd->nchrs = CHR_MAX - CHR_MIN + 1;
|
||||
|
||||
/* upper levels of tree */
|
||||
for (t = &cm->tree[0], j = NBYTS-1; j > 0; t = nextt, j--) {
|
||||
nextt = t + 1;
|
||||
for (i = BYTTAB-1; i >= 0; i--)
|
||||
t->tptr[i] = nextt;
|
||||
}
|
||||
/* bottom level is solid white */
|
||||
t = &cm->tree[NBYTS-1];
|
||||
for (i = BYTTAB-1; i >= 0; i--)
|
||||
t->tcolor[i] = WHITE;
|
||||
cd->block = t;
|
||||
}
|
||||
|
||||
/*
|
||||
* freecm - free dynamically-allocated things in a colormap
|
||||
*/
|
||||
static void
|
||||
freecm(struct colormap *cm)
|
||||
{
|
||||
size_t i;
|
||||
union tree *cb;
|
||||
|
||||
cm->magic = 0;
|
||||
if (NBYTS > 1)
|
||||
cmtreefree(cm, cm->tree, 0);
|
||||
for (i = 1; i <= cm->max; i++) /* skip WHITE */
|
||||
if (!UNUSEDCOLOR(&cm->cd[i])) {
|
||||
cb = cm->cd[i].block;
|
||||
if (cb != NULL)
|
||||
FREE(cb);
|
||||
}
|
||||
if (cm->cd != cm->cdspace)
|
||||
FREE(cm->cd);
|
||||
}
|
||||
|
||||
/*
|
||||
* cmtreefree - free a non-terminal part of a colormap tree
|
||||
*/
|
||||
static void
|
||||
cmtreefree(struct colormap *cm,
|
||||
union tree *tree,
|
||||
int level) /* level number (top == 0) of this block */
|
||||
{
|
||||
int i;
|
||||
union tree *t;
|
||||
union tree *fillt = &cm->tree[level+1];
|
||||
union tree *cb;
|
||||
|
||||
assert(level < NBYTS-1); /* this level has pointers */
|
||||
for (i = BYTTAB-1; i >= 0; i--) {
|
||||
t = tree->tptr[i];
|
||||
assert(t != NULL);
|
||||
if (t != fillt) {
|
||||
if (level < NBYTS-2) { /* more pointer blocks below */
|
||||
cmtreefree(cm, t, level+1);
|
||||
FREE(t);
|
||||
} else { /* color block below */
|
||||
cb = cm->cd[t->tcolor[0]].block;
|
||||
if (t != cb) /* not a solid block */
|
||||
FREE(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* setcolor - set the color of a character in a colormap
|
||||
*/
|
||||
static color /* previous color */
|
||||
setcolor(struct colormap *cm,
|
||||
chr c,
|
||||
pcolor co)
|
||||
{
|
||||
uchr uc = c;
|
||||
int shift;
|
||||
int level;
|
||||
int b;
|
||||
int bottom;
|
||||
union tree *t;
|
||||
union tree *newt;
|
||||
union tree *fillt;
|
||||
union tree *lastt;
|
||||
union tree *cb;
|
||||
color prev;
|
||||
|
||||
assert(cm->magic == CMMAGIC);
|
||||
if (CISERR() || co == COLORLESS)
|
||||
return COLORLESS;
|
||||
|
||||
t = cm->tree;
|
||||
for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
|
||||
level++, shift -= BYTBITS) {
|
||||
b = (uc >> shift) & BYTMASK;
|
||||
lastt = t;
|
||||
t = lastt->tptr[b];
|
||||
assert(t != NULL);
|
||||
fillt = &cm->tree[level+1];
|
||||
bottom = (shift <= BYTBITS) ? 1 : 0;
|
||||
cb = (bottom) ? cm->cd[t->tcolor[0]].block : fillt;
|
||||
if (t == fillt || t == cb) { /* must allocate a new block */
|
||||
newt = (union tree *)MALLOC((bottom) ?
|
||||
sizeof(struct colors) : sizeof(struct ptrs));
|
||||
if (newt == NULL) {
|
||||
CERR(REG_ESPACE);
|
||||
return COLORLESS;
|
||||
}
|
||||
if (bottom)
|
||||
memcpy(VS(newt->tcolor), VS(t->tcolor),
|
||||
BYTTAB*sizeof(color));
|
||||
else
|
||||
memcpy(VS(newt->tptr), VS(t->tptr),
|
||||
BYTTAB*sizeof(union tree *));
|
||||
t = newt;
|
||||
lastt->tptr[b] = t;
|
||||
}
|
||||
}
|
||||
|
||||
b = uc & BYTMASK;
|
||||
prev = t->tcolor[b];
|
||||
t->tcolor[b] = (color)co;
|
||||
return prev;
|
||||
}
|
||||
|
||||
/*
|
||||
* maxcolor - report largest color number in use
|
||||
*/
|
||||
static color
|
||||
maxcolor(struct colormap *cm)
|
||||
{
|
||||
if (CISERR())
|
||||
return COLORLESS;
|
||||
|
||||
return (color)cm->max;
|
||||
}
|
||||
|
||||
/*
|
||||
* newcolor - find a new color (must be subject of setcolor at once)
|
||||
* Beware: may relocate the colordescs.
|
||||
*/
|
||||
static color /* COLORLESS for error */
|
||||
newcolor(struct colormap *cm)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *new;
|
||||
size_t n;
|
||||
|
||||
if (CISERR())
|
||||
return COLORLESS;
|
||||
|
||||
if (cm->free != 0) {
|
||||
assert(cm->free > 0);
|
||||
assert((size_t)cm->free < cm->ncds);
|
||||
cd = &cm->cd[cm->free];
|
||||
assert(UNUSEDCOLOR(cd));
|
||||
assert(cd->arcs == NULL);
|
||||
cm->free = cd->sub;
|
||||
} else if (cm->max < cm->ncds - 1) {
|
||||
cm->max++;
|
||||
cd = &cm->cd[cm->max];
|
||||
} else {
|
||||
/* oops, must allocate more */
|
||||
n = cm->ncds * 2;
|
||||
if (cm->cd == cm->cdspace) {
|
||||
new = (struct colordesc *)MALLOC(n *
|
||||
sizeof(struct colordesc));
|
||||
if (new != NULL)
|
||||
memcpy(VS(new), VS(cm->cdspace), cm->ncds *
|
||||
sizeof(struct colordesc));
|
||||
} else
|
||||
new = (struct colordesc *)REALLOC(cm->cd,
|
||||
n * sizeof(struct colordesc));
|
||||
if (new == NULL) {
|
||||
CERR(REG_ESPACE);
|
||||
return COLORLESS;
|
||||
}
|
||||
cm->cd = new;
|
||||
cm->ncds = n;
|
||||
assert(cm->max < cm->ncds - 1);
|
||||
cm->max++;
|
||||
cd = &cm->cd[cm->max];
|
||||
}
|
||||
|
||||
cd->nchrs = 0;
|
||||
cd->sub = NOSUB;
|
||||
cd->arcs = NULL;
|
||||
cd->flags = 0;
|
||||
cd->block = NULL;
|
||||
|
||||
return (color)(cd - cm->cd);
|
||||
}
|
||||
|
||||
/*
|
||||
* freecolor - free a color (must have no arcs or subcolor)
|
||||
*/
|
||||
static void
|
||||
freecolor(struct colormap *cm,
|
||||
pcolor co)
|
||||
{
|
||||
struct colordesc *cd = &cm->cd[co];
|
||||
color pco, nco; /* for freelist scan */
|
||||
|
||||
assert(co >= 0);
|
||||
if (co == WHITE)
|
||||
return;
|
||||
|
||||
assert(cd->arcs == NULL);
|
||||
assert(cd->sub == NOSUB);
|
||||
assert(cd->nchrs == 0);
|
||||
cd->flags = FREECOL;
|
||||
if (cd->block != NULL) {
|
||||
FREE(cd->block);
|
||||
cd->block = NULL; /* just paranoia */
|
||||
}
|
||||
|
||||
if ((size_t)co == cm->max) {
|
||||
while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
|
||||
cm->max--;
|
||||
assert(cm->free >= 0);
|
||||
while ((size_t)cm->free > cm->max)
|
||||
cm->free = cm->cd[cm->free].sub;
|
||||
if (cm->free > 0) {
|
||||
assert(cm->free < cm->max);
|
||||
pco = cm->free;
|
||||
nco = cm->cd[pco].sub;
|
||||
while (nco > 0)
|
||||
if ((size_t)nco > cm->max) {
|
||||
/* take this one out of freelist */
|
||||
nco = cm->cd[nco].sub;
|
||||
cm->cd[pco].sub = nco;
|
||||
} else {
|
||||
assert(nco < cm->max);
|
||||
pco = nco;
|
||||
nco = cm->cd[pco].sub;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cd->sub = cm->free;
|
||||
cm->free = (color)(cd - cm->cd);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pseudocolor - allocate a false color, to be managed by other means
|
||||
*/
|
||||
static color
|
||||
pseudocolor(struct colormap *cm)
|
||||
{
|
||||
color co;
|
||||
|
||||
co = newcolor(cm);
|
||||
if (CISERR())
|
||||
return COLORLESS;
|
||||
cm->cd[co].nchrs = 1;
|
||||
cm->cd[co].flags = PSEUDO;
|
||||
return co;
|
||||
}
|
||||
|
||||
/*
|
||||
* subcolor - allocate a new subcolor (if necessary) to this chr
|
||||
*/
|
||||
static color
|
||||
subcolor(struct colormap *cm, chr c)
|
||||
{
|
||||
color co; /* current color of c */
|
||||
color sco; /* new subcolor */
|
||||
|
||||
co = GETCOLOR(cm, c);
|
||||
sco = newsub(cm, co);
|
||||
if (CISERR())
|
||||
return COLORLESS;
|
||||
assert(sco != COLORLESS);
|
||||
|
||||
if (co == sco) /* already in an open subcolor */
|
||||
return co; /* rest is redundant */
|
||||
cm->cd[co].nchrs--;
|
||||
cm->cd[sco].nchrs++;
|
||||
setcolor(cm, c, sco);
|
||||
return sco;
|
||||
}
|
||||
|
||||
/*
|
||||
* newsub - allocate a new subcolor (if necessary) for a color
|
||||
*/
|
||||
static color
|
||||
newsub(struct colormap *cm,
|
||||
pcolor co)
|
||||
{
|
||||
color sco; /* new subcolor */
|
||||
|
||||
sco = cm->cd[co].sub;
|
||||
if (sco == NOSUB) { /* color has no open subcolor */
|
||||
if (cm->cd[co].nchrs == 1) /* optimization */
|
||||
return co;
|
||||
sco = newcolor(cm); /* must create subcolor */
|
||||
if (sco == COLORLESS) {
|
||||
assert(CISERR());
|
||||
return COLORLESS;
|
||||
}
|
||||
cm->cd[co].sub = sco;
|
||||
cm->cd[sco].sub = sco; /* open subcolor points to self */
|
||||
}
|
||||
assert(sco != NOSUB);
|
||||
|
||||
return sco;
|
||||
}
|
||||
|
||||
/*
|
||||
* subrange - allocate new subcolors to this range of chrs, fill in arcs
|
||||
*/
|
||||
static void
|
||||
subrange(struct vars *v,
|
||||
chr from,
|
||||
chr to,
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
uchr uf;
|
||||
int i;
|
||||
|
||||
assert(from <= to);
|
||||
|
||||
/* first, align "from" on a tree-block boundary */
|
||||
uf = (uchr)from;
|
||||
i = (int)( ((uf + BYTTAB-1) & (uchr)~BYTMASK) - uf );
|
||||
for (; from <= to && i > 0; i--, from++)
|
||||
newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
|
||||
if (from > to) /* didn't reach a boundary */
|
||||
return;
|
||||
|
||||
/* deal with whole blocks */
|
||||
for (; to - from >= BYTTAB; from += BYTTAB)
|
||||
subblock(v, from, lp, rp);
|
||||
|
||||
/* clean up any remaining partial table */
|
||||
for (; from <= to; from++)
|
||||
newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
|
||||
}
|
||||
|
||||
/*
|
||||
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
|
||||
*/
|
||||
static void
|
||||
subblock(struct vars *v,
|
||||
chr start, /* first of BYTTAB chrs */
|
||||
struct state *lp,
|
||||
struct state *rp)
|
||||
{
|
||||
uchr uc = start;
|
||||
struct colormap *cm = v->cm;
|
||||
int shift;
|
||||
int level;
|
||||
int i;
|
||||
int b;
|
||||
union tree *t;
|
||||
union tree *cb;
|
||||
union tree *fillt;
|
||||
union tree *lastt;
|
||||
int previ;
|
||||
int ndone;
|
||||
color co;
|
||||
color sco;
|
||||
|
||||
assert((uc % BYTTAB) == 0);
|
||||
|
||||
/* find its color block, making new pointer blocks as needed */
|
||||
t = cm->tree;
|
||||
fillt = NULL;
|
||||
for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
|
||||
level++, shift -= BYTBITS) {
|
||||
b = (uc >> shift) & BYTMASK;
|
||||
lastt = t;
|
||||
t = lastt->tptr[b];
|
||||
assert(t != NULL);
|
||||
fillt = &cm->tree[level+1];
|
||||
if (t == fillt && shift > BYTBITS) { /* need new ptr block */
|
||||
t = (union tree *)MALLOC(sizeof(struct ptrs));
|
||||
if (t == NULL) {
|
||||
CERR(REG_ESPACE);
|
||||
return;
|
||||
}
|
||||
memcpy(VS(t->tptr), VS(fillt->tptr),
|
||||
BYTTAB*sizeof(union tree *));
|
||||
lastt->tptr[b] = t;
|
||||
}
|
||||
}
|
||||
|
||||
/* special cases: fill block or solid block */
|
||||
co = t->tcolor[0];
|
||||
cb = cm->cd[co].block;
|
||||
if (t == fillt || t == cb) {
|
||||
/* either way, we want a subcolor solid block */
|
||||
sco = newsub(cm, co);
|
||||
t = cm->cd[sco].block;
|
||||
if (t == NULL) { /* must set it up */
|
||||
t = (union tree *)MALLOC(sizeof(struct colors));
|
||||
if (t == NULL) {
|
||||
CERR(REG_ESPACE);
|
||||
return;
|
||||
}
|
||||
for (i = 0; i < BYTTAB; i++)
|
||||
t->tcolor[i] = sco;
|
||||
cm->cd[sco].block = t;
|
||||
}
|
||||
/* find loop must have run at least once */
|
||||
lastt->tptr[b] = t;
|
||||
newarc(v->nfa, PLAIN, sco, lp, rp);
|
||||
cm->cd[co].nchrs -= BYTTAB;
|
||||
cm->cd[sco].nchrs += BYTTAB;
|
||||
return;
|
||||
}
|
||||
|
||||
/* general case, a mixed block to be altered */
|
||||
i = 0;
|
||||
while (i < BYTTAB) {
|
||||
co = t->tcolor[i];
|
||||
sco = newsub(cm, co);
|
||||
newarc(v->nfa, PLAIN, sco, lp, rp);
|
||||
previ = i;
|
||||
do {
|
||||
t->tcolor[i++] = sco;
|
||||
} while (i < BYTTAB && t->tcolor[i] == co);
|
||||
ndone = i - previ;
|
||||
cm->cd[co].nchrs -= ndone;
|
||||
cm->cd[sco].nchrs += ndone;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* okcolors - promote subcolors to full colors
|
||||
*/
|
||||
static void
|
||||
okcolors(struct nfa *nfa,
|
||||
struct colormap *cm)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end = CDEND(cm);
|
||||
struct colordesc *scd;
|
||||
struct arc *a;
|
||||
color co;
|
||||
color sco;
|
||||
|
||||
for (cd = cm->cd, co = 0; cd < end; cd++, co++) {
|
||||
sco = cd->sub;
|
||||
if (UNUSEDCOLOR(cd) || sco == NOSUB) {
|
||||
/* has no subcolor, no further action */
|
||||
} else if (sco == co) {
|
||||
/* is subcolor, let parent deal with it */
|
||||
} else if (cd->nchrs == 0) {
|
||||
/* parent empty, its arcs change color to subcolor */
|
||||
cd->sub = NOSUB;
|
||||
scd = &cm->cd[sco];
|
||||
assert(scd->nchrs > 0);
|
||||
assert(scd->sub == sco);
|
||||
scd->sub = NOSUB;
|
||||
while ((a = cd->arcs) != NULL) {
|
||||
assert(a->co == co);
|
||||
/* uncolorchain(cm, a); */
|
||||
cd->arcs = a->colorchain;
|
||||
a->co = sco;
|
||||
/* colorchain(cm, a); */
|
||||
a->colorchain = scd->arcs;
|
||||
scd->arcs = a;
|
||||
}
|
||||
freecolor(cm, co);
|
||||
} else {
|
||||
/* parent's arcs must gain parallel subcolor arcs */
|
||||
cd->sub = NOSUB;
|
||||
scd = &cm->cd[sco];
|
||||
assert(scd->nchrs > 0);
|
||||
assert(scd->sub == sco);
|
||||
scd->sub = NOSUB;
|
||||
for (a = cd->arcs; a != NULL; a = a->colorchain) {
|
||||
assert(a->co == co);
|
||||
newarc(nfa, a->type, sco, a->from, a->to);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* colorchain - add this arc to the color chain of its color
|
||||
*/
|
||||
static void
|
||||
colorchain(struct colormap *cm,
|
||||
struct arc *a)
|
||||
{
|
||||
struct colordesc *cd = &cm->cd[a->co];
|
||||
|
||||
a->colorchain = cd->arcs;
|
||||
cd->arcs = a;
|
||||
}
|
||||
|
||||
/*
|
||||
* uncolorchain - delete this arc from the color chain of its color
|
||||
*/
|
||||
static void
|
||||
uncolorchain(struct colormap *cm,
|
||||
struct arc *a)
|
||||
{
|
||||
struct colordesc *cd = &cm->cd[a->co];
|
||||
struct arc *aa;
|
||||
|
||||
aa = cd->arcs;
|
||||
if (aa == a) /* easy case */
|
||||
cd->arcs = a->colorchain;
|
||||
else {
|
||||
for (; aa != NULL && aa->colorchain != a; aa = aa->colorchain)
|
||||
continue;
|
||||
assert(aa != NULL);
|
||||
aa->colorchain = a->colorchain;
|
||||
}
|
||||
a->colorchain = NULL; /* paranoia */
|
||||
}
|
||||
|
||||
/*
|
||||
* singleton - is this character in its own color?
|
||||
*/
|
||||
static int /* predicate */
|
||||
singleton(struct colormap *cm,
|
||||
chr c)
|
||||
{
|
||||
color co; /* color of c */
|
||||
|
||||
co = GETCOLOR(cm, c);
|
||||
if (cm->cd[co].nchrs == 1 && cm->cd[co].sub == NOSUB)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* rainbow - add arcs of all full colors (but one) between specified states
|
||||
*/
|
||||
static void
|
||||
rainbow(struct nfa *nfa,
|
||||
struct colormap *cm,
|
||||
int type,
|
||||
pcolor but, /* COLORLESS if no exceptions */
|
||||
struct state *from,
|
||||
struct state *to)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end = CDEND(cm);
|
||||
color co;
|
||||
|
||||
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
|
||||
if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
|
||||
!(cd->flags&PSEUDO))
|
||||
newarc(nfa, type, co, from, to);
|
||||
}
|
||||
|
||||
/*
|
||||
* colorcomplement - add arcs of complementary colors
|
||||
*
|
||||
* The calling sequence ought to be reconciled with cloneouts().
|
||||
*/
|
||||
static void
|
||||
colorcomplement(struct nfa *nfa,
|
||||
struct colormap *cm,
|
||||
int type,
|
||||
struct state *of, /* complements of this guy's PLAIN outarcs */
|
||||
struct state *from,
|
||||
struct state *to)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end = CDEND(cm);
|
||||
color co;
|
||||
|
||||
assert(of != from);
|
||||
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
|
||||
if (!UNUSEDCOLOR(cd) && !(cd->flags&PSEUDO))
|
||||
if (findarc(of, PLAIN, co) == NULL)
|
||||
newarc(nfa, type, co, from, to);
|
||||
}
|
||||
|
||||
|
||||
#ifdef REG_DEBUG
|
||||
|
||||
/*
|
||||
* dumpcolors - debugging output
|
||||
*/
|
||||
static void
|
||||
dumpcolors(struct colormap *cm,
|
||||
FILE *f)
|
||||
{
|
||||
struct colordesc *cd;
|
||||
struct colordesc *end;
|
||||
color co;
|
||||
chr c;
|
||||
char *has;
|
||||
|
||||
fprintf(f, "max %ld\n", (long)cm->max);
|
||||
if (NBYTS > 1)
|
||||
fillcheck(cm, cm->tree, 0, f);
|
||||
end = CDEND(cm);
|
||||
for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++) /* skip 0 */
|
||||
if (!UNUSEDCOLOR(cd)) {
|
||||
assert(cd->nchrs > 0);
|
||||
has = (cd->block != NULL) ? "#" : "";
|
||||
if (cd->flags&PSEUDO)
|
||||
fprintf(f, "#%2ld%s(ps): ", (long)co, has);
|
||||
else
|
||||
fprintf(f, "#%2ld%s(%2d): ", (long)co,
|
||||
has, cd->nchrs);
|
||||
/* it's hard to do this more efficiently */
|
||||
for (c = CHR_MIN; c < CHR_MAX; c++)
|
||||
if (GETCOLOR(cm, c) == co)
|
||||
dumpchr(c, f);
|
||||
assert(c == CHR_MAX);
|
||||
if (GETCOLOR(cm, c) == co)
|
||||
dumpchr(c, f);
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* fillcheck - check proper filling of a tree
|
||||
*/
|
||||
static void
|
||||
fillcheck(struct colormap *cm,
|
||||
union tree *tree,
|
||||
int level, /* level number (top == 0) of this block */
|
||||
FILE *f)
|
||||
{
|
||||
int i;
|
||||
union tree *t;
|
||||
union tree *fillt = &cm->tree[level+1];
|
||||
|
||||
assert(level < NBYTS-1); /* this level has pointers */
|
||||
for (i = BYTTAB-1; i >= 0; i--) {
|
||||
t = tree->tptr[i];
|
||||
if (t == NULL)
|
||||
fprintf(f, "NULL found in filled tree!\n");
|
||||
else if (t == fillt)
|
||||
{}
|
||||
else if (level < NBYTS-2) /* more pointer blocks below */
|
||||
fillcheck(cm, t, level+1, f);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* dumpchr - print a chr
|
||||
*
|
||||
* Kind of char-centric but works well enough for debug use.
|
||||
*/
|
||||
static void
|
||||
dumpchr(chr c,
|
||||
FILE *f)
|
||||
{
|
||||
if (c == '\\')
|
||||
fprintf(f, "\\\\");
|
||||
else if (c > ' ' && c <= '~')
|
||||
putc((char)c, f);
|
||||
else
|
||||
fprintf(f, "\\u%04lx", (long)c);
|
||||
}
|
||||
|
||||
#endif /* REG_DEBUG */
|
194
src/backend/regex/regc_cvec.c
Normal file
194
src/backend/regex/regc_cvec.c
Normal file
@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Utility functions for handling cvecs
|
||||
* This file is #included by regcomp.c.
|
||||
*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/regc_cvec.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* newcvec - allocate a new cvec
|
||||
*/
|
||||
static struct cvec *
|
||||
newcvec(int nchrs, /* to hold this many chrs... */
|
||||
int nranges, /* ... and this many ranges... */
|
||||
int nmcces) /* ... and this many MCCEs */
|
||||
{
|
||||
size_t n;
|
||||
size_t nc;
|
||||
struct cvec *cv;
|
||||
|
||||
nc = (size_t)nchrs + (size_t)nmcces*(MAXMCCE+1) + (size_t)nranges*2;
|
||||
n = sizeof(struct cvec) + (size_t)(nmcces-1)*sizeof(chr *)
|
||||
+ nc*sizeof(chr);
|
||||
cv = (struct cvec *)MALLOC(n);
|
||||
if (cv == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
cv->chrspace = nchrs;
|
||||
cv->chrs = (chr *)&cv->mcces[nmcces]; /* chrs just after MCCE ptrs */
|
||||
cv->mccespace = nmcces;
|
||||
cv->ranges = cv->chrs + nchrs + nmcces*(MAXMCCE+1);
|
||||
cv->rangespace = nranges;
|
||||
return clearcvec(cv);
|
||||
}
|
||||
|
||||
/*
|
||||
* clearcvec - clear a possibly-new cvec
|
||||
* Returns pointer as convenience.
|
||||
*/
|
||||
static struct cvec *
|
||||
clearcvec(struct cvec *cv)
|
||||
{
|
||||
int i;
|
||||
|
||||
assert(cv != NULL);
|
||||
cv->nchrs = 0;
|
||||
assert(cv->chrs == (chr *)&cv->mcces[cv->mccespace]);
|
||||
cv->nmcces = 0;
|
||||
cv->nmccechrs = 0;
|
||||
cv->nranges = 0;
|
||||
for (i = 0; i < cv->mccespace; i++) {
|
||||
cv->mcces[i] = NULL;
|
||||
}
|
||||
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* addchr - add a chr to a cvec
|
||||
*/
|
||||
static void
|
||||
addchr(struct cvec *cv, /* character vector */
|
||||
chr c) /* character to add */
|
||||
{
|
||||
assert(cv->nchrs < cv->chrspace - cv->nmccechrs);
|
||||
cv->chrs[cv->nchrs++] = (chr)c;
|
||||
}
|
||||
|
||||
/*
|
||||
* addrange - add a range to a cvec
|
||||
*/
|
||||
static void
|
||||
addrange(struct cvec *cv, /* character vector */
|
||||
chr from, /* first character of range */
|
||||
chr to) /* last character of range */
|
||||
{
|
||||
assert(cv->nranges < cv->rangespace);
|
||||
cv->ranges[cv->nranges*2] = (chr)from;
|
||||
cv->ranges[cv->nranges*2 + 1] = (chr)to;
|
||||
cv->nranges++;
|
||||
}
|
||||
|
||||
/*
|
||||
* addmcce - add an MCCE to a cvec
|
||||
*/
|
||||
static void
|
||||
addmcce(struct cvec *cv, /* character vector */
|
||||
chr *startp, /* beginning of text */
|
||||
chr *endp) /* just past end of text */
|
||||
{
|
||||
int len;
|
||||
int i;
|
||||
chr *s;
|
||||
chr *d;
|
||||
|
||||
if (startp == NULL && endp == NULL) {
|
||||
return;
|
||||
}
|
||||
len = endp - startp;
|
||||
assert(len > 0);
|
||||
assert(cv->nchrs + len < cv->chrspace - cv->nmccechrs);
|
||||
assert(cv->nmcces < cv->mccespace);
|
||||
d = &cv->chrs[cv->chrspace - cv->nmccechrs - len - 1];
|
||||
cv->mcces[cv->nmcces++] = d;
|
||||
for (s = startp, i = len; i > 0; s++, i--) {
|
||||
*d++ = *s;
|
||||
}
|
||||
*d++ = 0; /* endmarker */
|
||||
assert(d == &cv->chrs[cv->chrspace - cv->nmccechrs]);
|
||||
cv->nmccechrs += len + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* haschr - does a cvec contain this chr?
|
||||
*/
|
||||
static int /* predicate */
|
||||
haschr(struct cvec *cv, /* character vector */
|
||||
chr c) /* character to test for */
|
||||
{
|
||||
int i;
|
||||
chr *p;
|
||||
|
||||
for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
|
||||
if (*p == c) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
|
||||
if ((*p <= c) && (c <= *(p+1))) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* getcvec - get a cvec, remembering it as v->cv
|
||||
*/
|
||||
static struct cvec *
|
||||
getcvec(struct vars *v, /* context */
|
||||
int nchrs, /* to hold this many chrs... */
|
||||
int nranges, /* ... and this many ranges... */
|
||||
int nmcces) /* ... and this many MCCEs */
|
||||
{
|
||||
if (v->cv != NULL && nchrs <= v->cv->chrspace &&
|
||||
nranges <= v->cv->rangespace && nmcces <= v->cv->mccespace) {
|
||||
return clearcvec(v->cv);
|
||||
}
|
||||
|
||||
if (v->cv != NULL) {
|
||||
freecvec(v->cv);
|
||||
}
|
||||
v->cv = newcvec(nchrs, nranges, nmcces);
|
||||
if (v->cv == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
}
|
||||
|
||||
return v->cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* freecvec - free a cvec
|
||||
*/
|
||||
static void
|
||||
freecvec(struct cvec *cv)
|
||||
{
|
||||
/* one FREE releases everything: newcvec obtains the cvec with a single MALLOC */
FREE(cv);
|
||||
}
|
1028
src/backend/regex/regc_lex.c
Normal file
1028
src/backend/regex/regc_lex.c
Normal file
File diff suppressed because it is too large
Load Diff
615
src/backend/regex/regc_locale.c
Normal file
615
src/backend/regex/regc_locale.c
Normal file
@ -0,0 +1,615 @@
|
||||
/*
|
||||
* regc_locale.c --
|
||||
*
|
||||
* This file contains locale-specific regexp routines.
|
||||
* This file is #included by regcomp.c.
|
||||
*
|
||||
* Copyright (c) 1998 by Scriptics Corporation.
|
||||
*
|
||||
* This software is copyrighted by the Regents of the University of
|
||||
* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
|
||||
* Corporation and other parties. The following terms apply to all files
|
||||
* associated with the software unless explicitly disclaimed in
|
||||
* individual files.
|
||||
*
|
||||
* The authors hereby grant permission to use, copy, modify, distribute,
|
||||
* and license this software and its documentation for any purpose, provided
|
||||
* that existing copyright notices are retained in all copies and that this
|
||||
* notice is included verbatim in any distributions. No written agreement,
|
||||
* license, or royalty fee is required for any of the authorized uses.
|
||||
* Modifications to this software may be copyrighted by their authors
|
||||
* and need not follow the licensing terms described here, provided that
|
||||
* the new terms are clearly indicated on the first page of each file where
|
||||
* they apply.
|
||||
*
|
||||
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
|
||||
* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
|
||||
* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
|
||||
* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
|
||||
* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
|
||||
* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
|
||||
* MODIFICATIONS.
|
||||
*
|
||||
* GOVERNMENT USE: If you are acquiring this software on behalf of the
|
||||
* U.S. government, the Government shall have only "Restricted Rights"
|
||||
* in the software and related documentation as defined in the Federal
|
||||
* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
|
||||
* are acquiring the software on behalf of the Department of Defense, the
|
||||
* software shall be classified as "Commercial Computer Software" and the
|
||||
* Government shall have only "Restricted Rights" as defined in Clause
|
||||
* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
|
||||
* authors grant the U.S. Government and others acting in its behalf
|
||||
* permission to use and distribute the software in accordance with the
|
||||
* terms specified in this license.
|
||||
*
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
|
||||
*/
|
||||
|
||||
/*
 * ASCII character-name table
 *
 * Maps symbolic character names to their character codes.  Several
 * characters are listed under more than one name.  The list ends with
 * an entry whose name is NULL.
 */
static struct cname {
	char	   *name;
	char		code;
}			cnames[] = {
	/* control characters */
	{"NUL",						'\0'},
	{"SOH",						'\001'},
	{"STX",						'\002'},
	{"ETX",						'\003'},
	{"EOT",						'\004'},
	{"ENQ",						'\005'},
	{"ACK",						'\006'},
	{"BEL",						'\007'},
	{"alert",					'\007'},
	{"BS",						'\010'},
	{"backspace",				'\b'},
	{"HT",						'\011'},
	{"tab",						'\t'},
	{"LF",						'\012'},
	{"newline",					'\n'},
	{"VT",						'\013'},
	{"vertical-tab",			'\v'},
	{"FF",						'\014'},
	{"form-feed",				'\f'},
	{"CR",						'\015'},
	{"carriage-return",			'\r'},
	{"SO",						'\016'},
	{"SI",						'\017'},
	{"DLE",						'\020'},
	{"DC1",						'\021'},
	{"DC2",						'\022'},
	{"DC3",						'\023'},
	{"DC4",						'\024'},
	{"NAK",						'\025'},
	{"SYN",						'\026'},
	{"ETB",						'\027'},
	{"CAN",						'\030'},
	{"EM",						'\031'},
	{"SUB",						'\032'},
	{"ESC",						'\033'},
	{"IS4",						'\034'},
	{"FS",						'\034'},
	{"IS3",						'\035'},
	{"GS",						'\035'},
	{"IS2",						'\036'},
	{"RS",						'\036'},
	{"IS1",						'\037'},
	{"US",						'\037'},

	/* space and punctuation before the digits */
	{"space",					' '},
	{"exclamation-mark",		'!'},
	{"quotation-mark",			'"'},
	{"number-sign",				'#'},
	{"dollar-sign",				'$'},
	{"percent-sign",			'%'},
	{"ampersand",				'&'},
	{"apostrophe",				'\''},
	{"left-parenthesis",		'('},
	{"right-parenthesis",		')'},
	{"asterisk",				'*'},
	{"plus-sign",				'+'},
	{"comma",					','},
	{"hyphen",					'-'},
	{"hyphen-minus",			'-'},
	{"period",					'.'},
	{"full-stop",				'.'},
	{"slash",					'/'},
	{"solidus",					'/'},

	/* digits */
	{"zero",					'0'},
	{"one",						'1'},
	{"two",						'2'},
	{"three",					'3'},
	{"four",					'4'},
	{"five",					'5'},
	{"six",						'6'},
	{"seven",					'7'},
	{"eight",					'8'},
	{"nine",					'9'},

	/* remaining punctuation */
	{"colon",					':'},
	{"semicolon",				';'},
	{"less-than-sign",			'<'},
	{"equals-sign",				'='},
	{"greater-than-sign",		'>'},
	{"question-mark",			'?'},
	{"commercial-at",			'@'},
	{"left-square-bracket",		'['},
	{"backslash",				'\\'},
	{"reverse-solidus",			'\\'},
	{"right-square-bracket",	']'},
	{"circumflex",				'^'},
	{"circumflex-accent",		'^'},
	{"underscore",				'_'},
	{"low-line",				'_'},
	{"grave-accent",			'`'},
	{"left-brace",				'{'},
	{"left-curly-bracket",		'{'},
	{"vertical-line",			'|'},
	{"right-brace",				'}'},
	{"right-curly-bracket",		'}'},
	{"tilde",					'~'},
	{"DEL",						'\177'},

	/* end-of-table marker */
	{NULL,						0}
};
|
||||
|
||||
/*
|
||||
* some ctype functions with non-ascii-char guard
|
||||
*/
|
||||
static int
|
||||
pg_isdigit(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_isalpha(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_isalnum(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_isupper(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_islower(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_isgraph(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_ispunct(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
|
||||
}
|
||||
|
||||
static int
|
||||
pg_isspace(pg_wchar c)
|
||||
{
|
||||
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
|
||||
}
|
||||
|
||||
static pg_wchar
|
||||
pg_toupper(pg_wchar c)
|
||||
{
|
||||
if (c >= 0 && c <= UCHAR_MAX)
|
||||
return toupper((unsigned char) c);
|
||||
return c;
|
||||
}
|
||||
|
||||
static pg_wchar
|
||||
pg_tolower(pg_wchar c)
|
||||
{
|
||||
if (c >= 0 && c <= UCHAR_MAX)
|
||||
return tolower((unsigned char) c);
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
/*
 * nmcces - how many distinct MCCEs are there?
 *
 * No multi-character collating elements (MCCEs) are defined at the
 * moment, so the answer is always zero and v is not consulted.
 */
static int
nmcces(struct vars *v)
{
	return 0;
}
|
||||
|
||||
/*
 * nleaders - how many chrs can be first chrs of MCCEs?
 *
 * Always zero in this implementation; v is not consulted.
 */
static int
nleaders(struct vars *v)
{
	return 0;
}
|
||||
|
||||
/*
 * allmcces - return a cvec with all the MCCEs of the locale
 *
 * v is the compilation context (unused here); cv is the caller-supplied
 * cvec, which is supposed to have enough room.  The result is whatever
 * clearcvec() returns for cv; nothing is added to it.
 */
static struct cvec *
allmcces(struct vars *v,
		 struct cvec *cv)
{
	struct cvec *emptied = clearcvec(cv);

	return emptied;
}
|
||||
|
||||
/*
|
||||
* element - map collating-element name to celt
|
||||
*/
|
||||
static celt
|
||||
element(struct vars *v, /* context */
|
||||
chr *startp, /* points to start of name */
|
||||
chr *endp) /* points just past end of name */
|
||||
{
|
||||
struct cname *cn;
|
||||
size_t len;
|
||||
|
||||
/* generic: one-chr names stand for themselves */
|
||||
assert(startp < endp);
|
||||
len = endp - startp;
|
||||
if (len == 1) {
|
||||
return *startp;
|
||||
}
|
||||
|
||||
NOTE(REG_ULOCALE);
|
||||
|
||||
/* search table */
|
||||
for (cn=cnames; cn->name!=NULL; cn++) {
|
||||
if (strlen(cn->name)==len &&
|
||||
pg_char_and_wchar_strncmp(cn->name, startp, len)==0) {
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
}
|
||||
if (cn->name != NULL) {
|
||||
return CHR(cn->code);
|
||||
}
|
||||
|
||||
/* couldn't find it */
|
||||
ERR(REG_ECOLLATE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* range - supply cvec for a range, including legality check
|
||||
*/
|
||||
static struct cvec *
|
||||
range(struct vars *v, /* context */
|
||||
celt a, /* range start */
|
||||
celt b, /* range end, might equal a */
|
||||
int cases) /* case-independent? */
|
||||
{
|
||||
int nchrs;
|
||||
struct cvec *cv;
|
||||
celt c, lc, uc;
|
||||
|
||||
if (a != b && !before(a, b)) {
|
||||
ERR(REG_ERANGE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!cases) { /* easy version */
|
||||
cv = getcvec(v, 0, 1, 0);
|
||||
NOERRN();
|
||||
addrange(cv, a, b);
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* When case-independent, it's hard to decide when cvec ranges are
|
||||
* usable, so for now at least, we won't try. We allocate enough
|
||||
* space for two case variants plus a little extra for the two
|
||||
* title case variants.
|
||||
*/
|
||||
|
||||
nchrs = (b - a + 1)*2 + 4;
|
||||
|
||||
cv = getcvec(v, nchrs, 0, 0);
|
||||
NOERRN();
|
||||
|
||||
for (c=a; c<=b; c++) {
|
||||
addchr(cv, c);
|
||||
lc = pg_tolower((chr)c);
|
||||
if (c != lc) {
|
||||
addchr(cv, lc);
|
||||
}
|
||||
uc = pg_toupper((chr)c);
|
||||
if (c != uc) {
|
||||
addchr(cv, uc);
|
||||
}
|
||||
}
|
||||
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* before - is celt x before celt y, for purposes of range legality?
|
||||
*/
|
||||
static int /* predicate */
|
||||
before(celt x, celt y)
|
||||
{
|
||||
/* trivial because no MCCEs */
|
||||
if (x < y) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* eclass - supply cvec for an equivalence class
|
||||
* Must include case counterparts on request.
|
||||
*/
|
||||
static struct cvec *
|
||||
eclass(struct vars *v, /* context */
|
||||
celt c, /* Collating element representing
|
||||
* the equivalence class. */
|
||||
int cases) /* all cases? */
|
||||
{
|
||||
struct cvec *cv;
|
||||
|
||||
/* crude fake equivalence class for testing */
|
||||
if ((v->cflags®_FAKE) && c == 'x') {
|
||||
cv = getcvec(v, 4, 0, 0);
|
||||
addchr(cv, (chr)'x');
|
||||
addchr(cv, (chr)'y');
|
||||
if (cases) {
|
||||
addchr(cv, (chr)'X');
|
||||
addchr(cv, (chr)'Y');
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/* otherwise, none */
|
||||
if (cases) {
|
||||
return allcases(v, c);
|
||||
}
|
||||
cv = getcvec(v, 1, 0, 0);
|
||||
assert(cv != NULL);
|
||||
addchr(cv, (chr)c);
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* cclass - supply cvec for a character class
|
||||
*
|
||||
* Must include case counterparts on request.
|
||||
*/
|
||||
static struct cvec *
|
||||
cclass(struct vars *v, /* context */
|
||||
chr *startp, /* where the name starts */
|
||||
chr *endp, /* just past the end of the name */
|
||||
int cases) /* case-independent? */
|
||||
{
|
||||
size_t len;
|
||||
struct cvec *cv = NULL;
|
||||
char **namePtr;
|
||||
int i, index;
|
||||
|
||||
/*
|
||||
* The following arrays define the valid character class names.
|
||||
*/
|
||||
|
||||
static char *classNames[] = {
|
||||
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
|
||||
"lower", "print", "punct", "space", "upper", "xdigit", NULL
|
||||
};
|
||||
|
||||
enum classes {
|
||||
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
|
||||
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
|
||||
};
|
||||
|
||||
/*
|
||||
* Map the name to the corresponding enumerated value.
|
||||
*/
|
||||
len = endp - startp;
|
||||
index = -1;
|
||||
for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
|
||||
if (strlen(*namePtr) == len &&
|
||||
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) {
|
||||
index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (index == -1) {
|
||||
ERR(REG_ECTYPE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remap lower and upper to alpha if the match is case insensitive.
|
||||
*/
|
||||
|
||||
if (cases &&
|
||||
((enum classes) index == CC_LOWER ||
|
||||
(enum classes) index == CC_UPPER))
|
||||
index = (int) CC_ALPHA;
|
||||
|
||||
/*
|
||||
* Now compute the character class contents.
|
||||
*
|
||||
* For the moment, assume that only char codes < 256 can be in these
|
||||
* classes.
|
||||
*/
|
||||
|
||||
switch((enum classes) index) {
|
||||
case CC_PRINT:
|
||||
case CC_ALNUM:
|
||||
cv = getcvec(v, UCHAR_MAX, 1, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_isalpha((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
addrange(cv, (chr) '0', (chr) '9');
|
||||
}
|
||||
break;
|
||||
case CC_ALPHA:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_isalpha((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_ASCII:
|
||||
cv = getcvec(v, 0, 1, 0);
|
||||
if (cv) {
|
||||
addrange(cv, 0, 0x7f);
|
||||
}
|
||||
break;
|
||||
case CC_BLANK:
|
||||
cv = getcvec(v, 2, 0, 0);
|
||||
addchr(cv, '\t');
|
||||
addchr(cv, ' ');
|
||||
break;
|
||||
case CC_CNTRL:
|
||||
cv = getcvec(v, 0, 2, 0);
|
||||
addrange(cv, 0x0, 0x1f);
|
||||
addrange(cv, 0x7f, 0x9f);
|
||||
break;
|
||||
case CC_DIGIT:
|
||||
cv = getcvec(v, 0, 1, 0);
|
||||
if (cv) {
|
||||
addrange(cv, (chr) '0', (chr) '9');
|
||||
}
|
||||
break;
|
||||
case CC_PUNCT:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_ispunct((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_XDIGIT:
|
||||
cv = getcvec(v, 0, 3, 0);
|
||||
if (cv) {
|
||||
addrange(cv, '0', '9');
|
||||
addrange(cv, 'a', 'f');
|
||||
addrange(cv, 'A', 'F');
|
||||
}
|
||||
break;
|
||||
case CC_SPACE:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_isspace((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_LOWER:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_islower((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_UPPER:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_isupper((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CC_GRAPH:
|
||||
cv = getcvec(v, UCHAR_MAX, 0, 0);
|
||||
if (cv) {
|
||||
for (i=0 ; i<= UCHAR_MAX ; i++) {
|
||||
if (pg_isgraph((chr) i))
|
||||
addchr(cv, (chr) i);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (cv == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* allcases - supply cvec for all case counterparts of a chr (including itself)
|
||||
*
|
||||
* This is a shortcut, preferably an efficient one, for simple characters;
|
||||
* messy cases are done via range().
|
||||
*/
|
||||
static struct cvec *
|
||||
allcases(struct vars *v, /* context */
|
||||
chr pc) /* character to get case equivs of */
|
||||
{
|
||||
struct cvec *cv;
|
||||
chr c = (chr)pc;
|
||||
chr lc, uc;
|
||||
|
||||
lc = pg_tolower((chr)c);
|
||||
uc = pg_toupper((chr)c);
|
||||
|
||||
cv = getcvec(v, 2, 0, 0);
|
||||
addchr(cv, lc);
|
||||
if (lc != uc) {
|
||||
addchr(cv, uc);
|
||||
}
|
||||
return cv;
|
||||
}
|
||||
|
||||
/*
|
||||
* cmp - chr-substring compare
|
||||
*
|
||||
* Backrefs need this. It should preferably be efficient.
|
||||
* Note that it does not need to report anything except equal/unequal.
|
||||
* Note also that the length is exact, and the comparison should not
|
||||
* stop at embedded NULs!
|
||||
*/
|
||||
static int /* 0 for equal, nonzero for unequal */
|
||||
cmp(const chr *x, const chr *y, /* strings to compare */
|
||||
size_t len) /* exact length of comparison */
|
||||
{
|
||||
return memcmp(VS(x), VS(y), len*sizeof(chr));
|
||||
}
|
||||
|
||||
/*
|
||||
* casecmp - case-independent chr-substring compare
|
||||
*
|
||||
* REG_ICASE backrefs need this. It should preferably be efficient.
|
||||
* Note that it does not need to report anything except equal/unequal.
|
||||
* Note also that the length is exact, and the comparison should not
|
||||
* stop at embedded NULs!
|
||||
*/
|
||||
static int /* 0 for equal, nonzero for unequal */
|
||||
casecmp(const chr *x, const chr *y, /* strings to compare */
|
||||
size_t len) /* exact length of comparison */
|
||||
{
|
||||
for (; len > 0; len--, x++, y++) {
|
||||
if ((*x!=*y) && (pg_tolower(*x) != pg_tolower(*y))) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
1481
src/backend/regex/regc_nfa.c
Normal file
1481
src/backend/regex/regc_nfa.c
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
655
src/backend/regex/rege_dfa.c
Normal file
655
src/backend/regex/rege_dfa.c
Normal file
@ -0,0 +1,655 @@
|
||||
/*
|
||||
* DFA routines
|
||||
* This file is #included by regexec.c.
|
||||
*
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/rege_dfa.c,v 1.1 2003/02/05 17:41:33 tgl Exp $
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* longest - longest-preferred matching engine
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
longest(struct vars *v, /* used only for debug and exec flags */
|
||||
struct dfa *d,
|
||||
chr *start, /* where the match should start */
|
||||
chr *stop, /* match must end at or before here */
|
||||
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
||||
{
|
||||
chr *cp;
|
||||
chr *realstop = (stop == v->stop) ? stop : stop + 1;
|
||||
color co;
|
||||
struct sset *css;
|
||||
struct sset *ss;
|
||||
chr *post;
|
||||
int i;
|
||||
struct colormap *cm = d->cm;
|
||||
|
||||
/* initialize */
|
||||
css = initialize(v, d, start);
|
||||
cp = start;
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 0;
|
||||
|
||||
/* startup */
|
||||
FDEBUG(("+++ startup +++\n"));
|
||||
if (cp == v->start) {
|
||||
co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long)co));
|
||||
} else {
|
||||
co = GETCOLOR(cm, *(cp - 1));
|
||||
FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
|
||||
}
|
||||
css = miss(v, d, css, co, cp, start);
|
||||
if (css == NULL)
|
||||
return NULL;
|
||||
css->lastseen = cp;
|
||||
|
||||
/* main loop */
|
||||
if (v->eflags®_FTRACE)
|
||||
while (cp < realstop) {
|
||||
FDEBUG(("+++ at c%d +++\n", css - d->ssets));
|
||||
co = GETCOLOR(cm, *cp);
|
||||
FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL) {
|
||||
ss = miss(v, d, css, co, cp+1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
}
|
||||
else
|
||||
while (cp < realstop) {
|
||||
co = GETCOLOR(cm, *cp);
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL) {
|
||||
ss = miss(v, d, css, co, cp+1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
}
|
||||
|
||||
/* shutdown */
|
||||
FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
|
||||
if (cp == v->stop && stop == v->stop) {
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 1;
|
||||
co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long)co));
|
||||
ss = miss(v, d, css, co, cp, start);
|
||||
/* special case: match ended at eol? */
|
||||
if (ss != NULL && (ss->flags&POSTSTATE))
|
||||
return cp;
|
||||
else if (ss != NULL)
|
||||
ss->lastseen = cp; /* to be tidy */
|
||||
}
|
||||
|
||||
/* find last match, if any */
|
||||
post = d->lastpost;
|
||||
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
||||
if ((ss->flags&POSTSTATE) && post != ss->lastseen &&
|
||||
(post == NULL || post < ss->lastseen))
|
||||
post = ss->lastseen;
|
||||
if (post != NULL) /* found one */
|
||||
return post - 1;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* shortest - shortest-preferred matching engine
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
shortest(struct vars *v,
|
||||
struct dfa *d,
|
||||
chr *start, /* where the match should start */
|
||||
chr *min, /* match must end at or after here */
|
||||
chr *max, /* match must end at or before here */
|
||||
chr **coldp, /* store coldstart pointer here, if nonNULL */
|
||||
int *hitstopp) /* record whether hit v->stop, if non-NULL */
|
||||
{
|
||||
chr *cp;
|
||||
chr *realmin = (min == v->stop) ? min : min + 1;
|
||||
chr *realmax = (max == v->stop) ? max : max + 1;
|
||||
color co;
|
||||
struct sset *css;
|
||||
struct sset *ss;
|
||||
struct colormap *cm = d->cm;
|
||||
|
||||
/* initialize */
|
||||
css = initialize(v, d, start);
|
||||
cp = start;
|
||||
if (hitstopp != NULL)
|
||||
*hitstopp = 0;
|
||||
|
||||
/* startup */
|
||||
FDEBUG(("--- startup ---\n"));
|
||||
if (cp == v->start) {
|
||||
co = d->cnfa->bos[(v->eflags®_NOTBOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long)co));
|
||||
} else {
|
||||
co = GETCOLOR(cm, *(cp - 1));
|
||||
FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
|
||||
}
|
||||
css = miss(v, d, css, co, cp, start);
|
||||
if (css == NULL)
|
||||
return NULL;
|
||||
css->lastseen = cp;
|
||||
ss = css;
|
||||
|
||||
/* main loop */
|
||||
if (v->eflags®_FTRACE)
|
||||
while (cp < realmax) {
|
||||
FDEBUG(("--- at c%d ---\n", css - d->ssets));
|
||||
co = GETCOLOR(cm, *cp);
|
||||
FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL) {
|
||||
ss = miss(v, d, css, co, cp+1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
if ((ss->flags&POSTSTATE) && cp >= realmin)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
else
|
||||
while (cp < realmax) {
|
||||
co = GETCOLOR(cm, *cp);
|
||||
ss = css->outs[co];
|
||||
if (ss == NULL) {
|
||||
ss = miss(v, d, css, co, cp+1, start);
|
||||
if (ss == NULL)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
cp++;
|
||||
ss->lastseen = cp;
|
||||
css = ss;
|
||||
if ((ss->flags&POSTSTATE) && cp >= realmin)
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
|
||||
if (ss == NULL)
|
||||
return NULL;
|
||||
|
||||
if (coldp != NULL) /* report last no-progress state set, if any */
|
||||
*coldp = lastcold(v, d);
|
||||
|
||||
if ((ss->flags&POSTSTATE) && cp > min) {
|
||||
assert(cp >= realmin);
|
||||
cp--;
|
||||
} else if (cp == v->stop && max == v->stop) {
|
||||
co = d->cnfa->eos[(v->eflags®_NOTEOL) ? 0 : 1];
|
||||
FDEBUG(("color %ld\n", (long)co));
|
||||
ss = miss(v, d, css, co, cp, start);
|
||||
/* match might have ended at eol */
|
||||
if ((ss == NULL || !(ss->flags&POSTSTATE)) && hitstopp != NULL)
|
||||
*hitstopp = 1;
|
||||
}
|
||||
|
||||
if (ss == NULL || !(ss->flags&POSTSTATE))
|
||||
return NULL;
|
||||
|
||||
return cp;
|
||||
}
|
||||
|
||||
/*
|
||||
* lastcold - determine last point at which no progress had been made
|
||||
*/
|
||||
static chr * /* endpoint, or NULL */
|
||||
lastcold(struct vars *v,
|
||||
struct dfa *d)
|
||||
{
|
||||
struct sset *ss;
|
||||
chr *nopr;
|
||||
int i;
|
||||
|
||||
nopr = d->lastnopr;
|
||||
if (nopr == NULL)
|
||||
nopr = v->start;
|
||||
for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
|
||||
if ((ss->flags&NOPROGRESS) && nopr < ss->lastseen)
|
||||
nopr = ss->lastseen;
|
||||
return nopr;
|
||||
}
|
||||
|
||||
/*
|
||||
* newdfa - set up a fresh DFA
|
||||
*/
|
||||
static struct dfa *
|
||||
newdfa(struct vars *v,
|
||||
struct cnfa *cnfa,
|
||||
struct colormap *cm,
|
||||
struct smalldfa *small) /* preallocated space, may be NULL */
|
||||
{
|
||||
struct dfa *d;
|
||||
size_t nss = cnfa->nstates * 2;
|
||||
int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
|
||||
struct smalldfa *smallwas = small;
|
||||
|
||||
assert(cnfa != NULL && cnfa->nstates != 0);
|
||||
|
||||
if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) {
|
||||
assert(wordsper == 1);
|
||||
if (small == NULL) {
|
||||
small = (struct smalldfa *)MALLOC(
|
||||
sizeof(struct smalldfa));
|
||||
if (small == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
d = &small->dfa;
|
||||
d->ssets = small->ssets;
|
||||
d->statesarea = small->statesarea;
|
||||
d->work = &d->statesarea[nss];
|
||||
d->outsarea = small->outsarea;
|
||||
d->incarea = small->incarea;
|
||||
d->cptsmalloced = 0;
|
||||
d->mallocarea = (smallwas == NULL) ? (char *)small : NULL;
|
||||
} else {
|
||||
d = (struct dfa *)MALLOC(sizeof(struct dfa));
|
||||
if (d == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
d->ssets = (struct sset *)MALLOC(nss * sizeof(struct sset));
|
||||
d->statesarea = (unsigned *)MALLOC((nss+WORK) * wordsper *
|
||||
sizeof(unsigned));
|
||||
d->work = &d->statesarea[nss * wordsper];
|
||||
d->outsarea = (struct sset **)MALLOC(nss * cnfa->ncolors *
|
||||
sizeof(struct sset *));
|
||||
d->incarea = (struct arcp *)MALLOC(nss * cnfa->ncolors *
|
||||
sizeof(struct arcp));
|
||||
d->cptsmalloced = 1;
|
||||
d->mallocarea = (char *)d;
|
||||
if (d->ssets == NULL || d->statesarea == NULL ||
|
||||
d->outsarea == NULL || d->incarea == NULL) {
|
||||
freedfa(d);
|
||||
ERR(REG_ESPACE);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
d->nssets = (v->eflags®_SMALL) ? 7 : nss;
|
||||
d->nssused = 0;
|
||||
d->nstates = cnfa->nstates;
|
||||
d->ncolors = cnfa->ncolors;
|
||||
d->wordsper = wordsper;
|
||||
d->cnfa = cnfa;
|
||||
d->cm = cm;
|
||||
d->lastpost = NULL;
|
||||
d->lastnopr = NULL;
|
||||
d->search = d->ssets;
|
||||
|
||||
/* initialization of sset fields is done as needed */
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
/*
|
||||
* freedfa - free a DFA
|
||||
*/
|
||||
static void
|
||||
freedfa(struct dfa *d)
|
||||
{
|
||||
if (d->cptsmalloced) {
|
||||
if (d->ssets != NULL)
|
||||
FREE(d->ssets);
|
||||
if (d->statesarea != NULL)
|
||||
FREE(d->statesarea);
|
||||
if (d->outsarea != NULL)
|
||||
FREE(d->outsarea);
|
||||
if (d->incarea != NULL)
|
||||
FREE(d->incarea);
|
||||
}
|
||||
|
||||
if (d->mallocarea != NULL)
|
||||
FREE(d->mallocarea);
|
||||
}
|
||||
|
||||
/*
 * hash - construct a hash code for a bitvector
 *
 * There are probably better ways, but they're more expensive.
 *
 * uv points at n words; the result is the XOR of all of them, or 0 when
 * n is not positive.
 */
static unsigned
hash(unsigned *uv,
	 int n)
{
	unsigned	acc = 0;

	for (; n > 0; n--)
		acc ^= *uv++;
	return acc;
}
|
||||
|
||||
/*
|
||||
* initialize - hand-craft a cache entry for startup, otherwise get ready
|
||||
*/
|
||||
static struct sset *
|
||||
initialize(struct vars *v, /* used only for debug flags */
|
||||
struct dfa *d,
|
||||
chr *start)
|
||||
{
|
||||
struct sset *ss;
|
||||
int i;
|
||||
|
||||
/* is previous one still there? */
|
||||
if (d->nssused > 0 && (d->ssets[0].flags&STARTER))
|
||||
ss = &d->ssets[0];
|
||||
else { /* no, must (re)build it */
|
||||
ss = getvacant(v, d, start, start);
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
ss->states[i] = 0;
|
||||
BSET(ss->states, d->cnfa->pre);
|
||||
ss->hash = HASH(ss->states, d->wordsper);
|
||||
assert(d->cnfa->pre != d->cnfa->post);
|
||||
ss->flags = STARTER|LOCKED|NOPROGRESS;
|
||||
/* lastseen dealt with below */
|
||||
}
|
||||
|
||||
for (i = 0; i < d->nssused; i++)
|
||||
d->ssets[i].lastseen = NULL;
|
||||
ss->lastseen = start; /* maybe untrue, but harmless */
|
||||
d->lastpost = NULL;
|
||||
d->lastnopr = NULL;
|
||||
return ss;
|
||||
}
|
||||
|
||||
/*
|
||||
* miss - handle a cache miss
|
||||
*/
|
||||
static struct sset * /* NULL if goes to empty set */
|
||||
miss(struct vars *v, /* used only for debug flags */
|
||||
struct dfa *d,
|
||||
struct sset *css,
|
||||
pcolor co,
|
||||
chr *cp, /* next chr */
|
||||
chr *start) /* where the attempt got started */
|
||||
{
|
||||
struct cnfa *cnfa = d->cnfa;
|
||||
int i;
|
||||
unsigned h;
|
||||
struct carc *ca;
|
||||
struct sset *p;
|
||||
int ispost;
|
||||
int noprogress;
|
||||
int gotstate;
|
||||
int dolacons;
|
||||
int sawlacons;
|
||||
|
||||
/* for convenience, we can be called even if it might not be a miss */
|
||||
if (css->outs[co] != NULL) {
|
||||
FDEBUG(("hit\n"));
|
||||
return css->outs[co];
|
||||
}
|
||||
FDEBUG(("miss\n"));
|
||||
|
||||
/* first, what set of states would we end up in? */
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
d->work[i] = 0;
|
||||
ispost = 0;
|
||||
noprogress = 1;
|
||||
gotstate = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(css->states, i))
|
||||
for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++)
|
||||
if (ca->co == co) {
|
||||
BSET(d->work, ca->to);
|
||||
gotstate = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d -> %d\n", i, ca->to));
|
||||
}
|
||||
dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0;
|
||||
sawlacons = 0;
|
||||
while (dolacons) { /* transitive closure */
|
||||
dolacons = 0;
|
||||
for (i = 0; i < d->nstates; i++)
|
||||
if (ISBSET(d->work, i))
|
||||
for (ca = cnfa->states[i]+1; ca->co != COLORLESS;
|
||||
ca++) {
|
||||
if (ca->co <= cnfa->ncolors)
|
||||
continue; /* NOTE CONTINUE */
|
||||
sawlacons = 1;
|
||||
if (ISBSET(d->work, ca->to))
|
||||
continue; /* NOTE CONTINUE */
|
||||
if (!lacon(v, cnfa, cp, ca->co))
|
||||
continue; /* NOTE CONTINUE */
|
||||
BSET(d->work, ca->to);
|
||||
dolacons = 1;
|
||||
if (ca->to == cnfa->post)
|
||||
ispost = 1;
|
||||
if (!cnfa->states[ca->to]->co)
|
||||
noprogress = 0;
|
||||
FDEBUG(("%d :> %d\n", i, ca->to));
|
||||
}
|
||||
}
|
||||
if (!gotstate)
|
||||
return NULL;
|
||||
h = HASH(d->work, d->wordsper);
|
||||
|
||||
/* next, is that in the cache? */
|
||||
for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
|
||||
if (HIT(h, d->work, p, d->wordsper)) {
|
||||
FDEBUG(("cached c%d\n", p - d->ssets));
|
||||
break; /* NOTE BREAK OUT */
|
||||
}
|
||||
if (i == 0) { /* nope, need a new cache entry */
|
||||
p = getvacant(v, d, cp, start);
|
||||
assert(p != css);
|
||||
for (i = 0; i < d->wordsper; i++)
|
||||
p->states[i] = d->work[i];
|
||||
p->hash = h;
|
||||
p->flags = (ispost) ? POSTSTATE : 0;
|
||||
if (noprogress)
|
||||
p->flags |= NOPROGRESS;
|
||||
/* lastseen to be dealt with by caller */
|
||||
}
|
||||
|
||||
if (!sawlacons) { /* lookahead conds. always cache miss */
|
||||
FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
|
||||
css->outs[co] = p;
|
||||
css->inchain[co] = p->ins;
|
||||
p->ins.ss = css;
|
||||
p->ins.co = (color)co;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
|
||||
* lacon - lookahead-constraint checker for miss()
|
||||
*/
|
||||
static int /* predicate: constraint satisfied? */
|
||||
lacon(struct vars *v,
|
||||
struct cnfa *pcnfa, /* parent cnfa */
|
||||
chr *cp,
|
||||
pcolor co) /* "color" of the lookahead constraint */
|
||||
{
|
||||
int n;
|
||||
struct subre *sub;
|
||||
struct dfa *d;
|
||||
struct smalldfa sd;
|
||||
chr *end;
|
||||
|
||||
n = co - pcnfa->ncolors;
|
||||
assert(n < v->g->nlacons && v->g->lacons != NULL);
|
||||
FDEBUG(("=== testing lacon %d\n", n));
|
||||
sub = &v->g->lacons[n];
|
||||
d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
|
||||
if (d == NULL) {
|
||||
ERR(REG_ESPACE);
|
||||
return 0;
|
||||
}
|
||||
end = longest(v, d, cp, v->stop, (int *)NULL);
|
||||
freedfa(d);
|
||||
FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
|
||||
return (sub->subno) ? (end != NULL) : (end == NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* getvacant - get a vacant state set
|
||||
* This routine clears out the inarcs and outarcs, but does not otherwise
|
||||
* clear the innards of the state set -- that's up to the caller.
|
||||
*/
|
||||
static struct sset *
|
||||
getvacant(struct vars *v, /* used only for debug flags */
|
||||
struct dfa *d,
|
||||
chr *cp,
|
||||
chr *start)
|
||||
{
|
||||
int i;
|
||||
struct sset *ss;
|
||||
struct sset *p;
|
||||
struct arcp ap;
|
||||
struct arcp lastap;
|
||||
color co;
|
||||
|
||||
ss = pickss(v, d, cp, start);
|
||||
assert(!(ss->flags&LOCKED));
|
||||
|
||||
/* clear out its inarcs, including self-referential ones */
|
||||
ap = ss->ins;
|
||||
while ((p = ap.ss) != NULL) {
|
||||
co = ap.co;
|
||||
FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co));
|
||||
p->outs[co] = NULL;
|
||||
ap = p->inchain[co];
|
||||
p->inchain[co].ss = NULL; /* paranoia */
|
||||
}
|
||||
ss->ins.ss = NULL;
|
||||
|
||||
/* take it off the inarc chains of the ssets reached by its outarcs */
|
||||
for (i = 0; i < d->ncolors; i++) {
|
||||
p = ss->outs[i];
|
||||
assert(p != ss); /* not self-referential */
|
||||
if (p == NULL)
|
||||
continue; /* NOTE CONTINUE */
|
||||
FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
|
||||
if (p->ins.ss == ss && p->ins.co == i)
|
||||
p->ins = ss->inchain[i];
|
||||
else {
|
||||
assert(p->ins.ss != NULL);
|
||||
for (ap = p->ins; ap.ss != NULL &&
|
||||
!(ap.ss == ss && ap.co == i);
|
||||
ap = ap.ss->inchain[ap.co])
|
||||
lastap = ap;
|
||||
assert(ap.ss != NULL);
|
||||
lastap.ss->inchain[lastap.co] = ss->inchain[i];
|
||||
}
|
||||
ss->outs[i] = NULL;
|
||||
ss->inchain[i].ss = NULL;
|
||||
}
|
||||
|
||||
/* if ss was a success state, may need to remember location */
|
||||
if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost &&
|
||||
(d->lastpost == NULL || d->lastpost < ss->lastseen))
|
||||
d->lastpost = ss->lastseen;
|
||||
|
||||
/* likewise for a no-progress state */
|
||||
if ((ss->flags&NOPROGRESS) && ss->lastseen != d->lastnopr &&
|
||||
(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
|
||||
d->lastnopr = ss->lastseen;
|
||||
|
||||
return ss;
|
||||
}
|
||||
|
||||
/*
|
||||
* pickss - pick the next stateset to be used
|
||||
*/
|
||||
static struct sset *
|
||||
pickss(struct vars *v, /* used only for debug flags */
|
||||
struct dfa *d,
|
||||
chr *cp,
|
||||
chr *start)
|
||||
{
|
||||
int i;
|
||||
struct sset *ss;
|
||||
struct sset *end;
|
||||
chr *ancient;
|
||||
|
||||
/* shortcut for cases where cache isn't full */
|
||||
if (d->nssused < d->nssets) {
|
||||
i = d->nssused;
|
||||
d->nssused++;
|
||||
ss = &d->ssets[i];
|
||||
FDEBUG(("new c%d\n", i));
|
||||
/* set up innards */
|
||||
ss->states = &d->statesarea[i * d->wordsper];
|
||||
ss->flags = 0;
|
||||
ss->ins.ss = NULL;
|
||||
ss->ins.co = WHITE; /* give it some value */
|
||||
ss->outs = &d->outsarea[i * d->ncolors];
|
||||
ss->inchain = &d->incarea[i * d->ncolors];
|
||||
for (i = 0; i < d->ncolors; i++) {
|
||||
ss->outs[i] = NULL;
|
||||
ss->inchain[i].ss = NULL;
|
||||
}
|
||||
return ss;
|
||||
}
|
||||
|
||||
/* look for oldest, or old enough anyway */
|
||||
if (cp - start > d->nssets*2/3) /* oldest 33% are expendable */
|
||||
ancient = cp - d->nssets*2/3;
|
||||
else
|
||||
ancient = start;
|
||||
for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
|
||||
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
||||
!(ss->flags&LOCKED)) {
|
||||
d->search = ss + 1;
|
||||
FDEBUG(("replacing c%d\n", ss - d->ssets));
|
||||
return ss;
|
||||
}
|
||||
for (ss = d->ssets, end = d->search; ss < end; ss++)
|
||||
if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
|
||||
!(ss->flags&LOCKED)) {
|
||||
d->search = ss + 1;
|
||||
FDEBUG(("replacing c%d\n", ss - d->ssets));
|
||||
return ss;
|
||||
}
|
||||
|
||||
/* nobody's old enough?!? -- something's really wrong */
|
||||
FDEBUG(("can't find victim to replace!\n"));
|
||||
assert(NOTREACHED);
|
||||
ERR(REG_ASSERT);
|
||||
return d->ssets;
|
||||
}
|
@ -1,181 +1,110 @@
|
||||
/*-
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
/*
|
||||
* regerror - error-code expansion
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the University of
|
||||
* California, Berkeley and its contributors.
|
||||
* 4. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/regerror.c,v 1.25 2003/02/05 17:41:33 tgl Exp $
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regerror.c 8.4 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
#include "regex/regguts.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <assert.h>
|
||||
/* unknown-error explanation */
|
||||
static char unk[] = "*** unknown regex error code 0x%x ***";
|
||||
|
||||
#include "regex/regex.h"
|
||||
#include "regex/utils.h"
|
||||
#include "regex/regex2.h"
|
||||
|
||||
static char *regatoi(const regex_t *preg, char *localbuf);
|
||||
|
||||
static struct rerr
|
||||
{
|
||||
int code;
|
||||
char *name;
|
||||
char *explain;
|
||||
} rerrs[] =
|
||||
|
||||
{
|
||||
{
|
||||
/* NOMATCH is not really an error condition, it just says no match */
|
||||
REG_NOMATCH, "REG_NOMATCH", "no pattern match found"
|
||||
},
|
||||
{
|
||||
REG_BADPAT, "REG_BADPAT", "invalid regex struct"
|
||||
},
|
||||
{
|
||||
REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"
|
||||
},
|
||||
{
|
||||
REG_ECTYPE, "REG_ECTYPE", "invalid character class"
|
||||
},
|
||||
{
|
||||
REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)"
|
||||
},
|
||||
{
|
||||
REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"
|
||||
},
|
||||
{
|
||||
REG_EBRACK, "REG_EBRACK", "brackets [ ] not balanced"
|
||||
},
|
||||
{
|
||||
REG_EPAREN, "REG_EPAREN", "parentheses ( ) not balanced"
|
||||
},
|
||||
{
|
||||
REG_EBRACE, "REG_EBRACE", "braces { } not balanced"
|
||||
},
|
||||
{
|
||||
REG_BADBR, "REG_BADBR", "invalid repetition count(s) in { }"
|
||||
},
|
||||
{
|
||||
REG_ERANGE, "REG_ERANGE", "invalid character range in [ ]"
|
||||
},
|
||||
{
|
||||
REG_ESPACE, "REG_ESPACE", "ran out of memory"
|
||||
},
|
||||
{
|
||||
REG_BADRPT, "REG_BADRPT", "?, *, or + operand invalid"
|
||||
},
|
||||
{
|
||||
REG_EMPTY, "REG_EMPTY", "empty expression or subexpression"
|
||||
},
|
||||
{
|
||||
REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"
|
||||
},
|
||||
{
|
||||
REG_INVARG, "REG_INVARG", "invalid argument to regex routine"
|
||||
},
|
||||
{
|
||||
0, "", "*** unknown regexp error code ***"
|
||||
}
|
||||
/* struct to map among codes, code names, and explanations */
|
||||
static struct rerr {
|
||||
int code;
|
||||
char *name;
|
||||
char *explain;
|
||||
} rerrs[] = {
|
||||
/* the actual table is built from regex.h */
|
||||
#include "regex/regerrs.h"
|
||||
{ -1, "", "oops" }, /* explanation special-cased in code */
|
||||
};
|
||||
|
||||
/*
|
||||
* regerror - the interface to error numbers
|
||||
* pg_regerror - the interface to error numbers
|
||||
*/
|
||||
/* ARGSUSED */
|
||||
size_t
|
||||
pg_regerror(int errcode, const regex_t *preg,
|
||||
char *errbuf, size_t errbuf_size)
|
||||
size_t /* actual space needed (including NUL) */
|
||||
pg_regerror(int errcode, /* error code, or REG_ATOI or REG_ITOA */
|
||||
const regex_t *preg, /* associated regex_t (unused at present) */
|
||||
char *errbuf, /* result buffer (unless errbuf_size==0) */
|
||||
size_t errbuf_size) /* available space in errbuf, can be 0 */
|
||||
{
|
||||
struct rerr *r;
|
||||
size_t len;
|
||||
int target = errcode & ~REG_ITOA;
|
||||
char *s;
|
||||
char convbuf[50];
|
||||
char *msg;
|
||||
char convbuf[sizeof(unk)+50]; /* 50 = plenty for int */
|
||||
size_t len;
|
||||
int icode;
|
||||
|
||||
if (errcode == REG_ATOI)
|
||||
s = regatoi(preg, convbuf);
|
||||
else
|
||||
{
|
||||
for (r = rerrs; r->code != 0; r++)
|
||||
if (r->code == target)
|
||||
switch (errcode) {
|
||||
case REG_ATOI: /* convert name to number */
|
||||
for (r = rerrs; r->code >= 0; r++)
|
||||
if (strcmp(r->name, errbuf) == 0)
|
||||
break;
|
||||
|
||||
if (errcode & REG_ITOA)
|
||||
{
|
||||
if (r->code != 0)
|
||||
strcpy(convbuf, r->name);
|
||||
else
|
||||
sprintf(convbuf, "REG_0x%x", target);
|
||||
assert(strlen(convbuf) < sizeof(convbuf));
|
||||
s = convbuf;
|
||||
sprintf(convbuf, "%d", r->code); /* -1 for unknown */
|
||||
msg = convbuf;
|
||||
break;
|
||||
case REG_ITOA: /* convert number to name */
|
||||
icode = atoi(errbuf); /* not our problem if this fails */
|
||||
for (r = rerrs; r->code >= 0; r++)
|
||||
if (r->code == icode)
|
||||
break;
|
||||
if (r->code >= 0)
|
||||
msg = r->name;
|
||||
else { /* unknown; tell him the number */
|
||||
sprintf(convbuf, "REG_%u", (unsigned)icode);
|
||||
msg = convbuf;
|
||||
}
|
||||
else
|
||||
s = r->explain;
|
||||
break;
|
||||
default: /* a real, normal error code */
|
||||
for (r = rerrs; r->code >= 0; r++)
|
||||
if (r->code == errcode)
|
||||
break;
|
||||
if (r->code >= 0)
|
||||
msg = r->explain;
|
||||
else { /* unknown; say so */
|
||||
sprintf(convbuf, unk, errcode);
|
||||
msg = convbuf;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
len = strlen(s) + 1;
|
||||
if (errbuf_size > 0)
|
||||
{
|
||||
len = strlen(msg) + 1; /* space needed, including NUL */
|
||||
if (errbuf_size > 0) {
|
||||
if (errbuf_size > len)
|
||||
strcpy(errbuf, s);
|
||||
else
|
||||
{
|
||||
strncpy(errbuf, s, errbuf_size - 1);
|
||||
errbuf[errbuf_size - 1] = '\0';
|
||||
strcpy(errbuf, msg);
|
||||
else { /* truncate to fit */
|
||||
strncpy(errbuf, msg, errbuf_size-1);
|
||||
errbuf[errbuf_size-1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
|
||||
* regatoi - internal routine to implement REG_ATOI
|
||||
*/
|
||||
static char *
|
||||
regatoi(const regex_t *preg, char *localbuf)
|
||||
{
|
||||
struct rerr *r;
|
||||
|
||||
for (r = rerrs; r->code != 0; r++)
|
||||
if (pg_char_and_wchar_strcmp(r->name, preg->re_endp) == 0)
|
||||
break;
|
||||
|
||||
if (r->code == 0)
|
||||
return "0";
|
||||
|
||||
sprintf(localbuf, "%d", r->code);
|
||||
return localbuf;
|
||||
}
|
||||
|
@ -1,538 +0,0 @@
|
||||
.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
.\" Copyright (c) 1992, 1993, 1994
|
||||
.\" The Regents of the University of California. All rights reserved.
|
||||
.\"
|
||||
.\" This code is derived from software contributed to Berkeley by
|
||||
.\" Henry Spencer.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\" 3. All advertising materials mentioning features or use of this software
|
||||
.\" must display the following acknowledgement:
|
||||
.\" This product includes software developed by the University of
|
||||
.\" California, Berkeley and its contributors.
|
||||
.\" 4. Neither the name of the University nor the names of its contributors
|
||||
.\" may be used to endorse or promote products derived from this software
|
||||
.\" without specific prior written permission.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" @(#)regex.3 8.4 (Berkeley) 3/20/94
|
||||
.\"
|
||||
.TH REGEX 3 "March 20, 1994"
|
||||
.de ZR
|
||||
.\" one other place knows this name: the SEE ALSO section
|
||||
.IR re_format (7) \\$1
|
||||
..
|
||||
.SH NAME
|
||||
regcomp, regexec, regerror, regfree \- regular-expression library
|
||||
.SH SYNOPSIS
|
||||
.ft B
|
||||
.\".na
|
||||
#include <sys/types.h>
|
||||
.br
|
||||
#include <regex.h>
|
||||
.HP 10
|
||||
int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags);
|
||||
.HP
|
||||
int\ regexec(const\ regex_t\ *preg, const\ char\ *string,
|
||||
size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags);
|
||||
.HP
|
||||
size_t\ regerror(int\ errcode, const\ regex_t\ *preg,
|
||||
char\ *errbuf, size_t\ errbuf_size);
|
||||
.HP
|
||||
void\ regfree(regex_t\ *preg);
|
||||
.\".ad
|
||||
.ft
|
||||
.SH DESCRIPTION
|
||||
These routines implement POSIX 1003.2 regular expressions (``RE''s);
|
||||
see
|
||||
.ZR .
|
||||
.I Regcomp
|
||||
compiles an RE written as a string into an internal form,
|
||||
.I regexec
|
||||
matches that internal form against a string and reports results,
|
||||
.I regerror
|
||||
transforms error codes from either into human-readable messages,
|
||||
and
|
||||
.I regfree
|
||||
frees any dynamically-allocated storage used by the internal form
|
||||
of an RE.
|
||||
.PP
|
||||
The header
|
||||
.I <regex.h>
|
||||
declares two structure types,
|
||||
.I regex_t
|
||||
and
|
||||
.IR regmatch_t ,
|
||||
the former for compiled internal forms and the latter for match reporting.
|
||||
It also declares the four functions,
|
||||
a type
|
||||
.IR regoff_t ,
|
||||
and a number of constants with names starting with ``REG_''.
|
||||
.PP
|
||||
.I Regcomp
|
||||
compiles the regular expression contained in the
|
||||
.I pattern
|
||||
string,
|
||||
subject to the flags in
|
||||
.IR cflags ,
|
||||
and places the results in the
|
||||
.I regex_t
|
||||
structure pointed to by
|
||||
.IR preg .
|
||||
.I Cflags
|
||||
is the bitwise OR of zero or more of the following flags:
|
||||
.IP REG_EXTENDED \w'REG_EXTENDED'u+2n
|
||||
Compile modern (``extended'') REs,
|
||||
rather than the obsolete (``basic'') REs that
|
||||
are the default.
|
||||
.IP REG_BASIC
|
||||
This is a synonym for 0,
|
||||
provided as a counterpart to REG_EXTENDED to improve readability.
|
||||
.IP REG_NOSPEC
|
||||
Compile with recognition of all special characters turned off.
|
||||
All characters are thus considered ordinary,
|
||||
so the ``RE'' is a literal string.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
REG_EXTENDED and REG_NOSPEC may not be used
|
||||
in the same call to
|
||||
.IR regcomp .
|
||||
.IP REG_ICASE
|
||||
Compile for matching that ignores upper/lower case distinctions.
|
||||
See
|
||||
.ZR .
|
||||
.IP REG_NOSUB
|
||||
Compile for matching that need only report success or failure,
|
||||
not what was matched.
|
||||
.IP REG_NEWLINE
|
||||
Compile for newline-sensitive matching.
|
||||
By default, newline is a completely ordinary character with no special
|
||||
meaning in either REs or strings.
|
||||
With this flag,
|
||||
`[^' bracket expressions and `.' never match newline,
|
||||
a `^' anchor matches the null string after any newline in the string
|
||||
in addition to its normal function,
|
||||
and the `$' anchor matches the null string before any newline in the
|
||||
string in addition to its normal function.
|
||||
.IP REG_PEND
|
||||
The regular expression ends,
|
||||
not at the first NUL,
|
||||
but just before the character pointed to by the
|
||||
.I re_endp
|
||||
member of the structure pointed to by
|
||||
.IR preg .
|
||||
The
|
||||
.I re_endp
|
||||
member is of type
|
||||
.IR const\ char\ * .
|
||||
This flag permits inclusion of NULs in the RE;
|
||||
they are considered ordinary characters.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.PP
|
||||
When successful,
|
||||
.I regcomp
|
||||
returns 0 and fills in the structure pointed to by
|
||||
.IR preg .
|
||||
One member of that structure
|
||||
(other than
|
||||
.IR re_endp )
|
||||
is publicized:
|
||||
.IR re_nsub ,
|
||||
of type
|
||||
.IR size_t ,
|
||||
contains the number of parenthesized subexpressions within the RE
|
||||
(except that the value of this member is undefined if the
|
||||
REG_NOSUB flag was used).
|
||||
If
|
||||
.I regcomp
|
||||
fails, it returns a non-zero error code;
|
||||
see DIAGNOSTICS.
|
||||
.PP
|
||||
.I Regexec
|
||||
matches the compiled RE pointed to by
|
||||
.I preg
|
||||
against the
|
||||
.IR string ,
|
||||
subject to the flags in
|
||||
.IR eflags ,
|
||||
and reports results using
|
||||
.IR nmatch ,
|
||||
.IR pmatch ,
|
||||
and the returned value.
|
||||
The RE must have been compiled by a previous invocation of
|
||||
.IR regcomp .
|
||||
The compiled form is not altered during execution of
|
||||
.IR regexec ,
|
||||
so a single compiled RE can be used simultaneously by multiple threads.
|
||||
.PP
|
||||
By default,
|
||||
the NUL-terminated string pointed to by
|
||||
.I string
|
||||
is considered to be the text of an entire line, minus any terminating
|
||||
newline.
|
||||
The
|
||||
.I eflags
|
||||
argument is the bitwise OR of zero or more of the following flags:
|
||||
.IP REG_NOTBOL \w'REG_STARTEND'u+2n
|
||||
The first character of
|
||||
the string
|
||||
is not the beginning of a line, so the `^' anchor should not match before it.
|
||||
This does not affect the behavior of newlines under REG_NEWLINE.
|
||||
.IP REG_NOTEOL
|
||||
The NUL terminating
|
||||
the string
|
||||
does not end a line, so the `$' anchor should not match before it.
|
||||
This does not affect the behavior of newlines under REG_NEWLINE.
|
||||
.IP REG_STARTEND
|
||||
The string is considered to start at
|
||||
\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR
|
||||
and to have a terminating NUL located at
|
||||
\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR
|
||||
(there need not actually be a NUL at that location),
|
||||
regardless of the value of
|
||||
.IR nmatch .
|
||||
See below for the definition of
|
||||
.IR pmatch
|
||||
and
|
||||
.IR nmatch .
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL;
|
||||
REG_STARTEND affects only the location of the string,
|
||||
not how it is matched.
|
||||
.PP
|
||||
See
|
||||
.ZR
|
||||
for a discussion of what is matched in situations where an RE or a
|
||||
portion thereof could match any of several substrings of
|
||||
.IR string .
|
||||
.PP
|
||||
Normally,
|
||||
.I regexec
|
||||
returns 0 for success and the non-zero code REG_NOMATCH for failure.
|
||||
Other non-zero error codes may be returned in exceptional situations;
|
||||
see DIAGNOSTICS.
|
||||
.PP
|
||||
If REG_NOSUB was specified in the compilation of the RE,
|
||||
or if
|
||||
.I nmatch
|
||||
is 0,
|
||||
.I regexec
|
||||
ignores the
|
||||
.I pmatch
|
||||
argument (but see below for the case where REG_STARTEND is specified).
|
||||
Otherwise,
|
||||
.I pmatch
|
||||
points to an array of
|
||||
.I nmatch
|
||||
structures of type
|
||||
.IR regmatch_t .
|
||||
Such a structure has at least the members
|
||||
.I rm_so
|
||||
and
|
||||
.IR rm_eo ,
|
||||
both of type
|
||||
.I regoff_t
|
||||
(a signed arithmetic type at least as large as an
|
||||
.I off_t
|
||||
and a
|
||||
.IR ssize_t ),
|
||||
containing respectively the offset of the first character of a substring
|
||||
and the offset of the first character after the end of the substring.
|
||||
Offsets are measured from the beginning of the
|
||||
.I string
|
||||
argument given to
|
||||
.IR regexec .
|
||||
An empty substring is denoted by equal offsets,
|
||||
both indicating the character following the empty substring.
|
||||
.PP
|
||||
The 0th member of the
|
||||
.I pmatch
|
||||
array is filled in to indicate what substring of
|
||||
.I string
|
||||
was matched by the entire RE.
|
||||
Remaining members report what substring was matched by parenthesized
|
||||
subexpressions within the RE;
|
||||
member
|
||||
.I i
|
||||
reports subexpression
|
||||
.IR i ,
|
||||
with subexpressions counted (starting at 1) by the order of their opening
|
||||
parentheses in the RE, left to right.
|
||||
Unused entries in the array\(emcorresponding either to subexpressions that
|
||||
did not participate in the match at all, or to subexpressions that do not
|
||||
exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both
|
||||
.I rm_so
|
||||
and
|
||||
.I rm_eo
|
||||
set to \-1.
|
||||
If a subexpression participated in the match several times,
|
||||
the reported substring is the last one it matched.
|
||||
(Note, as an example in particular, that when the RE `(b*)+' matches `bbb',
|
||||
the parenthesized subexpression matches each of the three `b's and then
|
||||
an infinite number of empty strings following the last `b',
|
||||
so the reported substring is one of the empties.)
|
||||
.PP
|
||||
If REG_STARTEND is specified,
|
||||
.I pmatch
|
||||
must point to at least one
|
||||
.I regmatch_t
|
||||
(even if
|
||||
.I nmatch
|
||||
is 0 or REG_NOSUB was specified),
|
||||
to hold the input offsets for REG_STARTEND.
|
||||
Use for output is still entirely controlled by
|
||||
.IR nmatch ;
|
||||
if
|
||||
.I nmatch
|
||||
is 0 or REG_NOSUB was specified,
|
||||
the value of
|
||||
.IR pmatch [0]
|
||||
will not be changed by a successful
|
||||
.IR regexec .
|
||||
.PP
|
||||
.I Regerror
|
||||
maps a non-zero
|
||||
.I errcode
|
||||
from either
|
||||
.I regcomp
|
||||
or
|
||||
.I regexec
|
||||
to a human-readable, printable message.
|
||||
If
|
||||
.I preg
|
||||
is non-NULL,
|
||||
the error code should have arisen from use of
|
||||
the
|
||||
.I regex_t
|
||||
pointed to by
|
||||
.IR preg ,
|
||||
and if the error code came from
|
||||
.IR regcomp ,
|
||||
it should have been the result from the most recent
|
||||
.I regcomp
|
||||
using that
|
||||
.IR regex_t .
|
||||
.RI ( Regerror
|
||||
may be able to supply a more detailed message using information
|
||||
from the
|
||||
.IR regex_t .)
|
||||
.I Regerror
|
||||
places the NUL-terminated message into the buffer pointed to by
|
||||
.IR errbuf ,
|
||||
limiting the length (including the NUL) to at most
|
||||
.I errbuf_size
|
||||
bytes.
|
||||
If the whole message won't fit,
|
||||
as much of it as will fit before the terminating NUL is supplied.
|
||||
In any case,
|
||||
the returned value is the size of buffer needed to hold the whole
|
||||
message (including terminating NUL).
|
||||
If
|
||||
.I errbuf_size
|
||||
is 0,
|
||||
.I errbuf
|
||||
is ignored but the return value is still correct.
|
||||
.PP
|
||||
If the
|
||||
.I errcode
|
||||
given to
|
||||
.I regerror
|
||||
is first ORed with REG_ITOA,
|
||||
the ``message'' that results is the printable name of the error code,
|
||||
e.g. ``REG_NOMATCH'',
|
||||
rather than an explanation thereof.
|
||||
If
|
||||
.I errcode
|
||||
is REG_ATOI,
|
||||
then
|
||||
.I preg
|
||||
shall be non-NULL and the
|
||||
.I re_endp
|
||||
member of the structure it points to
|
||||
must point to the printable name of an error code;
|
||||
in this case, the result in
|
||||
.I errbuf
|
||||
is the decimal digits of
|
||||
the numeric value of the error code
|
||||
(0 if the name is not recognized).
|
||||
REG_ITOA and REG_ATOI are intended primarily as debugging facilities;
|
||||
they are extensions,
|
||||
compatible with but not specified by POSIX 1003.2,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
Be warned also that they are considered experimental and changes are possible.
|
||||
.PP
|
||||
.I Regfree
|
||||
frees any dynamically-allocated storage associated with the compiled RE
|
||||
pointed to by
|
||||
.IR preg .
|
||||
The remaining
|
||||
.I regex_t
|
||||
is no longer a valid compiled RE
|
||||
and the effect of supplying it to
|
||||
.I regexec
|
||||
or
|
||||
.I regerror
|
||||
is undefined.
|
||||
.PP
|
||||
None of these functions references global variables except for tables
|
||||
of constants;
|
||||
all are safe for use from multiple threads if the arguments are safe.
|
||||
.SH IMPLEMENTATION CHOICES
|
||||
There are a number of decisions that 1003.2 leaves up to the implementor,
|
||||
either by explicitly saying ``undefined'' or by virtue of them being
|
||||
forbidden by the RE grammar.
|
||||
This implementation treats them as follows.
|
||||
.PP
|
||||
See
|
||||
.ZR
|
||||
for a discussion of the definition of case-independent matching.
|
||||
.PP
|
||||
There is no particular limit on the length of REs,
|
||||
except insofar as memory is limited.
|
||||
Memory usage is approximately linear in RE size, and largely insensitive
|
||||
to RE complexity, except for bounded repetitions.
|
||||
See BUGS for one short RE using them
|
||||
that will run almost any system out of memory.
|
||||
.PP
|
||||
A backslashed character other than one specifically given a magic meaning
|
||||
by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs)
|
||||
is taken as an ordinary character.
|
||||
.PP
|
||||
Any unmatched [ is a REG_EBRACK error.
|
||||
.PP
|
||||
Equivalence classes cannot begin or end bracket-expression ranges.
|
||||
The endpoint of one range cannot begin another.
|
||||
.PP
|
||||
RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255.
|
||||
.PP
|
||||
A repetition operator (?, *, +, or bounds) cannot follow another
|
||||
repetition operator.
|
||||
A repetition operator cannot begin an expression or subexpression
|
||||
or follow `^' or `|'.
|
||||
.PP
|
||||
`|' cannot appear first or last in a (sub)expression or after another `|',
|
||||
i.e. an operand of `|' cannot be an empty subexpression.
|
||||
An empty parenthesized subexpression, `()', is legal and matches an
|
||||
empty (sub)string.
|
||||
An empty string is not a legal RE.
|
||||
.PP
|
||||
A `{' followed by a digit is considered the beginning of bounds for a
|
||||
bounded repetition, which must then follow the syntax for bounds.
|
||||
A `{' \fInot\fR followed by a digit is considered an ordinary character.
|
||||
.PP
|
||||
`^' and `$' beginning and ending subexpressions in obsolete (``basic'')
|
||||
REs are anchors, not ordinary characters.
|
||||
.SH SEE ALSO
|
||||
grep(1), re_format(7)
|
||||
.PP
|
||||
POSIX 1003.2, sections 2.8 (Regular Expression Notation)
|
||||
and
|
||||
B.5 (C Binding for Regular Expression Matching).
|
||||
.SH DIAGNOSTICS
|
||||
Non-zero error codes from
|
||||
.I regcomp
|
||||
and
|
||||
.I regexec
|
||||
include the following:
|
||||
.PP
|
||||
.nf
|
||||
.ta \w'REG_ECOLLATE'u+3n
|
||||
REG_NOMATCH no pattern match found
|
||||
REG_BADPAT invalid regex struct
|
||||
REG_ECOLLATE invalid collating element
|
||||
REG_ECTYPE invalid character class
|
||||
REG_EESCAPE trailing backslash (\e)
|
||||
REG_ESUBREG invalid backreference number
|
||||
REG_EBRACK brackets [ ] not balanced
|
||||
REG_EPAREN parentheses ( ) not balanced
|
||||
REG_EBRACE braces { } not balanced
|
||||
REG_BADBR invalid repetition count(s) in { }
|
||||
REG_ERANGE invalid character range in [ ]
|
||||
REG_ESPACE ran out of memory
|
||||
REG_BADRPT ?, *, or + operand invalid
|
||||
REG_EMPTY empty expression or subexpression
|
||||
REG_ASSERT ``can't happen''\(emyou found a bug
|
||||
REG_INVARG invalid argument, e.g. negative-length string
|
||||
.fi
|
||||
.SH HISTORY
|
||||
Originally written by Henry Spencer.
|
||||
Altered for inclusion in the 4.4BSD distribution.
|
||||
.SH BUGS
|
||||
This is an alpha release with known defects.
|
||||
Please report problems.
|
||||
.PP
|
||||
There is one known functionality bug.
|
||||
The implementation of internationalization is incomplete:
|
||||
the locale is always assumed to be the default one of 1003.2,
|
||||
and only the collating elements etc. of that locale are available.
|
||||
.PP
|
||||
The back-reference code is subtle and doubts linger about its correctness
|
||||
in complex cases.
|
||||
.PP
|
||||
.I Regexec
|
||||
performance is poor.
|
||||
This will improve with later releases.
|
||||
.I Nmatch
|
||||
exceeding 0 is expensive;
|
||||
.I nmatch
|
||||
exceeding 1 is worse.
|
||||
.I Regexec
|
||||
is largely insensitive to RE complexity \fIexcept\fR that back
|
||||
references are massively expensive.
|
||||
RE length does matter; in particular, there is a strong speed bonus
|
||||
for keeping RE length under about 30 characters,
|
||||
with most special characters counting roughly double.
|
||||
.PP
|
||||
.I Regcomp
|
||||
implements bounded repetitions by macro expansion,
|
||||
which is costly in time and space if counts are large
|
||||
or bounded repetitions are nested.
|
||||
An RE like, say,
|
||||
`((((a{1,100}){1,100}){1,100}){1,100}){1,100}'
|
||||
will (eventually) run almost any existing machine out of swap space.
|
||||
.PP
|
||||
There are suspected problems with response to obscure error conditions.
|
||||
Notably,
|
||||
certain kinds of internal overflow,
|
||||
produced only by truly enormous REs or by multiply nested bounded repetitions,
|
||||
are probably not handled well.
|
||||
.PP
|
||||
Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is
|
||||
a special character only in the presence of a previous unmatched `('.
|
||||
This can't be fixed until the spec is fixed.
|
||||
.PP
|
||||
The standard's definition of back references is vague.
|
||||
For example, does
|
||||
`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?
|
||||
Until the standard is clarified,
|
||||
behavior in such cases should not be relied on.
|
||||
.PP
|
||||
The implementation of word-boundary matching is a bit of a kludge,
|
||||
and bugs may lurk in combinations of word-boundary matching and anchoring.
|
File diff suppressed because it is too large
Load Diff
@ -1,75 +1,54 @@
|
||||
/*-
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
/*
|
||||
* regfree - free an RE
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
|
||||
*
|
||||
* Development of this software was funded, in part, by Cray Research Inc.,
|
||||
* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
|
||||
* Corporation, none of whom are responsible for the results. The author
|
||||
* thanks all of them.
|
||||
*
|
||||
* Redistribution and use in source and binary forms -- with or without
|
||||
* modification -- are permitted for any purpose, provided that
|
||||
* redistributions in source form retain this entire copyright notice and
|
||||
* indicate the origin and nature of any modifications.
|
||||
*
|
||||
* I'd appreciate being given credit for this package in the documentation
|
||||
* of software which uses it, but that is not a requirement.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by the University of
|
||||
* California, Berkeley and its contributors.
|
||||
* 4. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/regfree.c,v 1.16 2003/02/05 17:41:33 tgl Exp $
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regfree.c 8.3 (Berkeley) 3/20/94
|
||||
* You might think that this could be incorporated into regcomp.c, and
|
||||
* that would be a reasonable idea... except that this is a generic
|
||||
* function (with a generic name), applicable to all compiled REs
|
||||
* regardless of the size of their characters, whereas the stuff in
|
||||
* regcomp.c gets compiled once per character size.
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
#include "regex/regguts.h"
|
||||
|
||||
|
||||
#include "regex/regex.h"
|
||||
#include "regex/utils.h"
|
||||
#include "regex/regex2.h"
|
||||
|
||||
/*
|
||||
* regfree - free everything
|
||||
* pg_regfree - free an RE (generic function, punts to RE-specific function)
|
||||
*
|
||||
* Ignoring invocation with NULL is a convenience.
|
||||
*/
|
||||
void
|
||||
pg_regfree(regex_t *preg)
|
||||
pg_regfree(regex_t *re)
|
||||
{
|
||||
struct re_guts *g;
|
||||
|
||||
if (preg->re_magic != MAGIC1) /* oops */
|
||||
return; /* nice to complain, but hard */
|
||||
|
||||
g = preg->re_g;
|
||||
if (g == NULL || g->magic != MAGIC2) /* oops again */
|
||||
if (re == NULL)
|
||||
return;
|
||||
preg->re_magic = 0; /* mark it invalid */
|
||||
g->magic = 0; /* mark it invalid */
|
||||
|
||||
if (preg->patsave != NULL)
|
||||
free((char *) preg->patsave);
|
||||
if (g->strip != NULL)
|
||||
free((char *) g->strip);
|
||||
if (g->sets != NULL)
|
||||
free((char *) g->sets);
|
||||
if (g->setbits != NULL)
|
||||
free((char *) g->setbits);
|
||||
if (g->must != NULL)
|
||||
free(g->must);
|
||||
free((char *) g);
|
||||
(*((struct fns *)re->re_fns)->free)(re);
|
||||
}
|
||||
|
@ -1,44 +0,0 @@
|
||||
/*
|
||||
* a simple regexp debug program
|
||||
*
|
||||
* $Header: /cvsroot/pgsql/src/backend/regex/Attic/retest.c,v 1.5 2002/06/11 15:41:37 thomas Exp $
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
#include "regex/regex.h"
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
int sts;
|
||||
regex_t re;
|
||||
char buf[1024];
|
||||
char *p;
|
||||
|
||||
printf("type in regexp string: ");
|
||||
if (!fgets(buf, sizeof(buf), stdin))
|
||||
exit(0);
|
||||
p = strchr(buf, '\n');
|
||||
if (p)
|
||||
*p = '\0';
|
||||
|
||||
sts = pg_regcomp(&re, buf, 1);
|
||||
printf("regcomp: parses \"%s\" and returns %d\n", buf, sts);
|
||||
for (;;)
|
||||
{
|
||||
printf("type in target string: ");
|
||||
if (!fgets(buf, sizeof(buf), stdin))
|
||||
exit(0);
|
||||
p = strchr(buf, '\n');
|
||||
if (p)
|
||||
*p = '\0';
|
||||
|
||||
sts = pg_regexec(&re, buf, 0, 0, 0);
|
||||
printf("regexec: returns %d\n", sts);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * elog - no-op stub
 *
 * Accepts a level, a format string and any further arguments, and does
 * nothing with them.  NOTE(review): presumably present so that code calling
 * elog() can link into this standalone test program -- confirm.
 */
void
elog(int lev, const char *fmt,...)
{
	/* intentionally empty: every argument is ignored */
	(void) lev;
	(void) fmt;
}
|
Reference in New Issue
Block a user