mirror of
https://git.savannah.gnu.org/git/gnulib.git
synced 2025-08-17 12:41:05 +03:00
Quote multibyte characters correctly.
(ISGRAPH): Remove. (ISPRINT): New macro. (<wchar.h>): Include if HAVE_MBRTOWC && HAVE_WCHAR_H. (iswprint, mbrtowc, mbsinit, mbstate_t): New macros, defined if ! (HAVE_MBRTOWC && HAVE_WCHAR_H). (quotearg_buffer_restyled): New function, with most of the old quotearg_buffer's contents. Major rewrite to support multibyte characters. (quotearg_buffer): Now just calls quotearg_buffer_restyled.
This commit is contained in:
465
lib/quotearg.c
465
lib/quotearg.c
@@ -1,5 +1,5 @@
|
||||
/* quotearg.c - quote arguments for output
|
||||
Copyright (C) 1998, 1999 Free Software Foundation, Inc.
|
||||
Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -17,8 +17,6 @@
|
||||
|
||||
/* Written by Paul Eggert <eggert@twinsun.com> */
|
||||
|
||||
/* FIXME: Multibyte characters are not supported yet. */
|
||||
|
||||
#if HAVE_CONFIG_H
|
||||
# include <config.h>
|
||||
#endif
|
||||
@@ -33,11 +31,7 @@
|
||||
#else
|
||||
# define ISASCII(c) isascii (c)
|
||||
#endif
|
||||
#ifdef isgraph
|
||||
# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
|
||||
#else
|
||||
# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
|
||||
#endif
|
||||
#define ISPRINT(c) (ISASCII (c) && isprint (c))
|
||||
|
||||
#if ENABLE_NLS
|
||||
# include <libintl.h>
|
||||
@@ -64,6 +58,15 @@
|
||||
# include <string.h>
|
||||
#endif
|
||||
|
||||
#if HAVE_MBRTOWC && HAVE_WCHAR_H
|
||||
# include <wchar.h>
|
||||
#else
|
||||
# define iswprint(wc) 1
|
||||
# define mbrtowc(pwc, s, n, ps) 1
|
||||
# define mbsinit(ps) 1
|
||||
# define mbstate_t int
|
||||
#endif
|
||||
|
||||
#define INT_BITS (sizeof (int) * CHAR_BIT)
|
||||
|
||||
struct quoting_options
|
||||
@@ -71,7 +74,7 @@ struct quoting_options
|
||||
/* Basic quoting style. */
|
||||
enum quoting_style style;
|
||||
|
||||
/* Quote the chararacters indicated by this bit vector even if the
|
||||
/* Quote the characters indicated by this bit vector even if the
|
||||
quoting style would not normally require them to be quoted. */
|
||||
int quote_these_too[((UCHAR_MAX + 1) / INT_BITS
|
||||
+ ((UCHAR_MAX + 1) % INT_BITS != 0))];
|
||||
@@ -89,7 +92,7 @@ char const *const quoting_style_args[] =
|
||||
0
|
||||
};
|
||||
|
||||
/* Correspondances to quoting style names. */
|
||||
/* Correspondences to quoting style names. */
|
||||
enum quoting_style const quoting_style_vals[] =
|
||||
{
|
||||
literal_quoting_style,
|
||||
@@ -146,6 +149,292 @@ set_char_quoting (struct quoting_options *o, char c, int i)
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
|
||||
argument ARG (of size ARGSIZE), using QUOTING_STYLE and the
|
||||
non-quoting-style part of O to control quoting.
|
||||
Terminate the output with a null character, and return the written
|
||||
size of the output, not counting the terminating null.
|
||||
If BUFFERSIZE is too small to store the output string, return the
|
||||
value that would have been returned had BUFFERSIZE been large enough.
|
||||
If ARGSIZE is -1, use the string length of the argument for ARGSIZE.
|
||||
|
||||
This function acts like quotearg_buffer (BUFFER, BUFFERSIZE, ARG,
|
||||
ARGSIZE, O), except it uses QUOTING_STYLE instead of the quoting
|
||||
style specified by O, and O may not be null. */
|
||||
|
||||
static size_t
|
||||
quotearg_buffer_restyled (char *buffer, size_t buffersize,
|
||||
char const *arg, size_t argsize,
|
||||
enum quoting_style quoting_style,
|
||||
struct quoting_options const *o)
|
||||
{
|
||||
size_t i;
|
||||
size_t len = 0;
|
||||
char const *quote_string = 0;
|
||||
size_t quote_string_len = 0;
|
||||
int backslash_escapes = 0;
|
||||
|
||||
#define STORE(c) \
|
||||
do \
|
||||
{ \
|
||||
if (len < buffersize) \
|
||||
buffer[len] = (c); \
|
||||
len++; \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
switch (quoting_style)
|
||||
{
|
||||
case c_quoting_style:
|
||||
STORE ('"');
|
||||
backslash_escapes = 1;
|
||||
quote_string = "\"";
|
||||
quote_string_len = 1;
|
||||
break;
|
||||
|
||||
case escape_quoting_style:
|
||||
backslash_escapes = 1;
|
||||
break;
|
||||
|
||||
case locale_quoting_style:
|
||||
for (quote_string = _("`"); *quote_string; quote_string++)
|
||||
STORE (*quote_string);
|
||||
backslash_escapes = 1;
|
||||
quote_string = _("'");
|
||||
quote_string_len = strlen (quote_string);
|
||||
break;
|
||||
|
||||
case shell_always_quoting_style:
|
||||
STORE ('\'');
|
||||
quote_string = "'";
|
||||
quote_string_len = 1;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; ! (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize); i++)
|
||||
{
|
||||
unsigned char c;
|
||||
unsigned char esc;
|
||||
|
||||
if (backslash_escapes
|
||||
&& quote_string_len
|
||||
&& i + quote_string_len <= argsize
|
||||
&& memcmp (arg + i, quote_string, quote_string_len) == 0)
|
||||
STORE ('\\');
|
||||
|
||||
c = arg[i];
|
||||
switch (c)
|
||||
{
|
||||
case '?':
|
||||
switch (quoting_style)
|
||||
{
|
||||
case shell_quoting_style:
|
||||
goto use_shell_always_quoting_style;
|
||||
|
||||
case c_quoting_style:
|
||||
if (i + 2 < argsize && arg[i + 1] == '?')
|
||||
switch (arg[i + 2])
|
||||
{
|
||||
case '!': case '\'':
|
||||
case '(': case ')': case '-': case '/':
|
||||
case '<': case '=': case '>':
|
||||
/* Escape the second '?' in what would otherwise be
|
||||
a trigraph. */
|
||||
i += 2;
|
||||
c = arg[i + 2];
|
||||
STORE ('?');
|
||||
STORE ('\\');
|
||||
STORE ('?');
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
#if HAVE_C_BACKSLASH_A
|
||||
case '\a': esc = 'a'; goto c_escape;
|
||||
#endif
|
||||
case '\b': esc = 'b'; goto c_escape;
|
||||
case '\f': esc = 'f'; goto c_escape;
|
||||
case '\n': esc = 'n'; goto c_escape;
|
||||
case '\r': esc = 'r'; goto c_escape;
|
||||
case '\t': esc = 't'; goto c_escape;
|
||||
case '\v': esc = 'v'; goto c_escape;
|
||||
case '\\': esc = c; goto c_escape;
|
||||
|
||||
c_escape:
|
||||
if (backslash_escapes)
|
||||
{
|
||||
c = esc;
|
||||
goto store_escape;
|
||||
}
|
||||
if (quoting_style == shell_quoting_style)
|
||||
goto use_shell_always_quoting_style;
|
||||
break;
|
||||
|
||||
case '#': case '~':
|
||||
if (i != 0)
|
||||
break;
|
||||
/* Fall through. */
|
||||
case ' ':
|
||||
case '!': /* special in bash */
|
||||
case '"': case '$': case '&':
|
||||
case '(': case ')': case '*': case ';':
|
||||
case '<': case '>': case '[':
|
||||
case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
|
||||
case '`': case '|':
|
||||
/* A shell special character. In theory, '$' and '`' could
|
||||
be the first bytes of multibyte characters, which means
|
||||
we should check them with mbrtowc, but in practice this
|
||||
doesn't happen so it's not worth worrying about. */
|
||||
if (quoting_style == shell_quoting_style)
|
||||
goto use_shell_always_quoting_style;
|
||||
break;
|
||||
|
||||
case '\'':
|
||||
switch (quoting_style)
|
||||
{
|
||||
case shell_quoting_style:
|
||||
goto use_shell_always_quoting_style;
|
||||
|
||||
case shell_always_quoting_style:
|
||||
STORE ('\'');
|
||||
STORE ('\\');
|
||||
STORE ('\'');
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case '%': case '+': case ',': case '-': case '.': case '/':
|
||||
case '0': case '1': case '2': case '3': case '4': case '5':
|
||||
case '6': case '7': case '8': case '9': case ':': case '=':
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
|
||||
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
|
||||
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
|
||||
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
|
||||
case 'Y': case 'Z': case ']': case '_': case 'a': case 'b':
|
||||
case 'c': case 'd': case 'e': case 'f': case 'g': case 'h':
|
||||
case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
|
||||
case 'o': case 'p': case 'q': case 'r': case 's': case 't':
|
||||
case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
|
||||
case '{': case '}':
|
||||
/* These characters don't cause problems, no matter what the
|
||||
quoting style is. They cannot start multibyte sequences. */
|
||||
break;
|
||||
|
||||
default:
|
||||
/* If we have a multibyte sequence, copy it until we reach
|
||||
its end, find an error, or come back to the initial shift
|
||||
state. For C-like styles, if the sequence has
|
||||
unprintable characters, escape the whole sequence, since
|
||||
we can't easily escape single characters within it. */
|
||||
{
|
||||
/* Length of multibyte sequence found so far. */
|
||||
size_t m = 0;
|
||||
|
||||
int printable = 1;
|
||||
mbstate_t mbstate;
|
||||
memset (&mbstate, 0, sizeof mbstate);
|
||||
|
||||
if (argsize == (size_t) -1)
|
||||
argsize = strlen (arg);
|
||||
|
||||
do
|
||||
{
|
||||
wchar_t w;
|
||||
size_t bytes = mbrtowc (&w, &arg[i + m],
|
||||
argsize - (i + m), &mbstate);
|
||||
if (bytes == 0)
|
||||
break;
|
||||
else if (bytes == (size_t) -1)
|
||||
{
|
||||
printable = 0;
|
||||
break;
|
||||
}
|
||||
else if (bytes == (size_t) -2)
|
||||
{
|
||||
printable = 0;
|
||||
while (i + m < argsize && arg[i + m])
|
||||
m++;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (! iswprint (w))
|
||||
printable = 0;
|
||||
m += bytes;
|
||||
}
|
||||
}
|
||||
while (! mbsinit (&mbstate));
|
||||
|
||||
if (m <= 1)
|
||||
{
|
||||
/* Escape a unibyte character like a multibyte
|
||||
sequence if using backslash escapes, and if the
|
||||
character is not printable. */
|
||||
m = backslash_escapes && ! ISPRINT (c);
|
||||
printable = 0;
|
||||
}
|
||||
|
||||
if (m)
|
||||
{
|
||||
/* Output a multibyte sequence, or an escaped
|
||||
unprintable unibyte character. */
|
||||
size_t imax = i + m - 1;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (backslash_escapes && ! printable)
|
||||
{
|
||||
STORE ('\\');
|
||||
STORE ('0' + (c >> 6));
|
||||
STORE ('0' + ((c >> 3) & 7));
|
||||
c = '0' + (c & 7);
|
||||
}
|
||||
if (i == imax)
|
||||
break;
|
||||
STORE (c);
|
||||
c = arg[++i];
|
||||
}
|
||||
|
||||
goto store_c;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (! (backslash_escapes
|
||||
&& o->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
|
||||
goto store_c;
|
||||
|
||||
store_escape:
|
||||
STORE ('\\');
|
||||
|
||||
store_c:
|
||||
STORE (c);
|
||||
}
|
||||
|
||||
if (quote_string)
|
||||
for (; *quote_string; quote_string++)
|
||||
STORE (*quote_string);
|
||||
|
||||
if (len < buffersize)
|
||||
buffer[len] = '\0';
|
||||
return len;
|
||||
|
||||
use_shell_always_quoting_style:
|
||||
return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
|
||||
shell_always_quoting_style, o);
|
||||
}
|
||||
|
||||
/* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
|
||||
argument ARG (of size ARGSIZE), using O to control quoting.
|
||||
If O is null, use the default.
|
||||
@@ -159,161 +448,9 @@ quotearg_buffer (char *buffer, size_t buffersize,
|
||||
char const *arg, size_t argsize,
|
||||
struct quoting_options const *o)
|
||||
{
|
||||
unsigned char c;
|
||||
size_t i;
|
||||
size_t len = 0;
|
||||
char const *quote_string;
|
||||
size_t quote_string_len;
|
||||
struct quoting_options const *p = o ? o : &default_quoting_options;
|
||||
enum quoting_style quoting_style = p->style;
|
||||
#define STORE(c) \
|
||||
do \
|
||||
{ \
|
||||
if (len < buffersize) \
|
||||
buffer[len] = (c); \
|
||||
len++; \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
switch (quoting_style)
|
||||
{
|
||||
case shell_quoting_style:
|
||||
if (! (argsize == (size_t) -1 ? arg[0] == '\0' : argsize == 0))
|
||||
{
|
||||
switch (arg[0])
|
||||
{
|
||||
case '#': case '~':
|
||||
break;
|
||||
|
||||
default:
|
||||
for (i = 0; ; i++)
|
||||
{
|
||||
if (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize)
|
||||
goto done;
|
||||
|
||||
c = arg[i];
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case '\t': case '\n': case ' ':
|
||||
case '!': /* special in csh */
|
||||
case '"': case '$': case '&': case '\'':
|
||||
case '(': case ')': case '*': case ';':
|
||||
case '<': case '>': case '?': case '[': case '\\':
|
||||
case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
|
||||
case '`': case '|':
|
||||
goto needs_quoting;
|
||||
}
|
||||
|
||||
if (p->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS)))
|
||||
goto needs_quoting;
|
||||
|
||||
STORE (c);
|
||||
}
|
||||
needs_quoting:;
|
||||
|
||||
len = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Fall through. */
|
||||
|
||||
case shell_always_quoting_style:
|
||||
STORE ('\'');
|
||||
quote_string = "'";
|
||||
quote_string_len = 1;
|
||||
break;
|
||||
|
||||
case c_quoting_style:
|
||||
STORE ('"');
|
||||
quote_string = "\"";
|
||||
quote_string_len = 1;
|
||||
break;
|
||||
|
||||
case locale_quoting_style:
|
||||
for (quote_string = _("`"); *quote_string; quote_string++)
|
||||
STORE (*quote_string);
|
||||
quote_string = _("'");
|
||||
quote_string_len = strlen (quote_string);
|
||||
break;
|
||||
|
||||
default:
|
||||
quote_string = 0;
|
||||
quote_string_len = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; ! (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize); i++)
|
||||
{
|
||||
c = arg[i];
|
||||
|
||||
switch (quoting_style)
|
||||
{
|
||||
case literal_quoting_style:
|
||||
break;
|
||||
|
||||
case shell_quoting_style:
|
||||
case shell_always_quoting_style:
|
||||
if (c == '\'')
|
||||
{
|
||||
STORE ('\'');
|
||||
STORE ('\\');
|
||||
STORE ('\'');
|
||||
}
|
||||
break;
|
||||
|
||||
case c_quoting_style:
|
||||
case escape_quoting_style:
|
||||
case locale_quoting_style:
|
||||
switch (c)
|
||||
{
|
||||
case '?': /* Do not generate trigraphs. */
|
||||
case '\\': goto store_escape;
|
||||
/* Not all C compilers know what \a means. */
|
||||
case 7 : c = 'a'; goto store_escape;
|
||||
case '\b': c = 'b'; goto store_escape;
|
||||
case '\f': c = 'f'; goto store_escape;
|
||||
case '\n': c = 'n'; goto store_escape;
|
||||
case '\r': c = 'r'; goto store_escape;
|
||||
case '\t': c = 't'; goto store_escape;
|
||||
case '\v': c = 'v'; goto store_escape;
|
||||
|
||||
case ' ': break;
|
||||
|
||||
default:
|
||||
if (quote_string_len
|
||||
&& strncmp (arg + i, quote_string, quote_string_len) == 0)
|
||||
goto store_escape;
|
||||
if (!ISGRAPH (c))
|
||||
{
|
||||
STORE ('\\');
|
||||
STORE ('0' + (c >> 6));
|
||||
STORE ('0' + ((c >> 3) & 7));
|
||||
c = '0' + (c & 7);
|
||||
goto store_c;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (! (p->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
|
||||
goto store_c;
|
||||
|
||||
store_escape:
|
||||
STORE ('\\');
|
||||
}
|
||||
|
||||
store_c:
|
||||
STORE (c);
|
||||
}
|
||||
|
||||
if (quote_string)
|
||||
for (; *quote_string; quote_string++)
|
||||
STORE (*quote_string);
|
||||
|
||||
done:
|
||||
if (len < buffersize)
|
||||
buffer[len] = '\0';
|
||||
return len;
|
||||
return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
|
||||
p->style, p);
|
||||
}
|
||||
|
||||
/* Use storage slot N to return a quoted version of the string ARG.
|
||||
|
Reference in New Issue
Block a user