mirror of
https://github.com/postgres/postgres.git
synced 2025-05-02 11:44:50 +03:00
Commit 54cd4f045 added some kluges to work around an old glibc bug, namely that %.*s could misbehave if glibc thought any characters in the supplied string were incorrectly encoded. Now that we use our own snprintf.c implementation, we need not worry about that bug (even if it still exists in the wild). Revert a couple of particularly ugly hacks, and remove or improve assorted comments. Note that there can still be encoding-related hazards here: blindly clipping at a fixed length risks producing wrongly-encoded output if the clip splits a multibyte character. However, code that's doing correct multibyte-aware clipping doesn't really need a comment about that, while code that isn't needs an explanation why not, rather than a red-herring comment about an obsolete bug. Discussion: https://postgr.es/m/279428.1593373684@sss.pgh.pa.us
221 lines
5.1 KiB
C
221 lines
5.1 KiB
C
/*-------------------------------------------------------------------------
 *
 * scansup.c
 *	  support routines for the lex/flex scanner, used by both the normal
 *	  backend as well as the bootstrap backend
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/parser/scansup.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include "mb/pg_wchar.h"
|
|
#include "parser/scansup.h"
|
|
|
|
/* ----------------
 *		scanstr
 *
 * Produce a de-escaped copy of the string s.
 *
 * The following input sequences are translated:
 *	''			a doubled single quote becomes one single quote
 *	\b \f \n \r \t	the corresponding control character
 *	\o \oo \ooo	one to three octal digits; the accumulated value is
 *				cast to char, so a value that does not fit in a char
 *				is not preserved
 *	\<other>	the character following the backslash, taken literally
 * Every other character is copied unchanged.
 *
 * A NULL or empty input yields a freshly allocated empty string.
 *
 * the string returned is palloc'd and should eventually be pfree'd by the
 * caller!
 * ----------------
 */
char *
scanstr(const char *s)
{
	char	   *newStr;
	int			len,
				i,
				j;

	if (s == NULL || s[0] == '\0')
		return pstrdup("");

	len = strlen(s);

	/*
	 * Each loop iteration consumes at least one input byte and emits exactly
	 * one output byte, so the result cannot get longer than the input.
	 */
	newStr = palloc(len + 1);

	for (i = 0, j = 0; i < len; i++)
	{
		if (s[i] == '\'')
		{
			/*
			 * Note: if scanner is working right, unescaped quotes can only
			 * appear in pairs, so there should be another character.
			 */
			i++;
			/* The bootstrap parser is not as smart, so check here. */
			Assert(s[i] == '\'');
			newStr[j] = s[i];
		}
		else if (s[i] == '\\')
		{
			/*
			 * Step onto the escaped character.  If the backslash was the
			 * last input byte, s[i] is now the terminating NUL, which the
			 * default case below copies; the loop then ends.
			 */
			i++;
			switch (s[i])
			{
				case 'b':
					newStr[j] = '\b';
					break;
				case 'f':
					newStr[j] = '\f';
					break;
				case 'n':
					newStr[j] = '\n';
					break;
				case 'r':
					newStr[j] = '\r';
					break;
				case 't':
					newStr[j] = '\t';
					break;
				case '0':
				case '1':
				case '2':
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
					{
						int			k;
						long		octVal = 0;

						/*
						 * Accumulate at most three octal digits.  The count
						 * is tested first so that no byte beyond the third
						 * digit is examined.
						 */
						for (k = 0;
							 k < 3 && s[i + k] >= '0' && s[i + k] <= '7';
							 k++)
							octVal = (octVal << 3) + (s[i + k] - '0');

						/*
						 * Leave i on the last digit consumed; the loop's own
						 * i++ moves past it.
						 */
						i += k - 1;
						newStr[j] = ((char) octVal);
					}
					break;
				default:
					newStr[j] = s[i];
					break;
			}					/* switch */
		}						/* s[i] == '\\' */
		else
			newStr[j] = s[i];
		j++;
	}
	newStr[j] = '\0';
	return newStr;
}
|
|
|
|
|
|
/*
 * downcase_truncate_identifier() --- do appropriate downcasing and
 * truncation of an unquoted identifier.  Optionally warn of truncation.
 *
 * ident: the identifier text; only the first len bytes are used
 * len:   number of bytes of ident to process
 * warn:  forwarded to downcase_identifier() to request a truncation warning
 *
 * Returns a palloc'd string containing the adjusted identifier.
 *
 * Note: in some usages the passed string is not null-terminated.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	/* Same as downcase_identifier(), with truncation always enabled. */
	return downcase_identifier(ident, len, warn, true);
}
|
|
|
|
/*
|
|
* a workhorse for downcase_truncate_identifier
|
|
*/
|
|
char *
|
|
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
|
|
{
|
|
char *result;
|
|
int i;
|
|
bool enc_is_single_byte;
|
|
|
|
result = palloc(len + 1);
|
|
enc_is_single_byte = pg_database_encoding_max_length() == 1;
|
|
|
|
/*
|
|
* SQL99 specifies Unicode-aware case normalization, which we don't yet
|
|
* have the infrastructure for. Instead we use tolower() to provide a
|
|
* locale-aware translation. However, there are some locales where this
|
|
* is not right either (eg, Turkish may do strange things with 'i' and
|
|
* 'I'). Our current compromise is to use tolower() for characters with
|
|
* the high bit set, as long as they aren't part of a multi-byte
|
|
* character, and use an ASCII-only downcasing for 7-bit characters.
|
|
*/
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
unsigned char ch = (unsigned char) ident[i];
|
|
|
|
if (ch >= 'A' && ch <= 'Z')
|
|
ch += 'a' - 'A';
|
|
else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
|
|
ch = tolower(ch);
|
|
result[i] = (char) ch;
|
|
}
|
|
result[i] = '\0';
|
|
|
|
if (i >= NAMEDATALEN && truncate)
|
|
truncate_identifier(result, i, warn);
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*
|
|
* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
|
|
*
|
|
* The given string is modified in-place, if necessary. A warning is
|
|
* issued if requested.
|
|
*
|
|
* We require the caller to pass in the string length since this saves a
|
|
* strlen() call in some common usages.
|
|
*/
|
|
void
|
|
truncate_identifier(char *ident, int len, bool warn)
|
|
{
|
|
if (len >= NAMEDATALEN)
|
|
{
|
|
len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
|
|
if (warn)
|
|
ereport(NOTICE,
|
|
(errcode(ERRCODE_NAME_TOO_LONG),
|
|
errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
|
|
ident, len, ident)));
|
|
ident[len] = '\0';
|
|
}
|
|
}
|
|
|
|
/*
 * scanner_isspace() --- return true if flex scanner considers char whitespace
 *
 * ch: the character to classify
 *
 * Returns true for space, tab, newline, carriage return and form feed;
 * false for every other character.
 *
 * This should be used instead of the potentially locale-dependent isspace()
 * function when it's important to match the lexer's behavior.
 *
 * In principle we might need similar functions for isalnum etc, but for the
 * moment only isspace seems needed.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
|