mirror of
https://github.com/postgres/postgres.git
synced 2025-06-16 06:01:02 +03:00
Improve to_date/to_number/to_timestamp behavior with multibyte characters.
The documentation says that these functions skip one input character
per literal (non-pattern) format character. Actually, though, they
skipped one input *byte* per literal *byte*, which could be hugely
confusing if either data or format contained multibyte characters.
To fix, adjust the FormatNode representation and parse_format() so
that multibyte format characters are stored as one FormatNode not
several, and adjust the data-skipping bits to advance by pg_mblen()
not necessarily one byte. There's no user-visible behavior change
on the to_char() side, although the internal representation changes.
Commit e87d4965b had already fixed most places where we skip characters
on the basis of non-literal format patterns to advance by characters
not bytes, but this gets one more place, the SKIP_THth macro. I think
everything in formatting.c gets that right now.
It'd be nice to have some regression test cases covering this behavior;
but of course there's no way to do so in an encoding-agnostic way, and
many of the interesting aspects would also require unportable locale
selections. So I've not bothered here.
Discussion: https://postgr.es/m/28186.1510957703@sss.pgh.pa.us
This commit is contained in:
@ -151,8 +151,6 @@ typedef enum
|
|||||||
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
|
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
|
||||||
} FromCharDateMode;
|
} FromCharDateMode;
|
||||||
|
|
||||||
typedef struct FormatNode FormatNode;
|
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
const char *name;
|
const char *name;
|
||||||
@ -162,13 +160,13 @@ typedef struct
|
|||||||
FromCharDateMode date_mode;
|
FromCharDateMode date_mode;
|
||||||
} KeyWord;
|
} KeyWord;
|
||||||
|
|
||||||
struct FormatNode
|
typedef struct
|
||||||
{
|
{
|
||||||
int type; /* node type */
|
int type; /* NODE_TYPE_XXX, see below */
|
||||||
const KeyWord *key; /* if node type is KEYWORD */
|
const KeyWord *key; /* if type is ACTION */
|
||||||
char character; /* if node type is CHAR */
|
char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */
|
||||||
int suffix; /* keyword suffix */
|
int suffix; /* keyword prefix/suffix code, if any */
|
||||||
};
|
} FormatNode;
|
||||||
|
|
||||||
#define NODE_TYPE_END 1
|
#define NODE_TYPE_END 1
|
||||||
#define NODE_TYPE_ACTION 2
|
#define NODE_TYPE_ACTION 2
|
||||||
@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
|
|||||||
}
|
}
|
||||||
else if (*str)
|
else if (*str)
|
||||||
{
|
{
|
||||||
|
int chlen;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Process double-quoted literal string, if any
|
* Process double-quoted literal string, if any
|
||||||
*/
|
*/
|
||||||
if (*str == '"')
|
if (*str == '"')
|
||||||
{
|
{
|
||||||
while (*(++str))
|
str++;
|
||||||
|
while (*str)
|
||||||
{
|
{
|
||||||
if (*str == '"')
|
if (*str == '"')
|
||||||
{
|
{
|
||||||
@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
|
|||||||
/* backslash quotes the next character, if any */
|
/* backslash quotes the next character, if any */
|
||||||
if (*str == '\\' && *(str + 1))
|
if (*str == '\\' && *(str + 1))
|
||||||
str++;
|
str++;
|
||||||
|
chlen = pg_mblen(str);
|
||||||
n->type = NODE_TYPE_CHAR;
|
n->type = NODE_TYPE_CHAR;
|
||||||
n->character = *str;
|
memcpy(n->character, str, chlen);
|
||||||
|
n->character[chlen] = '\0';
|
||||||
n->key = NULL;
|
n->key = NULL;
|
||||||
n->suffix = 0;
|
n->suffix = 0;
|
||||||
n++;
|
n++;
|
||||||
|
str += chlen;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
|
|||||||
*/
|
*/
|
||||||
if (*str == '\\' && *(str + 1) == '"')
|
if (*str == '\\' && *(str + 1) == '"')
|
||||||
str++;
|
str++;
|
||||||
|
chlen = pg_mblen(str);
|
||||||
n->type = NODE_TYPE_CHAR;
|
n->type = NODE_TYPE_CHAR;
|
||||||
n->character = *str;
|
memcpy(n->character, str, chlen);
|
||||||
|
n->character[chlen] = '\0';
|
||||||
n->key = NULL;
|
n->key = NULL;
|
||||||
n->suffix = 0;
|
n->suffix = 0;
|
||||||
n++;
|
n++;
|
||||||
str++;
|
str += chlen;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
|
|||||||
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
|
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
|
||||||
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
|
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
|
||||||
else if (n->type == NODE_TYPE_CHAR)
|
else if (n->type == NODE_TYPE_CHAR)
|
||||||
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
|
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
|
||||||
|
a, n->character);
|
||||||
else if (n->type == NODE_TYPE_END)
|
else if (n->type == NODE_TYPE_END)
|
||||||
{
|
{
|
||||||
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
|
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
|
||||||
@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
|
|||||||
do { \
|
do { \
|
||||||
if (S_THth(_suf)) \
|
if (S_THth(_suf)) \
|
||||||
{ \
|
{ \
|
||||||
if (*(ptr)) (ptr)++; \
|
if (*(ptr)) (ptr) += pg_mblen(ptr); \
|
||||||
if (*(ptr)) (ptr)++; \
|
if (*(ptr)) (ptr) += pg_mblen(ptr); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
else if (isdigit((unsigned char) n->character))
|
else if (n->character[1] == '\0' &&
|
||||||
|
isdigit((unsigned char) n->character[0]))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
return true; /* some non-digit input (separator) */
|
return true; /* some non-digit input (separator) */
|
||||||
@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
|
|||||||
{
|
{
|
||||||
if (n->type != NODE_TYPE_ACTION)
|
if (n->type != NODE_TYPE_ACTION)
|
||||||
{
|
{
|
||||||
*s = n->character;
|
strcpy(s, n->character);
|
||||||
s++;
|
s += strlen(s);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
|
|||||||
* we don't insist that the consumed character match the format's
|
* we don't insist that the consumed character match the format's
|
||||||
* character.
|
* character.
|
||||||
*/
|
*/
|
||||||
s++;
|
s += pg_mblen(s);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
|
|||||||
/*
|
/*
|
||||||
* These macros are used in NUM_processor() and its subsidiary routines.
|
* These macros are used in NUM_processor() and its subsidiary routines.
|
||||||
* OVERLOAD_TEST: true if we've reached end of input string
|
* OVERLOAD_TEST: true if we've reached end of input string
|
||||||
* AMOUNT_TEST(s): true if at least s characters remain in string
|
* AMOUNT_TEST(s): true if at least s bytes remain in string
|
||||||
*/
|
*/
|
||||||
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
|
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
|
||||||
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
|
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
|
||||||
@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
|
|||||||
if (!Np->is_to_char)
|
if (!Np->is_to_char)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Check at least one character remains to be scanned. (In
|
* Check at least one byte remains to be scanned. (In actions
|
||||||
* actions below, must use AMOUNT_TEST if we want to read more
|
* below, must use AMOUNT_TEST if we want to read more bytes than
|
||||||
* characters than that.)
|
* that.)
|
||||||
*/
|
*/
|
||||||
if (OVERLOAD_TEST)
|
if (OVERLOAD_TEST)
|
||||||
break;
|
break;
|
||||||
@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
|
|||||||
* In TO_CHAR, non-pattern characters in the format are copied to
|
* In TO_CHAR, non-pattern characters in the format are copied to
|
||||||
* the output. In TO_NUMBER, we skip one input character for each
|
* the output. In TO_NUMBER, we skip one input character for each
|
||||||
* non-pattern format character, whether or not it matches the
|
* non-pattern format character, whether or not it matches the
|
||||||
* format character. (Currently, that's actually implemented as
|
* format character.
|
||||||
* skipping one input byte per non-pattern format byte, which is
|
|
||||||
* wrong...)
|
|
||||||
*/
|
*/
|
||||||
if (Np->is_to_char)
|
if (Np->is_to_char)
|
||||||
*Np->inout_p = n->character;
|
{
|
||||||
|
strcpy(Np->inout_p, n->character);
|
||||||
|
Np->inout_p += strlen(Np->inout_p);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Np->inout_p += pg_mblen(Np->inout_p);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
Np->inout_p++;
|
Np->inout_p++;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user