1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-18 04:29:09 +03:00

Speed up byteain by not parsing traditional-style input twice.

Instead of laboriously computing the exact output length, use strlen
to get an upper bound cheaply: every escape sequence consumes at least
two input bytes per output byte, so the output can never be longer
than the input.  (This is still O(N) of course, but the constant
factor is much smaller.)  This will typically result in overallocating
the output datum, but that's of little concern since it's a
short-lived allocation in just about all use-cases.

A simple microbenchmark showed about 40% speedup for long input
strings.

While here, make some cosmetic cleanups and add a test case that
covers the double-backslash code path in byteain and byteaout.

Author: Steven Niu <niushiji@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Stepan Neretin <slpmcf@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/ca315729-140b-426e-81a6-6cd5cfe7ecc5@gmail.com
This commit is contained in:
Tom Lane
2025-07-18 16:42:02 -04:00
parent 84409ed640
commit 3683af6170
3 changed files with 39 additions and 54 deletions

View File

@@ -182,27 +182,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
* *
* Non-printable characters must be passed as '\nnn' (octal) and are * Non-printable characters must be passed as '\nnn' (octal) and are
* converted to internal form. '\' must be passed as '\\'. * converted to internal form. '\' must be passed as '\\'.
* ereport(ERROR, ...) if bad form.
*
* BUGS:
* The input is scanned twice.
* The error checking of input is minimal.
*/ */
Datum Datum
byteain(PG_FUNCTION_ARGS) byteain(PG_FUNCTION_ARGS)
{ {
char *inputText = PG_GETARG_CSTRING(0); char *inputText = PG_GETARG_CSTRING(0);
Node *escontext = fcinfo->context; Node *escontext = fcinfo->context;
size_t len = strlen(inputText);
size_t bc;
char *tp; char *tp;
char *rp; char *rp;
int bc;
bytea *result; bytea *result;
/* Recognize hex input */ /* Recognize hex input */
if (inputText[0] == '\\' && inputText[1] == 'x') if (inputText[0] == '\\' && inputText[1] == 'x')
{ {
size_t len = strlen(inputText);
bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
result = palloc(bc); result = palloc(bc);
bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
@@ -213,18 +207,33 @@ byteain(PG_FUNCTION_ARGS)
} }
/* Else, it's the traditional escaped style */ /* Else, it's the traditional escaped style */
for (bc = 0, tp = inputText; *tp != '\0'; bc++) result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */
tp = inputText;
rp = VARDATA(result);
while (*tp != '\0')
{ {
if (tp[0] != '\\') if (tp[0] != '\\')
tp++; *rp++ = *tp++;
else if ((tp[0] == '\\') && else if ((tp[1] >= '0' && tp[1] <= '3') &&
(tp[1] >= '0' && tp[1] <= '3') &&
(tp[2] >= '0' && tp[2] <= '7') && (tp[2] >= '0' && tp[2] <= '7') &&
(tp[3] >= '0' && tp[3] <= '7')) (tp[3] >= '0' && tp[3] <= '7'))
{
int v;
v = VAL(tp[1]);
v <<= 3;
v += VAL(tp[2]);
v <<= 3;
*rp++ = v + VAL(tp[3]);
tp += 4; tp += 4;
else if ((tp[0] == '\\') && }
(tp[1] == '\\')) else if (tp[1] == '\\')
{
*rp++ = '\\';
tp += 2; tp += 2;
}
else else
{ {
/* /*
@@ -236,46 +245,8 @@ byteain(PG_FUNCTION_ARGS)
} }
} }
bc += VARHDRSZ; bc = rp - VARDATA(result); /* actual length */
SET_VARSIZE(result, bc + VARHDRSZ);
result = (bytea *) palloc(bc);
SET_VARSIZE(result, bc);
tp = inputText;
rp = VARDATA(result);
while (*tp != '\0')
{
if (tp[0] != '\\')
*rp++ = *tp++;
else if ((tp[0] == '\\') &&
(tp[1] >= '0' && tp[1] <= '3') &&
(tp[2] >= '0' && tp[2] <= '7') &&
(tp[3] >= '0' && tp[3] <= '7'))
{
bc = VAL(tp[1]);
bc <<= 3;
bc += VAL(tp[2]);
bc <<= 3;
*rp++ = bc + VAL(tp[3]);
tp += 4;
}
else if ((tp[0] == '\\') &&
(tp[1] == '\\'))
{
*rp++ = '\\';
tp += 2;
}
else
{
/*
* We should never get here. The first pass should not allow it.
*/
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type %s", "bytea")));
}
}
PG_RETURN_BYTEA_P(result); PG_RETURN_BYTEA_P(result);
} }

View File

@@ -236,6 +236,12 @@ SELECT E'De\\678dBeEf'::bytea;
ERROR: invalid input syntax for type bytea ERROR: invalid input syntax for type bytea
LINE 1: SELECT E'De\\678dBeEf'::bytea; LINE 1: SELECT E'De\\678dBeEf'::bytea;
^ ^
SELECT E'DeAd\\\\BeEf'::bytea;
bytea
----------------------
\x446541645c42654566
(1 row)
SELECT reverse(''::bytea); SELECT reverse(''::bytea);
reverse reverse
--------- ---------
@@ -291,6 +297,12 @@ SELECT E'De\\123dBeEf'::bytea;
DeSdBeEf DeSdBeEf
(1 row) (1 row)
SELECT E'DeAd\\\\BeEf'::bytea;
bytea
------------
DeAd\\BeEf
(1 row)
-- Test non-error-throwing API too -- Test non-error-throwing API too
SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea');
pg_input_is_valid pg_input_is_valid

View File

@@ -76,6 +76,7 @@ SELECT E'De\\000dBeEf'::bytea;
SELECT E'De\123dBeEf'::bytea; SELECT E'De\123dBeEf'::bytea;
SELECT E'De\\123dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea;
SELECT E'De\\678dBeEf'::bytea; SELECT E'De\\678dBeEf'::bytea;
SELECT E'DeAd\\\\BeEf'::bytea;
SELECT reverse(''::bytea); SELECT reverse(''::bytea);
SELECT reverse('\xaa'::bytea); SELECT reverse('\xaa'::bytea);
@@ -88,6 +89,7 @@ SELECT E'\\xDe00BeEf'::bytea;
SELECT E'DeAdBeEf'::bytea; SELECT E'DeAdBeEf'::bytea;
SELECT E'De\\000dBeEf'::bytea; SELECT E'De\\000dBeEf'::bytea;
SELECT E'De\\123dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea;
SELECT E'DeAd\\\\BeEf'::bytea;
-- Test non-error-throwing API too -- Test non-error-throwing API too
SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea');