From e1d917182c1953b16b32a39ed2fe38e3d0823047 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Sat, 20 Sep 2025 23:19:32 +0200 Subject: [PATCH] Add support for base64url encoding and decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for base64url encoding and decoding, a base64 variant which is safe to use in filenames and URLs. base64url replaces '+' in the base64 alphabet with '-' and '/' with '_', thus making it safe for URL addresses and file systems. Support for base64url was originally suggested by Przemysław Sztoch. Author: Florents Tselai Reviewed-by: Aleksander Alekseev Reviewed-by: David E. Wheeler Reviewed-by: Masahiko Sawada Reviewed-by: Daniel Gustafsson Reviewed-by: Chao Li (Evan) Discussion: https://postgr.es/m/70f2b6a8-486a-4fdb-a951-84cef35e22ab@sztoch.pl --- doc/src/sgml/func/func-binarystring.sgml | 19 +++ src/backend/utils/adt/encode.c | 157 ++++++++++++++++++++--- src/test/regress/expected/strings.out | 150 ++++++++++++++++++++++ src/test/regress/sql/strings.sql | 54 ++++++++ 4 files changed, 359 insertions(+), 21 deletions(-) diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml index 78814ee0685..9bab965f288 100644 --- a/doc/src/sgml/func/func-binarystring.sgml +++ b/doc/src/sgml/func/func-binarystring.sgml @@ -728,6 +728,7 @@ Encodes binary data into a textual representation; supported format values are: base64, + base64url, escape, hex. @@ -785,6 +786,24 @@ + + base64url + + base64url format + + + + The base64url format is that of + + RFC 4648 Section 5, a base64 variant safe to + use in filenames and URLs. The base64url alphabet + uses '-' instead of '+' and + '_' instead of '/' and also omits + the '=' padding character. 
+ + + + escape diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index 4ccaed815d1..9a9c7e8da99 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -267,12 +267,15 @@ hex_dec_len(const char *src, size_t srclen) } /* - * BASE64 + * BASE64 and BASE64URL */ static const char _base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char _base64url[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + static const int8 b64lookup[128] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -284,8 +287,15 @@ static const int8 b64lookup[128] = { 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, }; +/* + * pg_base64_encode_internal + * + * Helper for encoding base64 or base64url. When url is passed as true the + * input will be encoded using base64url. len bytes in src is encoded into + * dst. + */ static uint64 -pg_base64_encode(const char *src, size_t len, char *dst) +pg_base64_encode_internal(const char *src, size_t len, char *dst, bool url) { char *p, *lend = dst + 76; @@ -293,6 +303,7 @@ pg_base64_encode(const char *src, size_t len, char *dst) *end = src + len; int pos = 2; uint32 buf = 0; + const char *alphabet = url ? 
_base64url : _base64; s = src; p = dst; @@ -306,33 +317,64 @@ pg_base64_encode(const char *src, size_t len, char *dst) /* write it out */ if (pos < 0) { - *p++ = _base64[(buf >> 18) & 0x3f]; - *p++ = _base64[(buf >> 12) & 0x3f]; - *p++ = _base64[(buf >> 6) & 0x3f]; - *p++ = _base64[buf & 0x3f]; + *p++ = alphabet[(buf >> 18) & 0x3f]; + *p++ = alphabet[(buf >> 12) & 0x3f]; + *p++ = alphabet[(buf >> 6) & 0x3f]; + *p++ = alphabet[buf & 0x3f]; pos = 2; buf = 0; - } - if (p >= lend) - { - *p++ = '\n'; - lend = p + 76; + + if (!url && p >= lend) + { + *p++ = '\n'; + lend = p + 76; + } } } + + /* Handle remaining bytes in buf */ if (pos != 2) { - *p++ = _base64[(buf >> 18) & 0x3f]; - *p++ = _base64[(buf >> 12) & 0x3f]; - *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '='; - *p++ = '='; + *p++ = alphabet[(buf >> 18) & 0x3f]; + *p++ = alphabet[(buf >> 12) & 0x3f]; + + if (pos == 0) + { + *p++ = alphabet[(buf >> 6) & 0x3f]; + if (!url) + *p++ = '='; + } + else if (!url) + { + *p++ = '='; + *p++ = '='; + } } return p - dst; } static uint64 -pg_base64_decode(const char *src, size_t len, char *dst) +pg_base64_encode(const char *src, size_t len, char *dst) +{ + return pg_base64_encode_internal(src, len, dst, false); +} + +static uint64 +pg_base64url_encode(const char *src, size_t len, char *dst) +{ + return pg_base64_encode_internal(src, len, dst, true); +} + +/* + * pg_base64_decode_internal + * + * Helper for decoding base64 or base64url. When url is passed as true the + * input will be assumed to be encoded using base64url. 
+ */ +static uint64 +pg_base64_decode_internal(const char *src, size_t len, char *dst, bool url) { const char *srcend = src + len, *s = src; @@ -350,6 +392,15 @@ pg_base64_decode(const char *src, size_t len, char *dst) if (c == ' ' || c == '\t' || c == '\n' || c == '\r') continue; + /* convert base64url to base64 */ + if (url) + { + if (c == '-') + c = '+'; + else if (c == '_') + c = '/'; + } + if (c == '=') { /* end sequence */ @@ -360,9 +411,12 @@ pg_base64_decode(const char *src, size_t len, char *dst) else if (pos == 3) end = 2; else + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unexpected \"=\" while decoding base64 sequence"))); + errmsg("unexpected \"=\" while decoding %s sequence", url ? "base64url" : "base64"))); + } } b = 0; } @@ -372,10 +426,14 @@ pg_base64_decode(const char *src, size_t len, char *dst) if (c > 0 && c < 127) b = b64lookup[(unsigned char) c]; if (b < 0) + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence", - pg_mblen(s - 1), s - 1))); + errmsg("invalid symbol \"%.*s\" found while decoding %s sequence", + pg_mblen(s - 1), s - 1, + url ? "base64url" : "base64"))); + } } /* add it to buffer */ buf = (buf << 6) + b; @@ -392,15 +450,40 @@ pg_base64_decode(const char *src, size_t len, char *dst) } } - if (pos != 0) + if (pos == 2) + { + buf <<= 12; + *p++ = (buf >> 16) & 0xFF; + } + else if (pos == 3) + { + buf <<= 6; + *p++ = (buf >> 16) & 0xFF; + *p++ = (buf >> 8) & 0xFF; + } + else if (pos != 0) + { + /* translator: %s is the name of an encoding scheme */ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid base64 end sequence"), + errmsg("invalid %s end sequence", url ? 
"base64url" : "base64"), errhint("Input data is missing padding, is truncated, or is otherwise corrupted."))); + } return p - dst; } +static uint64 +pg_base64_decode(const char *src, size_t len, char *dst) +{ + return pg_base64_decode_internal(src, len, dst, false); +} + +static uint64 +pg_base64url_decode(const char *src, size_t len, char *dst) +{ + return pg_base64_decode_internal(src, len, dst, true); +} static uint64 pg_base64_enc_len(const char *src, size_t srclen) @@ -415,6 +498,32 @@ pg_base64_dec_len(const char *src, size_t srclen) return ((uint64) srclen * 3) >> 2; } +static uint64 +pg_base64url_enc_len(const char *src, size_t srclen) +{ + /* + * Unlike standard base64, base64url doesn't use padding characters when + * the input length is not divisible by 3 + */ + return (srclen + 2) / 3 * 4; +} + +static uint64 +pg_base64url_dec_len(const char *src, size_t srclen) +{ + /* + * For base64, each 4 characters of input produce at most 3 bytes of + * output. For base64url without padding, we need to round up to the + * nearest 4 + */ + size_t adjusted_len = srclen; + + if (srclen % 4 != 0) + adjusted_len += 4 - (srclen % 4); + + return (adjusted_len * 3) / 4; +} + /* * Escape * Minimally escape bytea to text. 
@@ -606,6 +715,12 @@ static const struct pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode } }, + { + "base64url", + { + pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode + } + }, { "escape", { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 2d6cb02ad60..691e475bce3 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2517,6 +2517,156 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape'); \x1234567890abcdef00 (1 row) +-- +-- base64url encoding/decoding +-- +SET bytea_output TO hex; +-- Simple encoding/decoding +SELECT encode('\x69b73eff', 'base64url'); -- abc-_w + encode +-------- + abc-_w +(1 row) + +SELECT decode('abc-_w', 'base64url'); -- \x69b73eff + decode +------------ + \x69b73eff +(1 row) + +-- Round-trip: decode(encode(x)) = x +SELECT decode(encode('\x1234567890abcdef00', 'base64url'), 'base64url'); -- \x1234567890abcdef00 + decode +---------------------- + \x1234567890abcdef00 +(1 row) + +-- Empty input +SELECT encode('', 'base64url'); -- '' + encode +-------- + +(1 row) + +SELECT decode('', 'base64url'); -- '' + decode +-------- + \x +(1 row) + +-- 1 byte input +SELECT encode('\x01', 'base64url'); -- AQ + encode +-------- + AQ +(1 row) + +SELECT decode('AQ', 'base64url'); -- \x01 + decode +-------- + \x01 +(1 row) + +-- 2 byte input +SELECT encode('\x0102'::bytea, 'base64url'); -- AQI + encode +-------- + AQI +(1 row) + +SELECT decode('AQI', 'base64url'); -- \x0102 + decode +-------- + \x0102 +(1 row) + +-- 3 byte input (no padding needed) +SELECT encode('\x010203'::bytea, 'base64url'); -- AQID + encode +-------- + AQID +(1 row) + +SELECT decode('AQID', 'base64url'); -- \x010203 + decode +---------- + \x010203 +(1 row) + +-- 4 byte input (results in 6 base64 chars) +SELECT encode('\xdeadbeef'::bytea, 'base64url'); -- 3q2-7w + encode +-------- + 3q2-7w +(1 row) + +SELECT decode('3q2-7w', 
'base64url'); -- \xdeadbeef + decode +------------ + \xdeadbeef +(1 row) + +-- Round-trip test for all lengths from 0–4 +SELECT encode(decode(encode(E'\\x', 'base64url'), 'base64url'), 'base64url'); + encode +-------- + +(1 row) + +SELECT encode(decode(encode(E'\\x00', 'base64url'), 'base64url'), 'base64url'); + encode +-------- + AA +(1 row) + +SELECT encode(decode(encode(E'\\x0001', 'base64url'), 'base64url'), 'base64url'); + encode +-------- + AAE +(1 row) + +SELECT encode(decode(encode(E'\\x000102', 'base64url'), 'base64url'), 'base64url'); + encode +-------- + AAEC +(1 row) + +SELECT encode(decode(encode(E'\\x00010203', 'base64url'), 'base64url'), 'base64url'); + encode +-------- + AAECAw +(1 row) + +-- Invalid inputs (should ERROR) +-- invalid character '@' +SELECT decode('QQ@=', 'base64url'); +ERROR: invalid symbol "@" found while decoding base64url sequence +-- missing characters (incomplete group) +SELECT decode('QQ', 'base64url'); -- ok (1 byte) + decode +-------- + \x41 +(1 row) + +SELECT decode('QQI', 'base64url'); -- ok (2 bytes) + decode +-------- + \x4102 +(1 row) + +SELECT decode('QQIDQ', 'base64url'); -- ERROR: invalid base64url end sequence +ERROR: invalid base64url end sequence +HINT: Input data is missing padding, is truncated, or is otherwise corrupted. 
+-- unexpected '=' at start +SELECT decode('=QQQ', 'base64url'); +ERROR: unexpected "=" while decoding base64url sequence +-- valid base64 padding in base64url (optional, but accepted) +SELECT decode('abc-_w==', 'base64url'); -- should decode to \x69b73eff + decode +------------ + \x69b73eff +(1 row) + -- -- get_bit/set_bit etc -- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 5ed421d6205..c05f3413699 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -799,6 +799,60 @@ SELECT decode(encode(('\x' || repeat('1234567890abcdef0001', 7))::bytea, SELECT encode('\x1234567890abcdef00', 'escape'); SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape'); +-- +-- base64url encoding/decoding +-- +SET bytea_output TO hex; + +-- Simple encoding/decoding +SELECT encode('\x69b73eff', 'base64url'); -- abc-_w +SELECT decode('abc-_w', 'base64url'); -- \x69b73eff + +-- Round-trip: decode(encode(x)) = x +SELECT decode(encode('\x1234567890abcdef00', 'base64url'), 'base64url'); -- \x1234567890abcdef00 + +-- Empty input +SELECT encode('', 'base64url'); -- '' +SELECT decode('', 'base64url'); -- '' + +-- 1 byte input +SELECT encode('\x01', 'base64url'); -- AQ +SELECT decode('AQ', 'base64url'); -- \x01 + +-- 2 byte input +SELECT encode('\x0102'::bytea, 'base64url'); -- AQI +SELECT decode('AQI', 'base64url'); -- \x0102 + +-- 3 byte input (no padding needed) +SELECT encode('\x010203'::bytea, 'base64url'); -- AQID +SELECT decode('AQID', 'base64url'); -- \x010203 + +-- 4 byte input (results in 6 base64 chars) +SELECT encode('\xdeadbeef'::bytea, 'base64url'); -- 3q2-7w +SELECT decode('3q2-7w', 'base64url'); -- \xdeadbeef + +-- Round-trip test for all lengths from 0–4 +SELECT encode(decode(encode(E'\\x', 'base64url'), 'base64url'), 'base64url'); +SELECT encode(decode(encode(E'\\x00', 'base64url'), 'base64url'), 'base64url'); +SELECT encode(decode(encode(E'\\x0001', 'base64url'), 'base64url'), 'base64url'); 
+SELECT encode(decode(encode(E'\\x000102', 'base64url'), 'base64url'), 'base64url'); +SELECT encode(decode(encode(E'\\x00010203', 'base64url'), 'base64url'), 'base64url'); + +-- Invalid inputs (should ERROR) +-- invalid character '@' +SELECT decode('QQ@=', 'base64url'); + +-- missing characters (incomplete group) +SELECT decode('QQ', 'base64url'); -- ok (1 byte) +SELECT decode('QQI', 'base64url'); -- ok (2 bytes) +SELECT decode('QQIDQ', 'base64url'); -- ERROR: invalid base64url end sequence + +-- unexpected '=' at start +SELECT decode('=QQQ', 'base64url'); + +-- valid base64 padding in base64url (optional, but accepted) +SELECT decode('abc-_w==', 'base64url'); -- should decode to \x69b73eff + -- -- get_bit/set_bit etc --