mirror of
https://github.com/postgres/postgres.git
synced 2025-05-29 16:21:20 +03:00
With GB18030 as source encoding, applications could crash the server via SQL functions convert() or convert_from(). Applications themselves could crash after passing unterminated GB18030 input to libpq functions PQescapeLiteral(), PQescapeIdentifier(), PQescapeStringConn(), or PQescapeString(). Extension code could crash by passing unterminated GB18030 input to jsonapi.h functions. All those functions have been intended to handle untrusted, unterminated input safely. A crash required allocating the input such that the last byte of the allocation was the last byte of a virtual memory page. Some malloc() implementations take measures against that, making the SIGSEGV hard to reach. Back-patch to v13 (all supported versions). Author: Noah Misch <noah@leadboat.com> Author: Andres Freund <andres@anarazel.de> Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com> Backpatch-through: 13 Security: CVE-2025-4207
1845 lines
49 KiB
C
1845 lines
49 KiB
C
/*-------------------------------------------------------------------------
 *
 * mbutils.c
 *	  This file contains functions for encoding conversion.
 *
 * The string-conversion functions in this file share some API quirks.
 * Note the following:
 *
 * The functions return a palloc'd, null-terminated string if conversion
 * is required.  However, if no conversion is performed, the given source
 * string pointer is returned as-is.
 *
 * Although the presence of a length argument means that callers can pass
 * non-null-terminated strings, care is required because the same string
 * will be passed back if no conversion occurs.  Such callers *must* check
 * whether result == src and handle that case differently.
 *
 * If the source and destination encodings are the same, the source string
 * is returned without any verification; it's assumed to be valid data.
 * If that might not be the case, the caller is responsible for validating
 * the string using a separate call to pg_verify_mbstr().  Whenever the
 * source and destination encodings are different, the functions ensure that
 * the result is validly encoded according to the destination encoding.
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/mb/mbutils.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include "access/xact.h"
|
|
#include "catalog/namespace.h"
|
|
#include "mb/pg_wchar.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/syscache.h"
|
|
#include "varatt.h"
|
|
|
|
/*
|
|
* We maintain a simple linked list caching the fmgr lookup info for the
|
|
* currently selected conversion functions, as well as any that have been
|
|
* selected previously in the current session. (We remember previous
|
|
* settings because we must be able to restore a previous setting during
|
|
* transaction rollback, without doing any fresh catalog accesses.)
|
|
*
|
|
* Since we'll never release this data, we just keep it in TopMemoryContext.
|
|
*/
|
|
typedef struct ConvProcInfo
|
|
{
|
|
int s_encoding; /* server and client encoding IDs */
|
|
int c_encoding;
|
|
FmgrInfo to_server_info; /* lookup info for conversion procs */
|
|
FmgrInfo to_client_info;
|
|
} ConvProcInfo;
|
|
|
|
static List *ConvProcList = NIL; /* List of ConvProcInfo */
|
|
|
|
/*
|
|
* These variables point to the currently active conversion functions,
|
|
* or are NULL when no conversion is needed.
|
|
*/
|
|
static FmgrInfo *ToServerConvProc = NULL;
|
|
static FmgrInfo *ToClientConvProc = NULL;
|
|
|
|
/*
|
|
* This variable stores the conversion function to convert from UTF-8
|
|
* to the server encoding. It's NULL if the server encoding *is* UTF-8,
|
|
* or if we lack a conversion function for this.
|
|
*/
|
|
static FmgrInfo *Utf8ToServerConvProc = NULL;
|
|
|
|
/*
|
|
* These variables track the currently-selected encodings.
|
|
*/
|
|
static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
|
|
static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
|
|
static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
|
|
|
|
/*
|
|
* During backend startup we can't set client encoding because we (a)
|
|
* can't look up the conversion functions, and (b) may not know the database
|
|
* encoding yet either. So SetClientEncoding() just accepts anything and
|
|
* remembers it for InitializeClientEncoding() to apply later.
|
|
*/
|
|
static bool backend_startup_complete = false;
|
|
static int pending_client_encoding = PG_SQL_ASCII;
|
|
|
|
|
|
/* Internal functions */
|
|
static char *perform_default_encoding_conversion(const char *src,
|
|
int len, bool is_client_to_server);
|
|
static int cliplen(const char *str, int len, int limit);
|
|
|
|
|
|
/*
|
|
* Prepare for a future call to SetClientEncoding. Success should mean
|
|
* that SetClientEncoding is guaranteed to succeed for this encoding request.
|
|
*
|
|
* (But note that success before backend_startup_complete does not guarantee
|
|
* success after ...)
|
|
*
|
|
* Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
|
|
*/
|
|
int
|
|
PrepareClientEncoding(int encoding)
|
|
{
|
|
int current_server_encoding;
|
|
ListCell *lc;
|
|
|
|
if (!PG_VALID_FE_ENCODING(encoding))
|
|
return -1;
|
|
|
|
/* Can't do anything during startup, per notes above */
|
|
if (!backend_startup_complete)
|
|
return 0;
|
|
|
|
current_server_encoding = GetDatabaseEncoding();
|
|
|
|
/*
|
|
* Check for cases that require no conversion function.
|
|
*/
|
|
if (current_server_encoding == encoding ||
|
|
current_server_encoding == PG_SQL_ASCII ||
|
|
encoding == PG_SQL_ASCII)
|
|
return 0;
|
|
|
|
if (IsTransactionState())
|
|
{
|
|
/*
|
|
* If we're in a live transaction, it's safe to access the catalogs,
|
|
* so look up the functions. We repeat the lookup even if the info is
|
|
* already cached, so that we can react to changes in the contents of
|
|
* pg_conversion.
|
|
*/
|
|
Oid to_server_proc,
|
|
to_client_proc;
|
|
ConvProcInfo *convinfo;
|
|
MemoryContext oldcontext;
|
|
|
|
to_server_proc = FindDefaultConversionProc(encoding,
|
|
current_server_encoding);
|
|
if (!OidIsValid(to_server_proc))
|
|
return -1;
|
|
to_client_proc = FindDefaultConversionProc(current_server_encoding,
|
|
encoding);
|
|
if (!OidIsValid(to_client_proc))
|
|
return -1;
|
|
|
|
/*
|
|
* Load the fmgr info into TopMemoryContext (could still fail here)
|
|
*/
|
|
convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
|
|
sizeof(ConvProcInfo));
|
|
convinfo->s_encoding = current_server_encoding;
|
|
convinfo->c_encoding = encoding;
|
|
fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
|
|
TopMemoryContext);
|
|
fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
|
|
TopMemoryContext);
|
|
|
|
/* Attach new info to head of list */
|
|
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
|
|
ConvProcList = lcons(convinfo, ConvProcList);
|
|
MemoryContextSwitchTo(oldcontext);
|
|
|
|
/*
|
|
* We cannot yet remove any older entry for the same encoding pair,
|
|
* since it could still be in use. SetClientEncoding will clean up.
|
|
*/
|
|
|
|
return 0; /* success */
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* If we're not in a live transaction, the only thing we can do is
|
|
* restore a previous setting using the cache. This covers all
|
|
* transaction-rollback cases. The only case it might not work for is
|
|
* trying to change client_encoding on the fly by editing
|
|
* postgresql.conf and SIGHUP'ing. Which would probably be a stupid
|
|
* thing to do anyway.
|
|
*/
|
|
foreach(lc, ConvProcList)
|
|
{
|
|
ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
|
|
|
|
if (oldinfo->s_encoding == current_server_encoding &&
|
|
oldinfo->c_encoding == encoding)
|
|
return 0;
|
|
}
|
|
|
|
return -1; /* it's not cached, so fail */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Set the active client encoding and set up the conversion-function pointers.
|
|
* PrepareClientEncoding should have been called previously for this encoding.
|
|
*
|
|
* Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
|
|
*/
|
|
int
|
|
SetClientEncoding(int encoding)
|
|
{
|
|
int current_server_encoding;
|
|
bool found;
|
|
ListCell *lc;
|
|
|
|
if (!PG_VALID_FE_ENCODING(encoding))
|
|
return -1;
|
|
|
|
/* Can't do anything during startup, per notes above */
|
|
if (!backend_startup_complete)
|
|
{
|
|
pending_client_encoding = encoding;
|
|
return 0;
|
|
}
|
|
|
|
current_server_encoding = GetDatabaseEncoding();
|
|
|
|
/*
|
|
* Check for cases that require no conversion function.
|
|
*/
|
|
if (current_server_encoding == encoding ||
|
|
current_server_encoding == PG_SQL_ASCII ||
|
|
encoding == PG_SQL_ASCII)
|
|
{
|
|
ClientEncoding = &pg_enc2name_tbl[encoding];
|
|
ToServerConvProc = NULL;
|
|
ToClientConvProc = NULL;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Search the cache for the entry previously prepared by
|
|
* PrepareClientEncoding; if there isn't one, we lose. While at it,
|
|
* release any duplicate entries so that repeated Prepare/Set cycles don't
|
|
* leak memory.
|
|
*/
|
|
found = false;
|
|
foreach(lc, ConvProcList)
|
|
{
|
|
ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
|
|
|
|
if (convinfo->s_encoding == current_server_encoding &&
|
|
convinfo->c_encoding == encoding)
|
|
{
|
|
if (!found)
|
|
{
|
|
/* Found newest entry, so set up */
|
|
ClientEncoding = &pg_enc2name_tbl[encoding];
|
|
ToServerConvProc = &convinfo->to_server_info;
|
|
ToClientConvProc = &convinfo->to_client_info;
|
|
found = true;
|
|
}
|
|
else
|
|
{
|
|
/* Duplicate entry, release it */
|
|
ConvProcList = foreach_delete_current(ConvProcList, lc);
|
|
pfree(convinfo);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (found)
|
|
return 0; /* success */
|
|
else
|
|
return -1; /* it's not cached, so fail */
|
|
}
|
|
|
|
/*
|
|
* Initialize client encoding conversions.
|
|
* Called from InitPostgres() once during backend startup.
|
|
*/
|
|
void
|
|
InitializeClientEncoding(void)
|
|
{
|
|
int current_server_encoding;
|
|
|
|
Assert(!backend_startup_complete);
|
|
backend_startup_complete = true;
|
|
|
|
if (PrepareClientEncoding(pending_client_encoding) < 0 ||
|
|
SetClientEncoding(pending_client_encoding) < 0)
|
|
{
|
|
/*
|
|
* Oops, the requested conversion is not available. We couldn't fail
|
|
* before, but we can now.
|
|
*/
|
|
ereport(FATAL,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("conversion between %s and %s is not supported",
|
|
pg_enc2name_tbl[pending_client_encoding].name,
|
|
GetDatabaseEncodingName())));
|
|
}
|
|
|
|
/*
|
|
* Also look up the UTF8-to-server conversion function if needed. Since
|
|
* the server encoding is fixed within any one backend process, we don't
|
|
* have to do this more than once.
|
|
*/
|
|
current_server_encoding = GetDatabaseEncoding();
|
|
if (current_server_encoding != PG_UTF8 &&
|
|
current_server_encoding != PG_SQL_ASCII)
|
|
{
|
|
Oid utf8_to_server_proc;
|
|
|
|
Assert(IsTransactionState());
|
|
utf8_to_server_proc =
|
|
FindDefaultConversionProc(PG_UTF8,
|
|
current_server_encoding);
|
|
/* If there's no such conversion, just leave the pointer as NULL */
|
|
if (OidIsValid(utf8_to_server_proc))
|
|
{
|
|
FmgrInfo *finfo;
|
|
|
|
finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
|
|
sizeof(FmgrInfo));
|
|
fmgr_info_cxt(utf8_to_server_proc, finfo,
|
|
TopMemoryContext);
|
|
/* Set Utf8ToServerConvProc only after data is fully valid */
|
|
Utf8ToServerConvProc = finfo;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* returns the current client encoding
|
|
*/
|
|
int
|
|
pg_get_client_encoding(void)
|
|
{
|
|
return ClientEncoding->encoding;
|
|
}
|
|
|
|
/*
|
|
* returns the current client encoding name
|
|
*/
|
|
const char *
|
|
pg_get_client_encoding_name(void)
|
|
{
|
|
return ClientEncoding->name;
|
|
}
|
|
|
|
/*
|
|
* Convert src string to another encoding (general case).
|
|
*
|
|
* See the notes about string conversion functions at the top of this file.
|
|
*/
|
|
unsigned char *
|
|
pg_do_encoding_conversion(unsigned char *src, int len,
|
|
int src_encoding, int dest_encoding)
|
|
{
|
|
unsigned char *result;
|
|
Oid proc;
|
|
|
|
if (len <= 0)
|
|
return src; /* empty string is always valid */
|
|
|
|
if (src_encoding == dest_encoding)
|
|
return src; /* no conversion required, assume valid */
|
|
|
|
if (dest_encoding == PG_SQL_ASCII)
|
|
return src; /* any string is valid in SQL_ASCII */
|
|
|
|
if (src_encoding == PG_SQL_ASCII)
|
|
{
|
|
/* No conversion is possible, but we must validate the result */
|
|
(void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
|
|
return src;
|
|
}
|
|
|
|
if (!IsTransactionState()) /* shouldn't happen */
|
|
elog(ERROR, "cannot perform encoding conversion outside a transaction");
|
|
|
|
proc = FindDefaultConversionProc(src_encoding, dest_encoding);
|
|
if (!OidIsValid(proc))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_FUNCTION),
|
|
errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
|
|
pg_encoding_to_char(src_encoding),
|
|
pg_encoding_to_char(dest_encoding))));
|
|
|
|
/*
|
|
* Allocate space for conversion result, being wary of integer overflow.
|
|
*
|
|
* len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
|
|
* required space, so it might exceed MaxAllocSize even though the result
|
|
* would actually fit. We do not want to hand back a result string that
|
|
* exceeds MaxAllocSize, because callers might not cope gracefully --- but
|
|
* if we just allocate more than that, and don't use it, that's fine.
|
|
*/
|
|
if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("out of memory"),
|
|
errdetail("String of %d bytes is too long for encoding conversion.",
|
|
len)));
|
|
|
|
result = (unsigned char *)
|
|
MemoryContextAllocHuge(CurrentMemoryContext,
|
|
(Size) len * MAX_CONVERSION_GROWTH + 1);
|
|
|
|
(void) OidFunctionCall6(proc,
|
|
Int32GetDatum(src_encoding),
|
|
Int32GetDatum(dest_encoding),
|
|
CStringGetDatum((char *) src),
|
|
CStringGetDatum((char *) result),
|
|
Int32GetDatum(len),
|
|
BoolGetDatum(false));
|
|
|
|
/*
|
|
* If the result is large, it's worth repalloc'ing to release any extra
|
|
* space we asked for. The cutoff here is somewhat arbitrary, but we
|
|
* *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
|
|
*/
|
|
if (len > 1000000)
|
|
{
|
|
Size resultlen = strlen((char *) result);
|
|
|
|
if (resultlen >= MaxAllocSize)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("out of memory"),
|
|
errdetail("String of %d bytes is too long for encoding conversion.",
|
|
len)));
|
|
|
|
result = (unsigned char *) repalloc(result, resultlen + 1);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Convert src string to another encoding.
|
|
*
|
|
* This function has a different API than the other conversion functions.
|
|
* The caller should've looked up the conversion function using
|
|
* FindDefaultConversionProc(). Unlike the other functions, the converted
|
|
* result is not palloc'd. It is written to the caller-supplied buffer
|
|
* instead.
|
|
*
|
|
* src_encoding - encoding to convert from
|
|
* dest_encoding - encoding to convert to
|
|
* src, srclen - input buffer and its length in bytes
|
|
* dest, destlen - destination buffer and its size in bytes
|
|
*
|
|
* The output is null-terminated.
|
|
*
|
|
* If destlen < srclen * MAX_CONVERSION_INPUT_LENGTH + 1, the converted output
|
|
* wouldn't necessarily fit in the output buffer, and the function will not
|
|
* convert the whole input.
|
|
*
|
|
* TODO: The conversion function interface is not great. Firstly, it
|
|
* would be nice to pass through the destination buffer size to the
|
|
* conversion function, so that if you pass a shorter destination buffer, it
|
|
* could still continue to fill up the whole buffer. Currently, we have to
|
|
* assume worst case expansion and stop the conversion short, even if there
|
|
* is in fact space left in the destination buffer. Secondly, it would be
|
|
* nice to return the number of bytes written to the caller, to avoid a call
|
|
* to strlen().
|
|
*/
|
|
int
|
|
pg_do_encoding_conversion_buf(Oid proc,
|
|
int src_encoding,
|
|
int dest_encoding,
|
|
unsigned char *src, int srclen,
|
|
unsigned char *dest, int destlen,
|
|
bool noError)
|
|
{
|
|
Datum result;
|
|
|
|
/*
|
|
* If the destination buffer is not large enough to hold the result in the
|
|
* worst case, limit the input size passed to the conversion function.
|
|
*/
|
|
if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
|
|
srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
|
|
|
|
result = OidFunctionCall6(proc,
|
|
Int32GetDatum(src_encoding),
|
|
Int32GetDatum(dest_encoding),
|
|
CStringGetDatum((char *) src),
|
|
CStringGetDatum((char *) dest),
|
|
Int32GetDatum(srclen),
|
|
BoolGetDatum(noError));
|
|
return DatumGetInt32(result);
|
|
}
|
|
|
|
/*
|
|
* Convert string to encoding encoding_name. The source
|
|
* encoding is the DB encoding.
|
|
*
|
|
* BYTEA convert_to(TEXT string, NAME encoding_name) */
|
|
Datum
|
|
pg_convert_to(PG_FUNCTION_ARGS)
|
|
{
|
|
Datum string = PG_GETARG_DATUM(0);
|
|
Datum dest_encoding_name = PG_GETARG_DATUM(1);
|
|
Datum src_encoding_name = DirectFunctionCall1(namein,
|
|
CStringGetDatum(DatabaseEncoding->name));
|
|
Datum result;
|
|
|
|
/*
|
|
* pg_convert expects a bytea as its first argument. We're passing it a
|
|
* text argument here, relying on the fact that they are both in fact
|
|
* varlena types, and thus structurally identical.
|
|
*/
|
|
result = DirectFunctionCall3(pg_convert, string,
|
|
src_encoding_name, dest_encoding_name);
|
|
|
|
PG_RETURN_DATUM(result);
|
|
}
|
|
|
|
/*
|
|
* Convert string from encoding encoding_name. The destination
|
|
* encoding is the DB encoding.
|
|
*
|
|
* TEXT convert_from(BYTEA string, NAME encoding_name) */
|
|
Datum
|
|
pg_convert_from(PG_FUNCTION_ARGS)
|
|
{
|
|
Datum string = PG_GETARG_DATUM(0);
|
|
Datum src_encoding_name = PG_GETARG_DATUM(1);
|
|
Datum dest_encoding_name = DirectFunctionCall1(namein,
|
|
CStringGetDatum(DatabaseEncoding->name));
|
|
Datum result;
|
|
|
|
result = DirectFunctionCall3(pg_convert, string,
|
|
src_encoding_name, dest_encoding_name);
|
|
|
|
/*
|
|
* pg_convert returns a bytea, which we in turn return as text, relying on
|
|
* the fact that they are both in fact varlena types, and thus
|
|
* structurally identical. Although not all bytea values are valid text,
|
|
* in this case it will be because we've told pg_convert to return one
|
|
* that is valid as text in the current database encoding.
|
|
*/
|
|
PG_RETURN_DATUM(result);
|
|
}
|
|
|
|
/*
|
|
* Convert string between two arbitrary encodings.
|
|
*
|
|
* BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
|
|
*/
|
|
Datum
|
|
pg_convert(PG_FUNCTION_ARGS)
|
|
{
|
|
bytea *string = PG_GETARG_BYTEA_PP(0);
|
|
char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
|
|
int src_encoding = pg_char_to_encoding(src_encoding_name);
|
|
char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
|
|
int dest_encoding = pg_char_to_encoding(dest_encoding_name);
|
|
const char *src_str;
|
|
char *dest_str;
|
|
bytea *retval;
|
|
int len;
|
|
|
|
if (src_encoding < 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("invalid source encoding name \"%s\"",
|
|
src_encoding_name)));
|
|
if (dest_encoding < 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("invalid destination encoding name \"%s\"",
|
|
dest_encoding_name)));
|
|
|
|
/* make sure that source string is valid */
|
|
len = VARSIZE_ANY_EXHDR(string);
|
|
src_str = VARDATA_ANY(string);
|
|
(void) pg_verify_mbstr(src_encoding, src_str, len, false);
|
|
|
|
/* perform conversion */
|
|
dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
|
|
len,
|
|
src_encoding,
|
|
dest_encoding);
|
|
|
|
/* update len if conversion actually happened */
|
|
if (dest_str != src_str)
|
|
len = strlen(dest_str);
|
|
|
|
/*
|
|
* build bytea data type structure.
|
|
*/
|
|
retval = (bytea *) palloc(len + VARHDRSZ);
|
|
SET_VARSIZE(retval, len + VARHDRSZ);
|
|
memcpy(VARDATA(retval), dest_str, len);
|
|
|
|
if (dest_str != src_str)
|
|
pfree(dest_str);
|
|
|
|
/* free memory if allocated by the toaster */
|
|
PG_FREE_IF_COPY(string, 0);
|
|
|
|
PG_RETURN_BYTEA_P(retval);
|
|
}
|
|
|
|
/*
|
|
* get the length of the string considered as text in the specified
|
|
* encoding. Raises an error if the data is not valid in that
|
|
* encoding.
|
|
*
|
|
* INT4 length (BYTEA string, NAME src_encoding_name)
|
|
*/
|
|
Datum
|
|
length_in_encoding(PG_FUNCTION_ARGS)
|
|
{
|
|
bytea *string = PG_GETARG_BYTEA_PP(0);
|
|
char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
|
|
int src_encoding = pg_char_to_encoding(src_encoding_name);
|
|
const char *src_str;
|
|
int len;
|
|
int retval;
|
|
|
|
if (src_encoding < 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("invalid encoding name \"%s\"",
|
|
src_encoding_name)));
|
|
|
|
len = VARSIZE_ANY_EXHDR(string);
|
|
src_str = VARDATA_ANY(string);
|
|
|
|
retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
|
|
|
|
PG_RETURN_INT32(retval);
|
|
}
|
|
|
|
/*
|
|
* Get maximum multibyte character length in the specified encoding.
|
|
*
|
|
* Note encoding is specified numerically, not by name as above.
|
|
*/
|
|
Datum
|
|
pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
|
|
{
|
|
int encoding = PG_GETARG_INT32(0);
|
|
|
|
if (PG_VALID_ENCODING(encoding))
|
|
PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
|
|
else
|
|
PG_RETURN_NULL();
|
|
}
|
|
|
|
/*
|
|
* Convert client encoding to server encoding.
|
|
*
|
|
* See the notes about string conversion functions at the top of this file.
|
|
*/
|
|
char *
|
|
pg_client_to_server(const char *s, int len)
|
|
{
|
|
return pg_any_to_server(s, len, ClientEncoding->encoding);
|
|
}
|
|
|
|
/*
|
|
* Convert any encoding to server encoding.
|
|
*
|
|
* See the notes about string conversion functions at the top of this file.
|
|
*
|
|
* Unlike the other string conversion functions, this will apply validation
|
|
* even if encoding == DatabaseEncoding->encoding. This is because this is
|
|
* used to process data coming in from outside the database, and we never
|
|
* want to just assume validity.
|
|
*/
|
|
char *
|
|
pg_any_to_server(const char *s, int len, int encoding)
|
|
{
|
|
if (len <= 0)
|
|
return unconstify(char *, s); /* empty string is always valid */
|
|
|
|
if (encoding == DatabaseEncoding->encoding ||
|
|
encoding == PG_SQL_ASCII)
|
|
{
|
|
/*
|
|
* No conversion is needed, but we must still validate the data.
|
|
*/
|
|
(void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
|
|
return unconstify(char *, s);
|
|
}
|
|
|
|
if (DatabaseEncoding->encoding == PG_SQL_ASCII)
|
|
{
|
|
/*
|
|
* No conversion is possible, but we must still validate the data,
|
|
* because the client-side code might have done string escaping using
|
|
* the selected client_encoding. If the client encoding is ASCII-safe
|
|
* then we just do a straight validation under that encoding. For an
|
|
* ASCII-unsafe encoding we have a problem: we dare not pass such data
|
|
* to the parser but we have no way to convert it. We compromise by
|
|
* rejecting the data if it contains any non-ASCII characters.
|
|
*/
|
|
if (PG_VALID_BE_ENCODING(encoding))
|
|
(void) pg_verify_mbstr(encoding, s, len, false);
|
|
else
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
|
errmsg("invalid byte value for encoding \"%s\": 0x%02x",
|
|
pg_enc2name_tbl[PG_SQL_ASCII].name,
|
|
(unsigned char) s[i])));
|
|
}
|
|
}
|
|
return unconstify(char *, s);
|
|
}
|
|
|
|
/* Fast path if we can use cached conversion function */
|
|
if (encoding == ClientEncoding->encoding)
|
|
return perform_default_encoding_conversion(s, len, true);
|
|
|
|
/* General case ... will not work outside transactions */
|
|
return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
|
|
len,
|
|
encoding,
|
|
DatabaseEncoding->encoding);
|
|
}
|
|
|
|
/*
|
|
* Convert server encoding to client encoding.
|
|
*
|
|
* See the notes about string conversion functions at the top of this file.
|
|
*/
|
|
char *
|
|
pg_server_to_client(const char *s, int len)
|
|
{
|
|
return pg_server_to_any(s, len, ClientEncoding->encoding);
|
|
}
|
|
|
|
/*
|
|
* Convert server encoding to any encoding.
|
|
*
|
|
* See the notes about string conversion functions at the top of this file.
|
|
*/
|
|
char *
|
|
pg_server_to_any(const char *s, int len, int encoding)
|
|
{
|
|
if (len <= 0)
|
|
return unconstify(char *, s); /* empty string is always valid */
|
|
|
|
if (encoding == DatabaseEncoding->encoding ||
|
|
encoding == PG_SQL_ASCII)
|
|
return unconstify(char *, s); /* assume data is valid */
|
|
|
|
if (DatabaseEncoding->encoding == PG_SQL_ASCII)
|
|
{
|
|
/* No conversion is possible, but we must validate the result */
|
|
(void) pg_verify_mbstr(encoding, s, len, false);
|
|
return unconstify(char *, s);
|
|
}
|
|
|
|
/* Fast path if we can use cached conversion function */
|
|
if (encoding == ClientEncoding->encoding)
|
|
return perform_default_encoding_conversion(s, len, false);
|
|
|
|
/* General case ... will not work outside transactions */
|
|
return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
|
|
len,
|
|
DatabaseEncoding->encoding,
|
|
encoding);
|
|
}
|
|
|
|
/*
|
|
* Perform default encoding conversion using cached FmgrInfo. Since
|
|
* this function does not access database at all, it is safe to call
|
|
* outside transactions. If the conversion has not been set up by
|
|
* SetClientEncoding(), no conversion is performed.
|
|
*/
|
|
static char *
|
|
perform_default_encoding_conversion(const char *src, int len,
|
|
bool is_client_to_server)
|
|
{
|
|
char *result;
|
|
int src_encoding,
|
|
dest_encoding;
|
|
FmgrInfo *flinfo;
|
|
|
|
if (is_client_to_server)
|
|
{
|
|
src_encoding = ClientEncoding->encoding;
|
|
dest_encoding = DatabaseEncoding->encoding;
|
|
flinfo = ToServerConvProc;
|
|
}
|
|
else
|
|
{
|
|
src_encoding = DatabaseEncoding->encoding;
|
|
dest_encoding = ClientEncoding->encoding;
|
|
flinfo = ToClientConvProc;
|
|
}
|
|
|
|
if (flinfo == NULL)
|
|
return unconstify(char *, src);
|
|
|
|
/*
|
|
* Allocate space for conversion result, being wary of integer overflow.
|
|
* See comments in pg_do_encoding_conversion.
|
|
*/
|
|
if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("out of memory"),
|
|
errdetail("String of %d bytes is too long for encoding conversion.",
|
|
len)));
|
|
|
|
result = (char *)
|
|
MemoryContextAllocHuge(CurrentMemoryContext,
|
|
(Size) len * MAX_CONVERSION_GROWTH + 1);
|
|
|
|
FunctionCall6(flinfo,
|
|
Int32GetDatum(src_encoding),
|
|
Int32GetDatum(dest_encoding),
|
|
CStringGetDatum(src),
|
|
CStringGetDatum(result),
|
|
Int32GetDatum(len),
|
|
BoolGetDatum(false));
|
|
|
|
/*
|
|
* Release extra space if there might be a lot --- see comments in
|
|
* pg_do_encoding_conversion.
|
|
*/
|
|
if (len > 1000000)
|
|
{
|
|
Size resultlen = strlen(result);
|
|
|
|
if (resultlen >= MaxAllocSize)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
|
errmsg("out of memory"),
|
|
errdetail("String of %d bytes is too long for encoding conversion.",
|
|
len)));
|
|
|
|
result = (char *) repalloc(result, resultlen + 1);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Convert a single Unicode code point into a string in the server encoding.
|
|
*
|
|
* The code point given by "c" is converted and stored at *s, which must
|
|
* have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
|
|
* The output will have a trailing '\0'. Throws error if the conversion
|
|
* cannot be performed.
|
|
*
|
|
* Note that this relies on having previously looked up any required
|
|
* conversion function. That's partly for speed but mostly because the parser
|
|
* may call this outside any transaction, or in an aborted transaction.
|
|
*/
|
|
void
|
|
pg_unicode_to_server(pg_wchar c, unsigned char *s)
|
|
{
|
|
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
|
|
int c_as_utf8_len;
|
|
int server_encoding;
|
|
|
|
/*
|
|
* Complain if invalid Unicode code point. The choice of errcode here is
|
|
* debatable, but really our caller should have checked this anyway.
|
|
*/
|
|
if (!is_valid_unicode_codepoint(c))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_SYNTAX_ERROR),
|
|
errmsg("invalid Unicode code point")));
|
|
|
|
/* Otherwise, if it's in ASCII range, conversion is trivial */
|
|
if (c <= 0x7F)
|
|
{
|
|
s[0] = (unsigned char) c;
|
|
s[1] = '\0';
|
|
return;
|
|
}
|
|
|
|
/* If the server encoding is UTF-8, we just need to reformat the code */
|
|
server_encoding = GetDatabaseEncoding();
|
|
if (server_encoding == PG_UTF8)
|
|
{
|
|
unicode_to_utf8(c, s);
|
|
s[pg_utf_mblen(s)] = '\0';
|
|
return;
|
|
}
|
|
|
|
/* For all other cases, we must have a conversion function available */
|
|
if (Utf8ToServerConvProc == NULL)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("conversion between %s and %s is not supported",
|
|
pg_enc2name_tbl[PG_UTF8].name,
|
|
GetDatabaseEncodingName())));
|
|
|
|
/* Construct UTF-8 source string */
|
|
unicode_to_utf8(c, c_as_utf8);
|
|
c_as_utf8_len = pg_utf_mblen(c_as_utf8);
|
|
c_as_utf8[c_as_utf8_len] = '\0';
|
|
|
|
/* Convert, or throw error if we can't */
|
|
FunctionCall6(Utf8ToServerConvProc,
|
|
Int32GetDatum(PG_UTF8),
|
|
Int32GetDatum(server_encoding),
|
|
CStringGetDatum((char *) c_as_utf8),
|
|
CStringGetDatum((char *) s),
|
|
Int32GetDatum(c_as_utf8_len),
|
|
BoolGetDatum(false));
|
|
}
|
|
|
|
/*
|
|
* Convert a single Unicode code point into a string in the server encoding.
|
|
*
|
|
* Same as pg_unicode_to_server(), except that we don't throw errors,
|
|
* but simply return false on conversion failure.
|
|
*/
|
|
bool
|
|
pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
|
|
{
|
|
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
|
|
int c_as_utf8_len;
|
|
int converted_len;
|
|
int server_encoding;
|
|
|
|
/* Fail if invalid Unicode code point */
|
|
if (!is_valid_unicode_codepoint(c))
|
|
return false;
|
|
|
|
/* Otherwise, if it's in ASCII range, conversion is trivial */
|
|
if (c <= 0x7F)
|
|
{
|
|
s[0] = (unsigned char) c;
|
|
s[1] = '\0';
|
|
return true;
|
|
}
|
|
|
|
/* If the server encoding is UTF-8, we just need to reformat the code */
|
|
server_encoding = GetDatabaseEncoding();
|
|
if (server_encoding == PG_UTF8)
|
|
{
|
|
unicode_to_utf8(c, s);
|
|
s[pg_utf_mblen(s)] = '\0';
|
|
return true;
|
|
}
|
|
|
|
/* For all other cases, we must have a conversion function available */
|
|
if (Utf8ToServerConvProc == NULL)
|
|
return false;
|
|
|
|
/* Construct UTF-8 source string */
|
|
unicode_to_utf8(c, c_as_utf8);
|
|
c_as_utf8_len = pg_utf_mblen(c_as_utf8);
|
|
c_as_utf8[c_as_utf8_len] = '\0';
|
|
|
|
/* Convert, but without throwing error if we can't */
|
|
converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
|
|
Int32GetDatum(PG_UTF8),
|
|
Int32GetDatum(server_encoding),
|
|
CStringGetDatum((char *) c_as_utf8),
|
|
CStringGetDatum((char *) s),
|
|
Int32GetDatum(c_as_utf8_len),
|
|
BoolGetDatum(true)));
|
|
|
|
/* Conversion was successful iff it consumed the whole input */
|
|
return (converted_len == c_as_utf8_len);
|
|
}
|
|
|
|
|
|
/* convert a multibyte string to a wchar */
|
|
int
|
|
pg_mb2wchar(const char *from, pg_wchar *to)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
|
|
}
|
|
|
|
/* convert a multibyte string to a wchar with a limited length */
|
|
int
|
|
pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
|
|
}
|
|
|
|
/* same, with any encoding */
|
|
int
|
|
pg_encoding_mb2wchar_with_len(int encoding,
|
|
const char *from, pg_wchar *to, int len)
|
|
{
|
|
return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
|
|
}
|
|
|
|
/* convert a wchar string to a multibyte */
|
|
int
|
|
pg_wchar2mb(const pg_wchar *from, char *to)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
|
|
}
|
|
|
|
/* convert a wchar string to a multibyte with a limited length */
|
|
int
|
|
pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
|
|
}
|
|
|
|
/* same, with any encoding */
|
|
int
|
|
pg_encoding_wchar2mb_with_len(int encoding,
|
|
const pg_wchar *from, char *to, int len)
|
|
{
|
|
return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
|
|
}
|
|
|
|
/* returns the byte length of a multibyte character */
|
|
int
|
|
pg_mblen(const char *mbstr)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
|
|
}
|
|
|
|
/* returns the display length of a multibyte character */
|
|
int
|
|
pg_dsplen(const char *mbstr)
|
|
{
|
|
return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
|
|
}
|
|
|
|
/*
 * Count the characters (not bytes) in a NUL-terminated multibyte string in
 * the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide. */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise hop from character to character until the terminator. */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
|
|
|
|
/*
 * Count the characters (not bytes) in a multibyte string that need not be
 * NUL-terminated; "limit" is the number of bytes available.
 *
 * Counting stops when the byte budget is used up or a NUL byte is seen.
 * For single-byte encodings the answer is simply "limit".
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide. */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			chlen = pg_mblen(mbstr);

		mbstr += chlen;
		limit -= chlen;
	}

	return nchars;
}
|
|
|
|
/*
|
|
* returns the byte length of a multibyte string
|
|
* (not necessarily NULL terminated)
|
|
* that is no longer than limit.
|
|
* this function does not break multibyte character boundary.
|
|
*/
|
|
int
|
|
pg_mbcliplen(const char *mbstr, int len, int limit)
|
|
{
|
|
return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
|
|
len, limit);
|
|
}
|
|
|
|
/*
|
|
* pg_mbcliplen with specified encoding; string must be valid in encoding
|
|
*/
|
|
int
|
|
pg_encoding_mbcliplen(int encoding, const char *mbstr,
|
|
int len, int limit)
|
|
{
|
|
mblen_converter mblen_fn;
|
|
int clen = 0;
|
|
int l;
|
|
|
|
/* optimization for single byte encoding */
|
|
if (pg_encoding_max_length(encoding) == 1)
|
|
return cliplen(mbstr, len, limit);
|
|
|
|
mblen_fn = pg_wchar_table[encoding].mblen;
|
|
|
|
while (len > 0 && *mbstr)
|
|
{
|
|
l = (*mblen_fn) ((const unsigned char *) mbstr);
|
|
if ((clen + l) > limit)
|
|
break;
|
|
clen += l;
|
|
if (clen == limit)
|
|
break;
|
|
len -= l;
|
|
mbstr += l;
|
|
}
|
|
return clen;
|
|
}
|
|
|
|
/*
 * Like pg_mbcliplen(), except that "limit" counts characters rather than
 * bytes.  Returns the byte length of the first "limit" characters (or fewer,
 * if "len" bytes or a NUL byte are reached first).
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			kept_bytes = 0;
	int			kept_chars = 0;

	/* In a single-byte encoding, characters and bytes coincide. */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr != '\0')
	{
		int			chlen = pg_mblen(mbstr);

		/* Already holding "limit" characters: do not take another. */
		if (kept_chars >= limit)
			break;
		kept_chars++;

		kept_bytes += chlen;
		mbstr += chlen;
		len -= chlen;
	}

	return kept_bytes;
}
|
|
|
|
/*
 * mbcliplen for any single-byte encoding: the number of leading bytes of
 * "str" that precede the first NUL, capped at the smaller of len and limit.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound && str[n] != '\0'; n++)
		;

	return n;
}
|
|
|
|
void
|
|
SetDatabaseEncoding(int encoding)
|
|
{
|
|
if (!PG_VALID_BE_ENCODING(encoding))
|
|
elog(ERROR, "invalid database encoding: %d", encoding);
|
|
|
|
DatabaseEncoding = &pg_enc2name_tbl[encoding];
|
|
Assert(DatabaseEncoding->encoding == encoding);
|
|
}
|
|
|
|
void
|
|
SetMessageEncoding(int encoding)
|
|
{
|
|
/* Some calls happen before we can elog()! */
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
MessageEncoding = &pg_enc2name_tbl[encoding];
|
|
Assert(MessageEncoding->encoding == encoding);
|
|
}
|
|
|
|
#ifdef ENABLE_NLS
|
|
/*
|
|
* Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
|
|
* codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
|
|
* fail for gettext-internal causes like out-of-memory.
|
|
*/
|
|
static bool
|
|
raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
|
|
{
|
|
bool elog_ok = (CurrentMemoryContext != NULL);
|
|
int i;
|
|
|
|
for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
|
|
{
|
|
if (pg_enc2gettext_tbl[i].encoding == encoding)
|
|
{
|
|
if (bind_textdomain_codeset(domainname,
|
|
pg_enc2gettext_tbl[i].name) != NULL)
|
|
return true;
|
|
|
|
if (elog_ok)
|
|
elog(LOG, "bind_textdomain_codeset failed");
|
|
else
|
|
write_stderr("bind_textdomain_codeset failed");
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Bind a gettext message domain to the codeset corresponding to the database
|
|
* encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
|
|
* Return the MessageEncoding implied by the new settings.
|
|
*
|
|
* On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
|
|
* When that matches the database encoding, we don't need to do anything. In
|
|
* CREATE DATABASE, we enforce or trust that the locale's codeset matches the
|
|
* database encoding, except for the C locale. (On Windows, we also permit a
|
|
* discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
|
|
* gettext to the right codeset.
|
|
*
|
|
* On Windows, gettext defaults to the Windows ANSI code page. This is a
|
|
* convenient departure for software that passes the strings to Windows ANSI
|
|
* APIs, but we don't do that. Compel gettext to use database encoding or,
|
|
* failing that, the LC_CTYPE encoding as it would on other platforms.
|
|
*
|
|
* This function is called before elog() and palloc() are usable.
|
|
*/
|
|
int
|
|
pg_bind_textdomain_codeset(const char *domainname)
|
|
{
|
|
bool elog_ok = (CurrentMemoryContext != NULL);
|
|
int encoding = GetDatabaseEncoding();
|
|
int new_msgenc;
|
|
|
|
#ifndef WIN32
|
|
const char *ctype = setlocale(LC_CTYPE, NULL);
|
|
|
|
if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
|
|
#endif
|
|
if (encoding != PG_SQL_ASCII &&
|
|
raw_pg_bind_textdomain_codeset(domainname, encoding))
|
|
return encoding;
|
|
|
|
new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
|
|
if (new_msgenc < 0)
|
|
new_msgenc = PG_SQL_ASCII;
|
|
|
|
#ifdef WIN32
|
|
if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
|
|
/* On failure, the old message encoding remains valid. */
|
|
return GetMessageEncoding();
|
|
#endif
|
|
|
|
return new_msgenc;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* The database encoding, also called the server encoding, represents the
|
|
* encoding of data stored in text-like data types. Affected types include
|
|
* cstring, text, varchar, name, xml, and json.
|
|
*/
|
|
int
|
|
GetDatabaseEncoding(void)
|
|
{
|
|
return DatabaseEncoding->encoding;
|
|
}
|
|
|
|
const char *
|
|
GetDatabaseEncodingName(void)
|
|
{
|
|
return DatabaseEncoding->name;
|
|
}
|
|
|
|
Datum
|
|
getdatabaseencoding(PG_FUNCTION_ARGS)
|
|
{
|
|
return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
|
|
}
|
|
|
|
Datum
|
|
pg_client_encoding(PG_FUNCTION_ARGS)
|
|
{
|
|
return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
|
|
}
|
|
|
|
Datum
|
|
PG_char_to_encoding(PG_FUNCTION_ARGS)
|
|
{
|
|
Name s = PG_GETARG_NAME(0);
|
|
|
|
PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
|
|
}
|
|
|
|
Datum
|
|
PG_encoding_to_char(PG_FUNCTION_ARGS)
|
|
{
|
|
int32 encoding = PG_GETARG_INT32(0);
|
|
const char *encoding_name = pg_encoding_to_char(encoding);
|
|
|
|
return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
|
|
}
|
|
|
|
/*
|
|
* gettext() returns messages in this encoding. This often matches the
|
|
* database encoding, but it differs for SQL_ASCII databases, for processes
|
|
* not attached to a database, and under a database encoding lacking iconv
|
|
* support (MULE_INTERNAL).
|
|
*/
|
|
int
|
|
GetMessageEncoding(void)
|
|
{
|
|
return MessageEncoding->encoding;
|
|
}
|
|
|
|
|
|
/*
|
|
* Generic character incrementer function.
|
|
*
|
|
* Not knowing anything about the properties of the encoding in use, we just
|
|
* keep incrementing the last byte until we get a validly-encoded result,
|
|
* or we run out of values to try. We don't bother to try incrementing
|
|
* higher-order bytes, so there's no growth in runtime for wider characters.
|
|
* (If we did try to do that, we'd need to consider the likelihood that 255
|
|
* is not a valid final byte in the encoding.)
|
|
*/
|
|
static bool
|
|
pg_generic_charinc(unsigned char *charptr, int len)
|
|
{
|
|
unsigned char *lastbyte = charptr + len - 1;
|
|
mbchar_verifier mbverify;
|
|
|
|
/* We can just invoke the character verifier directly. */
|
|
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
|
|
|
|
while (*lastbyte < (unsigned char) 255)
|
|
{
|
|
(*lastbyte)++;
|
|
if ((*mbverify) (charptr, len) == len)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if a byte was incremented, false otherwise.  Lengths outside
 * 1..4 are always rejected.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;

	/* reject lengths 5 and 6 for now (and anything else that isn't 1..4) */
	if (length < 1 || length > 4)
		return false;

	/* Fourth byte: an ordinary continuation byte, maximum 0xBF. */
	if (length >= 4 && charptr[3] < 0xBF)
	{
		charptr[3]++;
		return true;
	}

	/* Third byte: likewise. */
	if (length >= 3 && charptr[2] < 0xBF)
	{
		charptr[2]++;
		return true;
	}

	/* Second byte: its ceiling depends on the lead byte. */
	if (length >= 2)
	{
		unsigned char second_max;

		if (charptr[0] == 0xED)
			second_max = 0x9F;	/* stay below the surrogate range */
		else if (charptr[0] == 0xF4)
			second_max = 0x8F;	/* upper bound used for lead byte 0xF4 */
		else
			second_max = 0xBF;

		if (charptr[1] < second_max)
		{
			charptr[1]++;
			return true;
		}
	}

	/* Lead byte: fail if it is the largest lead byte of its length class. */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
|
|
|
|
/*
|
|
* EUC-JP character incrementer function.
|
|
*
|
|
* If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
|
|
* representing JIS X 0201 characters with the second byte ranging between
|
|
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
|
|
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
|
|
*
|
|
* If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
|
|
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
|
|
* is incremented if possible, otherwise the second-to-last byte.
|
|
*
|
|
* If the sequence starts with a value other than the above and its MSB
|
|
* is set, it must be a two-byte sequence representing JIS X 0208 characters
|
|
* with both bytes ranging between 0xa1 and 0xfe. The last byte is
|
|
* incremented if possible, otherwise the second-to-last byte.
|
|
*
|
|
* Otherwise, the sequence is a single-byte ASCII character. It is
|
|
* incremented up to 0x7f.
|
|
*/
|
|
static bool
|
|
pg_eucjp_increment(unsigned char *charptr, int length)
|
|
{
|
|
unsigned char c1,
|
|
c2;
|
|
int i;
|
|
|
|
c1 = *charptr;
|
|
|
|
switch (c1)
|
|
{
|
|
case SS2: /* JIS X 0201 */
|
|
if (length != 2)
|
|
return false;
|
|
|
|
c2 = charptr[1];
|
|
|
|
if (c2 >= 0xdf)
|
|
charptr[0] = charptr[1] = 0xa1;
|
|
else if (c2 < 0xa1)
|
|
charptr[1] = 0xa1;
|
|
else
|
|
charptr[1]++;
|
|
break;
|
|
|
|
case SS3: /* JIS X 0212 */
|
|
if (length != 3)
|
|
return false;
|
|
|
|
for (i = 2; i > 0; i--)
|
|
{
|
|
c2 = charptr[i];
|
|
if (c2 < 0xa1)
|
|
{
|
|
charptr[i] = 0xa1;
|
|
return true;
|
|
}
|
|
else if (c2 < 0xfe)
|
|
{
|
|
charptr[i]++;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
/* Out of 3-byte code region */
|
|
return false;
|
|
|
|
default:
|
|
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
|
{
|
|
if (length != 2)
|
|
return false;
|
|
|
|
for (i = 1; i >= 0; i--)
|
|
{
|
|
c2 = charptr[i];
|
|
if (c2 < 0xa1)
|
|
{
|
|
charptr[i] = 0xa1;
|
|
return true;
|
|
}
|
|
else if (c2 < 0xfe)
|
|
{
|
|
charptr[i]++;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
/* Out of 2 byte code region */
|
|
return false;
|
|
}
|
|
else
|
|
{ /* ASCII, single byte */
|
|
if (c1 > 0x7e)
|
|
return false;
|
|
(*charptr)++;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* get the character incrementer for the encoding for the current database
|
|
*/
|
|
mbcharacter_incrementer
|
|
pg_database_encoding_character_incrementer(void)
|
|
{
|
|
/*
|
|
* Eventually it might be best to add a field to pg_wchar_table[], but for
|
|
* now we just use a switch.
|
|
*/
|
|
switch (GetDatabaseEncoding())
|
|
{
|
|
case PG_UTF8:
|
|
return pg_utf8_increment;
|
|
|
|
case PG_EUC_JP:
|
|
return pg_eucjp_increment;
|
|
|
|
default:
|
|
return pg_generic_charinc;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* fetch maximum length of the encoding for the current database
|
|
*/
|
|
int
|
|
pg_database_encoding_max_length(void)
|
|
{
|
|
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
|
|
}
|
|
|
|
/*
 * Check that mbstr (len bytes) is validly encoded in the current database
 * encoding.  Apart from supplying the encoding, this is pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			enc = GetDatabaseEncoding();

	return pg_verify_mbstr(enc, mbstr, len, noError);
}
|
|
|
|
/*
|
|
* Verify mbstr to make sure that it is validly encoded in the specified
|
|
* encoding.
|
|
*/
|
|
bool
|
|
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
|
|
{
|
|
int oklen;
|
|
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
|
|
if (oklen != len)
|
|
{
|
|
if (noError)
|
|
return false;
|
|
report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Verify mbstr to make sure that it is validly encoded in the specified
|
|
* encoding.
|
|
*
|
|
* mbstr is not necessarily zero terminated; length of mbstr is
|
|
* specified by len.
|
|
*
|
|
* If OK, return length of string in the encoding.
|
|
* If a problem is found, return -1 when noError is
|
|
* true; when noError is false, ereport() a descriptive message.
|
|
*
|
|
* Note: We cannot use the faster encoding-specific mbverifystr() function
|
|
* here, because we need to count the number of characters in the string.
|
|
*/
|
|
int
|
|
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
|
|
{
|
|
mbchar_verifier mbverifychar;
|
|
int mb_len;
|
|
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
/*
|
|
* In single-byte encodings, we need only reject nulls (\0).
|
|
*/
|
|
if (pg_encoding_max_length(encoding) <= 1)
|
|
{
|
|
const char *nullpos = memchr(mbstr, 0, len);
|
|
|
|
if (nullpos == NULL)
|
|
return len;
|
|
if (noError)
|
|
return -1;
|
|
report_invalid_encoding(encoding, nullpos, 1);
|
|
}
|
|
|
|
/* fetch function pointer just once */
|
|
mbverifychar = pg_wchar_table[encoding].mbverifychar;
|
|
|
|
mb_len = 0;
|
|
|
|
while (len > 0)
|
|
{
|
|
int l;
|
|
|
|
/* fast path for ASCII-subset characters */
|
|
if (!IS_HIGHBIT_SET(*mbstr))
|
|
{
|
|
if (*mbstr != '\0')
|
|
{
|
|
mb_len++;
|
|
mbstr++;
|
|
len--;
|
|
continue;
|
|
}
|
|
if (noError)
|
|
return -1;
|
|
report_invalid_encoding(encoding, mbstr, len);
|
|
}
|
|
|
|
l = (*mbverifychar) ((const unsigned char *) mbstr, len);
|
|
|
|
if (l < 0)
|
|
{
|
|
if (noError)
|
|
return -1;
|
|
report_invalid_encoding(encoding, mbstr, len);
|
|
}
|
|
|
|
mbstr += l;
|
|
len -= l;
|
|
mb_len++;
|
|
}
|
|
return mb_len;
|
|
}
|
|
|
|
/*
|
|
* check_encoding_conversion_args: check arguments of a conversion function
|
|
*
|
|
* "expected" arguments can be either an encoding ID or -1 to indicate that
|
|
* the caller will check whether it accepts the ID.
|
|
*
|
|
* Note: the errors here are not really user-facing, so elog instead of
|
|
* ereport seems sufficient. Also, we trust that the "expected" encoding
|
|
* arguments are valid encoding IDs, but we don't trust the actuals.
|
|
*/
|
|
void
|
|
check_encoding_conversion_args(int src_encoding,
|
|
int dest_encoding,
|
|
int len,
|
|
int expected_src_encoding,
|
|
int expected_dest_encoding)
|
|
{
|
|
if (!PG_VALID_ENCODING(src_encoding))
|
|
elog(ERROR, "invalid source encoding ID: %d", src_encoding);
|
|
if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
|
|
elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
|
|
pg_enc2name_tbl[expected_src_encoding].name,
|
|
pg_enc2name_tbl[src_encoding].name);
|
|
if (!PG_VALID_ENCODING(dest_encoding))
|
|
elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
|
|
if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
|
|
elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
|
|
pg_enc2name_tbl[expected_dest_encoding].name,
|
|
pg_enc2name_tbl[dest_encoding].name);
|
|
if (len < 0)
|
|
elog(ERROR, "encoding conversion length must not be negative");
|
|
}
|
|
|
|
/*
|
|
* report_invalid_encoding: complain about invalid multibyte character
|
|
*
|
|
* note: len is remaining length of string, not length of character;
|
|
* len must be greater than zero (or we'd neglect initializing "buf").
|
|
*/
|
|
void
|
|
report_invalid_encoding(int encoding, const char *mbstr, int len)
|
|
{
|
|
int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
|
|
char buf[8 * 5 + 1];
|
|
char *p = buf;
|
|
int j,
|
|
jlimit;
|
|
|
|
jlimit = Min(l, len);
|
|
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
|
|
|
for (j = 0; j < jlimit; j++)
|
|
{
|
|
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
|
if (j < jlimit - 1)
|
|
p += sprintf(p, " ");
|
|
}
|
|
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
|
errmsg("invalid byte sequence for encoding \"%s\": %s",
|
|
pg_enc2name_tbl[encoding].name,
|
|
buf)));
|
|
}
|
|
|
|
/*
|
|
* report_untranslatable_char: complain about untranslatable character
|
|
*
|
|
* note: len is remaining length of string, not length of character;
|
|
* len must be greater than zero (or we'd neglect initializing "buf").
|
|
*/
|
|
void
|
|
report_untranslatable_char(int src_encoding, int dest_encoding,
|
|
const char *mbstr, int len)
|
|
{
|
|
int l;
|
|
char buf[8 * 5 + 1];
|
|
char *p = buf;
|
|
int j,
|
|
jlimit;
|
|
|
|
/*
|
|
* We probably could use plain pg_encoding_mblen(), because
|
|
* gb18030_to_utf8() verifies before it converts. All conversions should.
|
|
* For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even
|
|
* so, be defensive, since a buggy conversion might pass invalid data.
|
|
* This is not a performance-critical path.
|
|
*/
|
|
l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
|
|
jlimit = Min(l, len);
|
|
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
|
|
|
|
for (j = 0; j < jlimit; j++)
|
|
{
|
|
p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
|
|
if (j < jlimit - 1)
|
|
p += sprintf(p, " ");
|
|
}
|
|
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
|
|
errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
|
|
buf,
|
|
pg_enc2name_tbl[src_encoding].name,
|
|
pg_enc2name_tbl[dest_encoding].name)));
|
|
}
|
|
|
|
|
|
#ifdef WIN32
|
|
/*
|
|
* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
|
|
* string. The character length is also passed to utf16len if not
|
|
* null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
|
|
* should be ASCII-only; this will function as though MessageEncoding is UTF8.
|
|
*/
|
|
WCHAR *
|
|
pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
|
|
{
|
|
int msgenc = GetMessageEncoding();
|
|
WCHAR *utf16;
|
|
int dstlen;
|
|
UINT codepage;
|
|
|
|
if (msgenc == PG_SQL_ASCII)
|
|
/* No conversion is possible, and SQL_ASCII is never utf16. */
|
|
return NULL;
|
|
|
|
codepage = pg_enc2name_tbl[msgenc].codepage;
|
|
|
|
/*
|
|
* Use MultiByteToWideChar directly if there is a corresponding codepage,
|
|
* or double conversion through UTF8 if not. Double conversion is needed,
|
|
* for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
|
|
*/
|
|
if (codepage != 0)
|
|
{
|
|
utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
|
|
dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
|
|
utf16[dstlen] = (WCHAR) 0;
|
|
}
|
|
else
|
|
{
|
|
char *utf8;
|
|
|
|
/*
|
|
* XXX pg_do_encoding_conversion() requires a transaction. In the
|
|
* absence of one, hope for the input to be valid UTF8.
|
|
*/
|
|
if (IsTransactionState())
|
|
{
|
|
utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
|
|
len,
|
|
msgenc,
|
|
PG_UTF8);
|
|
if (utf8 != str)
|
|
len = strlen(utf8);
|
|
}
|
|
else
|
|
utf8 = (char *) str;
|
|
|
|
utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
|
|
dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
|
|
utf16[dstlen] = (WCHAR) 0;
|
|
|
|
if (utf8 != str)
|
|
pfree(utf8);
|
|
}
|
|
|
|
if (dstlen == 0 && len > 0)
|
|
{
|
|
pfree(utf16);
|
|
return NULL; /* error */
|
|
}
|
|
|
|
if (utf16len)
|
|
*utf16len = dstlen;
|
|
return utf16;
|
|
}
|
|
|
|
#endif /* WIN32 */
|