1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-15 19:21:59 +03:00

Perform conversion from Python unicode to string/bytes object via UTF-8.

We used to convert the unicode object directly to a string in the server
encoding by calling Python's PyUnicode_AsEncodedString function. In other
words, we used Python's routines to do the encoding. However, that has a
few problems. First of all, it required keeping a mapping table of Python
encoding names and PostgreSQL encodings. But the real killer was that Python
doesn't support EUC_TW and MULE_INTERNAL encodings at all.

Instead, convert the Python unicode object to UTF-8, and use PostgreSQL's
encoding conversion functions to convert from UTF-8 to server encoding. We
were already doing the same in the other direction in PLyUnicode_FromString,
so this is more consistent, too.

Note: This makes SQL_ASCII behave more leniently. We used to map
SQL_ASCII to Python's 'ascii', which in Python means strict 7-bit ASCII
only, so you got an error if the Python string contained anything but pure
ASCII. You no longer get an error; you get the UTF-8 representation of the
string instead.

Backpatch to 9.0, where these conversions were introduced.

Jan Urbański
This commit is contained in:
Heikki Linnakangas
2012-08-06 13:02:15 +03:00
parent 149ac7d455
commit 3ff15883b1
2 changed files with 47 additions and 111 deletions

View File

@ -61,66 +61,56 @@ PLy_free(void *ptr)
/*
 * PLyUnicode_Bytes
 *
 * Convert a Python unicode object into a Python bytes object and return it.
 *
 * NOTE: this listing is a rendered diff without +/- markers, so lines of the
 * previous and of the new implementation are interleaved below (for example,
 * rv is declared twice and the switch statement has no closing brace of its
 * own).  It is not compilable as shown.  The switch-based mapping to Python
 * encoding names and the PyUnicode_AsEncodedString call appear to belong to
 * the previous implementation, and the UTF-8 based conversion to the new
 * one -- verify against the real diff before relying on this.
 *
 * Failures of the Python conversion calls are reported through
 * PLy_elog(ERROR, ...).
 */
PyObject *
PLyUnicode_Bytes(PyObject *unicode)
{
PyObject *rv;
const char *serverenc;
PyObject *bytes, *rv;
char *utf8string, *encoded;
/*
* Map PostgreSQL encoding to a Python encoding name.
*/
switch (GetDatabaseEncoding())
{
case PG_SQL_ASCII:
/*
* Mapping SQL_ASCII to Python's 'ascii' is a bit bogus. Python's
* 'ascii' means true 7-bit only ASCII, while PostgreSQL's
* SQL_ASCII means that anything is allowed, and the system doesn't
* try to interpret the bytes in any way. But not sure what else
* to do, and we haven't heard any complaints...
*/
serverenc = "ascii";
break;
case PG_WIN1250:
serverenc = "cp1250";
break;
case PG_WIN1251:
serverenc = "cp1251";
break;
case PG_WIN1252:
serverenc = "cp1252";
break;
case PG_WIN1253:
serverenc = "cp1253";
break;
case PG_WIN1254:
serverenc = "cp1254";
break;
case PG_WIN1255:
serverenc = "cp1255";
break;
case PG_WIN1256:
serverenc = "cp1256";
break;
case PG_WIN1257:
serverenc = "cp1257";
break;
case PG_WIN1258:
serverenc = "cp1258";
break;
case PG_WIN866:
serverenc = "cp866";
break;
case PG_WIN874:
serverenc = "cp874";
break;
default:
/* Other encodings have the same name in Python. */
serverenc = GetDatabaseEncodingName();
break;
/* First encode the Python unicode object with UTF-8. */
bytes = PyUnicode_AsUTF8String(unicode);
if (bytes == NULL)
PLy_elog(ERROR, "could not convert Python Unicode object to bytes");
/* Get a char pointer to the contents of the UTF-8 bytes object. */
utf8string = PyBytes_AsString(bytes);
if (utf8string == NULL) {
/* Release the intermediate bytes object before reporting the error. */
Py_DECREF(bytes);
PLy_elog(ERROR, "could not extract bytes from encoded string");
}
rv = PyUnicode_AsEncodedString(unicode, serverenc, "strict");
if (rv == NULL)
PLy_elog(ERROR, "could not convert Python Unicode object to PostgreSQL server encoding");
/*
* Then convert to server encoding if necessary.
*
* PyUnicode_AsEncodedString could be used to encode the object directly
* in the server encoding, but Python doesn't support all the encodings
* that PostgreSQL does (EUC_TW and MULE_INTERNAL). UTF-8 is used as an
* intermediary in PLyUnicode_FromString as well.
*/
if (GetDatabaseEncoding() != PG_UTF8)
{
PG_TRY();
{
/*
* The input length is computed with strlen, i.e. it only covers the
* bytes up to the first NUL in utf8string.
* NOTE(review): confirm that embedded NUL bytes cannot occur here.
*/
encoded = (char *) pg_do_encoding_conversion(
(unsigned char *) utf8string,
strlen(utf8string),
PG_UTF8,
GetDatabaseEncoding());
}
PG_CATCH();
{
/* Release the intermediate bytes object, then propagate the error. */
Py_DECREF(bytes);
PG_RE_THROW();
}
PG_END_TRY();
}
else
encoded = utf8string;
/* finally, build a bytes object in the server encoding */
/*
* NOTE(review): rv is returned without a NULL check, and its size is again
* taken with strlen -- confirm that callers cope with a failed allocation.
*/
rv = PyBytes_FromStringAndSize(encoded, strlen(encoded));
/* if pg_do_encoding_conversion allocated memory, free it now */
if (utf8string != encoded)
pfree(encoded);
/* Drop our reference to the intermediate UTF-8 bytes object. */
Py_DECREF(bytes);
return rv;
}