Perform conversion from Python unicode to string/bytes object via UTF-8.

We used to convert the unicode object directly to a string in the server encoding by calling Python's PyUnicode_AsEncodedString function. In other words, we used Python's routines to do the encoding. However, that has a few problems. First of all, it required keeping a mapping table of Python encoding names and PostgreSQL encodings. But the real killer was that Python doesn't support the EUC_TW and MULE_INTERNAL encodings at all. Instead, convert the Python unicode object to UTF-8, and use PostgreSQL's encoding conversion functions to convert from UTF-8 to the server encoding. We were already doing the same in the other direction in PLyUnicode_FromString, so this is more consistent, too. Note: This makes SQL_ASCII behave more leniently. We used to map SQL_ASCII to Python's 'ascii', which in Python means strict 7-bit ASCII only, so you got an error if the Python string contained anything but pure ASCII. You no longer get an error; you get the UTF-8 representation of the string instead. Backpatch to 9.0, where these conversions were introduced. Jan Urbański
2025-11-13 16:22:44 +03:00 · 2012-08-06 13:02:15 +03:00
parent a411f7e770
commit d9c77e2493
2 changed files with 47 additions and 111 deletions
--- a/src/pl/plpython/expected/plpython_unicode_3.out
+++ b/src/pl/plpython/expected/plpython_unicode_3.out
@@ -1,54 +0,0 @@
 --
 -- Unicode handling
 --
 SET client_encoding TO UTF8;
 CREATE TABLE unicode_test (
 	testvalue  text NOT NULL
 );
 CREATE FUNCTION unicode_return() RETURNS text AS E'
 return u"\\x80"
 ' LANGUAGE plpythonu;
 CREATE FUNCTION unicode_trigger() RETURNS trigger AS E'
 TD["new"]["testvalue"] = u"\\x80"
 return "MODIFY"
 ' LANGUAGE plpythonu;
 CREATE TRIGGER unicode_test_bi BEFORE INSERT ON unicode_test
  FOR EACH ROW EXECUTE PROCEDURE unicode_trigger();
 CREATE FUNCTION unicode_plan1() RETURNS text AS E'
 plan = plpy.prepare("SELECT $1 AS testvalue", ["text"])
 rv = plpy.execute(plan, [u"\\x80"], 1)
 return rv[0]["testvalue"]
 ' LANGUAGE plpythonu;
 CREATE FUNCTION unicode_plan2() RETURNS text AS E'
 plan = plpy.prepare("SELECT $1 || $2 AS testvalue", ["text", u"text"])
 rv = plpy.execute(plan, ["foo", "bar"], 1)
 return rv[0]["testvalue"]
 ' LANGUAGE plpythonu;
 SELECT unicode_return();
 ERROR:  could not convert Python Unicode object to PostgreSQL server encoding
 DETAIL:  UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
 CONTEXT:  while creating return value
 PL/Python function "unicode_return"
 INSERT INTO unicode_test (testvalue) VALUES ('test');
 ERROR:  could not convert Python Unicode object to PostgreSQL server encoding
 DETAIL:  UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
 CONTEXT:  while modifying trigger row
 PL/Python function "unicode_trigger"
 SELECT * FROM unicode_test;
 testvalue 
 -----------
 (0 rows)
 SELECT unicode_plan1();
 ERROR:  spiexceptions.InternalError: could not convert Python Unicode object to PostgreSQL server encoding
 DETAIL:  UnicodeEncodeError: 'ascii' codec can't encode character u'\x80' in position 0: ordinal not in range(128)
 CONTEXT:  Traceback (most recent call last):
  PL/Python function "unicode_plan1", line 3, in <module>
    rv = plpy.execute(plan, [u"\x80"], 1)
 PL/Python function "unicode_plan1"
 SELECT unicode_plan2();
 unicode_plan2 
 ---------------
 foobar
 (1 row)
--- a/src/pl/plpython/plpy_util.c
+++ b/src/pl/plpython/plpy_util.c
@@ -61,66 +61,56 @@ PLy_free(void *ptr)
 PyObject *
 PLyUnicode_Bytes(PyObject *unicode)
 {
-	PyObject   *rv;
+	PyObject	*bytes, *rv;
-	const char *serverenc;
+	char		*utf8string, *encoded;
-	/*
+	/* First encode the Python unicode object with UTF-8. */
-	 * Map PostgreSQL encoding to a Python encoding name.
+	bytes = PyUnicode_AsUTF8String(unicode);
-	 */
+	if (bytes == NULL)
-	switch (GetDatabaseEncoding())
+		PLy_elog(ERROR, "could not convert Python Unicode object to bytes");
-	{
+
-		case PG_SQL_ASCII:
+	utf8string = PyBytes_AsString(bytes);
-			/*
+	if (utf8string == NULL) {
-			 * Mapping SQL_ASCII to Python's 'ascii' is a bit bogus. Python's
+		Py_DECREF(bytes);
-			 * 'ascii' means true 7-bit only ASCII, while PostgreSQL's
+		PLy_elog(ERROR, "could not extract bytes from encoded string");
 			 * SQL_ASCII means that anything is allowed, and the system doesn't
 			 * try to interpret the bytes in any way. But not sure what else
 			 * to do, and we haven't heard any complaints...
 			 */
 			serverenc = "ascii";
 			break;
 		case PG_WIN1250:
 			serverenc = "cp1250";
 			break;
 		case PG_WIN1251:
 			serverenc = "cp1251";
 			break;
 		case PG_WIN1252:
 			serverenc = "cp1252";
 			break;
 		case PG_WIN1253:
 			serverenc = "cp1253";
 			break;
 		case PG_WIN1254:
 			serverenc = "cp1254";
 			break;
 		case PG_WIN1255:
 			serverenc = "cp1255";
 			break;
 		case PG_WIN1256:
 			serverenc = "cp1256";
 			break;
 		case PG_WIN1257:
 			serverenc = "cp1257";
 			break;
 		case PG_WIN1258:
 			serverenc = "cp1258";
 			break;
 		case PG_WIN866:
 			serverenc = "cp866";
 			break;
 		case PG_WIN874:
 			serverenc = "cp874";
 			break;
 		default:
 			/* Other encodings have the same name in Python. */
 			serverenc = GetDatabaseEncodingName();
 			break;
 	}
-	rv = PyUnicode_AsEncodedString(unicode, serverenc, "strict");
+	/*
-	if (rv == NULL)
+	 * Then convert to server encoding if necessary.
-		PLy_elog(ERROR, "could not convert Python Unicode object to PostgreSQL server encoding");
+	 *
 	 * PyUnicode_AsEncodedString could be used to encode the object directly
 	 * in the server encoding, but Python doesn't support all the encodings
 	 * that PostgreSQL does (EUC_TW and MULE_INTERNAL). UTF-8 is used as an
 	 * intermediary in PLyUnicode_FromString as well.
 	 */
 	if (GetDatabaseEncoding() != PG_UTF8)
 	{
 		PG_TRY();
 		{
 			encoded = (char *) pg_do_encoding_conversion(
 				(unsigned char *) utf8string,
 				strlen(utf8string),
 				PG_UTF8,
 				GetDatabaseEncoding());
 		}
 		PG_CATCH();
 		{
 			Py_DECREF(bytes);
 			PG_RE_THROW();
 		}
 		PG_END_TRY();
 	}
 	else
 		encoded = utf8string;
 	/* finally, build a bytes object in the server encoding */
 	rv = PyBytes_FromStringAndSize(encoded, strlen(encoded));
 	/* if pg_do_encoding_conversion allocated memory, free it now */
 	if (utf8string != encoded)
 		pfree(encoded);
 	Py_DECREF(bytes);
 	return rv;
 }