From 3ad8b840ce8b1d7279f2d0d5fb7d346c0a6a3e8d Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 16 Dec 2024 11:23:38 +0900
Subject: [PATCH] Add some tests for encoding conversion in COPY TO/FROM

This adds a couple of tests to trigger encoding conversion when input
and server encodings do not match in COPY FROM/TO, or need_transcoding
set to true in the COPY state data. These tests rely on UTF8 <->
LATIN1 for the valid cases as LATIN1 accepts any bytes, and UTF8 <->
EUC_JP for some of the invalid cases where a character cannot be
understood, causing a conversion failure.

Both ENCODING and client_encoding are covered. Test suggested by Andres
Freund.

Author: Sutou Kouhei
Discussion: https://postgr.es/m/20240206222445.hzq22pb2nye7rm67@awork3.anarazel.de
---
 src/test/regress/expected/copyencoding.out   | 46 +++++++++++++++++
 src/test/regress/expected/copyencoding_1.out |  8 +++
 src/test/regress/parallel_schedule           |  2 +-
 src/test/regress/sql/copyencoding.sql        | 53 ++++++++++++++++++++
 4 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 src/test/regress/expected/copyencoding.out
 create mode 100644 src/test/regress/expected/copyencoding_1.out
 create mode 100644 src/test/regress/sql/copyencoding.sql

diff --git a/src/test/regress/expected/copyencoding.out b/src/test/regress/expected/copyencoding.out
new file mode 100644
index 00000000000..cfa2ed6df00
--- /dev/null
+++ b/src/test/regress/expected/copyencoding.out
@@ -0,0 +1,46 @@
+--
+-- Test cases for encoding with COPY commands
+--
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+-- directory paths are passed to us in environment variables
+\getenv abs_builddir PG_ABS_BUILDDIR
+\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
+CREATE TABLE copy_encoding_tab (t text);
+-- Valid cases
+-- Use ENCODING option
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as LATIN1: no error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as LATIN1: no error
+SET client_encoding TO LATIN1;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+-- Invalid cases
+-- Use ENCODING explicitly
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as EUC_JP: invalid byte sequence error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
+ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
+CONTEXT:  COPY copy_encoding_tab, line 1
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as EUC_JP: invalid byte sequence error
+SET client_encoding TO EUC_JP;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+ERROR:  invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
+CONTEXT:  COPY copy_encoding_tab, line 1
+RESET client_encoding;
+DROP TABLE copy_encoding_tab;
diff --git a/src/test/regress/expected/copyencoding_1.out b/src/test/regress/expected/copyencoding_1.out
new file mode 100644
index 00000000000..a85ee2dbd18
--- /dev/null
+++ b/src/test/regress/expected/copyencoding_1.out
@@ -0,0 +1,8 @@
+--
+-- Test cases for encoding with COPY commands
+--
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 81e4222d26a..1edd9e45ebb 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -36,7 +36,7 @@ test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comment
 # execute two copy tests in parallel, to check that copy itself
 # is concurrent safe.
 # ----------
-test: copy copyselect copydml insert insert_conflict
+test: copy copyselect copydml copyencoding insert insert_conflict
 
 # ----------
 # More groups of parallel tests
diff --git a/src/test/regress/sql/copyencoding.sql b/src/test/regress/sql/copyencoding.sql
new file mode 100644
index 00000000000..4e96a4d6505
--- /dev/null
+++ b/src/test/regress/sql/copyencoding.sql
@@ -0,0 +1,53 @@
+--
+-- Test cases for encoding with COPY commands
+--
+
+-- skip test if not UTF8 server encoding
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+-- directory paths are passed to us in environment variables
+\getenv abs_builddir PG_ABS_BUILDDIR
+
+\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
+
+CREATE TABLE copy_encoding_tab (t text);
+
+-- Valid cases
+
+-- Use ENCODING option
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as LATIN1: no error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
+
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as LATIN1: no error
+SET client_encoding TO LATIN1;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+
+-- Invalid cases
+
+-- Use ENCODING explicitly
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
+-- Read UTF8 data as EUC_JP: invalid byte sequence error
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
+
+-- Use client_encoding
+SET client_encoding TO UTF8;
+-- U+3042 HIRAGANA LETTER A
+COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
+-- Read UTF8 data as EUC_JP: invalid byte sequence error
+SET client_encoding TO EUC_JP;
+COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
+RESET client_encoding;
+
+DROP TABLE copy_encoding_tab;